Version in base suite: 10.11.11-0+deb12u1 Base version: mariadb_10.11.11-0+deb12u1 Target version: mariadb_10.11.13-0+deb12u1 Base file: /srv/ftp-master.debian.org/ftp/pool/main/m/mariadb/mariadb_10.11.11-0+deb12u1.dsc Target file: /srv/ftp-master.debian.org/policy/pool/main/m/mariadb/mariadb_10.11.13-0+deb12u1.dsc /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png |binary 
/srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png |binary mariadb-10.11.13/CMakeLists.txt | 10 mariadb-10.11.13/Docs/INFO_SRC | 10 mariadb-10.11.13/VERSION | 2 mariadb-10.11.13/appveyor.yml | 38 mariadb-10.11.13/client/mysql_upgrade.c | 14 mariadb-10.11.13/client/mysqlbinlog.cc | 61 mariadb-10.11.13/client/mysqldump.c | 15 mariadb-10.11.13/client/mysqlslap.c | 12 mariadb-10.11.13/client/mysqltest.cc | 16 mariadb-10.11.13/cmake/cpack_rpm.cmake | 4 mariadb-10.11.13/cmake/libfmt.cmake | 5 mariadb-10.11.13/cmake/os/Windows.cmake | 520 mariadb-10.11.13/cmake/os/WindowsCache.cmake | 19 mariadb-10.11.13/cmake/pcre.cmake | 12 mariadb-10.11.13/cmake/plugin.cmake | 5 mariadb-10.11.13/config.h.cmake | 48 mariadb-10.11.13/debian/changelog | 39 mariadb-10.11.13/debian/mariadb-server-core.postinst | 49 mariadb-10.11.13/debian/patches/fix-reproducible-builds-rocksdb.patch | 26 mariadb-10.11.13/debian/patches/fix-spelling-rocksdb.patch | 38 mariadb-10.11.13/debian/patches/rocksdb-kfreebsd.patch | 150 mariadb-10.11.13/debian/patches/series | 3 mariadb-10.11.13/debian/salsa-ci-enable-sec-and-update-repos.sh | 12 mariadb-10.11.13/debian/salsa-ci.yml | 11 mariadb-10.11.13/debian/tests/traces/mariadb-verbose-help.expected | 5 mariadb-10.11.13/debian/tests/traces/mariadbd-verbose-help.expected | 21 mariadb-10.11.13/extra/mariabackup/backup_mysql.cc | 2 mariadb-10.11.13/extra/mariabackup/common_engine.cc | 6 mariadb-10.11.13/extra/mariabackup/innobackupex.cc | 7 mariadb-10.11.13/extra/mariabackup/write_filt.cc | 12 mariadb-10.11.13/extra/mariabackup/xtrabackup.cc | 80 mariadb-10.11.13/include/json_lib.h | 5 mariadb-10.11.13/include/my_base.h | 5 mariadb-10.11.13/include/my_cpu.h | 7 mariadb-10.11.13/include/my_stack_alloc.h | 2 mariadb-10.11.13/include/my_sys.h | 8 mariadb-10.11.13/include/my_virtual_mem.h | 37 mariadb-10.11.13/include/source_revision.h | 2 mariadb-10.11.13/include/sslopt-longopts.h 
| 3 mariadb-10.11.13/libmariadb/CMakeLists.txt | 2 mariadb-10.11.13/libmariadb/include/errmsg.h | 3 mariadb-10.11.13/libmariadb/include/ma_context.h | 25 mariadb-10.11.13/libmariadb/include/mariadb_com.h | 22 mariadb-10.11.13/libmariadb/libmariadb/CMakeLists.txt | 6 mariadb-10.11.13/libmariadb/libmariadb/ma_context.c | 38 mariadb-10.11.13/libmariadb/libmariadb/ma_errmsg.c | 2 mariadb-10.11.13/libmariadb/libmariadb/mariadb_lib.c | 9 mariadb-10.11.13/libmariadb/libmariadb/mariadb_stmt.c | 6 mariadb-10.11.13/libmariadb/plugins/pvio/pvio_socket.c | 10 mariadb-10.11.13/libmariadb/unittest/libmariadb/connection.c | 83 mariadb-10.11.13/libmariadb/unittest/libmariadb/errors.c | 74 mariadb-10.11.13/libmariadb/unittest/libmariadb/ps_bugs.c | 52 mariadb-10.11.13/mysql-test/CMakeLists.txt | 2 mariadb-10.11.13/mysql-test/include/long_test.inc | 2 mariadb-10.11.13/mysql-test/lib/My/SafeProcess.pm | 3 mariadb-10.11.13/mysql-test/lib/My/SafeProcess/safe_process.cc | 17 mariadb-10.11.13/mysql-test/main/backup_locks.test | 1 mariadb-10.11.13/mysql-test/main/comment_database.result | 13 mariadb-10.11.13/mysql-test/main/comment_database.test | 8 mariadb-10.11.13/mysql-test/main/ctype_utf8_def_upgrade.result | 2 mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.result | 194 mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.test | 22 mariadb-10.11.13/mysql-test/main/derived_view.result | 2 mariadb-10.11.13/mysql-test/main/func_json.result | 37 mariadb-10.11.13/mysql-test/main/func_json.test | 22 mariadb-10.11.13/mysql-test/main/func_like.result | 19 mariadb-10.11.13/mysql-test/main/func_like.test | 15 mariadb-10.11.13/mysql-test/main/func_regexp_pcre.result | 28 mariadb-10.11.13/mysql-test/main/func_regexp_pcre.test | 2 mariadb-10.11.13/mysql-test/main/gis-precise.result | 8 mariadb-10.11.13/mysql-test/main/gis-precise.test | 8 mariadb-10.11.13/mysql-test/main/gis.result | 32 mariadb-10.11.13/mysql-test/main/gis.test | 32 mariadb-10.11.13/mysql-test/main/group_by.result | 74 
mariadb-10.11.13/mysql-test/main/group_by.test | 22 mariadb-10.11.13/mysql-test/main/group_min_max.result | 24 mariadb-10.11.13/mysql-test/main/group_min_max.test | 36 mariadb-10.11.13/mysql-test/main/insert.result | 72 mariadb-10.11.13/mysql-test/main/insert.test | 56 mariadb-10.11.13/mysql-test/main/insert_returning.result | 2 mariadb-10.11.13/mysql-test/main/insert_returning.test | 2 mariadb-10.11.13/mysql-test/main/insert_select.result | 135 mariadb-10.11.13/mysql-test/main/insert_select.test | 56 mariadb-10.11.13/mysql-test/main/join.result | 29 mariadb-10.11.13/mysql-test/main/join.test | 25 mariadb-10.11.13/mysql-test/main/join_cache.result | 26 mariadb-10.11.13/mysql-test/main/join_cache.test | 27 mariadb-10.11.13/mysql-test/main/join_nested.result | 12 mariadb-10.11.13/mysql-test/main/join_nested.test | 13 mariadb-10.11.13/mysql-test/main/join_nested_jcl6.result | 12 mariadb-10.11.13/mysql-test/main/large_pages.opt | 2 mariadb-10.11.13/mysql-test/main/large_pages.result | 1 mariadb-10.11.13/mysql-test/main/large_pages.test | 4 mariadb-10.11.13/mysql-test/main/long_unique.result | 22 mariadb-10.11.13/mysql-test/main/long_unique.test | 22 mariadb-10.11.13/mysql-test/main/lowercase_table2.result | 2 mariadb-10.11.13/mysql-test/main/lowercase_view.result | 12 mariadb-10.11.13/mysql-test/main/lowercase_view.test | 12 mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.result | 35 mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.test | 113 mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.result | 21 mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.test | 22 mariadb-10.11.13/mysql-test/main/mdl_sync.result | 5 mariadb-10.11.13/mysql-test/main/mdl_sync.test | 8 mariadb-10.11.13/mysql-test/main/merge.result | 17 mariadb-10.11.13/mysql-test/main/merge.test | 17 mariadb-10.11.13/mysql-test/main/multi_update.result | 20 mariadb-10.11.13/mysql-test/main/multi_update.test | 28 mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.opt | 1 
mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.result | 8 mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.test | 8 mariadb-10.11.13/mysql-test/main/myisam-big.result | 8 mariadb-10.11.13/mysql-test/main/myisam-big.test | 13 mariadb-10.11.13/mysql-test/main/mysql-interactive.result | 4 mariadb-10.11.13/mysql-test/main/mysql-interactive.test | 11 mariadb-10.11.13/mysql-test/main/mysql_upgrade-34014.result | 2 mariadb-10.11.13/mysql-test/main/mysql_upgrade.result | 23 mariadb-10.11.13/mysql-test/main/mysql_upgrade.test | 27 mariadb-10.11.13/mysql-test/main/mysqld--help.result | 3 mariadb-10.11.13/mysql-test/main/mysqldump-system.result | 6 mariadb-10.11.13/mysql-test/main/mysqldump.result | 33 mariadb-10.11.13/mysql-test/main/mysqldump.test | 11 mariadb-10.11.13/mysql-test/main/mysqlslap.result | 3 mariadb-10.11.13/mysql-test/main/mysqlslap.test | 6 mariadb-10.11.13/mysql-test/main/mysqltest.result | 9 mariadb-10.11.13/mysql-test/main/mysqltest.test | 6 mariadb-10.11.13/mysql-test/main/partition_myisam.result | 21 mariadb-10.11.13/mysql-test/main/partition_myisam.test | 28 mariadb-10.11.13/mysql-test/main/query_cache.result | 23 mariadb-10.11.13/mysql-test/main/query_cache.test | 22 mariadb-10.11.13/mysql-test/main/range_notembedded.result | 67 mariadb-10.11.13/mysql-test/main/range_notembedded.test | 48 mariadb-10.11.13/mysql-test/main/secondary_key_costs.result | 76 mariadb-10.11.13/mysql-test/main/secondary_key_costs.test | 37 mariadb-10.11.13/mysql-test/main/skip_grants.result | 8 mariadb-10.11.13/mysql-test/main/skip_grants.test | 11 mariadb-10.11.13/mysql-test/main/sp-bugs.result | 9 mariadb-10.11.13/mysql-test/main/sp-bugs.test | 20 mariadb-10.11.13/mysql-test/main/sp-row.result | 41 mariadb-10.11.13/mysql-test/main/sp-row.test | 61 mariadb-10.11.13/mysql-test/main/subselect.result | 20 mariadb-10.11.13/mysql-test/main/subselect.test | 10 mariadb-10.11.13/mysql-test/main/subselect_elimination.result | 12 
mariadb-10.11.13/mysql-test/main/subselect_elimination.test | 7 mariadb-10.11.13/mysql-test/main/subselect_no_exists_to_in.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_mat.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_opts.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_scache.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_semijoin.result | 20 mariadb-10.11.13/mysql-test/main/temp_table_frm.result | 6 mariadb-10.11.13/mysql-test/main/temp_table_frm.test | 13 mariadb-10.11.13/mysql-test/main/timezone.test | 2 mariadb-10.11.13/mysql-test/main/trigger_null.result | 15 mariadb-10.11.13/mysql-test/main/trigger_null.test | 11 mariadb-10.11.13/mysql-test/main/type_binary.result | 58 mariadb-10.11.13/mysql-test/main/type_binary.test | 11 mariadb-10.11.13/mysql-test/main/type_blob.result | 190 mariadb-10.11.13/mysql-test/main/type_blob.test | 45 mariadb-10.11.13/mysql-test/main/type_num_innodb.result | 128 mariadb-10.11.13/mysql-test/main/type_varbinary.result | 42 mariadb-10.11.13/mysql-test/main/type_varbinary.test | 10 mariadb-10.11.13/mysql-test/main/update.result | 80 mariadb-10.11.13/mysql-test/main/update.test | 40 mariadb-10.11.13/mysql-test/main/userstat.result | 7 mariadb-10.11.13/mysql-test/main/userstat.test | 7 mariadb-10.11.13/mysql-test/main/view.result | 49 mariadb-10.11.13/mysql-test/main/view.test | 30 mariadb-10.11.13/mysql-test/main/view_grant.result | 46 mariadb-10.11.13/mysql-test/main/view_grant.test | 47 mariadb-10.11.13/mysql-test/mariadb-test-run.pl | 36 mariadb-10.11.13/mysql-test/std_data/galera_certs/galera.root.crt | 24 mariadb-10.11.13/mysql-test/suite/archive/archive-big.test | 3 mariadb-10.11.13/mysql-test/suite/atomic/README.txt | 2 mariadb-10.11.13/mysql-test/suite/atomic/alter_table.inc | 198 mariadb-10.11.13/mysql-test/suite/atomic/alter_table.opt | 1 mariadb-10.11.13/mysql-test/suite/atomic/alter_table.result | 3135 ----- mariadb-10.11.13/mysql-test/suite/atomic/alter_table.test | 198 
mariadb-10.11.13/mysql-test/suite/atomic/alter_table_aria.test | 2 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.opt | 1 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.result | 1396 ++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.test | 7 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.result | 1741 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.test | 6 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_rocksdb.test | 2 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_trigger.test | 2 mariadb-10.11.13/mysql-test/suite/atomic/create_table.test | 1 mariadb-10.11.13/mysql-test/suite/atomic/drop_table.test | 1 mariadb-10.11.13/mysql-test/suite/atomic/rename_table.test | 1 mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_commit_fail.result | 116 mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result | 45 mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_commit_fail.test | 135 mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test | 13 mariadb-10.11.13/mysql-test/suite/binlog_encryption/encrypted_master.test | 1 mariadb-10.11.13/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result | 7 mariadb-10.11.13/mysql-test/suite/encryption/r/doublewrite_debug.result | 24 mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.opt | 2 mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.test | 30 mariadb-10.11.13/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt | 2 mariadb-10.11.13/mysql-test/suite/engines/iuds/r/insert_time.result | 4 mariadb-10.11.13/mysql-test/suite/federated/federatedx.result | 2 mariadb-10.11.13/mysql-test/suite/federated/federatedx.test | 2 mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.result | 4 mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.test | 11 
mariadb-10.11.13/mysql-test/suite/funcs_2/t/innodb_charset.test | 2 mariadb-10.11.13/mysql-test/suite/galera/disabled.def | 6 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_master.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_slave.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_3nodes_as_slave.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/galera_4nodes.cnf | 8 mariadb-10.11.13/mysql-test/suite/galera/include/auto_increment_offset_save.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/include/galera_dump_sr_table.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/include/galera_start_replication.inc | 4 mariadb-10.11.13/mysql-test/suite/galera/include/galera_wsrep_recover.inc | 4 mariadb-10.11.13/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc | 35 mariadb-10.11.13/mysql-test/suite/galera/r/GAL-401.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20225.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20793.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-21479.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-25389.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-26266.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-33136.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-34647.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35748.result | 31 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35946.result | 16 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-36116.result | 22 mariadb-10.11.13/mysql-test/suite/galera/r/MW-284.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MW-329.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/MW-329F.result | 25 
mariadb-10.11.13/mysql-test/suite/galera/r/MW-416.result | 5 mariadb-10.11.13/mysql-test/suite/galera/r/galera_2primary_replica.result | 5 mariadb-10.11.13/mysql-test/suite/galera/r/galera_alter_engine_myisam.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_as_slave_nonprim.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result | 685 - mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill_debug.result | 5 mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_checksum.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_circular_replication.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_defaults.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_gcs_fragment.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump.result | 10 mariadb-10.11.13/mysql-test/suite/galera/r/galera_nonPK_and_PA.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result | 7 mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_simple.result | 1 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_partitioned_tables.result | 176 mariadb-10.11.13/mysql-test/suite/galera/r/galera_restart_replica.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequence_engine.result | 8 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff | 11 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences.result | 16 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_bf_kill.result | 152 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_transaction.result | 350 mariadb-10.11.13/mysql-test/suite/galera/r/galera_slave_replay.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_split_brain.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_cipher.result | 30 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_compression.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_upgrade.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff | 6 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff | 210 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result | 534 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff | 15 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff | 210 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result | 534 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_innodb.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_primary_key.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result | 80 mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result | 27 mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_slave_threads.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_during_ist.result | 112 mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_apply.result | 94 mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_skip.result | 102 mariadb-10.11.13/mysql-test/suite/galera/r/galera_wan.result | 12 mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result | 9 mariadb-10.11.13/mysql-test/suite/galera/r/mdev-29775.result | 84 mariadb-10.11.13/mysql-test/suite/galera/r/mdev-30653.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#198.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff | 12 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result | 2 mariadb-10.11.13/mysql-test/suite/galera/suite.pm | 80 mariadb-10.11.13/mysql-test/suite/galera/t/GAL-401.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/GCF-939.test | 2 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-10715.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-15443.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-18832.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20225.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20793.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-21479.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22227.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22708.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24143.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24327.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-25389.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26266.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26597.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.opt | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27123.opt | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27862.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-28053.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29293.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29512.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-32549.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33136.test | 5 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.cnf | 13 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.opt | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.test | 22 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35946.test | 39 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-36116.test | 43 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.cnf | 5 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MW-259.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MW-284.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/MW-313.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.test | 10 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.test | 105 mariadb-10.11.13/mysql-test/suite/galera/t/MW-360-master.opt | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MW-369.inc | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MW-416.test | 73 mariadb-10.11.13/mysql-test/suite/galera/t/MW-86-wait8.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/binlog_checksum.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/create.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera#414.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera#500.test | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_2primary_replica.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_MDEV-29512.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_alter_engine_myisam.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_ctas.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_nonprim.test | 11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_backup_stage.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test | 
11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill_debug.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_lock_wait.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_row_image.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test | 8 mariadb-10.11.13/mysql-test/suite/galera/t/galera_cache_index.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_can_run_toi.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_change_user.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_circular_replication.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_concurrent_ctas.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_create_trigger.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ctas.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_multiline.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.cnf | 4 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_disallow_local_gtid.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_fk_truncate.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_flush_local.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_forced_binlog_format.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_fragment.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_server_id.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test | 15 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_ignore.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_multi.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf | 4 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_progress.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_applier.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_smallchanges.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_load_data.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_log_bin_opt.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_many_rows.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_15611.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdl_race.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_nonPK_and_PA.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_nopk_unicode.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test | 11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_simple.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_partitioned_tables.test | 133 mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf | 5 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_recovery.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_invalidate.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_read_only.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_nochanges.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_savepoint_replay.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequence_engine.test | 13 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.test | 27 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.test | 115 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.test | 255 mariadb-10.11.13/mysql-test/suite/galera/t/galera_server.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_slave_replay.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sp_bf_abort.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_split_brain.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test | 1 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.cnf | 11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.test | 82 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_encrypted.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf | 28 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test | 29 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync2.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf | 4 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf | 23 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test | 29 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_cluster.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_index.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_state.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_innodb.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_primary_key.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_suspend_slave.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_table_with_hyphen.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_temporary_sequences.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test | 60 mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ftwrl.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_transaction_read_only.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_udf.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_unicode_identifiers.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_v1_row_events.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_OSU_method2.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test 
| 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test | 39 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test | 86 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_slave_threads.test | 17 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_wsrep_mode.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.cnf | 20 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.test | 165 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf | 21 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.test | 73 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_begin.inc | 79 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_end.inc | 33 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf | 21 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.test | 100 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.test | 12 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_mode.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test | 5 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test | 14 mariadb-10.11.13/mysql-test/suite/galera/t/mdev-29775.test | 81 mariadb-10.11.13/mysql-test/suite/galera/t/mdev-30653.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/mdev-31285.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.test | 11 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#201.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#247.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#31.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#33.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#332.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/rename.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/view.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/disabled.def | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_3nodes.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/MDEV-36360.result | 61 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera-features#115.result | 41 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result | 35 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result | 2 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result | 10 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result | 4 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result | 26 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/suite.pm | 82 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GAL-501.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GCF-354.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/MDEV-36360.test | 110 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.test | 89 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#119.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test | 75 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd.test | 11 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test | 28 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test | 80 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf | 6 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test | 10 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test | 5 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test | 64 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result | 10 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/suite.pm | 39 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test | 1 
mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/disabled.def | 4 mariadb-10.11.13/mysql-test/suite/galera_sr/r/MENT-2042.result | 9 mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_myisam.result | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/suite.pm | 80 mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-27615.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-28971.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/MENT-2042.test | 23 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf | 3 
mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test | 3 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_myisam.test | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test | 4 mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_basic.result | 2 mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_stats.result | 52 mariadb-10.11.13/mysql-test/suite/gcol/t/innodb_virtual_basic.test | 37 mariadb-10.11.13/mysql-test/suite/innodb/r/alter_copy_bulk.result | 21 mariadb-10.11.13/mysql-test/suite/innodb/r/alter_partitioned_debug.result | 26 mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff | 91 mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist.result | 37 mariadb-10.11.13/mysql-test/suite/innodb/r/buf_pool_resize_oom.result | 8 mariadb-10.11.13/mysql-test/suite/innodb/r/doublewrite.result | 18 mariadb-10.11.13/mysql-test/suite/innodb/r/foreign_key.result | 21 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb-index-online.result | 26 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result | 4 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result | 47 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result | 14 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result | 25 
mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result | 26 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_bug52663.result | 4 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result | 40 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result | 6 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_fetch.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff | 7 mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug.result | 11 mariadb-10.11.13/mysql-test/suite/innodb/r/lock_isolation.result | 88 mariadb-10.11.13/mysql-test/suite/innodb/r/lock_memory_debug.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/log_upgrade_101_flags.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff | 11 mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure.result | 33 mariadb-10.11.13/mysql-test/suite/innodb/r/page_cleaner.result | 15 mariadb-10.11.13/mysql-test/suite/innodb/r/recovery_memory.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,16k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,32k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,4k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,64k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,8k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart.result | 13 mariadb-10.11.13/mysql-test/suite/innodb/r/stat_tables.result | 10 mariadb-10.11.13/mysql-test/suite/innodb/r/stats_persistent.result | 10 mariadb-10.11.13/mysql-test/suite/innodb/t/alter_copy_bulk.test | 21 mariadb-10.11.13/mysql-test/suite/innodb/t/alter_partitioned_debug.test | 42 mariadb-10.11.13/mysql-test/suite/innodb/t/autoinc_persist.test | 21 mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.test | 27 
mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.combinations | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.test | 45 mariadb-10.11.13/mysql-test/suite/innodb/t/foreign_key.test | 30 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.test | 25 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-table-online-master.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test | 4 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test | 73 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test | 28 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test | 35 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt | 3 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test | 61 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_bug52663.test | 4 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test | 45 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test | 14 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_fetch.test | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/instant_alter_debug.test | 12 mariadb-10.11.13/mysql-test/suite/innodb/t/lock_isolation.test | 134 mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.test | 2 
mariadb-10.11.13/mysql-test/suite/innodb/t/log_upgrade_101_flags.test | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/mdev-15707.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.opt | 3 mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.test | 36 mariadb-10.11.13/mysql-test/suite/innodb/t/page_cleaner.test | 25 mariadb-10.11.13/mysql-test/suite/innodb/t/purge_secondary.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/recovery_memory.test | 6 mariadb-10.11.13/mysql-test/suite/innodb/t/restart.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/restart.test | 25 mariadb-10.11.13/mysql-test/suite/innodb/t/stat_tables.test | 9 mariadb-10.11.13/mysql-test/suite/innodb/t/stats_persistent.test | 12 mariadb-10.11.13/mysql-test/suite/innodb/t/update_time-master.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb_fts/r/index_table.result | 3 mariadb-10.11.13/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result | 9 mariadb-10.11.13/mysql-test/suite/innodb_fts/t/index_table.test | 6 mariadb-10.11.13/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test | 9 mariadb-10.11.13/mysql-test/suite/innodb_gis/r/rollback.result | 13 mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rollback.test | 13 mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rtree_purge.test | 1 mariadb-10.11.13/mysql-test/suite/json/r/json_no_table.result | 2 mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.result | 5 mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.test | 3 mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.result | 3 mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.test | 12 mariadb-10.11.13/mysql-test/suite/mariabackup/log_page_corruption.test | 2 mariadb-10.11.13/mysql-test/suite/mariabackup/partial.result | 4 mariadb-10.11.13/mysql-test/suite/mariabackup/partial_exclude.result | 2 mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.result | 11 
mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.test | 25 mariadb-10.11.13/mysql-test/suite/mariabackup/unsupported_redo.result | 4 mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.opt | 1 mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.result | 18 mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.test | 38 mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.cnf | 13 mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.result | 45 mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.test | 83 mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_innodb.test | 1 mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_memory.test | 1 mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_myisam.test | 1 mariadb-10.11.13/mysql-test/suite/perfschema/r/threads_innodb.result | 10 mariadb-10.11.13/mysql-test/suite/perfschema/t/threads_innodb.test | 2 mariadb-10.11.13/mysql-test/suite/plugins/r/server_audit.result | 3 mariadb-10.11.13/mysql-test/suite/plugins/t/server_audit.test | 4 mariadb-10.11.13/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result | 2 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_create_select_row.result | 158 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_gtid_crash.result | 2 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_master_pos_wait.result | 3 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result | 7 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result | 41 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result | 53 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result | 26 mariadb-10.11.13/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test | 6 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_create_select_row.test | 161 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt | 2 
mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash.test | 2 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_master_pos_wait.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test | 12 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test | 68 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test | 100 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_typeconv.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test | 63 mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.opt | 1 mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.result | 26 mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.test | 19 mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.result | 47 mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.test | 50 mariadb-10.11.13/mysql-test/suite/sql_sequence/gtid.result | 2 mariadb-10.11.13/mysql-test/suite/sql_sequence/other.result | 1 mariadb-10.11.13/mysql-test/suite/sql_sequence/other.test | 1 mariadb-10.11.13/mysql-test/suite/sql_sequence/replication.result | 2 mariadb-10.11.13/mysql-test/suite/sql_sequence/view.test | 1 mariadb-10.11.13/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result | 30 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff | 125 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb.result | 46 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result | 4 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result | 4 
mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result | 51 mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result | 15 mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt | 1 mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt | 1 mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test | 37 mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.opt | 4 mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.test | 4 mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test | 46 mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test | 19 mariadb-10.11.13/mysql-test/suite/versioning/r/partition.result | 35 mariadb-10.11.13/mysql-test/suite/versioning/t/partition.test | 43 mariadb-10.11.13/mysql-test/suite/wsrep/README | 1 mariadb-10.11.13/mysql-test/suite/wsrep/include/check_galera_version.inc | 1 mariadb-10.11.13/mysql-test/suite/wsrep/r/plugin.result | 2 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result | 18 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result | 65 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff | 2 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result | 51 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result | 8 mariadb-10.11.13/mysql-test/suite/wsrep/suite.pm | 6 mariadb-10.11.13/mysql-test/suite/wsrep/t/binlog_format.cnf | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/foreign_key.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_10186.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_7798.cnf | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/plugin.test | 2 mariadb-10.11.13/mysql-test/suite/wsrep/t/pool_of_threads.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/variables.test | 1 
mariadb-10.11.13/mysql-test/suite/wsrep/t/variables_debug.test | 3 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf | 10 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test | 28 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf | 14 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test | 73 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover.cnf | 2 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf | 7 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test | 48 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf | 6 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test | 11 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_rpl.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf | 1 mariadb-10.11.13/mysys/CMakeLists.txt | 5 mariadb-10.11.13/mysys/mf_keycache.c | 9 mariadb-10.11.13/mysys/my_default.c | 3 mariadb-10.11.13/mysys/my_getopt.c | 3 mariadb-10.11.13/mysys/my_largepage.c | 111 mariadb-10.11.13/mysys/my_pread.c | 9 mariadb-10.11.13/mysys/my_virtual_mem.c | 201 mariadb-10.11.13/plugin/auth_examples/auth_0x0100.c | 4 mariadb-10.11.13/plugin/server_audit/server_audit.c | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.result | 23 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.test | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc | 13 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result | 12 mariadb-10.11.13/plugin/userstat/client_stats.cc | 4 
mariadb-10.11.13/plugin/versioning/versioning.cc | 1 mariadb-10.11.13/scripts/mysqlhotcopy.sh | 2 mariadb-10.11.13/scripts/wsrep_sst_common.sh | 13 mariadb-10.11.13/scripts/wsrep_sst_mariabackup.sh | 2 mariadb-10.11.13/scripts/wsrep_sst_mysqldump.sh | 4 mariadb-10.11.13/scripts/wsrep_sst_rsync.sh | 2 mariadb-10.11.13/sql/filesort.cc | 49 mariadb-10.11.13/sql/ha_partition.cc | 40 mariadb-10.11.13/sql/ha_sequence.cc | 6 mariadb-10.11.13/sql/ha_sequence.h | 3 mariadb-10.11.13/sql/handle_connections_win.cc | 3 mariadb-10.11.13/sql/handler.cc | 71 mariadb-10.11.13/sql/handler.h | 4 mariadb-10.11.13/sql/item.cc | 14 mariadb-10.11.13/sql/item.h | 47 mariadb-10.11.13/sql/item_cmpfunc.h | 30 mariadb-10.11.13/sql/item_func.cc | 10 mariadb-10.11.13/sql/item_func.h | 7 mariadb-10.11.13/sql/item_geofunc.cc | 26 mariadb-10.11.13/sql/item_jsonfunc.cc | 114 mariadb-10.11.13/sql/item_strfunc.cc | 10 mariadb-10.11.13/sql/item_subselect.cc | 24 mariadb-10.11.13/sql/item_subselect.h | 1 mariadb-10.11.13/sql/lex_string.h | 2 mariadb-10.11.13/sql/log.cc | 41 mariadb-10.11.13/sql/log.h | 1 mariadb-10.11.13/sql/mysql_install_db.cc | 23 mariadb-10.11.13/sql/mysql_upgrade_service.cc | 129 mariadb-10.11.13/sql/mysqld.cc | 41 mariadb-10.11.13/sql/mysqld.h | 1 mariadb-10.11.13/sql/net_serv.cc | 25 mariadb-10.11.13/sql/opt_range.cc | 91 mariadb-10.11.13/sql/opt_range.h | 29 mariadb-10.11.13/sql/rpl_injector.h | 1 mariadb-10.11.13/sql/rpl_mi.cc | 67 mariadb-10.11.13/sql/rpl_mi.h | 11 mariadb-10.11.13/sql/rpl_parallel.cc | 16 mariadb-10.11.13/sql/semisync_master.cc | 4 mariadb-10.11.13/sql/semisync_slave.cc | 9 mariadb-10.11.13/sql/semisync_slave.h | 2 mariadb-10.11.13/sql/signal_handler.cc | 2 mariadb-10.11.13/sql/slave.cc | 72 mariadb-10.11.13/sql/sp_head.cc | 12 mariadb-10.11.13/sql/sql_acl.cc | 17 mariadb-10.11.13/sql/sql_base.cc | 257 mariadb-10.11.13/sql/sql_base.h | 21 mariadb-10.11.13/sql/sql_cache.cc | 2 mariadb-10.11.13/sql/sql_class.cc | 18 mariadb-10.11.13/sql/sql_class.h | 5 
mariadb-10.11.13/sql/sql_cmd.h | 1 mariadb-10.11.13/sql/sql_db.cc | 46 mariadb-10.11.13/sql/sql_db.h | 4 mariadb-10.11.13/sql/sql_error.cc | 22 mariadb-10.11.13/sql/sql_insert.cc | 145 mariadb-10.11.13/sql/sql_insert.h | 2 mariadb-10.11.13/sql/sql_lex.cc | 45 mariadb-10.11.13/sql/sql_lex.h | 6 mariadb-10.11.13/sql/sql_parse.cc | 19 mariadb-10.11.13/sql/sql_prepare.cc | 4 mariadb-10.11.13/sql/sql_priv.h | 1 mariadb-10.11.13/sql/sql_reload.cc | 2 mariadb-10.11.13/sql/sql_select.cc | 164 mariadb-10.11.13/sql/sql_show.cc | 31 mariadb-10.11.13/sql/sql_statistics.cc | 11 mariadb-10.11.13/sql/sql_string.h | 2 mariadb-10.11.13/sql/sql_table.cc | 85 mariadb-10.11.13/sql/sql_trigger.cc | 7 mariadb-10.11.13/sql/sql_truncate.cc | 35 mariadb-10.11.13/sql/sql_update.cc | 5 mariadb-10.11.13/sql/sql_view.cc | 15 mariadb-10.11.13/sql/sql_yacc.yy | 2 mariadb-10.11.13/sql/structs.h | 2 mariadb-10.11.13/sql/sys_vars.cc | 11 mariadb-10.11.13/sql/table.cc | 48 mariadb-10.11.13/sql/table.h | 13 mariadb-10.11.13/sql/vers_string.h | 2 mariadb-10.11.13/sql/wsrep_applier.cc | 15 mariadb-10.11.13/sql/wsrep_client_service.cc | 6 mariadb-10.11.13/sql/wsrep_high_priority_service.cc | 4 mariadb-10.11.13/sql/wsrep_mysqld.cc | 114 mariadb-10.11.13/sql/wsrep_mysqld.h | 3 mariadb-10.11.13/sql/wsrep_server_service.cc | 1 mariadb-10.11.13/sql/wsrep_sst.cc | 11 mariadb-10.11.13/sql/wsrep_thd.h | 66 mariadb-10.11.13/sql/wsrep_trans_observer.h | 15 mariadb-10.11.13/sql/wsrep_var.cc | 55 mariadb-10.11.13/sql/wsrep_var.h | 3 mariadb-10.11.13/sql/wsrep_xid.cc | 43 mariadb-10.11.13/sql/wsrep_xid.h | 4 mariadb-10.11.13/sql/yy_mariadb.cc | 2 mariadb-10.11.13/sql/yy_oracle.cc | 2 mariadb-10.11.13/storage/connect/CMakeLists.txt | 6 mariadb-10.11.13/storage/connect/connect.cc | 8 mariadb-10.11.13/storage/connect/plgxml.h | 4 mariadb-10.11.13/storage/connect/tabxml.cpp | 3 mariadb-10.11.13/storage/connect/user_connect.cc | 19 mariadb-10.11.13/storage/federatedx/federatedx_io.cc | 1 
mariadb-10.11.13/storage/federatedx/ha_federatedx.cc | 23 mariadb-10.11.13/storage/innobase/CMakeLists.txt | 1 mariadb-10.11.13/storage/innobase/btr/btr0sea.cc | 104 mariadb-10.11.13/storage/innobase/buf/buf0buddy.cc | 327 mariadb-10.11.13/storage/innobase/buf/buf0buf.cc | 2299 +--- mariadb-10.11.13/storage/innobase/buf/buf0dblwr.cc | 73 mariadb-10.11.13/storage/innobase/buf/buf0dump.cc | 8 mariadb-10.11.13/storage/innobase/buf/buf0flu.cc | 239 mariadb-10.11.13/storage/innobase/buf/buf0lru.cc | 139 mariadb-10.11.13/storage/innobase/buf/buf0rea.cc | 7 mariadb-10.11.13/storage/innobase/dict/dict0defrag_bg.cc | 116 mariadb-10.11.13/storage/innobase/dict/dict0dict.cc | 244 mariadb-10.11.13/storage/innobase/dict/dict0load.cc | 2 mariadb-10.11.13/storage/innobase/dict/dict0stats.cc | 730 - mariadb-10.11.13/storage/innobase/dict/dict0stats_bg.cc | 22 mariadb-10.11.13/storage/innobase/fsp/fsp0fsp.cc | 33 mariadb-10.11.13/storage/innobase/fts/fts0config.cc | 2 mariadb-10.11.13/storage/innobase/fts/fts0fts.cc | 13 mariadb-10.11.13/storage/innobase/fts/fts0opt.cc | 2 mariadb-10.11.13/storage/innobase/gis/gis0sea.cc | 24 mariadb-10.11.13/storage/innobase/handler/ha_innodb.cc | 1556 +- mariadb-10.11.13/storage/innobase/handler/ha_innodb.h | 3 mariadb-10.11.13/storage/innobase/handler/handler0alter.cc | 138 mariadb-10.11.13/storage/innobase/handler/i_s.cc | 130 mariadb-10.11.13/storage/innobase/ibuf/ibuf0ibuf.cc | 30 mariadb-10.11.13/storage/innobase/include/btr0sea.h | 10 mariadb-10.11.13/storage/innobase/include/buf0buddy.h | 40 mariadb-10.11.13/storage/innobase/include/buf0buf.h | 446 mariadb-10.11.13/storage/innobase/include/buf0buf.inl | 2 mariadb-10.11.13/storage/innobase/include/buf0dblwr.h | 3 mariadb-10.11.13/storage/innobase/include/buf0lru.h | 4 mariadb-10.11.13/storage/innobase/include/dict0dict.h | 53 mariadb-10.11.13/storage/innobase/include/dict0dict.inl | 4 mariadb-10.11.13/storage/innobase/include/dict0mem.h | 105 
mariadb-10.11.13/storage/innobase/include/dict0stats.h | 141 mariadb-10.11.13/storage/innobase/include/dict0stats.inl | 219 mariadb-10.11.13/storage/innobase/include/fil0fil.h | 9 mariadb-10.11.13/storage/innobase/include/fsp0fsp.h | 6 mariadb-10.11.13/storage/innobase/include/ibuf0ibuf.h | 10 mariadb-10.11.13/storage/innobase/include/log0log.h | 191 mariadb-10.11.13/storage/innobase/include/log0recv.h | 12 mariadb-10.11.13/storage/innobase/include/mtr0mtr.h | 9 mariadb-10.11.13/storage/innobase/include/os0file.h | 2 mariadb-10.11.13/storage/innobase/include/row0row.h | 16 mariadb-10.11.13/storage/innobase/include/row0row.inl | 49 mariadb-10.11.13/storage/innobase/include/row0sel.h | 5 mariadb-10.11.13/storage/innobase/include/srv0srv.h | 21 mariadb-10.11.13/storage/innobase/include/trx0trx.h | 26 mariadb-10.11.13/storage/innobase/include/trx0types.h | 9 mariadb-10.11.13/storage/innobase/include/ut0new.h | 1 mariadb-10.11.13/storage/innobase/lock/lock0lock.cc | 72 mariadb-10.11.13/storage/innobase/log/log0crypt.cc | 2 mariadb-10.11.13/storage/innobase/log/log0log.cc | 283 mariadb-10.11.13/storage/innobase/log/log0recv.cc | 159 mariadb-10.11.13/storage/innobase/mtr/mtr0mtr.cc | 272 mariadb-10.11.13/storage/innobase/os/os0file.cc | 22 mariadb-10.11.13/storage/innobase/pars/pars0pars.cc | 5 mariadb-10.11.13/storage/innobase/row/row0ins.cc | 151 mariadb-10.11.13/storage/innobase/row/row0log.cc | 13 mariadb-10.11.13/storage/innobase/row/row0mysql.cc | 20 mariadb-10.11.13/storage/innobase/row/row0purge.cc | 2 mariadb-10.11.13/storage/innobase/row/row0sel.cc | 120 mariadb-10.11.13/storage/innobase/row/row0uins.cc | 10 mariadb-10.11.13/storage/innobase/row/row0umod.cc | 7 mariadb-10.11.13/storage/innobase/row/row0upd.cc | 4 mariadb-10.11.13/storage/innobase/srv/srv0mon.cc | 17 mariadb-10.11.13/storage/innobase/srv/srv0srv.cc | 35 mariadb-10.11.13/storage/innobase/srv/srv0start.cc | 46 mariadb-10.11.13/storage/innobase/trx/trx0purge.cc | 83 
mariadb-10.11.13/storage/innobase/trx/trx0rec.cc | 26 mariadb-10.11.13/storage/innobase/trx/trx0trx.cc | 5 mariadb-10.11.13/storage/innobase/ut/ut0rnd.cc | 2 mariadb-10.11.13/storage/maria/ma_control_file.c | 45 mariadb-10.11.13/storage/maria/ma_pagecache.c | 8 mariadb-10.11.13/storage/maria/ma_unique.c | 6 mariadb-10.11.13/storage/mroonga/CMakeLists.txt | 2 mariadb-10.11.13/storage/mroonga/ha_mroonga.cpp | 8 mariadb-10.11.13/storage/mroonga/vendor/groonga/CMakeLists.txt | 2 mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/db.c | 4 mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/hash.c | 14 mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/ii.c | 4 mariadb-10.11.13/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt | 2 mariadb-10.11.13/storage/myisam/mi_unique.c | 6 mariadb-10.11.13/storage/rocksdb/build_rocksdb.cmake | 112 mariadb-10.11.13/storage/rocksdb/ha_rocksdb.cc | 11 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result | 10 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result | 10 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result | 2 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result | 2 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result | 10 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc | 15 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result | 20 mariadb-10.11.13/storage/rocksdb/rdb_i_s.cc | 5 mariadb-10.11.13/storage/rocksdb/rdb_source_revision.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/config.yml | 872 + mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 | 35 
mariadb-10.11.13/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/.gitignore | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/.travis.yml | 256 mariadb-10.11.13/storage/rocksdb/rocksdb/CMakeLists.txt | 523 mariadb-10.11.13/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/HISTORY.md | 836 + mariadb-10.11.13/storage/rocksdb/rocksdb/INSTALL.md | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/Makefile | 1873 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/PLUGINS.md | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/README.md | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/TARGETS | 1090 +- mariadb-10.11.13/storage/rocksdb/rocksdb/USERS.md | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/WINDOWS_PORT.md | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/appveyor.yml | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_builder.py | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_cfg.py | 125 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/build_detect_platform | 282 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/check-sources.sh | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/dependencies_platform009.sh | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config.sh | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config4.8.1.sh | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config_platform007.sh | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config_platform009.sh | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/format-diff.sh | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/gnu_parallel | 52 
mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/make_package.sh | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator | 1063 + mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench.cc | 275 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc | 794 + mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.h | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_stats.h | 183 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_helpers.h | 125 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.cc | 271 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.h | 132 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc | 188 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h | 191 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc | 506 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_test.cc | 129 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/clock_cache.cc | 180 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.cc | 496 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.h | 192 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache_test.cc | 1660 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.cc | 112 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.h | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake | 29 
mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/coverage_test.sh | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc | 115 mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_constants.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc | 326 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h | 37 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc | 156 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc | 210 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc | 375 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc | 672 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc | 102 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h | 52 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h | 57 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc | 62 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h | 170 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc | 582 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h | 106 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc | 974 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc | 100 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h | 102 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc | 196 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_index.h | 187 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc | 145 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.h | 149 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc | 132 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc | 172 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc | 1026 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc | 718 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc | 82 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc | 572 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob_index.h | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.cc | 335 mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.h | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/db/c.cc | 1248 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/c_test.c | 1230 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.cc | 338 mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.h | 176 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family_test.cc | 264 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compact_files_test.cc | 126 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.cc | 160 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.h | 113 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h | 275 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc | 258 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.cc | 140 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc | 853 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h | 289 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc | 500 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc | 1893 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.h | 253 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc | 61 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc | 478 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc | 115 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc | 182 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc | 1111 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc | 437 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc | 825 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/file_pri.h | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/db/comparator_db_test.cc | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/db/convenience.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/db/corruption_test.cc | 541 mariadb-10.11.13/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_basic_test.cc | 2583 +++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_blob_index_test.cc | 436 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_block_cache_test.cc | 1228 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc | 952 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc | 289 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_test.cc | 3058 ++++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_encryption_test.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_filesnapshot.cc | 413 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_flush_test.cc | 1958 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h | 118 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc | 1790 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.h | 559 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc | 1399 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc | 465 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc | 858 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc | 245 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc | 601 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc | 869 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.cc | 63 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_io_failure_test.cc | 135 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.cc | 686 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.h | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_test.cc | 581 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iterator_test.cc | 402 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc | 197 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_log_iter_test.cc | 127 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc | 513 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_memtable_test.cc | 89 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc | 264 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_options_test.cc | 473 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_properties_test.cc | 520 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_range_del_test.cc | 407 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_secondary_test.cc | 1260 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_sst_test.cc | 640 + 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_statistics_test.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_table_properties_test.cc | 347 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test.cc | 1522 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test2.cc | 2788 ++++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.cc | 344 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.h | 475 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc | 383 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_wal_test.cc | 1151 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc | 3217 +++++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc | 121 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc | 793 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_test.cc | 199 mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.cc | 81 mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.h | 233 mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat_test.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/db/deletefile_test.cc | 108 mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.cc | 504 mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.h | 68 mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc | 2663 ++++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_test.cc | 871 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.cc | 167 mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.h | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc | 732 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc | 389 mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_test.cc | 531 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/fault_injection_test.cc | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/db/filename_test.cc | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.cc | 688 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.h | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job_test.cc | 358 mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_scheduler.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.cc | 99 mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.cc | 80 mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.h | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_test.cc | 151 mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.cc | 559 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.h | 167 mariadb-10.11.13/storage/rocksdb/rocksdb/db/job_context.h | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/kv_checksum.h | 394 mariadb-10.11.13/storage/rocksdb/rocksdb/db/listener_test.cc | 681 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.cc | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.h | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_test.cc | 171 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.cc | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/db/lookup_key.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/db/malloc_stats.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/db/manual_compaction_test.cc | 209 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.cc | 353 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.h | 163 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.cc | 439 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.h | 109 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list_test.cc | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_context.h | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.cc | 97 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.h | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper_test.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_test.cc | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/db/obsolete_files_test.cc | 161 mariadb-10.11.13/storage/rocksdb/rocksdb/db/options_file_test.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/db/perf_context_test.cc | 108 mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc | 117 mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h | 78 mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc | 236 mariadb-10.11.13/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/db/plain_table_db_test.cc | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/db/pre_release_callback.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/prefix_test.cc | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/db/read_callback.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair.cc | 227 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair_test.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/db/snapshot_impl.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.h | 121 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.h | 88 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.cc | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.cc | 1324 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder_test.cc | 1452 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.cc | 366 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.h | 241 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.cc | 980 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.h | 309 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_test.cc | 373 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.cc | 2965 ++--- mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.h | 477 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set_test.cc | 2140 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.cc | 204 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.h | 166 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit_test.cc | 214 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.cc | 78 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager_test.cc | 99 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch.cc | 963 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_internal.h | 181 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_test.cc | 414 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_callback_test.cc | 501 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.cc | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.h | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller_test.cc | 328 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.cc | 31 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc | 138 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h | 125 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc | 289 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h | 207 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc | 1122 +- 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc | 616 + mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h | 287 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc | 1037 + mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h | 302 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc | 282 mariadb-10.11.13/storage/rocksdb/rocksdb/defs.bzl | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile.lock | 331 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_config.yml | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/authors.yml | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/nav.yml | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_docs/getting-started.md | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_includes/doc.html | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown | 17 
mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown | 195 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown | 157 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown | 281 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_top-level/support.md | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env.cc | 464 mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env_wrapper.h | 1101 -- mariadb-10.11.13/storage/rocksdb/rocksdb/env/emulated_clock.h | 114 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env.cc | 960 + mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_basic_test.cc | 204 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.cc | 349 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.h | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption.cc | 1385 +- mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption_ctr.h | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_hdfs.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_posix.cc | 340 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_test.cc | 1320 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system.cc | 189 mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.cc | 519 mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.h | 447 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_posix.cc | 416 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_readonly.h | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.cc | 306 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.h | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.cc | 518 mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.h | 141 mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix_test.cc | 140 mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.cc | 742 - mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.h | 191 
mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env_test.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.cc | 164 mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/CMakeLists.txt | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/Makefile | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/c_simple_example.c | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/column_families_example.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compact_files_example.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/multi_processes_example.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/options_file_example.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/simple_example.cc | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/transaction_example.cc | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.cc | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.h | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h | 118 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.cc | 208 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.cc | 114 mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.cc | 68 mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.h | 59 mariadb-10.11.13/storage/rocksdb/rocksdb/file/prefetch_test.cc | 1004 + mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.cc | 363 
mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.h | 120 mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc | 483 mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.cc | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_file_info.h | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.cc | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.h | 84 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc | 104 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h | 57 mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.cc | 599 - mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.h | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/Makefile | 61 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/README.md | 160 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc | 164 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc | 185 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/util.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/hdfs/env_hdfs.h | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h | 286 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/c.h | 618 + mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache.h | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h | 146 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/comparator.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/configurable.h | 397 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/convenience.h | 199 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/customizable.h | 233 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/db.h | 420 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env.h | 465 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h | 465 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_system.h | 423 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h | 150 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/io_status.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iterator.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/listener.h | 342 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/metadata.h | 278 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/options.h | 534 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/statistics.h | 196 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/status.h | 222 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table.h | 308 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h | 187 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/types.h | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h | 616 + 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h | 335 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h | 368 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h | 473 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h | 946 + mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h | 62 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h | 145 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h | 5 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/version.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/java/CMakeLists.txt | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/java/Makefile | 213 mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/README.md | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/pom.xml | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java | 100 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/java/pom.xml.template | 178 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni.pom | 150 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/cache.cc | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc | 88 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc | 502 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h | 122 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc | 23 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options.cc | 1388 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/portal.h | 1215 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc | 735 + mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/slice.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/table.cc | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc | 216 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java | 31 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java | 334 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java | 254 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java | 167 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java | 448 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java | 44 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java | 47 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java | 227 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java | 295 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java | 335 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java | 112 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java | 186 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java | 60 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java | 157 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java | 35 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java | 109 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java | 256 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java | 387 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java | 271 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java | 805 + mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java | 107 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java | 112 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java | 63 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java | 313 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java | 126 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java | 513 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java | 130 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java | 763 + mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java | 654 - mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java | 525 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java | 397 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java | 188 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java | 191 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java | 62 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java | 135 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java | 193 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/understanding_options.md | 79 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc | 79 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc | 154 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger_test.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/event_logger.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/logging.h | 
4 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/posix_logger.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/concurrent_arena.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc | 281 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator.cc | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc | 243 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_usage.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist.h | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplist_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplistrep.cc | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/vectorrep.cc | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc | 225 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc | 236 mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/CMakeLists.txt | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc | 156 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.cc | 26 
mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_test.cc | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context.cc | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.cc | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics_test.cc | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc | 396 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.cc | 935 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.h | 145 mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable.cc | 785 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_helper.h | 187 mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.cc | 880 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.h | 126 
mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable.cc | 137 mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable_test.cc | 2132 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.cc | 785 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.h | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options.cc | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.cc | 2621 +--- mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.h | 204 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.cc | 559 - mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.h | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.cc | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.h | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_settable_test.cc | 189 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_test.cc | 3434 +++++- mariadb-10.11.13/storage/rocksdb/rocksdb/plugin/README.md | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/port/jemalloc_helper.h | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/port/lang.h | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_example.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.cc | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.h | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.cc | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/port/sys_time.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_default.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.cc | 1027 - mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.h | 413 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.cc | 603 - mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.h | 270 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.cc | 126 
mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.h | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.cc | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.cc | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/src.mk | 239 mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc | 73 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.cc | 499 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.h | 339 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc | 1682 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc | 754 - mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h | 152 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc | 382 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h | 273 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc | 3001 +---- mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h | 503 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h | 163 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc | 357 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.cc | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h | 225 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc | 100 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_test.cc | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_type.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block.h | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc | 1057 + mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h | 117 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc | 95 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc | 147 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.cc | 47 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc | 55 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc | 252 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc | 162 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h | 159 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc | 207 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.cc | 52 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.h | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.cc | 257 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.h | 69 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher_test.cc | 521 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc | 79 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h | 27 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.cc | 496 mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.h | 261 mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.cc | 192 mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/table/internal_iterator.h | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/table/iterator_wrapper.h | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/table/merger_test.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.cc | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.cc | 363 mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.cc | 255 mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.h | 178 mariadb-10.11.13/storage/rocksdb/rocksdb/table/multiget_context.h | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_options.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc | 340 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h | 61 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.cc | 502 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.h | 97 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader.cc | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc | 266 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer.cc | 185 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_builder.h | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_factory.cc | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties.cc | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties_internal.h | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_bench.cc | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_caller.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_test.cc | 2268 ++-- mariadb-10.11.13/storage/rocksdb/rocksdb/table/two_level_iterator.cc | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id.cc | 166 mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id_impl.h | 59 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc | 437 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h | 225 
mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.cc | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.h | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.h | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.h | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.cc | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.cc | 464 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.h | 677 - mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil_test.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h | 118 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/gcc/ppc-asm.h | 390 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/CMakeLists.txt | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/CMakeLists.txt | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/advisor/README.md | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/backup_db.sh | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/benchmark.sh | 343 
mariadb-10.11.13/storage/rocksdb/rocksdb/tools/blob_dump.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/check_all_python.py | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/check_format_compatible.sh | 361 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_bench.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_bench_tool.cc | 2086 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_bench_tool_test.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_crashtest.py | 442 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_repl_stress.cc | 121 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser.cc | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser_test.cc | 189 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser_tool.cc | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser_tool.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_cmd.cc | 932 + mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_cmd_impl.h | 78 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_cmd_test.cc | 539 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_test.py | 208 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_tool.cc | 37 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/rdb/db_wrapper.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/rdb/db_wrapper.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/rdb/rdb.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/reduce_levels_test.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/regression_test.sh | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/report_lite_binary_size.sh | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/restore_db.sh | 
15 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/run_blob_bench.sh | 195 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc | 246 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h | 126 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump_test.cc | 245 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump_tool.cc | 660 - mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump_tool_imp.h | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/trace_analyzer_test.cc | 279 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/trace_analyzer_tool.cc | 585 - mariadb-10.11.13/storage/rocksdb/rocksdb/tools/trace_analyzer_tool.h | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/write_external_sst.sh | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/write_stress.cc | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/write_stress_runner.py | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/block_cache_tracer.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/block_cache_tracer_test.cc | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/io_tracer.cc | 303 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/io_tracer.h | 185 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/io_tracer_test.cc | 352 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record.cc | 206 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record_handler.cc | 190 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record_handler.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record_result.cc | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_replay.cc | 817 - mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_replay.h | 172 mariadb-10.11.13/storage/rocksdb/rocksdb/util/aligned_buffer.h | 11 
mariadb-10.11.13/storage/rocksdb/rocksdb/util/autovector.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/util/autovector_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/bloom_impl.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/util/bloom_test.cc | 724 + mariadb-10.11.13/storage/rocksdb/rocksdb/util/build_version.cc.in | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/util/build_version.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/util/cast_util.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/util/channel.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/coding.h | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/util/coding_lean.h | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/util/coding_test.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/compaction_job_stats_impl.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/util/comparator.cc | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/util/compression.h | 221 mariadb-10.11.13/storage/rocksdb/rocksdb/util/compression_context_cache.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c.cc | 221 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_arm64.cc | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_arm64.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_ppc.c | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_ppc.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_ppc_asm.S | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_test.cc | 47 mariadb-10.11.13/storage/rocksdb/rocksdb/util/defer.h | 31 mariadb-10.11.13/storage/rocksdb/rocksdb/util/defer_test.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/util/duplicate_detector.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/dynamic_bloom.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/util/dynamic_bloom_test.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/util/fastrange.h | 114 mariadb-10.11.13/storage/rocksdb/rocksdb/util/file_checksum_helper.cc | 95 
mariadb-10.11.13/storage/rocksdb/rocksdb/util/file_checksum_helper.h | 95 mariadb-10.11.13/storage/rocksdb/rocksdb/util/file_reader_writer_test.cc | 613 - mariadb-10.11.13/storage/rocksdb/rocksdb/util/filelock_test.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/util/filter_bench.cc | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/util/gflags_compat.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash.cc | 128 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash.h | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash128.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash_map.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash_test.cc | 545 - mariadb-10.11.13/storage/rocksdb/rocksdb/util/heap.h | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/util/kv_map.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/log_write_bench.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/util/math.h | 242 mariadb-10.11.13/storage/rocksdb/rocksdb/util/math128.h | 310 mariadb-10.11.13/storage/rocksdb/rocksdb/util/murmurhash.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/murmurhash.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/util/mutexlock.h | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/util/random.cc | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/util/random.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/util/rate_limiter.cc | 439 mariadb-10.11.13/storage/rocksdb/rocksdb/util/rate_limiter.h | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/util/rate_limiter_test.cc | 386 mariadb-10.11.13/storage/rocksdb/rocksdb/util/regex.cc | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/util/repeatable_thread.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/util/repeatable_thread_test.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_alg.h | 1225 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_config.cc | 506 mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_config.h | 182 mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_impl.h | 1137 ++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_test.cc | 1308 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/util/set_comparator.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/slice.cc | 223 mariadb-10.11.13/storage/rocksdb/rocksdb/util/slice_test.cc | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/util/status.cc | 37 mariadb-10.11.13/storage/rocksdb/rocksdb/util/stop_watch.h | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/util/string_util.cc | 106 mariadb-10.11.13/storage/rocksdb/rocksdb/util/string_util.h | 55 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_guard.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_list_test.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_local.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_local.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_local_test.cc | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/util/threadpool_imp.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/util/threadpool_imp.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/timer.h | 331 mariadb-10.11.13/storage/rocksdb/rocksdb/util/timer_test.cc | 402 mariadb-10.11.13/storage/rocksdb/rocksdb/util/user_comparator_wrapper.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/util/util.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/util/vector_iterator.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/util/work_queue.h | 150 mariadb-10.11.13/storage/rocksdb/rocksdb/util/work_queue_test.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxh3p.h | 1648 --- mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxhash.cc | 1181 -- mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxhash.h | 5444 +++++++++- mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxph3.h | 1762 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/backupable/backupable_db.cc | 2543 +++- mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/backupable/backupable_db_impl.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/backupable/backupable_db_test.cc | 
2557 ++++ mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc | 409 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h | 130 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db.cc | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_iterator.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc | 567 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_file.cc | 106 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_file.h | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_format.cc | 149 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_format.h | 133 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_reader.cc | 105 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_reader.h | 82 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_writer.cc | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_writer.h | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cache_dump_load.cc | 69 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc | 489 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cache_dump_load_impl.h | 365 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc | 73 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.h | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_format_test.cc | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc | 206 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_options.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_serialize_test.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/format.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/merge_operator.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/merge_operator.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/serialize.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/test_utils.cc | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/test_utils.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc | 340 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters.cc | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/convenience/info_log_finder.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/debug.cc | 12 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_librados.cc | 104 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_librados.md | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_librados_test.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_mirror.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_timed.cc | 286 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_timed.h | 97 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_timed_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_env.cc | 548 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_env.h | 258 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_fs.cc | 994 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_fs.h | 582 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc | 110 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/memory/memory_test.cc | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/memory_allocators.h | 104 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators.cc | 120 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/bytesxor.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/bytesxor.h | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/max.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/put.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/sortlist.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/sortlist.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.cc | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.h | 9 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc | 252 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/uint64add.cc | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/object_registry.cc | 227 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/object_registry_test.cc | 619 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/options/options_util.cc | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/options/options_util_test.cc | 519 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h | 12 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc | 84 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc | 57 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc | 199 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc | 138 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/replayer_impl.cc | 316 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/replayer_impl.h | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h | 82 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/lock_tracker.h | 209 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc | 718 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h | 223 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc | 181 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h | 319 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc | 270 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h | 99 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_lock_manager.h | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc | 422 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 | 661 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 | 174 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 | 339 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h | 76 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h | 138 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h | 102 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h | 174 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc | 222 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h | 141 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc | 525 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h | 253 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc | 1024 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h | 580 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc | 527 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc | 265 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h | 178 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc | 520 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h | 302 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc | 120 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc | 213 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h | 124 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h | 215 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h | 130 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h | 286 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h | 520 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h | 176 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc | 132 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc | 153 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc | 201 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h | 141 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h | 794 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h | 1295 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h | 165 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h | 76 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc | 503 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h | 137 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc | 156 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc | 672 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc | 200 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_base.cc | 267 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_base.h | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_lock_mgr.cc | 745 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_lock_mgr.h | 158 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_test.cc | 568 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_test.h | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_util.cc | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_util.h | 52 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc | 772 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc | 78 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc | 105 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc | 143 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc | 375 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h | 231 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/ttl/ttl_test.cc | 276 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/wal_filter.cc | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc | 669 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc | 655 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h | 196 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc | 1766 ++- mariadb-10.11.13/storage/spider/CMakeLists.txt | 11 mariadb-10.11.13/storage/spider/ha_spider.cc | 27 mariadb-10.11.13/storage/spider/mysql-test/spider/bg/t/basic_sql.test | 3 mariadb-10.11.13/storage/spider/mysql-test/spider/bg/t/ha.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bg/t/ha_part.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/include/direct_sql_with_comma_pwd_init.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/include/direct_sql_with_tmp_table_init.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/direct_sql_with_comma_pwd.result | 4 
mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/direct_sql_with_tmp_table.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_26345.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29002.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29163.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29502.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29605.result | 19 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29962.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_30392.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_30408.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_31338.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_31645.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_34003.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_35807.result | 16 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_35874.result | 51 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_35959.result | 25 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/subquery.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/udf_mysql_func_early.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/udf_mysql_func_early_init_file.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/checksum_table_with_quick_mode_3.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/cp932_column.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/delete_with_float_column.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/group_by_order_by_limit.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/insert_select.test | 2 
mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_19866.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_20100.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_20502.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_21884.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_26345.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_27172.test | 8 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29002.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29008.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29163.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29502.test | 5 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29605.test | 25 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29962.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30392.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30408.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30649.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30727.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_31338.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_31645.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_34003.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_34659.test | 3 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_35807.test | 21 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_35874.test | 53 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_35959.test | 30 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_0.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_1.test | 12 
mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_2.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_3.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/return_found_rows_insert.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/return_found_rows_update.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/select_by_null.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/select_with_backquote.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/slave_trx_isolation.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/sql_mode.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/subquery.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/udf_mysql_func_early.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/wrapper_mariadb.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/xa_cmd.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/feature/r/pushdown_case.result | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/feature/t/checksum_table_parallel.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/feature/t/pushdown_case.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e1121/t/direct_join_by_pkey_key.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e1121/t/direct_join_by_pkey_pkey.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e1121/t/load_data.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e112122/t/group_by_order_by_limit_ok.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e112122/t/load_data_part.inc | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/auto_increment.test | 4 
mariadb-10.11.13/storage/spider/mysql-test/spider/t/checksum_table_with_quick_mode_3.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_join.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_join_using.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_join.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_right_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_right_left_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_join.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_left_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_left_right_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/ha.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/ha_part.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/partition_cond_push.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/t/partition_fulltext.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/t/partition_join_pushdown_for_single_partition.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/t/pushdown_not_like.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_0.test | 10 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_1.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_2.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_3.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/t/slave_trx_isolation.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/timestamp.test | 28 mariadb-10.11.13/storage/spider/mysql-test/spider/t/udf_pushdown.inc | 4 
mariadb-10.11.13/storage/spider/spd_db_conn.cc | 39 mariadb-10.11.13/storage/spider/spd_db_include.h | 4 mariadb-10.11.13/storage/spider/spd_db_mysql.cc | 4 mariadb-10.11.13/storage/spider/spd_direct_sql.cc | 4 mariadb-10.11.13/storage/spider/spd_group_by_handler.cc | 10 mariadb-10.11.13/storage/spider/spd_table.cc | 4 mariadb-10.11.13/storage/spider/spd_trx.cc | 219 mariadb-10.11.13/storage/spider/spd_trx.h | 5 mariadb-10.11.13/strings/ctype-bin.c | 2 mariadb-10.11.13/strings/ctype-latin1.c | 3 mariadb-10.11.13/strings/ctype-mb.c | 2 mariadb-10.11.13/strings/ctype-simple.c | 2 mariadb-10.11.13/strings/ctype-uca.inl | 2 mariadb-10.11.13/strings/ctype-ucs2.c | 10 mariadb-10.11.13/strings/ctype-utf8.c | 4 mariadb-10.11.13/strings/json_lib.c | 10 mariadb-10.11.13/strings/strings_def.h | 2 mariadb-10.11.13/support-files/mariadb.service.in | 8 mariadb-10.11.13/support-files/mariadb@.service.in | 8 mariadb-10.11.13/support-files/rpm/server-prein.sh | 23 mariadb-10.11.13/tests/mysql_client_fw.c | 4 mariadb-10.11.13/tests/mysql_client_test.c | 210 mariadb-10.11.13/tpool/aio_liburing.cc | 10 mariadb-10.11.13/tpool/tpool_generic.cc | 1 mariadb-10.11.13/win/packaging/ca/CMakeLists.txt | 5 mariadb-10.11.13/win/upgrade_wizard/CMakeLists.txt | 20 mariadb-10.11.13/wsrep-lib/.github/workflows/cmake.yml | 71 mariadb-10.11.13/wsrep-lib/.gitignore | 3 mariadb-10.11.13/wsrep-lib/CMakeLists.txt | 2 mariadb-10.11.13/wsrep-lib/CONTRIBUTORS.txt | 1 mariadb-10.11.13/wsrep-lib/cmake/boost.cmake | 2 mariadb-10.11.13/wsrep-lib/include/wsrep/client_state.hpp | 6 mariadb-10.11.13/wsrep-lib/include/wsrep/connection_monitor_service.hpp | 71 mariadb-10.11.13/wsrep-lib/include/wsrep/id.hpp | 5 mariadb-10.11.13/wsrep-lib/include/wsrep/provider.hpp | 26 mariadb-10.11.13/wsrep-lib/include/wsrep/seqno.hpp | 5 mariadb-10.11.13/wsrep-lib/include/wsrep/server_state.hpp | 44 mariadb-10.11.13/wsrep-lib/include/wsrep/storage_service.hpp | 11 mariadb-10.11.13/wsrep-lib/include/wsrep/transaction.hpp | 6 
mariadb-10.11.13/wsrep-lib/include/wsrep/view.hpp | 4 mariadb-10.11.13/wsrep-lib/src/CMakeLists.txt | 1 mariadb-10.11.13/wsrep-lib/src/client_state.cpp | 16 mariadb-10.11.13/wsrep-lib/src/config_service_v1.cpp | 5 mariadb-10.11.13/wsrep-lib/src/connection_monitor_service_v1.cpp | 142 mariadb-10.11.13/wsrep-lib/src/connection_monitor_service_v1.hpp | 56 mariadb-10.11.13/wsrep-lib/src/id.cpp | 34 mariadb-10.11.13/wsrep-lib/src/provider.cpp | 7 mariadb-10.11.13/wsrep-lib/src/server_state.cpp | 47 mariadb-10.11.13/wsrep-lib/src/transaction.cpp | 21 mariadb-10.11.13/wsrep-lib/src/view.cpp | 2 mariadb-10.11.13/wsrep-lib/src/wsrep_provider_v26.cpp | 33 mariadb-10.11.13/wsrep-lib/src/wsrep_provider_v26.hpp | 2 mariadb-10.11.13/wsrep-lib/test/id_test.cpp | 54 mariadb-10.11.13/wsrep-lib/test/mock_provider.hpp | 22 mariadb-10.11.13/wsrep-lib/test/mock_server_state.hpp | 29 mariadb-10.11.13/wsrep-lib/test/test_utils.cpp | 16 mariadb-10.11.13/wsrep-lib/test/test_utils.hpp | 2 mariadb-10.11.13/wsrep-lib/test/transaction_test.cpp | 4 mariadb-10.11.13/wsrep-lib/test/transaction_test_2pc.cpp | 45 mariadb-10.11.13/wsrep-lib/test/transaction_test_xa.cpp | 29 mariadb-10.11.13/wsrep-lib/wsrep-API/v26/CONTRIBUTORS.txt | 1 mariadb-10.11.13/wsrep-lib/wsrep-API/v26/wsrep_connection_monitor_service.h | 134 2471 files changed, 239030 insertions(+), 65503 deletions(-) diff -Nru mariadb-10.11.11/CMakeLists.txt mariadb-10.11.13/CMakeLists.txt --- mariadb-10.11.11/CMakeLists.txt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/CMakeLists.txt 2025-05-19 16:14:23.000000000 +0000 @@ -14,7 +14,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12) +CMAKE_MINIMUM_REQUIRED(VERSION 2.8...3.12) IF(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) # Setting build type to RelWithDebInfo as none was specified. 
@@ -31,7 +31,7 @@ # in RPM's: #set(CPACK_RPM_SPEC_MORE_DEFINE "%define __spec_install_post /bin/true") -FOREACH(p CMP0022 CMP0046 CMP0040 CMP0048 CMP0054 CMP0067 CMP0074 CMP0075 CMP0069 CMP0135) +FOREACH(p CMP0022 CMP0046 CMP0040 CMP0048 CMP0054 CMP0056 CMP0067 CMP0074 CMP0075 CMP0069 CMP0135 CMP0091) IF(POLICY ${p}) CMAKE_POLICY(SET ${p} NEW) ENDIF() @@ -246,7 +246,7 @@ OPTION(WITH_MSAN "Enable memory sanitizer" OFF) IF (WITH_MSAN) - MY_CHECK_AND_SET_COMPILER_FLAG("-fsanitize=memory -fsanitize-memory-track-origins -U_FORTIFY_SOURCE" DEBUG RELWITHDEBINFO) + MY_CHECK_AND_SET_COMPILER_FLAG("-fsanitize=memory -fsanitize-memory-track-origins -U_FORTIFY_SOURCE") IF(NOT (have_C__fsanitize_memory__fsanitize_memory_track_origins__U_FORTIFY_SOURCE AND have_CXX__fsanitize_memory__fsanitize_memory_track_origins__U_FORTIFY_SOURCE)) MESSAGE(FATAL_ERROR "Compiler doesn't support -fsanitize=memory flags") @@ -256,7 +256,7 @@ MESSAGE(FATAL_ERROR "C++ Compiler requires support for -stdlib=libc++") ENDIF() SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") - MY_CHECK_AND_SET_LINKER_FLAG("-fsanitize=memory" DEBUG RELWITHDEBINFO) + MY_CHECK_AND_SET_LINKER_FLAG("-fsanitize=memory") IF(NOT HAVE_LINK_FLAG__fsanitize_memory) MESSAGE(FATAL_ERROR "Linker doesn't support -fsanitize=memory flags") ENDIF() @@ -633,7 +633,7 @@ perror replace) IF(WIN32) - ADD_DEPENDENCIES(minbuild echo mariadb-install-db my_safe_kill) + ADD_DEPENDENCIES(minbuild echo mariadb-install-db my_safe_kill mariadb-upgrade-service) ENDIF() ADD_CUSTOM_TARGET(smoketest COMMAND perl ./mysql-test-run.pl main.1st diff -Nru mariadb-10.11.11/Docs/INFO_SRC mariadb-10.11.13/Docs/INFO_SRC --- mariadb-10.11.11/Docs/INFO_SRC 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/Docs/INFO_SRC 2025-05-19 16:14:28.000000000 +0000 @@ -1,8 +1,8 @@ -commit: e69f8cae1a15e15b9e4f5e0f8497e1f17bdc81a4 -date: 2025-01-30 11:55:13 +0100 -build-date: 2025-01-30 11:01:27 +0000 -short: e69f8cae1a1 +commit: 
8fb09426b98583916ccfd4f8c49741adc115bac3 +date: 2025-05-13 12:27:50 +0300 +build-date: 2025-05-19 16:14:28 +0000 +short: 8fb09426b98 branch: HEAD -MariaDB source 10.11.11 +MariaDB source 10.11.13 diff -Nru mariadb-10.11.11/VERSION mariadb-10.11.13/VERSION --- mariadb-10.11.11/VERSION 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/VERSION 2025-05-19 16:14:23.000000000 +0000 @@ -1,4 +1,4 @@ MYSQL_VERSION_MAJOR=10 MYSQL_VERSION_MINOR=11 -MYSQL_VERSION_PATCH=11 +MYSQL_VERSION_PATCH=13 SERVER_MATURITY=stable diff -Nru mariadb-10.11.11/appveyor.yml mariadb-10.11.13/appveyor.yml --- mariadb-10.11.11/appveyor.yml 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/appveyor.yml 2025-05-19 16:14:23.000000000 +0000 @@ -1,6 +1,42 @@ version: build-{build}~branch-{branch} -clone_depth: 1 +clone_depth: 10 + +skip_branch_with_pr: true +before_build: + - ps: | + function Get-Remote-Ref($ref) { + try { + $result = git ls-remote origin $ref 2>$null + if (-not $result) { + "Warning: Could not fetch remote ref '$ref'" + return $null + } + return ($result -split "`t")[0] + } catch { + "Warning: Exception while running git ls-remote for '$ref': $_" + return $null + } + } + Get-ChildItem Env: | Where-Object { $_.Name -like 'APPVEYOR*COMMIT' } | ForEach-Object { "$($_.Name)=$($_.Value)" } + $commit = $env:APPVEYOR_REPO_COMMIT + $commit2 = $env:APPVEYOR_PULL_REQUEST_HEAD_COMMIT + $branch = $env:APPVEYOR_REPO_BRANCH + $latest = $null + $mainBranch = $branch -match '^(main|\d+\.\d+)$' + if ($env:APPVEYOR_PULL_REQUEST_NUMBER -eq $null) { + "Branch build detected" + $latest = Get-Remote-Ref "refs/heads/$branch" + } else { + $pr = $env:APPVEYOR_PULL_REQUEST_NUMBER + $latest = Get-Remote-Ref "refs/pull/$pr/head" + $mainBranch = $False + "Pull Request build detected" + } + if ($latest -and ($commit -ne $latest) -and ($commit2 -ne $latest) -and (-not $mainBranch)) { + "Skipping outdated commit (latest is $latest)" + Exit-AppVeyorBuild + } build_script: # dump some system info diff 
-Nru mariadb-10.11.11/client/mysql_upgrade.c mariadb-10.11.13/client/mysql_upgrade.c --- mariadb-10.11.11/client/mysql_upgrade.c 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysql_upgrade.c 2025-05-19 16:14:24.000000000 +0000 @@ -855,8 +855,7 @@ s= strchr(version, '.'); s= strchr(s + 1, '.'); - if (strncmp(upgrade_from_version, version, - (size_t)(s - version + 1))) + if (strncmp(upgrade_from_version, version, (size_t)(s - version + 1))) { if (calc_server_version(upgrade_from_version) <= MYSQL_VERSION_ID) { @@ -870,9 +869,14 @@ } if (!silent) { - verbose("This installation of MariaDB is already upgraded to %s.\n" - "There is no need to run mysql_upgrade again for %s.", - upgrade_from_version, version); + if (strcmp(upgrade_from_version, version)) + verbose("This installation of MariaDB is already upgraded to %s.\n" + "There is no need to run mysql_upgrade again for %s, because " + "they're both %.*s.", + upgrade_from_version, version, (int)(s - version), version); + else + verbose("This installation of MariaDB is already upgraded to %s.\n" + "There is no need to run mysql_upgrade again.", version); if (!opt_check_upgrade) verbose("You can use --force if you still want to run mysql_upgrade"); } diff -Nru mariadb-10.11.11/client/mysqlbinlog.cc mariadb-10.11.13/client/mysqlbinlog.cc --- mariadb-10.11.11/client/mysqlbinlog.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqlbinlog.cc 2025-05-19 16:14:24.000000000 +0000 @@ -160,7 +160,13 @@ static char *start_datetime_str, *stop_datetime_str; static my_time_t start_datetime= 0, stop_datetime= MY_TIME_T_MAX; -static my_time_t last_processed_datetime= MY_TIME_T_MAX; + +typedef struct _last_processed_ev_t +{ + ulonglong position; + my_time_t datetime; +} last_processed_ev_t; +static last_processed_ev_t last_processed_ev= {0, MY_TIME_T_MAX}; static ulonglong rec_count= 0; static MYSQL* mysql = NULL; @@ -1611,7 +1617,19 @@ end: rec_count++; end_skip_count: - last_processed_datetime= 
ev_when; + /* + Update the last_processed_ev, unless the event is a fake event (i.e. format + description (ev pointer is reset to 0) or rotate event (ev->when is 0)), or + the event is encrypted (i.e. type is Unknown). + */ + if (ev && + !(ev_type == UNKNOWN_EVENT && + ((Unknown_log_event *) ev)->what == Unknown_log_event::ENCRYPTED) && + !(ev_type == ROTATE_EVENT && !ev->when)) + { + last_processed_ev.position= pos + ev->data_written; + last_processed_ev.datetime= ev_when; + } DBUG_PRINT("info", ("end event processing")); /* @@ -2925,6 +2943,9 @@ if (old_off != BIN_LOG_HEADER_SIZE) *len= 1; // fake event, don't increment old_off } + DBUG_ASSERT(old_off + ev->data_written == old_off + (*len - 1) || + (*len == 1 && + (type == ROTATE_EVENT || type == FORMAT_DESCRIPTION_EVENT))); Exit_status retval= process_event(print_event_info, ev, old_off, logname); if (retval != OK_CONTINUE) DBUG_RETURN(retval); @@ -2943,6 +2964,9 @@ DBUG_RETURN(ERROR_STOP); } + DBUG_ASSERT(old_off + ev->data_written == old_off + (*len - 1) || + (*len == 1 && + (type == ROTATE_EVENT || type == FORMAT_DESCRIPTION_EVENT))); retval= process_event(print_event_info, ev, old_off, logname); if (retval != OK_CONTINUE) { @@ -3342,6 +3366,8 @@ the new one, so we should not do it ourselves in this case. */ + DBUG_ASSERT(tmp_pos + new_description_event->data_written == + my_b_tell(file)); Exit_status retval= process_event(print_event_info, new_description_event, tmp_pos, logname); @@ -3495,20 +3521,17 @@ } // else read_error == 0 means EOF, that's OK, we break in this case - /* - Emit a warning in the event that we finished processing input - before reaching the boundary indicated by --stop-position. 
- */ - if (((longlong)stop_position != stop_position_default) && - stop_position > my_b_tell(file)) - { - retval = OK_STOP; - warning("Did not reach stop position %llu before " - "end of input", stop_position); - } - goto end; } + + /* + The real location that we have read up to in the file should align with + the size of the event, unless the event is encrypted. + */ + DBUG_ASSERT( + ((ev->get_type_code() == UNKNOWN_EVENT && + ((Unknown_log_event *) ev)->what == Unknown_log_event::ENCRYPTED)) || + old_off + ev->data_written == my_b_tell(file)); if ((retval= process_event(print_event_info, ev, old_off, logname)) != OK_CONTINUE) goto end; @@ -3687,10 +3710,18 @@ start_position= BIN_LOG_HEADER_SIZE; } + /* + Emit a warning if we finished processing input before reaching the stop + boundaries indicated by --stop-datetime or --stop-position. + */ if (stop_datetime != MY_TIME_T_MAX && - stop_datetime > last_processed_datetime) + stop_datetime > last_processed_ev.datetime) warning("Did not reach stop datetime '%s' before end of input", stop_datetime_str); + if ((static_cast(stop_position) != stop_position_default) && + stop_position > last_processed_ev.position) + warning("Did not reach stop position %llu before end of input", + stop_position); /* If enable flashback, need to print the events from the end to the diff -Nru mariadb-10.11.11/client/mysqldump.c mariadb-10.11.13/client/mysqldump.c --- mariadb-10.11.11/client/mysqldump.c 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqldump.c 2025-05-19 16:14:24.000000000 +0000 @@ -2158,7 +2158,7 @@ *to++='\\'; } if (*name == '\'') - *to++= '\\'; + *to++= '\''; *to++= *name++; } to[0]= '\''; @@ -3713,7 +3713,7 @@ fprintf(sql_file, "DELIMITER ;;\n" - "/*!50003 SET SESSION SQL_MODE=\"%s\" */;;\n" + "/*!50003 SET SESSION SQL_MODE='%s' */;;\n" "/*!50003 CREATE */ ", (*show_trigger_row)[6]); @@ -4730,17 +4730,19 @@ return 1; while ((row= mysql_fetch_row(tableres))) { + char buf[200]; if (opt_replace_into) /* 
Protection against removing the current import user */ /* MySQL-8.0 export capability */ fprintf(md_result_file, "DELIMITER |\n" - "/*M!100101 IF current_user()=\"%s\" THEN\n" + "/*M!100101 IF current_user()=%s THEN\n" " SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001," " MESSAGE_TEXT=\"Don't remove current user %s'\";\n" "END IF */|\n" "DELIMITER ;\n" - "/*!50701 DROP USER IF EXISTS %s */;\n", row[0], row[0], row[0]); + "/*!50701 DROP USER IF EXISTS %s */;\n", + quote_for_equal(row[0],buf), row[0], row[0]); if (dump_create_user(row[0])) result= 1; /* if roles exist, defer dumping grants until after roles created */ @@ -6858,6 +6860,7 @@ char *result_table, *opt_quoted_table; char table_buff[NAME_LEN*2+3]; char table_buff2[NAME_LEN*2+3]; + char temp_buff[NAME_LEN*2 + 3], temp_buff2[NAME_LEN*2 + 3]; char query[QUERY_LENGTH]; FILE *sql_file= md_result_file; DBUG_ENTER("get_view_structure"); @@ -6918,7 +6921,9 @@ "SELECT CHECK_OPTION, DEFINER, SECURITY_TYPE, " " CHARACTER_SET_CLIENT, COLLATION_CONNECTION " "FROM information_schema.views " - "WHERE table_name=\"%s\" AND table_schema=\"%s\"", table, db); + "WHERE table_name=%s AND table_schema=%s", + quote_for_equal(table, temp_buff2), + quote_for_equal(db, temp_buff)); if (mysql_query(mysql, query)) { diff -Nru mariadb-10.11.11/client/mysqlslap.c mariadb-10.11.13/client/mysqlslap.c --- mariadb-10.11.11/client/mysqlslap.c 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqlslap.c 2025-05-19 16:14:24.000000000 +0000 @@ -2237,6 +2237,13 @@ stats *ptr; unsigned int x; + if (eng && eng->string) + con->engine= eng->string; + + /* Early return when iterations is 0 to avoid accessing uninitialized sptr */ + if (iterations == 0) + return; + con->min_timing= sptr->timing; con->max_timing= sptr->timing; con->min_rows= sptr->rows; @@ -2257,11 +2264,6 @@ con->min_timing= ptr->timing; } con->avg_timing= con->avg_timing/iterations; - - if (eng && eng->string) - con->engine= eng->string; - else - con->engine= NULL; } 
void diff -Nru mariadb-10.11.11/client/mysqltest.cc mariadb-10.11.13/client/mysqltest.cc --- mariadb-10.11.11/client/mysqltest.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqltest.cc 2025-05-19 16:14:24.000000000 +0000 @@ -6744,7 +6744,7 @@ my_bool have_slash= FALSE; enum {R_NORMAL, R_Q, R_SLASH_IN_Q, - R_COMMENT, R_LINE_START} state= R_LINE_START; + R_COMMENT, R_LINE_START, R_CSTYLE_COMMENT} state= R_LINE_START; DBUG_ENTER("read_line"); *p= 0; @@ -6831,9 +6831,23 @@ state= R_Q; } } + else if (c == '*' && last_char == '/') + { + state= R_CSTYLE_COMMENT; + break; + } have_slash= is_escape_char(c, last_quote); break; + case R_CSTYLE_COMMENT: + if (c == '!') + // Got the hint introducer '/*!'. Switch to normal processing of + // next following characters + state= R_NORMAL; + else if (c == '/' && last_char == '*') + state= R_NORMAL; + break; + case R_COMMENT: if (c == '\n') { diff -Nru mariadb-10.11.11/cmake/cpack_rpm.cmake mariadb-10.11.13/cmake/cpack_rpm.cmake --- mariadb-10.11.11/cmake/cpack_rpm.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/cpack_rpm.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -245,7 +245,7 @@ "galera-4" "rsync" "grep" "gawk" "iproute" "coreutils" "findutils" "tar") SETA(CPACK_RPM_server_PACKAGE_RECOMMENDS "lsof" "socat" "pv") - SETA(CPACK_RPM_test_PACKAGE_REQUIRES "socat") + SETA(CPACK_RPM_test_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}" "socat") ENDIF() SET(CPACK_RPM_server_PRE_INSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/support-files/rpm/server-prein.sh) @@ -292,7 +292,7 @@ ALTERNATIVE_NAME("server" "mariadb-server") ALTERNATIVE_NAME("server" "mysql-compat-server") ALTERNATIVE_NAME("test" "mariadb-test") -ELSEIF(RPM MATCHES "(rhel|centos|rocky)[89]") +ELSEIF(RPM MATCHES "(rhel|centos|rocky)") SET(epoch 3:) ALTERNATIVE_NAME("backup" "mariadb-backup") ALTERNATIVE_NAME("client" "mariadb") diff -Nru mariadb-10.11.11/cmake/libfmt.cmake mariadb-10.11.13/cmake/libfmt.cmake --- 
mariadb-10.11.11/cmake/libfmt.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/libfmt.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -28,15 +28,14 @@ IF(WITH_LIBFMT STREQUAL "system" OR WITH_LIBFMT STREQUAL "auto") SET(CMAKE_REQUIRED_INCLUDES ${LIBFMT_INCLUDE_DIR}) CHECK_CXX_SOURCE_RUNS( - "#define FMT_STATIC_THOUSANDS_SEPARATOR ',' - #define FMT_HEADER_ONLY 1 + "#define FMT_HEADER_ONLY 1 #include int main() { using ArgStore= fmt::dynamic_format_arg_store; ArgStore arg_store; int answer= 4321; arg_store.push_back(answer); - return fmt::vformat(\"{:L}\", arg_store).compare(\"4,321\"); + return fmt::vformat(\"{}\", arg_store).compare(\"4321\"); }" HAVE_SYSTEM_LIBFMT) SET(CMAKE_REQUIRED_INCLUDES) ENDIF() diff -Nru mariadb-10.11.11/cmake/os/Windows.cmake mariadb-10.11.13/cmake/os/Windows.cmake --- mariadb-10.11.11/cmake/os/Windows.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/os/Windows.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -15,352 +15,212 @@ # This file includes Windows specific hacks, mostly around compiler flags -INCLUDE (CheckCSourceCompiles) -INCLUDE (CheckCXXSourceCompiles) -INCLUDE (CheckStructHasMember) -INCLUDE (CheckLibraryExists) -INCLUDE (CheckFunctionExists) -INCLUDE (CheckCSourceRuns) -INCLUDE (CheckSymbolExists) -INCLUDE (CheckTypeSize) - -IF(MSVC) - IF(CMAKE_CXX_COMPILER_ARCHITECTURE_ID STREQUAL ARM64) - SET(MSVC_ARM64 1) - SET(MSVC_INTEL 0) - ELSE() - SET(MSVC_INTEL 1) - ENDIF() -ENDIF() + +if(MSVC) + if(CMAKE_CXX_COMPILER_ARCHITECTURE_ID STREQUAL ARM64) + set(MSVC_ARM64 1) + set(MSVC_INTEL 0) + else() + set(MSVC_INTEL 1) + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL Clang) + set(CLANG_CL TRUE) + endif() +endif() # avoid running system checks by using pre-cached check results # system checks are expensive on VS since every tiny program is to be compiled in # a VC solution. 
-GET_FILENAME_COMPONENT(_SCRIPT_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) -INCLUDE(${_SCRIPT_DIR}/WindowsCache.cmake) - +get_filename_component(_SCRIPT_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) +include(${_SCRIPT_DIR}/WindowsCache.cmake) # OS display name (version_compile_os etc). -# Used by the test suite to ignore bugs on some platforms, -IF(CMAKE_SIZEOF_VOID_P MATCHES 8) - SET(SYSTEM_TYPE "Win64") -ELSE() - SET(SYSTEM_TYPE "Win32") -ENDIF() - -# Intel compiler is almost Visual C++ -# (same compile flags etc). Set MSVC flag -IF(CMAKE_C_COMPILER MATCHES "icl") - SET(MSVC TRUE) -ENDIF() - -IF(MSVC AND CMAKE_CXX_COMPILER_ID MATCHES Clang) - SET(CLANG_CL TRUE) -ENDIF() - -ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE) -ADD_DEFINITIONS(-D_WIN32_WINNT=0x0A00) -# We do not want the windows.h , or winsvc.h macros min/max -ADD_DEFINITIONS(-DNOMINMAX -DNOSERVICE) -# Speed up build process excluding unused header files -ADD_DEFINITIONS(-DWIN32_LEAN_AND_MEAN) - -# Adjust compiler and linker flags -IF(MINGW AND CMAKE_SIZEOF_VOID_P EQUAL 4) - # mininal architecture flags, i486 enables GCC atomics - ADD_DEFINITIONS(-march=i486) -ENDIF() - -MACRO(ENABLE_SANITIZERS) - IF(NOT MSVC) - MESSAGE(FATAL_ERROR "clang-cl or MSVC necessary to enable asan/ubsan") - ENDIF() - # currently, asan is broken with static CRT. 
- IF(CLANG_CL AND NOT(MSVC_CRT_TYPE STREQUAL "/MD")) - SET(MSVC_CRT_TYPE "/MD" CACHE INTERNAL "" FORCE) - ENDIF() - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - SET(ASAN_ARCH i386) - ELSE() - SET(ASAN_ARCH x86_64) - ENDIF() - - # After installation, clang lib directory should be added to PATH - # (e.g C:/Program Files/LLVM/lib/clang/5.0.1/lib/windows) - SET(SANITIZER_LIBS) - SET(SANITIZER_LINK_LIBRARIES) - SET(SANITIZER_COMPILE_FLAGS) - IF(WITH_ASAN) - IF(CLANG_CL) - LIST(APPEND SANITIZER_LIBS - clang_rt.asan_dynamic-${ASAN_ARCH}.lib clang_rt.asan_dynamic_runtime_thunk-${ASAN_ARCH}.lib) - ENDIF() - STRING(APPEND SANITIZER_COMPILE_FLAGS " -fsanitize=address") - ENDIF() - IF(WITH_UBSAN) - STRING(APPEND SANITIZER_COMPILE_FLAGS " -fsanitize=undefined -fno-sanitize=alignment") - ENDIF() - FOREACH(lib ${SANITIZER_LIBS}) - FIND_LIBRARY(${lib}_fullpath ${lib}) - IF(NOT ${lib}_fullpath) - MESSAGE(FATAL_ERROR "Can't enable sanitizer : missing ${lib}") +# Used by the test suite to ignore bugs on some platforms +if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(SYSTEM_TYPE "Win64") +else() + set(SYSTEM_TYPE "Win32") +endif() + +function(find_asan_runtime result_list) + set(${result_list} "" PARENT_SCOPE) + if(CMAKE_C_COMPILER_VERSION) + set(CLANG_VERSION "${CMAKE_C_COMPILER_VERSION}") + else() + return() + endif() + + get_filename_component(CLANG_BIN_DIR "${CMAKE_C_COMPILER}" DIRECTORY) + get_filename_component(LLVM_ROOT "${CLANG_BIN_DIR}" DIRECTORY) + + # Determine target architecture + execute_process( + COMMAND "${CMAKE_C_COMPILER}" --version + OUTPUT_VARIABLE CLANG_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + + if(CLANG_VERSION_OUTPUT MATCHES "x86_64") + set(ARCH_SUFFIX "x86_64") + elseif(CLANG_VERSION_OUTPUT MATCHES "i686|i386") + set(ARCH_SUFFIX "i386") + elseif(CLANG_VERSION_OUTPUT MATCHES "aarch64") + set(ARCH_SUFFIX "aarch64") + else() + message(FATAL_ERROR "unknown arch") + endif() + + string(REGEX MATCH "^[0-9]+" CLANG_MAJOR_VERSION "${CMAKE_C_COMPILER_VERSION}") 
+ set(CLANG_VERSION_DIR "${LLVM_ROOT}/lib/clang/${CLANG_MAJOR_VERSION}") + + set(out) + foreach(name clang_rt.asan_dynamic-${ARCH_SUFFIX}.lib + clang_rt.asan_dynamic_runtime_thunk-${ARCH_SUFFIX}.lib) + set(path "${CLANG_VERSION_DIR}/lib/windows/${name}") + if(EXISTS "${path}") + list(APPEND out ${path}) + else() + message(FATAL_ERROR "expected library ${path} not found") ENDIF() - LIST(APPEND CMAKE_REQUIRED_LIBRARIES ${${lib}_fullpath}) - STRING(APPEND CMAKE_C_STANDARD_LIBRARIES " \"${${lib}_fullpath}\" ") - STRING(APPEND CMAKE_CXX_STANDARD_LIBRARIES " \"${${lib}_fullpath}\" ") - ENDFOREACH() - STRING(APPEND CMAKE_C_FLAGS ${SANITIZER_COMPILE_FLAGS}) - STRING(APPEND CMAKE_CXX_FLAGS ${SANITIZER_COMPILE_FLAGS}) -ENDMACRO() + endforeach() + set(${result_list} ${out} PARENT_SCOPE) +endfunction() + +macro(enable_sanitizers) + # Remove the runtime checks from the compiler flags + # ASAN does the same thing, in many cases better + foreach(lang C CXX) + foreach(suffix "_DEBUG" "_DEBUG_INIT") + string(REGEX REPLACE "/RTC[1su]" "" CMAKE_${lang}_FLAGS${suffix} "${CMAKE_${lang}_FLAGS${suffix}}") + endforeach() + endforeach() + + if(WITH_ASAN) + add_compile_options($<$:/fsanitize=address>) + endif() + if(WITH_UBSAN) + include(CheckCCompilerFlag) + check_c_compiler_flag(/fsanitize=undefined HAVE_fsanitize_undefined) + if (HAVE_fsanitize_undefined) + add_compile_options($<$:/fsanitize=undefined>) + else() + message(FATAL_ERROR "UBSAN not supported by this compiler yet") + endif() + endif() + if(CLANG_CL) + find_asan_runtime(asan_libs) + foreach(lib ${asan_libs}) + link_libraries(${lib}) + string(APPEND CMAKE_C_STANDARD_LIBRARIES " \"${lib}\"") + string(APPEND CMAKE_CXX_STANDARD_LIBRARIES " \"${lib}\"") + endforeach() + else() + add_link_options(/INCREMENTAL:NO) + endif() +endmacro() -IF(MSVC) - IF(MSVC_VERSION LESS 1920) - MESSAGE(FATAL_ERROR "Visual Studio 2019 or later is required") - ENDIF() +if(MSVC) # Disable mingw based pkg-config found in Strawberry perl - 
SET(PKG_CONFIG_EXECUTABLE 0 CACHE INTERNAL "") + set(PKG_CONFIG_EXECUTABLE 0 CACHE INTERNAL "") - SET(MSVC_CRT_TYPE /MT CACHE STRING - "Runtime library - specify runtime library for linking (/MT,/MTd,/MD,/MDd)" - ) - SET(VALID_CRT_TYPES /MTd /MDd /MD /MT) - IF (NOT ";${VALID_CRT_TYPES};" MATCHES ";${MSVC_CRT_TYPE};") - MESSAGE(FATAL_ERROR "Invalid value ${MSVC_CRT_TYPE} for MSVC_CRT_TYPE, choose one of /MT,/MTd,/MD,/MDd ") - ENDIF() - - IF(MSVC_CRT_TYPE MATCHES "/MD") - # Dynamic runtime (DLLs), need to install CRT libraries. - SET(CMAKE_INSTALL_SYSTEM_RUNTIME_COMPONENT VCCRT) - SET(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS TRUE) - IF(MSVC_CRT_TYPE STREQUAL "/MDd") - SET (CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY TRUE) - ENDIF() - INCLUDE(InstallRequiredSystemLibraries) - ENDIF() - - IF(WITH_ASAN AND (NOT CLANG_CL)) - SET(DYNAMIC_UCRT_LINK_DEFAULT OFF) - ELSE() - SET(DYNAMIC_UCRT_LINK_DEFAULT ON) - ENDIF() - - OPTION(DYNAMIC_UCRT_LINK "Link Universal CRT dynamically, if MSVC_CRT_TYPE=/MT" ${DYNAMIC_UCRT_LINK_DEFAULT}) - SET(DYNAMIC_UCRT_LINKER_OPTION " /NODEFAULTLIB:libucrt.lib /DEFAULTLIB:ucrt.lib") - - # Enable debug info also in Release build, - # and create PDB to be able to analyze crashes. - FOREACH(type EXE SHARED MODULE) - SET(CMAKE_${type}_LINKER_FLAGS_RELEASE - "${CMAKE_${type}_LINKER_FLAGS_RELEASE} /debug") - SET(CMAKE_${type}_LINKER_FLAGS_MINSIZEREL - "${CMAKE_${type}_LINKER_FLAGS_MINSIZEREL} /debug") - ENDFOREACH() - - # Force runtime libraries - # Compile with /Zi to get debugging information + if(NOT DEFINED CMAKE_MSVC_RUNTIME_LIBRARY) + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL) + endif() + + if(CMAKE_MSVC_RUNTIME_LIBRARY MATCHES "DLL") + # Dynamic runtime (DLLs), need to install CRT libraries. 
+ set(CMAKE_INSTALL_SYSTEM_RUNTIME_COMPONENT VCCRT) + set(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS TRUE) + if(CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebugDLL") + set(CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY TRUE) + endif() + include(InstallRequiredSystemLibraries) + endif() - FOREACH(lang C CXX) - SET(CMAKE_${lang}_FLAGS_RELEASE "${CMAKE_${lang}_FLAGS_RELEASE} /Zi") - ENDFOREACH() - FOREACH(flag - CMAKE_C_FLAGS CMAKE_CXX_FLAGS - CMAKE_C_FLAGS_INIT CMAKE_CXX_FLAGS_INIT - CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_DEBUG_INIT - CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_DEBUG_INIT - CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_MINSIZEREL - ) - STRING(REGEX REPLACE "/M[TD][d]?" "${MSVC_CRT_TYPE}" "${flag}" "${${flag}}" ) - STRING(REPLACE "/ZI " "/Zi " "${flag}" "${${flag}}") - IF((NOT "${${flag}}" MATCHES "/Zi") AND (NOT "${${flag}}" MATCHES "/Z7")) - STRING(APPEND ${flag} " /Zi") - ENDIF() - # Remove inlining flags, added by CMake, if any. - # Compiler default is fine. 
- STRING(REGEX REPLACE "/Ob[0-3]" "" "${flag}" "${${flag}}" ) - ENDFOREACH() - - # Allow to overwrite the inlining flag - SET(MSVC_INLINE "" CACHE STRING - "MSVC Inlining option, either empty, or one of /Ob0,/Ob1,/Ob2,/Ob3") - IF(MSVC_INLINE MATCHES "/Ob[0-3]") - ADD_COMPILE_OPTIONS(${MSVC_INLINE}) - ELSEIF(NOT(MSVC_INLINE STREQUAL "")) - MESSAGE(FATAL_ERROR "Invalid option for MSVC_INLINE") - ENDIF() + # Compile with /Zi to get debugging information + if (NOT DEFINED CMAKE_MSVC_DEBUG_INFORMATION_FORMAT) + set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "ProgramDatabase") + add_link_options(/DEBUG) # Ensure debugging info at link time + endif() - IF(WITH_ASAN OR WITH_UBSAN) + if(WITH_ASAN OR WITH_UBSAN) # Workaround something Linux specific - SET(SECURITY_HARDENED 0 CACHE INTERNAL "" FORCE) - ENABLE_SANITIZERS() - ENDIF() - - IF(CLANG_CL) - SET(CLANG_CL_FLAGS -"-Wno-unknown-warning-option -Wno-unused-private-field \ --Wno-unused-parameter -Wno-inconsistent-missing-override \ --Wno-unused-command-line-argument -Wno-pointer-sign \ --Wno-deprecated-register -Wno-missing-braces \ --Wno-unused-function -Wno-unused-local-typedef -msse4.2 " + set(SECURITY_HARDENED 0 CACHE INTERNAL "" FORCE) + enable_sanitizers() + endif() + + add_compile_definitions( + _CRT_SECURE_NO_DEPRECATE + _CRT_NONSTDC_NO_WARNINGS + _WIN32_WINNT=0x0A00 + # We do not want the windows.h , or winsvc.h macros min/max + NOMINMAX NOSERVICE + # Speed up build process excluding unused header files + WIN32_LEAN_AND_MEAN + ) + if(CLANG_CL) + add_compile_options( + -Wno-unknown-warning-option + -Wno-unused-private-field + -Wno-unused-parameter + -Wno-inconsistent-missing-override + -Wno-unused-command-line-argument + -Wno-pointer-sign + -Wno-deprecated-register + -Wno-missing-braces + -Wno-unused-function + -Wno-unused-local-typedef + -Wno-microsoft-static-assert + -Wno-c++17-extensions + -msse4.2 ) - IF(CMAKE_SIZEOF_VOID_P MATCHES 8) - STRING(APPEND CLANG_CL_FLAGS "-mpclmul ") - ENDIF() - STRING(APPEND CMAKE_C_FLAGS 
" ${CLANG_CL_FLAGS} ${MSVC_CRT_TYPE}") - STRING(APPEND CMAKE_CXX_FLAGS " ${CLANG_CL_FLAGS} ${MSVC_CRT_TYPE}") - ENDIF() - - FOREACH(type EXE SHARED MODULE) - STRING(REGEX REPLACE "/STACK:([^ ]+)" "" CMAKE_${type}_LINKER_FLAGS "${CMAKE_${type}_LINKER_FLAGS}") - IF(WITH_ASAN) - SET(build_types RELWITHDEBINFO DEBUG) - ELSE() - SET(build_types RELWITHDEBINFO) - ENDIF() - FOREACH(btype ${build_types}) - STRING(REGEX REPLACE "/INCREMENTAL:([^ ]+)" "/INCREMENTAL:NO" CMAKE_${type}_LINKER_FLAGS_${btype} "${CMAKE_${type}_LINKER_FLAGS_${btype}}") - STRING(REGEX REPLACE "/INCREMENTAL$" "/INCREMENTAL:NO" CMAKE_${type}_LINKER_FLAGS_${btype} "${CMAKE_${type}_LINKER_FLAGS_${btype}}") - ENDFOREACH() - IF(NOT CLANG_CL) - STRING(APPEND CMAKE_${type}_LINKER_FLAGS_RELWITHDEBINFO " /release /OPT:REF,ICF") - ENDIF() - IF(DYNAMIC_UCRT_LINK AND (MSVC_CRT_TYPE STREQUAL "/MT")) - FOREACH(config RELEASE RELWITHDEBINFO DEBUG MINSIZEREL) - STRING(APPEND CMAKE_${type}_LINKER_FLAGS_${config} ${DYNAMIC_UCRT_LINKER_OPTION}) - ENDFOREACH() - ENDIF() - ENDFOREACH() + if((CMAKE_SIZEOF_VOID_P MATCHES 8) AND MSVC_INTEL) + add_compile_options(-mpclmul) + endif() + endif() - # Mark 32 bit executables large address aware so they can # use > 2GB address space - IF(CMAKE_SIZEOF_VOID_P MATCHES 4) - STRING(APPEND CMAKE_EXE_LINKER_FLAGS " /LARGEADDRESSAWARE") - ENDIF() - - # Speed up multiprocessor build - IF (NOT CLANG_CL) - STRING(APPEND CMAKE_C_FLAGS " /MP") - STRING(APPEND CMAKE_CXX_FLAGS " /MP") - STRING(APPEND CMAKE_C_FLAGS_RELWITHDEBINFO " /Gw") - STRING(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Gw") - ENDIF() - - #TODO: update the code and remove the disabled warnings - STRING(APPEND CMAKE_C_FLAGS " /we4700 /we4311 /we4477 /we4302 /we4090") - STRING(APPEND CMAKE_CXX_FLAGS " /we4099 /we4700 /we4311 /we4477 /we4302 /we4090") - IF(MSVC_VERSION GREATER 1910 AND NOT CLANG_CL) - STRING(APPEND CMAKE_CXX_FLAGS " /permissive-") - STRING(APPEND CMAKE_C_FLAGS " /diagnostics:caret") - STRING(APPEND CMAKE_CXX_FLAGS " 
/diagnostics:caret") - ENDIF() - ADD_DEFINITIONS(-D_CRT_NONSTDC_NO_WARNINGS) - IF(MYSQL_MAINTAINER_MODE MATCHES "ERR") - STRING(APPEND CMAKE_C_FLAGS " /WX") - STRING(APPEND CMAKE_CXX_FLAGS " /WX") - FOREACH(type EXE SHARED MODULE) - FOREACH(cfg RELEASE DEBUG RELWITHDEBINFO) - SET(CMAKE_${type}_LINKER_FLAGS_${cfg} "${CMAKE_${type}_LINKER_FLAGS_${cfg}} /WX") - ENDFOREACH() - ENDFOREACH() - ENDIF() - - IF(FAST_BUILD) - STRING (REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - ELSEIF (NOT CLANG_CL) - STRING(APPEND CMAKE_CXX_FLAGS_RELEASE " /d2OptimizeHugeFunctions") - STRING(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /d2OptimizeHugeFunctions") - ENDIF() - ADD_COMPILE_OPTIONS($<$:/utf-8>) -ENDIF() - -# Always link with socket/synchronization libraries -STRING(APPEND CMAKE_C_STANDARD_LIBRARIES " ws2_32.lib synchronization.lib") -STRING(APPEND CMAKE_CXX_STANDARD_LIBRARIES " ws2_32.lib synchronization.lib") - -# System checks -SET(SIGNAL_WITH_VIO_CLOSE 1) # Something that runtime team needs - -# IPv6 constants appeared in Vista SDK first. We need to define them in any case if they are -# not in headers, to handle dual mode sockets correctly. -CHECK_SYMBOL_EXISTS(IPPROTO_IPV6 "winsock2.h" HAVE_IPPROTO_IPV6) -IF(NOT HAVE_IPPROTO_IPV6) - SET(HAVE_IPPROTO_IPV6 41) -ENDIF() -CHECK_SYMBOL_EXISTS(IPV6_V6ONLY "winsock2.h;ws2ipdef.h" HAVE_IPV6_V6ONLY) -IF(NOT HAVE_IPV6_V6ONLY) - SET(IPV6_V6ONLY 27) -ENDIF() - -# Some standard functions exist there under different -# names (e.g popen is _popen or strok_r is _strtok_s) -# If a replacement function exists, HAVE_FUNCTION is -# defined to 1. CMake variable will also -# be defined to the replacement name. -# So for example, CHECK_FUNCTION_REPLACEMENT(popen _popen) -# will define HAVE_POPEN to 1 and set variable named popen -# to _popen. 
If the header template, one needs to have -# cmakedefine popen @popen@ which will expand to -# define popen _popen after CONFIGURE_FILE - -MACRO(CHECK_FUNCTION_REPLACEMENT function replacement) - STRING(TOUPPER ${function} function_upper) - CHECK_FUNCTION_EXISTS(${function} HAVE_${function_upper}) - IF(NOT HAVE_${function_upper}) - CHECK_FUNCTION_EXISTS(${replacement} HAVE_${replacement}) - IF(HAVE_${replacement}) - SET(HAVE_${function_upper} 1 ) - SET(${function} ${replacement}) - ENDIF() - ENDIF() -ENDMACRO() -MACRO(CHECK_SYMBOL_REPLACEMENT symbol replacement header) - STRING(TOUPPER ${symbol} symbol_upper) - CHECK_SYMBOL_EXISTS(${symbol} ${header} HAVE_${symbol_upper}) - IF(NOT HAVE_${symbol_upper}) - CHECK_SYMBOL_EXISTS(${replacement} ${header} HAVE_${replacement}) - IF(HAVE_${replacement}) - SET(HAVE_${symbol_upper} 1) - SET(${symbol} ${replacement}) - ENDIF() - ENDIF() -ENDMACRO() + if(CMAKE_SIZEOF_VOID_P MATCHES 4) + add_link_options(/LARGEADDRESSAWARE) + endif() + + # RelWithDebInfo is deoptimized wrt inlining. 
+ # Fix it to default + foreach(lang C CXX) + foreach(suffix "_RELWITHDEBINFO" "_RELWITHDEBINFO_INIT") + string(REGEX REPLACE "/Ob[0-1]" "" CMAKE_${lang}_FLAGS${suffix} "${CMAKE_${lang}_FLAGS${suffix}}") + endforeach() + endforeach() + + if(NOT CLANG_CL) + add_link_options("$<$:/INCREMENTAL:NO;/RELEASE;/OPT:REF,ICF>") + add_compile_options($<$:$<$:/Gw>>) + add_compile_options($<$:/MP>) + add_compile_options("$<$:/we4099;/we4700;/we4311;/we4477;/we4302;/we4090>") + add_compile_options($<$:/permissive->) + add_compile_options($<$:/diagnostics:caret>) + add_compile_options($<$:/utf-8>) + if(NOT FAST_BUILD) + add_compile_options($<$:$<$:/d2OptimizeHugeFunctions>>) + endif() + endif() + + if(MYSQL_MAINTAINER_MODE MATCHES "ERR") + set(CMAKE_COMPILE_WARNING_AS_ERROR ON) + add_link_options(/WX) + endif() +endif() + +# avoid running system checks by using pre-cached check results +# system checks are expensive on VS generator +get_filename_component(_SCRIPT_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) +include(${_SCRIPT_DIR}/WindowsCache.cmake) + +# this is out of place, not really a system check +set(FN_NO_CASE_SENSE 1) +set(USE_SYMDIR 1) +set(HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT 1) -CHECK_SYMBOL_REPLACEMENT(S_IROTH _S_IREAD sys/stat.h) -CHECK_SYMBOL_REPLACEMENT(S_IFIFO _S_IFIFO sys/stat.h) -CHECK_SYMBOL_REPLACEMENT(SIGQUIT SIGTERM signal.h) -CHECK_SYMBOL_REPLACEMENT(SIGPIPE SIGINT signal.h) -CHECK_FUNCTION_REPLACEMENT(popen _popen) -CHECK_FUNCTION_REPLACEMENT(pclose _pclose) -CHECK_FUNCTION_REPLACEMENT(access _access) -CHECK_FUNCTION_REPLACEMENT(strcasecmp _stricmp) -CHECK_FUNCTION_REPLACEMENT(strncasecmp _strnicmp) -CHECK_SYMBOL_REPLACEMENT(snprintf _snprintf stdio.h) -CHECK_FUNCTION_REPLACEMENT(strtok_r strtok_s) -CHECK_FUNCTION_REPLACEMENT(strtoll _strtoi64) -CHECK_FUNCTION_REPLACEMENT(strtoull _strtoui64) -CHECK_FUNCTION_REPLACEMENT(vsnprintf _vsnprintf) -CHECK_TYPE_SIZE(ssize_t SIZE_OF_SSIZE_T) -IF(NOT HAVE_SIZE_OF_SSIZE_T) - SET(ssize_t SSIZE_T) -ENDIF() - 
-SET(FN_NO_CASE_SENSE 1) -SET(USE_SYMDIR 1) - -# Force static C runtime for targets in current directory -# (useful to get rid of MFC dll's dependency, or in installer) -MACRO(FORCE_STATIC_CRT) - FOREACH(flag - CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_DEBUG_INIT - CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_DEBUG_INIT - CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_MINSIZEREL - ) - STRING(REGEX REPLACE "/MD[d]?" "/MT" "${flag}" "${${flag}}" ) - STRING(REPLACE "${DYNAMIC_UCRT_LINKER_OPTION}" "" "${flag}" "${${flag}}") - ENDFOREACH() -ENDMACRO() diff -Nru mariadb-10.11.11/cmake/os/WindowsCache.cmake mariadb-10.11.13/cmake/os/WindowsCache.cmake --- mariadb-10.11.11/cmake/os/WindowsCache.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/os/WindowsCache.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -203,10 +203,10 @@ SET(HAVE_STRNDUP CACHE INTERNAL "") SET(HAVE_STRNLEN 1 CACHE INTERNAL "") SET(HAVE_STRPBRK 1 CACHE INTERNAL "") -SET(HAVE_STRTOK_R CACHE INTERNAL "") -SET(HAVE_STRTOLL CACHE INTERNAL "") +SET(HAVE_STRTOK_R 1 CACHE INTERNAL "") +SET(HAVE_STRTOLL 1 CACHE INTERNAL "") SET(HAVE_STRTOUL 1 CACHE INTERNAL "") -SET(HAVE_STRTOULL CACHE INTERNAL "") +SET(HAVE_STRTOULL 1 CACHE INTERNAL "") SET(HAVE_SYNCH_H CACHE INTERNAL "") SET(HAVE_SYSENT_H CACHE INTERNAL "") SET(HAVE_SYS_DIR_H CACHE INTERNAL "") @@ -294,6 +294,7 @@ SET(HAVE_LINUX_UNISTD_H CACHE INTERNAL "") SET(HAVE_SYS_UTSNAME_H CACHE INTERNAL "") SET(HAVE_PTHREAD_ATTR_GETGUARDSIZE CACHE INTERNAL "") +SET(HAVE_PTHREAD_GETATTR_NP CACHE INTERNAL "") SET(HAVE_SOCKPEERCRED CACHE INTERNAL "") SET(HAVE_ABI_CXA_DEMANGLE CACHE INTERNAL "") SET(HAVE_GCC_C11_ATOMICS CACHE INTERNAL "") @@ -348,4 +349,16 @@ SET(HAVE_GETPAGESIZES CACHE INTERNAL "") SET(HAVE_LINUX_LIMITS_H CACHE INTERNAL "") SET(HAVE_FILE_UCONTEXT_H CACHE INTERNAL "") +SET(have_C__Werror CACHE INTERNAL "") +SET(HAVE_SIGNAL_H 1 CACHE INTERNAL "") 
+SET(HAVE_UINT CACHE INTERNAL "") +SET(HAVE_SOCKET_LEN_T CACHE INTERNAL "") +SET(HAVE_GETTHRID CACHE INTERNAL "") +SET(HAVE_THREAD_LOCAL 1 CACHE INTERNAL "") +SET(have_CXX__Wno_unused_but_set_variable CACHE INTERNAL "") +SET(HAVE_UNISTD_H CACHE INTERNAL "") +SET(HAVE_LINUX_UNISTD_H CACHE INTERNAL "") +SET(OFF64_T CACHE INTERNAL "") +SET(Z_HAVE_UNISTD_H CACHE INTERNAL "") +SET(HAVE_OFF64_T CACHE FALSE INTERNAL "") ENDIF(MSVC) diff -Nru mariadb-10.11.11/cmake/pcre.cmake mariadb-10.11.13/cmake/pcre.cmake --- mariadb-10.11.11/cmake/pcre.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/pcre.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -54,11 +54,18 @@ ENDIF() ENDFOREACH() + IF(CMAKE_MSVC_RUNTIME_LIBRARY) + SET(CMAKE_MSVC_RUNTIME_LIBRARY_ARG + "-DCMAKE_MSVC_RUNTIME_LIBRARY=${CMAKE_MSVC_RUNTIME_LIBRARY}") + ELSE() + SET(CMAKE_MSVC_RUNTIME_LIBRARY_ARG) + ENDIF() + ExternalProject_Add( pcre2 PREFIX "${dir}" - URL "https://github.com/PCRE2Project/pcre2/releases/download/pcre2-10.44/pcre2-10.44.zip" - URL_MD5 dfab8313154b3377a6959c3b6377841e + URL "https://github.com/PCRE2Project/pcre2/releases/download/pcre2-10.45/pcre2-10.45.zip" + URL_MD5 873da56c6469ec207ca5c5ae9688b83a INSTALL_COMMAND "" CMAKE_ARGS "-DCMAKE_WARN_DEPRECATED=FALSE" @@ -72,6 +79,7 @@ "-DCMAKE_C_FLAGS_RELEASE=${pcre2_flags_RELEASE}" "-DCMAKE_C_FLAGS_MINSIZEREL=${pcre2_flags_MINSIZEREL}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + ${CMAKE_MSVC_RUNTIME_LIBRARY_ARG} ${stdlibs} ${byproducts} ) diff -Nru mariadb-10.11.11/cmake/plugin.cmake mariadb-10.11.13/cmake/plugin.cmake --- mariadb-10.11.11/cmake/plugin.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/plugin.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -214,6 +214,11 @@ TARGET_LINK_LIBRARIES (${target} mysqlservices ${ARG_LINK_LIBRARIES}) + IF(WIN32) + # A popular library, turns out many plugins need it for gethostname() + TARGET_LINK_LIBRARIES (${target} ws2_32) + ENDIF() + IF(CMAKE_SYSTEM_NAME MATCHES AIX) 
TARGET_LINK_OPTIONS(${target} PRIVATE "-Wl,-bE:${CMAKE_SOURCE_DIR}/libservices/mysqlservices_aix.def") ENDIF() diff -Nru mariadb-10.11.11/config.h.cmake mariadb-10.11.13/config.h.cmake --- mariadb-10.11.11/config.h.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/config.h.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -402,38 +402,27 @@ #cmakedefine SIGNAL_WITH_VIO_CLOSE 1 /* Windows stuff, mostly functions, that have Posix analogs but named differently */ -#cmakedefine S_IROTH @S_IROTH@ -#cmakedefine S_IFIFO @S_IFIFO@ -#cmakedefine IPPROTO_IPV6 @IPPROTO_IPV6@ -#cmakedefine IPV6_V6ONLY @IPV6_V6ONLY@ -#cmakedefine sigset_t @sigset_t@ -#cmakedefine mode_t @mode_t@ -#cmakedefine SIGQUIT @SIGQUIT@ -#cmakedefine SIGPIPE @SIGPIPE@ -#cmakedefine popen @popen@ -#cmakedefine pclose @pclose@ -#cmakedefine ssize_t @ssize_t@ -#cmakedefine strcasecmp @strcasecmp@ -#cmakedefine strncasecmp @strncasecmp@ -#cmakedefine snprintf @snprintf@ -#cmakedefine strtok_r @strtok_r@ -#cmakedefine strtoll @strtoll@ -#cmakedefine strtoull @strtoull@ -#cmakedefine vsnprintf @vsnprintf@ -#if defined(_MSC_VER) && (_MSC_VER > 1800) +#ifdef _WIN32 +#define S_IROTH _S_IREAD +#define S_IFIFO _S_IFIFO +#define SIGQUIT SIGTERM +#define SIGPIPE SIGINT +#define sigset_t int +#define mode_t int +#define popen _popen +#define pclose _pclose +#define ssize_t SSIZE_T +#define strcasecmp _stricmp +#define strncasecmp _strnicmp +#define strtok_r strtok_s #define tzname _tzname #define P_tmpdir "C:\\TEMP" -#endif -#if defined(_MSC_VER) && (_MSC_VER > 1310) -# define HAVE_SETENV #define setenv(a,b,c) _putenv_s(a,b) -#endif -#define PSAPI_VERSION 1 /* for GetProcessMemoryInfo() */ -/* We don't want the min/max macros */ -#ifdef _WIN32 +#define HAVE_SETENV #define NOMINMAX 1 -#endif +#define PSAPI_VERSION 2 /* for GetProcessMemoryInfo() */ +#endif /* _WIN32 */ /* MySQL features @@ -457,6 +446,11 @@ /* This should mean case insensitive file system */ #cmakedefine FN_NO_CASE_SENSE 1 +/* Whether an 
anonymous private mapping is unaccessible after +madvise(MADV_DONTNEED) or madvise(MADV_FREE) or similar has been invoked; +this is the case with Microsoft Windows VirtualFree(MEM_DECOMMIT) */ +#cmakedefine HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT 1 + #cmakedefine HAVE_CHARSET_armscii8 1 #cmakedefine HAVE_CHARSET_ascii 1 #cmakedefine HAVE_CHARSET_big5 1 diff -Nru mariadb-10.11.11/debian/changelog mariadb-10.11.13/debian/changelog --- mariadb-10.11.11/debian/changelog 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/changelog 2025-05-23 21:26:02.000000000 +0000 @@ -1,3 +1,39 @@ +mariadb (1:10.11.13-0+deb12u1) bookworm; urgency=medium + + * New upstream version 10.11.13. Includes fixes for several severe regressions + as noted at https://mariadb.com/kb/en/mariadb-10-11-13-release-notes/, which + were discovered soon after the 10.11.12 release, which was skipped in Debian + intentionally. + * This release includes upstream version 10.11.12, with fixes for regressions + as noted at https://mariadb.com/kb/en/mariadb-10-11-12-release-notes/ + well as security issues (Closes: #1100437, #1105976): + - CVE-2023-52969 + - CVE-2023-52970 + - CVE-2023-52971 + - CVE-2025-30693 + - CVE-2025-30722 + * Drop all RocksDB patches now upstream due to update to version 6.29fb + * New upstream version has now CEST as allowed in main.timezone test + (Closes: #1084293) + * New upstream includes systemd service fix for restarts on crashes + (Closes: #1073847) + * New upstream also fixes regression in INSERT SELECT on NOT NULL columns + while having BEFORE UPDATE trigger (Closes: #1099515) + * Revert "Set CAP_IPC_LOCK capability if possible" because of MDEV-36229 + (Closes: #1100575) + * Update configuration traces to have --ssl-verify-server-cert from MDEV-28908 + * Update configuration traces to include new upstream system variables: + - innodb-buffer-pool-size-auto-min (default: 0) + - innodb-buffer-pool-size-max (default: 0) + - innodb-log-checkpoint-now (default: FALSE) + * 
Also update configuration traces to match that in 10.11.12 the variables + innodb-buffer-pool-chunk-size and innodb-log-spin-wait-delay are advertised + as deprecated. + * Fix changelog entry formatting in 1:10.11.11-0+deb12u1 + * Salsa CI: Adapt piuparts helper script to new source format in Bookworm + + -- Otto Kekäläinen Fri, 23 May 2025 14:26:02 -0700 + mariadb (1:10.11.11-0+deb12u1) bookworm; urgency=medium [ Otto Kekäläinen ] @@ -27,7 +63,8 @@ unstable in MariaDB 11.4 for a long time, and which are likely needed to avoid occasional shutdown issues, in particular on upgrades (LP: #2034125) in both Debian and Ubuntu - - Make SysV init more verbose in case of MariaDB start failures (Related: #1033234) + - Make SysV init more verbose in case of MariaDB start failures + (Related: #1033234) - Limit check of running mysqld/mariadbd to system users (Closes: #1032047) - When shutting down 'mariadbd', fallback to 'mysqld' * Add Lintian overrides for new upstream documentation JavaScript files diff -Nru mariadb-10.11.11/debian/mariadb-server-core.postinst mariadb-10.11.13/debian/mariadb-server-core.postinst --- mariadb-10.11.11/debian/mariadb-server-core.postinst 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/mariadb-server-core.postinst 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -#!/bin/bash -set -e - -# shellcheck source=/dev/null -. /usr/share/debconf/confmodule - -if [ -n "$DEBIAN_SCRIPT_DEBUG" ] -then - set -v -x - DEBIAN_SCRIPT_TRACE=1 -fi - -${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2} - -export PATH=$PATH:/sbin:/usr/sbin:/bin:/usr/bin - -# inspired by iputils-ping -# -# cap_ipc_lock is required if a user wants to use --memlock -# and has insufficient RLIMIT_MEMLOCK (MDEV-33301) - -PROGRAM=$(dpkg-divert --truename /usr/sbin/mysqld) - -case "$1" in - configure) - # If we have setcap installed, try setting - # which allows us to install our binaries without the setuid - # bit. 
- if command -v setcap > /dev/null - then - if ! setcap cap_ipc_lock+ep "$PROGRAM" - then - echo "Setcap failed on $PROGRAM, required with --memlock if insufficent RLIMIT_MEMLOCK" >&2 - fi - fi - ;; - - abort-upgrade|abort-remove|abort-configure|triggered) - ;; - - *) - echo "postinst called with unknown argument '$1'" 1>&2 - exit 1 - ;; -esac - -db_stop # in case invoke fails - -#DEBHELPER# diff -Nru mariadb-10.11.11/debian/patches/fix-reproducible-builds-rocksdb.patch mariadb-10.11.13/debian/patches/fix-reproducible-builds-rocksdb.patch --- mariadb-10.11.11/debian/patches/fix-reproducible-builds-rocksdb.patch 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/fix-reproducible-builds-rocksdb.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -Origin: https://github.com/facebook/rocksdb/commit/0a9a05ae12943b1529ef1eabbca5ce5a71c986bf -# Merged in RocksDB 6.19.3, but not updated into MariaDB yet -Bug: https://github.com/facebook/rocksdb/issues/7035 -Author: Otto Kekäläinen -Subject: Make RocksDB build reproducible - -The RocksDB binary included a string with the build timestamp: -> rocksdb_build_git_date:@2021-05-23·16:04:38@ - -As this changes from build to build, it makes the builds unreproducible. -Simply removing it solves the issue. - -This temporary fix can be removed when a proper fix already done in upstream -lands in MariaDB when the RocksDB submodule is updated to a newer release. - ---- a/storage/rocksdb/rocksdb/util/build_version.cc.in -+++ b/storage/rocksdb/rocksdb/util/build_version.cc.in -@@ -1,5 +1,5 @@ - // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
- #include "build_version.h" --const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:@@GIT_SHA@@"; --const char* rocksdb_build_git_date = "rocksdb_build_git_date:@@GIT_DATE_TIME@@"; --const char* rocksdb_build_compile_date = __DATE__; -+const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:REDACTED"; -+const char* rocksdb_build_git_date = "rocksdb_build_git_date:REDACTED"; -+const char* rocksdb_build_compile_date = "REDACTED"; diff -Nru mariadb-10.11.11/debian/patches/fix-spelling-rocksdb.patch mariadb-10.11.13/debian/patches/fix-spelling-rocksdb.patch --- mariadb-10.11.11/debian/patches/fix-spelling-rocksdb.patch 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/fix-spelling-rocksdb.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -Forwarded: https://github.com/facebook/rocksdb/pull/9653 -Origin: https://patch-diff.githubusercontent.com/raw/facebook/rocksdb/pull/9653.patch -From: Otto Kekäläinen -Date: Wed, 2 Mar 2022 18:13:18 -0800 -Subject: Fix various spelling errors still found in code - Two upstream PRs remain that have been merged, but not imported on MariaDB yet. 
- ---- a/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc -+++ b/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc -@@ -46,7 +46,7 @@ - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && - f.cf_id != cfd_->GetID()) { - return Status::InvalidArgument( -- "External file column family id dont match"); -+ "External file column family id don't match"); - } - } - -@@ -646,7 +646,7 @@ - return Status::InvalidArgument("Global seqno is required, but disabled"); - } else if (file_to_ingest->global_seqno_offset == 0) { - return Status::InvalidArgument( -- "Trying to set global seqno for a file that dont have a global seqno " -+ "Trying to set global seqno for a file that don't have a global seqno " - "field"); - } - ---- a/storage/rocksdb/rocksdb/include/rocksdb/cache.h -+++ b/storage/rocksdb/rocksdb/include/rocksdb/cache.h -@@ -60,7 +60,7 @@ - // If greater than zero, the LRU list will be split into a high-pri - // list and a low-pri list. High-pri entries will be insert to the - // tail of high-pri list, while low-pri entries will be first inserted to -- // the low-pri list (the midpoint). This is refered to as -+ // the low-pri list (the midpoint). This is referred to as - // midpoint insertion strategy to make entries never get hit in cache - // age out faster. 
- // diff -Nru mariadb-10.11.11/debian/patches/rocksdb-kfreebsd.patch mariadb-10.11.13/debian/patches/rocksdb-kfreebsd.patch --- mariadb-10.11.11/debian/patches/rocksdb-kfreebsd.patch 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/rocksdb-kfreebsd.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,150 +0,0 @@ -Forwarded: https://github.com/facebook/rocksdb/pull/6992 -From: Andrew Kryczka -Date: Tue, 16 Jun 2020 19:34:21 -0700 -# Merged in RocksDB 6.13.fb, but not updated into MariaDB yet -Bug: https://jira.mariadb.org/browse/MDEV-19251 -Description: - Upstream has merged this but we still need to wait for it to be included - in a RocksDB release and imported into MariaDB and then into Debian. ---- a/storage/rocksdb/build_rocksdb.cmake -+++ b/storage/rocksdb/build_rocksdb.cmake -@@ -90,6 +90,8 @@ - add_definitions(-DOS_LINUX) - elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") - add_definitions(-DOS_SOLARIS) -+elseif(CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") -+ add_definitions(-DOS_GNU_KFREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_definitions(-DOS_FREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") ---- a/storage/rocksdb/rocksdb/CMakeLists.txt -+++ b/storage/rocksdb/rocksdb/CMakeLists.txt -@@ -91,7 +91,7 @@ - option(WITH_XPRESS "build with windows built in compression" OFF) - include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) - else() -- if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") -+ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") - # FreeBSD has jemalloc as default malloc - # but it does not have all the jemalloc files in include/... 
- set(WITH_JEMALLOC ON) -@@ -413,6 +413,8 @@ - add_definitions(-DOS_LINUX) - elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") - add_definitions(-DOS_SOLARIS) -+elseif(CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") -+ add_definitions(-DOS_GNU_KFREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_definitions(-DOS_FREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") ---- a/storage/rocksdb/rocksdb/build_tools/build_detect_platform -+++ b/storage/rocksdb/rocksdb/build_tools/build_detect_platform -@@ -190,6 +190,17 @@ - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" - # PORT_FILES=port/freebsd/freebsd_specific.cc - ;; -+ GNU/kFreeBSD) -+ PLATFORM=OS_GNU_KFREEBSD -+ COMMON_FLAGS="$COMMON_FLAGS -DOS_GNU_KFREEBSD" -+ if [ -z "$USE_CLANG" ]; then -+ COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" -+ else -+ PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" -+ fi -+ PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" -+ # PORT_FILES=port/gnu_kfreebsd/gnu_kfreebsd_specific.cc -+ ;; - NetBSD) - PLATFORM=OS_NETBSD - COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" ---- a/storage/rocksdb/rocksdb/env/env_posix.cc -+++ b/storage/rocksdb/rocksdb/env/env_posix.cc -@@ -41,7 +41,7 @@ - #include - #include - // Get nano time includes --#if defined(OS_LINUX) || defined(OS_FREEBSD) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) - #elif defined(__MACH__) - #include - #include -@@ -287,7 +287,8 @@ - } - - uint64_t NowNanos() override { --#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ -+ defined(OS_AIX) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -@@ -307,8 +308,8 @@ - } - - uint64_t NowCPUNanos() override { --#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) || \ -- (defined(__MACH__) && defined(__MAC_10_12)) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || 
defined(OS_GNU_KFREEBSD) || \ -+ defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) - struct timespec ts; - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; ---- a/storage/rocksdb/rocksdb/port/stack_trace.cc -+++ b/storage/rocksdb/rocksdb/port/stack_trace.cc -@@ -32,7 +32,7 @@ - - namespace { - --#if defined(OS_LINUX) || defined(OS_FREEBSD) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) - const char* GetExecutableName() { - static char name[1024]; - ---- a/storage/rocksdb/rdb_io_watchdog.h -+++ b/storage/rocksdb/rdb_io_watchdog.h -@@ -56,19 +56,19 @@ - int stop_timers() { - int ret = 0; - -- if (m_io_check_watchdog_timer) { -+ if (m_io_check_watchdog_timer != reinterpret_cast(-1)) { - ret = timer_delete(m_io_check_watchdog_timer); - - if (!ret) { -- m_io_check_watchdog_timer = nullptr; -+ m_io_check_watchdog_timer = reinterpret_cast(-1); - } - } - -- if (m_io_check_timer && !ret) { -+ if (m_io_check_timer != reinterpret_cast(-1) && !ret) { - ret = timer_delete(m_io_check_timer); - - if (!ret) { -- m_io_check_timer = nullptr; -+ m_io_check_timer = reinterpret_cast(-1); - } - } - -@@ -93,8 +93,8 @@ - - public: - explicit Rdb_io_watchdog(std::vector &&directories) -- : m_io_check_timer(nullptr), -- m_io_check_watchdog_timer(nullptr), -+ : m_io_check_timer(reinterpret_cast(-1)), -+ m_io_check_watchdog_timer(reinterpret_cast(-1)), - m_io_in_progress(false), - m_dirs_to_check(std::move(directories)), - m_buf(nullptr) { ---- a/storage/rocksdb/rdb_io_watchdog.cc -+++ b/storage/rocksdb/rdb_io_watchdog.cc -@@ -111,7 +111,7 @@ - sql_print_warning("Deleting the watchdog I/O timer failed with %d.", errno); - } - -- m_io_check_watchdog_timer = nullptr; -+ m_io_check_watchdog_timer = reinterpret_cast(-1); - - RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); - } diff -Nru mariadb-10.11.11/debian/patches/series mariadb-10.11.13/debian/patches/series --- mariadb-10.11.11/debian/patches/series 
2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/series 2025-05-23 21:26:02.000000000 +0000 @@ -1,5 +1,2 @@ -rocksdb-kfreebsd.patch env-perl-usr-bin-perl.patch -fix-spelling-rocksdb.patch -fix-reproducible-builds-rocksdb.patch mroonga-mrn-lib-dirs-path-reproducible-build.patch diff -Nru mariadb-10.11.11/debian/salsa-ci-enable-sec-and-update-repos.sh mariadb-10.11.13/debian/salsa-ci-enable-sec-and-update-repos.sh --- mariadb-10.11.11/debian/salsa-ci-enable-sec-and-update-repos.sh 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/salsa-ci-enable-sec-and-update-repos.sh 2025-05-23 21:26:02.000000000 +0000 @@ -1,10 +1,14 @@ #!/bin/sh -set -x -set -e +echo "Running salsa-ci-enable-sec-and-update-repos.sh to enable the same" +echo "repositories thate were available at build time in e.g." +echo "registry.salsa.debian.org/salsa-ci-team/pipeline/base:bullseye" + +# Debug what repositories are available to begin +head /etc/apt/sources.list /etc/apt/sources.list.d/* || true -# Debug what repositories are available to begin with -grep -r "^deb " /etc/apt/sources.* +# Fail on non-zero exit codes from this point onward +set -e # Enable the same repositories that were available at build time in # registry.salsa.debian.org/salsa-ci-team/pipeline/base:bullseye diff -Nru mariadb-10.11.11/debian/salsa-ci.yml mariadb-10.11.13/debian/salsa-ci.yml --- mariadb-10.11.11/debian/salsa-ci.yml 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/salsa-ci.yml 2025-05-23 21:26:02.000000000 +0000 @@ -24,10 +24,17 @@ # For unknown reason Lintian v2.116.3 in Bookworm errors on valid changelog entry SALSA_CI_LINTIAN_SUPPRESS_TAGS: 'bad-distribution-in-changes-file' -# Extend Salsa-CI build jobs to have longer timeout as the default GitLab -# timeout (1h) is often not enough .build-package: + # Extend Salsa CI build jobs to have longer timeout as the default GitLab + # timeout (1h) is often not enough timeout: 3h + # Default 5G sporadically fails 
builds on not having enough disk space + variables: + CCACHE_MAXSIZE: 3G + # Salsa instance runners typically have 30G volumes with 14G free disk space + before_script: + - echo "Total and free disk space:" + - df -h . stages: - provisioning diff -Nru mariadb-10.11.11/debian/tests/traces/mariadb-verbose-help.expected mariadb-10.11.13/debian/tests/traces/mariadb-verbose-help.expected --- mariadb-10.11.11/debian/tests/traces/mariadb-verbose-help.expected 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/tests/traces/mariadb-verbose-help.expected 2025-05-23 21:26:02.000000000 +0000 @@ -156,9 +156,8 @@ --ssl-crlpath=name Certificate revocation list path (implies --ssl). --tls-version=name TLS protocol version for secure connection. --ssl-verify-server-cert - Verify server's "Common Name" in its cert against - hostname used when connecting. This option is disabled by - default. + Verify server's certificate to prevent man-in-the-middle + attacks -t, --table Output in table format. --tee=name Append everything into outfile. See interactive help (\h) also. Does not work in batch mode. Disable with diff -Nru mariadb-10.11.11/debian/tests/traces/mariadbd-verbose-help.expected mariadb-10.11.13/debian/tests/traces/mariadbd-verbose-help.expected --- mariadb-10.11.11/debian/tests/traces/mariadbd-verbose-help.expected 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/tests/traces/mariadbd-verbose-help.expected 2025-05-23 21:26:02.000000000 +0000 @@ -575,9 +575,7 @@ FORCE_PLUS_PERMANENT (like FORCE, but the plugin can not be uninstalled). --innodb-buffer-pool-chunk-size=# - Size of a single memory chunk for resizing buffer pool. - Online buffer pool resizing happens at this granularity. - 0 means autosize this variable based on buffer pool size. 
+ Deprecated parameter with no effect --innodb-buffer-pool-dump-at-shutdown Dump the buffer pool into a file named @@innodb_buffer_pool_filename @@ -603,6 +601,11 @@ --innodb-buffer-pool-size=# The size of the memory buffer InnoDB uses to cache data and indexes of its tables. + --innodb-buffer-pool-size-auto-min=# + Minimum innodb_buffer_pool_size for dynamic shrinking on + memory pressure + --innodb-buffer-pool-size-max=# + Maximum innodb_buffer_pool_size --innodb-buffer-pool-stats[=name] Enable or disable INNODB_BUFFER_POOL_STATS plugin. One of: ON, OFF, FORCE (don't start if the plugin fails to @@ -883,6 +886,9 @@ be uninstalled). --innodb-log-buffer-size=# Redo log buffer size in bytes. + --innodb-log-checkpoint-now + Write back dirty pages from the buffer pool and update + the log checkpoint --innodb-log-file-buffering Whether the file system cache for ib_logfile0 is enabled --innodb-log-file-mmap @@ -894,8 +900,7 @@ --innodb-log-group-home-dir=name Path to ib_logfile0 --innodb-log-spin-wait-delay[=#] - Delay between log buffer spin lock polls (0 to use a - blocking latch) + Deprecated parameter with no effect --innodb-log-write-ahead-size=# Redo log write size to avoid read-on-write; must be a power of two @@ -1449,7 +1454,8 @@ keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in - selectivity_for_indexes. selectivity_multiplier. This + selectivity_for_indexes. fix_derived_table_read_cost = + Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. Use 'ALL' to set all combinations. 
@@ -2611,6 +2617,8 @@ innodb-buffer-pool-load-at-startup TRUE innodb-buffer-pool-load-now FALSE innodb-buffer-pool-size 134217728 +innodb-buffer-pool-size-auto-min 0 +innodb-buffer-pool-size-max 0 innodb-buffer-pool-stats ON innodb-change-buffer-max-size 25 innodb-change-buffering none @@ -2685,6 +2693,7 @@ innodb-lock-waits ON innodb-locks ON innodb-log-buffer-size 16777216 +innodb-log-checkpoint-now FALSE innodb-log-file-buffering FALSE innodb-log-file-mmap TRUE innodb-log-file-size 100663296 diff -Nru mariadb-10.11.11/extra/mariabackup/backup_mysql.cc mariadb-10.11.13/extra/mariabackup/backup_mysql.cc --- mariadb-10.11.11/extra/mariabackup/backup_mysql.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/backup_mysql.cc 2025-05-19 16:14:24.000000000 +0000 @@ -1893,7 +1893,7 @@ srv_log_file_size, srv_page_size, srv_undo_dir, - (uint) srv_undo_tablespaces, + srv_undo_tablespaces, page_zip_level, innobase_buffer_pool_filename ? "innodb_buffer_pool_filename=" : "", diff -Nru mariadb-10.11.11/extra/mariabackup/common_engine.cc mariadb-10.11.13/extra/mariabackup/common_engine.cc --- mariadb-10.11.11/extra/mariabackup/common_engine.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/common_engine.cc 2025-05-19 16:14:24.000000000 +0000 @@ -64,8 +64,10 @@ for (const auto &fname : m_fnames) { File file = mysql_file_open(0, fname.c_str(),O_RDONLY | O_SHARE, MYF(0)); if (file < 0) { - msg(thread_num, "Error on file %s open during %s table copy", - fname.c_str(), full_tname.c_str()); + char buf[MYSYS_STRERROR_SIZE]; + msg(thread_num, "Error %i on file %s open during %s table copy: %s", + errno, fname.c_str(), full_tname.c_str(), + my_strerror(buf, sizeof(buf), errno)); goto exit; } files.push_back(file); diff -Nru mariadb-10.11.11/extra/mariabackup/innobackupex.cc mariadb-10.11.13/extra/mariabackup/innobackupex.cc --- mariadb-10.11.11/extra/mariabackup/innobackupex.cc 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/extra/mariabackup/innobackupex.cc 2025-05-19 16:14:24.000000000 +0000 @@ -44,8 +44,8 @@ #include #include #include -#include #include +#include "buf0buf.h" #include #include #include @@ -594,8 +594,9 @@ "--apply-log.", (uchar*) &ibx_xtrabackup_use_memory, (uchar*) &ibx_xtrabackup_use_memory, - 0, GET_LL, REQUIRED_ARG, 100*1024*1024L, 1024*1024L, LONGLONG_MAX, 0, - 1024*1024L, 0}, + 0, GET_LL, REQUIRED_ARG, 96 << 20, + innodb_buffer_pool_extent_size, SIZE_T_MAX, 0, + innodb_buffer_pool_extent_size, 0}, {"innodb-force-recovery", OPT_INNODB_FORCE_RECOVERY, "This option starts up the embedded InnoDB instance in crash " diff -Nru mariadb-10.11.11/extra/mariabackup/write_filt.cc mariadb-10.11.13/extra/mariabackup/write_filt.cc --- mariadb-10.11.11/extra/mariabackup/write_filt.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/write_filt.cc 2025-05-19 16:14:24.000000000 +0000 @@ -144,18 +144,6 @@ return false; } - /* Check whether TRX_SYS page has been changed */ - if (mach_read_from_4(page + FIL_PAGE_SPACE_ID) - == TRX_SYS_SPACE - && mach_read_from_4(page + FIL_PAGE_OFFSET) - == TRX_SYS_PAGE_NO) { - msg(cursor->thread_n, - "--incremental backup is impossible if " - "the server had been restarted with " - "different innodb_undo_tablespaces."); - return false; - } - /* updated page */ if (cp->npages == page_size / 4) { /* flush buffer */ diff -Nru mariadb-10.11.11/extra/mariabackup/xtrabackup.cc mariadb-10.11.13/extra/mariabackup/xtrabackup.cc --- mariadb-10.11.11/extra/mariabackup/xtrabackup.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/xtrabackup.cc 2025-05-19 16:14:24.000000000 +0000 @@ -201,8 +201,6 @@ xb_filter_entry_t *name_hash; }; -lsn_t checkpoint_lsn_start; -lsn_t checkpoint_no_start; /** whether log_copying_thread() is active; protected by recv_sys.mutex */ static bool log_copying_running; /** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */ @@ -1383,6 +1381,7 
@@ OPT_XTRA_MYSQLD_ARGS, OPT_XB_IGNORE_INNODB_PAGE_CORRUPTION, OPT_INNODB_FORCE_RECOVERY, + OPT_INNODB_CHECKPOINT, OPT_ARIA_LOG_DIR_PATH }; @@ -1414,8 +1413,9 @@ "The value is used in place of innodb_buffer_pool_size. " "This option is only relevant when the --prepare option is specified.", (G_PTR *) &xtrabackup_use_memory, (G_PTR *) &xtrabackup_use_memory, 0, - GET_LL, REQUIRED_ARG, 100 * 1024 * 1024L, 1024 * 1024L, LONGLONG_MAX, 0, - 1024 * 1024L, 0}, + GET_ULL, REQUIRED_ARG, 96 << 20, innodb_buffer_pool_extent_size, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), + 0, innodb_buffer_pool_extent_size, 0}, {"throttle", OPT_XTRA_THROTTLE, "limit count of IO operations (pairs of read&write) per second to IOS " "values (for '--backup')", @@ -1787,10 +1787,7 @@ static const char *dbug_option; #endif -#ifdef HAVE_URING -extern const char *io_uring_may_be_unsafe; -bool innodb_use_native_aio_default(); -#endif +static my_bool innodb_log_checkpoint_now; struct my_option xb_server_options[] = { @@ -1927,12 +1924,7 @@ "Use native AIO if supported on this platform.", (G_PTR*) &srv_use_native_aio, (G_PTR*) &srv_use_native_aio, 0, GET_BOOL, NO_ARG, -#ifdef HAVE_URING - innodb_use_native_aio_default(), -#else - TRUE, -#endif - 0, 0, 0, 0, 0}, + TRUE, 0, 0, 0, 0, 0}, {"innodb_page_size", OPT_INNODB_PAGE_SIZE, "The universal page size of the database.", (G_PTR*) &innobase_page_size, (G_PTR*) &innobase_page_size, 0, @@ -2019,6 +2011,12 @@ (G_PTR*)&srv_force_recovery, 0, GET_ULONG, OPT_ARG, 0, 0, SRV_FORCE_IGNORE_CORRUPT, 0, 0, 0}, + {"innodb_log_checkpoint_now", OPT_INNODB_CHECKPOINT, + "(for --backup): Force an InnoDB checkpoint", + (G_PTR*)&innodb_log_checkpoint_now, + (G_PTR*)&innodb_log_checkpoint_now, + 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0}, + {"mysqld-args", OPT_XTRA_MYSQLD_ARGS, "All arguments that follow this argument are considered as server " "options, and if some of them are not supported by mariabackup, they " @@ -2482,7 +2480,7 @@ } 
srv_sys_space.normalize_size(); - srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); + srv_lock_table_size = 5 * buf_pool.curr_size(); /* -------------- Log files ---------------------------*/ @@ -2504,11 +2502,8 @@ srv_adaptive_flushing = FALSE; - /* We set srv_pool_size here in units of 1 kB. InnoDB internally - changes the value so that it becomes the number of database pages. */ - - srv_buf_pool_size = (ulint) xtrabackup_use_memory; - srv_buf_pool_chunk_unit = srv_buf_pool_size; + buf_pool.size_in_bytes_max = size_t(xtrabackup_use_memory); + buf_pool.size_in_bytes_requested = buf_pool.size_in_bytes_max; srv_n_read_io_threads = (uint) innobase_read_io_threads; srv_n_write_io_threads = (uint) innobase_write_io_threads; @@ -2534,12 +2529,8 @@ msg("InnoDB: Using Linux native AIO"); } #elif defined(HAVE_URING) - if (!srv_use_native_aio) { - } else if (io_uring_may_be_unsafe) { - msg("InnoDB: Using liburing on this kernel %s may cause hangs;" - " see https://jira.mariadb.org/browse/MDEV-26674", - io_uring_may_be_unsafe); - } else { + + if (srv_use_native_aio) { msg("InnoDB: Using liburing"); } #else @@ -2679,7 +2670,7 @@ } recv_sys.lsn= log_sys.next_checkpoint_lsn= - log_sys.get_lsn() - SIZE_OF_FILE_CHECKPOINT; + log_get_lsn() - SIZE_OF_FILE_CHECKPOINT; log_sys.set_latest_format(false); // not encrypted log_hdr_init(); byte *b= &log_hdr_buf[log_t::START_OFFSET]; @@ -2946,6 +2937,15 @@ const regex_list_t& list, const char* name) { + if (list.empty()) return (FALSE); + + /* + regexec/pcre2_regexec is not threadsafe, also documented. + Serialize access from multiple threads to compiled regexes. 
+ */ + static std::mutex regex_match_mutex; + std::lock_guard lock(regex_match_mutex); + regmatch_t tables_regmatch[1]; for (regex_list_t::const_iterator i = list.begin(), end = list.end(); i != end; ++i) { @@ -5405,6 +5405,14 @@ } msg("cd to %s", mysql_real_data_home); encryption_plugin_backup_init(mysql_connection); + if (innodb_log_checkpoint_now != false && mysql_send_query( + mysql_connection, + C_STRING_WITH_LEN("SET GLOBAL " + "innodb_log_checkpoint_now=ON;"))) { + msg("initiating checkpoint failed"); + return(false); + } + msg("open files limit requested %lu, set to %lu", xb_open_files_limit, xb_set_max_open_files(xb_open_files_limit)); @@ -5517,6 +5525,11 @@ goto fail; } + /* try to wait for a log checkpoint, but do not fail if the + server does not support this */ + if (innodb_log_checkpoint_now != false) { + mysql_read_query_result(mysql_connection); + } /* label it */ recv_sys.file_checkpoint = log_sys.next_checkpoint_lsn; log_hdr_init(); @@ -6230,9 +6243,22 @@ buf + FSP_HEADER_OFFSET + FSP_SIZE); if (mach_read_from_4(buf + FIL_PAGE_SPACE_ID)) { +#ifdef _WIN32 + os_offset_t last_page = + os_file_get_size(dst_file) / + page_size; + + /* os_file_set_size() would + shrink the size of the file */ + if (last_page < n_pages && + !os_file_set_size( + dst_path, dst_file, + n_pages * page_size)) +#else if (!os_file_set_size( dst_path, dst_file, n_pages * page_size)) +#endif /* _WIN32 */ goto error; } else if (fil_space_t* space = fil_system.sys_space) { diff -Nru mariadb-10.11.11/include/json_lib.h mariadb-10.11.13/include/json_lib.h --- mariadb-10.11.11/include/json_lib.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/json_lib.h 2025-05-19 16:14:24.000000000 +0000 @@ -387,7 +387,7 @@ Returns negative integer in the case of an error, the length of the result otherwise. 
*/ -int json_unescape(CHARSET_INFO *json_cs, +int __attribute__((warn_unused_result)) json_unescape(CHARSET_INFO *json_cs, const uchar *json_str, const uchar *json_end, CHARSET_INFO *res_cs, uchar *res, uchar *res_end); @@ -401,7 +401,8 @@ JSON_ERROR_OUT_OF_SPACE Not enough space in the provided buffer JSON_ERROR_ILLEGAL_SYMBOL Source symbol cannot be represented in JSON */ -int json_escape(CHARSET_INFO *str_cs, const uchar *str, const uchar *str_end, +int __attribute__((warn_unused_result)) json_escape(CHARSET_INFO *str_cs, + const uchar *str, const uchar *str_end, CHARSET_INFO *json_cs, uchar *json, uchar *json_end); diff -Nru mariadb-10.11.11/include/my_base.h mariadb-10.11.13/include/my_base.h --- mariadb-10.11.11/include/my_base.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_base.h 2025-05-19 16:14:24.000000000 +0000 @@ -219,7 +219,10 @@ /** Start writing rows during ALTER TABLE...ALGORITHM=COPY. */ HA_EXTRA_BEGIN_ALTER_COPY, /** Finish writing rows during ALTER TABLE...ALGORITHM=COPY. */ - HA_EXTRA_END_ALTER_COPY + HA_EXTRA_END_ALTER_COPY, + /** Abort of writing rows during ALTER TABLE..ALGORITHM=COPY or + CREATE..SELCT */ + HA_EXTRA_ABORT_ALTER_COPY }; /* Compatible option, to be deleted in 6.0 */ diff -Nru mariadb-10.11.11/include/my_cpu.h mariadb-10.11.13/include/my_cpu.h --- mariadb-10.11.11/include/my_cpu.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_cpu.h 2025-05-19 16:14:24.000000000 +0000 @@ -97,7 +97,12 @@ /* Changed from __ppc_get_timebase for musl and clang compatibility */ __builtin_ppc_get_timebase(); #elif defined __GNUC__ && defined __riscv - __builtin_riscv_pause(); + /* The GCC-only __builtin_riscv_pause() or the pause instruction is + encoded like a fence instruction with special parameters. On RISC-V + implementations that do not support arch=+zihintpause this + instruction could be interpreted as a more expensive memory fence; + it should not be an illegal instruction. 
*/ + __asm__ volatile(".long 0x0100000f" ::: "memory"); #elif defined __GNUC__ /* Mainly, prevent the compiler from optimizing away delay loops */ __asm__ __volatile__ ("":::"memory"); diff -Nru mariadb-10.11.11/include/my_stack_alloc.h mariadb-10.11.13/include/my_stack_alloc.h --- mariadb-10.11.11/include/my_stack_alloc.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_stack_alloc.h 2025-05-19 16:14:24.000000000 +0000 @@ -38,6 +38,8 @@ #if defined(__GNUC__) || defined(__clang__) /* GCC and Clang compilers */ #if defined(__i386__) /* Intel x86 (32-bit) */ __asm__ volatile ("movl %%esp, %0" : "=r" (stack_ptr)); +#elif defined(__x86_64__) && defined (__ILP32__) /* Intel x86-64 (64-bit), X32 ABI */ + __asm__ volatile ("movl %%esp, %0" : "=r" (stack_ptr)); #elif defined(__x86_64__) /* Intel x86-64 (64-bit) */ __asm__ volatile ("movq %%rsp, %0" : "=r" (stack_ptr)); #elif defined(__powerpc__) /* PowerPC (32-bit) */ diff -Nru mariadb-10.11.11/include/my_sys.h mariadb-10.11.13/include/my_sys.h --- mariadb-10.11.11/include/my_sys.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_sys.h 2025-05-19 16:14:24.000000000 +0000 @@ -173,9 +173,15 @@ extern void *my_memdup(PSI_memory_key key, const void *from,size_t length,myf MyFlags); extern char *my_strdup(PSI_memory_key key, const char *from,myf MyFlags); extern char *my_strndup(PSI_memory_key key, const char *from, size_t length, myf MyFlags); +extern my_bool my_use_large_pages; -int my_init_large_pages(my_bool super_large_pages); +int my_init_large_pages(void); uchar *my_large_malloc(size_t *size, myf my_flags); +#ifdef _WIN32 +/* On Windows, use my_virtual_mem_reserve() and my_virtual_mem_commit(). 
*/ +#else +char *my_large_virtual_alloc(size_t *size); +#endif void my_large_free(void *ptr, size_t size); void my_large_page_truncate(size_t *size); diff -Nru mariadb-10.11.11/include/my_virtual_mem.h mariadb-10.11.13/include/my_virtual_mem.h --- mariadb-10.11.11/include/my_virtual_mem.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/include/my_virtual_mem.h 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,37 @@ +/* Copyright (c) 2025, MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#pragma once +/* + Functionality for handling virtual memory + (reserve, commit, decommit, release) +*/ +#include /*size_t*/ + +#ifdef __cplusplus +extern "C" { +#endif + +# ifdef _WIN32 +char *my_virtual_mem_reserve(size_t *size); +# endif +char *my_virtual_mem_commit(char *ptr, size_t size); +void my_virtual_mem_decommit(char *ptr, size_t size); +void my_virtual_mem_release(char *ptr, size_t size); + +#ifdef __cplusplus +} +#endif + diff -Nru mariadb-10.11.11/include/source_revision.h mariadb-10.11.13/include/source_revision.h --- mariadb-10.11.11/include/source_revision.h 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/include/source_revision.h 2025-05-19 16:14:28.000000000 +0000 @@ -1 +1 @@ -#define SOURCE_REVISION "e69f8cae1a15e15b9e4f5e0f8497e1f17bdc81a4" +#define SOURCE_REVISION "8fb09426b98583916ccfd4f8c49741adc115bac3" diff -Nru 
mariadb-10.11.11/include/sslopt-longopts.h mariadb-10.11.13/include/sslopt-longopts.h --- mariadb-10.11.11/include/sslopt-longopts.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/sslopt-longopts.h 2025-05-19 16:14:24.000000000 +0000 @@ -51,8 +51,7 @@ #ifdef MYSQL_CLIENT {"ssl-verify-server-cert", 0, - "Verify server's \"Common Name\" in its cert against hostname used " - "when connecting. This option is disabled by default.", + "Verify server's certificate to prevent man-in-the-middle attacks", &opt_ssl_verify_server_cert, &opt_ssl_verify_server_cert, 0, GET_BOOL, OPT_ARG, 0, 0, 0, 0, 0, 0}, #endif diff -Nru mariadb-10.11.11/libmariadb/CMakeLists.txt mariadb-10.11.13/libmariadb/CMakeLists.txt --- mariadb-10.11.11/libmariadb/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -52,7 +52,7 @@ SET(CPACK_PACKAGE_VERSION_MAJOR 3) SET(CPACK_PACKAGE_VERSION_MINOR 3) -SET(CPACK_PACKAGE_VERSION_PATCH 14) +SET(CPACK_PACKAGE_VERSION_PATCH 16) SET(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") MATH(EXPR MARIADB_PACKAGE_VERSION_ID "${CPACK_PACKAGE_VERSION_MAJOR} * 10000 + ${CPACK_PACKAGE_VERSION_MINOR} * 100 + diff -Nru mariadb-10.11.11/libmariadb/include/errmsg.h mariadb-10.11.13/libmariadb/include/errmsg.h --- mariadb-10.11.11/libmariadb/include/errmsg.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/include/errmsg.h 2025-05-19 16:14:27.000000000 +0000 @@ -115,10 +115,11 @@ #define CR_BINLOG_INVALID_FILE 5022 #define CR_BINLOG_SEMI_SYNC_ERROR 5023 #define CR_INVALID_CLIENT_FLAG 5024 +#define CR_ERR_MISSING_ERROR_INFO 5026 /* Always last, if you add new error codes please update the value for CR_MARIADB_LAST_ERROR */ -#define CR_MARIADB_LAST_ERROR CR_INVALID_CLIENT_FLAG +#define CR_MARIADB_LAST_ERROR CR_ERR_MISSING_ERROR_INFO #endif diff -Nru 
mariadb-10.11.11/libmariadb/include/ma_context.h mariadb-10.11.13/libmariadb/include/ma_context.h --- mariadb-10.11.11/libmariadb/include/ma_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/include/ma_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -26,8 +26,33 @@ (This particular implementation uses Posix ucontext swapcontext().) */ + +/* + When running with address sanitizer, the stack switching can cause confusion + unless the __sanitizer_{start,finish}_switch_fiber() functions are used + (CONC-618). + + In this case prefer the use of boost::context or ucontext, which should have + this instrumentation, over our custom assembler variants. +*/ +#ifdef __has_feature + /* Clang */ +# if __has_feature(address_sanitizer) +# define ASAN_PREFER_NON_ASM 1 +# endif +#else + /* GCC */ +# ifdef __SANITIZE_ADDRESS__ +# define ASAN_PREFER_NON_ASM 1 +# endif +#endif + #ifdef _WIN32 #define MY_CONTEXT_USE_WIN32_FIBERS 1 +#elif defined(ASAN_PREFER_NON_ASM) && defined(HAVE_BOOST_CONTEXT_H) +#define MY_CONTEXT_USE_BOOST_CONTEXT +#elif defined(ASAN_PREFER_NON_ASM) && defined(HAVE_UCONTEXT_H) +#define MY_CONTEXT_USE_UCONTEXT #elif defined(__GNUC__) && __GNUC__ >= 3 && defined(__x86_64__) && !defined(__ILP32__) #define MY_CONTEXT_USE_X86_64_GCC_ASM #elif defined(__GNUC__) && __GNUC__ >= 3 && defined(__i386__) diff -Nru mariadb-10.11.11/libmariadb/include/mariadb_com.h mariadb-10.11.13/libmariadb/include/mariadb_com.h --- mariadb-10.11.11/libmariadb/include/mariadb_com.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/include/mariadb_com.h 2025-05-19 16:14:27.000000000 +0000 @@ -423,6 +423,28 @@ double max_value_dbl; }; + /* The following is for user defined functions */ + +typedef struct st_udf_args +{ + unsigned int arg_count; /* Number of arguments */ + enum Item_result *arg_type; /* Pointer to item_results */ + char **args; /* Pointer to argument */ + unsigned long *lengths; /* Length of string arguments */ + char *maybe_null; /* 
Set to 1 for all maybe_null args */ +} UDF_ARGS; + + /* This holds information about the result */ + +typedef struct st_udf_init +{ + my_bool maybe_null; /* 1 if function can return NULL */ + unsigned int decimals; /* for real functions */ + unsigned int max_length; /* For string functions */ + char *ptr; /* free pointer for function data */ + my_bool const_item; /* 0 if result is independent of arguments */ +} UDF_INIT; + /* Connection types */ #define MARIADB_CONNECTION_UNIXSOCKET 0 #define MARIADB_CONNECTION_TCP 1 diff -Nru mariadb-10.11.11/libmariadb/libmariadb/CMakeLists.txt mariadb-10.11.13/libmariadb/libmariadb/CMakeLists.txt --- mariadb-10.11.11/libmariadb/libmariadb/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -168,12 +168,6 @@ mysql_use_result mysql_warning_count) -# some gcc versions fail to compile asm parts of my_context.c, -# if build type is "Release" (see CONC-133), so we need to add -g flag -IF(CMAKE_COMPILER_IS_GNUCC AND CMAKE_BUILD_TYPE MATCHES "Release") - SET_SOURCE_FILES_PROPERTIES(my_context.c PROPERTIES COMPILE_FLAGS -g) -ENDIF() - IF(ZLIB_FOUND AND WITH_EXTERNAL_ZLIB) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) ELSE() diff -Nru mariadb-10.11.11/libmariadb/libmariadb/ma_context.c mariadb-10.11.13/libmariadb/libmariadb/ma_context.c --- mariadb-10.11.11/libmariadb/libmariadb/ma_context.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/ma_context.c 2025-05-19 16:14:27.000000000 +0000 @@ -105,9 +105,23 @@ c->user_func= f; c->user_data= d; c->active= 1; + u.a[1]= 0; /* Otherwise can give uninitialized warnings on 32-bit. */ u.p= c; + /* + makecontext function expects function pointer to receive multiple + ints as an arguments, however is declared in ucontext.h header with + a void (empty) argument list. Ignore clang cast-function-type-strict + warning for this function call. 
+ */ +# ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-function-type-strict" +# endif makecontext(&c->spawned_context, (uc_func_t)my_context_spawn_internal, 2, u.a[0], u.a[1]); +# ifdef __clang__ +# pragma clang diagnostic pop +# endif return my_context_continue(c); } @@ -204,7 +218,7 @@ ( "movq %%rsp, (%[save])\n\t" "movq %[stack], %%rsp\n\t" -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) || __clang__) && !defined(__INTEL_COMPILER) +#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || (defined(__clang__) && __clang_major__ < 13) /* This emits a DWARF DW_CFA_undefined directive to make the return address undefined. This indicates that this is the top of the stack frame, and @@ -440,7 +454,7 @@ ( "movl %%esp, (%[save])\n\t" "movl %[stack], %%esp\n\t" -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) || __clang__) && !defined(__INTEL_COMPILER) +#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || (defined(__clang__) && __clang_major__ < 13) /* This emits a DWARF DW_CFA_undefined directive to make the return address undefined. This indicates that this is the top of the stack frame, and @@ -675,7 +689,7 @@ ( "mov x10, sp\n\t" "mov sp, %[stack]\n\t" -#if !defined(__INTEL_COMPILER) +#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || (defined(__clang__) && __clang_major__ < 13) /* This emits a DWARF DW_CFA_undefined directive to make the return address (UNW_AARCH64_X30) undefined. 
This indicates that this is the top of the @@ -724,7 +738,11 @@ [stack] "+r" (stack) : [save] "r" (save) : "x3", "x4", "x5", "x6", "x7", - "x9", "x10", "x11", "x14", "x15", "x18", "x30", + "x9", "x10", "x11", "x14", "x15", +#if defined(__linux__) && !defined(__ANDROID__) + "x18", +#endif + "x30", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", @@ -827,7 +845,11 @@ : [ret] "=r" (ret) : [save] "r" (save) : "x1", "x2", "x3", "x4", "x5", "x6", "x7", - "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x18", "x30", + "x9", "x10", "x11", "x12", "x13", "x14", "x15", +#if defined(__linux__) && !defined(__ANDROID__) + "x18", +#endif + "x30", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", @@ -904,7 +926,11 @@ : : [save] "r" (save) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", - "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x18", "x30", + "x9", "x10", "x11", "x12", "x13", "x14", "x15", +#if defined(__linux__) && !defined(__ANDROID__) + "x18", +#endif + "x30", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", diff -Nru mariadb-10.11.11/libmariadb/libmariadb/ma_errmsg.c mariadb-10.11.13/libmariadb/libmariadb/ma_errmsg.c --- mariadb-10.11.11/libmariadb/libmariadb/ma_errmsg.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/ma_errmsg.c 2025-05-19 16:14:27.000000000 +0000 @@ -119,6 +119,8 @@ /* 5022 */ "File '%s' is not a binary log file", /* 5023 */ "Semi sync request error: %s", /* 5024 */ "Invalid client flags (%lu) specified. 
Supported flags: %lu", + /* 5025 */ "", + /* 5026 */ "Server returned an error packet without further information", "" }; diff -Nru mariadb-10.11.11/libmariadb/libmariadb/mariadb_lib.c mariadb-10.11.13/libmariadb/libmariadb/mariadb_lib.c --- mariadb-10.11.11/libmariadb/libmariadb/mariadb_lib.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/mariadb_lib.c 2025-05-19 16:14:27.000000000 +0000 @@ -81,7 +81,7 @@ #define strncasecmp _strnicmp #endif -#define ASYNC_CONTEXT_DEFAULT_STACK_SIZE (4096*15) +#define ASYNC_CONTEXT_DEFAULT_STACK_SIZE (256*1024) #define MA_RPL_VERSION_HACK "5.5.5-" #define CHARSET_NAME_LEN 64 @@ -274,6 +274,11 @@ ma_strmake(net->last_error,(char*) pos, min(len,sizeof(net->last_error)-1)); } + /* MDEV-35935: if server sends error packet without error, we have to + set error manually */ + if (!net->last_errno) { + my_set_error(mysql, CR_ERR_MISSING_ERROR_INFO, SQLSTATE_UNKNOWN, 0); + } } else { @@ -402,7 +407,7 @@ /* CONC-589: If reconnect option was specified, we have to check if the connection (socket) is still available */ - if (command != COM_QUIT && mysql->options.reconnect && ma_pvio_is_alive(mysql->net.pvio)) + if (command != COM_QUIT && mysql->options.reconnect && !ma_pvio_is_alive(mysql->net.pvio)) { ma_pvio_close(mysql->net.pvio); mysql->net.pvio= NULL; diff -Nru mariadb-10.11.11/libmariadb/libmariadb/mariadb_stmt.c mariadb-10.11.13/libmariadb/libmariadb/mariadb_stmt.c --- mariadb-10.11.11/libmariadb/libmariadb/mariadb_stmt.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/mariadb_stmt.c 2025-05-19 16:14:27.000000000 +0000 @@ -425,6 +425,9 @@ stmt->bind[i].is_null= &stmt->bind[i].is_null_value; *stmt->bind[i].is_null= 1; stmt->bind[i].u.row_ptr= NULL; + if (!stmt->bind[i].length) + stmt->bind[i].length= &stmt->bind[i].length_value; + *stmt->bind[i].length= stmt->bind[i].length_value= 0; } } else { @@ -437,6 +440,9 @@ if (stmt->result_callback) 
stmt->result_callback(stmt->user_data, i, &row); else { + if (!stmt->bind[i].is_null) + stmt->bind[i].is_null= &stmt->bind[i].is_null_value; + *stmt->bind[i].is_null= 0; if (mysql_ps_fetch_functions[stmt->fields[i].type].pack_len >= 0) length= mysql_ps_fetch_functions[stmt->fields[i].type].pack_len; else diff -Nru mariadb-10.11.11/libmariadb/plugins/pvio/pvio_socket.c mariadb-10.11.13/libmariadb/plugins/pvio/pvio_socket.c --- mariadb-10.11.11/libmariadb/plugins/pvio/pvio_socket.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/plugins/pvio/pvio_socket.c 2025-05-19 16:14:27.000000000 +0000 @@ -1101,10 +1101,10 @@ res= poll(&poll_fd, 1, 0); if (res <= 0) /* timeout or error */ - return FALSE; + return TRUE; if (!(poll_fd.revents & (POLLIN | POLLPRI))) - return FALSE; - return TRUE; + return TRUE; + return FALSE; #else /* We can't use the WSAPoll function, it's broken :-( (see Windows 8 Bugs 309411 - WSAPoll does not report failed connections) @@ -1117,8 +1117,8 @@ res= select((int)csock->socket + 1, &sfds, NULL, NULL, &tv); if (res > 0 && FD_ISSET(csock->socket, &sfds)) - return TRUE; - return FALSE; + return FALSE; + return TRUE; #endif } /* }}} */ diff -Nru mariadb-10.11.11/libmariadb/unittest/libmariadb/connection.c mariadb-10.11.13/libmariadb/unittest/libmariadb/connection.c --- mariadb-10.11.11/libmariadb/unittest/libmariadb/connection.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/unittest/libmariadb/connection.c 2025-05-19 16:14:27.000000000 +0000 @@ -2339,6 +2339,7 @@ MYSQL *mysql; int i; const char *ciphers[3]= {"TLS_AES_128_GCM_SHA256", "TLS_AES_256_GCM_SHA384", "TLS_CHACHA20_POLY1305_SHA256"}; + my_bool verify= 0; SKIP_MAXSCALE; @@ -2348,6 +2349,7 @@ mysql= mysql_init(NULL); mysql_ssl_set(mysql, NULL, NULL, NULL, NULL, NULL); + mysql_optionsv(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, &verify); mysql_optionsv(mysql, MYSQL_OPT_SSL_CIPHER, ciphers[i]); if (!my_test_connect(mysql, hostname, username, @@ -2370,7 +2372,6 
@@ static int test_conc589(MYSQL *my) { MYSQL *mysql= mysql_init(NULL); - MYSQL_RES *result; int rc; my_bool reconnect= 1, verify= 0; unsigned long last_thread_id= 0; @@ -2391,15 +2392,85 @@ check_mysql_rc(rc, mysql); last_thread_id= mysql_thread_id(mysql); + rc= mysql_query(mysql, "SET @a:=1"); + check_mysql_rc(rc, mysql); + + sleep(10); + + rc= mysql_query(mysql, "SET @a:=2"); + check_mysql_rc(rc, mysql); + FAIL_IF(mysql_thread_id(mysql) == last_thread_id, "Expected new connection id"); + last_thread_id= mysql_thread_id(mysql); + + mysql_kill(my, last_thread_id); + + sleep(10); + + rc= mysql_query(mysql, "SET @a:=3"); + check_mysql_rc(rc, mysql); + FAIL_IF(mysql_thread_id(mysql) == last_thread_id, "Expected new connection id"); + mysql_close(mysql); + return OK; +} + +#ifdef WIN32 +static int test_conc760(MYSQL *my) +{ + MYSQL *mysql= mysql_init(NULL); + MYSQL_RES *result; + MYSQL_ROW row; + int rc; + char named_pipe_name[128]; + my_bool reconnect= 1, verify= 0; + unsigned long last_thread_id= 0; + unsigned int protocol= MYSQL_PROTOCOL_PIPE; + my_bool have_named_pipe= 0; + + SKIP_MAXSCALE; + + rc= mysql_query(my, "select @@named_pipe, @@socket"); + check_mysql_rc(rc, mysql); + + if ((result= mysql_store_result(my))) + { + if((row= mysql_fetch_row(result))) + have_named_pipe= atoi(row[0]); + strncpy(named_pipe_name, row[1], sizeof(named_pipe_name)-1); + named_pipe_name[sizeof(named_pipe_name)-1]= '\0'; + mysql_free_result(result); + } + + if (!have_named_pipe) + { + diag("Server doesn't support named pipes"); + return SKIP; + } + + mysql_options(mysql, MYSQL_OPT_RECONNECT, &reconnect); + mysql_options(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, &verify); + mysql_options(mysql, MYSQL_OPT_PROTOCOL, &protocol); + + if (!my_test_connect(mysql, hostname, username, + password, schema, port, named_pipe_name, CLIENT_REMEMBER_OPTIONS)) + { + diag("error: %s", mysql_error(mysql)); + return FAIL; + } + + rc= mysql_query(mysql, "SET SESSION wait_timeout=5"); + check_mysql_rc(rc, 
mysql); + + last_thread_id= mysql_thread_id(mysql); if ((rc= mysql_query(mysql, "SELECT 1")) || (result= mysql_store_result(mysql)) == NULL) check_mysql_rc(rc, mysql); mysql_free_result(result); sleep(10); - if ((rc= mysql_query(mysql, "SELECT 2")) || (result= mysql_store_result(mysql)) == NULL) - check_mysql_rc(rc, mysql); - mysql_free_result(result); + rc= mysql_query(mysql, "SELECT 2"); + check_mysql_rc(rc, mysql); + if (result= mysql_store_result(mysql)) + mysql_free_result(result); FAIL_IF(mysql_thread_id(mysql) == last_thread_id, "Expected new connection id"); last_thread_id= mysql_thread_id(mysql); @@ -2414,8 +2485,12 @@ mysql_close(mysql); return OK; } +#endif struct my_tests_st my_tests[] = { +#ifdef WIN32 + {"test_conc760", test_conc760, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, +#endif {"test_conc589", test_conc589, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, #ifdef HAVE_test_conc748 {"test_conc748", test_conc748, TEST_CONNECTION_NONE, 0, NULL, NULL}, diff -Nru mariadb-10.11.11/libmariadb/unittest/libmariadb/errors.c mariadb-10.11.13/libmariadb/unittest/libmariadb/errors.c --- mariadb-10.11.11/libmariadb/unittest/libmariadb/errors.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/unittest/libmariadb/errors.c 2025-05-19 16:14:27.000000000 +0000 @@ -272,8 +272,82 @@ return OK; } +#define TEST_ARRAY_SIZE 1024 + +static int test_mdev35935(MYSQL *mysql) +{ + MYSQL_STMT *stmt= mysql_stmt_init(mysql); + const char *stmt_str= "INSERT INTO bulk1 (a,b) VALUES (?,?)"; + unsigned int array_size= TEST_ARRAY_SIZE; + int rc; + unsigned int i; + char **buffer; + unsigned long *lengths; + unsigned int *vals; + MYSQL_BIND bind[2]; + const char *data= "test"; + + SKIP_MAXSCALE; + SKIP_MYSQL(mysql); + + rc= mysql_select_db(mysql, schema); + + rc= mysql_query(mysql, "DROP TABLE IF EXISTS bulk1"); + check_mysql_rc(rc, mysql); + + rc= mysql_query(mysql, "CREATE TABLE bulk1 (a int , b VARCHAR(255))"); + check_mysql_rc(rc, mysql); + + rc= mysql_stmt_prepare(stmt, 
SL(stmt_str)); + check_stmt_rc(rc, stmt); + + rc= mysql_query(mysql, "ALTER TABLE bulk1 ADD c int"); + check_mysql_rc(rc, mysql); + + /* allocate memory */ + buffer= calloc(TEST_ARRAY_SIZE, sizeof(char *)); + lengths= calloc(TEST_ARRAY_SIZE, sizeof *lengths); + vals= calloc(TEST_ARRAY_SIZE, sizeof *vals); + + for (i=0; i < TEST_ARRAY_SIZE; i++) + { + buffer[i]= (void *)data; + lengths[i]= -1; + vals[i]= i; + } + + memset(bind, 0, sizeof(MYSQL_BIND) * 2); + bind[0].buffer_type= MYSQL_TYPE_LONG; + bind[0].buffer= vals; + bind[1].buffer_type= MYSQL_TYPE_STRING; + bind[1].buffer= (void *)buffer; + bind[1].length= (unsigned long *)lengths; + + rc= mysql_stmt_attr_set(stmt, STMT_ATTR_ARRAY_SIZE, &array_size); + check_stmt_rc(rc, stmt); + + rc= mysql_stmt_bind_param(stmt, bind); + check_stmt_rc(rc, stmt); + + if ((rc= mysql_stmt_execute(stmt))) + { + FAIL_IF((!mysql_stmt_errno(stmt) || !mysql_errno(mysql)), "Error number > 0 expected"); + } + + mysql_stmt_close(stmt); + rc= mysql_query(mysql, "DROP TABLE IF EXISTS bulk1"); + check_mysql_rc(rc, mysql); + + free(buffer); + free(lengths); + free(vals); + return OK; +} + + struct my_tests_st my_tests[] = { + {"test_mdev35935", test_mdev35935, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, {"test_client_warnings", test_client_warnings, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, {"test_ps_client_warnings", test_ps_client_warnings, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, {"test_server_warnings", test_server_warnings, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, diff -Nru mariadb-10.11.11/libmariadb/unittest/libmariadb/ps_bugs.c mariadb-10.11.13/libmariadb/unittest/libmariadb/ps_bugs.c --- mariadb-10.11.11/libmariadb/unittest/libmariadb/ps_bugs.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/unittest/libmariadb/ps_bugs.c 2025-05-19 16:14:27.000000000 +0000 @@ -5001,7 +5001,7 @@ for (i=0; i < 10; i++, frac=frac*10+i) { - unsigned long expected= 0; + unsigned int expected= frac; sprintf(query, "SELECT '2018-11-05 
22:25:59.%ld'", frac); diag("%d: %s", i, query); @@ -5027,11 +5027,15 @@ diag("second_part: %ld", tm.second_part); - expected= i > 6 ? 123456 : frac * (unsigned int)powl(10, (6 - i)); + while (expected && expected < 100000) + expected *= 10; + while (expected >= 1000000) + expected /= 10; if (tm.second_part != expected) { - diag("Error: tm.second_part=%ld expected=%ld", tm.second_part, expected); + diag("Error: tm.second_part=%ld expected=%d", tm.second_part, expected); + mysql_stmt_close(stmt); return FAIL; } } @@ -5618,6 +5622,7 @@ rc= mysql_stmt_attr_set(stmt, STMT_ATTR_CB_PARAM, conc623_param_callback); check_stmt_rc(rc, stmt); + memset(&bind, 0, sizeof(MYSQL_BIND)); bind.buffer_type= MYSQL_TYPE_LONG; rc= mysql_stmt_bind_param(stmt, &bind); check_stmt_rc(rc, stmt); @@ -5910,9 +5915,50 @@ return OK; } +static int test_conc762(MYSQL *mysql) +{ + int rc; + MYSQL_STMT *stmt= mysql_stmt_init(mysql); + MYSQL_BIND bind[2]; + my_bool is_null[2]= {1,1}; + unsigned long length[2]= {1,1}; + + rc= mysql_stmt_prepare(stmt, SL("SELECT NULL, 'foo'")); + check_stmt_rc(rc, stmt); + + memset(&bind, 0, sizeof(MYSQL_BIND) * 2); + + bind[0].buffer_type = MYSQL_TYPE_STRING; + bind[1].buffer_type = MYSQL_TYPE_STRING; + bind[0].is_null= &is_null[0]; + bind[1].is_null= &is_null[1]; + bind[0].buffer_length= bind[1].buffer_length= 0; + bind[0].length= &length[0]; + bind[1].length= &length[1]; + + rc= mysql_stmt_execute(stmt); + check_stmt_rc(rc, stmt); + + rc= mysql_stmt_bind_result(stmt, bind); + + mysql_stmt_fetch(stmt); + FAIL_IF(is_null[0]==0, "Expected NULL value"); + FAIL_IF(is_null[1]==1, "Expected non NULL value"); + FAIL_IF(length[0]!=0, "Expected length=0"); + FAIL_IF(length[1]!=3, "Expected length=3"); + +// FAIL_IF(length[0] != 0, "Expected length=0"); + +//FAIL_IF(length[1] != 3, "Expected length=3)"; + + mysql_stmt_close(stmt); + return OK; +} + struct my_tests_st my_tests[] = { {"test_conc702", test_conc702, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, + {"test_conc762", 
test_conc762, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {"test_conc176", test_conc176, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {"test_conc739", test_conc739, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {"test_conc633", test_conc633, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, diff -Nru mariadb-10.11.11/mysql-test/CMakeLists.txt mariadb-10.11.13/mysql-test/CMakeLists.txt --- mariadb-10.11.11/mysql-test/CMakeLists.txt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/CMakeLists.txt 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA -INSTALL_MYSQL_TEST("." ".") +INSTALL_MYSQL_TEST("." "") IF(NOT ${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) # Enable running mtr from build directory diff -Nru mariadb-10.11.11/mysql-test/include/long_test.inc mariadb-10.11.13/mysql-test/include/long_test.inc --- mariadb-10.11.11/mysql-test/include/long_test.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/include/long_test.inc 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ # We use this --source include to mark a test as taking long to run. # We can use this to schedule such test early (to not be left with -# only one or two long tests running, and rests of works idle), or to +# only one or two long tests running, and rests of workers idle), or to # run a quick test skipping long-running test cases. 
--source include/no_valgrind_without_big.inc diff -Nru mariadb-10.11.11/mysql-test/lib/My/SafeProcess/safe_process.cc mariadb-10.11.13/mysql-test/lib/My/SafeProcess/safe_process.cc --- mariadb-10.11.11/mysql-test/lib/My/SafeProcess/safe_process.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/lib/My/SafeProcess/safe_process.cc 2025-05-19 16:14:24.000000000 +0000 @@ -220,6 +220,7 @@ pid_t own_pid= getpid(); pid_t parent_pid= getppid(); bool nocore = false; + int open_files_limit = 1024; struct sigaction sa,sa_abort; sa.sa_handler= handle_signal; @@ -268,7 +269,14 @@ } else if ( strncmp (arg, "--env ", 6) == 0 ) { - putenv(strdup(arg+6)); + putenv(strdup(arg+6)); + } + else if ( strncmp(arg, "--open-files-limit=", 19) == 0 ) + { + const char* start = arg + 19; + open_files_limit = atoi(start); + if (open_files_limit <= 0) + die("Invalid value '%s' passed to --open-files-limit", start); } else die("Unknown option: %s", arg); @@ -318,11 +326,8 @@ if (nocore) setlimit(RLIMIT_CORE, 0, 0); - /* - mysqld defaults depend on that. 
make test results stable and independent - from the environment - */ - setlimit(RLIMIT_NOFILE, 1024, 1024); + // Set open files limit + setlimit(RLIMIT_NOFILE, open_files_limit, open_files_limit); // Signal that child is ready buf= 37; diff -Nru mariadb-10.11.11/mysql-test/lib/My/SafeProcess.pm mariadb-10.11.13/mysql-test/lib/My/SafeProcess.pm --- mariadb-10.11.11/mysql-test/lib/My/SafeProcess.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/lib/My/SafeProcess.pm 2025-05-19 16:14:24.000000000 +0000 @@ -138,6 +138,7 @@ my $error = delete($opts{'error'}); my $verbose = delete($opts{'verbose'}) || $::opt_verbose; my $nocore = delete($opts{'nocore'}); + my $open_files_limit = delete($opts{'open_files_limit'}); my $host = delete($opts{'host'}); my $shutdown = delete($opts{'shutdown'}); my $user_data= delete($opts{'user_data'}); @@ -161,6 +162,8 @@ push(@safe_args, "--verbose") if $verbose > 0; push(@safe_args, "--nocore") if $nocore; + push(@safe_args, "--open-files-limit=$open_files_limit") if $open_files_limit; + # Point the safe_process at the right parent if running on cygwin push(@safe_args, "--parent-pid=".Cygwin::pid_to_winpid($$)) if IS_CYGWIN; diff -Nru mariadb-10.11.11/mysql-test/main/backup_locks.test mariadb-10.11.13/mysql-test/main/backup_locks.test --- mariadb-10.11.11/mysql-test/main/backup_locks.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/backup_locks.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,7 @@ # Tests BACKUP STAGE locking ######################################################################## +--source include/long_test.inc --source include/have_innodb.inc --source include/have_metadata_lock_info.inc --source include/not_embedded.inc diff -Nru mariadb-10.11.11/mysql-test/main/comment_database.result mariadb-10.11.13/mysql-test/main/comment_database.result --- mariadb-10.11.11/mysql-test/main/comment_database.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/comment_database.result 2025-05-19 16:14:24.000000000 +0000 @@ -76,3 +76,16 @@ CATALOG_NAME SCHEMA_NAME DEFAULT_CHARACTER_SET_NAME DEFAULT_COLLATION_NAME SQL_PATH SCHEMA_COMMENT def comment latin2 latin2_general_ci NULL comment DROP DATABASE comment; +CREATE DATABASE db1; +# restart +SHOW CREATE DATABASE db1; +Database Create Database +db1 CREATE DATABASE `db1` /*!40100 DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci */ +Warnings: +Note 1105 Database 'db1' does not have a db.opt file. You can create one with ALTER DATABASE if needed +SHOW CREATE DATABASE db1; +Database Create Database +db1 CREATE DATABASE `db1` /*!40100 DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci */ +Warnings: +Note 1105 Database 'db1' does not have a db.opt file. You can create one with ALTER DATABASE if needed +DROP DATABASE db1; diff -Nru mariadb-10.11.11/mysql-test/main/comment_database.test mariadb-10.11.13/mysql-test/main/comment_database.test --- mariadb-10.11.11/mysql-test/main/comment_database.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/comment_database.test 2025-05-19 16:14:24.000000000 +0000 @@ -63,3 +63,11 @@ WHERE schema_name='comment'; DROP DATABASE comment; --enable_service_connection + +CREATE DATABASE db1; +--remove_file $MARIADB_DATADIR/db1/db.opt +--source include/restart_mysqld.inc +# We need to call this two times to ensure all code paths are used +SHOW CREATE DATABASE db1; +SHOW CREATE DATABASE db1; +DROP DATABASE db1; diff -Nru mariadb-10.11.11/mysql-test/main/ctype_utf8_def_upgrade.result mariadb-10.11.13/mysql-test/main/ctype_utf8_def_upgrade.result --- mariadb-10.11.11/mysql-test/main/ctype_utf8_def_upgrade.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/ctype_utf8_def_upgrade.result 2025-05-19 16:14:24.000000000 +0000 @@ -53,6 +53,8 @@ SHOW CREATE DATABASE db1; Database Create Database db1 CREATE DATABASE `db1` /*!40100 DEFAULT CHARACTER SET utf8mb3 COLLATE 
utf8mb3_general_ci */ +Warnings: +Note 1105 Database 'db1' does not have a db.opt file. You can create one with ALTER DATABASE if needed USE db1; SELECT @@character_set_database, 'taken from defaults' AS comment; @@character_set_database comment diff -Nru mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.result mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.result --- mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.result 2025-05-19 16:14:24.000000000 +0000 @@ -11761,9 +11761,8 @@ EXPLAIN INSERT INTO t1 SELECT * FROM ( SELECT t1.f FROM v1 JOIN t1 ) AS t WHERE f IS NOT NULL; id select_type table type possible_keys key key_len ref rows Extra -1 PRIMARY ALL NULL NULL NULL NULL 144 Using where -2 DERIVED ALL NULL NULL NULL NULL 12 -2 DERIVED t1 ALL NULL NULL NULL NULL 12 Using where; Using join buffer (flat, BNL join) +1 PRIMARY ALL NULL NULL NULL NULL 12 Using temporary +1 PRIMARY t1 ALL NULL NULL NULL NULL 12 Using where; Using join buffer (flat, BNL join) 4 DERIVED t1 ALL NULL NULL NULL NULL 12 EXPLAIN FORMAT=JSON INSERT INTO t1 SELECT * FROM ( SELECT t1.f FROM v1 JOIN t1 ) AS t WHERE f IS NOT NULL; @@ -11771,61 +11770,47 @@ { "query_block": { "select_id": 1, - "nested_loop": [ - { - "table": { - "table_name": "", - "access_type": "ALL", - "rows": 144, - "filtered": 100, - "attached_condition": "t.f is not null", - "materialized": { - "query_block": { - "select_id": 2, - "nested_loop": [ - { - "table": { - "table_name": "", - "access_type": "ALL", - "rows": 12, - "filtered": 100, - "materialized": { - "query_block": { - "select_id": 4, - "nested_loop": [ - { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 12, - "filtered": 100 - } - } - ] - } - } - } - }, - { - "block-nl-join": { + "temporary_table": { + "nested_loop": [ + { + "table": { + "table_name": "", + "access_type": "ALL", + "rows": 12, + "filtered": 100, + 
"materialized": { + "query_block": { + "select_id": 4, + "nested_loop": [ + { "table": { "table_name": "t1", "access_type": "ALL", "rows": 12, - "filtered": 100, - "attached_condition": "t1.f is not null" - }, - "buffer_type": "flat", - "buffer_size": "64", - "join_type": "BNL" + "filtered": 100 + } } - } - ] + ] + } } } + }, + { + "block-nl-join": { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 12, + "filtered": 100, + "attached_condition": "t1.f is not null" + }, + "buffer_type": "flat", + "buffer_size": "64", + "join_type": "BNL" + } } - } - ] + ] + } } } SELECT * FROM t1; @@ -11854,62 +11839,48 @@ { "query_block": { "select_id": 1, - "nested_loop": [ - { - "table": { - "table_name": "", - "access_type": "ALL", - "rows": 16, - "filtered": 100, - "attached_condition": "t.f is not null", - "materialized": { - "query_block": { - "select_id": 2, - "nested_loop": [ - { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 8, - "filtered": 100, - "attached_condition": "t1.f is not null" - } - }, - { - "table": { - "table_name": "", - "access_type": "ref", - "possible_keys": ["key0"], - "key": "key0", - "key_length": "4", - "used_key_parts": ["f"], - "ref": ["test.t1.f"], - "rows": 2, - "filtered": 100, - "materialized": { - "query_block": { - "select_id": 4, - "nested_loop": [ - { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 8, - "filtered": 100, - "attached_condition": "t1.f is not null" - } - } - ] - } + "temporary_table": { + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 8, + "filtered": 100, + "attached_condition": "t1.f is not null" + } + }, + { + "table": { + "table_name": "", + "access_type": "ref", + "possible_keys": ["key0"], + "key": "key0", + "key_length": "4", + "used_key_parts": ["f"], + "ref": ["test.t1.f"], + "rows": 2, + "filtered": 100, + "materialized": { + "query_block": { + "select_id": 4, + "nested_loop": [ + { + "table": { + "table_name": 
"t1", + "access_type": "ALL", + "rows": 8, + "filtered": 100, + "attached_condition": "t1.f is not null" } } - } - ] + ] + } } } } - } - ] + ] + } } } SELECT * FROM t1; @@ -21669,6 +21640,27 @@ GROUP BY 1 ; ( SELECT 1 FROM ( SELECT 1 FROM cte1) dt GROUP BY x HAVING x= 1 ) 1 +create table t1 (f int); +create view v1 as select f, count(*) c from t1 group by f; +# +# MDEV-25012 Server crash in find_field_in_tables, Assertion `name' failed in find_field_in_table_ref +# +select * from v1 where export_set(1, default(f), 'x', aes_decrypt('secret', f)); +f c +show warnings; +Level Code Message +drop view v1; +drop table t1; +create table t(c3 longtext) ; +with cte1 as +( +select default(c3) as a +from t group by 1 +) +select * from cte1 +where cte1.a >= 1; +a +drop table t; # End of 10.5 tests # # MDEV-28958: condition pushable into view after simplification diff -Nru mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.test mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.test --- mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.test 2025-05-19 16:14:24.000000000 +0000 @@ -4271,6 +4271,28 @@ FROM cte2 GROUP BY 1 ; +create table t1 (f int); +create view v1 as select f, count(*) c from t1 group by f; + +--echo # +--echo # MDEV-25012 Server crash in find_field_in_tables, Assertion `name' failed in find_field_in_table_ref +--echo # +select * from v1 where export_set(1, default(f), 'x', aes_decrypt('secret', f)); +show warnings; +# cleanup +drop view v1; +drop table t1; + +create table t(c3 longtext) ; +with cte1 as +( + select default(c3) as a + from t group by 1 +) +select * from cte1 +where cte1.a >= 1; +drop table t; + --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/derived_view.result mariadb-10.11.13/mysql-test/main/derived_view.result --- mariadb-10.11.11/mysql-test/main/derived_view.result 2025-01-30 11:01:23.000000000 +0000 
+++ mariadb-10.11.13/mysql-test/main/derived_view.result 2025-05-19 16:14:24.000000000 +0000 @@ -2461,6 +2461,8 @@ a 1 1 +1 +1 drop table t1,t2; set optimizer_switch=@save968720_optimizer_switch; # diff -Nru mariadb-10.11.11/mysql-test/main/func_json.result mariadb-10.11.13/mysql-test/main/func_json.result --- mariadb-10.11.11/mysql-test/main/func_json.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_json.result 2025-05-19 16:14:24.000000000 +0000 @@ -1766,6 +1766,43 @@ data # +# MDEV-35614 JSON_UNQUOTE doesn't work with emojis +# +SELECT HEX(JSON_UNQUOTE('"\\ud83d\\ude0a"')) as hex_smiley; +hex_smiley +F09F988A +set names utf8mb4; +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') as smiley; +smiley +😊 +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') = JSON_UNQUOTE('"\\ud83d\\ude0a"') as equal_smileys; +equal_smileys +1 +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') <= JSON_UNQUOTE('"\\ud83d\\ude0a"') as less_or_equal_smileys; +less_or_equal_smileys +1 +set @v='{ "color":"😊" }'; +select @v as v, collation(@v) as collation_v; +v collation_v +{ "color":"😊" } utf8mb4_general_ci +select json_valid(@v) as valid; +valid +1 +select json_extract(@v,'$.color') as color_extraction, collation(json_extract(@v,'$.color')) as color_extraction_collation; +color_extraction color_extraction_collation +"😊" utf8mb4_general_ci +select json_unquote(json_extract(@v,'$.color')) as unquoted, collation(json_unquote(json_extract(@v,'$.color'))) as unquoted_collation; +unquoted unquoted_collation +😊 utf8mb4_bin +SELECT JSON_UNQUOTE('"\\uc080\\ude0a"') as invalid_utf8mb4; +invalid_utf8mb4 +"\uc080\ude0a" +Warnings: +Warning 4035 Broken JSON string in argument 1 to function 'json_unquote' at position 13 +show warnings; +Level Code Message +Warning 4035 Broken JSON string in argument 1 to function 'json_unquote' at position 13 +# # End of 10.6 tests # # diff -Nru mariadb-10.11.11/mysql-test/main/func_json.test mariadb-10.11.13/mysql-test/main/func_json.test --- 
mariadb-10.11.11/mysql-test/main/func_json.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_json.test 2025-05-19 16:14:24.000000000 +0000 @@ -1194,6 +1194,7 @@ SET @@collation_connection= @save_collation_connection; + --echo # --echo # End of 10.5 tests --echo # @@ -1231,6 +1232,27 @@ data FROM JSON_TABLE (@data, '$[*]' COLUMNS (data text PATH '$.Data')) AS t; + +--echo # +--echo # MDEV-35614 JSON_UNQUOTE doesn't work with emojis +--echo # + +SELECT HEX(JSON_UNQUOTE('"\\ud83d\\ude0a"')) as hex_smiley; +set names utf8mb4; +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') as smiley; + +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') = JSON_UNQUOTE('"\\ud83d\\ude0a"') as equal_smileys; +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') <= JSON_UNQUOTE('"\\ud83d\\ude0a"') as less_or_equal_smileys; + +set @v='{ "color":"😊" }'; +select @v as v, collation(@v) as collation_v; +select json_valid(@v) as valid; +select json_extract(@v,'$.color') as color_extraction, collation(json_extract(@v,'$.color')) as color_extraction_collation; +select json_unquote(json_extract(@v,'$.color')) as unquoted, collation(json_unquote(json_extract(@v,'$.color'))) as unquoted_collation; + +SELECT JSON_UNQUOTE('"\\uc080\\ude0a"') as invalid_utf8mb4; +show warnings; + --echo # --echo # End of 10.6 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/func_like.result mariadb-10.11.13/mysql-test/main/func_like.result --- mariadb-10.11.11/mysql-test/main/func_like.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_like.result 2025-05-19 16:14:24.000000000 +0000 @@ -424,3 +424,22 @@ Note 1003 select 1 like `test`.`t1`.`c1` | `test`.`t1`.`c2` AS `1 LIKE c1|c2`,1 like `test`.`t1`.`c1` & `test`.`t1`.`c2` AS `1 LIKE c1&c2`,1 like `test`.`t1`.`c2` >> `test`.`t1`.`c1` AS `1 LIKE c2>>c1`,2 like `test`.`t1`.`c2` << `test`.`t1`.`c1` AS `2 LIKE c2< 0 AS `1 LIKE c1||c2`,2 like `test`.`t1`.`c1` + `test`.`t1`.`c2` AS `2 LIKE c1+c2`,-1 like `test`.`t1`.`c1` - 
`test`.`t1`.`c2` AS `-1 LIKE c1-c2`,2 like `test`.`t1`.`c1` * `test`.`t1`.`c2` AS `2 LIKE c1*c2`,0.5000 like `test`.`t1`.`c1` / `test`.`t1`.`c2` AS `0.5000 LIKE c1/c2`,0 like `test`.`t1`.`c1` DIV `test`.`t1`.`c2` AS `0 LIKE c1 DIV c2`,0 like `test`.`t1`.`c1` MOD `test`.`t1`.`c2` AS `0 LIKE c1 MOD c2` from `test`.`t1` order by `test`.`t1`.`c2` DROP VIEW v1; DROP TABLE t1; +# +# MDEV-36211 Incorrect query result for binary_column NOT LIKE binary_column +# +CREATE TABLE t1 (c1 BLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +c1 +SELECT c1 FROM t1 WHERE c1 LIKE c1; +c1 +1 +DROP TABLE t1; +CREATE TABLE t1 (c1 BLOB); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +c1 +SELECT c1 FROM t1 WHERE c1 LIKE c1; +c1 +1 +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/func_like.test mariadb-10.11.13/mysql-test/main/func_like.test --- mariadb-10.11.11/mysql-test/main/func_like.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_like.test 2025-05-19 16:14:24.000000000 +0000 @@ -291,3 +291,18 @@ EXPLAIN EXTENDED SELECT * FROM v1; DROP VIEW v1; DROP TABLE t1; + +--echo # +--echo # MDEV-36211 Incorrect query result for binary_column NOT LIKE binary_column +--echo # +CREATE TABLE t1 (c1 BLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +SELECT c1 FROM t1 WHERE c1 LIKE c1; +DROP TABLE t1; + +CREATE TABLE t1 (c1 BLOB); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +SELECT c1 FROM t1 WHERE c1 LIKE c1; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/func_regexp_pcre.result mariadb-10.11.13/mysql-test/main/func_regexp_pcre.result --- mariadb-10.11.11/mysql-test/main/func_regexp_pcre.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_regexp_pcre.result 2025-05-19 16:14:24.000000000 +0000 @@ -60,7 +60,7 @@ INSERT INTO t2 VALUES 
('\\p{Cyrillic}'),('\\p{Greek}'),('\\p{Latin}'); INSERT INTO t2 VALUES ('\\p{Han}'),('\\p{Hangul}'); INSERT INTO t2 VALUES ('\\p{Sinhala}'), ('\\p{Tamil}'); -INSERT INTO t2 VALUES ('\\p{L}'),('\\p{Ll}'),('\\p{Lu}'),('\\p{L&}'); +INSERT INTO t2 VALUES ('\\p{L}'), /* buggy before v10.45 ('\\p{Ll}'),('\\p{Lu}'),*/ ('\\p{L&}'); INSERT INTO t2 VALUES ('[[:alpha:]]'),('[[:digit:]]'); SELECT class, ch, ch RLIKE class FROM t1, t2 ORDER BY class, BINARY ch; class ch ch RLIKE class @@ -168,32 +168,6 @@ \p{Latin} à¶´ 0 \p{Latin} ã— 0 \p{Latin} ê°· 0 -\p{Ll} 1 0 -\p{Ll} A 0 -\p{Ll} a 1 -\p{Ll} À 0 -\p{Ll} à 1 -\p{Ll} Σ 0 -\p{Ll} σ 1 -\p{Ll} Я 0 -\p{Ll} Ñ 1 -\p{Ll} ௨ 0 -\p{Ll} à¶´ 0 -\p{Ll} ã— 0 -\p{Ll} ê°· 0 -\p{Lu} 1 0 -\p{Lu} A 1 -\p{Lu} a 0 -\p{Lu} À 1 -\p{Lu} à 0 -\p{Lu} Σ 1 -\p{Lu} σ 0 -\p{Lu} Я 1 -\p{Lu} Ñ 0 -\p{Lu} ௨ 0 -\p{Lu} à¶´ 0 -\p{Lu} ã— 0 -\p{Lu} ê°· 0 \p{L} 1 0 \p{L} A 1 \p{L} a 1 diff -Nru mariadb-10.11.11/mysql-test/main/func_regexp_pcre.test mariadb-10.11.13/mysql-test/main/func_regexp_pcre.test --- mariadb-10.11.11/mysql-test/main/func_regexp_pcre.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_regexp_pcre.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,7 +41,7 @@ INSERT INTO t2 VALUES ('\\p{Cyrillic}'),('\\p{Greek}'),('\\p{Latin}'); INSERT INTO t2 VALUES ('\\p{Han}'),('\\p{Hangul}'); INSERT INTO t2 VALUES ('\\p{Sinhala}'), ('\\p{Tamil}'); -INSERT INTO t2 VALUES ('\\p{L}'),('\\p{Ll}'),('\\p{Lu}'),('\\p{L&}'); +INSERT INTO t2 VALUES ('\\p{L}'), /* buggy before v10.45 ('\\p{Ll}'),('\\p{Lu}'),*/ ('\\p{L&}'); INSERT INTO t2 VALUES ('[[:alpha:]]'),('[[:digit:]]'); SELECT class, ch, ch RLIKE class FROM t1, t2 ORDER BY class, BINARY ch; DROP TABLE t1, t2; diff -Nru mariadb-10.11.11/mysql-test/main/gis-precise.result mariadb-10.11.13/mysql-test/main/gis-precise.result --- mariadb-10.11.11/mysql-test/main/gis-precise.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis-precise.result 2025-05-19 
16:14:24.000000000 +0000 @@ -776,7 +776,7 @@ ST_DISTANCE_SPHERE(1, 1, NULL) NULL SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(1 0)'), ST_GEOMFROMTEXT('LINESTRING(0 0, 1 1)')) as result; -ERROR HY000: Internal error: st_distance_sphere +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. # Test Points and radius SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)')) as result; result @@ -788,9 +788,9 @@ result 0.024682056391766436 SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), 0) as result; -ERROR HY000: Internal error: Radius must be greater than zero. +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), -1) as result; -ERROR HY000: Internal error: Radius must be greater than zero. +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. # Test longitude/lattitude SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 1)'), ST_GEOMFROMTEXT('POINT(1 2)')), 10) as result; result @@ -843,7 +843,7 @@ result 0.04933028646581131 SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )'),0) as result; -ERROR HY000: Internal error: Radius must be greater than zero. +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. 
set @pt1 = ST_GeomFromText('POINT(190 -30)'); set @pt2 = ST_GeomFromText('POINT(-30 50)'); SELECT ST_Distance_Sphere(@pt1, @pt2); diff -Nru mariadb-10.11.11/mysql-test/main/gis-precise.test mariadb-10.11.13/mysql-test/main/gis-precise.test --- mariadb-10.11.11/mysql-test/main/gis-precise.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis-precise.test 2025-05-19 16:14:24.000000000 +0000 @@ -422,7 +422,7 @@ # Return NULL if radius is NULL SELECT ST_DISTANCE_SPHERE(1, 1, NULL); # Wrong geometry ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(1 0)'), ST_GEOMFROMTEXT('LINESTRING(0 0, 1 1)')) as result; --echo # Test Points and radius @@ -430,9 +430,9 @@ # make bb x86 happy SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(-1 -1)'), ST_GEOMFROMTEXT('POINT(-2 -2)')), 10) as result; SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), 1) as result; ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), 0) as result; ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), -1) as result; --echo # Test longitude/lattitude # make bb x86 happy @@ -456,7 +456,7 @@ SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )')), 10) as result; # make bb x86 happy SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )'),1), 17) as result; ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )'),0) as result; # Longitude out of range [-180,180] diff -Nru mariadb-10.11.11/mysql-test/main/gis.result 
mariadb-10.11.13/mysql-test/main/gis.result --- mariadb-10.11.11/mysql-test/main/gis.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis.result 2025-05-19 16:14:24.000000000 +0000 @@ -5474,4 +5474,36 @@ SELECT NTH_VALUE(a,b) OVER () FROM t; ERROR HY000: Illegal parameter data types point and bigint for operation '-' DROP TABLE t; +# +# MDEV-32619 Settng SRID on geometry with ST_*FromWKKB(g, srid) +# +SELECT +ST_SRID(g1), +ST_SRID(ST_GeomFromWKB(g1, 4326)), +ST_SRID(ST_GeomFromWKB(g1)), +ST_AsText(g1), +ST_SRID(ST_PointFromWKB(g2, 4326)), +ST_SRID(g2), +ST_SRID(ST_LineStringFromWKB(g3, 3)), +ST_SRID(ST_PolygonFromWKB(g4, 4)), +ST_SRID(ST_MultiPointFromWKB(g5, 5)), +ST_SRID(ST_MultiLineStringFromWKB(g6, 6)), +ST_SRID(ST_MultiPolygonFromWKB(g7, 7)) +FROM ( +SELECT +POINT(1, 2) AS g1, +POINT(4, 3) AS g2, +LINESTRING(POINT(4, 3), POINT(4, 4)) AS g3, +POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3))) AS g4, +MULTIPOINT(POINT(4, 3)) AS g5, +MULTILINESTRING(LINESTRING(POINT(4, 3), POINT(4, 4))) AS g6, +MULTIPOLYGON(POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3)))) AS g7 +) AS t; +ST_SRID(g1) ST_SRID(ST_GeomFromWKB(g1, 4326)) ST_SRID(ST_GeomFromWKB(g1)) ST_AsText(g1) ST_SRID(ST_PointFromWKB(g2, 4326)) ST_SRID(g2) ST_SRID(ST_LineStringFromWKB(g3, 3)) ST_SRID(ST_PolygonFromWKB(g4, 4)) ST_SRID(ST_MultiPointFromWKB(g5, 5)) ST_SRID(ST_MultiLineStringFromWKB(g6, 6)) ST_SRID(ST_MultiPolygonFromWKB(g7, 7)) +0 4326 0 POINT(1 2) 4326 0 3 4 5 6 7 +# +# MDEV-35117 Error message "ERROR 1815 (HY000): Internal error: st_distance_sphere' could be improved +# +SELECT ST_DISTANCE_SPHERE(st_geomfromtext('linestring( 2 2, 2 8) '), ST_GeomFromText('POINT(18.413076 43.856258)')) ; +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. 
# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/gis.test mariadb-10.11.13/mysql-test/main/gis.test --- mariadb-10.11.11/mysql-test/main/gis.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis.test 2025-05-19 16:14:24.000000000 +0000 @@ -3482,4 +3482,36 @@ SELECT NTH_VALUE(a,b) OVER () FROM t; DROP TABLE t; +--echo # +--echo # MDEV-32619 Settng SRID on geometry with ST_*FromWKKB(g, srid) +--echo # +SELECT + ST_SRID(g1), + ST_SRID(ST_GeomFromWKB(g1, 4326)), + ST_SRID(ST_GeomFromWKB(g1)), + ST_AsText(g1), + ST_SRID(ST_PointFromWKB(g2, 4326)), + ST_SRID(g2), + ST_SRID(ST_LineStringFromWKB(g3, 3)), + ST_SRID(ST_PolygonFromWKB(g4, 4)), + ST_SRID(ST_MultiPointFromWKB(g5, 5)), + ST_SRID(ST_MultiLineStringFromWKB(g6, 6)), + ST_SRID(ST_MultiPolygonFromWKB(g7, 7)) +FROM ( + SELECT + POINT(1, 2) AS g1, + POINT(4, 3) AS g2, + LINESTRING(POINT(4, 3), POINT(4, 4)) AS g3, + POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3))) AS g4, + MULTIPOINT(POINT(4, 3)) AS g5, + MULTILINESTRING(LINESTRING(POINT(4, 3), POINT(4, 4))) AS g6, + MULTIPOLYGON(POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3)))) AS g7 +) AS t; + +--echo # +--echo # MDEV-35117 Error message "ERROR 1815 (HY000): Internal error: st_distance_sphere' could be improved +--echo # +--error ER_GIS_UNSUPPORTED_ARGUMENT +SELECT ST_DISTANCE_SPHERE(st_geomfromtext('linestring( 2 2, 2 8) '), ST_GeomFromText('POINT(18.413076 43.856258)')) ; + --echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/group_by.result mariadb-10.11.13/mysql-test/main/group_by.result --- mariadb-10.11.11/mysql-test/main/group_by.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_by.result 2025-05-19 16:14:24.000000000 +0000 @@ -2997,5 +2997,79 @@ ERROR 42S22: Reference 'c' not supported (forward reference in item list) DROP TABLE t1; # +# MDEV-35238: Wrong results from a tables with a single record and an aggregate +# 
+CREATE OR REPLACE TABLE t1 (a int) ENGINE=myisam; +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +1+0 min(1) +1 NULL +explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "table": { + "message": "Impossible WHERE noticed after reading const tables" + } + } +} +INSERT INTO t1 VALUES (NULL); +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +1+0 min(1) +1 NULL +explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "pseudo_bits_condition": "if(uuid_short(),NULL,1)", + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "system", + "rows": 1, + "filtered": 100 + } + } + ] + } +} +DROP TABLE t1; +CREATE TABLE t1 (a int PRIMARY KEY) ENGINE=myisam; +INSERT INTO t1 VALUES (1); +CREATE TABLE t2 (a int NOT NULL) ENGINE=myisam; +INSERT INTO t2 VALUES (10); +SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); +1+0 MIN(t1.a) +1 1 +explain format=json SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "pseudo_bits_condition": "10 = rand()", + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "system", + "rows": 1, + "filtered": 100 + } + }, + { + "table": { + "table_name": "t2", + "access_type": "system", + "rows": 1, + "filtered": 100 + } + } + ] + } +} +DROP TABLE t1,t2; +# # End of 10.5 tests # diff -Nru mariadb-10.11.11/mysql-test/main/group_by.test mariadb-10.11.13/mysql-test/main/group_by.test --- mariadb-10.11.11/mysql-test/main/group_by.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_by.test 2025-05-19 16:14:24.000000000 +0000 @@ -2153,5 +2153,27 @@ DROP TABLE t1; --echo # +--echo # MDEV-35238: Wrong results from a tables with a single record and an aggregate +--echo # +CREATE OR REPLACE TABLE t1 (a int) ENGINE=myisam; +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); 
+explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +INSERT INTO t1 VALUES (NULL); +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +DROP TABLE t1; + +CREATE TABLE t1 (a int PRIMARY KEY) ENGINE=myisam; +INSERT INTO t1 VALUES (1); + +CREATE TABLE t2 (a int NOT NULL) ENGINE=myisam; +INSERT INTO t2 VALUES (10); + +SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); +explain format=json SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); + +DROP TABLE t1,t2; + +--echo # --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/group_min_max.result mariadb-10.11.13/mysql-test/main/group_min_max.result --- mariadb-10.11.11/mysql-test/main/group_min_max.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_min_max.result 2025-05-19 16:14:24.000000000 +0000 @@ -4349,3 +4349,27 @@ # # End of 10.6 tests # +# +# MDEV-36118 Wrong result in loose index scan +# +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (1, 3), (1, 1); +SELECT MAX(b) FROM t1 WHERE (b > 2 AND b < 4) OR (b = 5) GROUP BY a; +MAX(b) +3 +drop table t1; +# +# MDEV-36220 ASAN unknown-crash in loose index scan of MIN with IS NULL +# +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (4, NULL), (1, 14), (4, 3); +SELECT MIN(b) FROM t1 WHERE b = 3 OR b IS NULL GROUP BY a; +MIN(b) +3 +SELECT MIN(b) FROM t1 WHERE b IS NULL GROUP BY a; +MIN(b) +NULL +drop table t1; +# +# End of 10.11 tests +# diff -Nru mariadb-10.11.11/mysql-test/main/group_min_max.test mariadb-10.11.13/mysql-test/main/group_min_max.test --- mariadb-10.11.11/mysql-test/main/group_min_max.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_min_max.test 2025-05-19 16:14:24.000000000 +0000 @@ -2007,3 +2007,39 @@ --echo # --echo # End of 10.6 tests --echo # + +--echo # +--echo # MDEV-36118 Wrong result in loose index scan 
+--echo # + +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (1, 3), (1, 1); +--source include/maybe_debug.inc +if ($have_debug) { + --disable_query_log + set @old_debug=@@debug; + set debug="+d,force_group_by"; + --enable_query_log +} +SELECT MAX(b) FROM t1 WHERE (b > 2 AND b < 4) OR (b = 5) GROUP BY a; +if ($have_debug) { + --disable_query_log + set debug=@old_debug; + --enable_query_log +} + +drop table t1; + +--echo # +--echo # MDEV-36220 ASAN unknown-crash in loose index scan of MIN with IS NULL +--echo # + +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (4, NULL), (1, 14), (4, 3); +SELECT MIN(b) FROM t1 WHERE b = 3 OR b IS NULL GROUP BY a; +SELECT MIN(b) FROM t1 WHERE b IS NULL GROUP BY a; +drop table t1; + +--echo # +--echo # End of 10.11 tests +--echo # diff -Nru mariadb-10.11.11/mysql-test/main/insert.result mariadb-10.11.13/mysql-test/main/insert.result --- mariadb-10.11.11/mysql-test/main/insert.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert.result 2025-05-19 16:14:24.000000000 +0000 @@ -806,5 +806,75 @@ 8 drop table t1; # -# End of 10.5 tests +# MDEV-32086 Server crash when inserting from derived table containing insert target table +# (part 2) +# +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); +select * from t1; +pk id +2 2 +3 3 +4 4 +select 101+count(*) +from +( +select dt2.id +from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id<1000; +101+count(*) +104 +prepare s from ' +insert into t1 values( + (select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000 + ), 123 +) +'; +execute s; +select * from t1; +pk id +2 2 +3 3 +4 4 +104 123 +select 101+count(*) +from +( +select dt2.id +from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id<1000; +101+count(*) +105 +execute s; +select * from t1; +pk id +2 2 +3 3 +4 4 +104 123 +105 123 +drop table 
t1; # +# Try this: INSERT INTO t1 VALUES ... reference to t1 +# RETURNING (subquery not touching t1) +create table t1 (a int, b int); +create table t2 (a int, b int); +# This is accepted: +insert into t1 (a) values +(3), +((select max(a) from t1)) +returning +a, b, (select max(a) from t2); +a b (select max(a) from t2) +3 NULL NULL +NULL NULL NULL +drop table t1,t2; +# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/insert.test mariadb-10.11.13/mysql-test/main/insert.test --- mariadb-10.11.11/mysql-test/main/insert.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert.test 2025-05-19 16:14:24.000000000 +0000 @@ -675,5 +675,59 @@ drop table t1; --echo # ---echo # End of 10.5 tests +--echo # MDEV-32086 Server crash when inserting from derived table containing insert target table +--echo # (part 2) +--echo # + +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); +select * from t1; +select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000; +prepare s from ' +insert into t1 values( + (select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000 + ), 123 +) +'; +execute s; +select * from t1; +select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000; +execute s; +select * from t1; + +drop table t1; + --echo # +--echo # Try this: INSERT INTO t1 VALUES ... 
reference to t1 +--echo # RETURNING (subquery not touching t1) +create table t1 (a int, b int); +create table t2 (a int, b int); + +--echo # This is accepted: +insert into t1 (a) values + (3), + ((select max(a) from t1)) +returning + a, b, (select max(a) from t2); + +drop table t1,t2; + +--echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/insert_returning.result mariadb-10.11.13/mysql-test/main/insert_returning.result --- mariadb-10.11.11/mysql-test/main/insert_returning.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert_returning.result 2025-05-19 16:14:24.000000000 +0000 @@ -498,6 +498,8 @@ 5 6 INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT id2 FROM t2); ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT 1 UNION SELECT id2 FROM t2); +ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t2 (id2, val2) VALUES (6,'f') RETURNING t1.*; ERROR 42S02: Unknown table 'test.t1' # diff -Nru mariadb-10.11.11/mysql-test/main/insert_returning.test mariadb-10.11.13/mysql-test/main/insert_returning.test --- mariadb-10.11.11/mysql-test/main/insert_returning.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert_returning.test 2025-05-19 16:14:24.000000000 +0000 @@ -199,6 +199,8 @@ t1 WHERE id1=1); --error ER_UPDATE_TABLE_USED INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT id2 FROM t2); +--error ER_UPDATE_TABLE_USED +INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT 1 UNION SELECT id2 FROM t2); --error ER_BAD_TABLE_ERROR INSERT INTO t2 (id2, val2) VALUES (6,'f') RETURNING t1.*; diff -Nru mariadb-10.11.11/mysql-test/main/insert_select.result mariadb-10.11.13/mysql-test/main/insert_select.result --- mariadb-10.11.11/mysql-test/main/insert_select.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/insert_select.result 2025-05-19 16:14:24.000000000 +0000 @@ -1030,6 +1030,139 @@ 3 DROP VIEW v1; DROP TABLE t1; +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); +insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +explain insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 4 Using where; Using temporary +1 SIMPLE t ALL NULL NULL NULL NULL 4 Using where; Using join buffer (flat, BNL join) +explain format=json insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +EXPLAIN +{ + "query_block": { + "select_id": 1, + "temporary_table": { + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 4, + "filtered": 100, + "attached_condition": "t1.`id` = 3" + } + }, + { + "block-nl-join": { + "table": { + "table_name": "t", + "access_type": "ALL", + "rows": 4, + "filtered": 100, + "attached_condition": "t.`id` = 3" + }, + "buffer_type": "flat", + "buffer_size": "65", + "join_type": "BNL" + } + } + ] + } + } +} +prepare stmt from "insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3"; +execute stmt; +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +execute stmt; +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +1 10 +deallocate prepare stmt; +create procedure p() insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +call p(); +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +1 10 +1 10 +call p(); +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +1 10 
+1 10 +1 10 +drop procedure p; +drop table t1; # -# End of 10.5 test +# MDEV-33139: Crash of INSERT SELECT when preparing structures for +# split optimization # +CREATE TABLE v0 ( v1 INT UNIQUE ) ; +INSERT INTO v0 ( v1 ) VALUES +( ( SELECT 1 +FROM +( SELECT v1 +FROM v0 GROUP BY v1 ) AS v6 NATURAL JOIN +v0 AS v2 NATURAL JOIN +v0 AS v4 NATURAL JOIN +v0 AS v3 NATURAL JOIN +( SELECT v1 FROM v0 ) AS v7 ) ) ; +DROP TABLE v0; +# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/insert_select.test mariadb-10.11.13/mysql-test/main/insert_select.test --- mariadb-10.11.11/mysql-test/main/insert_select.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert_select.test 2025-05-19 16:14:24.000000000 +0000 @@ -591,6 +591,60 @@ DROP VIEW v1; DROP TABLE t1; +# +# MDEV-32086: condition pushdown into two mergeable derived tables, +# one containing the other, when they are forced to be +# materialized in INSERT +# +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); + +let $q= +insert into t1 + select 1,10 + from + ( + select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id=3; + +eval $q; +select * from t1; + +eval explain $q; +eval explain format=json $q; + +eval prepare stmt from "$q"; +execute stmt; +select * from t1; +execute stmt; +select * from t1; +deallocate prepare stmt; + +eval create procedure p() $q; +call p(); +select * from t1; +call p(); +select * from t1; +drop procedure p; + +drop table t1; + --echo # ---echo # End of 10.5 test +--echo # MDEV-33139: Crash of INSERT SELECT when preparing structures for +--echo # split optimization --echo # + +CREATE TABLE v0 ( v1 INT UNIQUE ) ; +INSERT INTO v0 ( v1 ) VALUES + ( ( SELECT 1 + FROM + ( SELECT v1 + FROM v0 GROUP BY v1 ) AS v6 NATURAL JOIN + v0 AS v2 NATURAL JOIN + v0 AS v4 NATURAL JOIN + v0 AS v3 NATURAL JOIN + ( SELECT v1 FROM v0 ) AS v7 ) ) ; +DROP TABLE v0; + +--echo # End of 10.5 tests diff -Nru 
mariadb-10.11.11/mysql-test/main/join.result mariadb-10.11.13/mysql-test/main/join.result --- mariadb-10.11.11/mysql-test/main/join.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join.result 2025-05-19 16:14:24.000000000 +0000 @@ -3611,3 +3611,32 @@ 1 SIMPLE t1 ALL NULL NULL NULL NULL 100 Using where 1 SIMPLE t2 ref kp1 kp1 5 test.t1.a 1 Using index condition drop table t1,t2; +# +# MDEV-36592: If the join_condition is specified via USING (column_list), the query plan depends ... +# +CREATE TABLE t1 ( +id int(11), +f1 char(255), +PRIMARY KEY (id) +); +INSERT INTO t1 (id) VALUES (1),(2),(3); +UPDATE t1 SET f1=REPEAT('a',250); +CREATE TABLE t2 (id int(11), f2 INT NOT NULL); +INSERT INTO t2 select seq, seq from seq_1_to_20; +ANALYZE TABLE t1, t2; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +test.t2 analyze status Engine-independent statistics collected +test.t2 analyze status OK +# In both queries, t1 should use type=index, not type=ALL: +EXPLAIN SELECT count(*) FROM t2 JOIN t1 USING (id); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index PRIMARY PRIMARY 4 NULL 3 Using index +1 SIMPLE t2 ALL NULL NULL NULL NULL 20 Using where; Using join buffer (flat, BNL join) +EXPLAIN SELECT count(*) FROM t1 JOIN t2 USING (id); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index PRIMARY PRIMARY 4 NULL 3 Using index +1 SIMPLE t2 ALL NULL NULL NULL NULL 20 Using where; Using join buffer (flat, BNL join) +DROP TABLE t1,t2; +# End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/join.test mariadb-10.11.13/mysql-test/main/join.test --- mariadb-10.11.11/mysql-test/main/join.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join.test 2025-05-19 16:14:24.000000000 +0000 @@ -2015,3 +2015,28 @@ t2.kp1=t1.a and t2.kp1<=100 and t2.kp2<=20; drop table t1,t2; + +--echo # +--echo # 
MDEV-36592: If the join_condition is specified via USING (column_list), the query plan depends ... +--echo # +CREATE TABLE t1 ( + id int(11), + f1 char(255), + PRIMARY KEY (id) +); +INSERT INTO t1 (id) VALUES (1),(2),(3); +UPDATE t1 SET f1=REPEAT('a',250); + +CREATE TABLE t2 (id int(11), f2 INT NOT NULL); +INSERT INTO t2 select seq, seq from seq_1_to_20; + +ANALYZE TABLE t1, t2; + +--echo # In both queries, t1 should use type=index, not type=ALL: +EXPLAIN SELECT count(*) FROM t2 JOIN t1 USING (id); +EXPLAIN SELECT count(*) FROM t1 JOIN t2 USING (id); + +DROP TABLE t1,t2; + +--echo # End of 10.11 tests + diff -Nru mariadb-10.11.11/mysql-test/main/join_cache.result mariadb-10.11.13/mysql-test/main/join_cache.result --- mariadb-10.11.11/mysql-test/main/join_cache.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_cache.result 2025-05-19 16:14:24.000000000 +0000 @@ -6443,3 +6443,29 @@ # # End of 10.5 tests # +# +# MDEV-36165: BKA join cache buffer is employed despite join_cache_level=3 (flat BNLH) +# +CREATE TABLE t1(a INT); +INSERT INTO t1 VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +CREATE TABLE t2(a INT, b INT); +INSERT INTO t2 SELECT a, a from t1; +CREATE TABLE t3(a INT, b INT, c INT, key (a,b)); +INSERT INTO t3 select a, a, a FROM t1; +SET optimizer_switch = 'join_cache_hashed=off,join_cache_bka=on,mrr=on'; +SET join_cache_level = 3; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t2 ALL NULL NULL NULL NULL 10 Using where +1 SIMPLE t3 ref a a 5 test.t2.a 1 Using index condition +SET join_cache_level = 4; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t2 ALL NULL NULL NULL NULL 10 Using where +1 SIMPLE t3 ref a a 5 test.t2.a 1 Using index condition +SET join_cache_level = default; +SET optimizer_switch = default; +DROP TABLE t1, 
t2, t3; +# +# End of 10.11 tests +# diff -Nru mariadb-10.11.11/mysql-test/main/join_cache.test mariadb-10.11.13/mysql-test/main/join_cache.test --- mariadb-10.11.11/mysql-test/main/join_cache.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_cache.test 2025-05-19 16:14:24.000000000 +0000 @@ -4321,3 +4321,30 @@ --echo # --echo # End of 10.5 tests --echo # + +--echo # +--echo # MDEV-36165: BKA join cache buffer is employed despite join_cache_level=3 (flat BNLH) +--echo # +--source include/have_sequence.inc +CREATE TABLE t1(a INT); +INSERT INTO t1 VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +CREATE TABLE t2(a INT, b INT); +INSERT INTO t2 SELECT a, a from t1; +CREATE TABLE t3(a INT, b INT, c INT, key (a,b)); +INSERT INTO t3 select a, a, a FROM t1; + +SET optimizer_switch = 'join_cache_hashed=off,join_cache_bka=on,mrr=on'; + +SET join_cache_level = 3; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); + +SET join_cache_level = 4; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); + +SET join_cache_level = default; +SET optimizer_switch = default; +DROP TABLE t1, t2, t3; + +--echo # +--echo # End of 10.11 tests +--echo # diff -Nru mariadb-10.11.11/mysql-test/main/join_nested.result mariadb-10.11.13/mysql-test/main/join_nested.result --- mariadb-10.11.11/mysql-test/main/join_nested.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_nested.result 2025-05-19 16:14:24.000000000 +0000 @@ -2051,3 +2051,15 @@ DROP TABLE t1, t2, t3; set join_cache_level= @save_join_cache_level; # end of 10.3 tests +# +# MDEV-32084: Assertion in best_extension_by_limited_search(), or crash elsewhere in release +# +CREATE TABLE t1 (i int); +INSERT INTO t1 values (1),(2); +SELECT 1 FROM t1 WHERE i IN +(SELECT 1 FROM t1 c +LEFT JOIN (t1 a LEFT JOIN t1 b ON t1.i = b.i) ON c.i = t1.i); +1 +1 +DROP TABLE t1; +# end of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/join_nested.test 
mariadb-10.11.13/mysql-test/main/join_nested.test --- mariadb-10.11.11/mysql-test/main/join_nested.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_nested.test 2025-05-19 16:14:24.000000000 +0000 @@ -1458,3 +1458,16 @@ set join_cache_level= @save_join_cache_level; --echo # end of 10.3 tests + +--echo # +--echo # MDEV-32084: Assertion in best_extension_by_limited_search(), or crash elsewhere in release +--echo # +CREATE TABLE t1 (i int); +INSERT INTO t1 values (1),(2); + +SELECT 1 FROM t1 WHERE i IN + (SELECT 1 FROM t1 c + LEFT JOIN (t1 a LEFT JOIN t1 b ON t1.i = b.i) ON c.i = t1.i); + +DROP TABLE t1; +--echo # end of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/join_nested_jcl6.result mariadb-10.11.13/mysql-test/main/join_nested_jcl6.result --- mariadb-10.11.11/mysql-test/main/join_nested_jcl6.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_nested_jcl6.result 2025-05-19 16:14:24.000000000 +0000 @@ -2060,6 +2060,18 @@ DROP TABLE t1, t2, t3; set join_cache_level= @save_join_cache_level; # end of 10.3 tests +# +# MDEV-32084: Assertion in best_extension_by_limited_search(), or crash elsewhere in release +# +CREATE TABLE t1 (i int); +INSERT INTO t1 values (1),(2); +SELECT 1 FROM t1 WHERE i IN +(SELECT 1 FROM t1 c +LEFT JOIN (t1 a LEFT JOIN t1 b ON t1.i = b.i) ON c.i = t1.i); +1 +1 +DROP TABLE t1; +# end of 10.11 tests CREATE TABLE t5 (a int, b int, c int, PRIMARY KEY(a), KEY b_i (b)); CREATE TABLE t6 (a int, b int, c int, PRIMARY KEY(a), KEY b_i (b)); CREATE TABLE t7 (a int, b int, c int, PRIMARY KEY(a), KEY b_i (b)); diff -Nru mariadb-10.11.11/mysql-test/main/large_pages.opt mariadb-10.11.13/mysql-test/main/large_pages.opt --- mariadb-10.11.11/mysql-test/main/large_pages.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/large_pages.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---large-pages +--large-pages --loose-innodb-buffer-pool-size-max=16m diff -Nru 
mariadb-10.11.11/mysql-test/main/large_pages.result mariadb-10.11.13/mysql-test/main/large_pages.result --- mariadb-10.11.11/mysql-test/main/large_pages.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/large_pages.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ call mtr.add_suppression("\\[Warning\\] (mysqld|mariadbd): Couldn't allocate [0-9]+ bytes \\((Large/HugeTLB memory|MEMLOCK) page size [0-9]+\\).*"); +call mtr.add_suppression("\\[ERROR\\]*Lock Pages in memory access rights required.*"); create table t1 ( a int not null auto_increment, b char(16) not null, diff -Nru mariadb-10.11.11/mysql-test/main/large_pages.test mariadb-10.11.13/mysql-test/main/large_pages.test --- mariadb-10.11.11/mysql-test/main/large_pages.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/large_pages.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,11 +1,9 @@ # Test of large pages (or at least the fallback to conventional allocation) -# Windows needs SeLockMemoryPrivilege ---source include/not_windows.inc --source include/have_innodb.inc call mtr.add_suppression("\\[Warning\\] (mysqld|mariadbd): Couldn't allocate [0-9]+ bytes \\((Large/HugeTLB memory|MEMLOCK) page size [0-9]+\\).*"); - +call mtr.add_suppression("\\[ERROR\\]*Lock Pages in memory access rights required.*"); create table t1 ( a int not null auto_increment, b char(16) not null, diff -Nru mariadb-10.11.11/mysql-test/main/long_unique.result mariadb-10.11.13/mysql-test/main/long_unique.result --- mariadb-10.11.11/mysql-test/main/long_unique.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/long_unique.result 2025-05-19 16:14:24.000000000 +0000 @@ -1452,4 +1452,26 @@ # CREATE TABLE t1 (pk INT, a TEXT NOT NULL DEFAULT '', PRIMARY KEY (pk), b INT AUTO_INCREMENT, UNIQUE(b), UNIQUE (a,b)) ENGINE=myisam; ERROR HY000: AUTO_INCREMENT column `b` cannot be used in the UNIQUE index `a` +# +# MDEV-35620 UBSAN: runtime error: applying zero 
offset to null pointer in _ma_unique_hash, skip_trailing_space, my_hash_sort_mb_nopad_bin and my_strnncollsp_utf8mb4_bin +# +# Disable result log. The exact result is not important. +# We just need to make sure UBSAN nullptr-with-offset is not reported. +SELECT DISTINCT user,authentication_string FROM mysql.user; +SELECT DISTINCT USER,PASSWORD FROM mysql.user; +SELECT DISTINCT USER,plugin FROM mysql.user; +# Enabling result log again. +create or replace table t1 (t text) engine=aria; +insert into t1 values (''); +insert into t1 values (NULL); +select distinct t from t1; +t + +NULL +alter table t1 ENGINE=MyISAM; +select distinct t from t1; +t + +NULL +DROP TABLE t1; # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/long_unique.test mariadb-10.11.13/mysql-test/main/long_unique.test --- mariadb-10.11.11/mysql-test/main/long_unique.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/long_unique.test 2025-05-19 16:14:24.000000000 +0000 @@ -551,4 +551,26 @@ --error ER_NO_AUTOINCREMENT_WITH_UNIQUE CREATE TABLE t1 (pk INT, a TEXT NOT NULL DEFAULT '', PRIMARY KEY (pk), b INT AUTO_INCREMENT, UNIQUE(b), UNIQUE (a,b)) ENGINE=myisam; +--echo # +--echo # MDEV-35620 UBSAN: runtime error: applying zero offset to null pointer in _ma_unique_hash, skip_trailing_space, my_hash_sort_mb_nopad_bin and my_strnncollsp_utf8mb4_bin +--echo # + +--echo # Disable result log. The exact result is not important. +--echo # We just need to make sure UBSAN nullptr-with-offset is not reported. +--disable_result_log +SELECT DISTINCT user,authentication_string FROM mysql.user; +SELECT DISTINCT USER,PASSWORD FROM mysql.user; +SELECT DISTINCT USER,plugin FROM mysql.user; +--enable_result_log +--echo # Enabling result log again. 
+ +create or replace table t1 (t text) engine=aria; +insert into t1 values (''); +insert into t1 values (NULL); +select distinct t from t1; +alter table t1 ENGINE=MyISAM; +select distinct t from t1; +DROP TABLE t1; + + --echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/lowercase_table2.result mariadb-10.11.13/mysql-test/main/lowercase_table2.result --- mariadb-10.11.11/mysql-test/main/lowercase_table2.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/lowercase_table2.result 2025-05-19 16:14:24.000000000 +0000 @@ -185,7 +185,7 @@ select TABLE_SCHEMA,TABLE_NAME FROM information_schema.TABLES where TABLE_SCHEMA ='mysqltest_LC2'; TABLE_SCHEMA TABLE_NAME -mysqltest_lc2 myUC +mysqltest_LC2 myUC use test; drop database mysqltest_LC2; # diff -Nru mariadb-10.11.11/mysql-test/main/lowercase_view.result mariadb-10.11.13/mysql-test/main/lowercase_view.result --- mariadb-10.11.11/mysql-test/main/lowercase_view.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/lowercase_view.result 2025-05-19 16:14:24.000000000 +0000 @@ -16,29 +16,17 @@ create view v2aA as select * from v1aA; create view v3Aa as select v2Aa.col1 from v2aA,t2Aa where v2Aa.col1 = t2aA.col1; insert into v2Aa values ((select max(col1) from v1aA)); -ERROR HY000: The definition of table 'v1aA' prevents operation INSERT on table 'v2Aa' insert into t1aA values ((select max(col1) from v1Aa)); -ERROR HY000: The definition of table 'v1Aa' prevents operation INSERT on table 't1aA' insert into v2aA values ((select max(col1) from v1aA)); -ERROR HY000: The definition of table 'v1aA' prevents operation INSERT on table 'v2aA' insert into v2Aa values ((select max(col1) from t1Aa)); -ERROR HY000: The definition of table 'v2Aa' prevents operation INSERT on table 'v2Aa' insert into t1aA values ((select max(col1) from t1Aa)); -ERROR HY000: Table 't1aA' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v2aA 
values ((select max(col1) from t1aA)); -ERROR HY000: The definition of table 'v2aA' prevents operation INSERT on table 'v2aA' insert into v2Aa values ((select max(col1) from v2aA)); -ERROR HY000: Table 'v2Aa' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into t1Aa values ((select max(col1) from v2Aa)); -ERROR HY000: The definition of table 'v2Aa' prevents operation INSERT on table 't1Aa' insert into v2aA values ((select max(col1) from v2Aa)); -ERROR HY000: Table 'v2aA' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v3Aa (col1) values ((select max(col1) from v1Aa)); -ERROR HY000: The definition of table 'v1Aa' prevents operation INSERT on table 'v3Aa' insert into v3aA (col1) values ((select max(col1) from t1aA)); -ERROR HY000: The definition of table 'v3aA' prevents operation INSERT on table 'v3aA' insert into v3Aa (col1) values ((select max(col1) from v2aA)); -ERROR HY000: The definition of table 'v2aA' prevents operation INSERT on table 'v3Aa' drop view v3aA,v2Aa,v1aA; drop table t1Aa,t2Aa; create table t1Aa (col1 int); diff -Nru mariadb-10.11.11/mysql-test/main/lowercase_view.test mariadb-10.11.13/mysql-test/main/lowercase_view.test --- mariadb-10.11.11/mysql-test/main/lowercase_view.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/lowercase_view.test 2025-05-19 16:14:24.000000000 +0000 @@ -23,29 +23,17 @@ create view v1Aa as select * from t1aA; create view v2aA as select * from v1aA; create view v3Aa as select v2Aa.col1 from v2aA,t2Aa where v2Aa.col1 = t2aA.col1; --- error 1443 insert into v2Aa values ((select max(col1) from v1aA)); --- error 1443 insert into t1aA values ((select max(col1) from v1Aa)); --- error 1443 insert into v2aA values ((select max(col1) from v1aA)); --- error 1443 insert into v2Aa values ((select max(col1) from t1Aa)); --- error 1093 insert into t1aA values ((select max(col1) from t1Aa)); --- error 1443 insert into 
v2aA values ((select max(col1) from t1aA)); --- error 1093 insert into v2Aa values ((select max(col1) from v2aA)); --- error 1443 insert into t1Aa values ((select max(col1) from v2Aa)); --- error 1093 insert into v2aA values ((select max(col1) from v2Aa)); --- error 1443 insert into v3Aa (col1) values ((select max(col1) from v1Aa)); --- error 1443 insert into v3aA (col1) values ((select max(col1) from t1aA)); --- error 1443 insert into v3Aa (col1) values ((select max(col1) from v2aA)); drop view v3aA,v2Aa,v1aA; drop table t1Aa,t2Aa; diff -Nru mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.result mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.result --- mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,35 @@ +use mysql; +# run mysql_install_db with --service parameter +# Start service +# -- Upgrade service (online) -- +Phase 1/10: Stopping service +Phase 2/10: Start and stop server in the old version, to avoid crash recovery (skipped) +Phase 3/10: Fixing server config file +Phase 4/10: Starting mysqld for upgrade +Phase 5/10: Waiting for startup to complete +Phase 6/10: Running mysql_upgrade +Phase 7/10: Changing service configuration +Phase 8/10: Initiating server shutdown +Phase 9/10: Waiting for shutdown to complete +Phase 10/10: Starting service +Service 'SERVICE_NAME' successfully upgraded. 
+Log file is written to UPGRADE_LOG +# upgrade_success(online)=1 +# Service stopped +# -- Upgrade service (offline) -- +Phase 1/10: Stopping service +Phase 2/10: Start and stop server in the old version, to avoid crash recovery ,this can take some time +Phase 3/10: Fixing server config file +Phase 4/10: Starting mysqld for upgrade +Phase 5/10: Waiting for startup to complete +Phase 6/10: Running mysql_upgrade +Phase 7/10: Changing service configuration +Phase 8/10: Initiating server shutdown +Phase 9/10: Waiting for shutdown to complete +Phase 10/10: Starting service (skipped) +Service 'SERVICE_NAME' successfully upgraded. +Log file is written to UPGRADE_LOG +# upgrade_success(offline)=1 +# Delete service +connection default; +# restart diff -Nru mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.test mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.test --- mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,113 @@ +source include/windows.inc; +let $datadir_name=data; +let $service_name_prefix=mariadb; +let $password=password; + +source include/check_windows_admin.inc; + +# The test uses return code from sc.exe utility, which are as follows +let $ERROR_SERVICE_DOES_NOT_EXIST= 1060; +let $ERROR_SERVICE_CANNOT_ACCEPT_CTRL=1061;# intermediate, during start or stop +let $ERROR_SERVICE_NOT_ACTIVE=1062;# service stopped +let $ERROR_INVALID_SERVICE_CONTROL=1052; # The requested control is not valid for this service + +let $sc_exe= C:\Windows\System32\sc.exe; +let $ddir= $MYSQLTEST_VARDIR/tmp/$datadir_name; +let $service_name=$service_name_prefix$MASTER_MYPORT; +let TMP= $MYSQLTEST_VARDIR/tmp; +let $upgrade_log=$TMP/mysql_upgrade_service.$service_name.log; + +use mysql; +error 0,1; +rmdir $ddir; + +--disable_result_log +error 0,$ERROR_SERVICE_DOES_NOT_EXIST; +exec $sc_exe delete $service_name; 
+--enable_result_log + +source include/shutdown_mysqld.inc; +echo # run mysql_install_db with --service parameter; +--disable_result_log +exec $MYSQL_INSTALL_DB_EXE --datadir=$ddir --port=$MASTER_MYPORT --password=$password --service=$service_name --verbose-bootstrap -R; +--enable_result_log + +echo # Start service; +--disable_result_log +exec $sc_exe start $service_name; +--enable_result_log + +enable_reconnect; +source include/wait_until_connected_again.inc; +disable_reconnect; + +echo # -- Upgrade service (online) --; +--replace_result $upgrade_log UPGRADE_LOG $service_name SERVICE_NAME +let $sys_errno=0; +let $upgrade_success = 1; +error 0,1; +exec $MARIADB_UPGRADE_SERVICE_EXE --service=$service_name; + +if($sys_errno != 0) +{ + let $upgrade_success = 0; +} + +echo # upgrade_success(online)=$upgrade_success; +file_exists $upgrade_log; +if ($upgrade_success == 0) +{ + echo --detailed error(online upgrade)--; + cat_file $upgrade_log; +} +# stop service +--disable_result_log +# Wait until stopped +let $sys_errno=0; +while($sys_errno != $ERROR_SERVICE_NOT_ACTIVE) +{ + --error 0,$ERROR_SERVICE_CANNOT_ACCEPT_CTRL,$ERROR_SERVICE_NOT_ACTIVE, $ERROR_INVALID_SERVICE_CONTROL + exec $sc_exe stop $service_name; + if($sys_errno != $ERROR_SERVICE_NOT_ACTIVE) + { + --real_sleep 0.1 + } +} +--enable_result_log +echo # Service stopped; + +echo # -- Upgrade service (offline) --; +--replace_result $upgrade_log UPGRADE_LOG $service_name SERVICE_NAME +let $sys_errno=0; +let $upgrade_success = 1; +error 0,1; +exec $MARIADB_UPGRADE_SERVICE_EXE --service=$service_name; + +if($sys_errno != 0) +{ + let $upgrade_success = 0; +} + +echo # upgrade_success(offline)=$upgrade_success; +file_exists $upgrade_log; +if ($upgrade_success == 0) +{ + echo --detailed error(online upgrade)--; + cat_file $upgrade_log; +} + +echo # Delete service; +let $sys_errno=0; +--disable_result_log +exec $sc_exe delete $service_name; +--enable_result_log + +# Cleanup +source include/wait_until_disconnected.inc; 
+rmdir $ddir; +remove_file $upgrade_log; +let TEMP=$old_temp; + +#restart original server +connection default; +source include/start_mysqld.inc; diff -Nru mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.result mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.result --- mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,21 @@ +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +INSERT INTO t VALUES ('a','b'); +DROP TABLE t; +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +DELETE FROM t; +DROP TABLE t; +CREATE TABLE t (a INT(1),d INT(1),b VARCHAR(1),c CHAR(1),c3 INT(1) GENERATED ALWAYS AS ((a + LENGTH (d))) STORED,c2 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,k1 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,PRIMARY KEY(b (1),a,d),KEY d (d),KEY a (a),KEY c_renamed (c (1),b (1)),KEY b (b (1),c (1),a),KEY k1 (k1),KEY a_2 (a,k1),KEY k1_2 (k1,d)) DEFAULT CHARSET=latin1 ENGINE=InnoDB; +DELETE FROM t; +DROP TABLE t; +CREATE TABLE t (a INT,ROW_START TIMESTAMP(6) AS ROW START,ROW_END TIMESTAMP(6) AS ROW END,PERIOD FOR SYSTEM_TIME(ROW_START,ROW_END),INDEX (ROW_START),INDEX (ROW_END),PRIMARY KEY(ROW_END,a,ROW_START),INDEX (ROW_END,ROW_START,a)) WITH SYSTEM VERSIONING ENGINE=InnoDB; +SHOW INDEX FROM t; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t 0 PRIMARY 1 ROW_END A 0 NULL NULL BTREE NO +t 0 PRIMARY 2 a A 0 NULL NULL BTREE NO +t 0 PRIMARY 3 ROW_START A 0 NULL NULL BTREE NO +t 1 ROW_START 1 ROW_START A 0 NULL NULL BTREE NO +t 1 ROW_END 1 ROW_END A 0 NULL NULL BTREE NO +t 1 ROW_END_2 1 ROW_END A 0 NULL NULL BTREE NO +t 1 ROW_END_2 2 ROW_START A 0 NULL NULL BTREE NO +t 1 ROW_END_2 3 a A 0 NULL NULL BTREE NO +DROP TABLE 
t; diff -Nru mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.test mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.test --- mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,22 @@ + +--source include/have_innodb.inc + +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +INSERT INTO t VALUES ('a','b'); + +DROP TABLE t; + +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +DELETE FROM t; + +DROP TABLE t; + +CREATE TABLE t (a INT(1),d INT(1),b VARCHAR(1),c CHAR(1),c3 INT(1) GENERATED ALWAYS AS ((a + LENGTH (d))) STORED,c2 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,k1 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,PRIMARY KEY(b (1),a,d),KEY d (d),KEY a (a),KEY c_renamed (c (1),b (1)),KEY b (b (1),c (1),a),KEY k1 (k1),KEY a_2 (a,k1),KEY k1_2 (k1,d)) DEFAULT CHARSET=latin1 ENGINE=InnoDB; +DELETE FROM t; + +DROP TABLE t; + +CREATE TABLE t (a INT,ROW_START TIMESTAMP(6) AS ROW START,ROW_END TIMESTAMP(6) AS ROW END,PERIOD FOR SYSTEM_TIME(ROW_START,ROW_END),INDEX (ROW_START),INDEX (ROW_END),PRIMARY KEY(ROW_END,a,ROW_START),INDEX (ROW_END,ROW_START,a)) WITH SYSTEM VERSIONING ENGINE=InnoDB; +SHOW INDEX FROM t; + +DROP TABLE t; diff -Nru mariadb-10.11.11/mysql-test/main/mdl_sync.result mariadb-10.11.13/mysql-test/main/mdl_sync.result --- mariadb-10.11.11/mysql-test/main/mdl_sync.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdl_sync.result 2025-05-19 16:14:24.000000000 +0000 @@ -2431,9 +2431,6 @@ create table t2 (a int) stats_persistent=0, engine=innodb; insert into t1 values (1); insert into t2 values (1); -connect con1, localhost, root; -start transaction with consistent snapshot; -connection default; SET DEBUG_SYNC= 'after_open_table_mdl_shared SIGNAL table_opened WAIT_FOR grlwait 
execute 2'; update t1,t2 set t1.a=2,t2.a=3; connection con2; @@ -2456,6 +2453,7 @@ SET DEBUG_SYNC= 'now WAIT_FOR table_opened'; SET DEBUG_SYNC= 'mdl_acquire_lock_wait SIGNAL grlwait'; FLUSH TABLES WITH READ LOCK; +InnoDB 0 transactions not purged SELECT LOCK_MODE, LOCK_TYPE, TABLE_SCHEMA, TABLE_NAME FROM information_schema.metadata_lock_info; LOCK_MODE LOCK_TYPE TABLE_SCHEMA TABLE_NAME MDL_BACKUP_FTWRL2 Backup lock @@ -2465,7 +2463,6 @@ SET DEBUG_SYNC= 'RESET'; drop table t1,t2; disconnect con2; -disconnect con1; # # Bug#50786 Assertion `thd->mdl_context.trans_sentinel() == __null' # failed in open_ltable() diff -Nru mariadb-10.11.11/mysql-test/main/mdl_sync.test mariadb-10.11.13/mysql-test/main/mdl_sync.test --- mariadb-10.11.11/mysql-test/main/mdl_sync.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdl_sync.test 2025-05-19 16:14:24.000000000 +0000 @@ -3115,12 +3115,6 @@ insert into t1 values (1); insert into t2 values (1); -connect (con1, localhost, root); -# disable innodb purge thread, otherwise it might start purging t2, -# and will take an mdl, affecting metadata_lock_info output. 
-start transaction with consistent snapshot; -connection default; - SET DEBUG_SYNC= 'after_open_table_mdl_shared SIGNAL table_opened WAIT_FOR grlwait execute 2'; --send update t1,t2 set t1.a=2,t2.a=3 @@ -3156,6 +3150,7 @@ let $wait_condition= SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info; --source include/wait_condition.inc +--source ../suite/innodb/include/wait_all_purged.inc SELECT LOCK_MODE, LOCK_TYPE, TABLE_SCHEMA, TABLE_NAME FROM information_schema.metadata_lock_info; unlock tables; @@ -3166,7 +3161,6 @@ SET DEBUG_SYNC= 'RESET'; drop table t1,t2; disconnect con2; -disconnect con1; --echo # --echo # Bug#50786 Assertion `thd->mdl_context.trans_sentinel() == __null' diff -Nru mariadb-10.11.11/mysql-test/main/merge.result mariadb-10.11.13/mysql-test/main/merge.result --- mariadb-10.11.11/mysql-test/main/merge.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/merge.result 2025-05-19 16:14:24.000000000 +0000 @@ -3678,33 +3678,22 @@ insert into t1 (a) values (1); insert into t3 (b) values (1); insert into m1 (a) values ((select max(a) from m1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from m2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t3, m1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t3, m2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' 
and as a separate source for data insert into m1 (a) values ((select max(a) from t3, t1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t3, t2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, m1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, m2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, t1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, t2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'm1' insert into m1 (a) values ((select max(a) from tmp, v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'm1' +select count(*) from m1; +count(*) +15 drop view v1; drop temporary table tmp; drop table t1, t2, t3, m1, m2; diff -Nru mariadb-10.11.11/mysql-test/main/merge.test mariadb-10.11.13/mysql-test/main/merge.test --- mariadb-10.11.11/mysql-test/main/merge.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/merge.test 2025-05-19 16:14:24.000000000 +0000 @@ -2670,37 +2670,24 @@ insert into t1 (a) values (1); insert into t3 (b) values (1); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from m1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from m2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values 
((select max(a) from t1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, m1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, m2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, t1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, t2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, m1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, m2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, t1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, t2)); - ---error ER_VIEW_PREVENT_UPDATE + insert into m1 (a) values ((select max(a) from v1)); ---error ER_VIEW_PREVENT_UPDATE insert into m1 (a) values ((select max(a) from tmp, v1)); +select count(*) from m1; drop view v1; diff -Nru mariadb-10.11.11/mysql-test/main/multi_update.result mariadb-10.11.13/mysql-test/main/multi_update.result --- mariadb-10.11.11/mysql-test/main/multi_update.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/multi_update.result 2025-05-19 16:14:24.000000000 +0000 @@ -1389,3 +1389,23 @@ 12 5 8 drop table t1,t2,t3,t; # End of 10.4 tests +# +# MDEV-31647 Stack looping and SIGSEGV in Item_args::walk_args on UPDATE +# +create table t1 (c int, c2 int) engine=innodb; +update t1 set c=0 where c=( +select 1 from (select 1 as v1) as v2 +natural join t1) order by last_value (c2) over (order by c2); +ERROR HY000: Invalid use of group function +update t1 set c=0 where c=( +select 1 from (select 1 as v1) as v2 +natural join t1) order by last_value (c2) over (); +ERROR HY000: Invalid use of group function +update t1 set c=0 where c=( +select 1 from (select 1 as v1) as v2 +natural join t1) order by c2; +select 1 from (select 1 as v1) 
as v2 +natural join t1 order by last_value (c2) over (order by c2); +1 +drop table t1; +# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/multi_update.test mariadb-10.11.13/mysql-test/main/multi_update.test --- mariadb-10.11.11/mysql-test/main/multi_update.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/multi_update.test 2025-05-19 16:14:24.000000000 +0000 @@ -1200,3 +1200,31 @@ drop table t1,t2,t3,t; --echo # End of 10.4 tests + +--echo # +--echo # MDEV-31647 Stack looping and SIGSEGV in Item_args::walk_args on UPDATE +--echo # +--source include/have_innodb.inc +create table t1 (c int, c2 int) engine=innodb; + +--error ER_INVALID_GROUP_FUNC_USE +update t1 set c=0 where c=( + select 1 from (select 1 as v1) as v2 + natural join t1) order by last_value (c2) over (order by c2); + +--error ER_INVALID_GROUP_FUNC_USE +update t1 set c=0 where c=( + select 1 from (select 1 as v1) as v2 + natural join t1) order by last_value (c2) over (); + +update t1 set c=0 where c=( + select 1 from (select 1 as v1) as v2 + natural join t1) order by c2; + +select 1 from (select 1 as v1) as v2 + natural join t1 order by last_value (c2) over (order by c2); + + +drop table t1; + +--echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.opt mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.opt --- mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--slOw_QuEry_loG=OFF diff -Nru mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.result mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.result --- mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,8 @@ 
+# +# MDEV-27126: my_getopt compares option names case sensitively +# +# Check if the variable is set correctly from options +SELECT @@GLOBAL.slow_query_log; +@@GLOBAL.slow_query_log +0 +# End of test. diff -Nru mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.test mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.test --- mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,8 @@ +--echo # +--echo # MDEV-27126: my_getopt compares option names case sensitively +--echo # + +--echo # Check if the variable is set correctly from options +SELECT @@GLOBAL.slow_query_log; + +--echo # End of test. diff -Nru mariadb-10.11.11/mysql-test/main/myisam-big.result mariadb-10.11.13/mysql-test/main/myisam-big.result --- mariadb-10.11.11/mysql-test/main/myisam-big.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/myisam-big.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,7 @@ drop table if exists t1,t2; +call mtr.add_suppression("Index.*try to repair it"); +call mtr.add_suppression("Disk got full"); +call mtr.add_suppression("Got an error from thread_id"); create table t1 (id int, sometext varchar(100)) engine=myisam; insert into t1 values (1, "hello"),(2, "hello2"),(4, "hello3"),(4, "hello4"); create table t2 like t1; @@ -43,4 +46,9 @@ connection con2; disconnect con2; connection default; +SET @saved_dbug = @@SESSION.debug_dbug; +SET debug_dbug='+d,simulate_file_pwrite_error'; +insert into t1 select * from t2; +ERROR HY000: Disk got full writing 'test.t1' (Errcode: 28 "No space left on device") +SET debug_dbug= @saved_dbug; drop table t1,t2; diff -Nru mariadb-10.11.11/mysql-test/main/myisam-big.test mariadb-10.11.13/mysql-test/main/myisam-big.test --- mariadb-10.11.11/mysql-test/main/myisam-big.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/myisam-big.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,12 +1,17 @@ # # Test bugs in the MyISAM code that require more space/time --source include/big_test.inc +--source include/have_debug.inc # Initialise --disable_warnings drop table if exists t1,t2; --enable_warnings +call mtr.add_suppression("Index.*try to repair it"); +call mtr.add_suppression("Disk got full"); +call mtr.add_suppression("Got an error from thread_id"); + # # BUG#925377: # Querying myisam table metadata while 'alter table..enable keys' is @@ -61,4 +66,12 @@ reap; disconnect con2; connection default; + +# +# Test error message from disk full +SET @saved_dbug = @@SESSION.debug_dbug; +SET debug_dbug='+d,simulate_file_pwrite_error'; +--error ER_DISK_FULL +insert into t1 select * from t2; +SET debug_dbug= @saved_dbug; drop table t1,t2; diff -Nru mariadb-10.11.11/mysql-test/main/mysql-interactive.result mariadb-10.11.13/mysql-test/main/mysql-interactive.result --- mariadb-10.11.11/mysql-test/main/mysql-interactive.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql-interactive.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ delimiter $ select 1; $ +exit Welcome to the MariaDB monitor. Commands end with ; or \g. 
Your MariaDB connection id is X Server version: Y @@ -21,4 +22,5 @@ +---+ 1 row in set -MariaDB [(none)]> \ No newline at end of file +MariaDB [(none)]> exit +Bye diff -Nru mariadb-10.11.11/mysql-test/main/mysql-interactive.test mariadb-10.11.13/mysql-test/main/mysql-interactive.test --- mariadb-10.11.11/mysql-test/main/mysql-interactive.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql-interactive.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,23 +6,16 @@ # this would need an instrumented ncurses library source include/not_msan.inc; -error 0,1; -exec $MYSQL -V|grep -q readline; -if ($sys_errno == 1) -{ - # strangely enough - skip does not work with libedit; -} - write_file $MYSQL_TMP_DIR/mysql_in; delimiter $ select 1; $ +exit EOF let TERM=dumb; replace_regex /id is \d+/id is X/ /Server version: .*/Server version: Y/ / \(\d+\.\d+ sec\)//; error 0,127; -exec socat EXEC:"$MYSQL",pty STDIO < $MYSQL_TMP_DIR/mysql_in; +exec socat -t10 EXEC:"$MYSQL",pty STDIO < $MYSQL_TMP_DIR/mysql_in; if ($sys_errno == 127) { remove_file $MYSQL_TMP_DIR/mysql_in; diff -Nru mariadb-10.11.11/mysql-test/main/mysql_upgrade-34014.result mariadb-10.11.13/mysql-test/main/mysql_upgrade-34014.result --- mariadb-10.11.11/mysql-test/main/mysql_upgrade-34014.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql_upgrade-34014.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,6 +12,8 @@ SHOW CREATE DATABASE sys; Database Create Database sys CREATE DATABASE `sys` /*!40100 DEFAULT CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_ci */ +Warnings: +Note 1105 Database 'sys' does not have a db.opt file. 
You can create one with ALTER DATABASE if needed Phase 1/8: Checking and upgrading mysql database Processing databases mysql diff -Nru mariadb-10.11.11/mysql-test/main/mysql_upgrade.result mariadb-10.11.13/mysql-test/main/mysql_upgrade.result --- mariadb-10.11.11/mysql-test/main/mysql_upgrade.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql_upgrade.result 2025-05-19 16:14:24.000000000 +0000 @@ -151,7 +151,8 @@ Phase 8/8: Running 'FLUSH PRIVILEGES' OK Run it again - should say already completed -This installation of MariaDB is already upgraded to VERSION.There is no need to run mysql_upgrade again for VERSION. +This installation of MariaDB is already upgraded to X.Y.Z-MariaDB. +There is no need to run mysql_upgrade again. You can use --force if you still want to run mysql_upgrade Force should run it regardless of whether it has been run before Phase 1/8: Checking and upgrading mysql database @@ -1911,11 +1912,11 @@ # # MDEV-27279: mariadb_upgrade add --check-if-upgrade-is-needed # -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . +This installation of MariaDB is already upgraded to X.Y.Z-MariaDB. +There is no need to run mysql_upgrade again. Looking for 'mariadb' as: mariadb -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . +This installation of MariaDB is already upgraded to X.Y.Z-MariaDB. +There is no need to run mysql_upgrade again. # # MDEV-27279: mariadb_upgrade check-if-upgrade absence is do it # @@ -1925,17 +1926,17 @@ # MDEV-27279: mariadb_upgrade check-if-upgrade with minor version change # Looking for 'mariadb' as: mariadb -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . 
+This installation of MariaDB is already upgraded to X.Y.0-MariaDB. +There is no need to run mysql_upgrade again for X.Y.Z-MariaDB, because they're both X.Y. +This installation of MariaDB is already upgraded to X.Y.0-MariaDB. +There is no need to run mysql_upgrade again for X.Y.Z-MariaDB, because they're both X.Y. You can use --force if you still want to run mysql_upgrade # # MDEV-27279: mariadb_upgrade check-if-upgrade with major version change # -Major version upgrade detected from MariaDB to MariaDB . Check required! +Major version upgrade detected from X.0.99 to X.Y.Z-MariaDB. Check required! Looking for 'mysql' as: mysql -Major version upgrade detected from MariaDB to MariaDB . Check required! +Major version upgrade detected from X.0.99 to X.Y.Z-MariaDB. Check required! drop table mysql.global_priv; rename table mysql.global_priv_bak to mysql.global_priv; # End of 10.2 tests diff -Nru mariadb-10.11.11/mysql-test/main/mysql_upgrade.test mariadb-10.11.13/mysql-test/main/mysql_upgrade.test --- mariadb-10.11.11/mysql-test/main/mysql_upgrade.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql_upgrade.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,12 @@ +--source include/long_test.inc -- source include/mysql_upgrade_preparation.inc -- source include/have_working_dns.inc -- source include/have_innodb.inc -- source include/have_partition.inc --- source include/no_valgrind_without_big.inc + +let majorminor=`select substring_index(version(), '.', 2)`; +# for major upgrade test, see below +let major=`select substring_index(version(), '.', 1) - (version() like '%.0.%')`; set sql_mode=""; @@ -19,7 +23,7 @@ file_exists $MYSQLD_DATADIR/mysql_upgrade_info; --echo Run it again - should say already completed ---replace_regex /upgraded to [^\n].*/upgraded to VERSION./ /again for [^\n]*/again for VERSION./ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB --exec $MYSQL_UPGRADE 2>&1 # It should have created a file in the MySQL Servers 
datadir @@ -289,10 +293,11 @@ --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --silent ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB +--replace_regex /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --verbose @@ -320,16 +325,18 @@ my $file= $ENV{'DATADIR'} or die "MYSQLD_DATADIR not set"; $ver =~ s/^(\d*)\.(\d*).(\d*)(.*)/$1.$2.0$4/; open(FILE, ">$file/mysql_upgrade_info") or die "Failed to open $file"; + binmode FILE; print FILE "$ver\n"; close(FILE); EOF --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --silent ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $majorminor X.Y +--replace_regex /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --verbose ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $majorminor X.Y --exec $MYSQL_UPGRADE --remove_file $MYSQLD_DATADIR/mysql_upgrade_info @@ -344,16 +351,18 @@ perl; my $ver= $ENV{'MYSQL_SERVER_VERSION'} or die "MYSQL_SERVER_VERSION not set"; my $file= $ENV{'DATADIR'} or die "MYSQLD_DATADIR not set"; - $ver =~ s/^(\d*)\.(\d*).(\d*)(.*)/$1.0.$3$4/; + $ver =~ s/^(\d*)\.(\d*).(\d*)(.*)/$1.0.99/; open(FILE, ">$file/mysql_upgrade_info") or die "Failed to open $file"; + binmode FILE; print FILE "$ver\n"; close(FILE); EOF --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --silent ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $major X --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed ---replace_regex 
/\d\d\.\d*\.\d*[^ .\n]*/MariaDB / /'mariadb.* as:[^\n]*/'mysql' as: mysql/ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $major X +--replace_regex /'mariadb.* as:[^\n]*/'mysql' as: mysql/ --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --verbose --remove_file $MYSQLD_DATADIR/mysql_upgrade_info drop table mysql.global_priv; diff -Nru mariadb-10.11.11/mysql-test/main/mysqld--help.result mariadb-10.11.13/mysql-test/main/mysqld--help.result --- mariadb-10.11.11/mysql-test/main/mysqld--help.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqld--help.result 2025-05-19 16:14:24.000000000 +0000 @@ -748,7 +748,8 @@ keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in - selectivity_for_indexes. selectivity_multiplier. This + selectivity_for_indexes. fix_derived_table_read_cost = + Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. Use 'ALL' to set all combinations. 
diff -Nru mariadb-10.11.11/mysql-test/main/mysqldump-system.result mariadb-10.11.13/mysql-test/main/mysqldump-system.result --- mariadb-10.11.11/mysql-test/main/mysqldump-system.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqldump-system.result 2025-05-19 16:14:24.000000000 +0000 @@ -650,21 +650,21 @@ /*M!100401 UNINSTALL PLUGIN IF EXIST cleartext_plugin_server */; INSTALL PLUGIN cleartext_plugin_server SONAME 'AUTH_TEST_PLUGIN_LIB'; DELIMITER | -/*M!100101 IF current_user()="'mariadb.sys'@'localhost'" THEN +/*M!100101 IF current_user()='''mariadb.sys''@''localhost''' THEN SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001, MESSAGE_TEXT="Don't remove current user 'mariadb.sys'@'localhost''"; END IF */| DELIMITER ; /*!50701 DROP USER IF EXISTS 'mariadb.sys'@'localhost' */; CREATE /*M!100103 OR REPLACE */ USER `mariadb.sys`@`localhost` PASSWORD EXPIRE; DELIMITER | -/*M!100101 IF current_user()="'root'@'localhost'" THEN +/*M!100101 IF current_user()='''root''@''localhost''' THEN SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001, MESSAGE_TEXT="Don't remove current user 'root'@'localhost''"; END IF */| DELIMITER ; /*!50701 DROP USER IF EXISTS 'root'@'localhost' */; CREATE /*M!100103 OR REPLACE */ USER `root`@`localhost`; DELIMITER | -/*M!100101 IF current_user()="'foobar'@'%'" THEN +/*M!100101 IF current_user()='''foobar''@''%''' THEN SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001, MESSAGE_TEXT="Don't remove current user 'foobar'@'%''"; END IF */| DELIMITER ; diff -Nru mariadb-10.11.11/mysql-test/main/mysqldump.result mariadb-10.11.13/mysql-test/main/mysqldump.result --- mariadb-10.11.11/mysql-test/main/mysqldump.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -6747,6 +6747,39 @@ /*!40101 SET character_set_client = @saved_cs_client */; ERROR at line 9: Not allowed in the sandbox mode drop table t1; +# +# MDEV-36268 mariadb-dump used wrong quoting character 
+# +create table t1 (a int); +create view `v'1"2` as select * from t1 with check option; +/*M!999999\- enable the sandbox mode */ +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8mb4 */; +CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci; +/*!40101 SET character_set_client = @saved_cs_client */; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8mb4; +/*!50001 CREATE VIEW `v'1"2` AS SELECT + 1 AS `a` */; +SET character_set_client = @saved_cs_client; +/*!50001 DROP VIEW IF EXISTS `v'1"2`*/; +/*!50001 SET @saved_cs_client = @@character_set_client */; +/*!50001 SET @saved_cs_results = @@character_set_results */; +/*!50001 SET @saved_col_connection = @@collation_connection */; +/*!50001 SET character_set_client = utf8mb3 */; +/*!50001 SET character_set_results = utf8mb3 */; +/*!50001 SET collation_connection = utf8mb3_general_ci */; +/*!50001 CREATE ALGORITHM=UNDEFINED */ +/*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ +/*!50001 VIEW `v'1"2` AS select `t1`.`a` AS `a` from `t1` */ +/*!50002 WITH CASCADED CHECK OPTION */; +/*!50001 SET character_set_client = @saved_cs_client */; +/*!50001 SET character_set_results = @saved_cs_results */; +/*!50001 SET collation_connection = @saved_col_connection */; +drop view `v'1"2`; +drop table t1; # End of 10.5 tests # # MDEV-16733 mysqldump --tab and --xml options are conflicting diff -Nru mariadb-10.11.11/mysql-test/main/mysqldump.test mariadb-10.11.13/mysql-test/main/mysqldump.test --- mariadb-10.11.11/mysql-test/main/mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ ---source include/no_valgrind_without_big.inc +--source include/long_test.inc --source include/have_utf8mb4.inc call mtr.add_suppression("@003f.frm' \\(errno: 22\\)"); @@ -3029,6 +3029,15 @@ --remove_file 
$MYSQLTEST_VARDIR/tmp/mdev33727.sql drop table t1; +--echo # +--echo # MDEV-36268 mariadb-dump used wrong quoting character +--echo # +create table t1 (a int); +create view `v'1"2` as select * from t1 with check option; # "' +--exec $MYSQL_DUMP --compact test +drop view `v'1"2`; # "' +drop table t1; + --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/mysqlslap.result mariadb-10.11.13/mysql-test/main/mysqlslap.result --- mariadb-10.11.11/mysql-test/main/mysqlslap.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqlslap.result 2025-05-19 16:14:24.000000000 +0000 @@ -260,3 +260,6 @@ # # Bug MDEV-15789 (Upstream: #80329): MYSQLSLAP OPTIONS --AUTO-GENERATE-SQL-GUID-PRIMARY and --AUTO-GENERATE-SQL-SECONDARY-INDEXES DONT WORK # +# +# Bug MDEV-34621: Fix division by zero in mariadb-slap when iterations=0 +# diff -Nru mariadb-10.11.11/mysql-test/main/mysqlslap.test mariadb-10.11.13/mysql-test/main/mysqlslap.test --- mariadb-10.11.11/mysql-test/main/mysqlslap.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqlslap.test 2025-05-19 16:14:24.000000000 +0000 @@ -88,3 +88,9 @@ --exec $MYSQL_SLAP --concurrency=1 --silent --iterations=1 --number-int-cols=2 --number-char-cols=3 --auto-generate-sql --auto-generate-sql-guid-primary --create-schema=slap --exec $MYSQL_SLAP --concurrency=1 --silent --iterations=1 --number-int-cols=2 --number-char-cols=3 --auto-generate-sql --auto-generate-sql-secondary-indexes=1 --create-schema=slap + +--echo # +--echo # Bug MDEV-34621: Fix division by zero in mariadb-slap when iterations=0 +--echo # + +--exec $MYSQL_SLAP -i0 --only-print diff -Nru mariadb-10.11.11/mysql-test/main/mysqltest.result mariadb-10.11.13/mysql-test/main/mysqltest.result --- mariadb-10.11.11/mysql-test/main/mysqltest.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqltest.result 2025-05-19 16:14:24.000000000 +0000 @@ -989,4 +989,13 @@ foo\"bar foo\"bar 
set sql_mode=default; +# +# MDEV-29344: engines/iuds.insert_time cannot run with PS protocol (syntax error) +# +SELECT 1 /* doesn't throw error */; +1 +1 +SELECT 1 /* doesn't throw error */; +1 +1 End of tests diff -Nru mariadb-10.11.11/mysql-test/main/mysqltest.test mariadb-10.11.13/mysql-test/main/mysqltest.test --- mariadb-10.11.11/mysql-test/main/mysqltest.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqltest.test 2025-05-19 16:14:24.000000000 +0000 @@ -2954,6 +2954,12 @@ select "foo\""bar"; set sql_mode=default; +--echo # +--echo # MDEV-29344: engines/iuds.insert_time cannot run with PS protocol (syntax error) +--echo # +SELECT 1 /* doesn't throw error */; +SELECT 1 /* doesn't throw error */; + --echo End of tests # Wait till we reached the initial number of concurrent sessions diff -Nru mariadb-10.11.11/mysql-test/main/partition_myisam.result mariadb-10.11.13/mysql-test/main/partition_myisam.result --- mariadb-10.11.11/mysql-test/main/partition_myisam.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/partition_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -259,3 +259,24 @@ Table Op Msg_type Msg_text test.t1 check status OK DROP TABLE t1; +# +# MDEV-31122 Server crash in get_lock_data / mysql_lock_abort_for_thread +# +CREATE TABLE t1 (a INT); +CREATE TABLE t2 (b INT, c varchar(5)) +PARTITION BY RANGE COLUMNS(c) +SUBPARTITION by key(b) SUBPARTITIONS 2 ( +PARTITION p0 VALUES LESS THAN ('m'), +PARTITION p1 VALUES LESS THAN ('z') +); +connect con1,localhost,root,,; +HANDLER t1 OPEN; +SELECT b FROM t2 PARTITION (p0); +connection default; +SET lock_wait_timeout= 1; +ALTER TABLE t1 FORCE; +connection con1; +b +disconnect con1; +connection default; +DROP TABLE t2, t1; diff -Nru mariadb-10.11.11/mysql-test/main/partition_myisam.test mariadb-10.11.13/mysql-test/main/partition_myisam.test --- mariadb-10.11.11/mysql-test/main/partition_myisam.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/partition_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -249,3 +249,31 @@ ALTER TABLE `t1` REMOVE PARTITIONING; CHECK TABLE `t1` EXTENDED; DROP TABLE t1; + +--echo # +--echo # MDEV-31122 Server crash in get_lock_data / mysql_lock_abort_for_thread +--echo # +CREATE TABLE t1 (a INT); + +CREATE TABLE t2 (b INT, c varchar(5)) + PARTITION BY RANGE COLUMNS(c) + SUBPARTITION by key(b) SUBPARTITIONS 2 ( + PARTITION p0 VALUES LESS THAN ('m'), + PARTITION p1 VALUES LESS THAN ('z') + ); + +--connect (con1,localhost,root,,) +HANDLER t1 OPEN; +--send + SELECT b FROM t2 PARTITION (p0); + +--connection default +SET lock_wait_timeout= 1; +--error 0,ER_STATEMENT_TIMEOUT,ER_LOCK_WAIT_TIMEOUT +ALTER TABLE t1 FORCE; + +--connection con1 +--reap +--disconnect con1 +--connection default +DROP TABLE t2, t1; diff -Nru mariadb-10.11.11/mysql-test/main/query_cache.result mariadb-10.11.13/mysql-test/main/query_cache.result --- mariadb-10.11.11/mysql-test/main/query_cache.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/query_cache.result 2025-05-19 16:14:24.000000000 +0000 @@ -2241,6 +2241,29 @@ set global Query_cache_size=18446744073709547520; SET GLOBAL query_cache_size= @qc; # +# MDEV-34075 corruption when query cache cannot allocate block +# +set global query_cache_type=1; +create table t1 (c1 smallint null, c2 binary (25) not null, c3 tinyint(4) null, c4 binary (15) not null primary key, c5 smallint not null unique key,c6 decimal(10,8) not null default 3.141592) engine=innodb; +set global query_cache_size=81920; +select * from t1 where b=1 and c=1; +ERROR 42S22: Unknown column 'b' in 'WHERE' +set session query_cache_type=1; +drop table t1; +create table t1 (c1 int not null, c2 char(5)) engine=innodb partition by linear key(c1) partitions 99; +select * from t1 where c1 <='1998-12-29 00:00:00' order by c1,c2; +c1 c2 +select group_concat(a separator '###') as names from t1 having left(names, 1)='j'; +ERROR 42S22: Unknown 
column 'a' in 'SELECT' +select * from t1; +c1 c2 +select count(*) from t1; +count(*) +0 +select G.a, c.a from t1 c, t1 G; +ERROR 42S22: Unknown column 'G.a' in 'SELECT' +drop table t1; +# # End of 10.5 tests # # diff -Nru mariadb-10.11.11/mysql-test/main/query_cache.test mariadb-10.11.13/mysql-test/main/query_cache.test --- mariadb-10.11.11/mysql-test/main/query_cache.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/query_cache.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,8 @@ -- source include/long_test.inc -- source include/no_valgrind_without_big.inc -- source include/no_view_protocol.inc +-- source include/have_partition.inc +-- source include/have_innodb.inc --disable_ps2_protocol set @save_query_cache_size=@@query_cache_size; @@ -1853,6 +1855,26 @@ --enable_warnings --echo # +--echo # MDEV-34075 corruption when query cache cannot allocate block +--echo # +set global query_cache_type=1; +create table t1 (c1 smallint null, c2 binary (25) not null, c3 tinyint(4) null, c4 binary (15) not null primary key, c5 smallint not null unique key,c6 decimal(10,8) not null default 3.141592) engine=innodb; +set global query_cache_size=81920; +--error ER_BAD_FIELD_ERROR +select * from t1 where b=1 and c=1; +set session query_cache_type=1; +drop table t1; +create table t1 (c1 int not null, c2 char(5)) engine=innodb partition by linear key(c1) partitions 99; +select * from t1 where c1 <='1998-12-29 00:00:00' order by c1,c2; +--error ER_BAD_FIELD_ERROR +select group_concat(a separator '###') as names from t1 having left(names, 1)='j'; +select * from t1; +select count(*) from t1; +--error ER_BAD_FIELD_ERROR +select G.a, c.a from t1 c, t1 G; +drop table t1; + +--echo # --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/range_notembedded.result mariadb-10.11.13/mysql-test/main/range_notembedded.result --- mariadb-10.11.11/mysql-test/main/range_notembedded.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/range_notembedded.result 2025-05-19 16:14:24.000000000 +0000 @@ -247,3 +247,70 @@ id 5 DROP TABLE t1; +# +# MDEV-34620: Many index_merge variants made and discarded for a big OR +# +CREATE TABLE t1 ( +a1 int NOT NULL, +a2 int NOT NULL, +filler char(100), +KEY key1 (a1,a2), +KEY key2 (a2,a1) +); +insert into t1 (a1,a2) values (1,1),(2,2),(3,3); +set @query= concat( +"explain select * from t1 where\n", +(select +group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) +from seq_1_to_30) +); +set optimizer_trace=1; +prepare s from @query; +execute s; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL key1,key2 NULL NULL NULL 3 Using where +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); +# Observe that "key1" is a a part of several index_merge_union: +select json_pretty(json_search(@trace, 'all', 'key1')); +json_pretty(json_search(@trace, 'all', 'key1')) +[ + "$[0].potential_range_indexes[0].index", + "$[0].analyzing_range_alternatives.range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[0].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[0].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[1].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[1].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[0].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[0].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[1].range_scan_alternatives[0].index", + 
"$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[1].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[2].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[2].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[0].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[0].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[1].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[1].index_to_merge" +] +# +# Now, same as above but for a long IN-list +# +set @query= concat( +"explain select * from t1 where\n", +(select +group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) +from seq_1_to_120) +); +set optimizer_trace=1; +prepare s from @query; +execute s; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL key1,key2 NULL NULL NULL 3 Using where +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); +# Observe that there are NO index_merge_union candidates. 
Only one potential range scan: +select json_pretty(json_search(@trace, 'all', 'key1')); +json_pretty(json_search(@trace, 'all', 'key1')) +[ + "$[0].potential_range_indexes[0].index", + "$[0].analyzing_range_alternatives.range_scan_alternatives[0].index" +] +drop table t1; diff -Nru mariadb-10.11.11/mysql-test/main/range_notembedded.test mariadb-10.11.13/mysql-test/main/range_notembedded.test --- mariadb-10.11.11/mysql-test/main/range_notembedded.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/range_notembedded.test 2025-05-19 16:14:24.000000000 +0000 @@ -162,3 +162,51 @@ SELECT id FROM t1 WHERE id IS NULL OR id NOT BETWEEN 1 AND 4; DROP TABLE t1; +--echo # +--echo # MDEV-34620: Many index_merge variants made and discarded for a big OR +--echo # + +CREATE TABLE t1 ( + a1 int NOT NULL, + a2 int NOT NULL, + filler char(100), + KEY key1 (a1,a2), + KEY key2 (a2,a1) +); +insert into t1 (a1,a2) values (1,1),(2,2),(3,3); + + +set @query= concat( + "explain select * from t1 where\n", + (select + group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) + from seq_1_to_30) + ); + +set optimizer_trace=1; +prepare s from @query; +execute s; +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); + +--echo # Observe that "key1" is a a part of several index_merge_union: +select json_pretty(json_search(@trace, 'all', 'key1')); + +--echo # +--echo # Now, same as above but for a long IN-list +--echo # +set @query= concat( + "explain select * from t1 where\n", + (select + group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) + from seq_1_to_120) + ); + +set optimizer_trace=1; +prepare s from @query; +execute s; +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); + +--echo # Observe that there are NO index_merge_union candidates. 
Only one potential range scan: +select json_pretty(json_search(@trace, 'all', 'key1')); +drop table t1; + diff -Nru mariadb-10.11.11/mysql-test/main/secondary_key_costs.result mariadb-10.11.13/mysql-test/main/secondary_key_costs.result --- mariadb-10.11.11/mysql-test/main/secondary_key_costs.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/secondary_key_costs.result 2025-05-19 16:14:24.000000000 +0000 @@ -177,4 +177,80 @@ drop table t1,t2; set global userstat=@save_userstat; set global innodb_stats_persistent_sample_pages=@save_ispsp; +# +# MDEV-35958: Cost estimates for materialized derived tables are poor +# +set optimizer_trace=1; +create table t1 ( +a int +); +insert into t1 select seq from seq_1_to_10000; +explain +select * +from +t1 as t1_base, +(select a from t1 limit 10000) as TBL; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1_base ALL NULL NULL NULL NULL 10000 +1 PRIMARY ALL NULL NULL NULL NULL 10000 Using join buffer (flat, BNL join) +2 DERIVED t1 ALL NULL NULL NULL NULL 10000 +set @trace=(select trace from information_schema.optimizer_trace); +# BEFORE, without fix_derived_table_read_cost: derived2 has cost=rows=10000 +select json_detailed( +json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') +) as Trace; +Trace +[ + { + "table": "t1_base", + "table_scan": + { + "rows": 10000, + "cost": 19.08984375 + } + }, + { + "table": "", + "table_scan": + { + "rows": 10000, + "cost": 10000 + } + } +] +set optimizer_adjust_secondary_key_costs='fix_derived_table_read_cost'; +explain +select * +from +t1 as t1_base, +(select a from t1 limit 10000) as TBL; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1_base ALL NULL NULL NULL NULL 10000 +1 PRIMARY ALL NULL NULL NULL NULL 10000 Using join buffer (flat, BNL join) +2 DERIVED t1 ALL NULL NULL NULL NULL 10000 +set @trace=(select trace from information_schema.optimizer_trace); +# AFTER, with 
fix_derived_table_read_cost: derived2 has more realistic cost +select json_detailed( +json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') +) as Trace; +Trace +[ + { + "table": "t1_base", + "table_scan": + { + "rows": 10000, + "cost": 19.08984375 + } + }, + { + "table": "", + "table_scan": + { + "rows": 10000, + "cost": 501 + } + } +] +drop table t1; set @@optimizer_adjust_secondary_key_costs=default; diff -Nru mariadb-10.11.11/mysql-test/main/secondary_key_costs.test mariadb-10.11.13/mysql-test/main/secondary_key_costs.test --- mariadb-10.11.11/mysql-test/main/secondary_key_costs.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/secondary_key_costs.test 2025-05-19 16:14:24.000000000 +0000 @@ -109,4 +109,41 @@ set global userstat=@save_userstat; set global innodb_stats_persistent_sample_pages=@save_ispsp; +--echo # +--echo # MDEV-35958: Cost estimates for materialized derived tables are poor +--echo # +set optimizer_trace=1; +create table t1 ( + a int +); +insert into t1 select seq from seq_1_to_10000; + +explain +select * +from + t1 as t1_base, + (select a from t1 limit 10000) as TBL; + +set @trace=(select trace from information_schema.optimizer_trace); +--echo # BEFORE, without fix_derived_table_read_cost: derived2 has cost=rows=10000 +select json_detailed( + json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') + ) as Trace; + +set optimizer_adjust_secondary_key_costs='fix_derived_table_read_cost'; + +explain +select * +from + t1 as t1_base, + (select a from t1 limit 10000) as TBL; + +set @trace=(select trace from information_schema.optimizer_trace); +--echo # AFTER, with fix_derived_table_read_cost: derived2 has more realistic cost +select json_detailed( + json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') + ) as Trace; + +drop table t1; + set @@optimizer_adjust_secondary_key_costs=default; diff -Nru mariadb-10.11.11/mysql-test/main/skip_grants.result 
mariadb-10.11.13/mysql-test/main/skip_grants.result --- mariadb-10.11.11/mysql-test/main/skip_grants.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/skip_grants.result 2025-05-19 16:14:24.000000000 +0000 @@ -138,6 +138,14 @@ # End of 10.3 tests # # +# MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +# +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; +DROP PROCEDURE p; +DROP FUNCTION f; +# End of 10.5 tests +# # MDEV-24815 Show "--skip-grant-tables" state in SYSTEM VARIABLES # SELECT @@skip_grant_tables AS EXPECT_1; diff -Nru mariadb-10.11.11/mysql-test/main/skip_grants.test mariadb-10.11.13/mysql-test/main/skip_grants.test --- mariadb-10.11.11/mysql-test/main/skip_grants.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/skip_grants.test 2025-05-19 16:14:24.000000000 +0000 @@ -170,6 +170,17 @@ --echo # --echo # +--echo # MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +--echo # +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; + +DROP PROCEDURE p; +DROP FUNCTION f; + +--echo # End of 10.5 tests + +--echo # --echo # MDEV-24815 Show "--skip-grant-tables" state in SYSTEM VARIABLES --echo # diff -Nru mariadb-10.11.11/mysql-test/main/sp-bugs.result mariadb-10.11.13/mysql-test/main/sp-bugs.result --- mariadb-10.11.11/mysql-test/main/sp-bugs.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-bugs.result 2025-05-19 16:14:24.000000000 +0000 @@ -388,5 +388,14 @@ DROP PROCEDURE p2; DROP TABLE t1, t2; # +# MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +# +# This test is a duplicate of the one 
located in the file skip_grants.test +# and placed here to check the same test case against embedded-server +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; +DROP PROCEDURE p; +DROP FUNCTION f; +# # End of 10.5 tests # diff -Nru mariadb-10.11.11/mysql-test/main/sp-bugs.test mariadb-10.11.13/mysql-test/main/sp-bugs.test --- mariadb-10.11.11/mysql-test/main/sp-bugs.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-bugs.test 2025-05-19 16:14:24.000000000 +0000 @@ -415,5 +415,25 @@ DROP TABLE t1, t2; --echo # +--echo # MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +--echo # +--echo # This test is a duplicate of the one located in the file skip_grants.test +--echo # and placed here to check the same test case against embedded-server + +# Disable warnings before running the following CREATE PROCEDURE/FUNCTION +# statement since the warning message +# "The user specified as a definer ('a'@'%') does not exist" +# is output in case the test be run against a regular server +# and isn't output if embedded server is used (@sa sp_process_definer() +# in sql_parse.cc). 
+--disable_warnings +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; +--enable_warnings + +DROP PROCEDURE p; +DROP FUNCTION f; + +--echo # --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/sp-row.result mariadb-10.11.13/mysql-test/main/sp-row.result --- mariadb-10.11.11/mysql-test/main/sp-row.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-row.result 2025-05-19 16:14:24.000000000 +0000 @@ -2313,3 +2313,44 @@ END; $$ ERROR 21000: Operand should contain 1 column(s) +# Start of 10.6 tests +# +# MDEV-36179 Assertion `0' failed in virtual bool Type_handler_row::Item_save_in_value(THD*, Item*, st_value*) const +# +CREATE PROCEDURE p0 (IN a ROW(a INT,b INT)) +BEGIN +SET a=ROW(0,0); +END; +/ +PREPARE s0 FROM 'CALL p0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' +DROP PROCEDURE p0; +CREATE PROCEDURE p0 (INOUT a ROW(a INT,b INT)) +BEGIN +SET a=ROW(0,0); +END; +/ +PREPARE s0 FROM 'CALL p0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' +DROP PROCEDURE p0; +CREATE PROCEDURE p0 (OUT a ROW(a INT,b INT)) +BEGIN +SET a=ROW(0,0); +END; +/ +PREPARE s0 FROM 'CALL p0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' +DROP PROCEDURE p0; +CREATE FUNCTION f0(a ROW(a INT,b INT)) RETURNS BOOLEAN +BEGIN +RETURN FALSE; +END; +/ +PREPARE s0 FROM 'SELECT f0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' 
+DROP FUNCTION f0; +# End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/main/sp-row.test mariadb-10.11.13/mysql-test/main/sp-row.test --- mariadb-10.11.11/mysql-test/main/sp-row.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-row.test 2025-05-19 16:14:24.000000000 +0000 @@ -1544,3 +1544,64 @@ END; $$ DELIMITER ;$$ + + +--echo # Start of 10.6 tests + + +--echo # +--echo # MDEV-36179 Assertion `0' failed in virtual bool Type_handler_row::Item_save_in_value(THD*, Item*, st_value*) const +--echo # + +DELIMITER /; +CREATE PROCEDURE p0 (IN a ROW(a INT,b INT)) +BEGIN + SET a=ROW(0,0); +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'CALL p0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP PROCEDURE p0; + + +DELIMITER /; +CREATE PROCEDURE p0 (INOUT a ROW(a INT,b INT)) +BEGIN + SET a=ROW(0,0); +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'CALL p0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP PROCEDURE p0; + + +DELIMITER /; +CREATE PROCEDURE p0 (OUT a ROW(a INT,b INT)) +BEGIN + SET a=ROW(0,0); +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'CALL p0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP PROCEDURE p0; + + +DELIMITER /; +CREATE FUNCTION f0(a ROW(a INT,b INT)) RETURNS BOOLEAN +BEGIN + RETURN FALSE; +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'SELECT f0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP FUNCTION f0; + +--echo # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/main/subselect.result mariadb-10.11.13/mysql-test/main/subselect.result --- mariadb-10.11.11/mysql-test/main/subselect.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect.result 2025-05-19 16:14:24.000000000 +0000 @@ -679,22 +679,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both 
as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -702,6 +704,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -711,6 +714,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -727,7 +731,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -795,13 +799,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect.test mariadb-10.11.13/mysql-test/main/subselect.test --- 
mariadb-10.11.11/mysql-test/main/subselect.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect.test 2025-05-19 16:14:24.000000000 +0000 @@ -419,7 +419,6 @@ create table t3 (b int); insert into t2 values (1); insert into t3 values (1),(2); --- error ER_UPDATE_TABLE_USED INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -- error ER_SUBQUERY_NO_1_ROW INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); @@ -454,7 +453,7 @@ insert into t2 values (1); insert into t3 values (1),(2); select * from t1; --- error ER_UPDATE_TABLE_USED +-- error ER_BAD_NULL_ERROR replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -- error ER_SUBQUERY_NO_1_ROW replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); @@ -494,10 +493,13 @@ --disable_prepare_warnings SELECT * FROM t2 WHERE id IN (SELECT 5 UNION SELECT 3); SELECT * FROM t2 WHERE id IN (SELECT 5 UNION SELECT 2); --- error ER_UPDATE_TABLE_USED +-- error ER_SUBQUERY_NO_1_ROW INSERT INTO t2 VALUES ((SELECT * FROM t2)); --- error ER_UPDATE_TABLE_USED +-- error ER_SUBQUERY_NO_1_ROW INSERT INTO t2 VALUES ((SELECT id FROM t2)); +select * from t2; +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_elimination.result mariadb-10.11.13/mysql-test/main/subselect_elimination.result --- mariadb-10.11.11/mysql-test/main/subselect_elimination.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_elimination.result 2025-05-19 16:14:24.000000000 +0000 @@ -136,12 +136,22 @@ # access within null pointer CREATE TABLE x (x INT) ENGINE=InnoDB; INSERT INTO x (x) VALUES (0); +select NULL IN (SELECT (SELECT x FROM (SELECT x FROM +(SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT +(SELECT 0 AS 
x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN +(SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) +AS x) IN (SELECT 0 AS x) AS x FROM x) as exp; +exp +NULL INSERT INTO x (x) VALUES (x IN (SELECT (SELECT x FROM (SELECT x FROM (SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT (SELECT 0 AS x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN (SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) AS x) IN (SELECT 0 AS x) AS x FROM x)); -ERROR HY000: Table 'x' is specified twice, both as a target for 'INSERT' and as a separate source for data +select * from x; +x +0 +NULL DROP TABLE x; # MDEV-28622: Item_subselect eliminated flag set but Item still # evaluated/used. diff -Nru mariadb-10.11.11/mysql-test/main/subselect_elimination.test mariadb-10.11.13/mysql-test/main/subselect_elimination.test --- mariadb-10.11.11/mysql-test/main/subselect_elimination.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_elimination.test 2025-05-19 16:14:24.000000000 +0000 @@ -133,12 +133,17 @@ CREATE TABLE x (x INT) ENGINE=InnoDB; INSERT INTO x (x) VALUES (0); ---error ER_UPDATE_TABLE_USED +select NULL IN (SELECT (SELECT x FROM (SELECT x FROM +(SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT +(SELECT 0 AS x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN +(SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) +AS x) IN (SELECT 0 AS x) AS x FROM x) as exp; INSERT INTO x (x) VALUES (x IN (SELECT (SELECT x FROM (SELECT x FROM (SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT (SELECT 0 AS x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN (SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) AS x) IN (SELECT 0 AS x) AS x FROM x)); +select * from x; DROP TABLE x; --echo # MDEV-28622: Item_subselect eliminated flag set but Item still diff -Nru 
mariadb-10.11.11/mysql-test/main/subselect_no_exists_to_in.result mariadb-10.11.13/mysql-test/main/subselect_no_exists_to_in.result --- mariadb-10.11.11/mysql-test/main/subselect_no_exists_to_in.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_exists_to_in.result 2025-05-19 16:14:24.000000000 +0000 @@ -683,22 +683,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -706,6 +708,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -715,6 +718,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -731,7 +735,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -799,13 +803,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and 
as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_mat.result mariadb-10.11.13/mysql-test/main/subselect_no_mat.result --- mariadb-10.11.11/mysql-test/main/subselect_no_mat.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_mat.result 2025-05-19 16:14:24.000000000 +0000 @@ -686,22 +686,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -709,6 +711,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -718,6 +721,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -734,7 +738,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT 
a+1 FROM t2)); @@ -802,13 +806,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_opts.result mariadb-10.11.13/mysql-test/main/subselect_no_opts.result --- mariadb-10.11.11/mysql-test/main/subselect_no_opts.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_opts.result 2025-05-19 16:14:24.000000000 +0000 @@ -682,22 +682,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -705,6 +707,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -714,6 +717,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -730,7 +734,7 @@ select * from t1; x y replace into t1 (x, y) VALUES 
((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -798,13 +802,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_scache.result mariadb-10.11.13/mysql-test/main/subselect_no_scache.result --- mariadb-10.11.11/mysql-test/main/subselect_no_scache.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_scache.result 2025-05-19 16:14:24.000000000 +0000 @@ -685,22 +685,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO 
t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -708,6 +710,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -717,6 +720,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -733,7 +737,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -801,13 +805,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_semijoin.result mariadb-10.11.13/mysql-test/main/subselect_no_semijoin.result --- mariadb-10.11.11/mysql-test/main/subselect_no_semijoin.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_semijoin.result 2025-05-19 16:14:24.000000000 +0000 @@ -682,22 +682,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 
't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -705,6 +707,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -714,6 +717,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -730,7 +734,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -798,13 +802,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/temp_table_frm.result 
mariadb-10.11.13/mysql-test/main/temp_table_frm.result --- mariadb-10.11.11/mysql-test/main/temp_table_frm.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/temp_table_frm.result 2025-05-19 16:14:24.000000000 +0000 @@ -25,3 +25,9 @@ set @@use_stat_tables= @save_use_stat_tables; set @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; drop table t1; +# +# MDEV-36138 Server null-pointer crash at startup when tmptables left in --tmpdir +# +create table t1 (c int); +drop table t1; +# restart diff -Nru mariadb-10.11.11/mysql-test/main/temp_table_frm.test mariadb-10.11.13/mysql-test/main/temp_table_frm.test --- mariadb-10.11.11/mysql-test/main/temp_table_frm.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/temp_table_frm.test 2025-05-19 16:14:24.000000000 +0000 @@ -24,4 +24,15 @@ from information_schema.session_status join t1 using (variable_name); set @@use_stat_tables= @save_use_stat_tables; set @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; -drop table t1; \ No newline at end of file +drop table t1; + +--echo # +--echo # MDEV-36138 Server null-pointer crash at startup when tmptables left in --tmpdir +--echo # + +create table t1 (c int); +let $MYSQLD_TMPDIR=`SELECT @@tmpdir`; +let $MYSQLD_DATADIR=`SELECT @@datadir`; +--copy_file $MYSQLD_DATADIR/test/t1.frm $MYSQLD_TMPDIR/#sqlt1.frm +drop table t1; +--source include/restart_mysqld.inc diff -Nru mariadb-10.11.11/mysql-test/main/timezone.test mariadb-10.11.13/mysql-test/main/timezone.test --- mariadb-10.11.11/mysql-test/main/timezone.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/timezone.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,7 +8,7 @@ enable_query_log; # The following is because of daylight saving time ---replace_result MEST CET MET CET +--replace_result MEST CET MET CET CEST CET show variables like "system_time_zone"; --echo # diff -Nru 
mariadb-10.11.11/mysql-test/main/trigger_null.result mariadb-10.11.13/mysql-test/main/trigger_null.result --- mariadb-10.11.11/mysql-test/main/trigger_null.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/trigger_null.result 2025-05-19 16:14:24.000000000 +0000 @@ -399,4 +399,19 @@ Warning 1364 Field 'c5' doesn't have a default value drop table t1; set sql_mode=default; +# +# MDEV-36026 Problem with INSERT SELECT on NOT NULL columns while having BEFORE UPDATE trigger +# +create table t1 (b int(11) not null); +create trigger t1bu before update on t1 for each row begin end; +insert t1 (b) select 1 union select 2; +create trigger trgi before insert on t1 for each row set new.b=ifnull(new.b,10); +insert t1 (b) select NULL union select 11; +select * from t1; +b +1 +2 +10 +11 +drop table t1; # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/trigger_null.test mariadb-10.11.13/mysql-test/main/trigger_null.test --- mariadb-10.11.11/mysql-test/main/trigger_null.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/trigger_null.test 2025-05-19 16:14:24.000000000 +0000 @@ -425,4 +425,15 @@ drop table t1; set sql_mode=default; +--echo # +--echo # MDEV-36026 Problem with INSERT SELECT on NOT NULL columns while having BEFORE UPDATE trigger +--echo # +create table t1 (b int(11) not null); +create trigger t1bu before update on t1 for each row begin end; +insert t1 (b) select 1 union select 2; +create trigger trgi before insert on t1 for each row set new.b=ifnull(new.b,10); +insert t1 (b) select NULL union select 11; +select * from t1; +drop table t1; + --echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/type_binary.result mariadb-10.11.13/mysql-test/main/type_binary.result --- mariadb-10.11.11/mysql-test/main/type_binary.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_binary.result 2025-05-19 16:14:24.000000000 +0000 @@ -397,3 +397,61 @@ DROP TABLE t2; DROP 
TABLE t1; SET note_verbosity=DEFAULT; +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2); +SELECT HEX(c1) FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +HEX(c1) +31000000000000000000000000000000 +32000000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +HEX(c1) +31000000000000000000000000000000 +32000000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +SELECT HEX(c1) FROM t1 WHERE '#' BETWEEN c1 AND 0; +HEX(c1) +2D310000000000000000000000000000 +2D320000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' 
+Warning 1292 Truncated incorrect DECIMAL value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +HEX(c1) +2D320000000000000000000000000000 +2D310000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_binary.test mariadb-10.11.13/mysql-test/main/type_binary.test --- mariadb-10.11.11/mysql-test/main/type_binary.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_binary.test 2025-05-19 16:14:24.000000000 +0000 @@ -178,3 +178,14 @@ --source unusable_keys_joins.inc DROP TABLE t1; SET note_verbosity=DEFAULT; + +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +CREATE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES 
(-2),(-1),(1),(2); +SELECT HEX(c1) FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +SELECT HEX(c1) FROM t1 WHERE '#' BETWEEN c1 AND 0; +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_blob.result mariadb-10.11.13/mysql-test/main/type_blob.result --- mariadb-10.11.11/mysql-test/main/type_blob.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_blob.result 2025-05-19 16:14:24.000000000 +0000 @@ -1419,3 +1419,193 @@ DROP TABLE t2; DROP TABLE t1; SET note_verbosity=DEFAULT; +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))); +INSERT INTO t1 (c1) VALUES (1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE 3 BETWEEN 10*POW(-1,c1) AND (c1); +c1 +3 +5 +SELECT c1 FROM t1 WHERE 'a' BETWEEN 10*POW(-1,c1) AND (c1); +c1 +1 +3 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 
+3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 
0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +ALTER TABLE t1 engine=myisam; +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1)) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated 
incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_blob.test mariadb-10.11.13/mysql-test/main/type_blob.test --- mariadb-10.11.11/mysql-test/main/type_blob.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_blob.test 2025-05-19 16:14:24.000000000 +0000 @@ -808,3 +808,48 @@ --source unusable_keys_joins.inc DROP TABLE t1; SET note_verbosity=DEFAULT; + +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +# myisam has a special optimization for tables with one row +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +DROP TABLE t1; + +# This case shows that we don't transform the entire WHERE clause +# into a range condition. 
+CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))); +INSERT INTO t1 (c1) VALUES (1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE 3 BETWEEN 10*POW(-1,c1) AND (c1); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 10*POW(-1,c1) AND (c1); +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +ALTER TABLE t1 engine=myisam; +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1)) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_num_innodb.result mariadb-10.11.13/mysql-test/main/type_num_innodb.result --- mariadb-10.11.11/mysql-test/main/type_num_innodb.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_num_innodb.result 2025-05-19 16:14:24.000000000 +0000 @@ -46,23 +46,70 @@ SELECT * FROM t1,t2 WHERE a=d; a b c pk d e Warnings: -Warning 1292 Truncated incorrect DECIMAL value: 'd' -Warning 1292 Truncated incorrect DECIMAL value: 'd' -Warning 1292 Truncated incorrect DECIMAL value: 'f' -Warning 1292 Truncated incorrect DECIMAL value: 'f' -Warning 1292 Truncated incorrect DECIMAL value: 'g' -Warning 1292 
Truncated incorrect DECIMAL value: 'k' -Warning 1292 Truncated incorrect DECIMAL value: 'm' -Warning 1292 Truncated incorrect DECIMAL value: 'm' -Warning 1292 Truncated incorrect DECIMAL value: 'm' -Warning 1292 Truncated incorrect DECIMAL value: 'o' -Warning 1292 Truncated incorrect DECIMAL value: 'q' -Warning 1292 Truncated incorrect DECIMAL value: 'r' -Warning 1292 Truncated incorrect DECIMAL value: 'u' -Warning 1292 Truncated incorrect DECIMAL value: 'w' -Warning 1292 Truncated incorrect DECIMAL value: 'x' -Warning 1292 Truncated incorrect DECIMAL value: 'x' -Warning 1292 Truncated incorrect DECIMAL value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated 
incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' ALTER TABLE t1 MODIFY a DOUBLE; SELECT * FROM 
t1,t2 WHERE a=d; a b c pk d e @@ -84,6 +131,53 @@ Warning 1292 Truncated incorrect DOUBLE value: 'x' Warning 1292 Truncated incorrect DOUBLE value: 'x' Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated 
incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' DROP TABLE t1,t2; # # End of 10.2 tests diff -Nru mariadb-10.11.11/mysql-test/main/type_varbinary.result mariadb-10.11.13/mysql-test/main/type_varbinary.result --- mariadb-10.11.11/mysql-test/main/type_varbinary.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_varbinary.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,42 @@ +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE TABLE t1 (c1 VARBINARY(10), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-1 +-2 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 
1292 Truncated incorrect DECIMAL value: '#' +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_varbinary.test mariadb-10.11.13/mysql-test/main/type_varbinary.test --- mariadb-10.11.11/mysql-test/main/type_varbinary.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_varbinary.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,10 @@ +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +CREATE TABLE t1 (c1 VARBINARY(10), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/update.result mariadb-10.11.13/mysql-test/main/update.result --- mariadb-10.11.11/mysql-test/main/update.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/update.result 2025-05-19 16:14:24.000000000 +0000 @@ -765,3 +765,83 @@ u xxb drop table t1; # End of MariaDB 10.4 tests +# +# MDEV-35955 Wrong result for UPDATE ... 
ORDER BY LIMIT which uses tmp.table +# +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +insert into t1 (id, v) values (2,3),(1,4); +insert into t2 (id, v) values (5,5),(6,6); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +id v id v +1 4 5 5 +1 4 6 6 +UPDATE t1, t2 SET t1.v=-1, t2.v=-1 ORDER BY t1.id, t2.id LIMIT 2; +select * from t1; +id v +2 3 +1 -1 +select * from t2; +id v +5 -1 +6 -1 +drop table t1, t2; +create table t1 (id int primary key, v text) engine=myisam; +create table t2 (id int primary key, v text) engine=myisam; +insert into t1 (id, v) values (1,'b'),(2,'fo'),(3,'bar'),(4,'barr'),(5,'bazzz'); +insert into t2 (id, v) values (6,'quxqux'),(7,'foofoof'),(8,'barbarba'),(9,'quxquxqux'),(10,'bazbazbazb'); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +id v id v +1 b 6 quxqux +1 b 7 foofoof +update t1, t2 set t1.v='DELETED', t2.v='DELETED' order by t1.id, t2.id limit 2; +select * from t1; +id v +1 DELETED +2 fo +3 bar +4 barr +5 bazzz +select * from t2; +id v +6 DELETED +7 DELETED +8 barbarba +9 quxquxqux +10 bazbazbazb +drop table t1, t2; +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +create table t3 (id int primary key, v int); +insert into t1 (id, v) values (1, 1000), (2, 2000), (3, 3000), (4, 4000), (5, 5000); +insert into t2 (id, v) values (10, 100), (20, 200), (30, 300), (40, 400), (50, 500); +insert into t3 (id, v) values (11, 111), (22, 222), (33, 333), (44, 444), (55, 555); +select t1.*, t2.*, t3.* from t1, t2, t3 order by t1.id, t2.id, t3.id limit 3; +id v id v id v +1 1000 10 100 11 111 +1 1000 10 100 22 222 +1 1000 10 100 33 333 +UPDATE t1, t2, t3 SET t1.v=-1, t2.v=-2, t3.v=-3 ORDER BY t1.id, t2.id, t3.id LIMIT 3; +select * from t1; +id v +1 -1 +2 2000 +3 3000 +4 4000 +5 5000 +select * from t2; +id v +10 -2 +20 200 +30 300 +40 400 +50 500 +select * from t3; +id v +11 -3 +22 -3 +33 -3 +44 444 +55 555 +drop table t1, t2, t3; +# 
End of MariaDB 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/update.test mariadb-10.11.13/mysql-test/main/update.test --- mariadb-10.11.11/mysql-test/main/update.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/update.test 2025-05-19 16:14:24.000000000 +0000 @@ -707,3 +707,43 @@ drop table t1; --echo # End of MariaDB 10.4 tests + +--echo # +--echo # MDEV-35955 Wrong result for UPDATE ... ORDER BY LIMIT which uses tmp.table +--echo # + +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +insert into t1 (id, v) values (2,3),(1,4); +insert into t2 (id, v) values (5,5),(6,6); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +UPDATE t1, t2 SET t1.v=-1, t2.v=-1 ORDER BY t1.id, t2.id LIMIT 2; +select * from t1; +select * from t2; + +drop table t1, t2; +create table t1 (id int primary key, v text) engine=myisam; +create table t2 (id int primary key, v text) engine=myisam; +insert into t1 (id, v) values (1,'b'),(2,'fo'),(3,'bar'),(4,'barr'),(5,'bazzz'); +insert into t2 (id, v) values (6,'quxqux'),(7,'foofoof'),(8,'barbarba'),(9,'quxquxqux'),(10,'bazbazbazb'); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +update t1, t2 set t1.v='DELETED', t2.v='DELETED' order by t1.id, t2.id limit 2; +select * from t1; +select * from t2; + +drop table t1, t2; +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +create table t3 (id int primary key, v int); +insert into t1 (id, v) values (1, 1000), (2, 2000), (3, 3000), (4, 4000), (5, 5000); +insert into t2 (id, v) values (10, 100), (20, 200), (30, 300), (40, 400), (50, 500); +insert into t3 (id, v) values (11, 111), (22, 222), (33, 333), (44, 444), (55, 555); +select t1.*, t2.*, t3.* from t1, t2, t3 order by t1.id, t2.id, t3.id limit 3; +UPDATE t1, t2, t3 SET t1.v=-1, t2.v=-2, t3.v=-3 ORDER BY t1.id, t2.id, t3.id LIMIT 3; +select * from t1; +select * from t2; +select * from t3; + +drop table t1, 
t2, t3; + +--echo # End of MariaDB 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/userstat.result mariadb-10.11.13/mysql-test/main/userstat.result --- mariadb-10.11.11/mysql-test/main/userstat.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/userstat.result 2025-05-19 16:14:24.000000000 +0000 @@ -247,6 +247,11 @@ ERROR 21000: Subquery returns more than 1 row set global userstat= 0; drop function f; -# # End of 10.2 tests # +# MDEV-36586 USER_STATISTICS.BUSY_TIME is in microseconds +# +select distinct busy_time>1e5, cpu_time>1e5 from information_schema.user_statistics; +busy_time>1e5 cpu_time>1e5 +0 0 +# End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/userstat.test mariadb-10.11.13/mysql-test/main/userstat.test --- mariadb-10.11.11/mysql-test/main/userstat.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/userstat.test 2025-05-19 16:14:24.000000000 +0000 @@ -135,6 +135,11 @@ drop function f; --enable_ps2_protocol ---echo # --echo # End of 10.2 tests + --echo # +--echo # MDEV-36586 USER_STATISTICS.BUSY_TIME is in microseconds +--echo # +select distinct busy_time>1e5, cpu_time>1e5 from information_schema.user_statistics; + +--echo # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/view.result mariadb-10.11.13/mysql-test/main/view.result --- mariadb-10.11.11/mysql-test/main/view.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view.result 2025-05-19 16:14:24.000000000 +0000 @@ -944,31 +944,19 @@ create view v2 as select * from v1; create view v3 as select v2.col1 from v2,t2 where v2.col1 = t2.col1; insert into v2 values ((select max(col1) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'v2' insert into t1 values ((select max(col1) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 't1' insert into v2 values ((select max(col1) from v1)); -ERROR HY000: The definition of 
table 'v1' prevents operation INSERT on table 'v2' insert into v2 values ((select max(col1) from t1)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v2' insert into t1 values ((select max(col1) from t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v2 values ((select max(col1) from t1)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v2' insert into v2 values ((select max(col1) from v2)); -ERROR HY000: Table 'v2' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into t1 values ((select max(col1) from v2)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 't1' insert into v2 values ((select max(col1) from v2)); -ERROR HY000: Table 'v2' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v3 (col1) values ((select max(col1) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'v3' insert into v3 (col1) values ((select max(col1) from t1)); -ERROR HY000: The definition of table 'v3' prevents operation INSERT on table 'v3' insert into v3 (col1) values ((select max(col1) from v2)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v3' -insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v3' +insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2 LIMIT 1)); +ERROR 22003: Out of range value for column 'col1' at row 3 insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); insert into t3 values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); ERROR 23000: Column 'col1' cannot be null @@ -978,6 +966,18 @@ select * from t1; col1 NULL +NULL +NULL +NULL +NULL +NULL +NULL 
+NULL +NULL +NULL +NULL +NULL +NULL 1 2 3 @@ -1332,9 +1332,26 @@ insert into v3 values (30); ERROR HY000: The target table v3 of the INSERT is not insertable-into create view v4 as select * from v2 where 20 < (select (s1) from t1); +select * from t1; +s1 insert into v4 values (30); -ERROR HY000: The target table v4 of the INSERT is not insertable-into -drop view v4, v3, v2, v1; +select * from t1; +s1 +30 +create view v5 as select * from v2 where s1 < (select min(s1) from t1) WITH CHECK OPTION; +# can't insert only less then minimum +insert into v5 values (40); +ERROR 44000: CHECK OPTION failed `test`.`v5` +# allow insert the new minimum +insert into v5 values (10); +# always emply view (can't be something less than minimum) +select * from v5; +s1 +select * from t1; +s1 +30 +10 +drop view v5, v4, v3, v2, v1; drop table t1; create table t1 (a int); create view v1 as select * from t1; diff -Nru mariadb-10.11.11/mysql-test/main/view.test mariadb-10.11.13/mysql-test/main/view.test --- mariadb-10.11.11/mysql-test/main/view.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view.test 2025-05-19 16:14:24.000000000 +0000 @@ -866,33 +866,21 @@ create view v1 as select * from t1; create view v2 as select * from v1; create view v3 as select v2.col1 from v2,t2 where v2.col1 = t2.col1; --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into t1 values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from t1)); --- error ER_UPDATE_TABLE_USED insert into t1 values ((select max(col1) from t1)); --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from t1)); --- error ER_UPDATE_TABLE_USED insert into v2 values ((select max(col1) from v2)); --- error ER_VIEW_PREVENT_UPDATE insert into t1 values ((select max(col1) from v2)); --- 
error ER_UPDATE_TABLE_USED insert into v2 values ((select max(col1) from v2)); --- error ER_VIEW_PREVENT_UPDATE insert into v3 (col1) values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into v3 (col1) values ((select max(col1) from t1)); --- error ER_VIEW_PREVENT_UPDATE insert into v3 (col1) values ((select max(col1) from v2)); # check with TZ tables in list --- error ER_VIEW_PREVENT_UPDATE -insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2)); +--error ER_WARN_DATA_OUT_OF_RANGE +insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2 LIMIT 1)); insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); -- error ER_BAD_NULL_ERROR insert into t3 values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); @@ -1210,9 +1198,19 @@ -- error ER_NON_INSERTABLE_TABLE insert into v3 values (30); create view v4 as select * from v2 where 20 < (select (s1) from t1); --- error ER_NON_INSERTABLE_TABLE +select * from t1; insert into v4 values (30); -drop view v4, v3, v2, v1; +select * from t1; +create view v5 as select * from v2 where s1 < (select min(s1) from t1) WITH CHECK OPTION; +--echo # can't insert only less then minimum +--error ER_VIEW_CHECK_FAILED +insert into v5 values (40); +--echo # allow insert the new minimum +insert into v5 values (10); +--echo # always emply view (can't be something less than minimum) +select * from v5; +select * from t1; +drop view v5, v4, v3, v2, v1; drop table t1; # diff -Nru mariadb-10.11.11/mysql-test/main/view_grant.result mariadb-10.11.13/mysql-test/main/view_grant.result --- mariadb-10.11.11/mysql-test/main/view_grant.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view_grant.result 2025-05-19 16:14:24.000000000 +0000 @@ -1982,6 +1982,52 @@ DROP VIEW v1; DROP USER foo; DROP USER FOO; +# +# MDEV-36380: User has unauthorized access to a sequence through +# a view with security 
invoker +# +create database db; +use db; +create sequence s; +create sql security invoker view vin as select nextval(s); +create sql security definer view vdn as select nextval(s); +create sql security invoker view vil as select lastval(s); +create sql security definer view vdl as select lastval(s); +create sql security invoker view vis as select setval(s,20); +create sql security definer view vds as select setval(s,30); +create user u@localhost; +grant select on db.vin to u@localhost; +grant select on db.vdn to u@localhost; +grant select on db.vil to u@localhost; +grant select on db.vdl to u@localhost; +grant select on db.vis to u@localhost; +grant select on db.vds to u@localhost; +connect con1,localhost,u,,db; +select nextval(s); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `db`.`s` +select * from vin; +ERROR HY000: View 'db.vin' references invalid table(s) or column(s) or function(s) or definer/invoker of view lack rights to use them +select * from vdn; +nextval(s) +1 +select lastval(s); +ERROR 42000: SELECT command denied to user 'u'@'localhost' for table `db`.`s` +select * from vil; +ERROR HY000: View 'db.vil' references invalid table(s) or column(s) or function(s) or definer/invoker of view lack rights to use them +select * from vdl; +lastval(s) +1 +select setval(s,10); +ERROR 42000: INSERT command denied to user 'u'@'localhost' for table `db`.`s` +select * from vis; +ERROR HY000: View 'db.vis' references invalid table(s) or column(s) or function(s) or definer/invoker of view lack rights to use them +select * from vds; +setval(s,30) +30 +disconnect con1; +connection default; +drop database db; +drop user u@localhost; # End of 10.5 tests # Check that a user without access to the schema 'foo' cannot query # a JSON_TABLE view in that schema. 
diff -Nru mariadb-10.11.11/mysql-test/main/view_grant.test mariadb-10.11.13/mysql-test/main/view_grant.test --- mariadb-10.11.11/mysql-test/main/view_grant.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view_grant.test 2025-05-19 16:14:24.000000000 +0000 @@ -2237,6 +2237,53 @@ DROP USER foo; DROP USER FOO; +--echo # +--echo # MDEV-36380: User has unauthorized access to a sequence through +--echo # a view with security invoker +--echo # +create database db; +use db; +create sequence s; +create sql security invoker view vin as select nextval(s); +create sql security definer view vdn as select nextval(s); +create sql security invoker view vil as select lastval(s); +create sql security definer view vdl as select lastval(s); +create sql security invoker view vis as select setval(s,20); +create sql security definer view vds as select setval(s,30); +create user u@localhost; +grant select on db.vin to u@localhost; +grant select on db.vdn to u@localhost; +grant select on db.vil to u@localhost; +grant select on db.vdl to u@localhost; +grant select on db.vis to u@localhost; +grant select on db.vds to u@localhost; + +--connect (con1,localhost,u,,db) +--error ER_TABLEACCESS_DENIED_ERROR +select nextval(s); +--error ER_VIEW_INVALID +select * from vin; +--disable_ps2_protocol +select * from vdn; +--enable_ps2_protocol + +--error ER_TABLEACCESS_DENIED_ERROR +select lastval(s); +--error ER_VIEW_INVALID +select * from vil; +select * from vdl; + +--error ER_TABLEACCESS_DENIED_ERROR +select setval(s,10); +--error ER_VIEW_INVALID +select * from vis; +select * from vds; + +--disconnect con1 +--connection default +drop database db; +drop user u@localhost; + --echo # End of 10.5 tests --echo # Check that a user without access to the schema 'foo' cannot query diff -Nru mariadb-10.11.11/mysql-test/mariadb-test-run.pl mariadb-10.11.13/mysql-test/mariadb-test-run.pl --- mariadb-10.11.11/mysql-test/mariadb-test-run.pl 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/mariadb-test-run.pl 2025-05-19 16:14:24.000000000 +0000 @@ -130,6 +130,8 @@ our $path_current_testlog; our $path_testlog; +our $opt_open_files_limit; + our $default_vardir; our $opt_vardir; # Path to use for var/ dir our $plugindir; @@ -268,6 +270,9 @@ our $opt_skip_not_found= 0; our $opt_mem= $ENV{'MTR_MEM'}; our $opt_clean_vardir= $ENV{'MTR_CLEAN_VARDIR'}; +our $opt_catalogs= 0; +our $opt_catalog_name=""; +our $catalog_name="def"; our $opt_gcov; our $opt_gprof; @@ -1274,6 +1279,7 @@ 'list-options' => \$opt_list_options, 'skip-test-list=s' => \@opt_skip_test_list, 'xml-report=s' => \$opt_xml_report, + 'open-files-limit=i', => \$opt_open_files_limit, My::Debugger::options(), My::CoreDump::options(), @@ -2223,6 +2229,9 @@ { $ENV{'MYSQL_INSTALL_DB_EXE'}= mtr_exe_exists("$bindir/sql$multiconfig/mariadb-install-db", "$bindir/bin/mariadb-install-db"); + $ENV{'MARIADB_UPGRADE_SERVICE_EXE'}= mtr_exe_exists("$bindir/sql$multiconfig/mariadb-upgrade-service", + "$bindir/bin/mariadb-upgrade-service"); + $ENV{'MARIADB_UPGRADE_EXE'}= mtr_exe_exists("$path_client_bindir/mariadb-upgrade"); } my $client_config_exe= @@ -3945,6 +3954,23 @@ } } + # Set up things for catalogs + # The values of MARIADB_TOPDIR and MARIAD_DATADIR should + # be taken from the values used by the default (first) + # connection that is used by mariadb-test. + my ($mysqld, @servers); + @servers= all_servers(); + $mysqld= $servers[0]; + $ENV{'MARIADB_TOPDIR'}= $mysqld->value('datadir'); + if (!$opt_catalogs) + { + $ENV{'MARIADB_DATADIR'}= $mysqld->value('datadir'); + } + else + { + $ENV{'MARIADB_DATADIR'}= $mysqld->value('datadir') . "/" . 
$catalog_name; + } + # Write start of testcase to log mark_log($path_current_testlog, $tinfo); @@ -4458,14 +4484,13 @@ ( @global_suppressions, qr/error .*connecting to master/, - qr/InnoDB: Error: in ALTER TABLE `test`.`t[12]`/, - qr/InnoDB: Error: table `test`.`t[12]` .*does not exist in the InnoDB internal/, - qr/InnoDB: Warning: a long semaphore wait:/, qr/InnoDB: Dumping buffer pool.*/, qr/InnoDB: Buffer pool.*/, qr/InnoDB: Could not free any blocks in the buffer pool!/, - qr/InnoDB: Warning: Writer thread is waiting this semaphore:/, qr/InnoDB: innodb_open_files .* should not be greater than/, + qr/InnoDB: Trying to delete tablespace.*but there are.*pending/, + qr/InnoDB: Tablespace 1[0-9]* was not found at .*, and innodb_force_recovery was set/, + qr/InnoDB: Long wait \([0-9]+ seconds\) for double-write buffer flush/, qr/Slave: Unknown table 't1' .* 1051/, qr/Slave SQL:.*(Internal MariaDB error code: [[:digit:]]+|Query:.*)/, qr/slave SQL thread aborted/, @@ -5745,6 +5770,7 @@ append => 1, error => $path_current_testlog, verbose => $opt_verbose, + open_files_limit => $opt_open_files_limit, ); mtr_verbose("Started $proc"); return $proc; @@ -6043,6 +6069,8 @@ timediff With --timestamp, also print time passed since *previous* test started max-connections=N Max number of open connection to server in mysqltest + open-files-limit=N Max number of open files allowed for any of the children + of my_safe_process. Default is 1024. report-times Report how much time has been spent on different phases of test execution. 
stress=ARGS Run stress test, providing options to diff -Nru mariadb-10.11.11/mysql-test/std_data/galera_certs/galera.root.crt mariadb-10.11.13/mysql-test/std_data/galera_certs/galera.root.crt --- mariadb-10.11.11/mysql-test/std_data/galera_certs/galera.root.crt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/std_data/galera_certs/galera.root.crt 2025-05-19 16:14:24.000000000 +0000 @@ -2,7 +2,7 @@ MIIFlTCCA32gAwIBAgIUKCF88W+48rZzdfgYpE2dXVMGSKgwDQYJKoZIhvcNAQEL BQAwWjELMAkGA1UEBhMCRkkxETAPBgNVBAgMCEhlbHNpbmtpMREwDwYDVQQHDAhI ZWxzaW5raTEPMA0GA1UECgwGR2FsZXJhMRQwEgYDVQQDDAtnYWxlcmEucm9vdDAe -Fw0yMTAyMDQxMzE3MDJaFw0yMzExMjUxMzE3MDJaMFoxCzAJBgNVBAYTAkZJMREw +Fw0yMzEyMDExMzQzNDBaFw0zMzExMjgxMzQzNDBaMFoxCzAJBgNVBAYTAkZJMREw DwYDVQQIDAhIZWxzaW5raTERMA8GA1UEBwwISGVsc2lua2kxDzANBgNVBAoMBkdh bGVyYTEUMBIGA1UEAwwLZ2FsZXJhLnJvb3QwggIiMA0GCSqGSIb3DQEBAQUAA4IC DwAwggIKAoICAQDKqL45jbaq8RLOj+DeilPcEnBN5gn/y9V3IfZ0BQCd4bR09zLz @@ -18,15 +18,15 @@ F+XZTdTiaOWPEmvFFGLLUQxKl4w872hJaupqfteqdiZ+3ICVIUI8qnXHmwIDAQAB o1MwUTAdBgNVHQ4EFgQUs75v/MgjJ5RHGE6+0qdiVo4BwlowHwYDVR0jBBgwFoAU s75v/MgjJ5RHGE6+0qdiVo4BwlowDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0B -AQsFAAOCAgEAOVhBs28dwwvD5q2r7oVVcxLc+tb8zu4XxpXT1p6hiZYUyPguCh00 -GVdXCgR4JMI/NcyM5fBAbF3S8oK3+9rw2kW09afVV06Qf/8o3nIyOiDl7598tGIP -CCK4QsUW/dGajx5kvhtQ7qce+u9KfFTof6lq2xkYtFBBhmBdSv9A1jAZJMw2x3bc -nr99PS8XZMphS0MIExHKj6Ry5DdYm722zZHyIEiiEGyMViDm2m1iug5r/LPH5Z56 -BjQiH4VP+0y5mevBOUGuH8ID+J9Hu9BeoXLhkv+W2Ljs/S6wqzjinMBqVG+wwe0Y -a8F5pABkl5uX38nMQ7CikSbLxSbn7nRf+sux1sbzqjMldeCSqiv9mI5Ysq97+Ni1 -5qMxNxNc0u/wGRnrXH8fWfxBKPP5moA7DQfVcUWPgDGQwDpA8kn8RlJxFk3g4yaK -+NMwk5MORKyx3tz/A3Yhs9AUXk3okvmQCT2YVSHcKUB8PAU+TaKqbr3wk07Y/tL/ -jFPHS+t3eD91Y05KGUXjdtGi+33zpV0biHmTWAZT78VQowDNvEpTnXhkSx8HGHYR -nqSMU2m2LboHSatY113RYznx0LJ1azczRlJdGs8oyPWLPDD2JCesZaQqGZVRJoms -lK4EzYEb5mZTCRgtgoiO+iKcf6XifuOCrWZXoLm4FlLEfOQ3b8yAFlo= +AQsFAAOCAgEAKLV6mkWb88HEJXo1XlmAzznIYNfilrvvxwcjhceluDE8s8sPSpYM 
+Bz5ebWlHCgEkC/ezhA/PDtZsZlQKwv4jb++lAlFSlebT1GW77xKkdRBTKgkFAaOA +pF5eZao6IP8l76fA4OoI2Tttw5jeb23kOoklDp/8VS0JEAT3wm/hZiE20aUbAFC+ +kPiCucBztzaTHQud9CgtxRH/B3D9FaPuwae/H6FYrvQVNVjcaHTIUh9fTcyKRXYm +oYbvK7fIhCjZkG2LRWRU9Kirivb+ktO4POsuK4BgYrsFaOBf9HYsojA7llyGDopN +cfw9jtb27Qb/uMKJnClFg14u685CU5JAzY31E5OQPPUUx9PqP4Z9PgXRQ0xI6H/4 +sejlcQuqGCDKiL2lOzUjbT86EjO4ZfiKHR+lKOIuT5mXiR8cbS1JeyX3Mrv1Ds4r +UVcdtSXTy6/XYWFIzhu+MrsFon6VX0HkmSH1HjSoLMOZcHAZIFZZ/uAahLmMNaEG +lV15fD5+t5QRKwqmdFUW2ETiqSJxRs6Y++ptxpiiH38QVWPvBWeRgcPpf3A478Bl +iGO0xn0N57TnhFs3g0C0xyZgTBMozfVostYpps1Tqqz0VOhtmURxTZm9JZgTb7qv +nMURY0SIQKXpHCcJuNtxZcDSu8uxgUcMsLSSC7Zmk7/cSeUfmOgZVzU= -----END CERTIFICATE----- diff -Nru mariadb-10.11.11/mysql-test/suite/archive/archive-big.test mariadb-10.11.13/mysql-test/suite/archive/archive-big.test --- mariadb-10.11.11/mysql-test/suite/archive/archive-big.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/archive/archive-big.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,7 @@ --source include/big_test.inc -# Valgrind is to slow for this test +# Valgrind and msan is to slow for this test --source include/not_valgrind.inc +--source include/not_msan.inc --source include/have_archive.inc CREATE TABLE t1(a BLOB) ENGINE=ARCHIVE; --disable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/README.txt mariadb-10.11.13/mysql-test/suite/atomic/README.txt --- mariadb-10.11.11/mysql-test/suite/atomic/README.txt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/README.txt 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,7 @@ - Add # before --exec echo "restart" ... - Force $e (engine), $c (crash point) and $r (crash position) to the values - where things goes wrong. See comments in alter_table.test for how to do this. + where things goes wrong. See comments in alter_table.inc for how to do this. 
- start mariadbd in a debugger run the following in the debugger diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.inc mariadb-10.11.13/mysql-test/suite/atomic/alter_table.inc --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,198 @@ +--source include/long_test.inc +--source include/have_debug.inc +--source include/have_log_bin.inc + +if (!$BIG_TEST) +{ + --source include/not_valgrind.inc + --source include/not_msan.inc +} + +# +# Testing of atomic create table with crashes in a lot of different places +# +# Things tested: +# With myisam and InnoDB engines to ensure that cover both normal and +# online alter table paths. +# Alter table with new columns +# Alter table which only touches .frm +# Alter table disable keys (has it own code path) +# Alter table with rename +# Alter table with rename and only options that touches .frm +# Alter table with rename and add new columns +# Alter table with storage engine change (with and without column definition +# changes) +# Alter table with storage engine change and rename +# Alter table to another database + +--disable_query_log +call mtr.add_suppression("InnoDB: .* does not exist in the InnoDB internal"); +# Speed up wait_until_connected_again.inc +let NO_WSREP=1; +--enable_query_log +let $MYSQLD_DATADIR= `SELECT @@datadir`; + +create database test2; +RESET MASTER; + +if ($engine_count == "") +{ + let $engine_count=2; + let $engines='myisam','innodb'; +} +if ($extra_engine == "") +{ + let $extra_engine=aria; +} + +let $crash_count=13; +let $crash_points='ddl_log_alter_after_create_frm', 'ddl_log_alter_after_create_table', 'ddl_log_alter_after_prepare_inplace','ddl_log_alter_after_copy', 'ddl_log_alter_after_log', 'ddl_log_alter_after_rename_to_backup', 'ddl_log_alter_after_rename_to_backup_log', 'ddl_log_alter_rename_frm', 
'ddl_log_alter_after_rename_to_original', 'ddl_log_alter_before_rename_triggers', 'ddl_log_alter_after_rename_triggers', 'ddl_log_alter_after_delete_backup', 'ddl_log_alter_after_drop_original_table'; + +let $statement_count=16; +let $statements='ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new"', + 'ALTER TABLE t1 COMMENT "new"', + 'ALTER TABLE t1 change column a c int COMMENT "new"', + 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2', + 'ALTER TABLE t1 disable keys', + 'ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new"', + 'ALTER TABLE t1 rename t2', + 'ALTER TABLE t1 COMMENT "new", rename t2', + 'ALTER TABLE t1 change column a c int COMMENT "new", rename t2', + 'ALTER TABLE t1 ENGINE=$extra_engine, COMMENT "new"', + 'ALTER TABLE t1 change column a c int COMMENT "new", engine=$extra_engine', + 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=$extra_engine', + 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2', + 'ALTER TABLE t1 COMMENT "new", rename test2.t2', + 'ALTER TABLE t1 ADD key(b), COMMENT "new"', + 'ALTER TABLE t1 DROP INDEX a'; + +# If there is a need of testing one specific state (crash point and query), +# one can use the comments below to execute one specific test combination +#let $crash_count=1; +#let $crash_points='ddl_log_alter_after_create_frm'; +#let $statement_count= 1; +#let $statements='ALTER TABLE t1 ADD COLUMN c int, COMMENT "new"'; +#let $engine_count=1; +#let $engines='rocksdb'; +#--source include/have_rocksdb.inc + +let $old_debug=`select @@debug_dbug`; +let $e=0; +let $keep_include_silent=1; +let $grep_script=ALTER; +--disable_query_log + +while ($e < $engine_count) +{ + inc $e; + let $engine=`select ELT($e, $engines)`; + let $default_engine=$engine; + + --echo + --echo engine: $engine + --echo + + let $r=0; + while ($r < $statement_count) + { + inc $r; + let $statement=`select ELT($r, $statements)`; + --echo + --echo query: $statement + --echo + let $c=0; + while ($c < 
$crash_count) + { + inc $c; + let $crash=`select ELT($c, $crash_points)`; + + --eval create table t1 (a int, b int, key(a)) engine=$engine + insert into t1 values (1,1),(2,2); + commit; + flush tables; + + FLUSH BINARY LOGS; + --let $start_binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --echo crash point: $crash + if ($crash_count > 1) + { + --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect + } +# The following can be used for testing one specific failure +# if ($crash == "ddl_log_alter_after_log") +# { +# if ($r == 2) +# { +# --remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +# } +# } + --disable_reconnect + --eval set @@debug_dbug="+d,$crash",@debug_crash_counter=1 + let $errno=0; + --error 0,2013 + --eval $statement; + let $error=$errno; + --enable_reconnect + --source include/wait_until_connected_again.inc + --disable_query_log + --eval set @@debug_dbug="$old_debug" + + if ($error == 0) + { + echo "No crash!"; + } + if ($error != 0) + { + --list_files $MYSQLD_DATADIR/test t* + --list_files $MYSQLD_DATADIR/test *sql* + --list_files $MYSQLD_DATADIR/test2 t* + --list_files $MYSQLD_DATADIR/test2 *sql* + # Check which tables still exists + --error 0,1 + --file_exists $MYSQLD_DATADIR/test/t1.frm + let $error2=$errno; + if ($error2 == 0) + { + show create table t1; + select count(*) from t1; + } + if ($error2 == 1) + { + --error 0,1 + --file_exists $MYSQLD_DATADIR/test/t2.frm + let $error3=$errno; + if ($error3 == 0) + { + show create table t2; + select count(*) from t2; + } + if ($error3 == 1) + { + --echo "Table is in test2" + show create table test2.t2; + select count(*) from test2.t2; + } + } + --let $binlog_file=$start_binlog_file + --let $binlog_output_name=master-bin.000001 + + --source include/show_binlog_events.inc + if ($error) + { + --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_output_name=master-bin.000002 + if ($binlog_file != $start_binlog_file) + { + --source include/show_binlog_events.inc + 
} + } + } + --disable_warnings + drop table if exists t1,t2; + drop table if exists test2.t2; + --enable_warnings + } + } +} +drop database test2; +--enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.opt mariadb-10.11.13/mysql-test/suite/atomic/alter_table.opt --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-max-dirty-pages-pct=0 diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.result mariadb-10.11.13/mysql-test/suite/atomic/alter_table.result --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,3135 +0,0 @@ -create database test2; -RESET MASTER; - -engine: myisam - - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" - -query: ALTER TABLE t1 COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
- -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 
COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_drop_original_table -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT 
NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 - -query: ALTER TABLE t1 disable keys - -crash point: ddl_log_alter_after_create_frm -"No crash!" -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -"No crash!" -crash point: ddl_log_alter_after_log -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -query: ALTER TABLE t1 rename t2 - -crash point: ddl_log_alter_after_create_frm -"No crash!" -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -"No crash!" -crash point: ddl_log_alter_after_log -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_rename_triggers -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_before_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_delete_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_drop_original_table -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 
-crash point: ddl_log_alter_after_rename_to_backup_log -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) 
DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_drop_original_table -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 - -query: ALTER TABLE t1 COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ADD key(b), COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" - -query: ALTER TABLE t1 DROP INDEX a - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a - -engine: innodb - - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
- -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 disable keys - -crash point: ddl_log_alter_after_create_frm -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_create_table -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_copy -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_log -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_backup_log -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_rename_frm -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_delete_backup -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" - -query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -query: ALTER TABLE t1 rename t2 - -crash point: ddl_log_alter_after_create_frm -"No crash!" -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -"No crash!" -crash point: ddl_log_alter_after_log -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_rename_triggers -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" 
-crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) 
-2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci 
-count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No 
crash!" -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_before_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_delete_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_drop_original_table -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" 
-crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ADD key(b), COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
- -query: ALTER TABLE t1 DROP INDEX a - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,198 +0,0 @@ ---source include/have_debug.inc ---source include/have_innodb.inc ---source include/have_log_bin.inc - -if (!$BIG_TEST) -{ - --source include/not_valgrind.inc - --source include/not_msan.inc -} - -# -# Testing of atomic create table with crashes in a lot of different places -# -# Things tested: -# With myisam and InnoDB engines to ensure that cover both normal and -# online alter table paths. -# Alter table with new columns -# Alter table which only touches .frm -# Alter table disable keys (has it own code path) -# Alter table with rename -# Alter table with rename and only options that touches .frm -# Alter table with rename and add new columns -# Alter table with storage engine change (with and without column definition -# changes) -# Alter table with storage engine change and rename -# Alter table to another database - ---disable_query_log -call mtr.add_suppression("InnoDB: .* does not exist in the InnoDB internal"); -# Speed up wait_until_connected_again.inc -let NO_WSREP=1; ---enable_query_log -let $MYSQLD_DATADIR= `SELECT @@datadir`; - -create database test2; -RESET MASTER; - -if ($engine_count == "") -{ - let $engine_count=2; - let $engines='myisam','innodb'; -} -if ($extra_engine == "") -{ - let $extra_engine=aria; -} - -let $crash_count=13; -let $crash_points='ddl_log_alter_after_create_frm', 'ddl_log_alter_after_create_table', 'ddl_log_alter_after_prepare_inplace','ddl_log_alter_after_copy', 'ddl_log_alter_after_log', 'ddl_log_alter_after_rename_to_backup', 'ddl_log_alter_after_rename_to_backup_log', 'ddl_log_alter_rename_frm', 'ddl_log_alter_after_rename_to_original', 'ddl_log_alter_before_rename_triggers', 
'ddl_log_alter_after_rename_triggers', 'ddl_log_alter_after_delete_backup', 'ddl_log_alter_after_drop_original_table'; - -let $statement_count=16; -let $statements='ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new"', - 'ALTER TABLE t1 COMMENT "new"', - 'ALTER TABLE t1 change column a c int COMMENT "new"', - 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2', - 'ALTER TABLE t1 disable keys', - 'ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new"', - 'ALTER TABLE t1 rename t2', - 'ALTER TABLE t1 COMMENT "new", rename t2', - 'ALTER TABLE t1 change column a c int COMMENT "new", rename t2', - 'ALTER TABLE t1 ENGINE=$extra_engine, COMMENT "new"', - 'ALTER TABLE t1 change column a c int COMMENT "new", engine=$extra_engine', - 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=$extra_engine', - 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2', - 'ALTER TABLE t1 COMMENT "new", rename test2.t2', - 'ALTER TABLE t1 ADD key(b), COMMENT "new"', - 'ALTER TABLE t1 DROP INDEX a'; - -# If there is a need of testing one specific state (crash point and query), -# one can use the comments below to execute one specific test combination -#let $crash_count=1; -#let $crash_points='ddl_log_alter_after_create_frm'; -#let $statement_count= 1; -#let $statements='ALTER TABLE t1 ADD COLUMN c int, COMMENT "new"'; -#let $engine_count=1; -#let $engines='rocksdb'; -#--source include/have_rocksdb.inc - -let $old_debug=`select @@debug_dbug`; -let $e=0; -let $keep_include_silent=1; -let $grep_script=ALTER; ---disable_query_log - -while ($e < $engine_count) -{ - inc $e; - let $engine=`select ELT($e, $engines)`; - let $default_engine=$engine; - - --echo - --echo engine: $engine - --echo - - let $r=0; - while ($r < $statement_count) - { - inc $r; - let $statement=`select ELT($r, $statements)`; - --echo - --echo query: $statement - --echo - let $c=0; - while ($c < $crash_count) - { - inc $c; - let $crash=`select ELT($c, $crash_points)`; - - --eval create 
table t1 (a int, b int, key(a)) engine=$engine - insert into t1 values (1,1),(2,2); - commit; - flush tables; - - FLUSH BINARY LOGS; - --let $start_binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) - --echo crash point: $crash - if ($crash_count > 1) - { - --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect - } -# The following can be used for testing one specific failure -# if ($crash == "ddl_log_alter_after_log") -# { -# if ($r == 2) -# { -# --remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -# } -# } - --disable_reconnect - --eval set @@debug_dbug="+d,$crash",@debug_crash_counter=1 - let $errno=0; - --error 0,2013 - --eval $statement; - let $error=$errno; - --enable_reconnect - --source include/wait_until_connected_again.inc - --disable_query_log - --eval set @@debug_dbug="$old_debug" - - if ($error == 0) - { - echo "No crash!"; - } - if ($error != 0) - { - --list_files $MYSQLD_DATADIR/test t* - --list_files $MYSQLD_DATADIR/test *sql* - --list_files $MYSQLD_DATADIR/test2 t* - --list_files $MYSQLD_DATADIR/test2 *sql* - # Check which tables still exists - --error 0,1 - --file_exists $MYSQLD_DATADIR/test/t1.frm - let $error2=$errno; - if ($error2 == 0) - { - show create table t1; - select count(*) from t1; - } - if ($error2 == 1) - { - --error 0,1 - --file_exists $MYSQLD_DATADIR/test/t2.frm - let $error3=$errno; - if ($error3 == 0) - { - show create table t2; - select count(*) from t2; - } - if ($error3 == 1) - { - --echo "Table is in test2" - show create table test2.t2; - select count(*) from test2.t2; - } - } - --let $binlog_file=$start_binlog_file - --let $binlog_output_name=master-bin.000001 - - --source include/show_binlog_events.inc - if ($error) - { - --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) - --let $binlog_output_name=master-bin.000002 - if ($binlog_file != $start_binlog_file) - { - --source include/show_binlog_events.inc - } - } - } - --disable_warnings - drop table if exists t1,t2; - drop table if exists 
test2.t2; - --enable_warnings - } - } -} -drop database test2; ---enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_aria.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_aria.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_aria.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_aria.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,4 +4,4 @@ let $engine_count=1; let $engines='aria'; let $extra_engine=myisam; ---source alter_table.test +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.opt mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.opt --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--innodb-max-dirty-pages-pct=0 diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.result mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.result --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,1396 @@ +create database test2; +RESET MASTER; + +engine: innodb + + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 disable keys + +crash point: ddl_log_alter_after_create_frm +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_create_table +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_copy +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_log +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_backup_log +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_rename_frm +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_delete_backup +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" + +query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +query: ALTER TABLE t1 rename t2 + +crash point: ddl_log_alter_after_create_frm +"No crash!" +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +"No crash!" +crash point: ddl_log_alter_after_log +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_rename_triggers +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" 
+crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) 
+2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci 
+count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No 
crash!" +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_before_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_delete_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_drop_original_table +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" 
+crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD key(b), COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" 
+ +query: ALTER TABLE t1 DROP INDEX a + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" 
diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,7 @@ +# +# Test atomic alter table with InnoDB + +--source include/have_innodb.inc +let $engine_count=1; +let $engines='innodb'; +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.result mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.result --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,1741 @@ +create database test2; +RESET MASTER; + +engine: myisam + + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" + +query: ALTER TABLE t1 COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" 
+ +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 
COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_drop_original_table +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT 
NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 + +query: ALTER TABLE t1 disable keys + +crash point: ddl_log_alter_after_create_frm +"No crash!" +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +"No crash!" +crash point: ddl_log_alter_after_log +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +query: ALTER TABLE t1 rename t2 + +crash point: ddl_log_alter_after_create_frm +"No crash!" +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +"No crash!" +crash point: ddl_log_alter_after_log +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_rename_triggers +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_before_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_before_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_delete_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_drop_original_table +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 
+crash point: ddl_log_alter_after_rename_to_backup_log +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) 
DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_drop_original_table +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 + +query: ALTER TABLE t1 COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD key(b), COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" + +query: ALTER TABLE t1 DROP INDEX a + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,6 @@ +# +# Test atomic alter table with MyISAM + +let $engine_count=1; +let $engines='myisam'; +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_rocksdb.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_rocksdb.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_rocksdb.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_rocksdb.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,4 +3,4 @@ let $engine_count=1; let $engines='rocksdb'; set global rocksdb_flush_log_at_trx_commit=1; ---source alter_table.test +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_trigger.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_trigger.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_trigger.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_trigger.test 2025-05-19 16:14:24.000000000 +0000 
@@ -7,7 +7,7 @@ # # Testing of atomic create table with crashes in a lot of different places # -# This is very similar to the alter_table.test, but includes testing of +# This is very similar to the alter_table.inc, but includes testing of # triggers in with ALTER TABLE .. RENAME. # diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/create_table.test mariadb-10.11.13/mysql-test/suite/atomic/create_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/create_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/create_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_debug.inc --source include/have_sequence.inc --source include/have_innodb.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/drop_table.test mariadb-10.11.13/mysql-test/suite/atomic/drop_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/drop_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/drop_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_debug.inc --source include/have_innodb.inc --source include/have_csv.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/rename_table.test mariadb-10.11.13/mysql-test/suite/atomic/rename_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/rename_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/rename_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_debug.inc --source include/have_innodb.inc --source include/have_csv.inc diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_commit_fail.result mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_commit_fail.result --- mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_commit_fail.result 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_commit_fail.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,116 @@ +set @@session.gtid_domain_id=1; +set @save_gtid_stric_mode=@@global.gtid_strict_mode; +create table ta (a int) engine=aria; +create table ti (a int) engine=innodb; +create table ti_pk (a int primary key) engine=innodb; +create table t (a int) engine=innodb; +create function f_i() +returns integer +begin +insert into ti set a=1; +return 1; +end | +create function f_ia(arg int) +returns integer +begin +insert into ti_pk set a=1; +insert into ta set a=1; +insert into ti_pk set a=arg; +return 1; +end | +call mtr.add_suppression("Error writing file"); +select count(*) as zero from t; +zero +0 +select count(*) as zero from ta; +zero +0 +select count(*) as zero from ti; +zero +0 +# 1. simple Innodb test +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into t set a=1; +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# observe effective rollback +select count(*) as zero from t; +zero +0 +# 2. simple Aira test +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into ta values (1),(2); +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# note no rollback +select count(*) as '*NON-zero*' from ta; +*NON-zero* +2 +delete from ta; +# 3. multi-engine test +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into ta set a=f_i(); +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# note no rollback.. 
+select count(*) as one from ta; +one +1 +# ..except transactional engine +select count(*) as zero from ti; +zero +0 +delete from ta; +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into t set a=f_ia(0); +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# note no rollback.. +select count(*) as one from ta; +one +1 +# ..except transactional engine +select count(*) as zero from t; +zero +0 +select count(*) as zero from ti_pk; +zero +0 +delete from ta; +# 4. create-table-select-f() +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +create table f_x (a int) select f_i() as a; +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# rollback indeed takes place in the pure transactional case +select count(*) as zero from ti; +zero +0 +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +create table t_x (a int) engine=aria select f_ia(0) as a; +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +select * from t_x; +ERROR 42S02: Table 'test.t_x' doesn't exist +# **TODO**: fix MDEV-36027 +# **TODO**: the empty binlog is buggy .. +include/show_binlog_events.inc +# .. as non-transactional `ta` (and `t_x` sic!) are modified +select count(*) as one from ta; +one +1 +select count(*) as zero from ti; +zero +0 +delete from ta; +#. 
+set @@global.gtid_strict_mode=@save_gtid_stric_mode; +drop function f_i; +drop function f_ia; +drop table t, ta, ti, ti_pk; diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result --- mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result 2025-05-19 16:14:24.000000000 +0000 @@ -18,6 +18,51 @@ # Ensuring file offset of binlog_f2_mid < binlog_f1_end # # +# Test using --read-from-remote-server +# +connection default; +# +# --stop-position tests +# +# Case 1.a) With one binlog file, a --stop-position before the end of +# the file should not result in a warning +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f1_pre_rotate binlog_f1_full --result-file=tmp/warn_position_test_file.out 2>&1 +# +# Case 1.b) With one binlog file, a --stop-position at the exact end of +# the file should not result in a warning +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f1_end binlog_f1_full --result-file=tmp/warn_position_test_file.out 2>&1 +# +# Case 1.c) With one binlog file, a --stop-position past the end of the +# file should(!) 
result in a warning +# MYSQL_BINLOG --read-from-remote-server --short-form --stop-position=binlog_f1_over_eof binlog_f1_full --result-file=tmp/warn_position_test_file.out 2>&1 +WARNING: Did not reach stop position before end of input +# +# Case 2.a) With two binlog files, a --stop-position targeting b2 which +# exists in the size of b1 should: +# 1) not provide any warnings +# 2) not prevent b2 from outputting its desired events before the +# stop position +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f2_mid binlog_f1_full binlog_f2_full --result-file=tmp/warn_position_test_file.out 2>&1 +include/assert_grep.inc [Ensure all intended GTIDs are present] +include/assert_grep.inc [Ensure the next GTID binlogged is _not_ present] +# +# Case 2.b) With two binlog files, a --stop-position targeting the end +# of binlog 2 should: +# 1) not provide any warnings +# 2) not prevent b2 from outputting its entire binary log +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f2_end binlog_f1_full binlog_f2_full --result-file=tmp/warn_position_test_file.out 2>&1 +include/assert_grep.inc [Ensure a GTID exists for each transaction] +include/assert_grep.inc [Ensure the last GTID binlogged is present] +# +# Case 2.c) With two binlog files, a --stop-position targeting beyond +# the eof of binlog 2 should: +# 1) provide a warning that the stop position was not reached +# 2) not prevent b2 from outputting its entire binary log +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f2_over_eof binlog_f1_full binlog_f2_full --result-file=tmp/warn_position_test_file.out 2>&1 +WARNING: Did not reach stop position before end of input +include/assert_grep.inc [Ensure a GTID exists for each transaction] +# +# # Test using local binlog files # connection default; diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_commit_fail.test mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_commit_fail.test --- 
mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_commit_fail.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_commit_fail.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,135 @@ +# Tests of commit time failures. +# At committing of an auto-commit statement a failure to commit in its +# binlog branch should rollback at least the transactional part of the statement. +# +# References: +# MDEV-35506 commit policy of one-phase-commit even at errored-out binlogging leads to assert +# MDEV-36027 Errored-out CREATE-SELECT does not binlog results of non-transactional table modification + +source include/have_innodb.inc; +source include/have_binlog_format_row.inc; + +set @@session.gtid_domain_id=1; +set @save_gtid_stric_mode=@@global.gtid_strict_mode; + +create table ta (a int) engine=aria; +create table ti (a int) engine=innodb; +create table ti_pk (a int primary key) engine=innodb; +create table t (a int) engine=innodb; +delimiter |; +create function f_i() +returns integer +begin + insert into ti set a=1; +return 1; +end | +create function f_ia(arg int) +returns integer +begin + insert into ti_pk set a=1; + insert into ta set a=1; + insert into ti_pk set a=arg; + return 1; +end | +delimiter ;| + +call mtr.add_suppression("Error writing file"); + +# Naturally all empty now +select count(*) as zero from t; +select count(*) as zero from ta; +select count(*) as zero from ti; + +# Force manual value assignement to gtid::seq_no while in the strict mode +# so that the value is rejected. Despite the errorred out statement +# being at its commit phase it will eventually be rolled back. +# Side effects of non-transactional engines, like Aria, are displayed. +--echo # 1. 
simple Innodb test +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +# mask possible allowed seq_no shift +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into t set a=1; + +--echo # observe effective rollback +select count(*) as zero from t; + +--echo # 2. simple Aira test +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into ta values (1),(2); + +--echo # note no rollback +select count(*) as '*NON-zero*' from ta; +# local cleanup +delete from ta; + +--echo # 3. multi-engine test +# A. non-transactional top-level +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into ta set a=f_i(); +--echo # note no rollback.. +select count(*) as one from ta; +--echo # ..except transactional engine +select count(*) as zero from ti; +delete from ta; + +# B. non-transactional in the leaf +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into t set a=f_ia(0); + +--echo # note no rollback.. +select count(*) as one from ta; +--echo # ..except transactional engine +select count(*) as zero from t; +select count(*) as zero from ti_pk; +delete from ta; + +--echo # 4. create-table-select-f() +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) +# A. 
two phase commit branch +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +create table f_x (a int) select f_i() as a; +--echo # rollback indeed takes place in the pure transactional case +select count(*) as zero from ti; + +# B. one phase commit branch +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +create table t_x (a int) engine=aria select f_ia(0) as a; +--error ER_NO_SUCH_TABLE +select * from t_x; + +--echo # **TODO**: fix MDEV-36027 +--echo # **TODO**: the empty binlog is buggy .. +--source include/show_binlog_events.inc +--echo # .. as non-transactional `ta` (and `t_x` sic!) are modified +select count(*) as one from ta; +select count(*) as zero from ti; + +delete from ta; +--echo #. 
+ +# cleanup + +set @@global.gtid_strict_mode=@save_gtid_stric_mode; +drop function f_i; +drop function f_ia; +drop table t, ta, ti, ti_pk; diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test --- mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test 2025-05-19 16:14:24.000000000 +0000 @@ -64,13 +64,12 @@ --die Mid point chosen to end in binlog 2 does not exist in earlier binlog } -#--echo # -#--echo # -#--echo # Test using --read-from-remote-server -#--echo # -#--let $read_from_remote_server= 1 -#--emit warning is not supported by --read-from-remote-server now -#--source binlog_mysqlbinlog_warn_stop_position.inc +--echo # +--echo # +--echo # Test using --read-from-remote-server +--echo # +--let $read_from_remote_server= 1 +--source binlog_mysqlbinlog_warn_stop_position.inc --echo # --echo # diff -Nru mariadb-10.11.11/mysql-test/suite/binlog_encryption/encrypted_master.test mariadb-10.11.13/mysql-test/suite/binlog_encryption/encrypted_master.test --- mariadb-10.11.11/mysql-test/suite/binlog_encryption/encrypted_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog_encryption/encrypted_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -18,6 +18,7 @@ # - with annotated events, default checksums and minimal binlog row image # +--source include/long_test.inc # The test can take very long time with valgrind --source include/not_valgrind.inc diff -Nru mariadb-10.11.11/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result mariadb-10.11.13/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result --- mariadb-10.11.11/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result 2025-01-30 
11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,16 +1,15 @@ ***MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts *** include/master-slave.inc [connection master] -connection server_2; -SET sql_log_bin=0; +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CALL mtr.add_suppression("InnoDB: Transaction was aborted due to "); CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); -SET sql_log_bin=1; +connection server_2; SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; include/stop_slave.inc SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_use_gtid=slave_pos; connection server_1; -ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; CREATE TABLE t4 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB; INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6); connect con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,; diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/r/doublewrite_debug.result mariadb-10.11.13/mysql-test/suite/encryption/r/doublewrite_debug.result --- mariadb-10.11.11/mysql-test/suite/encryption/r/doublewrite_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/r/doublewrite_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -3,8 +3,9 @@ call mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); -create table t1 (f1 int primary key, f2 blob)page_compressed = 1 engine=innodb stats_persistent=0; -create table t2(f1 int primary key, f2 blob)engine=innodb stats_persistent=0; +create table t1 (f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=yes stats_persistent=0; +create table t2(f1 int primary key, f2 
blob)engine=innodb encrypted=yes stats_persistent=0; +create table t3(f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=no stats_persistent=0; start transaction; insert into t1 values(1, repeat('#',12)); insert into t1 values(2, repeat('+',12)); @@ -12,29 +13,37 @@ insert into t1 values(4, repeat('-',12)); insert into t1 values(5, repeat('.',12)); insert into t2 select * from t1; +insert into t3 select * from t1; commit work; SET GLOBAL innodb_fast_shutdown = 0; # restart: --debug_dbug=+d,ib_log_checkpoint_avoid_hard --innodb_flush_sync=0 select space into @t1_space_id from information_schema.innodb_sys_tablespaces where name='test/t1'; select space into @t2_space_id from information_schema.innodb_sys_tablespaces where name='test/t2'; +select space into @t3_space_id from information_schema.innodb_sys_tablespaces where name='test/t3'; begin; insert into t1 values (6, repeat('%', 400)); insert into t2 values (6, repeat('%', 400)); +insert into t3 values (6, repeat('%', 400)); # xtrabackup prepare set global innodb_saved_page_number_debug = 3; set global innodb_fil_make_page_dirty_debug = @t1_space_id; set global innodb_saved_page_number_debug = 3; set global innodb_fil_make_page_dirty_debug = @t2_space_id; +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t3_space_id; set global innodb_buf_flush_list_now = 1; # Kill the server # restart -FOUND 2 /InnoDB: Recovered page \[page id: space=[1-9]*, page number=3\]/ in mysqld.1.err +FOUND 3 /InnoDB: Recovered page \[page id: space=[1-9]*, page number=3\]/ in mysqld.1.err check table t1; Table Op Msg_type Msg_text test.t1 check status OK check table t2; Table Op Msg_type Msg_text test.t2 check status OK +check table t3; +Table Op Msg_type Msg_text +test.t3 check status OK select f1, f2 from t1; f1 f2 1 ############ @@ -49,6 +58,13 @@ 3 //////////// 4 ------------ 5 ............ 
+select f1, f2 from t3; +f1 f2 +1 ############ +2 ++++++++++++ +3 //////////// +4 ------------ +5 ............ SET GLOBAL innodb_fast_shutdown = 0; # shutdown server # remove datadir @@ -78,4 +94,4 @@ 3 //////////// 4 ------------ 5 ............ -drop table t2, t1; +drop table t3, t2, t1; diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.opt mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.opt --- mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,3 @@ --innodb-use-atomic-writes=0 ---innodb-encrypt-tables=FORCE +--innodb-encrypt-tables=on --innodb_sys_tablespaces diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.test mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.test --- mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -12,8 +12,9 @@ let MYSQLD_DATADIR=`select @@datadir`; let ALGO=`select @@innodb_checksum_algorithm`; -create table t1 (f1 int primary key, f2 blob)page_compressed = 1 engine=innodb stats_persistent=0; -create table t2(f1 int primary key, f2 blob)engine=innodb stats_persistent=0; +create table t1 (f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=yes stats_persistent=0; +create table t2(f1 int primary key, f2 blob)engine=innodb encrypted=yes stats_persistent=0; +create table t3(f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=no stats_persistent=0; start transaction; insert into t1 values(1, repeat('#',12)); @@ -22,6 +23,7 @@ insert into t1 values(4, repeat('-',12)); insert into t1 values(5, repeat('.',12)); insert into t2 select * from t1; +insert into t3 select * from t1; commit work; # 
Slow shutdown and restart to make sure ibuf merge is finished @@ -33,15 +35,17 @@ select space into @t1_space_id from information_schema.innodb_sys_tablespaces where name='test/t1'; select space into @t2_space_id from information_schema.innodb_sys_tablespaces where name='test/t2'; +select space into @t3_space_id from information_schema.innodb_sys_tablespaces where name='test/t3'; begin; insert into t1 values (6, repeat('%', 400)); insert into t2 values (6, repeat('%', 400)); +insert into t3 values (6, repeat('%', 400)); -# Copy the t1.ibd, t2.ibd file +# Copy the t1.ibd, t2.ibd, t3.ibd file let $targetdir=$MYSQLTEST_VARDIR/tmp/backup_1; --disable_result_log -exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir; +exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --skip-innodb-log-checkpoint-now --target-dir=$targetdir; --enable_result_log echo # xtrabackup prepare; @@ -54,8 +58,11 @@ set global innodb_saved_page_number_debug = 3; set global innodb_fil_make_page_dirty_debug = @t2_space_id; +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t3_space_id; + set global innodb_buf_flush_list_now = 1; ---let CLEANUP_IF_CHECKPOINT=drop table t1, t2, unexpected_checkpoint; +--let CLEANUP_IF_CHECKPOINT=drop table t1, t2, t3, unexpected_checkpoint; --source ../../suite/innodb/include/no_checkpoint_end.inc # Corrupt the page 3 in t1.ibd, t2.ibd file perl; @@ -103,6 +110,15 @@ sysseek(FILE, 3*$page_size, 0); print FILE chr(0) x ($ENV{'INNODB_PAGE_SIZE'}); close FILE; + +# Zero the complete page +my $fname= "$ENV{'MYSQLD_DATADIR'}test/t3.ibd"; +open(FILE, "+<", $fname) or die; +FILE->autoflush(1); +binmode FILE; +sysseek(FILE, 3*$page_size, 0); +print FILE chr(0) x ($ENV{'INNODB_PAGE_SIZE'}); +close FILE; EOF # Successful recover from doublewrite buffer @@ -114,8 +130,10 @@ check table t1; check table t2; +check table t3; select f1, f2 from t1; select f1, f2 from t2; +select f1, f2 
from t3; SET GLOBAL innodb_fast_shutdown = 0; let $shutdown_timeout=; @@ -220,4 +238,4 @@ --source ../../mariabackup/include/restart_and_restore.inc select * from t1; -drop table t2, t1; +drop table t3, t2, t1; diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt mariadb-10.11.13/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt --- mariadb-10.11.11/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1,2 @@ ---innodb_buffer_pool_size=5M +--innodb_buffer_pool_size=6M --innodb_encrypt_temporary_tables=1 diff -Nru mariadb-10.11.11/mysql-test/suite/engines/iuds/r/insert_time.result mariadb-10.11.13/mysql-test/suite/engines/iuds/r/insert_time.result --- mariadb-10.11.11/mysql-test/suite/engines/iuds/r/insert_time.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/engines/iuds/r/insert_time.result 2025-05-19 16:14:24.000000000 +0000 @@ -5073,10 +5073,14 @@ INSERT INTO t3(c1,c2) VALUES('34 9:23','34 9:23') /* throws error as row exists with c1='34 9:23',c2='34 9:23' */; ERROR 23000: Duplicate entry '825:23:00-825:23:00' for key 'idx' INSERT IGNORE INTO t1(c1,c2) VALUES('10:22:33','10:22:34') /* doesn't throw error */; +Warnings: +Warning 1062 Duplicate entry '10:22:33' for key 'PRIMARY' INSERT IGNORE INTO t2(c1,c2) VALUES('12:34:56.78','12:34:56.78') /*doesn't throw error */; Warnings: Warning 1062 Duplicate entry '12:34:56-12:34:56' for key 'PRIMARY' INSERT IGNORE INTO t1(c1,c2) VALUES('10:22:34','34 9:23') /*doesn't throw error */; +Warnings: +Warning 1062 Duplicate entry '825:23:00' for key 'c2' INSERT IGNORE INTO t3(c1,c2) VALUES('34 9:23','34 9:23') /*doesn't throw error */; Warnings: Warning 1062 Duplicate entry '825:23:00-825:23:00' for key 'idx' diff -Nru 
mariadb-10.11.11/mysql-test/suite/federated/federatedx.result mariadb-10.11.13/mysql-test/suite/federated/federatedx.result --- mariadb-10.11.11/mysql-test/suite/federated/federatedx.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx.result 2025-05-19 16:14:24.000000000 +0000 @@ -79,7 +79,7 @@ `name` varchar(32) NOT NULL default '' ) ENGINE="FEDERATED" DEFAULT CHARSET=latin1 -CONNECTION='mysql://root@127.0.0.1:SLAVE_PORT/federated/t1'; +CONNECTION='mariadb://root@127.0.0.1:SLAVE_PORT/federated/t1'; INSERT INTO federated.t1 (id, name) VALUES (1, 'foo'); INSERT INTO federated.t1 (id, name) VALUES (2, 'fee'); INSERT INTO federated.t1 (id, `group`) VALUES (3, 42); diff -Nru mariadb-10.11.11/mysql-test/suite/federated/federatedx.test mariadb-10.11.13/mysql-test/suite/federated/federatedx.test --- mariadb-10.11.11/mysql-test/suite/federated/federatedx.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx.test 2025-05-19 16:14:24.000000000 +0000 @@ -92,7 +92,7 @@ `name` varchar(32) NOT NULL default '' ) ENGINE="FEDERATED" DEFAULT CHARSET=latin1 - CONNECTION='mysql://root@127.0.0.1:$SLAVE_MYPORT/federated/t1'; + CONNECTION='mariadb://root@127.0.0.1:$SLAVE_MYPORT/federated/t1'; INSERT INTO federated.t1 (id, name) VALUES (1, 'foo'); INSERT INTO federated.t1 (id, name) VALUES (2, 'fee'); diff -Nru mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.result mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.result --- mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.result 2025-05-19 16:14:24.000000000 +0000 @@ -479,12 +479,12 @@ INSERT INTO federated.t3 VALUES (1),(2),(3); CREATE TABLE federated.t4 (a INT); connection master; -CREATE SERVER fedlink FOREIGN DATA WRAPPER mysql +CREATE SERVER fedlink FOREIGN 
DATA WRAPPER mariadb OPTIONS (USER 'root', HOST '127.0.0.1', DATABASE 'federated', PORT SLAVE_PORT); CREATE TABLE federated.t3 (a INT) ENGINE=FEDERATED -CONNECTION='mysql://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' +CONNECTION='mariadb://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' PARTITION BY list (a) (PARTITION p1 VALUES IN (1) CONNECTION='fedlink/t3', PARTITION p2 VALUES IN (2) CONNECTION='fedlink/t4'); diff -Nru mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.test mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.test --- mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,9 +7,6 @@ set global federated_pushdown=1; -#Enable after fix MDEV-31846 or in v. 10.5 and later ---disable_cursor_protocol - connection slave; DROP TABLE IF EXISTS federated.t1; @@ -168,11 +165,13 @@ --sorted_result select * from federated.t4; +--disable_cursor_protocol select name into @var from federated.t1 where id=3 limit 1 ; select @var; --disable_ps2_protocol select name into outfile 'tmp.txt' from federated.t1; --enable_ps2_protocol +--enable_cursor_protocol let $path=`select concat(@@datadir, 'test/tmp.txt')`; remove_file $path; @@ -307,13 +306,13 @@ connection master; --replace_result $SLAVE_MYPORT SLAVE_PORT -eval CREATE SERVER fedlink FOREIGN DATA WRAPPER mysql +eval CREATE SERVER fedlink FOREIGN DATA WRAPPER mariadb OPTIONS (USER 'root', HOST '127.0.0.1', DATABASE 'federated', PORT $SLAVE_MYPORT); CREATE TABLE federated.t3 (a INT) ENGINE=FEDERATED - CONNECTION='mysql://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' + CONNECTION='mariadb://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' PARTITION BY list (a) (PARTITION p1 VALUES IN (1) CONNECTION='fedlink/t3', PARTITION p2 VALUES IN (2) CONNECTION='fedlink/t4'); @@ -439,7 +438,5 @@ set global federated_pushdown=0; 
---enable_cursor_protocol - source include/federated_cleanup.inc; diff -Nru mariadb-10.11.11/mysql-test/suite/funcs_2/t/innodb_charset.test mariadb-10.11.13/mysql-test/suite/funcs_2/t/innodb_charset.test --- mariadb-10.11.11/mysql-test/suite/funcs_2/t/innodb_charset.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/funcs_2/t/innodb_charset.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ # Checking of other prerequisites is in charset_master.test # ################################################################################ ---source include/no_valgrind_without_big.inc +--source include/long_test.inc --source include/have_innodb.inc # Starting with MariaDB 10.6, ensure that DDL recovery will have completed diff -Nru mariadb-10.11.11/mysql-test/suite/galera/disabled.def mariadb-10.11.13/mysql-test/suite/galera/disabled.def --- mariadb-10.11.11/mysql-test/suite/galera/disabled.def 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/disabled.def 2025-05-19 16:14:24.000000000 +0000 @@ -10,5 +10,7 @@ # ############################################################################## -galera_sequences : MDEV-35934/MDEV-33850 For Galera, create sequence with low cache got signal 6 error: [ERROR] WSREP: FSM: no such a transition REPLICATING -> COMMITTED -MDEV-26266 : MDEV-26266 +galera_wan : MDEV-35940 Unallowed state transition: donor -> synced in galera_wan +galera_vote_rejoin_ddl : MDEV-35940 Unallowed state transition: donor -> synced in galera_wan +MW-329 : MDEV-35951 Complete freeze during MW-329 test +galera_vote_rejoin_dml : MDEV-35964 Assertion `ist_seqno >= cc_seqno' failed in galera_vote_rejoin_dml diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes.cnf 2025-05-19 
16:14:24.000000000 +0000 @@ -17,7 +17,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -28,7 +28,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_master.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_master.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_master.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_master.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -25,7 +25,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -38,7 +38,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,7 +24,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -37,7 +37,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_slave.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,7 +24,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' 
wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -37,7 +37,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_3nodes_as_slave.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_3nodes_as_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_3nodes_as_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_3nodes_as_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,7 +24,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -37,7 +37,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' @@ -50,7 +50,7 @@ #sst_port=@OPT.port wsrep-provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.3.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.3.port wsrep_sst_receive_address='127.0.0.1:@mysqld.3.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_4nodes.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_4nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_4nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_4nodes.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -18,7 +18,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -30,7 +30,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' @@ -42,7 +42,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.3.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.3.port wsrep_sst_receive_address='127.0.0.1:@mysqld.3.#sst_port' @@ -54,7 +54,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.4.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.4.port wsrep_sst_receive_address='127.0.0.1:@mysqld.4.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/auto_increment_offset_save.inc mariadb-10.11.13/mysql-test/suite/galera/include/auto_increment_offset_save.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/auto_increment_offset_save.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/auto_increment_offset_save.inc 2025-05-19 16:14:24.000000000 +0000 @@ -42,4 +42,3 @@ --connection $node_4 let $auto_increment_offset_node_4 = `SELECT @@global.auto_increment_offset`; } - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_dump_sr_table.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_dump_sr_table.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_dump_sr_table.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_dump_sr_table.inc 2025-05-19 16:14:24.000000000 +0000 @@ -25,4 +25,3 @@ --inc $seqno } - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc 2025-05-19 16:14:24.000000000 +0000 @@ -118,4 +118,3 @@ SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c 
NOT IN (5, 10)) AS a1; DROP TABLE t1; COMMIT; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_start_replication.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_start_replication.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_start_replication.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_start_replication.inc 2025-05-19 16:14:24.000000000 +0000 @@ -41,9 +41,9 @@ my $counter = 1000; #my $found = false - + while ($counter > 0) { - + open(FILE, "$logfile") or die("Unable to open $logfile : $!\n"); my $new_sync_count = () = grep(/Synchronized with group/g,); close(FILE); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_wsrep_recover.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_wsrep_recover.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_wsrep_recover.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_wsrep_recover.inc 2025-05-19 16:14:24.000000000 +0000 @@ -9,14 +9,14 @@ } --perl - use strict; + use strict; my $wsrep_start_position_str = "grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'"; my $wsrep_start_position = `grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; chomp($wsrep_start_position); die if $wsrep_start_position eq ''; - open(FILE, ">", "$ENV{MYSQL_TMP_DIR}/galera_wsrep_start_position.inc") or die; + open(FILE, ">", "$ENV{MYSQL_TMP_DIR}/galera_wsrep_start_position.inc") or die; print FILE "--let \$galera_wsrep_start_position = $wsrep_start_position\n"; close FILE; EOF diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc mariadb-10.11.13/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc --- 
mariadb-10.11.11/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,35 @@ +# include/wait_condition_with_debug_and_kill.inc +# +# SUMMARY +# +# Waits until the passed statement returns true, or the operation +# times out. If the operation times out, the additional error +# statement will be executed and server is killed. +# +# USAGE +# +# let $wait_condition= +# SELECT c = 3 FROM t; +# let $wait_condition_on_error_output= select count(*) from t; +# [let $explicit_default_wait_timeout= N] # to override the default reset +# --source include/wait_condition_with_debug_and_kill.inc +# +# OR +# +# let $wait_timeout= 60; # Override default 30 seconds with 60. +# let $wait_condition= +# SELECT c = 3 FROM t; +# let $wait_condition_on_error_output= select count(*) from t; +# --source include/wait_condition_with_debug_and_kill.inc +# --echo Executed the test condition $wait_condition_reps times +# +# +# EXAMPLE +# events_bugs.test, events_time_zone.test +# + +--source include/wait_condition_with_debug.inc +if (!$success) +{ + --source include/kill_galera.inc +} diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/GAL-401.result mariadb-10.11.13/mysql-test/suite/galera/r/GAL-401.result --- mariadb-10.11.11/mysql-test/suite/galera/r/GAL-401.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/GAL-401.result 2025-05-19 16:14:24.000000000 +0000 @@ -24,6 +24,6 @@ PRIMARY KEY (`f1`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci DROP TABLE t1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); connection node_1; SET GLOBAL wsrep_provider_options = 'pc.ignore_sb=false'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20225.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20225.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20225.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20225.result 2025-05-19 16:14:24.000000000 +0000 @@ -15,7 +15,7 @@ SET GLOBAL debug_dbug = 'RESET'; SET DEBUG_SYNC = 'now SIGNAL signal.mdev_20225_continue'; SET DEBUG_SYNC = 'RESET'; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; connection node_2; SHOW TRIGGERS; Trigger Event Table Statement Timing Created sql_mode Definer character_set_client collation_connection Database Collation diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20793.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20793.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20793.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20793.result 2025-05-19 16:14:24.000000000 +0000 @@ -41,4 +41,4 @@ ERROR 40001: Deadlock found when trying to get lock; try restarting transaction SET debug_sync = "RESET"; DROP TABLE t1; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-21479.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-21479.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-21479.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-21479.result 2025-05-19 16:14:24.000000000 +0000 @@ -66,7 +66,7 @@ Variable_name Value wsrep_desync_count 0 SET @@global.wsrep_desync = 0; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. 
JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_1; # Wait until both nodes are back to cluster SET GLOBAL wsrep_provider_options = 'pc.ignore_sb=false'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-25389.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-25389.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-25389.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-25389.result 2025-05-19 16:14:24.000000000 +0000 @@ -15,3 +15,4 @@ SELECT @@wsrep_slave_threads; @@wsrep_slave_threads 1 +connection node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-26266.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-26266.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-26266.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-26266.result 2025-05-19 16:14:24.000000000 +0000 @@ -19,5 +19,5 @@ INSERT INTO t2 VALUES (4); INSERT INTO t2 VALUES (5); CREATE VIEW v1 AS SELECT c1 FROM t1 WHERE c1 IN (SELECT a FROM t2) GROUP BY c1; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +DROP VIEW v1; DROP TABLE t1,t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-33136.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-33136.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-33136.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-33136.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; connection node_1a; -TRUNCATE TABLE t1; +RENAME TABLE t1 TO tmp, tmp TO t1; SET SESSION wsrep_retry_autocommit = 0; SET DEBUG_SYNC = 'dict_stats_mdl_acquired SIGNAL may_toi WAIT_FOR bf_abort'; INSERT INTO t1 VALUES (1); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-34647.result 
mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-34647.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-34647.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-34647.result 2025-05-19 16:14:24.000000000 +0000 @@ -95,7 +95,6 @@ 4 d 5 d 6 d -set global wsrep_mode=default; connection node_1; drop table t1,t2,t3,t4,t5; set global wsrep_mode=default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35748.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35748.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35748.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35748.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,31 @@ +connection node_2; +connection node_1; +connection node_1; +INSTALL PLUGIN IF NOT EXISTS connect SONAME 'ha_connect'; +CREATE TABLE t1 (f INT) ENGINE=CONNECT; +Warnings: +Warning 1105 No table_type. Will be set to DOS +Warning 1105 No file name. Table will use t1.dos +CREATE TABLE t2 (f INT) ENGINE=ROCKSDB; +CREATE TABLE t3 (f INT) ENGINE=SEQUENCE; +ERROR 42000: This version of MariaDB doesn't yet support 'non-InnoDB sequences in Galera cluster' +show warnings; +Level Code Message +Error 1235 This version of MariaDB doesn't yet support 'non-InnoDB sequences in Galera cluster' +Note 1235 ENGINE=SEQUENCE not supported by Galera +connection node_2; +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `f` int(11) DEFAULT NULL +) ENGINE=CONNECT DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +show create table t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `f` int(11) DEFAULT NULL +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +show create table t3; +ERROR 42S02: Table 'test.t3' doesn't exist +connection node_1; +DROP TABLE t1, t2; +UNINSTALL PLUGIN IF EXISTS connect; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35946.result 
mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35946.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35946.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35946.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,16 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +SET SESSION wsrep_sync_wait=DEFAULT; +DELETE FROM mysql.wsrep_streaming_log; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +VARIABLE_VALUE +Primary +SET SESSION wsrep_sync_wait=DEFAULT; +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-36116.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-36116.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-36116.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-36116.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,22 @@ +connection node_2; +connection node_1; +connect con1,127.0.0.1,root,,test,$NODE_MYPORT_1; +connection node_1; +CALL mtr.add_suppression("CREATE TABLE isolation failure"); +SET DEBUG_SYNC = 'wsrep_kill_thd_before_enter_toi SIGNAL may_kill WAIT_FOR continue'; +CREATE TABLE t1 (a INT) ENGINE=InnoDB; +connection con1; +SET DEBUG_SYNC = 'now WAIT_FOR may_kill'; +SET DEBUG_SYNC = 'now SIGNAL continue'; +connection node_1; +Got one of the listed errors +connection node_2; +SHOW TABLES LIKE 't1'; +Tables_in_test (t1) +connection con1; +SHOW TABLES LIKE 't1'; +Tables_in_test (t1) +SET DEBUG_SYNC = 'RESET'; +disconnect con1; +disconnect node_2; 
+disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-284.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-284.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-284.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-284.result 2025-05-19 16:14:24.000000000 +0000 @@ -13,7 +13,7 @@ SELECT @@wsrep_on; @@wsrep_on 0 -call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use (server_errno=1047)"); +call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use \\(server_errno ?= ?1047\\)"); START SLAVE; include/wait_for_slave_param.inc [Slave_IO_Running] connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-329.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-329.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-329.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-329.result 2025-05-19 16:14:24.000000000 +0000 @@ -18,5 +18,6 @@ connection node_1; DROP PROCEDURE proc_insert; DROP TABLE t1; +disconnect node_1b; CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); set global innodb_status_output=Default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-329F.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-329F.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-329F.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-329F.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,25 @@ +connection node_2; +connection node_1; +CREATE TABLE t1 (f1 INTEGER, f2 CHAR(20) DEFAULT 'abc') ENGINE=InnoDB; +INSERT INTO t1 (f1) VALUES (1),(65535); +CREATE PROCEDURE proc_insert (repeat_count int) +BEGIN +DECLARE current_num int; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET current_num = 0; +SET SESSION wsrep_sync_wait = 0; +WHILE current_num < 
repeat_count do +INSERT INTO t1 (f1) VALUES (FLOOR( 1 + RAND( ) * 65535 )); +SELECT SLEEP(0.1); +SET current_num = current_num + 1; +END WHILE; +END| +connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connection node_1b; +connection node_1b; +connection node_1; +DROP PROCEDURE proc_insert; +DROP TABLE t1; +disconnect node_1b; +CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); +set global innodb_status_output=Default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-416.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-416.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-416.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-416.result 2025-05-19 16:14:24.000000000 +0000 @@ -20,13 +20,13 @@ Got one of the listed errors CREATE DATABASE db; Got one of the listed errors -CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; +CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; Got one of the listed errors CREATE FUNCTION fun1() RETURNS int RETURN(1); Got one of the listed errors CREATE FUNCTION fun1 RETURNS STRING SONAME 'funlib.so'; Got one of the listed errors -CREATE PROCEDURE proc1() BEGIN END; +CREATE PROCEDURE proc1() BEGIN END; Got one of the listed errors CREATE INDEX idx ON tbl(id); Got one of the listed errors @@ -100,3 +100,4 @@ performance_schema sys test +disconnect userMW416; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_2primary_replica.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_2primary_replica.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_2primary_replica.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_2primary_replica.result 2025-05-19 16:14:24.000000000 +0000 @@ -13,10 +13,13 @@ connect replica, 127.0.0.1, root, , test, $NODE_MYPORT_1; connection replica; connection node_2; +connection primary1; +connection primary2; connection replica; # Galera 
replica changing master to primary1 -SET @@default_master_connection='stream2'; +SET @@default_master_connection='stream1'; # Primary node changing master to primary2 +SET @@default_master_connection='stream2'; START ALL SLAVES; Warnings: Note 1937 SLAVE 'stream1' started diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_alter_engine_myisam.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_alter_engine_myisam.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_alter_engine_myisam.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_alter_engine_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -26,3 +26,4 @@ 1 DROP TABLE t1; connection node_1; +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result 2025-05-19 16:14:24.000000000 +0000 @@ -13,7 +13,7 @@ SELECT 1 FROM DUAL; 1 1 -SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); COUNT(*) = 1 1 UNLOCK TABLES; @@ -25,7 +25,7 @@ `f2` int(11) DEFAULT NULL, PRIMARY KEY (`f1`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute 
in isolation%'); COUNT(*) = 0 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_as_slave_nonprim.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_as_slave_nonprim.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_as_slave_nonprim.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_as_slave_nonprim.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,7 +12,6 @@ connection node_4; INSERT INTO t1 VALUES (1),(2),(3),(4),(5); connection node_2; -connection node_1; expected_error 1 connection node_2; @@ -27,7 +26,7 @@ RESET SLAVE ALL; CALL mtr.add_suppression("Slave SQL: Error 'Unknown command' on query"); CALL mtr.add_suppression("Slave: Unknown command Error_code: 1047"); -CALL mtr.add_suppression("Transport endpoint is not connected"); +CALL mtr.add_suppression("(Transport endpoint|Socket) is not connected"); CALL mtr.add_suppression("Slave SQL: Error in Xid_log_event: Commit could not be completed, 'Deadlock found when trying to get lock; try restarting transaction', Error_code: 1213"); CALL mtr.add_suppression("Slave SQL: Node has dropped from cluster, Error_code: 1047"); connection node_4; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,685 +0,0 @@ -SET SESSION wsrep_sync_wait = 0; -galera_sr_bf_abort_at_commit = 0 -after_replicate_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT 
INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,after_replicate_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=after_replicate_sync'; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -local_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,local_monitor_master_enter_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 
'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=local_monitor_master_enter_sync'; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -apply_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_master_enter_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_master_enter_sync'; -ERROR 40001: 
Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -commit_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 
-SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -galera_sr_bf_abort_at_commit = 1 -after_replicate_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,after_replicate_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=after_replicate_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -local_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET 
SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,local_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=local_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -apply_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options 
= 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -commit_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 
'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -galera_sr_bf_abort_at_commit = 1 -after_replicate_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,after_replicate_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=after_replicate_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 
'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -local_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,local_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=local_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 
WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -apply_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -commit_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY 
KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -CALL mtr.add_suppression("WSREP: fragment replication failed: 1"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ connection node_1; INSERT INTO t1 VALUES (2); connection node_2; +SET SESSION wsrep_sync_wait = 0; UNLOCK TABLES; COMMIT; SELECT COUNT(*) = 1 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result 2025-05-19 16:14:24.000000000 +0000 @@ -53,7 +53,7 @@ FOUND 1 /Server not desynched from group at BLOCK_DDL because WSREP_MODE_BF_MARIABACKUP is used./ in mysqld.2.err # Should return FOUND 1 as server did desync and pause at BLOCK_COMMIT FOUND 1 /Server desynched from group during BACKUP STAGE BLOCK_COMMIT./ in mysqld.2.err -SET GLOBAL wsrep_mode = ""; +SET GLOBAL wsrep_mode = DEFAULT; connection node_1; DROP TABLE t; disconnect node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- a/home/panda/mariadb-10.5/mysql-test/suite/galera/r/galera_bf_kill.result -+++ b/home/panda/mariadb-10.5/mysql-test/suite/galera/r/galera_bf_kill.reject +--- r/galera_bf_kill.result ++++ r/galera_bf_kill,debug.reject @@ -77,4 +77,34 @@ a b 5 2 disconnect node_2a; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill_debug.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill_debug.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill_debug.result 2025-01-30 11:01:23.000000000 
+0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -40,18 +40,19 @@ disconnect node_2a; connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; connection node_2a; -CREATE TABLE t1 (i int primary key); +CREATE TABLE t1 (i int primary key) engine=innodb; SET DEBUG_SYNC = "before_wsrep_ordered_commit SIGNAL bwoc_reached WAIT_FOR bwoc_continue"; INSERT INTO t1 VALUES (1); connection node_2; SET DEBUG_SYNC = "now WAIT_FOR bwoc_reached"; SET DEBUG_SYNC = "now SIGNAL bwoc_continue"; -SET DEBUG_SYNC='RESET'; connection node_2a; connection node_2; +SET DEBUG_SYNC='RESET'; select * from t1; i 1 disconnect node_2a; +disconnect node_2b; connection node_1; drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_checksum.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_checksum.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_checksum.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_checksum.result 2025-05-19 16:14:24.000000000 +0000 @@ -27,4 +27,5 @@ 1 connection node_1; DROP TABLE t1; +SET @@global.wsrep_mode=DEFAULT; # End of tests. 
diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,11 +1,11 @@ connection node_2; connection node_1; connection node_1; -SET GLOBAL auto_increment_offset=1; connection node_2; -SET GLOBAL auto_increment_offset=2; connection node_1; +SET GLOBAL auto_increment_offset=1; connection node_2; +SET GLOBAL auto_increment_offset=2; connection node_2; SET GLOBAL wsrep_forced_binlog_format='STATEMENT'; connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_circular_replication.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_circular_replication.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_circular_replication.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_circular_replication.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,6 +12,7 @@ connection replica1; connection node_2; connection primary2; +connection primary1; connection replica1; # Galera replica changing master to primary1 START SLAVE; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result 2025-05-19 16:14:24.000000000 +0000 @@ -298,6 +298,7 @@ ###################################################################### connection node_1; SET SESSION wsrep_sync_wait=0; +FLUSH STATUS; CREATE TABLE p1 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); INSERT INTO p1 VALUES (1, 'INITIAL VALUE'); CREATE TABLE 
p2 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); @@ -491,6 +492,7 @@ ###################################################################### connection node_1; SET SESSION wsrep_sync_wait=0; +FLUSH STATUS; CREATE TABLE p1 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); INSERT INTO p1 VALUES (1, 'INITIAL VALUE'); CREATE TABLE p2 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); @@ -684,6 +686,7 @@ ###################################################################### connection node_1; SET SESSION wsrep_sync_wait=0; +FLUSH STATUS; CREATE TABLE p1 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); INSERT INTO p1 VALUES (1, 'INITIAL VALUE'); CREATE TABLE p2 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_defaults.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_defaults.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_defaults.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_defaults.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,9 @@ connection node_2; connection node_1; # Correct Galera library found +SELECT COUNT(*) `expect 51` FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%'; +expect 51 +51 SELECT VARIABLE_NAME, VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_gcs_fragment.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_gcs_fragment.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_gcs_fragment.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_gcs_fragment.result 2025-05-19 16:14:24.000000000 +0000 @@ -22,7 +22,7 @@ connection node_1a; SET GLOBAL wsrep_provider_options = 'signal=gcs_core_after_frag_send'; connection node_1; -ERROR HY000: Got error 6 "No such device or address" during COMMIT +ERROR HY000: Error while appending streaming replication fragment(provider status: Not connected 
to Primary Component) INSERT INTO t1 VALUES (3, "cccccaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); SELECT * FROM t1; f1 f2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result 2025-05-19 16:14:24.000000000 +0000 @@ -3,8 +3,11 @@ CREATE TABLE t1(id int not null primary key, b int) engine=InnoDB; INSERT INTO t1 VALUES (0,0),(1,1),(2,2),(3,3); BEGIN; +SET DEBUG_SYNC = 'wsrep_after_statement_enter SIGNAL blocked'; UPDATE t1 set b = 100 where id between 1 and 2;; connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET DEBUG_SYNC = 'now WAIT_FOR blocked'; +SET DEBUG_SYNC = 'wsrep_after_statement_enter CLEAR'; connection node_1b; SET @save_dbug = @@SESSION.debug_dbug; SET @@SESSION.innodb_lock_wait_timeout=2; @@ -20,5 +23,6 @@ 1 100 2 100 3 3 +SET DEBUG_SYNC = 'RESET'; disconnect node_1b; DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- suite/galera/r/galera_ist_MDEV-28423.result 2022-06-13 09:40:33.073863796 +0300 -+++ suite/galera/r/galera_ist_MDEV-28423.reject 2022-06-13 09:58:59.936874991 +0300 +--- r/galera_ist_MDEV-28423.result ++++ r/galera_ist_MDEV-28423,debug.reject @@ -517,3 +517,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- suite/galera/r/galera_ist_MDEV-28583.result 2022-06-11 10:48:16.875034382 +0300 -+++ suite/galera/r/galera_ist_MDEV-28583,debug.reject 2022-06-11 11:25:55.616481509 +0300 +--- r/galera_ist_MDEV-28583.result ++++ r/galera_ist_MDEV-28583,debug.reject @@ -517,3 +517,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,12 @@ --- r/galera_ist_mysqldump.result +++ r/galera_ist_mysqldump,debug.reject -@@ -354,11 +354,195 @@ +@@ -354,6 +354,190 @@ 1 DROP TABLE t1; COMMIT; +Performing State Transfer on a server that has been killed and restarted +while a DDL was in progress on it - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); ++connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; @@ -189,12 +188,6 @@ +DROP TABLE t1; +COMMIT; +SET GLOBAL debug_dbug = $debug_orig; -+connection node_1; -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); + connection node_1; + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables 
option so it cannot execute this statement' on query"); DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/galera_ist_mysqldump.result -+++ r/galera_ist_mysqldump.reject -@@ -355,10 +355,10 @@ - DROP TABLE t1; - COMMIT; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL 
mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to "); +connection node_1; +connection node_2; Setting SST method to mysqldump ... call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to '127\\.0\\.0\\.1'"); call mtr.add_suppression("Failed to load slave replication state from table mysql\\.gtid_slave_pos"); @@ -9,9 +12,6 @@ SET GLOBAL wsrep_sst_auth = 'sst:'; connection node_2; SET GLOBAL wsrep_sst_method = 'mysqldump'; -call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to "); -connection node_1; -connection node_2; Performing State Transfer on a server that has been shut down cleanly and restarted connection node_1; CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; @@ -355,10 +355,10 @@ DROP TABLE t1; COMMIT; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL 
mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_nonPK_and_PA.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_nonPK_and_PA.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_nonPK_and_PA.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_nonPK_and_PA.result 2025-05-19 16:14:24.000000000 +0000 @@ -8,7 +8,7 @@ SET SESSION wsrep_sync_wait = 0; SET GLOBAL wsrep_slave_threads = 2; *************************************************************** -scenario 1, conflicting UPDATE +scenario 1, conflicting UPDATE *************************************************************** SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_slave_enter_sync'; connection node_1; @@ -31,7 +31,7 @@ SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_slave_enter_sync'; SET GLOBAL wsrep_provider_options = 'dbug='; *************************************************************** -scenario 2, conflicting DELETE +scenario 2, conflicting DELETE *************************************************************** SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_slave_enter_sync'; connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -10,10 +10,10 @@ 
INSERT INTO t2 VALUES (1); connection node_2a; SET SESSION wsrep_sync_wait=0; -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE 'Commit' or STATE = 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification'); EXPECT_1 1 -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE '%Waiting for table metadata lock%'; +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); EXPECT_1 1 SELECT COUNT(*) AS EXPECT_0 FROM t1; @@ -32,9 +32,8 @@ SELECT COUNT(*) AS EXPECT_1 FROM t2; EXPECT_1 1 -SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE '%committed%' or STATE = 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); EXPECT_2 2 -SET GLOBAL wsrep_slave_threads = 1;; DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_simple.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_simple.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_simple.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_simple.result 2025-05-19 16:14:24.000000000 +0000 @@ -34,6 +34,5 @@ SELECT COUNT(*) as expect_20 FROM t2; expect_20 20 -SET GLOBAL wsrep_slave_threads = 1;; DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_partitioned_tables.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_partitioned_tables.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_partitioned_tables.result 1970-01-01 
00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_partitioned_tables.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,176 @@ +connection node_2; +connection node_1; +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. Storage engine partition for table"); +# wsrep-mode= DEFAULT +SET GLOBAL wsrep_mode = ""; +SELECT @@wsrep_mode; +@@wsrep_mode + +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB +PARTITION BY KEY (v1) +PARTITIONS 2; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +ALTER TABLE t1 ADD COLUMN v2 int; +ALTER TABLE t2 ADD COLUMN v2 int; +INSERT INTO t1 VALUES (1,1),(2,2); +INSERT INTO t2 VALUES (1,1),(2,2); +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +UPDATE t1 SET v3 = 3; +UPDATE t2 SET v3 = 3; +CREATE INDEX xx1 ON t1(v2); +CREATE INDEX xx2 ON t2(v2); +DROP INDEX xx1 ON t1; +DROP INDEX xx2 ON t2; +TRUNCATE TABLE t1; +TRUNCATE TABLE t2; +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +CREATE VIEW x2 AS SELECT * FROM t2_v2; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 +AFTER INSERT ON t1_v2 FOR EACH ROW +UPDATE t1_v2 SET t1_v2.v3 = t1_v2.v3+1; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t2 +AFTER INSERT ON t2_v2 FOR EACH ROW +UPDATE t2_v2 SET t2_v2.v3 = t2_v2.v3+1; +connection node_2; +SHOW CREATE TABLE t1_v2; +Table Create Table +t1_v2 CREATE TABLE `t1_v2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + `v3` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE TABLE t2_v2; +Table Create Table +t2_v2 CREATE TABLE `t2_v2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + `v3` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=Aria DEFAULT CHARSET=latin1 
COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE VIEW x1; +View Create View character_set_client collation_connection +x1 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `x1` AS select `t1_v2`.`v1` AS `v1`,`t1_v2`.`v2` AS `v2`,`t1_v2`.`v3` AS `v3` from `t1_v2` latin1 latin1_swedish_ci +SHOW CREATE VIEW x2; +View Create View character_set_client collation_connection +x2 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `x2` AS select `t2_v2`.`v1` AS `v1`,`t2_v2`.`v2` AS `v2`,`t2_v2`.`v3` AS `v3` from `t2_v2` latin1 latin1_swedish_ci +SELECT * FROM t1_v2; +v1 v2 v3 +SELECT * FROM t2_v2; +v1 v2 v3 +connection node_1; +DROP VIEW x1; +DROP VIEW x2; +DROP TRIGGER increment_before_t1; +DROP TRIGGER increment_before_t2; +DROP TABLE t1_v2; +DROP TABLE t2_v2; +SET GLOBAL wsrep_mode = ""; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +# wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +@@wsrep_mode +STRICT_REPLICATION +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB +PARTITION BY KEY (v1) +PARTITIONS 2; +CREATE OR REPLACE TABLE t3 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +ERROR HY000: Galera replication not supported +ALTER TABLE t1 ADD COLUMN v2 int; +ALTER TABLE t2 ADD COLUMN v2 int; +ERROR HY000: Galera replication not supported +INSERT INTO t1 VALUES (1,1),(2,2); +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine partition for table 'test'.'t1' is not supported in Galera +INSERT INTO t2 VALUES (1),(2); +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. 
Storage engine partition for table 'test'.'t2' is not supported in Galera +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +ERROR HY000: Galera replication not supported +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +ERROR HY000: Galera replication not supported +UPDATE t1 SET v2 = v2 + 3; +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine partition for table 'test'.'t1' is not supported in Galera +UPDATE t2 SET v1 = v1 + 3; +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine partition for table 'test'.'t2' is not supported in Galera +CREATE INDEX xx1 ON t1(v2); +CREATE INDEX xx2 ON t2(v2); +ERROR HY000: Galera replication not supported +DROP INDEX xx1 ON t1; +DROP INDEX xx2 on t2; +ERROR HY000: Galera replication not supported +TRUNCATE TABLE t1; +TRUNCATE TABLE t2; +ERROR HY000: Galera replication not supported +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +RENAME TABLE t2_v2 TO t2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +CREATE VIEW x2 AS SELECT * FROM t2; +ERROR HY000: Galera replication not supported +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 +AFTER INSERT ON t1_v2 FOR EACH ROW +UPDATE t1_v2 SET t1_v2.v2 = t1_v2.v2+1; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t2 +AFTER INSERT ON t2 FOR EACH ROW +UPDATE t2 SET t2.v1 = t2.v1+1; +ERROR HY000: Galera replication not supported +connection node_2; +SHOW CREATE TABLE t1_v2; +Table Create Table +t1_v2 CREATE TABLE `t1_v2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE VIEW x1; +View Create View 
character_set_client collation_connection +x1 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `x1` AS select `t1_v2`.`v1` AS `v1`,`t1_v2`.`v2` AS `v2` from `t1_v2` latin1 latin1_swedish_ci +SELECT * FROM t1_v2; +v1 v2 +SELECT * FROM t2; +v1 v2 +connection node_1; +DROP VIEW x1; +DROP TRIGGER increment_before_t1; +DROP TABLE t1_v2; +DROP TABLE t2; +SET GLOBAL wsrep_mode = ""; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +# wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +@@wsrep_mode +STRICT_REPLICATION +ALTER TABLE t2 ENGINE=InnoDB; +DROP TABLE t2; +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_restart_replica.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_restart_replica.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_restart_replica.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_restart_replica.result 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; connection node_1; connection replica; +connection primary; connection replica; START SLAVE; connection primary; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequence_engine.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequence_engine.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequence_engine.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequence_engine.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,10 @@ connection node_2; connection node_1; +connection node_2; +SET GLOBAL wsrep_ignore_apply_errors=0; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_2a; +SET SESSION wsrep_sync_wait=0; SET GLOBAL wsrep_ignore_apply_errors=0; SET SESSION AUTOCOMMIT=0; SET SESSION 
max_error_count=0; @@ -8,5 +13,4 @@ connection node_2; SHOW CREATE TABLE t0; ERROR 42S02: Table 'test.t0' doesn't exist -connection node_1; -SET GLOBAL wsrep_ignore_apply_errors=DEFAULT; +disconnect node_2a; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,11 @@ +--- r/galera_sequences.result ++++ r/galera_sequences,binlogoff.reject +@@ -313,7 +313,7 @@ + 7 4 + SELECT NEXTVAL(t); + NEXTVAL(t) +-42 ++2 + connection node_1; + DROP TABLE t1; + DROP SEQUENCE t; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences.result 2025-05-19 16:14:24.000000000 +0000 @@ -47,6 +47,9 @@ NEXT VALUE FOR Seq1_1 4 connection node_1; +SHOW CREATE SEQUENCE Seq1_1; +Table Create Table +Seq1_1 CREATE SEQUENCE `Seq1_1` start with 1 minvalue 1 maxvalue 9223372036854775806 increment by 1 nocache nocycle ENGINE=InnoDB DROP SEQUENCE Seq1_1; connection node_1; CREATE TABLE t2 (d CHAR(1)KEY); @@ -279,6 +282,9 @@ connection node_1; DROP TABLE t1; DROP SEQUENCE t; +connection node_2; +SET SESSION wsrep_sync_wait=15; +connection node_1; CREATE SEQUENCE t INCREMENT BY 0 CACHE=20 ENGINE=INNODB; CREATE TABLE t1(a int not null primary key default nextval(t), b int) engine=innodb; BEGIN; @@ -324,4 +330,14 @@ ALTER SEQUENCE IF EXISTS t MINVALUE=1; ERROR 42000: This version of MariaDB doesn't yet support 'CACHE without INCREMENT BY 0 in Galera cluster' DROP TABLE t; + +MDEV-32631: + +CREATE OR REPLACE 
TABLE t1(c INT ) ENGINE=ARIA; +SET SESSION WSREP_OSU_METHOD=RSU; +INSERT INTO t1 SELECT seq,concat(seq,1) FROM seq_1_to_100; +ERROR 42000: This version of MariaDB doesn't yet support 'RSU on this table engine' +SET SESSION WSREP_OSU_METHOD=TOI; +DROP TABLE t1; + End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_bf_kill.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_bf_kill.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_bf_kill.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_bf_kill.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,152 @@ +connection node_2; +connection node_1; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY, f2 INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, 0), (3, 0); +connection node_1; +START TRANSACTION; +INSERT INTO t1 VALUES (4, next value for s); +INSERT INTO t1 VALUES (5, next value for s); +INSERT INTO t1 VALUES (6, next value for s); +INSERT INTO t1 VALUES (7, next value for s); +INSERT INTO t1 VALUES (8, next value for s); +INSERT INTO t1 VALUES (9, next value for s); +INSERT INTO t1 VALUES (10, next value for s); +INSERT INTO t1 VALUES (11, next value for s); +INSERT INTO t1 VALUES (12, next value for s); +INSERT INTO t1 VALUES (13, next value for s); +INSERT INTO t1 VALUES (14, next value for s); +SELECT * FROM t1 WHERE f1 > 0 FOR UPDATE; +f1 f2 +1 0 +3 0 +4 1 +5 3 +6 5 +7 7 +8 9 +9 11 +10 13 +11 15 +12 17 +13 19 +14 21 +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; +connection node_2; +INSERT INTO t1 VALUES (2, 2); +connection node_1a; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 
'dbug=d,commit_monitor_master_enter_sync'; +connection node_1; +COMMIT; +connection node_1a; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; +SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; +SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; +connection node_1; +wsrep_local_replays +1 +INSERT INTO t1 VALUES (22, next value for s); +INSERT INTO t1 VALUES (23, next value for s); +INSERT INTO t1 VALUES (24, next value for s); +INSERT INTO t1 VALUES (25, next value for s); +INSERT INTO t1 VALUES (26, next value for s); +INSERT INTO t1 VALUES (27, next value for s); +INSERT INTO t1 VALUES (28, next value for s); +INSERT INTO t1 VALUES (29, next value for s); +INSERT INTO t1 VALUES (30, next value for s); +INSERT INTO t1 VALUES (31, next value for s); +INSERT INTO t1 VALUES (32, next value for s); +INSERT INTO t1 VALUES (33, next value for s); +INSERT INTO t1 VALUES (34, next value for s); +INSERT INTO t1 VALUES (35, next value for s); +connection node_1; +SELECT * FROM t1; +f1 f2 +1 0 +2 2 +3 0 +4 1 +5 3 +6 5 +7 7 +8 9 +9 11 +10 13 +11 15 +12 17 +13 19 +14 21 +22 31 +23 33 +24 35 +25 37 +26 39 +27 41 +28 43 +29 45 +30 47 +31 49 +32 51 +33 53 +34 55 +35 57 +SELECT LASTVAL(s); +LASTVAL(s) +57 +connection node_2; +SELECT * FROM t1; +f1 f2 +1 0 +2 2 +3 0 +4 1 +5 3 +6 5 +7 7 +8 9 +9 11 +10 13 +11 15 +12 17 +13 19 +14 21 +22 31 +23 33 +24 35 +25 37 +26 39 +27 41 +28 43 +29 45 +30 47 +31 49 +32 51 +33 53 +34 55 +35 57 +SELECT LASTVAL(s); +LASTVAL(s) +NULL +connection node_1; +SELECT NEXTVAL(s); +NEXTVAL(s) +59 +connection node_2; +SELECT NEXTVAL(s); +NEXTVAL(s) +62 +DROP SEQUENCE s; +DROP TABLE t1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_transaction.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_transaction.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_transaction.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_transaction.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,350 @@ +connection node_2; +connection node_1; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_1; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_2; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_2a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_1a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO 
t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_2; +SELECT LASTVAL(s); +LASTVAL(s) +40 +connection node_1; +SELECT LASTVAL(s); +LASTVAL(s) +19 +connection node_2a; +SELECT LASTVAL(s); +LASTVAL(s) +60 +connection node_1a; +SELECT LASTVAL(s); +LASTVAL(s) +79 +connection node_1; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +22 1 +24 1 +26 1 +28 1 +30 1 +32 1 +34 1 +36 1 +38 1 +40 1 +42 1 +44 1 +46 1 +48 1 +50 1 +52 1 +54 1 +56 1 +58 1 +60 1 +61 1 +63 1 +65 1 +67 1 +69 1 +71 1 +73 1 +75 1 +77 1 +79 1 +connection node_2; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +22 1 +24 1 +26 1 +28 1 +30 1 +32 1 +34 1 +36 1 +38 1 +40 1 +42 1 +44 1 +46 1 +48 1 +50 1 +52 1 +54 1 +56 1 +58 1 +60 1 +61 1 +63 1 +65 1 +67 1 +69 1 +71 1 +73 1 +75 1 +77 1 +79 1 +connection node_1; +DROP TABLE t1; +DROP SEQUENCE s; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; +connection node_1; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_2; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_2a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); 
+INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_1a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_2; +SELECT LASTVAL(s); +LASTVAL(s) +20 +connection node_1; +SELECT LASTVAL(s); +LASTVAL(s) +19 +connection node_2a; +SELECT LASTVAL(s); +LASTVAL(s) +40 +connection node_1a; +SELECT LASTVAL(s); +LASTVAL(s) +39 +connection node_1; +SELECT * FROM t1; +f1 f2 +connection node_2; +SELECT * FROM t1; +f1 f2 +connection node_1; +DROP TABLE t1; +DROP SEQUENCE s; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; +connection node_1; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_1a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_2a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) 
values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_2; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_1; +COMMIT; +connection node_1a; +ROLLBACK; +connection node_2; +COMMIT; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +connection node_2a; +ROLLBACK; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +connection node_2; +SELECT LASTVAL(s); +LASTVAL(s) +40 +connection node_1; +SELECT LASTVAL(s); +LASTVAL(s) +19 +connection node_2a; +SELECT LASTVAL(s); +LASTVAL(s) +20 +connection node_1a; +SELECT LASTVAL(s); +LASTVAL(s) +39 +connection node_1; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +connection node_2; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +connection node_1; +DROP TABLE t1; +DROP SEQUENCE s; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_slave_replay.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_slave_replay.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_slave_replay.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_slave_replay.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ -connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; -connection node_2a; connection node_2; connection node_1; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_2a; ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3; connection node_3; diff 
-Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_split_brain.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_split_brain.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_split_brain.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_split_brain.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,7 @@ connection node_1; connection node_1; connection node_2; +connection node_2; call mtr.add_suppression("WSREP: TO isolation failed for: "); connection node_1; call mtr.add_suppression("CREATE TABLE isolation failure"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; +expect 0 +0 SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; VARIABLE_VALUE = 'Synced' 1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_cipher.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_cipher.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_cipher.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_cipher.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,30 @@ +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_1; +connection node_2; +SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +VARIABLE_VALUE = 'Synced' +1 +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS 
WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_1; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_2; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_1; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_2; +connection node_1; +call mtr.add_suppression("WSREP: write_handler\\(\\)"); +connection node_2; +call mtr.add_suppression("WSREP: write_handler\\(\\)"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_compression.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_compression.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_compression.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_compression.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; +expect 0 +0 SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; VARIABLE_VALUE = 'Synced' 1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_upgrade.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_upgrade.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_upgrade.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_upgrade.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; +expect 0 +0 connection node_1; connection node_2; connection node_1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ ---- galera/r/galera_sst_mariabackup.result 2024-04-11 09:53:12.950512316 +0300 -+++ galera/r/galera_sst_mariabackup,debug.reject 2024-04-11 10:00:36.771144955 +0300 -@@ -524,6 +524,190 @@ +--- r/galera_sst_mariabackup.result ++++ r/galera_sst_mariabackup,debug.reject +@@ -516,5 +516,189 @@ 1 DROP TABLE t1; COMMIT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- r/galera_sst_mariabackup.result -+++ r/galera_sst_mariabackup,debug.reject +--- r/galera_sst_mariabackup_force_recovery.result ++++ r/galera_sst_mariabackup_force_recovery,debug.reject @@ -516,5 +516,189 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,210 @@ +--- r/galera_sst_mariabackup_gtid.result ++++ r/galera_sst_mariabackup_gtid,debug.reject +@@ -516,19 +516,203 @@ + 1 + DROP 
TABLE t1; + COMMIT; ++Performing State Transfer on a server that has been killed and restarted ++while a DDL was in progress on it ++connection node_1; ++CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 VALUES (1,'node1_committed_before'); ++INSERT INTO t1 VALUES (2,'node1_committed_before'); ++INSERT INTO t1 VALUES (3,'node1_committed_before'); ++INSERT INTO t1 VALUES (4,'node1_committed_before'); ++INSERT INTO t1 VALUES (5,'node1_committed_before'); ++connection node_2; ++START TRANSACTION; ++INSERT INTO t1 VALUES (6,'node2_committed_before'); ++INSERT INTO t1 VALUES (7,'node2_committed_before'); ++INSERT INTO t1 VALUES (8,'node2_committed_before'); ++INSERT INTO t1 VALUES (9,'node2_committed_before'); ++INSERT INTO t1 VALUES (10,'node2_committed_before'); ++COMMIT; ++SET GLOBAL debug_dbug = 'd,sync.alter_opened_table'; ++connection node_1; ++ALTER TABLE t1 ADD COLUMN f2 INTEGER; ++connection node_2; ++SET wsrep_sync_wait = 0; ++Killing server ... 
++connection node_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (11,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (12,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (13,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (14,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (15,'node1_committed_during'); ++COMMIT; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (16,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (17,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (18,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (19,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (20,'node1_to_be_committed_after'); ++connect node_1a_galera_st_kill_slave_ddl, 127.0.0.1, root, , test, $NODE_MYPORT_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (21,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (22,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (23,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (24,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (25,'node1_to_be_rollbacked_after'); ++connection node_2; ++Performing --wsrep-recover ... ++connection node_2; ++Starting server ... ++Using --wsrep-start-position when starting mysqld ... 
++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (26,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (27,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (28,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (29,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (30,'node2_committed_after'); ++COMMIT; ++connection node_1; ++INSERT INTO t1 (id,f1) VALUES (31,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (32,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (33,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (34,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (35,'node1_to_be_committed_after'); ++COMMIT; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (36,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (37,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (38,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (39,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (40,'node1_committed_after'); ++COMMIT; ++connection node_1a_galera_st_kill_slave_ddl; ++INSERT INTO t1 (id,f1) VALUES (41,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (42,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (43,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (44,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (45,'node1_to_be_rollbacked_after'); ++ROLLBACK; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before 
NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++COMMIT; ++connection node_1; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 
node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++DROP TABLE t1; ++COMMIT; ++SET GLOBAL debug_dbug = $debug_orig; + connection node_1; + # Node_1 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + connection node_2; + # Node_2 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + disconnect node_2; + disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,534 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +Performing State Transfer on a server that has been shut down cleanly and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES 
(1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_shutdown_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_shutdown_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that starts from a clean var directory +This is accomplished by shutting down node #2 and removing its var directory before restarting 
it +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +Cleaning var directory ... +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_clean_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_clean_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that has been killed and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET 
AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Killing server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_kill_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Performing --wsrep-recover ... +Starting server ... +Using --wsrep-start-position when starting mysqld ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_kill_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (46,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 
node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +connection node_1; +# Node_1 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +connection 
node_2; +# Node_2 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +disconnect node_2; +disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ --- r/galera_sst_mariabackup_logarchive.result -+++ r/galera_sst_mariabackup_logarchive.reject ++++ r/galera_sst_mariabackup_logarchive,debug.reject @@ -516,5 +516,189 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,6 @@ Cleaning var directory ... connection node_2; Starting server ... 
-include/assert_grep.inc [mariabackup: Using 128974848 bytes for buffer pool \(set by --use-memory parameter\)] +include/assert_grep.inc [mariabackup: Using 134217728 bytes for buffer pool \(set by --use-memory parameter\)] disconnect node_2; disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,12 @@ --- r/galera_sst_mysqldump.result +++ r/galera_sst_mysqldump,debug.reject -@@ -698,11 +698,195 @@ +@@ -698,6 +698,190 @@ 1 DROP TABLE t1; COMMIT; +Performing State Transfer on a server that has been killed and restarted +while a DDL was in progress on it - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); ++connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; @@ -189,12 +188,6 @@ +DROP TABLE t1; +COMMIT; +SET GLOBAL debug_dbug = $debug_orig; -+connection node_1; -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); + connection node_1; + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables 
option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/galera_sst_mysqldump.result -+++ r/galera_sst_mysqldump.reject -@@ -699,10 +699,10 @@ - DROP TABLE t1; - COMMIT; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump.result 2025-01-30 
11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -699,10 +699,10 @@ DROP TABLE t1; COMMIT; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,12 @@ --- r/galera_sst_mysqldump_with_key.result +++ r/galera_sst_mysqldump_with_key,debug.reject -@@ -358,11 +358,195 @@ +@@ -358,6 +358,190 @@ 1 DROP TABLE t1; COMMIT; +Performing State Transfer on a server that has been killed and restarted +while a DDL was in progress on it - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); 
++connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; @@ -189,12 +188,6 @@ +DROP TABLE t1; +COMMIT; +SET GLOBAL debug_dbug = $debug_orig; -+connection node_1; -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); + connection node_1; + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/galera_sst_mysqldump_with_key.result -+++ r/galera_sst_mysqldump_with_key.reject -@@ -359,10 +359,10 @@ - DROP TABLE t1; - COMMIT; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The 
MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result 2025-05-19 16:14:24.000000000 +0000 @@ -359,10 +359,10 @@ DROP TABLE t1; COMMIT; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone 
table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ --- galera_sst_rsync.result -+++ galera_sst_rsync.reject ++++ galera_sst_rsync,debug.reject @@ -516,3 +516,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,210 @@ +--- r/galera_sst_rsync_gtid.result ++++ r/galera_sst_rsync_gtid,debug.reject +@@ -516,19 +516,203 @@ + 1 + DROP TABLE t1; + COMMIT; ++Performing State Transfer on a server that has been killed and restarted ++while a DDL was in progress on it ++connection node_1; ++CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 VALUES (1,'node1_committed_before'); ++INSERT INTO t1 VALUES (2,'node1_committed_before'); ++INSERT INTO t1 VALUES (3,'node1_committed_before'); ++INSERT INTO t1 VALUES (4,'node1_committed_before'); ++INSERT INTO t1 VALUES (5,'node1_committed_before'); ++connection node_2; ++START TRANSACTION; ++INSERT INTO t1 VALUES (6,'node2_committed_before'); ++INSERT INTO t1 VALUES (7,'node2_committed_before'); ++INSERT INTO t1 VALUES (8,'node2_committed_before'); ++INSERT INTO t1 VALUES (9,'node2_committed_before'); ++INSERT INTO t1 VALUES (10,'node2_committed_before'); ++COMMIT; ++SET 
GLOBAL debug_dbug = 'd,sync.alter_opened_table'; ++connection node_1; ++ALTER TABLE t1 ADD COLUMN f2 INTEGER; ++connection node_2; ++SET wsrep_sync_wait = 0; ++Killing server ... ++connection node_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (11,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (12,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (13,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (14,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (15,'node1_committed_during'); ++COMMIT; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (16,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (17,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (18,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (19,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (20,'node1_to_be_committed_after'); ++connect node_1a_galera_st_kill_slave_ddl, 127.0.0.1, root, , test, $NODE_MYPORT_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (21,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (22,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (23,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (24,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (25,'node1_to_be_rollbacked_after'); ++connection node_2; ++Performing --wsrep-recover ... ++connection node_2; ++Starting server ... ++Using --wsrep-start-position when starting mysqld ... 
++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (26,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (27,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (28,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (29,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (30,'node2_committed_after'); ++COMMIT; ++connection node_1; ++INSERT INTO t1 (id,f1) VALUES (31,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (32,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (33,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (34,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (35,'node1_to_be_committed_after'); ++COMMIT; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (36,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (37,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (38,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (39,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (40,'node1_committed_after'); ++COMMIT; ++connection node_1a_galera_st_kill_slave_ddl; ++INSERT INTO t1 (id,f1) VALUES (41,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (42,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (43,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (44,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (45,'node1_to_be_rollbacked_after'); ++ROLLBACK; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before 
NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++COMMIT; ++connection node_1; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 
node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++DROP TABLE t1; ++COMMIT; ++SET GLOBAL debug_dbug = $debug_orig; + connection node_1; + # Node_1 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + connection node_2; + # Node_2 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + disconnect node_2; + disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,534 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +Performing State Transfer on a server that has been shut down cleanly and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES 
(2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_shutdown_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_shutdown_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that starts from a clean var directory +This is accomplished by shutting down node #2 and removing its var directory before restarting 
it +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +Cleaning var directory ... +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_clean_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_clean_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that has been killed and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET 
AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Killing server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_kill_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Performing --wsrep-recover ... +Starting server ... +Using --wsrep-start-position when starting mysqld ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_kill_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (46,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 
node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +connection node_1; +# Node_1 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +connection 
node_2; +# Node_2 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +disconnect node_2; +disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,5 @@ +--- r/galera_sst_rsync_recv_auto.result ++++ r/galera_sst_rsync_recv_auto,debug.reject @@ -516,3 +516,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_innodb.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_innodb.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_innodb.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_innodb.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ connection node_2; connection node_1; -call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine .*"); +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. 
Storage engine "); CREATE TABLE t1(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=MYISAM; CREATE TABLE t3(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=ARIA; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_primary_key.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_primary_key.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_primary_key.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_primary_key.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ connection node_2; connection node_1; -call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled. Table .*"); +call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled\\. Table "); CREATE TABLE t1(a int, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int, b varchar(50)) ENGINE=MYISAM; CREATE TABLE t3(a int, b varchar(50)) ENGINE=MEMORY; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,29 +1,69 @@ connection node_2; connection node_1; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY AUTO_INCREMENT, f2 INTEGER); +INSERT INTO t1(f2) SELECT seq FROM seq_1_to_1000; +connection node_2a; +SET SESSION wsrep_sync_wait=0; +connection node_1a; +# Block the applier on node_1 and issue a ddl from node_2 +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 
'dbug=d,apply_monitor_slave_enter_sync'; connection node_2; -ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 123);; +# DDL 1 +ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 VALUES (NULL, 10000, 10000);; +connection node_1a; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +# This will block on acquiring total order isolation connection node_1; +# DDL 2 CREATE UNIQUE INDEX i1 ON t1(f2);; +connection node_1a; +# Signal DDL 1 +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; +connection node_2; +connection node_1; connection node_2; -INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 234); -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -COUNT(*) = 3 -1 -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -COUNT(*) = 2 -1 -SELECT COUNT(*) = 2 FROM t1; -COUNT(*) = 2 -1 +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +EXPECT_3 +3 +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; +EXPECT_2 +2 +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `f1` int(11) NOT NULL AUTO_INCREMENT, + `f2` int(11) DEFAULT NULL, + `f3` int(11) DEFAULT NULL, + PRIMARY KEY (`f1`), + UNIQUE KEY `i1` (`f2`) +) ENGINE=InnoDB AUTO_INCREMENT=2002 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +SELECT COUNT(*) AS EXPECT_1001 FROM t1; +EXPECT_1001 +1001 connection node_1; -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -COUNT(*) = 3 -1 -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -COUNT(*) = 2 -1 -SELECT COUNT(*) = 2 FROM t1; -COUNT(*) = 2 -1 +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +EXPECT_3 +3 +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; 
+EXPECT_2 +2 +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `f1` int(11) NOT NULL AUTO_INCREMENT, + `f2` int(11) DEFAULT NULL, + `f3` int(11) DEFAULT NULL, + PRIMARY KEY (`f1`), + UNIQUE KEY `i1` (`f2`) +) ENGINE=InnoDB AUTO_INCREMENT=2047 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +SELECT COUNT(*) AS EXPECT_1001 FROM t1; +EXPECT_1001 +1001 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result 2025-05-19 16:14:24.000000000 +0000 @@ -52,8 +52,8 @@ 0 DROP TABLE t1; connection node_1; -CREATE TABLE t1 (f1 INTEGER) ENGINE=MyISAM; -CREATE TABLE t2 (f1 INTEGER) ENGINE=InnoDB; +CREATE TABLE t1 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=MyISAM; +CREATE TABLE t2 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=InnoDB; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES (1); @@ -203,6 +203,9 @@ 3 200 4 5 connection node_2; +SELECT COUNT(*) FROM t1; +COUNT(*) +10 SELECT * FROM t1 ORDER BY id; id b 1 1 @@ -224,15 +227,29 @@ DROP TRIGGER tr1; DROP TRIGGER tr2; DROP TRIGGER tr3; -DROP TABLE t1,t2; +DROP TABLE t1, t2; +CREATE TABLE t1 (a INT, b INT, UNIQUE(a)) ENGINE=MyISAM; +CREATE TRIGGER tr1 BEFORE INSERT ON t1 FOR EACH ROW SET NEW.a=1; +INSERT INTO t1 (a,b) VALUES (10,20); +SELECT * from t1; +a b +1 20 +connection node_2; +SELECT * from t1; +a b +1 20 +connection node_1; +DROP TABLE t1; # # MDEV-11152: wsrep_replicate_myisam: SELECT gets replicated using TO # connection node_1; -CREATE TABLE t1 (i INT) ENGINE=INNODB; +CREATE TABLE t1 (i INT NOT NULL PRIMARY KEY) ENGINE=INNODB; INSERT INTO t1 VALUES(1); SELECT * FROM t1; i 1 DROP TABLE t1; -connection node_1; +SET GLOBAL wsrep_mode = DEFAULT; +connection node_2; +SET 
GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_slave_threads.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_slave_threads.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_slave_threads.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_slave_threads.result 2025-05-19 16:14:24.000000000 +0000 @@ -33,7 +33,6 @@ SELECT COUNT(*) FROM t2; COUNT(*) 70 -SET GLOBAL wsrep_slave_threads = 1; DROP TABLE t1; DROP TABLE t2; # diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_during_ist.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_during_ist.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_during_ist.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_during_ist.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,112 @@ +connection node_4; +connection node_3; +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_3; +connection node_4; +connection node_1; +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); +CREATE PROCEDURE p1(IN max INT) +BEGIN +DECLARE i INT; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET i = 0; +WHILE i < max DO +INSERT IGNORE INTO t1 VALUES (DEFAULT); +SET i = i + 1; +END WHILE; +END| +CALL p1(130); +connection node_4; +Shutting down server 4... 
+connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +Server 4 left the cluster +connection node_1; +CALL p1(130); +connection node_1; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_2; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_3; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +INSERT INTO t2 VALUES (DEFAULT); +CALL p1(130); +connection node_1; +SET GLOBAL debug = "+d,sync.wsrep_sst_donor_after_donation"; +Restarting server 4 +Wait for server 1 to become a donor +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached"; +Server 1 got SST request from server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_sst_donor_after_donation_continue"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; +Waiting for server 4 to leave the cluster +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_4; +Server 4 left the cluster, killing it... +Killed server 4... +Restarting server 4... 
+connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SELECT count(*) AS expect1_390 FROM t1; +expect1_390 +390 +SELECT count(*) AS expect1_1 FROM t2; +expect1_1 +1 +connection node_2; +SELECT count(*) AS expect2_390 FROM t1; +expect2_390 +390 +SELECT count(*) AS expect2_1 FROM t2; +expect2_1 +1 +connection node_3; +SELECT count(*) AS expect3_390 FROM t1; +expect3_390 +390 +SELECT count(*) AS expect3_1 FROM t2; +expect3_1 +1 +connection node_4; +SELECT count(*) AS expect4_390 FROM t1; +expect4_390 +390 +SELECT count(*) AS expect4_1 FROM t2; +expect4_1 +1 +DROP TABLE t1; +DROP TABLE t2; +DROP PROCEDURE p1; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Failed on preordered"); +CALL mtr.add_suppression("Failed to apply write set"); +CALL mtr.add_suppression("Sending JOIN failed: -103"); +CALL mtr.add_suppression("Failed to JOIN the cluster after SST"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_apply.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_apply.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_apply.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_apply.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,94 @@ +connection node_4; +connection node_3; +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_3; +connection node_4; +connection node_1; +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); +CREATE PROCEDURE p1(IN max INT) +BEGIN +DECLARE i INT; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET i = 0; +WHILE i < max DO +INSERT IGNORE INTO t1 VALUES (DEFAULT); +SET i = i + 1; +END WHILE; +END| +CALL p1(130); +connection 
node_4; +Shutting down server 4... +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +SET GLOBAL debug = "+d,sync.wsrep_donor_state"; +connection node_4; +Restarting server 4... +connection node_1; +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached"; +Tables on server 1 flushed and locked for SST to server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; +Wait for the state snapshot to be copied to server 4 +SST script unlocked server 1 +connection node_1; +CALL p1(130); +connection node_1; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_2; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_3; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +INSERT INTO t2 VALUES (DEFAULT); +CALL p1(130); +Waiting for server 4 to leave the cluster +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_4; +Server 4 left the cluster, killing it... +Killed server 4... +Restarting server 4... 
+DROP TABLE t2; +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SELECT count(*) AS expect1_390 FROM t1; +expect1_390 +390 +connection node_2; +SELECT count(*) AS expect2_390 FROM t1; +expect2_390 +390 +connection node_3; +SELECT count(*) AS expect3_390 FROM t1; +expect3_390 +390 +connection node_4; +SELECT count(*) AS expect4_390 FROM t1; +expect4_390 +390 +DROP TABLE t1; +DROP PROCEDURE p1; +connection node_4; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Inconsistent by consensus"); +CALL mtr.add_suppression("Failed to apply write set: gtid:"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_skip.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_skip.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_skip.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_skip.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,102 @@ +connection node_4; +connection node_3; +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_3; +connection node_4; +connection node_1; +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); +CREATE PROCEDURE p1(IN max INT) +BEGIN +DECLARE i INT; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET i = 0; +WHILE i < max DO +INSERT IGNORE INTO t1 VALUES (DEFAULT); +SET i = i + 1; +END WHILE; +END| +CALL p1(130); +connection node_4; +Shutting down server 4... +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +SET GLOBAL debug = "+d,sync.wsrep_donor_state"; +connection node_4; +Restarting server 4... 
+connection node_1; +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached"; +Tables on server 1 flushed and locked for SST to server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; +Wait for the state snapshot to be copied to server 4 +SST script unlocked server 1 +connection node_1; +CALL p1(130); +connection node_3; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +INSERT INTO t2 VALUES (DEFAULT); +SET SESSION wsrep_on = OFF; +connection node_1; +CALL p1(130); +Waiting for server 3 to leave the cluster +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_4; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +Server 3 left the cluster, killing it... +Killed server 3. +Restarting server 3... +Waiting for server 3 to rejoin the cluster +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +sleeping for 20 +Waiting ready +Server 3 restarted. 
+connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SELECT count(*) AS expect1_390 FROM t1; +expect1_390 +390 +connection node_2; +SELECT count(*) AS expect2_390 FROM t1; +expect2_390 +390 +connection node_3; +SELECT count(*) AS expect3_390 FROM t1; +expect3_390 +390 +connection node_4; +SELECT count(*) AS expect4_390 FROM t1; +expect4_390 +390 +DROP TABLE t1; +DROP PROCEDURE p1; +connection node_1; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +connection node_2; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +connection node_3; +CALL mtr.add_suppression("Vote 0 \\(success\\) on .+ is inconsistent with group"); +connection node_4; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_wan.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_wan.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_wan.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_wan.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,9 @@ connection node_2; connection node_1; -CALL mtr.add_suppression("WSREP: Stray state UUID msg:"); -CALL mtr.add_suppression("Sending JOIN failed: "); -CALL mtr.add_suppression("WSREP: .* sending install message failed: Socket is not connected"); -CALL mtr.add_suppression("There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); +CALL mtr.add_suppression("WSREP: Stray state UUID msg: "); +CALL mtr.add_suppression("WSREP: .*Sending JOIN failed: "); +CALL mtr.add_suppression("WSREP: .*sending 
install message failed: (Transport endpoint|Socket) is not connected"); +CALL mtr.add_suppression("WSREP: .*There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); SELECT VARIABLE_VALUE = 4 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; VARIABLE_VALUE = 4 1 @@ -36,8 +36,8 @@ 1 DROP TABLE t1; connection node_1; -call mtr.add_suppression("WSREP: read_completion_condition.*"); -call mtr.add_suppression("WSREP: read_handler.*"); +call mtr.add_suppression("WSREP: read_completion_condition"); +call mtr.add_suppression("WSREP: read_handler"); disconnect node_3; disconnect node_4; disconnect node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ connection node_2; connection node_1; -call mtr.add_suppression("WSREP\: Unknown parameter 'gmcasts\\.segment'"); -call mtr.add_suppression("WSREP\: Set options returned 7"); +call mtr.add_suppression("WSREP: Unknown parameter 'gmcasts\\.segment'"); +call mtr.add_suppression("WSREP: Set options returned 7"); SET GLOBAL wsrep_provider_options="gmcasts.segment=1"; ERROR HY000: Incorrect arguments to SET Unhandled exceptions: 0 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result 2025-05-19 16:14:24.000000000 +0000 @@ -3,10 
+3,17 @@ connection node_1; connection node_2; connection node_1; -call mtr.add_suppression("WSREP:.*"); +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); SET @wsrep_provider_options_orig = @@GLOBAL.wsrep_provider_options; SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true;pc.weight=2'; connection node_2; +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); SET @wsrep_cluster_address_orig = @@GLOBAL.wsrep_cluster_address; SET GLOBAL WSREP_ON=0; SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mdev-29775.result mariadb-10.11.13/mysql-test/suite/galera/r/mdev-29775.result --- mariadb-10.11.11/mysql-test/suite/galera/r/mdev-29775.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mdev-29775.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,84 @@ +connection node_2; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; 
+INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; +INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_mode = REPLICATE_ARIA; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_mode = REPLICATE_ARIA; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL 
wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL wsrep_mode=DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mdev-30653.result mariadb-10.11.13/mysql-test/suite/galera/r/mdev-30653.result --- mariadb-10.11.11/mysql-test/suite/galera/r/mdev-30653.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mdev-30653.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ create table t2 (id serial, val int) engine=aria; insert into t1 values(1, 23); insert into t2 values(2, 42); -call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental. Storage engine Aria for table 'test'.'t2' is not supported in Galera"); +call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental\\. 
Storage engine Aria for table 'test'\\.'t2' is not supported in Galera"); begin; update t1 set val=24 where id=1; update t2 set val=41 where id=2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#198.result mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#198.result --- mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#198.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#198.result 2025-05-19 16:14:24.000000000 +0000 @@ -31,3 +31,6 @@ test.t2 repair note The storage engine for the table doesn't support repair DROP TABLE t1; DROP TABLE t2; +connection node_1; +disconnect node_2a; +disconnect node_2b; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ --- r/mysql-wsrep#33.result +++ r/mysql-wsrep#33,debug.reject -@@ -698,12 +698,196 @@ +@@ -698,6 +698,190 @@ 1 DROP TABLE t1; COMMIT; @@ -190,12 +190,4 @@ +SET GLOBAL debug_dbug = $debug_orig; connection node_2; connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL 
mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/mysql-wsrep#33.result -+++ r/mysql-wsrep#33.reject -@@ -700,10 +700,10 @@ - COMMIT; - connection node_2; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33.result mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33.result --- 
mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33.result 2025-05-19 16:14:24.000000000 +0000 @@ -700,10 +700,10 @@ COMMIT; connection node_2; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result mariadb-10.11.13/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result --- mariadb-10.11.11/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result 2025-05-19 16:14:24.000000000 +0000 @@ -32,6 +32,8 @@ Level Code Message Error 4165 Galera replication not supported Warning 1031 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine MyISAM not supported. +Error 4165 Galera replication not supported +Warning 1031 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine MyISAM not supported. 
SHOW CREATE TABLE t2; Table Create Table t2 CREATE TABLE `t2` ( diff -Nru mariadb-10.11.11/mysql-test/suite/galera/suite.pm mariadb-10.11.13/mysql-test/suite/galera/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera/suite.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -10,61 +10,61 @@ push @::global_suppressions, ( - qr(WSREP: wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), - qr(WSREP: last inactive check more than .* skipping check), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: last inactive check more than .+ skipping check), + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. 
Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Maximum writeset size exceeded by .*), - qr(WSREP: transaction size exceeded.*), - qr(WSREP: RBR event .*), - qr(WSREP: Ignoring error for TO isolated action: .*), - qr(WSREP: transaction size limit .*), - qr(WSREP: rbr write fail, .*), - qr(WSREP: .*Backend not supported: foo.*), - qr(WSREP: .*Failed to initialize backend using .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: Maximum writeset size exceeded by ), + qr(WSREP: transaction size exceeded), + qr(WSREP: RBR event ), + qr(WSREP: Ignoring error for TO isolated action: ), + qr(WSREP: transaction size limit ), + qr(WSREP: rbr write fail, ), + qr(WSREP: .*Backend not supported: foo), + qr(WSREP: .*Failed to initialize backend using ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Socket type not supported), qr(WSREP: failed to open gcomm backend connection: 110: failed to reach primary view: 110 .*), - qr(WSREP: .*Failed to open backend connection: -110 .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: .*Failed to open backend connection: 
-110 ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Connection timed out), qr|WSREP: wsrep::connect\(.*\) failed: 7|, - qr(WSREP: SYNC message from member .* in non-primary configuration. Ignored.), + qr(WSREP: SYNC message from member .+ ?in non-primary configuration\. Ignored\.), qr(WSREP: Could not find peer:), - qr(WSREP: TO isolation failed for: .*), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, - qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled. Expect abort.|, + qr(WSREP: TO isolation failed for: ), + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, + qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled\. Expect abort\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr(WSREP: discarding established .*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr|WSREP: JOIN message from member .* in non-primary configuration. Ignored.|, - qr|Query apply failed:*|, - qr(WSREP: Ignoring error*), - qr(WSREP: Failed to remove page file .*), - qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to .*), - qr|WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). 
Will retry in new primary component.|, - qr|WSREP: Send action \{.* STATE_REQUEST} returned -107 \(Transport endpoint is not connected\)|, + qr(WSREP: discarding established ), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr|WSREP: .*Query apply failed:|, + qr(WSREP: Ignoring error), + qr(WSREP: Failed to remove page file ), + qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to ), + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, qr|WSREP: Trying to continue unpaused monitor|, qr|WSREP: Wait for gtid returned error 3 while waiting for prior transactions to commit before setting position|, qr|WSREP: Failed to report last committed|, diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/GAL-401.test mariadb-10.11.13/mysql-test/suite/galera/t/GAL-401.test --- mariadb-10.11.11/mysql-test/suite/galera/t/GAL-401.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/GAL-401.test 2025-05-19 16:14:24.000000000 +0000 @@ -48,7 +48,7 @@ SET SESSION wsrep_sync_wait=15; SHOW CREATE TABLE t1; DROP TABLE t1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/GCF-939.test mariadb-10.11.13/mysql-test/suite/galera/t/GCF-939.test --- mariadb-10.11.11/mysql-test/suite/galera/t/GCF-939.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/GCF-939.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --exec rm -rf $MYSQLTEST_VARDIR/mysqld.2/data/GRA_*.log @@ -30,5 +31,6 @@ DROP TABLE t1; CALL mtr.add_suppression("Ignoring error 'Unknown table 'test\\.t1'' on query"); + --connection node_2 CALL mtr.add_suppression("Error 'Unknown table 'test\\.t1'' on query"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-10715.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-10715.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-10715.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-10715.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,13 @@ log-bin=mysqld-bin log-slave-updates binlog-format=ROW + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 -wsrep_gtid_domain_id=1 \ No newline at end of file +wsrep_gtid_domain_id=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-15443.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-15443.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-15443.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-15443.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] wsrep_auto_increment_control=OFF + [mysqld.2] wsrep_auto_increment_control=OFF diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-18832.test 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-18832.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-18832.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-18832.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc CREATE SEQUENCE Seq1_1 START WITH 1 INCREMENT BY 1 NOCACHE; CREATE TABLE t1 (Id int(11) NOT NULL, PRIMARY KEY (Id)); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20225.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20225.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20225.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20225.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,7 +41,7 @@ SET GLOBAL debug_dbug = 'RESET'; SET DEBUG_SYNC = 'now SIGNAL signal.mdev_20225_continue'; SET DEBUG_SYNC = 'RESET'; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; --connection node_2 # Trigger should now be dropped on node_2. 
diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20793.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20793.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20793.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20793.test 2025-05-19 16:14:24.000000000 +0000 @@ -99,4 +99,4 @@ SET debug_sync = "RESET"; DROP TABLE t1; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-21479.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-21479.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-21479.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-21479.test 2025-05-19 16:14:24.000000000 +0000 @@ -77,7 +77,7 @@ --let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; --source include/wait_condition.inc -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); --connection node_1 --echo # Wait until both nodes are back to cluster diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22227.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22227.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22227.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22227.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,7 +13,7 @@ --connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 --connection node_1b SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = 'Waiting for table level lock' +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table level lock' --source include/wait_condition.inc --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22708.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22708.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22708.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22708.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ !include ../galera_2nodes.cnf [mysqld] -log-bin \ No newline at end of file +log-bin diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24143.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24143.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24143.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24143.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,4 +21,3 @@ ALTER TABLE t1 DROP COLUMN c2; SELECT get_lock ('test', 1.5); DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24327.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24327.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24327.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24327.cnf 
2025-05-19 16:14:24.000000000 +0000 @@ -3,4 +3,3 @@ [mysqld.1] log-bin=mariadb-bin log-slave-updates=OFF - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-25389.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-25389.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-25389.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-25389.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,8 @@ --source ../galera/include/auto_increment_offset_save.inc --connection node_2 +--let $wsrep_slave_threads_orig = `SELECT @@wsrep_slave_threads` + call mtr.add_suppression("WSREP: Failed to create/initialize system thread"); SET GLOBAL debug_dbug='+d,wsrep_simulate_failed_connection_1'; --error ER_WRONG_ARGUMENTS @@ -21,4 +23,9 @@ # issue is fixed. --source include/restart_mysqld.inc +--connection node_2 +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log + --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26266.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26266.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26266.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26266.test 2025-05-19 16:14:24.000000000 +0000 @@ -31,7 +31,6 @@ INSERT INTO t2 VALUES (3); INSERT INTO t2 VALUES (4); INSERT INTO t2 VALUES (5); ---error ER_LOCK_DEADLOCK CREATE VIEW v1 AS SELECT c1 FROM t1 WHERE c1 IN (SELECT a FROM t2) GROUP BY c1; - +DROP VIEW v1; DROP TABLE t1,t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26597.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26597.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26597.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26597.test 2025-05-19 16:14:24.000000000 +0000 @@ -28,5 +28,3 @@ --source 
../../galera/include/auto_increment_offset_restore.inc --connection node_1 DROP TABLE t3; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.opt mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---partition=ON \ No newline at end of file +--partition=ON diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,4 +4,4 @@ CREATE TABLE t3 (c INT) PARTITION BY RANGE (c) (PARTITION p1 VALUES LESS THAN (1000)); CREATE TABLE tp2 (c INT); ALTER TABLE t3 CONVERT TABLE tp2 TO PARTITION p2 VALUES LESS THAN (2000); -DROP TABLE t3; \ No newline at end of file +DROP TABLE t3; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27123.opt mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27123.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27123.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27123.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1 @@ --wsrep_auto_increment_control=OFF --auto_increment_increment=3 --auto_increment_offset=3 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27862.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27862.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27862.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27862.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc --disable_ps2_protocol diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-28053.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-28053.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-28053.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-28053.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,6 +39,7 @@ --disable_result_log --eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3; START SLAVE; + --eval SELECT MASTER_GTID_WAIT('$gtid', 600) --enable_result_log --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29293.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29293.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29293.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29293.test 2025-05-19 16:14:24.000000000 +0000 @@ -38,4 +38,3 @@ --reap DROP TABLE t1; SET DEBUG_SYNC= 'RESET'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29512.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29512.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29512.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29512.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,4 @@ max-binlog-size=4096 expire-logs-days=1 - [mysqld.2] - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-32549.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-32549.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-32549.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-32549.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # statement is rolled back # --source include/galera_cluster.inc +--source include/have_aria.inc CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) engine=innodb; CREATE TABLE t2 (f1 INTEGER PRIMARY KEY) engine=aria; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33136.test 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33136.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33136.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33136.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,7 @@ # transaction in the MDL conflict handling code. --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_debug_sync.inc --source include/have_debug.inc @@ -19,8 +20,8 @@ CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; --connection node_1a -TRUNCATE TABLE t1; -# TRUNCATE forces the next statement to re-read statistics from persistent storage, +RENAME TABLE t1 TO tmp, tmp TO t1; +# RENAME forces the next statement to re-read statistics from persistent storage, # which will acquire MDL locks on the statistics tables in InnoDB. SET SESSION wsrep_retry_autocommit = 0; SET DEBUG_SYNC = 'dict_stats_mdl_acquired SIGNAL may_toi WAIT_FOR bf_abort'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -2,3 +2,12 @@ [mysqld] log-bin +log-slave-updates + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,5 @@ --source include/galera_cluster.inc --source include/have_innodb.inc ---source include/have_aria.inc SET AUTOCOMMIT=ON; SELECT @@autocommit; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,13 @@ +!include ../galera_2nodes.cnf + +[mysqld] +log-bin +log-slave-updates + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc create table t1(id serial, val varchar(100)) engine=myisam; @@ -38,14 +39,12 @@ insert into t5 select null, 'd' from t5; select * from t2; - --connection node_2 select * from t1; select * from t2; select * from t3; select * from t4; select * from t5; -set global wsrep_mode=default; --connection node_1 drop table t1,t2,t3,t4,t5; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.opt mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--plugin-load=$HA_ROCKSDB_SO diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.test 2025-05-19 16:14:24.000000000 +0000 
@@ -0,0 +1,22 @@ +--source include/galera_cluster.inc +--source include/have_sequence.inc +--source include/have_rocksdb.inc + +--connection node_1 +INSTALL PLUGIN IF NOT EXISTS connect SONAME 'ha_connect'; + +CREATE TABLE t1 (f INT) ENGINE=CONNECT; +CREATE TABLE t2 (f INT) ENGINE=ROCKSDB; +--error ER_NOT_SUPPORTED_YET +CREATE TABLE t3 (f INT) ENGINE=SEQUENCE; +show warnings; + +--connection node_2 +show create table t1; +show create table t2; +--error ER_NO_SUCH_TABLE +show create table t3; + +--connection node_1 +DROP TABLE t1, t2; +UNINSTALL PLUGIN IF EXISTS connect; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35946.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35946.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35946.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35946.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,39 @@ +# +# MDEV-35946: Assertion `thd->is_error()' failed in Sql_cmd_dml::prepare +# +--source include/have_innodb.inc +--source include/galera_cluster.inc + +# Save original auto_increment_offset values. 
+--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +# +# Disconnect from the cluster +# +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'non-Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc +SET SESSION wsrep_sync_wait=DEFAULT; + +# +# If bug is present, assertion will fire +# during the execution of the following DELETE +# +--error ER_LOCK_WAIT_TIMEOUT +DELETE FROM mysql.wsrep_streaming_log; + +# +# Reconnect to the cluster +# +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +SET SESSION wsrep_sync_wait=DEFAULT; + +--source include/auto_increment_offset_restore.inc +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-36116.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-36116.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-36116.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-36116.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,43 @@ +# +# MDEV-36116: TOI crashes in debug assert if executing thread is killed. +# + +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/have_debug.inc + +--connect con1,127.0.0.1,root,,test,$NODE_MYPORT_1 + +# Start TOI operation and wait for the thread to be killed. 
+--connection node_1 +CALL mtr.add_suppression("CREATE TABLE isolation failure"); + +--let $connection_id = `SELECT CONNECTION_ID()` +SET DEBUG_SYNC = 'wsrep_kill_thd_before_enter_toi SIGNAL may_kill WAIT_FOR continue'; +--send + CREATE TABLE t1 (a INT) ENGINE=InnoDB; + +# Kill the thread and let it continue. +--connection con1 +SET DEBUG_SYNC = 'now WAIT_FOR may_kill'; +--disable_query_log +--eval KILL CONNECTION $connection_id +--enable_query_log +SET DEBUG_SYNC = 'now SIGNAL continue'; + +--connection node_1 +--error 2013,2026 +--reap + +# Verify no tables created on either nodes. +--connection node_2 +SHOW TABLES LIKE 't1'; + +--connection con1 +SHOW TABLES LIKE 't1'; + +# Cleanup +SET DEBUG_SYNC = 'RESET'; +--disconnect con1 +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,8 @@ !include ../galera_2nodes_as_slave.cnf +[mysqld.1] +wsrep-slave-threads=10 + [mysqld.2] slave-parallel-threads=2 slave-parallel-mode=optimistic -[mysqld.1] -wsrep-slave-threads=10 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ --connection node_2 --disable_query_log ---eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3, MASTER_USE_GTID=slave_pos; +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3, master_use_gtid=slave_pos; --enable_query_log START SLAVE; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/MW-259.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-259.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-259.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-259.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,4 +39,3 @@ # Cleanup SET DEBUG_SYNC= 'RESET'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-284.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-284.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-284.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-284.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,15 +2,16 @@ # MW-284 Slave I/O retry on ER_COM_UNKNOWN_ERROR # ---source include/have_log_bin.inc --source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_log_bin.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 call mtr.add_suppression("\\[ERROR\\] Error reading packet from server: WSREP has not yet prepared node for application use "); call mtr.add_suppression("WSREP has not yet prepared node for application use"); --disable_query_log ---eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_PORT=$NODE_MYPORT_1, MASTER_USER='root', MASTER_CONNECT_RETRY=1; +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_1, master_connect_retry=1; --enable_query_log --connection node_1 @@ -29,7 +30,7 @@ --connection node_3 SELECT @@wsrep_on; --sleep 1 -call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use (server_errno=1047)"); +call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use \\(server_errno ?= ?1047\\)"); START SLAVE; --let $slave_param= Slave_IO_Running --let $slave_param_value= Connecting diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-313.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-313.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/MW-313.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-313.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,3 @@ [mysqld.2] log-bin log-slave-updates - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,3 @@ wsrep-retry-autocommit=0 [mysqld.2] - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ # -# #MW-329 Fix incorrect affected rows count after replay +# MW-329 Fix incorrect affected rows count after replay. 
# --source include/galera_cluster.inc @@ -11,7 +11,7 @@ INSERT INTO t1 (f1) VALUES (1),(65535); # -# Run concurrent INSERTs +# Run concurrent INSERTs # DELIMITER |; @@ -86,6 +86,10 @@ --eval KILL CONNECTION $connection_id --enable_query_log +# +# getting execution results for --send +# + --connection node_1b --error 0,1317,2013,2026 --reap @@ -96,6 +100,8 @@ DROP PROCEDURE proc_insert; DROP TABLE t1; +--disconnect node_1b + # Due to MW-330, Multiple "conflict state 3 after post commit" warnings if table is dropped while SP is running CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,6 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +wsrep-retry-autocommit=0 + +[mysqld.2] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,105 @@ +# +# MW-329F Fix incorrect affected rows count after replay. +# +# This is a version of MW-329 without the infinite loop that +# in the original test is closed by killing the connection. 
+# + +--source include/galera_cluster.inc +--source include/have_innodb.inc + +CREATE TABLE t1 (f1 INTEGER, f2 CHAR(20) DEFAULT 'abc') ENGINE=InnoDB; + +# We start with a populated table +INSERT INTO t1 (f1) VALUES (1),(65535); + +# +# Run concurrent INSERTs +# + +DELIMITER |; +CREATE PROCEDURE proc_insert (repeat_count int) +BEGIN + DECLARE current_num int; + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; + SET current_num = 0; + SET SESSION wsrep_sync_wait = 0; + WHILE current_num < repeat_count do + INSERT INTO t1 (f1) VALUES (FLOOR( 1 + RAND( ) * 65535 )); + SELECT SLEEP(0.1); + SET current_num = current_num + 1; + END WHILE; +END| +DELIMITER ;| + +--connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connection node_1b +--let $connection_id = `SELECT CONNECTION_ID()` +--disable_query_log +--disable_result_log +--send CALL proc_insert(500); + +# +# Run concurrent UPDATEs. We expect that each UPDATE will report that +# some rows were matched and updated +# + +--connection node_2 +--let $count = 2 +--let $wsrep_local_replays_old = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` + +while ($count) +{ + --let $signature = `SELECT LEFT(MD5(RAND()), 10)` + --disable_query_log + --error 0,ER_LOCK_DEADLOCK + --eval UPDATE t1 SET f2 = '$signature' + --enable_query_log + --let $row_count = `SELECT ROW_COUNT()` + if (`SELECT @@error_count = 0`) { + if (`SELECT $row_count = 0`) { + --die ROW_COUNT() = 0 + } + } + + # + # Ensure at least one replay happens + # + + --let $wsrep_replays = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` + --disable_query_log + if (`SELECT $wsrep_replays - $wsrep_local_replays_old > 0`) { + --dec $count + } + --enable_query_log +} + +# +# Confirm that some transaction replays occurred +# + +--let $wsrep_local_replays_new = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 
'wsrep_local_replays'` +--disable_query_log +--eval SELECT $wsrep_local_replays_new - $wsrep_local_replays_old > 0 AS wsrep_local_replays; +--enable_query_log + +# +# getting execution results for --send +# + +--connection node_1b +--error 0,1317,2013,2026 +--reap +--enable_query_log +--enable_result_log + +--connection node_1 +DROP PROCEDURE proc_insert; +DROP TABLE t1; + +--disconnect node_1b + +# Due to MW-330, Multiple "conflict state 3 after post commit" warnings if table is dropped while SP is running +CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); + +set global innodb_status_output=Default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-360-master.opt mariadb-10.11.13/mysql-test/suite/galera/t/MW-360-master.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-360-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-360-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1 @@ --gtid-domain-id=1 --log-bin --log-slave-updates - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-369.inc mariadb-10.11.13/mysql-test/suite/galera/t/MW-369.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-369.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-369.inc 2025-05-19 16:14:24.000000000 +0000 @@ -80,5 +80,3 @@ SET GLOBAL DEBUG_DBUG = ""; SET DEBUG_SYNC = 'RESET'; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-416.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-416.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-416.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-416.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,73 +21,71 @@ #ALTER INSTANCE ROTATE INNODB MASTER KEY; --error 1044,1227,1370 ALTER PROCEDURE proc1 COMMENT 'foo'; ---error 1044,1227,1370 +--error 1044,1227 ALTER SERVER srv OPTIONS (USER 'sally'); ---error 1044,1142,1227,1370 +--error 1044,1142,1227 ALTER TABLE tbl 
DROP COLUMN col; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 ALTER VIEW vw AS SELECT 1; ---error 1044,1227,1370 +--error 1044,1227 CREATE DATABASE db; ---error 1044,1227,1370 -CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; +--error 1044,1227 +CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; --error 1044,1227,1370 CREATE FUNCTION fun1() RETURNS int RETURN(1); --error 1044,1227,1370 CREATE FUNCTION fun1 RETURNS STRING SONAME 'funlib.so'; --error 1044,1227,1370 -CREATE PROCEDURE proc1() BEGIN END; ---error 1044,1142,1227,1370 +CREATE PROCEDURE proc1() BEGIN END; +--error 1044,1142,1227 CREATE INDEX idx ON tbl(id); ---error 1044,1142,1227,1370 +--error 1044,1227 CREATE SERVER srv FOREIGN DATA WRAPPER 'fdw' OPTIONS (USER 'user'); ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CREATE TABLE t (i int); ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CREATE TRIGGER trg BEFORE UPDATE ON t FOR EACH ROW BEGIN END; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CREATE VIEW vw AS SELECT 1; - - ---error 1044,1142,1227,1370 +--error 1044,1227 DROP DATABASE db; ---error 1044,1142,1227,1370 +--error 1044,1227 DROP EVENT ev; ---error 1044,1142,1227,1370 +--error 1044,1227,1370 DROP FUNCTION fun1; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 DROP INDEX idx ON t0; ---error 1044,1142,1227,1370 +--error 1044,1227,1370 DROP PROCEDURE proc1; ---error 1044,1142,1227,1370 +--error 1044,1227 DROP SERVEr srv; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 DROP TABLE t0; ---error 1044,1142,1227,1360,1370 +--error 1044,1227,1360 DROP TRIGGER trg; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 DROP VIEW vw; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 RENAME TABLE t0 TO t1; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 TRUNCATE TABLE t0; # DCL # account management ---error 1044,1142,1227,1370,1064 +--error 1044,1227,1064 ALTER USER myuser PASSWORD EXPIRE; ---error 1044,1142,1227,1370 +--error 1044,1227 
CREATE USER myuser IDENTIFIED BY 'pass'; ---error 1044,1142,1227,1370 +--error 1044,1227 DROP USER myuser; ---error 1044,1045,1142,1227,1370 +--error 1044,1045,1227 GRANT ALL ON *.* TO 'myuser'; ---error 1044,1142,1227,1370 +--error 1044,1227 RENAME USER myuser TO mariauser; --error 1044,1142,1227,1370 REVOKE SELECT ON test FROM myuser; @@ -97,24 +95,25 @@ REVOKE PROXY ON myuser FROM myuser; # table maintenance ---error 1044,1142,1227,1370 +--error 1044,1142,1227 ANALYZE TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CHECK TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CHECKSUM TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 OPTIMIZE TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 REPAIR TABLE db.tbl; # plugin and user defined functions ---error 1044,1142,1227,1370 +--error 1044,1142,1227 INSTALL PLUGIN plg SONAME 'plg.so'; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 UNINSTALL PLUGIN plg; --connection node_1 DROP USER 'userMW416'@'localhost'; SHOW DATABASES; +--disconnect userMW416 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-86-wait8.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-86-wait8.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-86-wait8.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-86-wait8.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] log-bin log-slave-updates - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/binlog_checksum.test mariadb-10.11.13/mysql-test/suite/galera/t/binlog_checksum.test --- mariadb-10.11.11/mysql-test/suite/galera/t/binlog_checksum.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/binlog_checksum.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --echo # On node_1 --connection node_1 diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/create.test mariadb-10.11.13/mysql-test/suite/galera/t/create.test --- mariadb-10.11.11/mysql-test/suite/galera/t/create.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/create.test 2025-05-19 16:14:24.000000000 +0000 @@ -86,4 +86,3 @@ --source include/galera_end.inc --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera#414.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera#414.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera#414.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera#414.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcs.max_packet_size=2' +wsrep_provider_options='gcs.max_packet_size=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcs.max_packet_size=2' +wsrep_provider_options='gcs.max_packet_size=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera#500.test mariadb-10.11.13/mysql-test/suite/galera/t/galera#500.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera#500.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera#500.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,12 @@ # thrown from gcomm background thread, the provider terminates properly # and wsrep_ready becomes 0. # +# Not to be run with ASAN. Provider leaks memory when gcomm +# thread is aborted forcifully and ASAN crashes during leak report +# after provider is unloaded. 
+# +--source include/not_asan.inc --source include/have_innodb.inc --source include/galera_cluster.inc --source include/galera_have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_2primary_replica.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_2primary_replica.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_2primary_replica.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_2primary_replica.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,17 +41,19 @@ --let $node_1 = replica --let $node_2 = node_2 +--let $node_3 = primary1 +--let $node_4 = primary2 --source include/auto_increment_offset_save.inc --connection replica --echo # Galera replica changing master to primary1 ---disable_query_log SET @@default_master_connection='stream1'; +--disable_query_log --eval CHANGE MASTER 'stream1' TO master_host='127.0.0.1', master_user='repl', master_password='repl', master_port=$NODE_MYPORT_3, master_use_gtid=slave_pos; --enable_query_log -SET @@default_master_connection='stream2'; --echo # Primary node changing master to primary2 +SET @@default_master_connection='stream2'; --disable_query_log --eval CHANGE MASTER 'stream2' TO master_host='127.0.0.1', master_user='repl2', master_password='repl2', master_port=$NODE_MYPORT_4, master_use_gtid=slave_pos; --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_MDEV-29512.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_MDEV-29512.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_MDEV-29512.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_MDEV-29512.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,4 @@ max-binlog-size=4096 expire-logs-days=1 - [mysqld.2] - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_alter_engine_myisam.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_alter_engine_myisam.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_alter_engine_myisam.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_alter_engine_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc # @@ -35,7 +36,4 @@ DROP TABLE t1; --connection node_1 ---disable_query_log SET GLOBAL wsrep_mode = DEFAULT; ---enable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -20,7 +20,7 @@ --connection node_1a SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc SELECT COUNT(*) = 0 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,3 @@ lock_wait_timeout=5 innodb_lock_wait_timeout=5 wait_timeout=5 - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test 2025-05-19 16:14:24.000000000 +0000 @@ -27,16 +27,16 @@ --connection node_1 SELECT 1 FROM DUAL; # Wait ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc -SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); UNLOCK TABLES; SET SESSION wsrep_sync_wait = 15; SHOW CREATE TABLE t1; -SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_ctas.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_ctas.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_ctas.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_ctas.test 2025-05-19 16:14:24.000000000 +0000 @@ -73,4 +73,3 @@ --connection node_3 RESET MASTER; - diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_nonprim.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_nonprim.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_nonprim.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_nonprim.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,7 +2,7 @@ # Test the behavior of a Galera async slave if it goes non-prim. Async replication # should abort with an error but it should be possible to restart it. # -# The galera/galera_2node_slave.cnf describes the setup of the nodes +# The galera_3nodes_as_slave.cnf describes the setup of the nodes # --source include/have_innodb.inc @@ -17,9 +17,10 @@ --connection node_2 --disable_query_log ---eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_PORT=$NODE_MYPORT_4, MASTER_USER='root'; +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_4; --enable_query_log START SLAVE; + SET SESSION wsrep_sync_wait = 0; --connection node_4 @@ -44,9 +45,8 @@ INSERT INTO t1 VALUES (1),(2),(3),(4),(5); --connection node_2 ---sleep 5 +wait_for_slave_to_stop; --let $value = query_get_value(SHOW SLAVE STATUS, Last_SQL_Error, 1) ---connection node_1 --disable_query_log --eval SELECT "$value" IN ("Error 'Unknown command' on query. Default database: 'test'. 
Query: 'BEGIN'", "Node has dropped from cluster") AS expected_error --enable_query_log @@ -74,7 +74,6 @@ --connection node_4 DROP TABLE t1; ---sleep 2 --connection node_2 --let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc @@ -84,7 +83,7 @@ CALL mtr.add_suppression("Slave SQL: Error 'Unknown command' on query"); CALL mtr.add_suppression("Slave: Unknown command Error_code: 1047"); -CALL mtr.add_suppression("Transport endpoint is not connected"); +CALL mtr.add_suppression("(Transport endpoint|Socket) is not connected"); CALL mtr.add_suppression("Slave SQL: Error in Xid_log_event: Commit could not be completed, 'Deadlock found when trying to get lock; try restarting transaction', Error_code: 1213"); CALL mtr.add_suppression("Slave SQL: Node has dropped from cluster, Error_code: 1047"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ wsrep_sst_auth="root:" [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=10M;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=10M;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_backup_stage.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_backup_stage.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_backup_stage.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_backup_stage.test 2025-05-19 16:14:24.000000000 +0000 @@ -56,7 +56,7 @@ # reach commit stage. In the unlikely case the interleaving is different, the # result of the test should not change. --connection node_1c ---let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (State='Commit' OR State='Waiting for certification') AND ID=$insert_id +--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification') AND ID=$insert_id --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info WHERE TABLE_NAME='t1' AND THREAD_ID=$insert_id --source include/wait_condition.inc @@ -83,11 +83,11 @@ # wait for insert to get blocked --connection node_1c ---let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (State='Commit' OR State='Waiting for certification') AND ID=$insert_id +--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification') AND ID=$insert_id --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info WHERE TABLE_NAME='t1' AND THREAD_ID=$insert_id --source include/wait_condition.inc ---let $wait_condition = SELECT COUNT(*)=2 FROM information_schema.processlist WHERE Info like 'INSERT INTO t1 (f1) values("node1%")' AND (State = 'Commit' OR State='Waiting for certification') +--let $wait_condition = SELECT COUNT(*)=2 FROM information_schema.processlist WHERE Info like 'INSERT INTO 
t1 (f1) values("node1%")' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification') --source include/wait_condition.inc # nothing after BLOCK_DDL is applied diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test 2025-05-19 16:14:24.000000000 +0000 @@ -55,4 +55,3 @@ --disconnect node_2a --disconnect node_2b - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,12 +17,12 @@ --connection node_2 SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc UNLOCK TABLES; ---let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc COMMIT; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -16,13 +16,16 @@ INSERT INTO t1 VALUES (2); --connection node_2 ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' ---source include/wait_condition.inc +SET SESSION wsrep_sync_wait = 0; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); +--let $wait_condition_on_error_output = SELECT * FROM INFORMATION_SCHEMA.PROCESSLIST +--source include/wait_condition_with_debug.inc UNLOCK TABLES; ---let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' ---source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); +--let $wait_condition_on_error_output = SELECT * FROM INFORMATION_SCHEMA.PROCESSLIST +--source include/wait_condition_with_debug.inc COMMIT; SELECT COUNT(*) = 1 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ 
-6,10 +6,10 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test 2025-05-19 16:14:24.000000000 +0000 @@ -129,7 +129,7 @@ let SEARCH_PATTERN = Server desynched from group during BACKUP STAGE BLOCK_COMMIT.; --source include/search_pattern_in_file.inc -SET GLOBAL wsrep_mode = ""; +SET GLOBAL wsrep_mode = DEFAULT; --connection node_1 DROP TABLE t; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ !include ../galera_2nodes.cnf + [mysqltest] -ps-protocol \ No newline at end of file +ps-protocol diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,3 @@ [mysqld.2] innodb_stats_persistent=ON - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.test 2025-05-19 16:14:24.000000000 +0000 @@ -46,4 +46,3 @@ --enable_query_log DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill.test 2025-05-19 16:14:24.000000000 +0000 @@ -113,7 +113,7 @@ --connection node_2b SET SESSION wsrep_sync_wait=0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = 'Waiting for table metadata lock'; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'; --source include/wait_condition.inc --connection node_2a diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill_debug.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill_debug.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill_debug.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -110,7 
+110,7 @@ --connection node_2a --let $connection_id = `SELECT CONNECTION_ID()` -CREATE TABLE t1 (i int primary key); +CREATE TABLE t1 (i int primary key) engine=innodb; # Set up sync point SET DEBUG_SYNC = "before_wsrep_ordered_commit SIGNAL bwoc_reached WAIT_FOR bwoc_continue"; @@ -129,17 +129,17 @@ --enable_query_log SET DEBUG_SYNC = "now SIGNAL bwoc_continue"; -SET DEBUG_SYNC='RESET'; --connection node_2a --error 0,1213,2013,2026 --reap --connection node_2 +SET DEBUG_SYNC='RESET'; # victim was able to complete the INSERT select * from t1; --disconnect node_2a +--disconnect node_2b --connection node_1 drop table t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_lock_wait.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_lock_wait.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_lock_wait.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_lock_wait.test 2025-05-19 16:14:24.000000000 +0000 @@ -97,4 +97,3 @@ --disconnect node_1_p2 --disconnect node_2_p1 --disconnect node_2_p2 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,5 +9,3 @@ binlog-checksum=CRC32 master-verify-checksum=1 slave-sql-verify-checksum=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.test 2025-05-19 16:14:24.000000000 +0000 @@ -38,8 +38,6 @@ --connection node_1 DROP TABLE t1; ---disable_query_log SET 
@@global.wsrep_mode=DEFAULT; ---enable_query_log --echo # End of tests. diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,3 @@ binlog-row-event-max-size=4294967040 [mysqld.2] - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,3 @@ binlog-row-event-max-size=256 [mysqld.2] - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test 2025-05-19 16:14:24.000000000 +0000 @@ -12,4 +12,3 @@ SELECT COUNT(*) = 1 FROM t1 WHERE f1 = REPEAT('x', 1000); DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_row_image.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_row_image.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_row_image.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_row_image.test 2025-05-19 16:14:24.000000000 +0000 @@ -94,7 +94,3 @@ DROP TABLE t1; DROP TABLE t2; - - - - diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,15 +5,15 @@ --source include/galera_cluster.inc --source include/force_restart.inc +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + --connection node_1 SET GLOBAL auto_increment_offset=1; --connection node_2 SET GLOBAL auto_increment_offset=2; ---let $node_1=node_1 ---let $node_2=node_2 ---source include/auto_increment_offset_save.inc - ## ## Verify the correct operation of the auto-increment when the binlog ## format artificially set to the 'STATEMENT' (although this mode is diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_cache_index.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_cache_index.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_cache_index.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_cache_index.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc CREATE TABLE t1 (c1 int, UNIQUE INDEX (c1)) engine=innodb; INSERT INTO t1 VALUES (1),(2),(3),(4),(5); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_can_run_toi.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_can_run_toi.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_can_run_toi.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_can_run_toi.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc # # MDEV-24833 : Signal 11 on wsrep_can_run_in_toi at wsrep_mysqld.cc:1994 # diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_change_user.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_change_user.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_change_user.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_change_user.test 2025-05-19 16:14:24.000000000 +0000 @@ -26,4 +26,3 @@ --connection node_1 DROP TABLE t1; DROP USER user1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_circular_replication.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_circular_replication.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_circular_replication.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_circular_replication.test 2025-05-19 16:14:24.000000000 +0000 @@ -45,6 +45,7 @@ --let $node_1 = replica1 --let $node_2 = node_2 --let $node_3 = primary2 +--let $node_4 = primary1 --source include/auto_increment_offset_save.inc --connection replica1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_concurrent_ctas.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_concurrent_ctas.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_concurrent_ctas.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_concurrent_ctas.test 2025-05-19 16:14:24.000000000 +0000 @@ -98,4 +98,3 @@ --source include/galera_end.inc --echo # End of test - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_create_trigger.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_create_trigger.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_create_trigger.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_create_trigger.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,4 +41,3 @@ DROP TABLE definer_default; DROP USER 'user1'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ctas.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ctas.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ctas.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ctas.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,6 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_aria.inc --connection node_1 create table t1_Aria(a int, count int, b int, key(b)) engine=Aria; @@ -36,4 +38,3 @@ DROP TABLE t2, t3,t4; DROP TABLE t1_MyISAM, t1_Aria,t1_InnoDB; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,5 +7,3 @@ [mysqld.2] wsrep-debug=1 loose-galera-ddl-fk-conflict=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test 2025-05-19 16:14:24.000000000 +0000 @@ -43,4 +43,3 @@ --source galera_ddl_fk_conflict_with_tmp.inc # CHECK and ANALYZE are not affected - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_multiline.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_multiline.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_multiline.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_multiline.test 2025-05-19 16:14:24.000000000 +0000 @@ -51,4 +51,3 @@ --connection node_1 DROP TABLE t1, t2, t3, t4, t5, t6; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.cnf 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,11 +13,13 @@ --source include/force_restart.inc # Make sure that the test is operating on the right version of galera library. 
---let $galera_version=26.4.11 +--let $galera_version=26.4.21 source ../wsrep/include/check_galera_version.inc; # Global Variables +SELECT COUNT(*) `expect 51` FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%'; + SELECT VARIABLE_NAME, VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_disallow_local_gtid.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_disallow_local_gtid.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_disallow_local_gtid.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_disallow_local_gtid.test 2025-05-19 16:14:24.000000000 +0000 @@ -101,4 +101,3 @@ DROP TABLE tab1; DROP TABLE tab2; DROP TABLE tab3; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ !include ../galera_2nodes.cnf -[mysqld] +[mysqld] encrypt-tmp-files = 1 plugin-load-add= @ENV.FILE_KEY_MANAGEMENT_SO file-key-management diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_fk_truncate.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_fk_truncate.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_fk_truncate.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_fk_truncate.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +auto_increment_offset=1 +auto_increment_increment=1 + +[mysqld.2] +auto_increment_offset=2 +auto_increment_increment=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_flush_local.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_flush_local.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_flush_local.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_flush_local.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # PXC-391 --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_query_cache.inc --disable_warnings @@ -72,7 +73,6 @@ SELECT COUNT(*) AS EXPECT_10000 FROM t2; SELECT COUNT(*) AS EXPECT_10 FROM x2; - --connection node_1 DROP TABLE t1, t2, x1, x2; CREATE TABLE t1 (f1 INTEGER); @@ -144,4 +144,3 @@ --disable_query_log SET GLOBAL wsrep_mode = DEFAULT; --enable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_forced_binlog_format.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_forced_binlog_format.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_forced_binlog_format.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_forced_binlog_format.test 2025-05-19 16:14:24.000000000 +0000 @@ -49,4 +49,3 @@ #--source include/galera_end.inc --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.recover=yes;pc.ignore_sb=true' +wsrep_provider_options='gcache.recover=yes;gcache.size=128M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;pc.wait_prim_timeout=PT60S' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.recover=yes' 
+wsrep_provider_options='gcache.recover=yes;gcache.size=128M;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;pc.wait_prim_timeout=PT60S' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/big_test.inc CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,7 @@ [mysqld.1] max_allowed_packet=10M innodb_log_file_size=220M -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.recover=yes;pc.ignore_sb=true;gcache.size=10M' +wsrep_provider_options='gcache.recover=yes;gcache.size=10M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;pc.wait_prim_timeout=PT60S' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.recover=yes;pc.ignore_sb=true;gcache.size=10M' +wsrep_provider_options='gcache.recover=yes;gcache.size=10M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;pc.wait_prim_timeout=PT60S' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/big_test.inc SET SESSION wsrep_sync_wait = 0; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -2,8 +2,8 @@ [mysqld.1] innodb_log_file_size=220M -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.recover=yes;pc.ignore_sb=true;' +wsrep_provider_options='gcache.recover=yes;gcache.size=128M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;pc.wait_prim_timeout=PT60S' [mysqld.2] innodb_log_file_size=220M -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.recover=yes;pc.ignore_sb=true;' +wsrep_provider_options='gcache.recover=yes;gcache.size=128M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;pc.wait_prim_timeout=PT60S' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,6 +5,7 @@ --source include/galera_cluster.inc --source include/big_test.inc +--source include/have_innodb.inc --source include/have_log_bin.inc SET SESSION wsrep_sync_wait = 0; 
diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_fragment.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_fragment.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_fragment.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_fragment.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcs.max_packet_size=64' +wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcs.max_packet_size=64' +wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcs.max_packet_size=64;gcache.size=10M' +wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcs.max_packet_size=64;gcache.size=10M' 
+wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] log-bin log-slave-updates - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_server_id.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_server_id.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_server_id.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_server_id.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -13,4 +13,3 @@ server-id=12 log_slave_updates log_bin - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,13 +4,16 @@ log-bin=mysqld-bin log-slave-updates binlog-format=ROW + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.3] gtid-domain-id=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf 
2025-05-19 16:14:24.000000000 +0000 @@ -5,14 +5,16 @@ log-slave-updates binlog-format=ROW wsrep_sst_method=rsync + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.3] gtid-domain-id=2 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,6 +13,7 @@ # As node #3 is not a Galera node, and galera_cluster.inc does not open connetion to it # we open the node_3 connection here --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + --echo #Connection 2 --connection node_2 --disable_query_log @@ -30,6 +31,7 @@ SELECT @@global.gtid_binlog_state; --source include/save_master_gtid.inc + --echo #Connection 2 --connection node_2 --source include/sync_with_master_gtid.inc @@ -39,6 +41,7 @@ INSERT INTO t2 VALUES(5,55); INSERT INTO t2 VALUES(6,66); SELECT @@global.gtid_binlog_state; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME= 't2'; @@ -60,6 +63,7 @@ INSERT INTO t1 VALUES ('node1_committed_before'); COMMIT; --source include/save_master_gtid.inc + --echo #Connection 2 --connection node_2 --source include/sync_with_master_gtid.inc @@ -68,6 +72,7 @@ INSERT INTO t1 VALUES ('node2_committed_before'); INSERT INTO t1 VALUES ('node2_committed_before'); COMMIT; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME= 't1'; @@ -77,10 +82,12 @@ --let $node_1= node_1 --let $node_2= node_2 --source include/auto_increment_offset_save.inc + --echo #Connection 2 --connection node_2 
--echo Shutting down server ... --source include/shutdown_mysqld.inc + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' @@ -90,6 +97,7 @@ INSERT INTO t1 VALUES ('node1_committed_during'); INSERT INTO t1 VALUES ('node1_committed_during'); COMMIT; + --echo #Connection 2 --connection node_2 --echo Starting server ... @@ -103,11 +111,13 @@ INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); COMMIT; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 8 FROM t1; --source include/wait_condition.inc Select * from t1 order by f1; + --echo #Connection 2 --connection node_2 Select * from t1 order by f1; @@ -153,12 +163,14 @@ INSERT INTO t1 VALUES ('node2_slave_started'); SELECT count(*) from t1; SELECT @@global.gtid_binlog_state; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 12 FROM t1; --source include/wait_condition.inc SELECT count(*) from t1; SELECT @@global.gtid_binlog_state; + --echo #Connection 3 --connection node_3 DROP TABLE t2,t1; @@ -173,10 +185,12 @@ --connection node_2 --let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; --source include/wait_condition.inc + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc + --echo #Connection 2 --connection node_2 STOP SLAVE; @@ -194,6 +208,7 @@ set global wsrep_on=OFF; reset master; set global wsrep_on=ON; + --echo #Connection 3 --connection node_3 reset master; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf 2025-01-30 11:01:23.000000000 
+0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,10 +4,12 @@ log-bin=mysqld-bin log-slave-updates binlog-format=ROW + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,9 +6,14 @@ INSERT INTO t1 VALUES (0,0),(1,1),(2,2),(3,3); BEGIN; +SET DEBUG_SYNC = 'wsrep_after_statement_enter SIGNAL blocked'; --send UPDATE t1 set b = 100 where id between 1 and 2; --connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 + +SET DEBUG_SYNC = 'now WAIT_FOR blocked'; +SET DEBUG_SYNC = 'wsrep_after_statement_enter CLEAR'; + --connection node_1b SET @save_dbug = @@SESSION.debug_dbug; SET @@SESSION.innodb_lock_wait_timeout=2; @@ -21,5 +26,6 @@ --reap COMMIT; SELECT * FROM t1; +SET DEBUG_SYNC = 'RESET'; --disconnect node_1b DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_ignore.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_ignore.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_ignore.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_ignore.test 2025-05-19 16:14:24.000000000 +0000 @@ -57,4 +57,3 @@ DROP TABLE t2; DROP TABLE t3; --eval SET GLOBAL wsrep_sync_wait = $wsrep_sync_wait_orig - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_multi.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_multi.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_multi.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_multi.test 2025-05-19 16:14:24.000000000 +0000 @@ -113,10 +113,3 @@ SELECT COUNT(*) = 2 FROM t1; DROP TABLE t1; - - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -35,10 +35,10 @@ log_bin=binlog [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -35,10 +35,10 @@ log_bin=binlog [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ wsrep_sst_auth=root: [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,10 +7,10 @@ innodb_flush_log_at_trx_commit=0 [mysqld.1] 
-wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,11 +9,11 @@ ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' loose-innodb-log-file-buffering [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' loose-innodb-log-file-buffering [sst] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ # causes the first MTR connection to be forefully dropped by Galera, which in turn confuses MTR [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,14 +2,14 @@ --source include/galera_cluster.inc --source include/have_innodb.inc ---source suite/galera/include/galera_sst_set_mysqldump.inc - call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to "); --let $node_1=node_1 --let $node_2=node_2 --source include/auto_increment_offset_save.inc +--source suite/galera/include/galera_sst_set_mysqldump.inc + # mysql-wsrep#33 - nnoDB: Failing assertion: xid_seqno > trx_sys_cur_xid_seqno in trx_sys_update_wsrep_checkpoint with mysqldump IST # --source suite/galera/include/galera_st_disconnect_slave.inc diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_progress.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_progress.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_progress.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_progress.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,4 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' - - - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;ist.recv_bind=127.0.0.1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;ist.recv_bind=127.0.0.1;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;ist.recv_bind=127.0.0.1' - +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;ist.recv_bind=127.0.0.1;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,9 +4,9 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sync_wait=1 [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sync_wait=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf 
2025-05-19 16:14:24.000000000 +0000 @@ -8,10 +8,10 @@ ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] ssl-mode=VERIFY_CA diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_applier.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_applier.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_applier.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_applier.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,9 @@ !include ../galera_2nodes.cnf [mysqld.1] +wsrep_slave_threads=1 wsrep-debug=1 [mysqld.2] +wsrep_slave_threads=1 wsrep-debug=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_smallchanges.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_smallchanges.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_smallchanges.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_smallchanges.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc # Save original auto_increment_offset values. 
--let $node_1=node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_load_data.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_load_data.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_load_data.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_load_data.test 2025-05-19 16:14:24.000000000 +0000 @@ -397,4 +397,3 @@ --connection node_1 use test; drop database cardtest02; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_log_bin_opt.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_log_bin_opt.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_log_bin_opt.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_log_bin_opt.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ wsrep_sst_auth="root:" [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_many_rows.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_many_rows.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_many_rows.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_many_rows.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/big_test.inc --source include/galera_cluster.inc +--source 
include/have_innodb.inc # Save original auto_increment_offset values. --let $node_1=node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,3 @@ [mysqld.2] innodb-stats-persistent=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/galera_cluster.inc --source include/have_innodb.inc + --connection node_1 create table t(a int); insert into t select 1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_15611.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_15611.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_15611.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_15611.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ !include ../galera_2nodes.cnf + [mysqld.1] [mysqld.2] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdl_race.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdl_race.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdl_race.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdl_race.test 2025-05-19 16:14:24.000000000 +0000 @@ -91,4 +91,3 @@ --disconnect node_1a --disconnect node_1b --disconnect node_1c - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_nonPK_and_PA.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_nonPK_and_PA.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_nonPK_and_PA.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_nonPK_and_PA.test 2025-05-19 16:14:24.000000000 +0000 @@ -26,7 +26,6 @@ --source include/have_debug_sync.inc --source include/galera_have_debug_sync.inc - # Setup CREATE TABLE t1 (f1 VARCHAR(32) NOT NULL) ENGINE=InnoDB; @@ -44,7 +43,7 @@ SET GLOBAL wsrep_slave_threads = 2; --echo *************************************************************** ---echo scenario 1, conflicting UPDATE +--echo scenario 1, conflicting UPDATE --echo *************************************************************** # Set up a synchronization point to catch the first transaction @@ -99,9 +98,8 @@ --source include/galera_signal_sync_point.inc --source include/galera_clear_sync_point.inc - --echo *************************************************************** ---echo scenario 2, conflicting DELETE +--echo scenario 2, conflicting DELETE --echo *************************************************************** # Set up a synchronization point to catch the first transaction @@ -164,5 +162,6 @@ DROP TABLE t1; DROP TABLE t2; + --connection node_2 SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_nopk_unicode.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_nopk_unicode.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_nopk_unicode.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_nopk_unicode.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,5 +39,4 @@ SELECT f1 = 'текÑÑ‚2' FROM t1; SELECT f1 = 'текÑÑ‚2' FROM t1 WHERE f1 = 'текÑÑ‚2'; - DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -29,11 +29,13 @@ --let $galera_connection_name = node_2a --let $galera_server_number = 2 --source include/galera_connect.inc + --connection node_2a --sleep 1 SET SESSION wsrep_sync_wait=0; -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE 'Commit' or STATE = 'Waiting for certification'); -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE '%Waiting for table metadata lock%'; +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); + SELECT COUNT(*) AS EXPECT_0 FROM t1; SELECT COUNT(*) AS EXPECT_0 FROM t2; @@ -44,8 +46,11 @@ --eval SET SESSION wsrep_sync_wait = $wsrep_sync_wait_orig; SELECT COUNT(*) AS EXPECT_1 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t2; -SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE '%committed%' or STATE = 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); +--disable_query_log --eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log + DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test 
2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test 2025-05-19 16:14:24.000000000 +0000 @@ -67,4 +67,3 @@ --connection default DROP TABLE t1; DROP TABLE ten; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test 2025-05-19 16:14:24.000000000 +0000 @@ -91,5 +91,3 @@ DROP TABLE t1; DROP TABLE ten; DROP PROCEDURE p1; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_simple.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_simple.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_simple.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_simple.test 2025-05-19 16:14:24.000000000 +0000 @@ -48,10 +48,10 @@ --connection node_2 SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table metadata lock%'; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE 'Commit' or STATE = 'Waiting for certification'); +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification'); --source include/wait_condition.inc UNLOCK TABLES; @@ -61,7 +61,9 @@ SELECT COUNT(*) as expect_20 FROM t1; 
SELECT COUNT(*) as expect_20 FROM t2; +--disable_query_log --eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_partitioned_tables.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_partitioned_tables.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_partitioned_tables.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_partitioned_tables.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,133 @@ +--source include/galera_cluster.inc +--source include/have_partition.inc +--source include/have_innodb.inc +--source include/have_aria.inc + +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. Storage engine partition for table"); + +--echo # wsrep-mode= DEFAULT +SET GLOBAL wsrep_mode = ""; +SELECT @@wsrep_mode; +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB + PARTITION BY KEY (v1) + PARTITIONS 2; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +ALTER TABLE t1 ADD COLUMN v2 int; +ALTER TABLE t2 ADD COLUMN v2 int; +INSERT INTO t1 VALUES (1,1),(2,2); +INSERT INTO t2 VALUES (1,1),(2,2); +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +UPDATE t1 SET v3 = 3; +UPDATE t2 SET v3 = 3; +CREATE INDEX xx1 ON t1(v2); +CREATE INDEX xx2 ON t2(v2); +DROP INDEX xx1 ON t1; +DROP INDEX xx2 ON t2; +TRUNCATE TABLE t1; +TRUNCATE TABLE t2; +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +CREATE VIEW x2 AS SELECT * FROM t2_v2; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 + AFTER INSERT ON t1_v2 FOR EACH ROW + UPDATE t1_v2 SET t1_v2.v3 = t1_v2.v3+1; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t2 + AFTER INSERT ON t2_v2 FOR EACH ROW + UPDATE t2_v2 SET t2_v2.v3 = t2_v2.v3+1; + 
+--connection node_2 +SHOW CREATE TABLE t1_v2; +SHOW CREATE TABLE t2_v2; +SHOW CREATE VIEW x1; +SHOW CREATE VIEW x2; + +SELECT * FROM t1_v2; +SELECT * FROM t2_v2; + +--connection node_1 +DROP VIEW x1; +DROP VIEW x2; +DROP TRIGGER increment_before_t1; +DROP TRIGGER increment_before_t2; +DROP TABLE t1_v2; +DROP TABLE t2_v2; + +SET GLOBAL wsrep_mode = ""; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +--echo # wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB + PARTITION BY KEY (v1) + PARTITIONS 2; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE OR REPLACE TABLE t3 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +ALTER TABLE t1 ADD COLUMN v2 int; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +ALTER TABLE t2 ADD COLUMN v2 int; +INSERT INTO t1 VALUES (1,1),(2,2); +INSERT INTO t2 VALUES (1),(2); +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +UPDATE t1 SET v2 = v2 + 3; +UPDATE t2 SET v1 = v1 + 3; +CREATE INDEX xx1 ON t1(v2); +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE INDEX xx2 ON t2(v2); +DROP INDEX xx1 ON t1; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +DROP INDEX xx2 on t2; +TRUNCATE TABLE t1; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +TRUNCATE TABLE t2; +# At the moment can't restrict rename +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +RENAME TABLE t2_v2 TO t2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE VIEW x2 AS SELECT * FROM t2; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 + AFTER INSERT ON t1_v2 FOR EACH ROW + UPDATE t1_v2 SET t1_v2.v2 = t1_v2.v2+1; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE 
DEFINER=`root`@`localhost` TRIGGER increment_before_t2 + AFTER INSERT ON t2 FOR EACH ROW + UPDATE t2 SET t2.v1 = t2.v1+1; + +--connection node_2 +SHOW CREATE TABLE t1_v2; +SHOW CREATE TABLE t2; +SHOW CREATE VIEW x1; + +SELECT * FROM t1_v2; +SELECT * FROM t2; + +--connection node_1 +DROP VIEW x1; +DROP TRIGGER increment_before_t1; +DROP TABLE t1_v2; +# We allow dropping table +DROP TABLE t2; +SET GLOBAL wsrep_mode = ""; + +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +--echo # wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +ALTER TABLE t2 ENGINE=InnoDB; +DROP TABLE t2; + +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,8 +4,7 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_recovery.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_recovery.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_recovery.test 
2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_recovery.test 2025-05-19 16:14:24.000000000 +0000 @@ -33,8 +33,8 @@ # Perform --wsrep-recover and preserve the positions into variables by placing them in $MYSQL_TMP_DIR/galera_wsrep_start_position.inc and then --source'ing it ---exec $MYSQLD --defaults-group-suffix=.1 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.1.log > $MYSQL_TMP_DIR/galera_wsrep_recover.1.log 2>&1 ---exec $MYSQLD --defaults-group-suffix=.2 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.2.log > $MYSQL_TMP_DIR/galera_wsrep_recover.2.log 2>&1 +--exec $MYSQLD --defaults-group-suffix=.1 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --wsrep-recover --loose-innodb --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.1.log > $MYSQL_TMP_DIR/galera_wsrep_recover.1.log 2>&1 +--exec $MYSQLD --defaults-group-suffix=.2 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --wsrep-recover --loose-innodb --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.2.log > $MYSQL_TMP_DIR/galera_wsrep_recover.2.log 2>&1 --perl use strict; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] query_cache_type=1 query_cache_size=1355776 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_invalidate.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_invalidate.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_invalidate.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_invalidate.test 
2025-05-19 16:14:24.000000000 +0000 @@ -29,7 +29,7 @@ --connection node_3 --disable_query_log ---eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', MASTER_PORT=$NODE_MYPORT_1, master_use_gtid=current_pos +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_1, master_use_gtid=current_pos; --enable_query_log START SLAVE; --source include/wait_for_slave_to_start.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] query_cache_type=1 query_cache_size=1355776 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_read_only.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_read_only.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_read_only.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_read_only.test 2025-05-19 16:14:24.000000000 +0000 @@ -48,4 +48,3 @@ SET GLOBAL read_only=FALSE; DROP TABLE t1; DROP USER foo@localhost; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test 2025-05-19 16:14:24.000000000 +0000 @@ -25,7 +25,6 @@ SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 234; SELECT COUNT(*) = 1 FROM t2 WHERE f1 = REPEAT('b', 256); - --disable_query_log --eval SET GLOBAL wsrep_provider_options = '$wsrep_provider_options_orig'; --enable_query_log diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_nochanges.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_nochanges.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_nochanges.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_nochanges.test 2025-05-19 16:14:24.000000000 +0000 @@ -37,4 +37,3 @@ --source include/auto_increment_offset_restore.inc --source include/galera_end.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -17,4 +17,3 @@ wsrep_gtid_domain_id=16 gtid_domain_id=11 gtid_strict_mode=1 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.test 2025-05-19 16:14:24.000000000 +0000 @@ -40,6 +40,7 @@ --let $node_1 = node_1 --let $node_2 = replica +--let $node_3 = primary --source include/auto_increment_offset_save.inc --connection replica diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_savepoint_replay.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_savepoint_replay.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_savepoint_replay.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_savepoint_replay.test 2025-05-19 16:14:24.000000000 +0000 @@ -83,4 +83,3 @@ SELECT COUNT(*) = 1 FROM t1 WHERE f2 = 'c'; DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequence_engine.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequence_engine.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequence_engine.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequence_engine.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,13 @@ --source include/galera_cluster.inc --source include/have_sequence.inc +--connection node_2 +let $restore_wsrep_ignore_apply_errors=`SELECT @@GLOBAL.wsrep_ignore_apply_errors`; +SET GLOBAL wsrep_ignore_apply_errors=0; + +--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 +--connection node_2a +SET SESSION wsrep_sync_wait=0; SET GLOBAL wsrep_ignore_apply_errors=0; SET SESSION AUTOCOMMIT=0; SET SESSION max_error_count=0; @@ -11,6 +18,8 @@ --error ER_NO_SUCH_TABLE SHOW CREATE TABLE t0; ---connection node_1 -SET GLOBAL wsrep_ignore_apply_errors=DEFAULT; +--disable_query_log +--eval SET GLOBAL wsrep_ignore_apply_errors=$restore_wsrep_ignore_apply_errors +--enable_query_log +--disconnect node_2a diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,9 @@ !include ../galera_2nodes.cnf [mysqld.1] -log-bin -log-slave-updates auto-increment-increment=2 auto-increment-offset=1 [mysqld.2] -log-bin -log-slave-updates auto-increment-increment=2 auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.combinations mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.combinations --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] 
+log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,9 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc +--source include/have_aria.inc +--disable_ps2_protocol # # MDEV-19353 : Alter Sequence do not replicate to another nodes with in Galera Cluster # @@ -45,6 +48,7 @@ select NEXT VALUE FOR Seq1_1; --connection node_1 +SHOW CREATE SEQUENCE Seq1_1; DROP SEQUENCE Seq1_1; # @@ -316,6 +320,12 @@ DROP TABLE t1; DROP SEQUENCE t; +--connection node_2 +--let $wsrep_sync_wait_orig_2 = `SELECT @@wsrep_sync_wait` +SET SESSION wsrep_sync_wait=15; + +--connection node_1 + CREATE SEQUENCE t INCREMENT BY 0 CACHE=20 ENGINE=INNODB; CREATE TABLE t1(a int not null primary key default nextval(t), b int) engine=innodb; # @@ -338,6 +348,10 @@ SELECT * FROM t1; SELECT NEXTVAL(t); +--disable_query_log +--eval SET SESSION wsrep_sync_wait = $wsrep_sync_wait_orig_2 +--enable_query_log + --connection node_1 DROP TABLE t1; DROP SEQUENCE t; @@ -355,4 +369,17 @@ ALTER SEQUENCE IF EXISTS t MINVALUE=1; DROP TABLE t; + +--echo +--echo MDEV-32631: +--echo + +CREATE OR REPLACE TABLE t1(c INT ) ENGINE=ARIA; +SET SESSION WSREP_OSU_METHOD=RSU; +--error ER_NOT_SUPPORTED_YET +INSERT INTO t1 SELECT seq,concat(seq,1) FROM seq_1_to_100; +SET SESSION WSREP_OSU_METHOD=TOI; +DROP TABLE t1; + +--echo --echo End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] +log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,115 @@ +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/galera_have_debug_sync.inc + +--disable_ps2_protocol +# +# We create InnoDB seqeuence with small cache that is then +# used as default value for column in table. 
+# +--connection node_1 +--let $wsrep_local_replays_old = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY, f2 INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, 0), (3, 0); +--connection node_1 +START TRANSACTION; +INSERT INTO t1 VALUES (4, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (5, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (6, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (7, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (8, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (9, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (10, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (11, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (12, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (13, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (14, next value for s); # No conflict in cert +SELECT * FROM t1 WHERE f1 > 0 FOR UPDATE; # Should cause GAP lock between 1 and 3 + +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +SET SESSION wsrep_sync_wait=0; +# Block the applier on node #1 and issue a conflicting update on node #2 +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_set_sync_point.inc + +# +# Send conflicting INSERT +# +--connection node_2 +INSERT INTO t1 VALUES (2, 2); # This should BF abort because of GAP lock + +--connection node_1a +--source include/galera_wait_sync_point.inc +--source include/galera_clear_sync_point.inc + +# Block the commit, send the COMMIT and wait until it gets blocked +--let $galera_sync_point = commit_monitor_master_enter_sync +--source include/galera_set_sync_point.inc + +--connection node_1 +--send COMMIT + +--connection node_1a + +--let $galera_sync_point = 
apply_monitor_slave_enter_sync commit_monitor_master_enter_sync +--source include/galera_wait_sync_point.inc +--source include/galera_clear_sync_point.inc + +--let $galera_sync_point = abort_trx_end +--source include/galera_set_sync_point.inc +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_signal_sync_point.inc +--let $galera_sync_point = abort_trx_end commit_monitor_master_enter_sync +--source include/galera_wait_sync_point.inc + +# Let the transactions proceed +--source include/galera_clear_sync_point.inc +--let $galera_sync_point = abort_trx_end +--source include/galera_signal_sync_point.inc +--let $galera_sync_point = commit_monitor_master_enter_sync +--source include/galera_signal_sync_point.inc + +# Commit succeeds +--connection node_1 +--reap + +# wsrep_local_replays has increased by 1 +--let $wsrep_local_replays_new = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` +--disable_query_log +--eval SELECT $wsrep_local_replays_new - $wsrep_local_replays_old = 1 AS wsrep_local_replays; +--enable_query_log + +INSERT INTO t1 VALUES (22, next value for s); +INSERT INTO t1 VALUES (23, next value for s); +INSERT INTO t1 VALUES (24, next value for s); +INSERT INTO t1 VALUES (25, next value for s); +INSERT INTO t1 VALUES (26, next value for s); +INSERT INTO t1 VALUES (27, next value for s); +INSERT INTO t1 VALUES (28, next value for s); +INSERT INTO t1 VALUES (29, next value for s); +INSERT INTO t1 VALUES (30, next value for s); +INSERT INTO t1 VALUES (31, next value for s); +INSERT INTO t1 VALUES (32, next value for s); +INSERT INTO t1 VALUES (33, next value for s); +INSERT INTO t1 VALUES (34, next value for s); +INSERT INTO t1 VALUES (35, next value for s); + +--connection node_1 +SELECT * FROM t1; +SELECT LASTVAL(s); + +--connection node_2 +SELECT * FROM t1; +SELECT LASTVAL(s); + +--connection node_1 +SELECT NEXTVAL(s); + +--connection node_2 +SELECT NEXTVAL(s); + +DROP 
SEQUENCE s; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.combinations mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.combinations --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] +log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,255 @@ +--source include/galera_cluster.inc +--source include/have_sequence.inc + +--disable_ps2_protocol +# +# Case 1: Separate transactions from few connections +# +--connection node_1 +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; + +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 + +--connection node_1 +BEGIN; +INSERT INTO 
t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_2 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_2a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_1a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_2 +SELECT LASTVAL(s); +--connection node_1 +SELECT LASTVAL(s); +--connection node_2a +SELECT LASTVAL(s); +--connection node_1a +SELECT LASTVAL(s); + +--connection node_1 +SELECT * FROM t1; +--connection node_2 +SELECT * FROM t1; + +--connection node_1 +DROP TABLE t1; +DROP SEQUENCE s; + +# +# Case 2: All rollback +# +--connection node_1 +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; + +--connection node_1 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO 
t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_2 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_2a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_1a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_2 +SELECT LASTVAL(s); +--connection node_1 +SELECT LASTVAL(s); +--connection node_2a +SELECT LASTVAL(s); +--connection node_1a +SELECT LASTVAL(s); + +--connection node_1 +SELECT * FROM t1; +--connection node_2 +SELECT * FROM t1; + +--connection node_1 +DROP TABLE t1; +DROP SEQUENCE s; +# +# Case 3: Mixed transactions +# +--connection node_1 +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; + +--connection node_1 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); 
+INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_1a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_2a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_2 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_1 +COMMIT; +--connection node_1a +ROLLBACK; +--connection node_2 +--error ER_LOCK_DEADLOCK +COMMIT; +--connection node_2a +--error ER_LOCK_DEADLOCK +ROLLBACK; + +--connection node_2 +SELECT LASTVAL(s); +--connection node_1 +SELECT LASTVAL(s); +--connection node_2a +SELECT LASTVAL(s); +--connection node_1a +SELECT LASTVAL(s); + +--connection node_1 +SELECT * FROM t1; +--connection node_2 +SELECT * FROM t1; + +--connection node_1 +DROP TABLE t1; +DROP SEQUENCE s; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_server.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_server.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_server.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_server.test 2025-05-19 16:14:24.000000000 +0000 @@ -25,4 +25,3 @@ --source include/galera_end.inc --echo # End of test - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test 2025-05-19 16:14:24.000000000 +0000 @@ -95,4 +95,5 @@ --connection node_1 DROP TABLE t1; SET GLOBAL wsrep_slave_threads = DEFAULT; + --source include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_slave_replay.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_slave_replay.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_slave_replay.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_slave_replay.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,6 +6,7 @@ # or rollback and replay (depending on the nature of lock conflict). 
# +--source include/galera_cluster.inc --source include/have_innodb.inc --source include/have_log_bin.inc --source include/have_debug.inc @@ -13,9 +14,7 @@ --source include/galera_have_debug_sync.inc --connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 - --connection node_2a ---source include/galera_cluster.inc ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sp_bf_abort.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_sp_bf_abort.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sp_bf_abort.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sp_bf_abort.inc 2025-05-19 16:14:24.000000000 +0000 @@ -35,4 +35,3 @@ --source include/galera_signal_sync_point.inc --let $galera_sync_point = after_replicate_sync --source include/galera_signal_sync_point.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_split_brain.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_split_brain.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_split_brain.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_split_brain.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,6 +13,7 @@ --let $node_2=node_2 --source include/auto_increment_offset_save.inc +--connection node_2 call mtr.add_suppression("WSREP: TO isolation failed for: "); --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,7 +17,6 @@ INSERT INTO t1 VALUES (2); - --connection node_2 SELECT COUNT(*) = 2 FROM t1; SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=10M;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=10M;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,10 @@ --source include/galera_cluster.inc --source include/have_innodb.inc --source include/big_test.inc +--source include/have_perfschema.inc + +# Verify that SSL is handled by the provider. 
+SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,11 @@ +!include ../galera_2nodes.cnf + +[mysqld] +loose-galera-ssl-cipher=1 +wsrep-debug=1 + +[mysqld.1] +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;cert.log_conflicts=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + +[mysqld.2] +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;cert.log_conflicts=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,82 @@ +# +# Test upgrading the SSL cipher +# + +--source 
include/galera_cluster.inc +--source include/have_ssl_communication.inc +--source include/have_openssl.inc +--source include/force_restart.inc + +# +# Lowest supported Galera library version +# +--let $galera_version=26.4.21 +source ../wsrep/include/check_galera_version.inc; + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +# Setup galera ports +--connection node_1 +--source suite/galera/include/galera_base_port.inc +--let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT + +--connection node_2 +--source suite/galera/include/galera_base_port.inc +--let $NODE_GALERAPORT_2 = $_NODE_GALERAPORT + +SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 2. Restart node #1 with a socket.ssl_cipher + +--connection node_1 +--source include/shutdown_mysqld.inc +--let $restart_noprint = 1 +--let $start_mysqld_params = --wsrep-cluster-address=gcomm://127.0.0.1:$NODE_GALERAPORT_2 --wsrep_provider_options=base_port=$NODE_GALERAPORT_1;socket.ssl=yes;socket.ssl_ca=$MYSQL_TEST_DIR/std_data/galera-upgrade-ca-cert.pem;socket.ssl_cert=$MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=$MYSQL_TEST_DIR/std_data/galera-key.pem;socket.ssl_cipher=AES256-SHA +--source include/start_mysqld.inc +--source include/wait_until_connected_again.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 3. 
Restart node #2 with the new socket.ssl_ca , socket.ssl_cert, socket.ssl_key and socket.ssl_cipher + +--connection node_2 +--source include/shutdown_mysqld.inc +--let $start_mysqld_params = --wsrep_provider_options=base_port=$NODE_GALERAPORT_2;socket.ssl=yes;socket.ssl_ca=$MYSQL_TEST_DIR/std_data/galera-upgrade-ca-cert.pem;socket.ssl_cert=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-cert.pem;socket.ssl_key=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-key.pem;socket.ssl_cipher=AES256-SHA +--source include/start_mysqld.inc +--source include/wait_until_connected_again.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 4. Restart node #1 with the new socket.ssl_ca , socket.ssl_cert, socket.ssl_key and socket.ssl_cipher + +--connection node_1 +--source include/shutdown_mysqld.inc +--let $start_mysqld_params = --wsrep-cluster-address=gcomm://127.0.0.1:$NODE_GALERAPORT_2 --wsrep_provider_options=base_port=$NODE_GALERAPORT_1;socket.ssl=yes;socket.ssl_ca=$MYSQL_TEST_DIR/std_data/galera-upgrade-ca-cert.pem;socket.ssl_cert=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-cert.pem;socket.ssl_key=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-key.pem;socket.ssl_cipher=AES256-SHA +--source include/start_mysqld.inc +--source include/wait_until_connected_again.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 5. Make sure node_2 is ready as well +--connection node_2 +--source include/galera_wait_ready.inc + +# Upgrade complete. 
Both nodes now use the new key and certificate + +# Restore original auto_increment_offset values. +--source include/auto_increment_offset_restore.inc + +--connection node_1 +call mtr.add_suppression("WSREP: write_handler\\(\\)"); +--connection node_2 +call mtr.add_suppression("WSREP: write_handler\\(\\)"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,10 @@ --source include/galera_cluster.inc --source include/have_innodb.inc --source include/big_test.inc +--source include/have_perfschema.inc + +# Verify that SSL is handled by the provider. +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ wsrep-debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem' 
+wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,9 @@ --source include/have_openssl.inc --source include/force_restart.inc +# Verify that SSL is handled by the provider. +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_encrypted.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_encrypted.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_encrypted.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_encrypted.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -11,7 +11,7 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,12 +6,12 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' innodb_fast_shutdown=0 innodb_undo_tablespaces=0 [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' innodb_fast_shutdown=0 innodb_undo_tablespaces=3 loose_innodb_log_file_buffering diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,11 +6,11 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/data_dir_test -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,10 +6,10 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,28 @@ +!include ../galera_2nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +gtid_strict_mode=ON +wsrep-gtid_mode=ON +log-bin +log-slave_updates +loose-galera-sst-mariabackup-gtid=1 + +[mysqld.1] +wsrep_provider_options='pc.weight=2;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 + +[mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,29 @@ +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc +--source include/force_restart.inc + +# Save original auto_increment_offset values. 
+--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +--source suite/galera/include/galera_st_shutdown_slave.inc +--source suite/galera/include/galera_st_clean_slave.inc + +--source suite/galera/include/galera_st_kill_slave.inc +--source suite/galera/include/galera_st_kill_slave_ddl.inc + +# Restore original auto_increment_offset values. +--source include/auto_increment_offset_restore.inc + +--connection node_1 +--echo # Node_1 +SHOW global variables like 'gtid%pos'; + +--connection node_2 +--echo # Node_2 +SHOW global variables like 'gtid%pos'; + +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,10 +6,10 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ wsrep_sst_auth="root:" [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,10 +7,10 @@ innodb-file-per-table=ON [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,4 +5,4 @@ wsrep_sst_auth="root:" [mariabackup] -use_memory=123m +use_memory=129m diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test 2025-05-19 16:14:24.000000000 +0000 @@ -40,8 +40,8 @@ --source include/wait_condition.inc # Confirm that IST did not take place ---let $assert_text = mariabackup: Using 128974848 bytes for buffer pool \(set by --use-memory parameter\) ---let $assert_select = mariabackup: Using 128974848 bytes for buffer pool \(set by --use-memory parameter\) +--let $assert_text = mariabackup: Using 134217728 bytes for buffer pool \(set by --use-memory parameter\) +--let $assert_select = mariabackup: Using 134217728 bytes for buffer pool \(set by --use-memory parameter\) --let $assert_count = 1 --let $assert_file = $MYSQLTEST_VARDIR/mysqld.2/data/mariabackup.prepare.log --let $assert_only_after = Starting InnoDB instance for recovery diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ # causes the first MTR connection to be forefully dropped by Galera, which in turn confuses MTR [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,10 +9,10 @@ loose-galera_sst_mysqldump_with_key=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [client] ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync2.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync2.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync2.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync2.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,11 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=@ENV.MYSQLTEST_VARDIR/mysqld.1/server1_binlog log_bin_index=@ENV.MYSQLTEST_VARDIR/tmp/server1_binlog_index.index [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=@ENV.MYSQLTEST_VARDIR/mysqld.2/server2_binlog log_bin_index=@ENV.MYSQLTEST_VARDIR/tmp/server2_binlog_index.index diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,9 +4,9 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=server1_binlog [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=server2_binlog diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,11 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/rsync_test_2 -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' [sst] backup_threads=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,7 @@ ssl-mode=VERIFY_CA [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -8,7 +8,7 @@ tcert=@ENV.MYSQL_TEST_DIR/std_data/server-cert.pem [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,7 +10,7 @@ ssl-mode=VERIFY_CA [mysqld.1] 
-wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,23 @@ +!include ../galera_2nodes.cnf + +[mysqld] +wsrep_sst_method=rsync +gtid_strict_mode=ON +wsrep-gtid_mode=ON +log-bin +log-slave_updates +loose-galera-sst-rsync-gtid=1 + +[mysqld.1] +wsrep_provider_options='pc.weight=2;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 + +[mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,29 @@ +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc +--source include/force_restart.inc + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +--source suite/galera/include/galera_st_shutdown_slave.inc +--source suite/galera/include/galera_st_clean_slave.inc + +--source suite/galera/include/galera_st_kill_slave.inc +--source suite/galera/include/galera_st_kill_slave_ddl.inc + +# Restore original auto_increment_offset values. +--source include/auto_increment_offset_restore.inc + +--connection node_1 +--echo # Node_1 +SHOW global variables like 'gtid%pos'; + +--connection node_2 +--echo # Node_2 +SHOW global variables like 'gtid%pos'; + +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,11 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_basename=server1 log_bin [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_basename=server2 log_bin diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,14 +5,14 @@ bind-address=:: [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_incoming_address='[::1]:@mysqld.1.port' wsrep_node_address=::1 wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' [mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +wsrep_node_incoming_address='[::1]:@mysqld.2.port' wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;gcache.size=1;pc.ignore_sb=true' wsrep_node_address=::1 -wsrep_node_incoming_address='[::1]:@mysqld.2.port' wsrep_sst_receive_address=AUTO diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_cluster.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_cluster.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_cluster.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_cluster.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,5 +14,3 @@ SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_index.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_index.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_index.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_index.test 2025-05-19 16:14:24.000000000 +0000 @@ -12,7 +12,6 @@ --connection node_2 INSERT INTO wsrep_local_indexes VALUES ((SELECT variable_value FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE variable_name = 'wsrep_local_index')); - --connection node_1 SELECT COUNT(*) = 2 FROM wsrep_local_indexes; SELECT COUNT(DISTINCT wsrep_local_index) = 2 FROM wsrep_local_indexes; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_state.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_state.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_state.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_state.test 2025-05-19 16:14:24.000000000 
+0000 @@ -22,7 +22,3 @@ --source include/wait_condition.inc SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_innodb.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_innodb.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_innodb.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_innodb.test 2025-05-19 16:14:24.000000000 +0000 @@ -15,9 +15,10 @@ # In both cases apply flood control if >= 10 same warning # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc -call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine .*"); +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. Storage engine "); CREATE TABLE t1(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=MYISAM; @@ -114,4 +115,3 @@ SET GLOBAL log_warnings=DEFAULT; SET GLOBAL wsrep_mode=DEFAULT; --disable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_primary_key.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_primary_key.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_primary_key.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_primary_key.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,9 +14,10 @@ # In both cases apply flood control if >= 10 same warning # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc -call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled. Table .*"); +call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled\\. 
Table "); CREATE TABLE t1(a int, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int, b varchar(50)) ENGINE=MYISAM; @@ -140,4 +141,3 @@ SET GLOBAL log_warnings=DEFAULT; SET GLOBAL wsrep_mode=DEFAULT; --disable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_suspend_slave.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_suspend_slave.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_suspend_slave.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_suspend_slave.test 2025-05-19 16:14:24.000000000 +0000 @@ -67,4 +67,3 @@ # Restore original auto_increment_offset values. --let $node_2=node_2a --source include/auto_increment_offset_restore.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---wsrep-sync-wait=0 --wsrep-causal-reads=OFF \ No newline at end of file +--wsrep-sync-wait=0 --wsrep-causal-reads=OFF diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,8 +3,8 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_debug.inc ---source include/have_debug_sync.inc CREATE TABLE t1 (f1 INTEGER) ENGINE=InnoDB; INSERT INTO t1 VALUES (1); @@ -44,7 +44,6 @@ --eval SELECT WSREP_SYNC_WAIT_UPTO_GTID('$wsrep_last_committed_gtid') AS WSREP_SYNC_WAIT_UPTO; --enable_query_log - # Timeout 
if GTID is not received on time --disable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_table_with_hyphen.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_table_with_hyphen.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_table_with_hyphen.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_table_with_hyphen.inc 2025-05-19 16:14:24.000000000 +0000 @@ -45,4 +45,3 @@ --connection node_2 --eval drop table `$fk_child` --eval drop table `$fk_parent` - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_temporary_sequences.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_temporary_sequences.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_temporary_sequences.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_temporary_sequences.test 2025-05-19 16:14:24.000000000 +0000 @@ -30,7 +30,6 @@ SHOW CREATE TABLE seq1; SHOW CREATE TABLE seq2; - --connection node_1 DROP TABLE t; DROP SEQUENCE seq1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -13,5 +13,3 @@ log_slave_updates=ON wsrep_sst_method=rsync thread_handling = pool-of-threads - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,6 @@ --let $node_1 = node_1 --let $node_2 = node_2 - --source ../galera/include/auto_increment_offset_save.inc # diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,43 +1,81 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/galera_have_debug_sync.inc # # In this test, we simultaneously send two non-conflicting ALTER TABLE statements # +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 +--connection node_1 CREATE TABLE t1 (f1 INTEGER PRIMARY KEY AUTO_INCREMENT, f2 INTEGER); +INSERT INTO t1(f2) SELECT seq FROM seq_1_to_1000; ---connection node_2 +--connection node_2a +SET SESSION wsrep_sync_wait=0; --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; --source include/wait_condition.inc ---send ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 123); +--let $wait_condition = SELECT COUNT(*) = 1000 FROM t1; +--source include/wait_condition.inc + +--connection node_1a +--echo # Block the applier on node_1 and issue a ddl from node_2 +SET SESSION wsrep_sync_wait=0; +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_set_sync_point.inc +--connection node_2 +--echo # DDL 1 +--send ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 VALUES (NULL, 10000, 10000); + +--connection node_1a +--source include/galera_wait_sync_point.inc +--source include/galera_clear_sync_point.inc + +--echo # This will block on acquiring total order isolation --connection node_1 +--echo # DDL 2 --send CREATE UNIQUE INDEX i1 ON t1(f2); 
+--connection node_1a +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'acquiring total order%' or STATE LIKE 'Waiting for table metadata%' +--source include/wait_condition.inc + +--echo # Signal DDL 1 +--source include/galera_clear_sync_point.inc +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_signal_sync_point.inc + --connection node_2 --reap -INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 234); +--connection node_1 +--reap + +--connection node_2 --let $wait_condition = SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM t1; +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; +SHOW CREATE TABLE t1; +SELECT COUNT(*) AS EXPECT_1001 FROM t1; --connection node_1 ---reap - --let $wait_condition = SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM t1; +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; +SHOW CREATE TABLE t1; +SELECT COUNT(*) AS EXPECT_1001 FROM t1; DROP TABLE t1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ftwrl.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ftwrl.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ftwrl.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ftwrl.test 2025-05-19 16:14:24.000000000 +0000 @@ -19,4 +19,3 @@ SHOW CREATE TABLE t1; DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_transaction_read_only.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_transaction_read_only.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_transaction_read_only.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_transaction_read_only.test 2025-05-19 16:14:24.000000000 +0000 @@ -55,4 +55,3 @@ --enable_query_log DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_udf.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_udf.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_udf.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_udf.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,10 +6,3 @@ [mysqld.2] query_cache_type=1 - - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_unicode_identifiers.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_unicode_identifiers.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_unicode_identifiers.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_unicode_identifiers.test 2025-05-19 16:14:24.000000000 +0000 @@ -75,4 +75,3 @@ DROP DATABASE `база`; DROP DATABASE `втора база`; --eval SET GLOBAL wsrep_sync_wait = $wsrep_sync_wait_orig - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_v1_row_events.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_v1_row_events.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_v1_row_events.cnf 2025-01-30 11:01:23.000000000 
+0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_v1_row_events.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,10 +4,3 @@ log-bin-use-v1-row-events=1 [mysqld.2] - - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_OSU_method2.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_OSU_method2.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_OSU_method2.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_OSU_method2.test 2025-05-19 16:14:24.000000000 +0000 @@ -44,4 +44,3 @@ --connection node_1a SET DEBUG_SYNC= 'RESET'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test 2025-05-19 16:14:24.000000000 +0000 @@ -94,11 +94,13 @@ --eval SET GLOBAL wsrep_auto_increment_control = $auto_increment_control_orig --eval SET GLOBAL auto_increment_increment = $auto_increment_increment_node1 --eval SET GLOBAL auto_increment_offset = $auto_increment_offset_node1 +--disconnect node_1a --connection node_2 --eval SET GLOBAL wsrep_auto_increment_control = $auto_increment_control_orig --eval SET GLOBAL auto_increment_increment = $auto_increment_increment_node2 --eval SET GLOBAL auto_increment_offset = $auto_increment_offset_node2 +--disconnect node_2a --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,9 
+5,3 @@ [mysqld.2] wsrep-auto-increment-control=ON - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test 2025-05-19 16:14:24.000000000 +0000 @@ -22,6 +22,8 @@ DROP TABLE t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc SHOW TABLES; # Drop schema that does not exist @@ -33,6 +35,8 @@ DROP SCHEMA s1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME LIKE 's1'; +--source include/wait_condition.inc SHOW SCHEMAS; # Drop index that does not exist using DROP INDEX @@ -45,6 +49,10 @@ DROP INDEX idx1 ON t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_INDEXES WHERE NAME LIKE 'idx1'; +--source include/wait_condition.inc SHOW CREATE TABLE t1; DROP TABLE t1; @@ -58,6 +66,10 @@ ALTER TABLE t1 DROP INDEX idx1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_INDEXES WHERE NAME LIKE 'idx1'; +--source include/wait_condition.inc SHOW CREATE TABLE t1; DROP TABLE t1; @@ -71,6 +83,11 @@ ALTER TABLE t1 DROP COLUMN f2; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source 
include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS WHERE NAME LIKE 'f2'; +--source include/wait_condition.inc + SHOW CREATE TABLE t1; DROP TABLE t1; @@ -93,6 +110,10 @@ SELECT COUNT(*) AS expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM t1; +--source include/wait_condition.inc SELECT COUNT(*) AS expect_0 FROM t1; DROP TABLE t1; @@ -112,6 +133,10 @@ SELECT COUNT(*) AS expect_1 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc SELECT COUNT(*) AS expect_1 FROM t1; DROP TABLE t1; @@ -136,6 +161,8 @@ SELECT COUNT(*) AS expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM t1; --source include/wait_condition.inc SELECT VARIABLE_VALUE expect_Primary FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; @@ -171,6 +198,8 @@ SELECT COUNT(*) AS expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM t1; --source include/wait_condition.inc SELECT VARIABLE_VALUE expect_Primary FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; @@ -202,6 +231,8 @@ SELECT COUNT(*) expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE 
NAME LIKE 'test/t1'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM t1; --source include/wait_condition.inc SELECT VARIABLE_VALUE = 'Primary' FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; @@ -219,6 +250,10 @@ INSERT INTO child VALUES (1,1),(2,2),(3,3); --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/parent'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/child'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 3 FROM child; --source include/wait_condition.inc @@ -233,6 +268,10 @@ SELECT COUNT(*) AS expect_0 FROM child; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/parent'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/child'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM child; --source include/wait_condition.inc SELECT VARIABLE_VALUE = 'Primary' FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem [mysqld.1] 
-wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;repl.causal_read_timeout=PT90S;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_incoming_address='[::1]:@mysqld.1.port' wsrep_node_address=[::1]:@mysqld.1.#galera_port wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' @@ -14,7 +14,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;repl.causal_read_timeout=PT90S;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_incoming_address='[::1]:@mysqld.2.port' wsrep_node_address=[::1]:@mysqld.2.#galera_port wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test 
2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,6 @@ # --source include/galera_cluster.inc ---source include/have_innodb.inc --source include/have_aria.inc CREATE TABLE t1 (f1 INT PRIMARY KEY) Engine=Aria; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test 2025-05-19 16:14:24.000000000 +0000 @@ -234,4 +234,3 @@ --connection node_2 SET GLOBAL wsrep_mode = DEFAULT; --enable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,6 @@ # --source include/galera_cluster.inc ---source include/have_innodb.inc CREATE TABLE t1 (f1 INT PRIMARY KEY) Engine=MyISAM; INSERT INTO t1 VALUES (1); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,6 +21,11 @@ INSERT INTO t1 SELECT 4 FROM DUAL UNION ALL SELECT 5 FROM DUAL; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 5 FROM t1; +--source 
include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_5 FROM t1; DROP TABLE t1; @@ -36,6 +41,13 @@ REPLACE INTO t1 SELECT 3, 'yyy' FROM DUAL; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 3 FROM t1; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 3 AND f2 = 'yyy'; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_3 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t1 WHERE f1 = 1 AND f2 = 'klm'; SELECT COUNT(*) AS EXPECT_1 FROM t1 WHERE f1 = 2 AND f2 = 'xyz'; @@ -49,6 +61,9 @@ UPDATE t1 SET f2 = 'zzz' WHERE f2 = 'yyy'; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1 WHERE f2 = 'zzz'; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_1 FROM t1 WHERE f2 = 'zzz'; # @@ -59,6 +74,9 @@ DELETE FROM t1 WHERE f2 = 'zzz'; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM t1 WHERE f2 = 'zzz'; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_0 FROM t1 WHERE f2 = 'zzz'; # @@ -69,6 +87,9 @@ TRUNCATE TABLE t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM t1; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_0 FROM t1; DROP TABLE t1; @@ -77,8 +98,8 @@ # --connection node_1 -CREATE TABLE t1 (f1 INTEGER) ENGINE=MyISAM; -CREATE TABLE t2 (f1 INTEGER) ENGINE=InnoDB; +CREATE TABLE t1 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=MyISAM; +CREATE TABLE t2 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=InnoDB; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES (1); @@ -86,6 +107,15 @@ COMMIT; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; +--source 
include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t2; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_1 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t2; @@ -100,6 +130,11 @@ ROLLBACK; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 2 FROM t1; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t2; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_2 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t2; @@ -119,13 +154,20 @@ INSERT INTO t2 VALUES (1); --connection node_2 -# The MyISAM update is replicated immediately, so a duplicate key error happens even before the COMMIT +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc +# The MyISAM update is replicated when executed, so a duplicate key error happens even before the COMMIT --error ER_DUP_ENTRY INSERT INTO t1 VALUES (1); --connection node_1 COMMIT; DROP TABLE t1, t2; + # # Test prepared staments # @@ -146,6 +188,10 @@ SELECT * FROM t1 ORDER BY id; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 11 FROM t1; +--source include/wait_condition.inc SELECT * FROM t1 ORDER BY id; DROP TABLE t1; @@ -172,6 +218,10 @@ SELECT * FROM t1 ORDER BY id; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 11 FROM t1; +--source 
include/wait_condition.inc SELECT * FROM t1 ORDER BY id; DROP PROCEDURE proc; @@ -195,26 +245,46 @@ SELECT * FROM t2 ORDER BY id; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; +--source include/wait_condition.inc +SELECT COUNT(*) FROM t1; +--let $wait_condition = SELECT COUNT(*) = 10 FROM t1; +--source include/wait_condition.inc SELECT * FROM t1 ORDER BY id; SELECT * FROM t2 ORDER BY id; DROP TRIGGER tr1; DROP TRIGGER tr2; DROP TRIGGER tr3; -DROP TABLE t1,t2; +DROP TABLE t1, t2; + +CREATE TABLE t1 (a INT, b INT, UNIQUE(a)) ENGINE=MyISAM; +CREATE TRIGGER tr1 BEFORE INSERT ON t1 FOR EACH ROW SET NEW.a=1; +INSERT INTO t1 (a,b) VALUES (10,20); +SELECT * from t1; + +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc + +SELECT * from t1; +--connection node_1 +DROP TABLE t1; --echo # --echo # MDEV-11152: wsrep_replicate_myisam: SELECT gets replicated using TO --echo # --connection node_1 -CREATE TABLE t1 (i INT) ENGINE=INNODB; +CREATE TABLE t1 (i INT NOT NULL PRIMARY KEY) ENGINE=INNODB; INSERT INTO t1 VALUES(1); # This command should not get replicated. 
SELECT * FROM t1; DROP TABLE t1; ---connection node_1 ---disable_query_log SET GLOBAL wsrep_mode = DEFAULT; + --connection node_2 SET GLOBAL wsrep_mode = DEFAULT; ---enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_slave_threads.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_slave_threads.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_slave_threads.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_slave_threads.test 2025-05-19 16:14:24.000000000 +0000 @@ -15,11 +15,15 @@ --connection node_1 --let $wsrep_slave_threads_orig = `SELECT @@wsrep_slave_threads` + CREATE TABLE t1 (f1 INT PRIMARY KEY) Engine=InnoDB; CREATE TABLE t2 (f1 INT AUTO_INCREMENT PRIMARY KEY) Engine=InnoDB; --connection node_2 +--let $wsrep_slave_threads_orig_2 = `SELECT @@wsrep_slave_threads` + CALL mtr.add_suppression("WSREP: Refusing exit for the last slave thread\\."); + # Setting wsrep_slave_threads to zero triggers a warning SET GLOBAL wsrep_slave_threads = 0; SHOW WARNINGS; @@ -74,7 +78,9 @@ --let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_applier_thread_count'; --source include/wait_condition.inc ---eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig_2; +--enable_query_log DROP TABLE t1; DROP TABLE t2; @@ -94,6 +100,11 @@ SET GLOBAL wsrep_slave_threads = 1; --connection node_1 + +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log + INSERT INTO t1 VALUES (DEFAULT); INSERT INTO t1 VALUES (DEFAULT); INSERT INTO t1 VALUES (DEFAULT); @@ -106,6 +117,10 @@ --connection node_2 +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig_2; +--enable_query_log + # Wait until above DDL is replicated # # make sure that we are left with exactly one 
applier thread before we leaving the test diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_wsrep_mode.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_wsrep_mode.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_wsrep_mode.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_wsrep_mode.test 2025-05-19 16:14:24.000000000 +0000 @@ -57,9 +57,3 @@ # reset SET GLOBAL wsrep_mode=DEFAULT; - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,20 @@ +!include ../galera_4nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 + +[mysqld.1] +wsrep_node_name='node_1' + +[mysqld.2] +wsrep_node_name='node_2' + +[mysqld.3] +wsrep_node_name='node_3' + +[mysqld.4] +wsrep_node_name='node_4' +wsrep_sst_donor='node_1' + +[ENV] +galera_cluster_size=4 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,165 @@ +# +# Test a case where a joiner encounters an error during IST +# Instead of voting it should assume error and bail out. +# + +--source include/galera_cluster.inc +--source include/big_test.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +# Make sure that the test is operating on the right version of galera library. 
+--let $galera_version=26.4.19 +source ../wsrep/include/check_galera_version.inc; + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--let $node_4=node_4 +--source ../include/auto_increment_offset_save.inc + +# create table t1 and procedure p1 to generate wirtesets +--connection node_1 +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); + +DELIMITER |; +CREATE PROCEDURE p1(IN max INT) +BEGIN + DECLARE i INT; + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; + + SET i = 0; + WHILE i < max DO + INSERT IGNORE INTO t1 VALUES (DEFAULT); + SET i = i + 1; + END WHILE; +END| +DELIMITER ;| + +CALL p1(130); + +--connection node_4 +--echo Shutting down server 4... +--let $node_4_server_id= `SELECT @@server_id` +--let $node_4_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_4_server_id.expect +--let $node_4_pid_file= `SELECT @@pid_file` +--source include/shutdown_mysqld.inc + +# Wait for node #4 to leave cluster +--let $members = 3 +--connection node_1 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_3 +--source include/wsrep_wait_membership.inc +--echo Server 4 left the cluster + +# Create some writesets for IST +--connection node_1 +CALL p1(130); + +# Create a writeset that node 4 won't be able to apply by creating a table +# that won't be present in the replication stream +--connection node_1 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_2 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_3 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +# This should cause error during IST +INSERT INTO t2 VALUES (DEFAULT); + +# make sure nodes 1,2,3 progress far enough for commit cut update +CALL p1(130); + +--connection node_1 +# prepare to stop SST donor 
thread when it receives a request from starting node #4 +SET GLOBAL debug = "+d,sync.wsrep_sst_donor_after_donation"; + +--echo Restarting server 4 +# Need to use this form instead of start_mysqld.inc because the latter is blocking +--exec echo "restart:$start_mysqld_params" > $node_4_expect_file_name + +--echo Wait for server 1 to become a donor +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached"; +--echo Server 1 got SST request from server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_sst_donor_after_donation_continue"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; + +# +# After this point node #4 shall proceed to IST and bail out +# + +--echo Waiting for server 4 to leave the cluster +--let $members = 3 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_3 +--source include/wsrep_wait_membership.inc + +--connection node_4 +--echo Server 4 left the cluster, killing it... + +# Kill the connected server +--exec echo "wait" > $node_4_expect_file_name +--let KILL_NODE_PIDFILE = $node_4_pid_file +--perl + my $pid_filename = $ENV{'KILL_NODE_PIDFILE'}; + my $mysqld_pid = `cat $pid_filename`; + chomp($mysqld_pid); + system("kill -9 $mysqld_pid"); + exit(0); +EOF +--echo Killed server 4... +--source include/wait_until_disconnected.inc +--echo Restarting server 4... 
+--source include/start_mysqld.inc +--source include/galera_wait_ready.inc + +# Confirm node #4 has rejoined +--connection node_1 +--let $members = 4 +--source include/wsrep_wait_membership.inc + +# Confirm that all is good and all nodes have identical data + +--connection node_1 +SELECT count(*) AS expect1_390 FROM t1; +SELECT count(*) AS expect1_1 FROM t2; + +--connection node_2 +SELECT count(*) AS expect2_390 FROM t1; +SELECT count(*) AS expect2_1 FROM t2; + +--connection node_3 +SELECT count(*) AS expect3_390 FROM t1; +SELECT count(*) AS expect3_1 FROM t2; + +--connection node_4 +SELECT count(*) AS expect4_390 FROM t1; +SELECT count(*) AS expect4_1 FROM t2; + +DROP TABLE t1; +DROP TABLE t2; +DROP PROCEDURE p1; + +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Failed on preordered"); +CALL mtr.add_suppression("Failed to apply write set"); +CALL mtr.add_suppression("Sending JOIN failed: -103"); +CALL mtr.add_suppression("Failed to JOIN the cluster after SST"); + +--source ../include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,21 @@ +!include ../galera_4nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 + +[mysqld.1] +wsrep_node_name='node_1' + +[mysqld.2] +wsrep_node_name='node_2' + +[mysqld.3] +wsrep_node_name='node_3' + +[mysqld.4] +wsrep_node_name='node_4' +wsrep_sst_donor='node_1' + +[ENV] +galera_cluster_size=4 +MTR_SST_JOINER_DELAY=20 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,73 @@ +# +# Test a case where a vote happens in JOINED state after SST on a writeset +# that should be applied. +# + +--source galera_vote_joined_begin.inc +# +# At this point state snapshot has been copied, node 1 is operational and +# we have about 10 seconds while everything we do will go into the replication +# queue on node 4 which it will have to apply on top of the snapshot. +# + +# Increase replication queue on node_4 +--connection node_1 +CALL p1(130); + +# Create a writeset that node 4 won't be able to apply by creating a table +# that won't be present in the replication stream +--connection node_1 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_2 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_3 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +# This should cause node #4 to initiate a vote and leave the cluster +INSERT INTO t2 VALUES (DEFAULT); + +# make sure nodes 1,2,3 progress far enough for commit cut update +CALL p1(130); + +--echo Waiting for server 4 to leave the cluster +--let $members = 3 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_1 +--source include/wsrep_wait_membership.inc + +--connection node_4 +--echo Server 4 left the cluster, killing it... 
+# Kill the connected server +--exec echo "wait" > $node_4_expect_file_name +--let KILL_NODE_PIDFILE = $node_4_pid_file +--perl + my $pid_filename = $ENV{'KILL_NODE_PIDFILE'}; + my $mysqld_pid = `cat $pid_filename`; + chomp($mysqld_pid); + system("kill -9 $mysqld_pid"); + exit(0); +EOF +--echo Killed server 4... +--source include/wait_until_disconnected.inc +--echo Restarting server 4... +--source include/start_mysqld.inc +--source include/galera_wait_ready.inc +DROP TABLE t2; + +--source galera_vote_joined_end.inc + +--connection node_4 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Inconsistent by consensus"); +CALL mtr.add_suppression("Failed to apply write set: gtid:"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_begin.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_begin.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_begin.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_begin.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,79 @@ +# This file purpose is to set up node 4 to require SST which is artificaially +# prolonged and as a result accumulate sufficient relication queue. +# The contents of the qeuee are controlled in the sourcing test files. + +--source include/galera_cluster.inc +--source include/big_test.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +# Make sure that the test is operating on the right version of galera library. 
+--let $galera_version=26.4.19 +source ../wsrep/include/check_galera_version.inc; + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--let $node_4=node_4 +--source ../include/auto_increment_offset_save.inc + +# create table t1 and procedure p1 to generate wirtesets +--connection node_1 +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); + +DELIMITER |; +CREATE PROCEDURE p1(IN max INT) +BEGIN + DECLARE i INT; + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; + + SET i = 0; + WHILE i < max DO + INSERT IGNORE INTO t1 VALUES (DEFAULT); + SET i = i + 1; + END WHILE; +END| +DELIMITER ;| + +# 130 events move the commit cut, it is essential in voting +CALL p1(130); + +--connection node_4 +--echo Shutting down server 4... +--let $node_4_server_id= `SELECT @@server_id` +--let $node_4_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_4_server_id.expect +--let $node_4_pid_file= `SELECT @@pid_file` +--source include/shutdown_mysqld.inc +# enforce SST +--exec rm -rf $MYSQLTEST_VARDIR/mysqld.4/data/grastate.dat + +# Wait for node #4 to leave cluster +--connection node_1 +--let $members = 3 +--source include/wsrep_wait_membership.inc + +# prepare to stop SST donor thread when node is in donor state +SET GLOBAL debug = "+d,sync.wsrep_donor_state"; + +--connection node_4 +--echo Restarting server 4... 
+# Need to use this form instead of start_mysqld.inc because the latter is blocking +--exec echo "restart:$start_mysqld_params" > $node_4_expect_file_name + +# Wait for node #1 to become a donor +--connection node_1 +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached"; +--echo Tables on server 1 flushed and locked for SST to server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; + +--echo Wait for the state snapshot to be copied to server 4 +--source include/galera_wait_ready.inc +--echo SST script unlocked server 1 + +# +# At this point state snapshot has been copied, node 1 is operational and +# we have about 20 seconds while everything we do will go into the replication +# queue on node 4 which it will have to apply on top of the snapshot. +# diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_end.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_end.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_end.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_end.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,33 @@ +# Confirm node #4 has rejoined +--connection node_1 +--let $members = 4 +--source include/wsrep_wait_membership.inc +#DROP TABLE IF EXISTS t2; + +# Confirm that all is good and all nodes have identical data + +--connection node_1 +SELECT count(*) AS expect1_390 FROM t1; + +#CALL mtr.add_suppression("Replica SQL: Could not execute Delete_rows"); +#CALL mtr.add_suppression("Event 3 Delete_rows apply failed: 120, seqno [0-9]+"); + +--connection node_2 +SELECT count(*) AS expect2_390 FROM t1; + +#CALL mtr.add_suppression("mysqld: Can't find record in 't1'"); +#CALL mtr.add_suppression("Replica SQL: Could not execute Delete_rows"); +#CALL mtr.add_suppression("Event 3 Delete_rows apply failed: 120, seqno seqno [0-9]+"); + +--connection node_3 +SELECT count(*) AS 
expect3_390 FROM t1; + +--connection node_4 +SELECT count(*) AS expect4_390 FROM t1; + +DROP TABLE t1; +DROP PROCEDURE p1; + +#CALL mtr.add_suppression("inconsistent with group"); + +--source ../include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,21 @@ +!include ../galera_4nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 + +[mysqld.1] +wsrep_node_name='node_1' + +[mysqld.2] +wsrep_node_name='node_2' + +[mysqld.3] +wsrep_node_name='node_3' + +[mysqld.4] +wsrep_node_name='node_4' +wsrep_sst_donor='node_1' + +[ENV] +galera_cluster_size=4 +MTR_SST_JOINER_DELAY=20 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,100 @@ +# +# Test a case where a vote happens in JOINED state after SST on a writeset +# that should be skipped. I.e. JOINED node should continue operation. +# + +--source galera_vote_joined_begin.inc +# +# At this point state snapshot has been copied, node 1 is operational and +# we have about 10 seconds while everything we do will go into the replication +# queue on node 4 which it will have to apply on top of the snapshot. 
+# + +# Increase replication queue on node_4 +--connection node_1 +CALL p1(130); + +# +# Create a writeset that node 4 won't be able to apply by making node 3 +# inconsisitent +# +--connection node_3 +--let $node_3_server_id= `SELECT @@server_id` +--let $node_3_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_3_server_id.expect +--let $node_3_pid_file= `SELECT @@pid_file` +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +# This should cause nodes #1 and #2 to initiate a vote and kick node #3 +# out of the cluster, node #4 should recover the vote when fails to apply +# the event and continue +INSERT INTO t2 VALUES (DEFAULT); +SET SESSION wsrep_on = OFF; + +# make sure nodes 1,2 progress far enough for commit cut update +--connection node_1 +CALL p1(130); + +--let $members = 3 +--echo Waiting for server 3 to leave the cluster +--connection node_1 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_4 +# need to wait for extra SST delay on joiner +--sleep $MTR_SST_JOINER_DELAY +--sleep $MTR_SST_JOINER_DELAY +--enable_reconnect +--let $wait_timeout = 60 +--source include/wsrep_wait_membership.inc + +--connection node_3 +--echo Server 3 left the cluster, killing it... +# Kill the connected server +--exec echo "wait" > $node_3_expect_file_name +--let KILL_NODE_PIDFILE = $node_3_pid_file +--perl + my $pid_filename = $ENV{'KILL_NODE_PIDFILE'}; + my $mysqld_pid = `cat $pid_filename`; + chomp($mysqld_pid); + system("kill -9 $mysqld_pid"); + exit(0); +EOF +--echo Killed server 3. +--source include/wait_until_disconnected.inc +--echo Restarting server 3... 
+--exec echo "restart:$start_mysqld_params" > $node_3_expect_file_name + +--echo Waiting for server 3 to rejoin the cluster +--connection node_1 +--let $members = 3 +--source include/wsrep_wait_membership.inc + +--connection node_3 +--echo sleeping for $MTR_SST_JOINER_DELAY +# need to wait for extra SST delay on joiner +--sleep $MTR_SST_JOINER_DELAY +--sleep $MTR_SST_JOINER_DELAY +--echo Waiting ready +--enable_reconnect +--source include/galera_wait_ready.inc +--echo Server 3 restarted. + +--source galera_vote_joined_end.inc + +--connection node_1 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); + +--connection node_2 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); + +--connection node_3 +CALL mtr.add_suppression("Vote 0 \\(success\\) on .+ is inconsistent with group"); + +--connection node_4 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test 2025-05-19 16:14:24.000000000 +0000 @@ -91,10 +91,6 @@ DROP TABLE t2; ---let $node_3=node_3 ---let $auto_increment_offset_node_3 = 3; ---let $node_4=node_4 ---let $auto_increment_offset_node_4 = 4; --source suite/galera/include/auto_increment_offset_restore.inc --disconnect node_3 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,14 +4,13 @@ loose-galera-wan=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=10M;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=10M;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gcache.size=10M;gmcast.segment=2' +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.4] -wsrep_provider_options='base_port=@mysqld.4.#galera_port;gcache.size=10M;gmcast.segment=3' - +wsrep_provider_options='gmcast.segment=3;repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,10 +10,10 @@ --source include/have_innodb.inc --source include/force_restart.inc -CALL 
mtr.add_suppression("WSREP: Stray state UUID msg:"); -CALL mtr.add_suppression("Sending JOIN failed: "); -CALL mtr.add_suppression("WSREP: .* sending install message failed: Socket is not connected"); -CALL mtr.add_suppression("There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); +CALL mtr.add_suppression("WSREP: Stray state UUID msg: "); +CALL mtr.add_suppression("WSREP: .*Sending JOIN failed: "); +CALL mtr.add_suppression("WSREP: .*sending install message failed: (Transport endpoint|Socket) is not connected"); +CALL mtr.add_suppression("WSREP: .*There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); --let $wait_condition = SELECT VARIABLE_VALUE = 4 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; --source include/wait_condition.inc @@ -42,8 +42,8 @@ DROP TABLE t1; --connection node_1 -call mtr.add_suppression("WSREP: read_completion_condition.*"); -call mtr.add_suppression("WSREP: read_handler.*"); +call mtr.add_suppression("WSREP: read_completion_condition"); +call mtr.add_suppression("WSREP: read_handler"); --disconnect node_3 --disconnect node_4 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,14 +4,13 @@ loose-galera-wan-restart-ist=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gmcast.segment=2' +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.4] -wsrep_provider_options='base_port=@mysqld.4.#galera_port;gmcast.segment=2' - +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,14 +4,13 @@ loose-galera-wan-restart-sst=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' 
[mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gmcast.segment=2' +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.4] -wsrep_provider_options='base_port=@mysqld.4.#galera_port;gmcast.segment=2' - +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,8 +5,3 @@ [mysqld.2] wsrep_log_conflicts=ON - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_mode.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_mode.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_mode.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_mode.test 2025-05-19 16:14:24.000000000 +0000 @@ -16,7 +16,6 @@ DROP TABLE t1; SET GLOBAL wsrep_mode = default; - # MDEV-25698 SIGSEGV in wsrep_should_replicate_ddl SET GLOBAL wsrep_mode = STRICT_REPLICATION; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test 2025-05-19 
16:14:24.000000000 +0000 @@ -3,10 +3,11 @@ # --source include/galera_cluster.inc --source include/have_innodb.inc + --let LOGF=$MYSQLTEST_VARDIR/log/mysqld.1.err --disable_info -call mtr.add_suppression("WSREP\: Unknown parameter 'gmcasts\\.segment'"); -call mtr.add_suppression("WSREP\: Set options returned 7"); +call mtr.add_suppression("WSREP: Unknown parameter 'gmcasts\\.segment'"); +call mtr.add_suppression("WSREP: Set options returned 7"); --error ER_WRONG_ARGUMENTS SET GLOBAL wsrep_provider_options="gmcasts.segment=1"; # Search for unhandled exception message. diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,11 +6,22 @@ --source include/auto_increment_offset_save.inc --connection node_1 -call mtr.add_suppression("WSREP:.*"); + +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); + SET @wsrep_provider_options_orig = @@GLOBAL.wsrep_provider_options; SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true;pc.weight=2'; --connection node_2 + +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); + SET @wsrep_cluster_address_orig = @@GLOBAL.wsrep_cluster_address; SET GLOBAL WSREP_ON=0; SELECT COUNT(*) AS 
EXPECT_0 FROM mysql.wsrep_streaming_log; @@ -22,6 +33,7 @@ SET GLOBAL wsrep_cluster_address = @wsrep_cluster_address_orig; SELECT 1; DELETE FROM mysql.wsrep_allowlist; + --connection node_2 --source include/kill_galera.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mdev-29775.test mariadb-10.11.13/mysql-test/suite/galera/t/mdev-29775.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mdev-29775.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mdev-29775.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,81 @@ +--source include/galera_cluster.inc +--source include/have_aria.inc + +# +# MDEV-29775 : Assertion `0' failed in void Protocol::end_statement() when adding data to the MyISAM table after setting wsrep_mode=replicate_myisam +# +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; + +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +--error ER_WRONG_ARGUMENTS +SET GLOBAL 
wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; + +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_ARIA; + +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_ARIA; + +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; + +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL wsrep_mode=DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mdev-30653.test mariadb-10.11.13/mysql-test/suite/galera/t/mdev-30653.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mdev-30653.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mdev-30653.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc create table t1 (id serial, val int) engine=innodb; @@ -6,7 +7,8 @@ insert into t1 values(1, 23); insert into t2 values(2, 42); -call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental. Storage engine Aria for table 'test'.'t2' is not supported in Galera"); + +call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental\\. 
Storage engine Aria for table 'test'\\.'t2' is not supported in Galera"); begin; update t1 set val=24 where id=1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mdev-31285.test mariadb-10.11.13/mysql-test/suite/galera/t/mdev-31285.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mdev-31285.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mdev-31285.test 2025-05-19 16:14:24.000000000 +0000 @@ -11,5 +11,3 @@ --connection node_2 --error ER_NO_SUCH_TABLE SHOW CREATE TABLE t; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.cnf mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,4 @@ !include ../galera_2nodes.cnf -[mysqld.1] +[mysqld] log-bin -wsrep-debug=1 - -[mysqld.1] -log-bin -wsrep-debug=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,5 @@ --source include/galera_cluster.inc --source include/have_innodb.inc ---source include/force_restart.inc CREATE TABLE t1 (id INT PRIMARY KEY) ENGINE=InnoDB; CREATE TABLE t2 (id INT PRIMARY KEY) ENGINE=InnoDB; @@ -21,8 +20,9 @@ --connection node_2 SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = 'Waiting for table metadata lock' ---source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) BETWEEN 1 AND 2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in 
isolation%'; +--let $wait_condition_on_error_output = SELECT * FROM INFORMATION_SCHEMA.PROCESSLIST +--source include/wait_condition_with_debug_and_kill.inc --connection node_1 INSERT INTO t2 VALUES (1); @@ -38,3 +38,8 @@ DROP TABLE t1; DROP TABLE t2; + +--connection node_1 + +--disconnect node_2a +--disconnect node_2b diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#201.cnf mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#201.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#201.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#201.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,3 @@ [mysqld.2] query_cache_type=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#247.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#247.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#247.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#247.test 2025-05-19 16:14:24.000000000 +0000 @@ -20,4 +20,3 @@ --sleep 1 DROP TABLE t1; SHOW VARIABLES LIKE 'wsrep_desync'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#31.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#31.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#31.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#31.test 2025-05-19 16:14:24.000000000 +0000 @@ -49,5 +49,3 @@ --source include/auto_increment_offset_restore.inc --source include/galera_end.inc - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#33.cnf mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#33.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#33.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#33.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] 
-wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#332.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#332.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#332.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#332.test 2025-05-19 16:14:24.000000000 +0000 @@ -216,4 +216,3 @@ DROP TABLE c; DROP TABLE p1; DROP TABLE p2; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/rename.test mariadb-10.11.13/mysql-test/suite/galera/t/rename.test --- mariadb-10.11.11/mysql-test/suite/galera/t/rename.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/rename.test 2025-05-19 16:14:24.000000000 +0000 @@ -50,4 +50,3 @@ DROP TABLE t2; --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/view.test mariadb-10.11.13/mysql-test/suite/galera/t/view.test --- mariadb-10.11.11/mysql-test/suite/galera/t/view.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/view.test 2025-05-19 16:14:24.000000000 +0000 @@ -47,4 +47,3 @@ DROP TABLE t1; --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test mariadb-10.11.13/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_aria.inc call mtr.add_suppression("WSREP: ALTER TABLE isolation failure"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/disabled.def mariadb-10.11.13/mysql-test/suite/galera_3nodes/disabled.def --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/disabled.def 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/disabled.def 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,3 @@ # Do not use any TAB characters for whitespace. # ############################################################################## - -galera_2_cluster : MDEV-32631 galera_2_cluster: before_rollback(): Assertion `0' failed -galera_nbo_master_phase_two_crash : MENT-2215 Test failure on galera_3nodes.galera_nbo_master_non_prim_failure diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,6 +24,7 @@ #sst_port=@OPT.port wsrep_cluster_address=gcomm:// wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_3nodes.cnf 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_3nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_3nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_3nodes.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -19,10 +19,11 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' +wsrep_node_name=node1 [mysqld.2] wsrep-on=1 @@ -30,10 +31,11 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' +wsrep_node_name=node2 [mysqld.3] wsrep-on=1 @@ -41,10 +43,11 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.3.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.3.port wsrep_sst_receive_address='127.0.0.1:@mysqld.3.#sst_port' +wsrep_node_name=node3 [sst] sst-log-archive-dir=@ENV.MYSQLTEST_VARDIR/log diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/MDEV-36360.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/MDEV-36360.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/MDEV-36360.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/MDEV-36360.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,61 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +CREATE TABLE parent ( +id INT PRIMARY KEY +) ENGINE=InnoDB; +CREATE TABLE child ( +id INT PRIMARY KEY, +parent_id INT, +KEY (parent_id), +CONSTRAINT FOREIGN KEY (parent_id) REFERENCES parent(id) +) ENGINE=InnoDB; +INSERT INTO parent VALUES (1), (2); +connection node_3; +SET SESSION wsrep_on = OFF; +DELETE FROM parent WHERE id = 1; +SET SESSION wsrep_on = ON; +Restarting server 3 with one applier thread having FK and UK checks disabled +SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_after_write_row'; +connection node_1; +INSERT INTO child VALUES (1, 1); +connection node_3; +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_after_write_row_reached'; +SET GLOBAL DEBUG_DBUG = ''; +SET wsrep_sync_wait = 0; +SET DEBUG_SYNC = 'ib_after_row_insert SIGNAL signal.wsrep_after_write_row'; +INSERT INTO child VALUES (2, 2); +SET DEBUG_SYNC = 'RESET'; +include/assert_grep.inc [no 
FK constraint failure] +Server 3 +SELECT COUNT(*) AS EXPECT_1 FROM parent; +EXPECT_1 +1 +SELECT COUNT(*) AS EXPECT_2 FROM child; +EXPECT_2 +2 +connection node_1; +Server 1 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +EXPECT_2 +2 +SELECT COUNT(*) AS EXPECT_2 FROM child; +EXPECT_2 +2 +connection node_2; +Server 2 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +EXPECT_2 +2 +SELECT COUNT(*) AS EXPECT_2 FROM child; +EXPECT_2 +2 +DROP TABLE child; +DROP TABLE parent; +disconnect node_2; +disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera-features#115.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera-features#115.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera-features#115.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera-features#115.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,41 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_2; +SET GLOBAL wsrep_on=OFF; +DROP SCHEMA test; +connection node_3; +SET GLOBAL wsrep_on=OFF; +CREATE TABLE t1 (f1 INTEGER); +connection node_1; +CREATE TABLE t1 (f1 INTEGER); +connection node_1; +SET SESSION wsrep_sync_wait=0; +connection node_2; +SET SESSION wsrep_sync_wait=0; +connection node_3; +SET SESSION wsrep_sync_wait=0; +connection node_1; +SET GLOBAL wsrep_provider_options='pc.bootstrap=YES'; +connection node_2; +disconnect node_2; +connect node_2, 127.0.0.1, root, , mysql, $NODE_MYPORT_2; +# restart +connection node_3; +# restart +connection node_1; +DROP TABLE test.t1; +connection node_2; +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1049"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, 
leaving cluster\\.\\.\\."); +connection node_3; +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1050"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,9 @@ connection node_2; connection node_1; +connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6; connect node_5, 127.0.0.1, root, , test, $NODE_MYPORT_5; connect node_4, 127.0.0.1, root, , test, $NODE_MYPORT_4; connection node_4; -CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_1, master_use_gtid=current_pos;; START SLAVE; include/wait_for_slave_to_start.inc connection node_1; @@ -21,7 +21,6 @@ SELECT COUNT(*) = 1 FROM t1; COUNT(*) = 1 1 -connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6; connection node_6; SELECT COUNT(*) = 1 FROM t1; COUNT(*) = 1 @@ -51,18 +50,30 @@ COUNT(*) = 3 1 connection node_2; +connection node_1; +connection node_3; +connection node_4; +connection node_5; +connection node_6; +connection node_2; OPTIMIZE TABLE t1; Table Op Msg_type Msg_text test.t1 optimize note Table does not support optimize, doing recreate + analyze instead test.t1 optimize status OK +Warnings: +Note 1592 Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT. 
Statement is unsafe because it uses a system variable that may have a different value on the slave connection node_1; +connection node_3; connection node_4; +connection node_5; connection node_6; connection node_1; DROP TABLE t1; connection node_4; STOP SLAVE; RESET SLAVE; +Warnings: +Note 4190 RESET SLAVE is implicitly changing the value of 'Using_Gtid' from 'Current_Pos' to 'Slave_Pos' SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; @@ -75,19 +86,33 @@ SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); connection node_3; SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); connection node_5; SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); connection node_6; SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +connection node_1; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_2; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_3; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_4; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_5; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using 
statement format since "); +connection node_6; CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,8 +2,6 @@ connection node_1; connection node_1; connection node_2; -connection node_1; -connection node_2; connection node_3; Killing node #3 to free ports for garbd ... connection node_3; @@ -26,8 +24,8 @@ Restarting node #3 to satisfy MTR's end-of-test checks connection node_3; connection node_1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_2; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_3; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,6 @@ connection node_2; connection node_1; connection node_1; -connection node_1; connection node_2; connection node_3; connection node_1; @@ -12,7 +11,6 @@ CREATE TABLE ten (f1 INTEGER) ENGINE=InnoDB; INSERT INTO ten VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10); INSERT INTO t1 (f2) SELECT REPEAT('x', 1024) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4; -connection node_2; Killing node #3 to free ports for garbd ... connection node_3; connection node_1; @@ -34,8 +32,8 @@ connection node_3; connection node_1; connection node_1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_2; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_3; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result 2025-05-19 16:14:24.000000000 +0000 @@ -35,7 +35,7 @@ Variable_name Value wsrep_cluster_size 3 connection node_1; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_4, master_use_gtid=current_pos, ignore_server_ids=(12,13);; +--- ignore_server_ids=(12,13) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -45,7 +45,7 @@ @@gtid_slave_pos connection node_4; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_1, master_use_gtid=current_pos, ignore_server_ids=(22,23);; +--- ignore_server_ids=(22,23) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -262,7 +262,7 @@ reset master; set global wsrep_on=ON; connection node_1; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_6, master_use_gtid=current_pos, ignore_server_ids=(12,13);; +--- ignore_server_ids=(12,13) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -272,7 +272,7 @@ @@gtid_slave_pos connection node_4; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_3, master_use_gtid=current_pos, ignore_server_ids=(22,23);; +--- ignore_server_ids=(22,23) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result 2025-01-30 11:01:23.000000000 +0000 
+++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result 2025-05-19 16:14:24.000000000 +0000 @@ -77,8 +77,8 @@ SET GLOBAL wsrep_provider_options = 'dbug='; connection node_1; DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_2; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_3; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result 2025-05-19 16:14:24.000000000 +0000 @@ -87,11 +87,11 @@ SET GLOBAL wsrep_provider_options = 'dbug='; connection node_1; DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_2; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_3; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer 
required\\."); disconnect node_1a; disconnect node_3; disconnect node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result 2025-05-19 16:14:24.000000000 +0000 @@ -94,9 +94,9 @@ SET GLOBAL wsrep_provider_options = 'signal=after_shift_to_joining'; connection node_1; DROP TABLE t1; -call mtr.add_suppression("WSREP: Send action {(.*), STATE_REQUEST} returned -107 \\(Transport endpoint is not connected\\)"); -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Send action {.* STATE_REQUEST} returned -107 \\((Transport endpoint|Socket) is not connected\\)"); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_2; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_3; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result 2025-05-19 16:14:24.000000000 +0000 @@ -26,7 +26,7 @@ SELECT f1 = 111 FROM t1; f1 = 111 1 -SELECT 
COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE LIKE '%committed%'; +SELECT COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); COUNT(*) IN (1, 2) 1 SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result 2025-05-19 16:14:24.000000000 +0000 @@ -157,10 +157,10 @@ CALL mtr.add_suppression('WSREP: gcs_caused\\(\\) returned -1'); connection node_2; CALL mtr.add_suppression('SYNC message from member'); -CALL mtr.add_suppression('user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('WSREP: user message in state LEAVING'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); connection node_3; CALL mtr.add_suppression('WSREP: user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result --- 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result 2025-05-19 16:14:24.000000000 +0000 @@ -47,7 +47,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); connection node_3; CALL mtr.add_suppression("WSREP: no nodes coming from prim view, prim not possible"); CALL mtr.add_suppression("WSREP: It may not be safe to bootstrap the cluster from this node"); @@ -61,7 +61,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,26 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_2; +connection node_1; +SET GLOBAL debug_dbug = '+d,sync.wsrep_sst_donor_after_donation'; +connection 
node_2; +# restart +connection node_1; +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached'; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +SET DEBUG_SYNC = 'now SIGNAL signal.wsrep_sst_donor_after_donation_continue'; +SET DEBUG_SYNC = 'RESET'; +SET GLOBAL debug_dbug = ''; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +SET SESSION wsrep_sync_wait=15; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +connection node_1; +CALL mtr.add_suppression("WSREP: sst sent called when not SST donor, state CONNECTED"); +CALL mtr.add_suppression("WSREP: .* returned an error: Not connected to Primary Component"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -57,7 +57,6 @@ t1 CREATE TABLE `t1` ( `f1` int(11) DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -CALL mtr.add_suppression("is inconsistent with group"); connection node_3; SHOW CREATE TABLE t1; Table Create Table @@ -80,4 +79,5 @@ CALL mtr.add_suppression("Table 'mysql\\.gtid_slave_pos' doesn't exist"); connection node_2; # restart +CALL mtr.add_suppression("WSREP: .+ is inconsistent with group"); connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/suite.pm mariadb-10.11.13/mysql-test/suite/galera_3nodes/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/suite.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -9,69 +9,71 @@ push @::global_suppressions, ( - qr(WSREP: 
wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), - qr(WSREP: last inactive check more than .* skipping check), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: last inactive check more than .+ skipping check), + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. 
Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Maximum writeset size exceeded by .*), - qr(WSREP: transaction size exceeded.*), - qr(WSREP: RBR event .*), - qr(WSREP: Ignoring error for TO isolated action: .*), - qr(WSREP: transaction size limit .*), - qr(WSREP: rbr write fail, .*), - qr(WSREP: .*Backend not supported: foo.*), - qr(WSREP: .*Failed to initialize backend using .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: Maximum writeset size exceeded by ), + qr(WSREP: transaction size exceeded), + qr(WSREP: RBR event ), + qr(WSREP: Ignoring error for TO isolated action: ), + qr(WSREP: transaction size limit ), + qr(WSREP: rbr write fail, ), + qr(WSREP: .*Backend not supported: foo), + qr(WSREP: .*Failed to initialize backend using ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Socket type not supported), qr(WSREP: failed to open gcomm backend connection: 110: failed to reach primary view: 110 .*), - qr(WSREP: .*Failed to open backend connection: -110 .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: .*Failed to open backend connection: 
-110 ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Connection timed out), qr|WSREP: wsrep::connect\(.*\) failed: 7|, - qr(WSREP: SYNC message from member .* in non-primary configuration. Ignored.), + qr(WSREP: SYNC message from member .+ ?in non-primary configuration\. Ignored\.), qr(WSREP: Could not find peer:), - qr(WSREP: TO isolation failed for: .*), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, - qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled. Expect abort.|, + qr(WSREP: TO isolation failed for: ), + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, + qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled\. Expect abort\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr(WSREP: discarding established .*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr|WSREP: JOIN message from member .* in non-primary configuration. Ignored.|, - qr|Query apply failed:*|, - qr(WSREP: Ignoring error*), - qr(WSREP: Failed to remove page file .*), - qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to .*), - qr|WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). 
Will retry in new primary component.|, + qr(WSREP: discarding established ), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr|WSREP: .*Query apply failed:|, + qr(WSREP: Ignoring error), + qr(WSREP: Failed to remove page file ), + qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to ), + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, qr|WSREP: Trying to continue unpaused monitor|, qr|WSREP: Wait for gtid returned error 3 while waiting for prior transactions to commit before setting position|, + qr|WSREP: Failed to report last committed|, ); sub which($) { return `sh -c "command -v $_[0]"` } sub skip_combinations { my %skip = (); - $skip{'include/have_mariabackup.inc'} = 'Need ss' + $skip{'include/have_mariabackup.inc'} = 'Need socket statistics utility' unless which("lsof") || which("sockstat") || which("ss"); %skip; } diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GAL-501.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GAL-501.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GAL-501.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GAL-501.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// wsrep_node_address=[::1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_incoming_address='[::1]:@mysqld.1.port' bind-address=:: @@ -14,7 +14,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' wsrep_node_address=[::1] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_incoming_address='[::1]:@mysqld.2.port' bind-address=:: @@ -22,7 +22,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' wsrep_node_address=[::1] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_incoming_address='[::1]:@mysqld.3.port' bind-address=:: diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GCF-354.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GCF-354.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GCF-354.cnf 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GCF-354.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,10 +7,10 @@ wsrep-debug=1 [mysqld.1] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1G;pc.weight=4' +wsrep_provider_options='repl.causal_read_timeout=PT90S;pc.weight=4;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=256M' [mysqld.2] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1G' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=256M' [mysqld.3] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1G' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=256M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/MDEV-36360.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/MDEV-36360.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/MDEV-36360.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/MDEV-36360.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,110 @@ +# +# MDEV-36360: Don't grab table-level X locks for applied inserts. 
+# +# It prevents a debug crash in wsrep_report_error() which happened when appliers would run +# with FK and UK checks disabled and erroneously execute plain inserts as bulk inserts. +# +# Moreover, in release builds such a behavior could lead to deadlocks between two applier +# threads if a thread waiting for a table-level lock was ordered before the lock holder. +# In that case the lock holder would proceed to commit order and wait forever for the +# now-blocked other applier thread to commit before. +# + +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/have_debug.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +# Create parent and child tables. +--connection node_1 +CREATE TABLE parent ( + id INT PRIMARY KEY +) ENGINE=InnoDB; + +CREATE TABLE child ( + id INT PRIMARY KEY, + parent_id INT, + KEY (parent_id), + CONSTRAINT FOREIGN KEY (parent_id) REFERENCES parent(id) +) ENGINE=InnoDB; + +# Fill the parent table with rows that will later be used by the child. +INSERT INTO parent VALUES (1), (2); + +# Wait until the rows are replicated on node #3. +--connection node_3 +--let $wait_condition = SELECT COUNT(*) = 2 FROM parent +--source include/wait_condition.inc + +# Delete one row from the parent table on node #3 and rejoin the cluster. +SET SESSION wsrep_on = OFF; +DELETE FROM parent WHERE id = 1; +SET SESSION wsrep_on = ON; +--echo Restarting server 3 with one applier thread having FK and UK checks disabled +--source include/shutdown_mysqld.inc +--let $start_mysqld_params = --wsrep_slave_FK_checks=0 --wsrep_slave_UK_checks=0 +--source ../galera/include/start_mysqld.inc + +# Stop the applier after writing a row into the child table. 
+SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_after_write_row'; + +# Insert a child row that will be applied on node #3, but should not +# grab table-level X-lock. +--connection node_1 +INSERT INTO child VALUES (1, 1); + +--connection node_3 +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_after_write_row_reached'; +# Now that the applier has hit the global sync point wait, reset it +# so that the upcoming insert avoids it. +SET GLOBAL DEBUG_DBUG = ''; +# Don't wait for applied insert to commit. +SET wsrep_sync_wait = 0; +SET DEBUG_SYNC = 'ib_after_row_insert SIGNAL signal.wsrep_after_write_row'; +# The insert should pass the sync point, as otherwise if the applied insert +# grabs table-level X-lock, they'll both deadlock forever. +INSERT INTO child VALUES (2, 2); +SET DEBUG_SYNC = 'RESET'; + +--let $assert_select = foreign key constraint fails +--let $assert_count = 0 +--let $assert_text = no FK constraint failure +--let $assert_only_after = CURRENT_TEST +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.3.err +--source include/assert_grep.inc + +# Child row insert is applied even though there's no parent row. +--echo Server 3 +SELECT COUNT(*) AS EXPECT_1 FROM parent; +SELECT COUNT(*) AS EXPECT_2 FROM child; + +# Check other nodes have both parent and child rows. +--connection node_1 +--echo Server 1 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +SELECT COUNT(*) AS EXPECT_2 FROM child; + +--connection node_2 +--echo Server 2 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +SELECT COUNT(*) AS EXPECT_2 FROM child; + +DROP TABLE child; +DROP TABLE parent; + +# Restore original auto_increment_offset values. 
+--source ../galera/include/auto_increment_offset_restore.inc + +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,4 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,89 @@ +# +# This test tests that one successful node wins over two nodes that fail for +# different reasons +# +--source include/galera_cluster.inc +--source include/have_innodb.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source suite/galera/include/auto_increment_offset_save.inc + +# create inconsistency on node 2 +--connection node_2 +SET GLOBAL wsrep_on=OFF; +DROP SCHEMA test; + +# create inconsistency on node 3 +--connection node_3 +SET GLOBAL wsrep_on=OFF; +CREATE TABLE t1 (f1 INTEGER); + +--connection node_1 +CREATE TABLE t1 (f1 INTEGER); + +# check that nodes 2 and 3 leave the cluster, and node_1 is Primary by itself + +--connection node_1 +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +--source include/wait_condition.inc + +--connection node_2 +SET 
SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Disconnected' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +--connection node_3 +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Disconnected' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +--connection node_1 +# this is a workaround for "sending install message failed" BUG: +# https://github.com/codership/galera/issues/174 +# When it happens, node_1 becomes non-prim +SET GLOBAL wsrep_provider_options='pc.bootstrap=YES'; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +# restart nodes 2 and 3, since they failed + +--connection node_2 +# need to reinitialize connection due to a "Bad handshake" bug. +# we reconnect using the 'mysql' database as 'test' was dropped. 
+--disconnect node_2 +--connect node_2, 127.0.0.1, root, , mysql, $NODE_MYPORT_2 + --source include/restart_mysqld.inc + +--connection node_3 + --source include/restart_mysqld.inc + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +DROP TABLE test.t1; + +--source suite/galera/include/auto_increment_offset_restore.inc + +--connection node_2 +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1049"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); + +--connection node_3 +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1050"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#119.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#119.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#119.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#119.test 2025-05-19 16:14:24.000000000 +0000 @@ -66,6 +66,5 @@ CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); CALL mtr.add_suppression("WSREP: Failed to apply write set: "); - # Restore original auto_increment_offset values. 
--source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,25 +1,34 @@ !include ../galera_2x3nodes.cnf +[mysqld] +wsrep-debug=1 + [mysqld.1] wsrep_gtid_domain_id=1 server-id=11 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M;pc.weight=2' [mysqld.2] wsrep_gtid_domain_id=1 server-id=12 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.3] wsrep_gtid_domain_id=1 server-id=13 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.4] wsrep_gtid_domain_id=2 server-id=21 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.5] wsrep_gtid_domain_id=2 server-id=22 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.5.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.6] wsrep_gtid_domain_id=2 server-id=23 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.6.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] +log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,14 +9,17 @@ --source include/big_test.inc --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/force_restart.inc +--connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6 --connect node_5, 127.0.0.1, root, , test, $NODE_MYPORT_5 - --connect node_4, 127.0.0.1, root, , test, $NODE_MYPORT_4 + --connection node_4 ---replace_result $NODE_MYPORT_1 NODE_MYPORT_1 +--disable_query_log --eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_1, master_use_gtid=current_pos; +--enable_query_log START SLAVE; --source include/wait_for_slave_to_start.inc @@ -42,7 +45,6 @@ SELECT COUNT(*) = 1 FROM t1; ---connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6 --connection node_6 SELECT COUNT(*) = 1 FROM t1; @@ -81,23 +83,46 @@ # --connection node_2 +--let $wsrep_last_committed_before_2 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS 
WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_1 +--let $wsrep_last_committed_before_1 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_3 +--let $wsrep_last_committed_before_3 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_4 +--let $wsrep_last_committed_before_4 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` ---let $wsrep_last_committed_before = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` +--connection node_5 +--let $wsrep_last_committed_before_5 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_6 +--let $wsrep_last_committed_before_6 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_2 OPTIMIZE TABLE t1; +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_2 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--source include/wait_condition.inc --connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_1 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--source include/wait_condition.inc ---let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--connection node_3 +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_3 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' --source include/wait_condition.inc --connection node_4 +--let $wait_condition = SELECT VARIABLE_VALUE >= 
$wsrep_last_committed_before_4 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--source include/wait_condition.inc ---let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--connection node_5 +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_5 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' --source include/wait_condition.inc --connection node_6 - ---let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_6 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' --source include/wait_condition.inc # @@ -115,6 +140,7 @@ SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +--source include/wait_until_ready.inc SET GLOBAL GTID_SLAVE_POS=""; --connection node_1 @@ -122,35 +148,56 @@ SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +--source include/wait_until_ready.inc --connection node_2 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; - -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +--source include/wait_until_ready.inc --connection node_3 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; - -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +--source include/wait_until_ready.inc --connection node_5 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; - -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +--source include/wait_until_ready.inc --connection node_6 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +--source include/wait_until_ready.inc + +connection 
node_1; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_2; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_3; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_4; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_5; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_6; CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,7 +7,7 @@ wsrep_allowlist="127.0.0.1,127.0.0.2,127.0.0.3" [mysqld.2] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=127.0.0.2;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=127.0.0.2;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' # Variable is only used on bootstrap node, so this will be ignored wsrep_allowlist="127.0.0.1,127.0.0.2,127.0.0.3,127.0.0.4,127.0.0.5" @@ -18,9 +18,9 @@ wsrep_sst_receive_address='127.0.0.2:@mysqld.2.#sst_port' [mysqld.3] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=127.0.0.3;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_quorum=TRUE;pc.wait_prim=FALSE' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=127.0.0.3;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_quorum=TRUE;pc.wait_prim=FALSE;gcache.size=10M' wsrep_node_address=127.0.0.3 wsrep_sst_receive_address=127.0.0.3:@mysqld.3.#sst_port wsrep_node_incoming_address=127.0.0.3:@mysqld.3.port -wsrep_sst_receive_address='127.0.0.3:@mysqld.3.#sst_port' \ No newline at end of file +wsrep_sst_receive_address='127.0.0.3:@mysqld.3.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test 2025-05-19 16:14:24.000000000 +0000 @@ -50,4 +50,3 @@ --source ../galera/include/auto_increment_offset_restore.inc --source include/galera_end.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test --- 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_debug.inc --source include/have_debug_sync.inc --source include/big_test.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,12 +5,12 @@ [mysqld.1] wsrep_node_name='node.1' -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] wsrep_node_name='node.2' -wsrep_provider_options='base_port=@mysqld.2.#galera_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] wsrep_node_name='node.3' 
-wsrep_provider_options='base_port=@mysqld.3.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,6 @@ --let $node_3 = node_3 --source ../galera/include/auto_increment_offset_save.inc - --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; --source include/wait_condition.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test 2025-05-19 16:14:24.000000000 +0000 @@ -87,5 +87,6 @@ --source include/wait_condition.inc DROP TABLE t1; + # Restore original auto_increment_offset values. 
--source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,14 +9,9 @@ --source include/big_test.inc # Save galera ports ---connection node_1 --source suite/galera/include/galera_base_port.inc --let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT ---connection node_2 ---source suite/galera/include/galera_base_port.inc ---let $NODE_GALERAPORT_2 = $_NODE_GALERAPORT - --let $galera_connection_name = node_3 --let $galera_server_number = 3 --source include/galera_connect.inc @@ -81,10 +76,10 @@ # Workaround for galera#101 --connection node_1 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_2 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_3 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,11 +10,9 @@ --source include/have_debug.inc --source include/have_debug_sync.inc ---connection node_1 -# Save original auto_increment_offset values. ---let $node_1=node_1 ---let $node_2=node_2 ---let $node_3=node_3 +# Save galera ports +--source suite/galera/include/galera_base_port.inc +--let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT --let $galera_connection_name = node_3 --let $galera_server_number = 3 @@ -22,12 +20,13 @@ --source suite/galera/include/galera_base_port.inc --let $NODE_GALERAPORT_3 = $_NODE_GALERAPORT +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 --source ../galera/include/auto_increment_offset_save.inc -# Save galera ports --connection node_1 ---source suite/galera/include/galera_base_port.inc ---let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT --let $datadir= `SELECT @@datadir` --let $innodb_max_dirty_pages_pct = `SELECT @@innodb_max_dirty_pages_pct` @@ -41,10 +40,6 @@ CREATE TABLE ten (f1 INTEGER) ENGINE=InnoDB; INSERT INTO ten VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10); INSERT INTO t1 (f2) SELECT REPEAT('x', 1024) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4; - ---connection node_2 ---source suite/galera/include/galera_base_port.inc ---let $NODE_GALERAPORT_2 = $_NODE_GALERAPORT --echo Killing node #3 to free ports for garbd ... --connection node_3 @@ -124,13 +119,16 @@ --eval SET GLOBAL innodb_max_dirty_pages_pct_lwm = $innodb_max_dirty_pages_pct_lwm --enable_query_log +# Restore original auto_increment_offset values. 
--source ../galera/include/auto_increment_offset_restore.inc +# Workaround for galera#101 + --connection node_1 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_2 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_3 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # following tests such as galera_3nodes.galera_var_dirty_reads2 !include ../galera_2x3nodes.cnf + [mysqld.1] wsrep_gtid_domain_id=1 server-id=11 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test 2025-05-19 16:14:24.000000000 +0000 @@ -42,8 +42,10 @@ SHOW STATUS LIKE 
'wsrep_cluster_size'; #--disable_parsing --connection node_1 ---replace_result $NODE_MYPORT_4 NODE_MYPORT_4 +--echo --- ignore_server_ids=(12,13) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_4, master_use_gtid=current_pos, ignore_server_ids=(12,13); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -51,8 +53,10 @@ #--query_vertical SHOW SLAVE STATUS; --connection node_4 ---replace_result $NODE_MYPORT_1 NODE_MYPORT_1 +--echo --- ignore_server_ids=(22,23) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_1, master_use_gtid=current_pos, ignore_server_ids=(22,23); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -73,6 +77,8 @@ --echo cluster 2 node 1 --connection node_4 +--let $wait_condition = SELECT COUNT(*) = 1 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 21, 1); select @@gtid_binlog_state; @@ -81,11 +87,16 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 1 node 2 --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc + select @@gtid_binlog_state; insert into t1 values (1, 12, 3); select @@gtid_binlog_state; @@ -95,10 +106,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 1 node 3 --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select 
@@gtid_binlog_state; insert into t1 values (1, 13, 4); select @@gtid_binlog_state; @@ -108,10 +123,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 2 --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 22, 2); select @@gtid_binlog_state; @@ -121,37 +140,55 @@ --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 3 --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 23, 3); select @@gtid_binlog_state; --echo #wait for sync cluster 2 and 1 --connection node_4 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo # check other nodes are consistent --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 
order by 1, 2, 3; --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --echo cluster 1 node 1 --connection node_1 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; drop table t1; stop slave; @@ -210,8 +247,10 @@ # Then we will kill node D and set up the replication between A and E # To see whether fail over works or not. --connection node_1 ---replace_result $NODE_MYPORT_6 NODE_MYPORT_6 +--echo --- ignore_server_ids=(12,13) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_6, master_use_gtid=current_pos, ignore_server_ids=(12,13); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -219,8 +258,10 @@ #--query_vertical SHOW SLAVE STATUS; --connection node_4 ---replace_result $NODE_MYPORT_3 NODE_MYPORT_3 +--echo --- ignore_server_ids=(22,23) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_3, master_use_gtid=current_pos, ignore_server_ids=(22,23); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -242,6 +283,8 @@ --sleep 2 --echo cluster 2 node 1 --connection node_4 +--let $wait_condition = SELECT COUNT(*) = 1 FROM test.t1; +--source include/wait_condition.inc insert into t1 values (2, 21, 1); select @@gtid_binlog_state; @@ -250,11 +293,16 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc + select * from t1 order by 1, 2, 3; --echo cluster 1 node 2 --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc select 
@@gtid_binlog_state; insert into t1 values (1, 12, 3); select @@gtid_binlog_state; @@ -264,10 +312,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 1 node 3 --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (1, 13, 4); select @@gtid_binlog_state; @@ -277,10 +329,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 2 --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 22, 2); select @@gtid_binlog_state; @@ -290,10 +346,14 @@ --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 3 --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 23, 3); select @@gtid_binlog_state; @@ -303,24 +363,36 @@ --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo # check other nodes are consistent --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order 
by 1, 2, 3; --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --echo cluster 1 node 1 --connection node_1 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; drop table t1; stop slave; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -15,7 +15,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' 
-wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -25,7 +25,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,7 +9,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -18,7 +18,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -28,7 +28,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf 
2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -13,7 +13,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -21,7 +21,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -20,7 +20,6 @@ CREATE USER 'sst'; GRANT ALL PRIVILEGES ON *.* TO 'sst'; ---let $wsrep_sst_auth_orig = `SELECT @@wsrep_sst_auth` SET GLOBAL wsrep_sst_auth = 'sst:'; --connection node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 
wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -13,7 +13,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -21,7 +21,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,7 +10,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// 
-wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -18,7 +18,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -26,7 +26,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_3nodes.cnf [mysqld.1] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_sb=true;gcache.size=1M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1M' auto_increment_increment=1 auto_increment_offset=1 # this will force server restarts before this test @@ -9,14 +9,14 @@ wsrep-debug=1 [mysqld.2] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_sb=true;gcache.size=1M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1M' auto_increment_increment=2 auto_increment_offset=2 loose-galera-ist-gcache-rollover=2 wsrep-debug=1 [mysqld.3] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_sb=true;gcache.size=1M' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1M' auto_increment_increment=3 auto_increment_offset=3 loose-galera-ist-gcache-rollover=3 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test 2025-05-19 16:14:24.000000000 +0000 @@ -259,12 +259,12 @@ DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_2 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_3 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test 2025-05-19 16:14:24.000000000 +0000 @@ -270,13 +270,13 @@ DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call 
mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_2 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_3 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --disconnect node_1a diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test 2025-05-19 16:14:24.000000000 +0000 @@ -295,13 +295,13 @@ DROP TABLE t1; -call mtr.add_suppression("WSREP: Send action {(.*), STATE_REQUEST} returned -107 \\(Transport endpoint is not connected\\)"); -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Send action {.* STATE_REQUEST} returned -107 \\((Transport endpoint|Socket) is not connected\\)"); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_2 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_3 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --source ../galera/include/auto_increment_offset_restore.inc diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test 2025-05-19 16:14:24.000000000 +0000 @@ -65,7 +65,7 @@ --connection node_3 SELECT f1 = 111 FROM t1; -SELECT COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE LIKE '%committed%'; +SELECT COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,7 +17,6 @@ --let $node_1 = node_1 --let $node_2 = node_2 --let $node_3 = node_3 - --source ../galera/include/auto_increment_offset_save.inc --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test 2025-05-19 16:14:24.000000000 +0000 @@ -132,11 +132,11 @@ --connection node_2 CALL mtr.add_suppression('SYNC message from member'); -CALL mtr.add_suppression('user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket 
is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('WSREP: user message in state LEAVING'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); --connection node_3 CALL mtr.add_suppression('WSREP: user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,6 @@ --let $node_1 = node_1 --let $node_2 = node_2 --let $node_3 = node_3 - --source ../galera/include/auto_increment_offset_save.inc --connection node_1 @@ -195,7 +194,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); --connection node_3 CALL mtr.add_suppression("WSREP: no nodes coming from prim view, prim not possible"); @@ -210,7 +209,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL 
mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); SHOW CREATE TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ loose-galera-ssl-reload=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem' 
+wsrep_provider_options='socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,4 @@ +!include ../galera_3nodes.cnf + +[mysqld.2] +wsrep_sst_donor=node1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,64 @@ +# +# 
Construct a situation where Donor node partitions in the +# middle of SST. The Donor should stay in non-Primary state instead of +# crashing in assertion in wsrep-lib. +# +# In the test, node_2 is restarted and node_1 configured to be +# the donor. Node_1 execution is stopped before sst_sent() is +# called and node_1 is made to partition from the cluster. +# + +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/big_test.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +--connection node_2 +--source include/shutdown_mysqld.inc +--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat + +--connection node_1 +SET GLOBAL debug_dbug = '+d,sync.wsrep_sst_donor_after_donation'; + +--connection node_2 +--source include/start_mysqld.inc + +--connection node_1 +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached'; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'non-Primary' FROM information_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status' +--source include/wait_condition.inc + +SET DEBUG_SYNC = 'now SIGNAL signal.wsrep_sst_donor_after_donation_continue'; +SET DEBUG_SYNC = 'RESET'; +SET GLOBAL debug_dbug = ''; + +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +SET SESSION wsrep_sync_wait=15; + +--let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM information_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size' +--connection node_1 +--source include/wait_condition.inc +--connection node_2 +--source include/wait_condition.inc +--connection node_3 + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 'ON' FROM information_schema.global_status 
WHERE VARIABLE_NAME = 'wsrep_ready' +--source include/wait_condition.inc + +--source ../galera/include/auto_increment_offset_restore.inc + +--connection node_1 +CALL mtr.add_suppression("WSREP: sst sent called when not SST donor, state CONNECTED"); +CALL mtr.add_suppression("WSREP: .* returned an error: Not connected to Primary Component"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -69,7 +69,6 @@ --connection node_2 SHOW CREATE TABLE t1; -CALL mtr.add_suppression("is inconsistent with group"); --connection node_3 SHOW CREATE TABLE t1; @@ -83,6 +82,7 @@ # restart node so we don't fail on WSREP_START_POSITION internal check --source include/restart_mysqld.inc --source include/wait_until_connected_again.inc +CALL mtr.add_suppression("WSREP: .+ is inconsistent with group"); --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connection node_1 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,7 @@ --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connection node_1 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 @@ -55,4 +56,3 @@ SELECT cluster_uuid = (SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_state_uuid') FROM mysql.wsrep_cluster_members; --source ../galera/include/auto_increment_offset_restore.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,4 @@ wsrep-ignore-apply-errors=0 [ENV] -galera_cluster_size = 3 - +galera_cluster_size=3 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,7 +2,7 @@ connection node_1; connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; -connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3; +connect node_3, 127.0.0.1, 
root, , test, $NODE_MYPORT_3; connect node_3a, 127.0.0.1, root, , test, $NODE_MYPORT_3; connection node_1; connection node_2; @@ -45,7 +45,7 @@ SET SESSION wsrep_sync_wait = DEFAULT; SET DEBUG_SYNC = 'now SIGNAL continue'; connection node_2; -ERROR HY000: Got error 6 "No such device or address" during COMMIT +ERROR HY000: Error while appending streaming replication fragment(provider status: Not connected to Primary Component) connection node_2a; SET DEBUG_SYNC = 'RESET'; connection node_1a; @@ -74,15 +74,15 @@ SET SESSION wsrep_sync_wait = 0; SET SESSION wsrep_sync_wait = DEFAULT; connection node_1a; -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 connection node_2a; -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 connection node_3a; -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result 2025-05-19 16:14:24.000000000 +0000 @@ -44,7 +44,7 @@ SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 -call mtr.add_suppression("WSREP: node uuid:.*"); +call mtr.add_suppression("WSREP: node uuid:"); connection node_1; DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/suite.pm mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/suite.pm 2025-01-30 11:01:23.000000000 +0000 
+++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -9,38 +9,39 @@ push @::global_suppressions, ( - qr(WSREP: wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), qr(WSREP: last inactive check more than .* skipping check), qr(WSREP: SQL statement was ineffective), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. 
Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). Will retry in new primary component.), qr(WSREP: Could not find peer:), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr(WSREP: JOIN message from member .* in non-primary configuration. 
Ignored.), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, ); bless { }; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test 2025-05-19 16:14:24.000000000 +0000 @@ -85,4 +85,5 @@ --connection node_2 CALL mtr.add_suppression("WSREP: failed to send SR rollback for "); + --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,6 +5,7 @@ --source include/galera_cluster.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ --source include/force_restart.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,7 +21,7 @@ --connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 --connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 ---connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 +--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connect node_3a, 127.0.0.1, root, , test, $NODE_MYPORT_3 # Save original auto_increment_offset values. 
@@ -158,15 +158,15 @@ --connection node_1a --let $wait_condition = SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log --source include/wait_condition.inc -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; --connection node_2a --let $wait_condition = SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log --source include/wait_condition.inc -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; --connection node_3a --let $wait_condition = SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log --source include/wait_condition.inc -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,6 +6,7 @@ # Test the effect of gmcast.isolate on master during an SR transaction # --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connection node_1 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,6 +6,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 @@ -65,4 +66,5 @@ --connection node_1 --disconnect node_1a DROP TABLE t1; + --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --source include/force_restart.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 @@ -88,7 +89,7 @@ --connection node_2 SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; # As noted above sometimes node delivers the same view twice -call mtr.add_suppression("WSREP: node uuid:.*"); +call mtr.add_suppression("WSREP: node uuid:"); --connection node_1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,4 @@ !include ../galera_3nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.weight=3' - +wsrep_provider_options='pc.weight=3;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1 @@ --wsrep-ignore-apply-errors=0 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/disabled.def mariadb-10.11.13/mysql-test/suite/galera_sr/disabled.def --- mariadb-10.11.11/mysql-test/suite/galera_sr/disabled.def 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/disabled.def 2025-05-19 16:14:24.000000000 +0000 @@ -9,7 +9,3 @@ # Do not use any TAB characters for whitespace. # ############################################################################## - -GCF-1060 : MDEV-32160 GCF-1060 test failure due to wsrep MDL conflict -# Links to below failures in MDEV-30172 -MDEV-25718 : timeout related to wsrep_sync_wait and DEBUG_SYNC diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/MENT-2042.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/MENT-2042.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/MENT-2042.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/MENT-2042.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +connection node_2; +connection node_1; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connection node_1; +CREATE TABLE t1 (f1 INTEGER PRIMARY KEY); +XA START 'a'; +ERROR 42000: This version of MariaDB doesn't yet support 'XA transactions with Galera replication' +DROP TABLE t1; +disconnect node_1a; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result --- 
mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ connection node_2; connection node_1; -CALL mtr.add_suppression("WSREP: discarding established.*"); +CALL mtr.add_suppression("WSREP: discarding established"); connection node_1; connection node_2; connection node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,7 @@ connection node_1; connection node_1; connection node_2; +connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; SET SESSION wsrep_trx_fragment_size = 1; SET AUTOCOMMIT=OFF; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_myisam.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_myisam.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_myisam.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -14,3 +14,4 @@ 1 DROP TABLE t1; connection node_1; +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result 2025-05-19 16:14:24.000000000 +0000 @@ -25,7 +25,7 @@ connection node_1; Got one of the 
listed errors connection node_2; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; SET GLOBAL debug_dbug = ''; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/suite.pm mariadb-10.11.13/mysql-test/suite/galera_sr/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera_sr/suite.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -9,62 +9,64 @@ push @::global_suppressions, ( - qr(WSREP: wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), - qr(WSREP: last inactive check more than .* skipping check), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: last inactive check more than .+ skipping check), + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. 
Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Maximum writeset size exceeded by .*), - qr(WSREP: transaction size exceeded.*), - qr(WSREP: RBR event .*), - qr(WSREP: Ignoring error for TO isolated action: .*), - qr(WSREP: transaction size limit .*), - qr(WSREP: rbr write fail, .*), - qr(WSREP: .*Backend not supported: foo.*), - qr(WSREP: .*Failed to initialize backend using .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: Maximum writeset size exceeded by ), + qr(WSREP: transaction size exceeded), + qr(WSREP: RBR event ), + qr(WSREP: Ignoring error for TO isolated action: ), + qr(WSREP: transaction size limit ), + qr(WSREP: rbr write fail, ), + qr(WSREP: .*Backend not supported: foo), + qr(WSREP: .*Failed to initialize backend using ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Socket type not supported), qr(WSREP: failed to open gcomm backend connection: 110: failed 
to reach primary view: 110 .*), - qr(WSREP: .*Failed to open backend connection: -110 .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: .*Failed to open backend connection: -110 ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Connection timed out), qr|WSREP: wsrep::connect\(.*\) failed: 7|, - qr(WSREP: SYNC message from member .* in non-primary configuration. Ignored.), + qr(WSREP: SYNC message from member .+ ?in non-primary configuration\. Ignored\.), qr(WSREP: Could not find peer:), - qr(WSREP: TO isolation failed for: .*), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, - qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled. Expect abort.|, + qr(WSREP: TO isolation failed for: ), + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, + qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled\. Expect abort\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr(WSREP: discarding established .*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr|WSREP: JOIN message from member .* in non-primary configuration. 
Ignored.|, - qr|Query apply failed:*|, - qr(WSREP: Ignoring error*), - qr(WSREP: Failed to remove page file .*), - qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to .*), - qr|WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). Will retry in new primary component.|, + qr(WSREP: discarding established ), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr|WSREP: .*Query apply failed:|, + qr(WSREP: Ignoring error), + qr(WSREP: Failed to remove page file ), + qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to ), + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, qr|WSREP: Trying to continue unpaused monitor|, qr|WSREP: Wait for gtid returned error 3 while waiting for prior transactions to commit before setting position|, + qr|WSREP: Failed to report last committed|, ); bless { }; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-27615.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-27615.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-27615.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-27615.test 2025-05-19 16:14:24.000000000 +0000 @@ -69,5 +69,4 @@ --disconnect node_2 --connect node_2, 127.0.0.1, root, , test, $NODE_MYPORT_2 - --source suite/galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-28971.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-28971.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-28971.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-28971.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ # --source include/galera_cluster.inc +--source include/have_sequence.inc CREATE SEQUENCE SEQ NOCACHE ENGINE=InnoDB; SET SESSION wsrep_trx_fragment_size=1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/MENT-2042.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/MENT-2042.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/MENT-2042.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/MENT-2042.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,23 @@ +# +# MENT-2042 Assertion `bf_aborted()' failed in wsrep::transaction::xa_replay_common() +# + +--source include/galera_cluster.inc +--source include/have_debug_sync.inc + +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 + +--connection node_1 +--let connection_id = `SELECT CONNECTION_ID()` + +CREATE TABLE t1 (f1 INTEGER PRIMARY KEY); + +# +# Execute XA transaction up to COMMIT +# + +--error ER_NOT_SUPPORTED_YET +XA START 'a'; + +DROP TABLE t1; +--disconnect node_1a diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,7 +7,7 @@ # leave the cluster. # -CALL mtr.add_suppression("WSREP: discarding established.*"); +CALL mtr.add_suppression("WSREP: discarding established"); # Save original auto_increment_offset values. 
--let $node_1=node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ - --log-bin --log-slave-updates --loose-galera-sr-gtid-unique +--log-bin --log-slave-updates --loose-galera-sr-gtid-unique diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,8 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.recovery=false' +wsrep_provider_options='pc.recovery=false;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + auto_increment_offset=1 [mysqld.2] diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test 2025-05-19 16:14:24.000000000 +0000 @@ -11,6 +11,8 @@ --let $node_2=node_2 --source ../../galera/include/auto_increment_offset_save.inc +--connection node_1 + CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; SET SESSION wsrep_trx_fragment_size = 1; SET AUTOCOMMIT=OFF; 
@@ -26,7 +28,6 @@ --let $wait_condition = SELECT COUNT(*) > 0 FROM mysql.wsrep_streaming_log; --source include/wait_condition.inc - # # Kill the entire cluster and restart # diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.weight=2' +wsrep_provider_options='pc.weight=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_myisam.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_myisam.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_myisam.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -22,6 +22,4 @@ DROP TABLE t1; --connection node_1 ---disable_query_log SET GLOBAL wsrep_mode = DEFAULT; ---enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,8 +4,7 @@ # causes the first MTR connection to be forefully dropped by Galera, which in turn confuses MTR [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test 2025-05-19 16:14:24.000000000 +0000 @@ -85,4 +85,3 @@ # Restore original auto_increment_offset values. --source ../galera/include/auto_increment_offset_restore.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,7 @@ --let $node_1=node_1 --let $node_2=node_2 --source ../galera/include/auto_increment_offset_save.inc + --connection node_2 call mtr.add_suppression("WSREP: Failed to scan the last segment to the end\\. Last events may be missing\\. 
Last recovered event: "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=16K' -[mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=16K' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=16K' +[mysqld.2] +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=16K' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test 2025-05-19 16:14:24.000000000 +0000 @@ -18,4 +18,3 @@ --connection node_2 --source include/galera_wait_ready.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test 2025-05-19 16:14:24.000000000 +0000 @@ -47,7 +47,7 @@ --reap --connection node_2 -SET GLOBAL 
wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; SET GLOBAL debug_dbug = ''; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test 2025-05-19 16:14:24.000000000 +0000 @@ -40,7 +40,6 @@ --connection node_1 SELECT COUNT(*) = 6 FROM t1; - --connection node_2 SELECT COUNT(*) = 6 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,7 +39,3 @@ --connection node_1 DROP TABLE t1; DROP TABLE t2; - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_basic.result mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_basic.result --- mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_basic.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_basic.result 2025-05-19 16:14:24.000000000 +0000 @@ -86,6 +86,8 @@ DROP INDEX idx1 ON t; DROP INDEX idx2 ON t; DROP TABLE t; +# restart +set default_storage_engine=innodb; /* Test large BLOB data */ CREATE TABLE `t` ( `a` BLOB, diff -Nru mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_stats.result mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_stats.result --- mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_stats.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_stats.result 2025-05-19 16:14:24.000000000 +0000 @@ -38,6 +38,10 @@ idxa n_diff_pfx02 a,DB_ROW_ID idxa n_leaf_pages Number of leaf pages in the index idxa size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index vidxcd n_diff_pfx01 c vidxcd n_diff_pfx02 c,d vidxcd n_diff_pfx03 c,d,DB_ROW_ID @@ -54,6 +58,14 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxcd n_diff_pfx01 d +vidxcd n_diff_pfx02 d,DB_ROW_ID +vidxcd n_leaf_pages Number of leaf pages in the index +vidxcd size Number of pages in the index ALTER TABLE t ADD INDEX vidxe (e), ALGORITHM=INPLACE; select count(*) from t; count(*) @@ -65,6 +77,18 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxcd n_diff_pfx01 d +vidxcd n_diff_pfx02 d,DB_ROW_ID +vidxcd n_leaf_pages Number of leaf pages in the index +vidxcd size Number of pages in the index +vidxe n_diff_pfx01 e +vidxe n_diff_pfx02 e,DB_ROW_ID +vidxe n_leaf_pages Number of leaf pages in the index +vidxe size Number of pages in the index ALTER TABLE t ADD COLUMN f INT GENERATED ALWAYS AS(a + a), ADD INDEX vidxf (f), ALGORITHM=INPLACE; select count(*) from t; count(*) @@ -76,6 +100,22 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID 
+idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxcd n_diff_pfx01 d +vidxcd n_diff_pfx02 d,DB_ROW_ID +vidxcd n_leaf_pages Number of leaf pages in the index +vidxcd size Number of pages in the index +vidxe n_diff_pfx01 e +vidxe n_diff_pfx02 e,DB_ROW_ID +vidxe n_leaf_pages Number of leaf pages in the index +vidxe size Number of pages in the index +vidxf n_diff_pfx01 f +vidxf n_diff_pfx02 f,DB_ROW_ID +vidxf n_leaf_pages Number of leaf pages in the index +vidxf size Number of pages in the index ALTER TABLE t DROP INDEX vidxcd; SELECT index_name, stat_name, stat_description FROM mysql.innodb_index_stats @@ -84,4 +124,16 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxe n_diff_pfx01 e +vidxe n_diff_pfx02 e,DB_ROW_ID +vidxe n_leaf_pages Number of leaf pages in the index +vidxe size Number of pages in the index +vidxf n_diff_pfx01 f +vidxf n_diff_pfx02 f,DB_ROW_ID +vidxf n_leaf_pages Number of leaf pages in the index +vidxf size Number of pages in the index DROP TABLE t; diff -Nru mariadb-10.11.11/mysql-test/suite/gcol/t/innodb_virtual_basic.test mariadb-10.11.13/mysql-test/suite/gcol/t/innodb_virtual_basic.test --- mariadb-10.11.11/mysql-test/suite/gcol/t/innodb_virtual_basic.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/gcol/t/innodb_virtual_basic.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ --source include/have_innodb.inc --source include/have_partition.inc ---source include/big_test.inc +--source include/not_embedded.inc call mtr.add_suppression("\\[Warning\\] InnoDB: Compute virtual"); @@ -66,6 +66,41 @@ DROP INDEX idx2 ON t; DROP TABLE t; +let MYSQLD_DATADIR=`select @@datadir`; +let PAGE_SIZE=`select @@innodb_page_size`; 
+--source include/shutdown_mysqld.inc +perl; +do "$ENV{MTR_SUITE_DIR}/../innodb/include/crc32.pl"; +my $file = "$ENV{MYSQLD_DATADIR}/ibdata1"; +open(FILE, "+<$file") || die "Unable to open $file"; +binmode FILE; +my $ps= $ENV{PAGE_SIZE}; +my $page; +die "Unable to read $file" unless sysread(FILE, $page, $ps) == $ps; +my $full_crc32 = unpack("N",substr($page,54,4)) & 0x10; # FIL_SPACE_FLAGS +sysseek(FILE, 7*$ps, 0) || die "Unable to seek $file\n"; +die "Unable to read $file" unless sysread(FILE, $page, $ps) == $ps; +substr($page,54,4)=pack("N",0xc001cafe); # 32 MSB of 64-bit DICT_HDR_INDEX_ID +my $polynomial = 0x82f63b78; # CRC-32C +if ($full_crc32) +{ + my $ck = mycrc32(substr($page, 0, $ps-4), 0, $polynomial); + substr($page, $ps-4, 4) = pack("N", $ck); +} +else +{ + my $ck= pack("N",mycrc32(substr($page, 4, 22), 0, $polynomial) ^ + mycrc32(substr($page, 38, $ps - 38 - 8), 0, $polynomial)); + substr($page,0,4)=$ck; + substr($page,$ps-8,4)=$ck; +} +sysseek(FILE, 7*$ps, 0) || die "Unable to rewind $file\n"; +syswrite(FILE, $page, $ps)==$ps || die "Unable to write $file\n"; +close(FILE) || die "Unable to close $file"; +EOF +--source include/start_mysqld.inc +set default_storage_engine=innodb; + /* Test large BLOB data */ CREATE TABLE `t` ( `a` BLOB, diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/alter_copy_bulk.result mariadb-10.11.13/mysql-test/suite/innodb/r/alter_copy_bulk.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/alter_copy_bulk.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/alter_copy_bulk.result 2025-05-19 16:14:24.000000000 +0000 @@ -91,3 +91,24 @@ ALTER TABLE t1 FORCE, ALGORITHM=COPY; DROP TABLE t1; SET GLOBAL innodb_stats_persistent=@default_stats_persistent; +# +# MDEV-36504 Memory leak after insert into empty table +# +CREATE TABLE t1 (k INT PRIMARY KEY)ENGINE=InnoDB; +INSERT INTO t1 SET k= 1; +START TRANSACTION; +INSERT INTO t1 SET k= 2; +SELECT COUNT(*) > 0 FROM mysql.innodb_index_stats LOCK IN 
SHARE MODE; +COUNT(*) > 0 +1 +connect con1,localhost,root,,,; +SET innodb_lock_wait_timeout=0; +CREATE TABLE t2(f1 INT DEFAULT 1 PRIMARY KEY) +STATS_PERSISTENT= 1 ENGINE=InnoDB as SELECT k FROM t1; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction +disconnect con1; +connection default; +SET innodb_lock_wait_timeout=default; +DROP TABLE t1; +DROP TABLE IF EXISTS t2; +# restart diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/alter_partitioned_debug.result mariadb-10.11.13/mysql-test/suite/innodb/r/alter_partitioned_debug.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/alter_partitioned_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/alter_partitioned_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ CREATE TABLE t1 (a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=0 PARTITION BY RANGE(a) (PARTITION pa VALUES LESS THAN (3), PARTITION pb VALUES LESS THAN (5)); @@ -19,9 +20,30 @@ ERROR 23000: Duplicate entry '2-two' for key 'a' connection default; DELETE FROM t1; -disconnect ddl; SET DEBUG_SYNC = 'RESET'; CHECK TABLE t1; Table Op Msg_type Msg_text test.t1 check status OK -DROP TABLE t1; +CREATE TABLE t(a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=1; +RENAME TABLE t TO u; +DELETE FROM mysql.innodb_table_stats WHERE table_name='u'; +DELETE FROM mysql.innodb_index_stats WHERE table_name='u'; +SET STATEMENT debug_dbug='+d,dict_stats_save_exit_notify_and_wait' FOR +SELECT * FROM u; +connection ddl; +SET DEBUG_SYNC='open_tables_after_open_and_process_table +WAIT_FOR dict_stats_save_finished'; +ALTER TABLE t1 EXCHANGE PARTITION pb WITH TABLE u; +connect sync,localhost,root; +SET DEBUG_SYNC='now SIGNAL dict_stats_save_unblock'; +disconnect sync; +connection default; +a b +connection ddl; +disconnect ddl; +connection default; +SELECT * FROM u; +a b +SET DEBUG_SYNC = 'RESET'; +DROP TABLE t1,u; diff -Nru 
mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,6 @@ -@@ -13,212 +13,212 @@ +--- autoinc_persist.result ++++ autoinc_persist.result,desc +@@ -13,224 +13,224 @@ # # Pre-create several tables SET SQL_MODE='STRICT_ALL_TABLES'; @@ -296,8 +298,7 @@ +2 +1 +CREATE TABLE t11(a FLOAT AUTO_INCREMENT, PRIMARY KEY(a DESC)) ENGINE = InnoDB; - INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), - (20), (30), (31); + INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t11; a --10 @@ -310,7 +311,7 @@ -20 -30 31 --CREATE TABLE t12(a DOUBLE AUTO_INCREMENT KEY) ENGINE = InnoDB; +-CREATE TABLE t11u(a FLOAT UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +30 +20 +5 @@ -320,9 +321,30 @@ +1 +-1 +-10 ++CREATE TABLE t11u(a FLOAT UNSIGNED AUTO_INCREMENT, PRIMARY KEY(a DESC)) ENGINE = InnoDB; + INSERT INTO t11u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); + ERROR 22003: Out of range value for column 'a' at row 5 + INSERT INTO t11u VALUES(0), (0), (0), (0), (0), (20), (30), (31); + SELECT * FROM t11u; + a +-11 +-12 +-13 +-14 +-15 +-20 +-30 + 31 +-CREATE TABLE t12(a DOUBLE AUTO_INCREMENT KEY) ENGINE = InnoDB; ++30 ++20 ++15 ++14 ++13 ++12 ++11 +CREATE TABLE t12(a DOUBLE AUTO_INCREMENT, PRIMARY KEY(a DESC)) ENGINE = InnoDB; - INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), - (20), (30), (31); + INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t12; a --10 @@ -344,10 +366,10 @@ +1 +-1 +-10 - # Scenario 1: Normal restart, to test if the counters are persisted - # Scenario 2: Delete some values, to test the counters should not be the - # one which is the largest in 
current table -@@ -242,14 +242,14 @@ + CREATE TABLE t12u(a DOUBLE UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; + INSERT INTO t12u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); + ERROR 22003: Out of range value for column 'a' at row 5 +@@ -268,14 +268,14 @@ SELECT MAX(a) AS `Expect 100000000000` FROM t9; Expect 100000000000 100000000000 @@ -364,7 +386,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=1234 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t13 VALUES(0); SELECT a AS `Expect 1234` FROM t13; -@@ -464,28 +464,28 @@ +@@ -490,28 +490,28 @@ INSERT INTO t1 VALUES(0), (0); SELECT * FROM t1; a @@ -398,7 +420,7 @@ # Ensure that all changes before the server is killed are persisted. set global innodb_flush_log_at_trx_commit=1; TRUNCATE TABLE t1; -@@ -498,63 +498,63 @@ +@@ -524,63 +524,63 @@ INSERT INTO t19 VALUES(0), (0); SELECT * FROM t19; a @@ -481,7 +503,7 @@ DELETE FROM t3 WHERE a > 300; SELECT MAX(a) AS `Expect 200` FROM t3; Expect 200 -@@ -566,7 +566,7 @@ +@@ -592,7 +592,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -490,7 +512,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=201 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t3 VALUES(0); SELECT MAX(a) AS `Expect 201` FROM t3; -@@ -579,7 +579,7 @@ +@@ -605,7 +605,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -499,7 +521,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=500 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t3 VALUES(0); SELECT MAX(a) AS `Expect 500` FROM t3; -@@ -591,13 +591,13 @@ +@@ -617,13 +617,13 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -515,7 +537,7 @@ INSERT INTO t3 VALUES(150), (180); UPDATE t3 SET a = 200 WHERE a = 150; INSERT INTO t3 VALUES(220); -@@ -607,7 +607,7 @@ +@@ -633,7 +633,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -524,7 +546,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=221 DEFAULT 
CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t3 VALUES(0); SELECT MAX(a) AS `Expect 221` FROM t3; -@@ -619,7 +619,7 @@ +@@ -645,7 +645,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -533,7 +555,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=120 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # MDEV-6076: Test adding an AUTO_INCREMENT COLUMN CREATE TABLE mdev6076a (b INT) ENGINE=InnoDB; -@@ -669,18 +669,18 @@ +@@ -695,18 +695,18 @@ INSERT INTO t_inplace SELECT * FROM t3; SELECT * FROM t_inplace; a @@ -559,7 +581,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=211 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This will keep the autoinc counter ALTER TABLE t_inplace AUTO_INCREMENT = 250, ALGORITHM = INPLACE; -@@ -689,7 +689,7 @@ +@@ -715,7 +715,7 @@ Table Create Table t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -568,7 +590,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=250 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should keep the autoinc counter as well ALTER TABLE t_inplace ADD COLUMN b INT, ALGORITHM = INPLACE; -@@ -699,16 +699,16 @@ +@@ -725,16 +725,16 @@ t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -590,7 +612,7 @@ # This should reset the autoinc counter to the one specified # Since it's smaller than current one but bigger than existing # biggest counter in the table -@@ -719,7 +719,7 @@ +@@ -745,7 +745,7 @@ t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -599,7 +621,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=180 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should reset the autoinc counter to the next value of # current max counter in the table, since the specified value -@@ -730,7 +730,7 @@ +@@ -756,7 +756,7 @@ Table Create Table t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -608,7 +630,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=123 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t_inplace VALUES(0), (0); SELECT MAX(a) AS `Expect 124` FROM t_inplace; -@@ -757,18 +757,18 @@ +@@ -783,18 +783,18 @@ INSERT INTO t_copy SELECT * FROM t3; SELECT * FROM t_copy; a @@ -634,7 +656,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=211 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This will keep the autoinc counter ALTER TABLE t_copy AUTO_INCREMENT = 250, ALGORITHM = COPY; -@@ -777,7 +777,7 @@ +@@ -803,7 +803,7 @@ Table Create Table t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -643,7 +665,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=250 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should keep the autoinc counter as well ALTER TABLE t_copy ADD COLUMN b INT, ALGORITHM = COPY; -@@ -787,16 +787,16 @@ +@@ -813,16 +813,16 @@ t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -665,7 +687,7 @@ # This should reset the autoinc counter to the one specified # Since it's smaller than current one but bigger than existing # biggest counter in the table -@@ -807,7 +807,7 @@ +@@ -833,7 +833,7 @@ t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -674,7 +696,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=180 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should reset the autoinc counter to the next value of # current max counter in the table, since the specified value -@@ -818,7 +818,7 @@ +@@ -844,7 +844,7 @@ Table Create Table t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -683,7 +705,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=123 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t_copy VALUES(0), (0); SELECT MAX(a) AS `Expect 124` FROM t_copy; -@@ -842,7 +842,7 @@ +@@ -868,7 +868,7 @@ 126 DROP TABLE t_copy, it_copy; # Scenario 9: Test the sql_mode = NO_AUTO_VALUE_ON_ZERO @@ -692,7 +714,7 @@ set SQL_MODE = NO_AUTO_VALUE_ON_ZERO; INSERT INTO t30 VALUES(NULL, 
1), (200, 2), (0, 3); INSERT INTO t30(b) VALUES(4), (5), (6), (7); -@@ -869,20 +869,20 @@ +@@ -895,20 +895,20 @@ set global innodb_flush_log_at_trx_commit=1; CREATE TABLE t31 (a INT) ENGINE = InnoDB; INSERT INTO t31 VALUES(1), (2); @@ -719,7 +741,7 @@ INSERT INTO t32 VALUES(0), (0); # Ensure that all changes before the server is killed are persisted. set global innodb_flush_log_at_trx_commit=1; -@@ -897,7 +897,7 @@ +@@ -923,7 +923,7 @@ # increasing the counter CREATE TABLE t33 ( a BIGINT NOT NULL PRIMARY KEY, @@ -728,7 +750,7 @@ INSERT INTO t33 VALUES(1, NULL); INSERT INTO t33 VALUES(2, NULL); INSERT INTO t33 VALUES(2, NULL); -@@ -920,13 +920,13 @@ +@@ -946,13 +946,13 @@ INSERT INTO t31(a) VALUES(6), (0); SELECT * FROM t31; a b @@ -748,7 +770,7 @@ DROP TABLE t31; set SQL_MODE = NO_AUTO_VALUE_ON_ZERO; DELETE FROM t30 WHERE a = 0; -@@ -965,7 +965,7 @@ +@@ -991,7 +991,7 @@ DROP TABLE t33; CREATE TABLE t33 ( a BIGINT NOT NULL PRIMARY KEY, @@ -757,7 +779,7 @@ ALTER TABLE t33 DISCARD TABLESPACE; restore: t33 .ibd and .cfg files ALTER TABLE t33 IMPORT TABLESPACE; -@@ -975,7 +975,7 @@ +@@ -1001,8 +1001,8 @@ 4 SELECT * FROM t33; a b @@ -766,4 +788,5 @@ 3 4 +2 2 +10 1 - DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t30, t32, t33; + DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t11u, t12u, + t30, t32, t33; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist.result mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist.result 2025-05-19 16:14:24.000000000 +0000 @@ -190,8 +190,7 @@ 100000000000 100000000006 CREATE TABLE t11(a FLOAT AUTO_INCREMENT KEY) ENGINE = InnoDB; -INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t11; a -10 @@ 
-204,9 +203,22 @@ 20 30 31 +CREATE TABLE t11u(a FLOAT UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +INSERT INTO t11u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +ERROR 22003: Out of range value for column 'a' at row 5 +INSERT INTO t11u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t11u; +a +11 +12 +13 +14 +15 +20 +30 +31 CREATE TABLE t12(a DOUBLE AUTO_INCREMENT KEY) ENGINE = InnoDB; -INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t12; a -10 @@ -219,6 +231,20 @@ 20 30 31 +CREATE TABLE t12u(a DOUBLE UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +INSERT INTO t12u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +ERROR 22003: Out of range value for column 'a' at row 5 +INSERT INTO t12u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t12u; +a +11 +12 +13 +14 +15 +20 +30 +31 # Scenario 1: Normal restart, to test if the counters are persisted # Scenario 2: Delete some values, to test the counters should not be the # one which is the largest in current table @@ -978,4 +1004,5 @@ 10 1 2 2 3 4 -DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t30, t32, t33; +DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t11u, t12u, +t30, t32, t33; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/buf_pool_resize_oom.result mariadb-10.11.13/mysql-test/suite/innodb/r/buf_pool_resize_oom.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/buf_pool_resize_oom.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/buf_pool_resize_oom.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -# -# Bug #21348684 SIGABRT DURING RESIZING THE INNODB BUFFER POOL -# ONLINE WITH MEMORY FULL CONDITION -# -call mtr.add_suppression("InnoDB: failed to allocate the chunk array"); -SET GLOBAL debug_dbug='+d,buf_pool_resize_chunk_null'; -SET GLOBAL 
innodb_buffer_pool_size=@@innodb_buffer_pool_size + 1048576; -# restart diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/doublewrite.result mariadb-10.11.13/mysql-test/suite/innodb/r/doublewrite.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/doublewrite.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/doublewrite.result 2025-05-19 16:14:24.000000000 +0000 @@ -11,9 +11,11 @@ commit work; SET GLOBAL innodb_fast_shutdown = 0; # restart +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +SET GLOBAL innodb_max_dirty_pages_pct=99; connect dml,localhost,root,,; XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); +insert into t1 values(6, repeat('%', @@innodb_page_size/2)); XA END 'x'; XA PREPARE 'x'; disconnect dml; @@ -23,7 +25,6 @@ # restart FOUND 1 /InnoDB: Recovered page \[page id: space=[1-9][0-9]*, page number=0\]/ in mysqld.1.err # restart -XA ROLLBACK 'x'; check table t1; Table Op Msg_type Msg_text test.t1 check status OK @@ -34,18 +35,13 @@ 3 //////////// 4 ------------ 5 ............ 
-connect dml,localhost,root,,; -XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); -XA END 'x'; -XA PREPARE 'x'; -disconnect dml; -connection default; -flush table t1 for export; +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +SET GLOBAL innodb_max_dirty_pages_pct=99; +XA ROLLBACK 'x'; +FLUSH TABLE t1 FOR EXPORT; # Kill the server # restart FOUND 4 /InnoDB: Recovered page \[page id: space=[1-9][0-9]*, page number=[03]\]/ in mysqld.1.err -XA ROLLBACK 'x'; check table t1; Table Op Msg_type Msg_text test.t1 check status OK diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/foreign_key.result mariadb-10.11.13/mysql-test/suite/innodb/r/foreign_key.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/foreign_key.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/foreign_key.result 2025-05-19 16:14:24.000000000 +0000 @@ -155,7 +155,6 @@ FLUSH TABLES; # restart disconnect incomplete; -SET @save_stats_persistent = @@GLOBAL.innodb_stats_persistent; SET GLOBAL innodb_stats_persistent = 0; INSERT INTO child SET a=0; INSERT INTO child SET a=1; @@ -1182,6 +1181,25 @@ ALTER TABLE t2 ADD KEY(b), ALGORITHM=NOCOPY; DELETE FROM t1; DROP TABLE t2, t1; +# +# MDEV-33167 ASAN errors after failing to load foreign key +# relation for the table +# +call mtr.add_suppression("InnoDB: Load table `test`.`t3` failed, the table has missing foreign key indexes. 
Turn off 'foreign_key_checks' and try again."); +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t1(f1 VARCHAR(8), +FOREIGN KEY(f1) REFERENCES test.t3(f1))ENGINE=InnoDB; +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t2(f1 VARCHAR(8), +FOREIGN KEY(f1) REFERENCES test.t3(f1)) +ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t3(f1 VARCHAR(8) PRIMARY KEY) +ENGINE=InnoDB DEFAULT CHARSET=latin1; +set GLOBAL innodb_fast_shutdown=0; +# restart +ALTER TABLE t2 FORCE; +DROP TABLE t2, t1, t3; # End of 10.6 tests CREATE TABLE t1 ( @@ -1204,5 +1222,4 @@ ADD UNIQUE INDEX(f3); ERROR HY000: Cannot delete rows from table which is parent in a foreign key constraint 't1_ibfk_1' of table 't1' drop table t1, t2; -SET GLOBAL innodb_stats_persistent = @save_stats_persistent; # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb-index-online.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb-index-online.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb-index-online.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb-index-online.result 2025-05-19 16:14:24.000000000 +0000 @@ -534,7 +534,6 @@ ROLLBACK; SET DEBUG_SYNC = 'now SIGNAL inserted'; connection con1; -disconnect con1; connection default; SELECT * FROM t1; a b @@ -543,6 +542,31 @@ Table Op Msg_type Msg_text test.t1 check status OK DROP TABLE t1; +# +# MDEV-36281 DML aborts during online virtual index +# +CREATE TABLE t1(f1 INT NOT NULL PRIMARY KEY, f2 INT NOT NULL, +f3 INT NOT NULL, f4 INT AS (f3) VIRTUAL, +f5 INT AS (f1) VIRTUAL, INDEX(f4))ENGINE=InnoDB; +INSERT INTO t1(f1, f2, f3) VALUES(1, 2, 3); +SET DEBUG_SYNC = 'innodb_inplace_alter_table_enter SIGNAL dml_start WAIT_FOR dml_finish'; +ALTER TABLE t1 ADD INDEX v1(f5, f2, f4), ADD INDEX v2(f3, f5); +connection con1; +set DEBUG_SYNC="now WAIT_FOR dml_start"; +UPDATE t1 SET f3= f3 + 1; +set DEBUG_SYNC="now SIGNAL dml_finish"; 
+disconnect con1; +connection default; +CHECK TABLE t1 EXTENDED; +Table Op Msg_type Msg_text +test.t1 check status OK +SELECT f5, f2, f4 FROM t1 USE INDEX(v1); +f5 f2 f4 +1 2 4 +SELECT f3, f5 FROM t1 USE INDEX(v2); +f3 f5 +4 1 +DROP TABLE t1; SET DEBUG_SYNC = 'RESET'; SET GLOBAL innodb_file_per_table = @global_innodb_file_per_table_orig; SET GLOBAL innodb_monitor_enable = default; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ -call mtr.add_suppression("InnoDB: Cannot allocate memory for the buffer pool"); +call mtr.add_suppression("InnoDB: Cannot map innodb_buffer_pool_size_max="); call mtr.add_suppression("InnoDB: Plugin initialization aborted at srv0start.cc.*"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error."); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed."); @@ -6,4 +6,4 @@ # MDEV-25019 memory allocation failures during startup cause server failure in different, confusing ways # # restart: --debug_dbug=+d,ib_buf_chunk_init_fails -FOUND 1 /\[ERROR\] InnoDB: Cannot allocate memory for the buffer pool/ in mysqld.1.err +FOUND 1 /\[ERROR\] InnoDB: Cannot map innodb_buffer_pool_size_max=16m/ in mysqld.1.err diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,32 +1,51 @@ +# +# MDEV-29445: Reorganize buffer pool (and 
remove chunks) +# set global innodb_adaptive_hash_index=ON; select @@innodb_buffer_pool_size; @@innodb_buffer_pool_size 8388608 +set global innodb_buffer_pool_size = 9437184; set global innodb_buffer_pool_size = 10485760; select @@innodb_buffer_pool_size; @@innodb_buffer_pool_size 10485760 -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view view0 as select 1 union all select 1; -set @`v_id` := 0; -set @`v_val` := 0; -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; -set global innodb_buffer_pool_size = 64 * 1024 * 1024 + 512 * 1024; -Warnings: -Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '67633152' -select @@innodb_buffer_pool_size; -@@innodb_buffer_pool_size -68157440 +create table t1 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED; +create table t2 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=$kbs; +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t1 SELECT seq*4,seq*4 FROM seq_1_to_262144; +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t2 SELECT seq*4,seq*4 FROM seq_1_to_16384; +set global innodb_buffer_pool_size = 7340032; select count(val) from t1; count(val) 262144 +select count(val) from t2; +count(val) +16384 set global innodb_adaptive_hash_index=OFF; -set global innodb_buffer_pool_size = 25165824; +set global innodb_buffer_pool_size = 24117248; +set global innodb_buffer_pool_size = 26214400; +Warnings: +Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '26214400' select @@innodb_buffer_pool_size; @@innodb_buffer_pool_size 25165824 select count(val) from 
t1; count(val) 262144 -drop table t1; -drop view view0; +select count(val) from t2; +count(val) +16384 +drop table t1,t2; +SET GLOBAL innodb_max_purge_lag_wait = 0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; +SET GLOBAL innodb_buffer_pool_size = @old_innodb_buffer_pool_size; +SET GLOBAL innodb_adaptive_hash_index = @old_innodb_adaptive_hash_index; +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -SET @save_size=@@innodb_buffer_pool_size; -# -# MDEV-27891: Delayed SIGSEGV in InnoDB buffer pool resize -# after or during DROP TABLE -# -select @@innodb_buffer_pool_chunk_size; -@@innodb_buffer_pool_chunk_size -1048576 -CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; -SET GLOBAL innodb_buffer_pool_size=256*1024*1024; -DROP TABLE t1; -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size + @@innodb_buffer_pool_chunk_size; -# End of 10.6 tests -SET GLOBAL innodb_buffer_pool_size=@save_size; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result 2025-05-19 
16:14:24.000000000 +0000 @@ -4,7 +4,32 @@ SET GLOBAL innodb_buffer_pool_size=16777216; CREATE TEMPORARY TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t1 SELECT seq FROM seq_1_to_200; +SET GLOBAL innodb_max_purge_lag_wait=0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +Variable_name Value +Innodb_buffer_pool_resize_status +connect con1,localhost,root; +SET DEBUG_SYNC='buf_pool_shrink_before_wakeup SIGNAL blocked WAIT_FOR go'; SET GLOBAL innodb_buffer_pool_size=8388608; +connection default; +SET DEBUG_SYNC='now WAIT_FOR blocked'; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +Variable_name Value +Innodb_buffer_pool_resize_status Withdrawing blocks. (505/505). +SET DEBUG_SYNC='now SIGNAL go'; +connection con1; +disconnect con1; +connection default; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +Variable_name Value +Innodb_buffer_pool_resize_status +SET DEBUG_SYNC=RESET; +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; SELECT COUNT(*),MIN(a),MAX(a) FROM t1; COUNT(*) MIN(a) MAX(a) 200 1 200 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -select @@innodb_buffer_pool_chunk_size; -@@innodb_buffer_pool_chunk_size -4194304 -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view 
view0 as select 1 union all select 1; -set @`v_id` := 0; -set @`v_val` := 0; -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; -set global innodb_buffer_pool_size = 7340032; -Warnings: -Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '7340032' -select count(val) from t1; -count(val) -262144 -set global innodb_buffer_pool_size = 16777216; -select count(val) from t1; -count(val) -262144 -drop table t1; -drop view view0; -set global innodb_buffer_pool_size = 2*1048576; -Warnings: -Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '2097152' -select @@innodb_buffer_pool_size; -@@innodb_buffer_pool_size -4194304 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_bug52663.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_bug52663.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_bug52663.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_bug52663.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,10 +1,11 @@ +SET @save_innodb_timeout=@@innodb_lock_wait_timeout; +SET GLOBAL innodb_lock_wait_timeout=1; set session transaction isolation level read committed; create table innodb_bug52663 (what varchar(5), id integer, count integer, primary key (what, id)) engine=innodb; insert into innodb_bug52663 values ('total', 0, 0); begin; connect addconroot, localhost, root,,; -connection addconroot; set session transaction isolation level read committed; begin; connection default; @@ -31,3 +32,4 @@ what id count total 0 2 drop table innodb_bug52663; +SET GLOBAL innodb_lock_wait_timeout=@save_innodb_timeout; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result 
mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,10 +1,18 @@ CREATE TABLE `t`(`id` INT, PRIMARY KEY(`id`)) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t VALUES (1); -SET GLOBAL innodb_monitor_reset = "module_innodb"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time_max"; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time'; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time_max'; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time_max"; BEGIN; SELECT * FROM t FOR UPDATE; id 1 +SELECT @innodb_row_lock_time_before := variable_value +FROM information_schema.global_status +WHERE LOWER(variable_name) = 'innodb_row_lock_time'; connect con1,localhost,root,,; SET innodb_lock_wait_timeout = 1; SELECT * FROM t FOR UPDATE; @@ -12,29 +20,27 @@ disconnect con1; connection default; COMMIT; -SELECT variable_value > 100 FROM information_schema.global_status +SELECT variable_value - @innodb_row_lock_time_before > 100 +FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time'; -variable_value > 100 +variable_value - @innodb_row_lock_time_before > 100 1 -SELECT variable_value > 100 FROM information_schema.global_status +SELECT variable_value > 100 +FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time_max'; variable_value > 100 1 -SELECT variable_value > 100 FROM information_schema.global_status -WHERE LOWER(variable_name) = 'innodb_row_lock_time_avg'; -variable_value > 100 -1 -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS -WHERE NAME="lock_row_lock_time"; -count_reset > 100 -1 -SELECT count_reset > 100 FROM 
INFORMATION_SCHEMA.INNODB_METRICS -WHERE NAME="lock_row_lock_time_max"; +SELECT count_reset > 100 +FROM INFORMATION_SCHEMA.INNODB_METRICS +WHERE NAME='lock_row_lock_time'; count_reset > 100 1 -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS -WHERE NAME="lock_row_lock_time_avg"; +SELECT count_reset > 100 +FROM INFORMATION_SCHEMA.INNODB_METRICS +WHERE NAME='lock_row_lock_time_max'; count_reset > 100 1 DROP TABLE t; -SET GLOBAL innodb_monitor_reset=default; +SET GLOBAL innodb_monitor_enable=default; +SET GLOBAL innodb_monitor_disable=default; +SET GLOBAL innodb_monitor_reset_all=default; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result 2025-05-19 16:14:24.000000000 +0000 @@ -5,13 +5,13 @@ SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 3 SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 0 +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 1 @@ -25,13 +25,13 @@ SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 3 SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE 
table_name = 't'; COUNT(*) 0 +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 1 @@ -45,13 +45,13 @@ SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 3 SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 0 +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_fetch.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_fetch.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_fetch.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_fetch.result 2025-05-19 16:14:24.000000000 +0000 @@ -125,7 +125,7 @@ table_name = 'test_ps_fetch' AND index_name = 'idx' AND stat_name = 'n_diff_pfx02'; -FLUSH TABLE test_ps_fetch; +RENAME TABLE test_ps_fetch TO tmp, tmp TO test_ps_fetch; SELECT seq_in_index, column_name, cardinality FROM information_schema.statistics WHERE table_name = 'test_ps_fetch' ORDER BY index_name, seq_in_index; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,9 @@ -@@ -527,6 +527,6 @@ +@@ -576,7 +576,7 @@ FROM information_schema.global_status WHERE variable_name = 'innodb_instant_alter_column'; instants -37 +38 - SET 
GLOBAL innodb_stats_persistent = @save_stats_persistent; - # End of 10.6 tests + CREATE TABLE t1(f1 INT, f2 TEXT)ENGINE=InnoDB; + INSERT INTO t1 VALUES(1, 'a'); + ALTER TABLE t1 ADD COLUMN f3 TEXT FIRST; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug.result mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -575,5 +575,16 @@ WHERE variable_name = 'innodb_instant_alter_column'; instants 37 +CREATE TABLE t1(f1 INT, f2 TEXT)ENGINE=InnoDB; +INSERT INTO t1 VALUES(1, 'a'); +ALTER TABLE t1 ADD COLUMN f3 TEXT FIRST; +SET STATEMENT DEBUG_DBUG="+d,instant_insert_fail" FOR +ALTER TABLE t1 DROP COLUMN f1; +ERROR HY000: Internal error: InnoDB: Insert into SYS_COLUMNS failed +ALTER TABLE t1 DROP COLUMN f1; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +DROP TABLE t1; SET GLOBAL innodb_stats_persistent = @save_stats_persistent; # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/lock_isolation.result mariadb-10.11.13/mysql-test/suite/innodb/r/lock_isolation.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/lock_isolation.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/lock_isolation.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,6 @@ +connect disable_purging,localhost,root; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; # # MDEV-26642 Weird SELECT view when a record is # modified to the same value by two transactions @@ -52,15 +55,17 @@ # MDEV-26643 Inconsistent behaviors of UPDATE under # READ UNCOMMITTED and READ COMMITTED isolation level # -CREATE TABLE t(a INT, b INT) ENGINE=InnoDB; +CREATE TABLE t(a INT, b INT) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t VALUES(NULL, 1), (2, 2); SET 
TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; BEGIN; UPDATE t SET a = 10; connection consistent; SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 20 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; connection consistent; SELECT * FROM t; @@ -74,8 +79,10 @@ UPDATE t SET a = 10; connection consistent; SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 20 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; connection consistent; SELECT * FROM t; @@ -89,8 +96,10 @@ UPDATE t SET a = 10; connection con_weird; SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 20 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; SELECT * FROM t; a b 10 1 @@ -113,8 +122,10 @@ connection consistent; SET TRANSACTION ISOLATION LEVEL READ COMMITTED; BEGIN; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 2 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; UPDATE t SET a = 1; COMMIT; connection consistent; @@ -128,20 +139,25 @@ # # MDEV-33802 Weird read view after ROLLBACK of other transactions # -CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB; -INSERT INTO t SET a=1; -BEGIN; -INSERT INTO t SET a=2; +CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB STATS_PERSISTENT=0; connection consistent; START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +INSERT INTO t SET a=1; +connection consistent; SAVEPOINT sp1; SELECT * FROM t FORCE INDEX (b) FOR UPDATE; ERROR HY000: Record has changed since last read in table 't' SAVEPOINT sp1; +connection default; +BEGIN; +INSERT INTO t SET a=2; connection con_weird; START TRANSACTION WITH CONSISTENT SNAPSHOT; +SET 
DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; SELECT * FROM t FORCE INDEX (b) FOR UPDATE; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; ROLLBACK; connection con_weird; a b @@ -149,12 +165,74 @@ SELECT * FROM t FORCE INDEX (b) FOR UPDATE; a b 1 NULL +COMMIT; disconnect con_weird; connection consistent; SELECT * FROM t FORCE INDEX (b) FOR UPDATE; a b 1 NULL +COMMIT; +connection default; +TRUNCATE TABLE t; +# +# MDEV-36639 innodb_snapshot_isolation=1 gives error for not comitted row changes +# +INSERT INTO t VALUES (1,1),(2,2); +connection default; +# Case 1: Transaction A modifies a record, transaction B with snapshot +# isolation level is blocked by A, then A is committed. +# Expected behaviour: B gets ER_CHECKREAD. +BEGIN; +UPDATE t SET b=3 WHERE a = 1; +connection consistent; +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t; +a b +1 1 +2 2 +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +SELECT * FROM t WHERE a=1 FOR UPDATE; +connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +COMMIT; +connection consistent; +ERROR HY000: Record has changed since last read in table 't' +# Case 2: Transaction A modifies a record, transaction B with snapshot +# isolation level is blocked by A, then A is rolled back. +# Expected behaviour: B continues execution. +connection default; +BEGIN; +UPDATE t SET b=4 WHERE a=1; +connection consistent; +BEGIN; +SELECT * FROM t; +a b +2 2 +1 3 +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +SELECT * FROM t WHERE a=1 FOR UPDATE; +connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +ROLLBACK; +connection consistent; +a b +1 3 +ROLLBACK; +# Case 3: Transaction B with snapshot isolation level started with +# consistent snapshot. Transaction A modifies a record and is committed. +# Both B tries to read modified by A record. +# Expected behavior: B gets ER_CHECKREAD. 
+connection consistent; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +UPDATE t SET b=4 WHERE a=1; +connection consistent; +SELECT * FROM t WHERE a=1 FOR UPDATE; +ERROR HY000: Record has changed since last read in table 't' disconnect consistent; +disconnect disable_purging; connection default; +SET DEBUG_SYNC="RESET"; DROP TABLE t; # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/lock_memory_debug.result mariadb-10.11.13/mysql-test/suite/innodb/r/lock_memory_debug.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/lock_memory_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/lock_memory_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ CREATE TABLE t1 (col1 INT) ENGINE=InnoDB; INSERT INTO t1 VALUES (1),(2),(3),(4),(5); SET STATEMENT debug_dbug='+d,innodb_skip_lock_bitmap' FOR -INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g LIMIT 45000; +INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g; ERROR HY000: The total number of locks exceeds the lock table size SELECT COUNT(*) FROM t1; COUNT(*) diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/log_upgrade_101_flags.result mariadb-10.11.13/mysql-test/suite/innodb/r/log_upgrade_101_flags.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/log_upgrade_101_flags.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/log_upgrade_101_flags.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ call mtr.add_suppression("InnoDB: The change buffer is corrupted"); call mtr.add_suppression("InnoDB: Tablespace size stored in header is 768 pages, but the sum of data file sizes is 384 pages"); call mtr.add_suppression("InnoDB: adjusting FSP_SPACE_FLAGS of file"); -# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-force-recovery=5 --innodb-log-file-size=4m 
--innodb_page_size=32k --innodb_buffer_pool_size=10M +# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-force-recovery=5 --innodb-log-file-size=4m --innodb_page_size=32k --innodb_buffer_pool_size=11M SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,11 @@ +--- mem_pressure.result ++++ mem_pressure,32bit.result +@@ -11,7 +11,7 @@ + @@GLOBAL.innodb_buffer_pool_size_auto_min, + @@GLOBAL.innodb_buffer_pool_size_max; + @@GLOBAL.innodb_buffer_pool_size @@GLOBAL.innodb_buffer_pool_size_auto_min @@GLOBAL.innodb_buffer_pool_size_max +-17825792 16777216 25165824 ++17825792 16777216 18874368 + CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; + SET GLOBAL innodb_limit_optimistic_insert_debug=2; + SET STATEMENT unique_checks=0, foreign_key_checks=0 FOR diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure.result mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,23 +4,34 @@ set @save_dbug=@@debug_dbug; set @save_limit=@@GLOBAL.innodb_limit_optimistic_insert_debug; set GLOBAL innodb_max_purge_lag_wait=0; +SET @innodb_buffer_pool_size= @@GLOBAL.innodb_buffer_pool_size; +SET @innodb_buffer_pool_size_min= @@GLOBAL.innodb_buffer_pool_size_auto_min; +SELECT +@@GLOBAL.innodb_buffer_pool_size, 
+@@GLOBAL.innodb_buffer_pool_size_auto_min, +@@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size @@GLOBAL.innodb_buffer_pool_size_auto_min @@GLOBAL.innodb_buffer_pool_size_max +17825792 16777216 25165824 CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; SET GLOBAL innodb_limit_optimistic_insert_debug=2; SET STATEMENT unique_checks=0, foreign_key_checks=0 FOR INSERT INTO t1 SELECT * FROM seq_1_to_1000; SET GLOBAL innodb_limit_optimistic_insert_debug=@save_limit; DROP TABLE t1; -SELECT CAST(VARIABLE_VALUE AS INTEGER) INTO @dirty_prev -FROM INFORMATION_SCHEMA.GLOBAL_STATUS -WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; -set debug_dbug="d,trigger_garbage_collection"; -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size; -FOUND 1 /[Mm]emory pressure.*/ in mysqld.1.err -SELECT CAST(VARIABLE_VALUE AS INTEGER) < @dirty_prev AS LESS_DIRTY_IS_GOOD -FROM INFORMATION_SCHEMA.GLOBAL_STATUS -WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; -LESS_DIRTY_IS_GOOD +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size=@innodb_buffer_pool_size; +FOUND 1 /Memory pressure event disregarded.*/ in mysqld.1.err +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size_auto_min= +CAST(@innodb_buffer_pool_size/2 AS UNSIGNED), +innodb_buffer_pool_size=@innodb_buffer_pool_size; +Warnings: +Warning 1292 Truncated incorrect innodb_buffer_pool_size_auto_min value: '8912896' +select @@global.innodb_buffer_pool_size < @innodb_buffer_pool_size; +@@global.innodb_buffer_pool_size < @innodb_buffer_pool_size 1 -FOUND 1 /InnoDB: Memory pressure event freed.*/ in mysqld.1.err +FOUND 1 /InnoDB: Memory pressure event shrunk.*/ in mysqld.1.err set debug_dbug=@save_dbug; +SET GLOBAL innodb_buffer_pool_size= @innodb_buffer_pool_size; +SET GLOBAL innodb_buffer_pool_size_auto_min=@innodb_buffer_pool_size_min; # End of 10.11 tests diff -Nru 
mariadb-10.11.11/mysql-test/suite/innodb/r/page_cleaner.result mariadb-10.11.13/mysql-test/suite/innodb/r/page_cleaner.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/page_cleaner.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/page_cleaner.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,8 +2,21 @@ SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; SET GLOBAL innodb_max_dirty_pages_pct_lwm=0.0; SET GLOBAL innodb_max_dirty_pages_pct=0.0; +CREATE TABLE t(a INT) ENGINE=InnoDB STATS_PERSISTENT=0; +connect prevent_purge,localhost,root; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +SET GLOBAL innodb_max_purge_lag_wait=0; SET GLOBAL innodb_max_dirty_pages_pct=90.0; -CREATE TABLE t ENGINE=InnoDB SELECT * FROM seq_1_to_10000; +SELECT variable_value INTO @log_writes FROM information_schema.global_status +WHERE variable_name='innodb_log_writes'; +BEGIN; +ROLLBACK; +SELECT if(variable_value-@log_writes<500,'ok',variable_value-@log_writes) +FROM information_schema.global_status WHERE variable_name='innodb_log_writes'; +if(variable_value-@log_writes<500,'ok',variable_value-@log_writes) +ok +disconnect prevent_purge; SELECT variable_value>0 FROM information_schema.global_status WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; variable_value>0 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/recovery_memory.result mariadb-10.11.13/mysql-test/suite/innodb/r/recovery_memory.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/recovery_memory.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/recovery_memory.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,7 +12,7 @@ connect con1,localhost,root,,,; CALL dorepeat(); connection default; -# restart: --innodb_buffer_pool_size=5242880 +# restart: --innodb_buffer_pool_size=6m DROP TABLE t1; DROP PROCEDURE dorepeat; # diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,16k.rdiff 
mariadb-10.11.13/mysql-test/suite/innodb/r/restart,16k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,16k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,16k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '5242879' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 5242880 for innodb_page_size=16384 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '5242879' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,32k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,32k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,32k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,32k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '10485759' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 10485760 for innodb_page_size=32768 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '10485759' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,4k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,4k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,4k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,4k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '2097151' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 2097152 for innodb_page_size=4096 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '2097151' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,64k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,64k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,64k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,64k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '20971519' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 20971520 for innodb_page_size=65536 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '20971519' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,8k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,8k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,8k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,8k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '3145727' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 3145728 for innodb_page_size=8192 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '3145727' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart.result mariadb-10.11.13/mysql-test/suite/innodb/r/restart.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart.result 2025-05-19 16:14:24.000000000 +0000 @@ -30,19 +30,6 @@ a DROP TABLE tr,tc,td; # -# MDEV-27467 innodb to enfore the minimum innodb_buffer_pool_size in SET (resize) the same as startup -# -SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; -SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); -ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -SHOW WARNINGS; -Level Code Message -Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE -Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size); -SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; -# # MDEV-27882 Innodb - recognise MySQL-8.0 innodb flags and give a specific error message # FOUND 1 /InnoDB: MySQL-8\.0 tablespace in \./ibdata1/ in attempted_start.err diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/stat_tables.result mariadb-10.11.13/mysql-test/suite/innodb/r/stat_tables.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/stat_tables.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/stat_tables.result 2025-05-19 16:14:24.000000000 +0000 @@ -101,3 +101,13 @@ CREATE TABLE t1 (c1 INT) ENGINE=InnoDB STATS_PERSISTENT 1; DROP TABLE t1; # End of 10.6 tests +# +# MDEV-36373 Warning: ... 
persistent statistics storage is corrupted +# +CREATE TABLE t1 (c INT) ENGINE=InnoDB; +SET STATEMENT tx_read_only=1 FOR ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +DROP TABLE t1; +# End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/stats_persistent.result mariadb-10.11.13/mysql-test/suite/innodb/r/stats_persistent.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/stats_persistent.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/stats_persistent.result 2025-05-19 16:14:24.000000000 +0000 @@ -17,3 +17,13 @@ test.t1 analyze status OK SET DEBUG_SYNC= 'RESET'; DROP TABLE t1; +# +# MDEV-36649 dict_acquire_mdl_shared() aborts when table +# mode is DICT_TABLE_OP_OPEN_ONLY_IF_CACHED +# +set @old_defragment_stats_accuracy= @@innodb_defragment_stats_accuracy; +SET GLOBAL innodb_defragment_stats_accuracy=1; +CREATE TABLE t (a INT ) ENGINE=INNODB; +INSERT INTO t SELECT * FROM seq_1_to_1000; +DROP TABLE t; +set global innodb_defragment_stats_accuracy= @old_defragment_stats_accuracy; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/alter_copy_bulk.test mariadb-10.11.13/mysql-test/suite/innodb/t/alter_copy_bulk.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/alter_copy_bulk.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/alter_copy_bulk.test 2025-05-19 16:14:24.000000000 +0000 @@ -109,3 +109,24 @@ ALTER TABLE t1 FORCE, ALGORITHM=COPY; DROP TABLE t1; SET GLOBAL innodb_stats_persistent=@default_stats_persistent; + +--echo # +--echo # MDEV-36504 Memory leak after insert into empty table +--echo # +CREATE TABLE t1 (k INT PRIMARY KEY)ENGINE=InnoDB; +INSERT INTO t1 SET k= 1; +START TRANSACTION; +INSERT INTO t1 SET k= 2; +SELECT COUNT(*) > 0 FROM mysql.innodb_index_stats LOCK IN SHARE MODE; + +connect(con1,localhost,root,,,); +SET innodb_lock_wait_timeout=0; +--error 
ER_LOCK_WAIT_TIMEOUT +CREATE TABLE t2(f1 INT DEFAULT 1 PRIMARY KEY) + STATS_PERSISTENT= 1 ENGINE=InnoDB as SELECT k FROM t1; +disconnect con1; +connection default; +SET innodb_lock_wait_timeout=default; +DROP TABLE t1; +DROP TABLE IF EXISTS t2; +--source include/restart_mysqld.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/alter_partitioned_debug.test mariadb-10.11.13/mysql-test/suite/innodb/t/alter_partitioned_debug.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/alter_partitioned_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/alter_partitioned_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ --source include/have_debug_sync.inc CREATE TABLE t1 (a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=0 PARTITION BY RANGE(a) (PARTITION pa VALUES LESS THAN (3), PARTITION pb VALUES LESS THAN (5)); @@ -26,9 +27,46 @@ connection default; DELETE FROM t1; -disconnect ddl; SET DEBUG_SYNC = 'RESET'; CHECK TABLE t1; -DROP TABLE t1; + +CREATE TABLE t(a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=1; +RENAME TABLE t TO u; +DELETE FROM mysql.innodb_table_stats WHERE table_name='u'; +DELETE FROM mysql.innodb_index_stats WHERE table_name='u'; + +send SET STATEMENT debug_dbug='+d,dict_stats_save_exit_notify_and_wait' FOR +SELECT * FROM u; + +connection ddl; +SET DEBUG_SYNC='open_tables_after_open_and_process_table +WAIT_FOR dict_stats_save_finished'; +send ALTER TABLE t1 EXCHANGE PARTITION pb WITH TABLE u; + +connect sync,localhost,root; +let $wait_condition= + select count(*) = 1 from information_schema.processlist + where state = 'debug sync point: now' + and info like 'SET STATEMENT debug_dbug%SELECT * FROM u'; +--source include/wait_condition.inc +let $wait_condition= + select count(*) = 1 from information_schema.processlist + where state = 'Waiting for table metadata lock' + and info like 'ALTER TABLE t1 EXCHANGE PARTITION pb WITH TABLE u'; +--source 
include/wait_condition.inc +SET DEBUG_SYNC='now SIGNAL dict_stats_save_unblock'; +disconnect sync; + +connection default; +reap; +connection ddl; +reap; +disconnect ddl; +connection default; +SELECT * FROM u; +SET DEBUG_SYNC = 'RESET'; + +DROP TABLE t1,u; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/autoinc_persist.test mariadb-10.11.13/mysql-test/suite/innodb/t/autoinc_persist.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/autoinc_persist.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/autoinc_persist.test 2025-05-19 16:14:24.000000000 +0000 @@ -95,15 +95,25 @@ SELECT * FROM t10; eval CREATE TABLE t11(a FLOAT $AUTO_INCREMENT_KEY_a) ENGINE = InnoDB; -INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t11; +eval CREATE TABLE t11u(a FLOAT UNSIGNED $AUTO_INCREMENT_KEY_a) ENGINE = InnoDB; +--error ER_WARN_DATA_OUT_OF_RANGE +INSERT INTO t11u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +INSERT INTO t11u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t11u; + eval CREATE TABLE t12(a DOUBLE $AUTO_INCREMENT_KEY_a) ENGINE = InnoDB; -INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t12; +CREATE TABLE t12u(a DOUBLE UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +--error ER_WARN_DATA_OUT_OF_RANGE +INSERT INTO t12u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +INSERT INTO t12u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t12u; + --echo # Scenario 1: Normal restart, to test if the counters are persisted --echo # Scenario 2: Delete some values, to test the counters should not be the --echo # one which is the largest in current table @@ -566,4 +576,5 @@ SELECT MAX(b) AS `Expect 4` FROM t33; SELECT * FROM t33; -DROP TABLE 
t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t30, t32, t33; +DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t11u, t12u, +t30, t32, t33; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-buffer-pool-size=8m --innodb-buffer-pool-chunk-size=1m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.test mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,27 +0,0 @@ ---source include/have_innodb.inc ---source include/have_debug.inc ---source include/not_embedded.inc - ---echo # ---echo # Bug #21348684 SIGABRT DURING RESIZING THE INNODB BUFFER POOL ---echo # ONLINE WITH MEMORY FULL CONDITION ---echo # - -call mtr.add_suppression("InnoDB: failed to allocate the chunk array"); - -SET GLOBAL debug_dbug='+d,buf_pool_resize_chunk_null'; - ---disable_warnings -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size + 1048576; ---enable_warnings - -let $wait_timeout = 60; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 27) = 'Resizing buffer pool failed' - FROM information_schema.global_status - WHERE variable_name = 'INNODB_BUFFER_POOL_RESIZE_STATUS'; - ---source include/wait_condition.inc -# Restart the server, because the buffer pool would not necessarily be -# shrunk afterwards even if we request it. 
---source include/restart_mysqld.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.combinations mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.combinations --- mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.combinations 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,9 @@ [strict_crc32] --innodb-checksum-algorithm=strict_crc32 --innodb-use-atomic-writes=0 +--innodb-undo-tablespaces=0 [strict_full_crc32] --innodb-checksum-algorithm=strict_full_crc32 --innodb-use-atomic-writes=0 +--innodb-undo-tablespaces=0 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.test mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.test 2025-05-19 16:14:24.000000000 +0000 @@ -42,10 +42,17 @@ SET GLOBAL innodb_fast_shutdown = 0; let $shutdown_timeout=; --source include/restart_mysqld.inc +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +let $wait_condition = +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; +--source include/wait_condition.inc +SET GLOBAL innodb_max_dirty_pages_pct=99; --source ../include/no_checkpoint_start.inc connect (dml,localhost,root,,); XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); +insert into t1 values(6, repeat('%', @@innodb_page_size/2)); XA END 'x'; XA PREPARE 'x'; disconnect dml; @@ -53,10 +60,12 @@ flush table t1 for export; -let $restart_parameters=; ---let CLEANUP_IF_CHECKPOINT=XA COMMIT 'x';drop table t1; +--let CLEANUP_IF_CHECKPOINT=drop table t1, unexpected_checkpoint; --source ../include/no_checkpoint_end.inc +--copy_file $MYSQLD_DATADIR/ibdata1 $MYSQLD_DATADIR/ibdata1.bak +--copy_file 
$MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile0.bak + perl; use IO::Handle; do "$ENV{MTR_SUITE_DIR}/include/crc32.pl"; @@ -145,6 +154,12 @@ --source include/shutdown_mysqld.inc let $shutdown_timeout=; # Corrupt the file in a better way. + +--remove_file $MYSQLD_DATADIR/ibdata1 +--remove_file $MYSQLD_DATADIR/ib_logfile0 +--move_file $MYSQLD_DATADIR/ibdata1.bak $MYSQLD_DATADIR/ibdata1 +--move_file $MYSQLD_DATADIR/ib_logfile0.bak $MYSQLD_DATADIR/ib_logfile0 + perl; use IO::Handle; my $fname= "$ENV{'MYSQLD_DATADIR'}test/t1.ibd"; @@ -157,22 +172,23 @@ close FILE; EOF --source include/start_mysqld.inc -XA ROLLBACK 'x'; check table t1; select f1, f2 from t1; +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +let $wait_condition = +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; +--source include/wait_condition.inc +SET GLOBAL innodb_max_dirty_pages_pct=99; --source ../include/no_checkpoint_start.inc -connect (dml,localhost,root,,); -XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); -XA END 'x'; -XA PREPARE 'x'; -disconnect dml; -connection default; - -flush table t1 for export; +XA ROLLBACK 'x'; +FLUSH TABLE t1 FOR EXPORT; -let $restart_parameters=; +# If we are skipping the test at this point due to an unexpected +# checkpoint, we will already have tested a part of this functionality. 
+--let CLEANUP_IF_CHECKPOINT=drop table t1; --source ../include/no_checkpoint_end.inc # Zero out the first page in file and try to recover from dblwr @@ -186,7 +202,6 @@ --source include/start_mysqld.inc let SEARCH_PATTERN=InnoDB: Recovered page \\[page id: space=[1-9][0-9]*, page number=[03]\\]; --source include/search_pattern_in_file.inc -XA ROLLBACK 'x'; check table t1; select f1, f2 from t1; drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/foreign_key.test mariadb-10.11.13/mysql-test/suite/innodb/t/foreign_key.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/foreign_key.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/foreign_key.test 2025-05-19 16:14:24.000000000 +0000 @@ -133,7 +133,6 @@ --let $shutdown_timeout= disconnect incomplete; -SET @save_stats_persistent = @@GLOBAL.innodb_stats_persistent; SET GLOBAL innodb_stats_persistent = 0; INSERT INTO child SET a=0; @@ -1245,6 +1244,33 @@ DELETE FROM t1; DROP TABLE t2, t1; +--echo # +--echo # MDEV-33167 ASAN errors after failing to load foreign key +--echo # relation for the table +--echo # +call mtr.add_suppression("InnoDB: Load table `test`.`t3` failed, the table has missing foreign key indexes. Turn off 'foreign_key_checks' and try again."); +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t1(f1 VARCHAR(8), + FOREIGN KEY(f1) REFERENCES test.t3(f1))ENGINE=InnoDB; + +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t2(f1 VARCHAR(8), + FOREIGN KEY(f1) REFERENCES test.t3(f1)) + ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; + +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t3(f1 VARCHAR(8) PRIMARY KEY) + ENGINE=InnoDB DEFAULT CHARSET=latin1; + +set GLOBAL innodb_fast_shutdown=0; +--let $shutdown_timeout= +--source include/restart_mysqld.inc +# Error encountered while loading the foreign key +# constraint for t3. 
t1 wasn't loaded into memory yet +# t2 failed to find index for foreign key relation +ALTER TABLE t2 FORCE; +DROP TABLE t2, t1, t3; + --echo # End of 10.6 tests CREATE TABLE t1 @@ -1270,7 +1296,5 @@ ALTER TABLE t2 ADD FOREIGN KEY (f2) REFERENCES t2 (f2), ADD UNIQUE INDEX(f3); drop table t1, t2; -SET GLOBAL innodb_stats_persistent = @save_stats_persistent; --echo # End of 10.11 tests ---source include/wait_until_count_sessions.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ --loose-innodb-sort-buffer-size=64k --loose-innodb-online-alter-log-max-size=128k ---loose-innodb-buffer-pool-size=5M +--loose-innodb-buffer-pool-size=6M --loose-innodb-sys-indexes --loose-innodb-sys-fields diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.test 2025-05-19 16:14:24.000000000 +0000 @@ -510,12 +510,35 @@ connection con1; reap; -disconnect con1; connection default; SELECT * FROM t1; CHECK TABLE t1; DROP TABLE t1; + +--echo # +--echo # MDEV-36281 DML aborts during online virtual index +--echo # +CREATE TABLE t1(f1 INT NOT NULL PRIMARY KEY, f2 INT NOT NULL, + f3 INT NOT NULL, f4 INT AS (f3) VIRTUAL, + f5 INT AS (f1) VIRTUAL, INDEX(f4))ENGINE=InnoDB; +INSERT INTO t1(f1, f2, f3) VALUES(1, 2, 3); +SET DEBUG_SYNC = 'innodb_inplace_alter_table_enter SIGNAL dml_start WAIT_FOR dml_finish'; +send ALTER TABLE t1 ADD INDEX v1(f5, f2, f4), ADD INDEX v2(f3, f5); + +connection con1; +set DEBUG_SYNC="now 
WAIT_FOR dml_start"; +UPDATE t1 SET f3= f3 + 1; +set DEBUG_SYNC="now SIGNAL dml_finish"; + +disconnect con1; +connection default; +reap; +CHECK TABLE t1 EXTENDED; +SELECT f5, f2, f4 FROM t1 USE INDEX(v1); +SELECT f3, f5 FROM t1 USE INDEX(v2); +DROP TABLE t1; + SET DEBUG_SYNC = 'RESET'; # Check that all connections opened by test cases in this file are really diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-table-online-master.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-table-online-master.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-table-online-master.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-table-online-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---innodb-sort-buffer-size=64k --innodb-online-alter-log-max-size=512k --innodb-buffer-pool-size=5M +--innodb-sort-buffer-size=64k --innodb-online-alter-log-max-size=512k --innodb-buffer-pool-size=6M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--innodb-buffer-pool-size-max=16m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ --source include/have_innodb.inc --source include/have_debug.inc -call mtr.add_suppression("InnoDB: Cannot allocate memory for the buffer pool"); +call mtr.add_suppression("InnoDB: Cannot map innodb_buffer_pool_size_max="); 
call mtr.add_suppression("InnoDB: Plugin initialization aborted at srv0start.cc.*"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error."); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed."); @@ -10,5 +10,5 @@ let restart_parameters=--debug_dbug=+d,ib_buf_chunk_init_fails; --source include/restart_mysqld.inc let SEARCH_FILE = $MYSQLTEST_VARDIR/log/mysqld.1.err; -let SEARCH_PATTERN=\[ERROR\] InnoDB: Cannot allocate memory for the buffer pool; +let SEARCH_PATTERN=\[ERROR\] InnoDB: Cannot map innodb_buffer_pool_size_max=16m; --source include/search_pattern_in_file.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1,3 @@ --innodb-buffer-pool-size=8M +--innodb-buffer-pool-size-max=25M --innodb-page-size=4k diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,17 +1,13 @@ -# -# WL6117 : Resize the InnoDB Buffer Pool Online -# - --source include/have_innodb.inc ---source include/big_test.inc +--source include/have_sequence.inc -let $wait_timeout = 180; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; +--echo # +--echo # MDEV-29445: Reorganize buffer pool (and remove chunks) +--echo # 
--disable_query_log +call mtr.add_suppression("InnoDB: Over 67 percent of the buffer pool is occupied by lock heaps"); +call mtr.add_suppression("innodb_buffer_pool_size change aborted"); set @old_innodb_buffer_pool_size = @@innodb_buffer_pool_size; set @old_innodb_adaptive_hash_index = @@innodb_adaptive_hash_index; --enable_query_log @@ -21,52 +17,63 @@ select @@innodb_buffer_pool_size; # Expand buffer pool +set global innodb_buffer_pool_size = 9437184; set global innodb_buffer_pool_size = 10485760; ---source include/wait_condition.inc - select @@innodb_buffer_pool_size; +let $kbs=`SELECT CAST(@@innodb_page_size / 1024 AS INT)`; # fill buffer pool --disable_query_log SET @save_innodb_read_only_compressed=@@GLOBAL.innodb_read_only_compressed; SET GLOBAL innodb_read_only_compressed=OFF; --enable_query_log -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view view0 as select 1 union all select 1; +create table t1 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED; +evalp create table t2 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=$kbs; + +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t1 SELECT seq*4,seq*4 FROM seq_1_to_262144; +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t2 SELECT seq*4,seq*4 FROM seq_1_to_16384; -set @`v_id` := 0; -set @`v_val` := 0; - -# 2^18 == 262144 records -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; --disable_query_log SET GLOBAL innodb_read_only_compressed=@save_innodb_read_only_compressed; --enable_query_log -# Shrink buffer pool -set global innodb_buffer_pool_size = 
64 * 1024 * 1024 + 512 * 1024; ---source include/wait_condition.inc - -select @@innodb_buffer_pool_size; +# Attempt to shrink the buffer pool. This may occasionally fail. +--error 0,ER_WRONG_USAGE +set global innodb_buffer_pool_size = 7340032; select count(val) from t1; +select count(val) from t2; set global innodb_adaptive_hash_index=OFF; -# Expand buffer pool to 24MB -set global innodb_buffer_pool_size = 25165824; ---source include/wait_condition.inc +# Expand buffer pool to 23 and then 24 MiB (requesting 25 MiB) +set global innodb_buffer_pool_size = 24117248; +set global innodb_buffer_pool_size = 26214400; select @@innodb_buffer_pool_size; select count(val) from t1; +select count(val) from t2; -drop table t1; -drop view view0; +drop table t1,t2; ---disable_query_log -set global innodb_adaptive_hash_index = @old_innodb_adaptive_hash_index; -set global innodb_buffer_pool_size = @old_innodb_buffer_pool_size; ---enable_query_log +SET GLOBAL innodb_max_purge_lag_wait = 0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; + +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; +let $wait_condition = +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; --source include/wait_condition.inc +SET GLOBAL innodb_buffer_pool_size = @old_innodb_buffer_pool_size; +SET GLOBAL innodb_adaptive_hash_index = @old_innodb_adaptive_hash_index; +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ ---innodb-buffer-pool-chunk-size=1M ---loose-skip-innodb-disable-resize_buffer_pool_debug diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ ---source include/have_innodb.inc ---source include/big_test.inc - -SET @save_size=@@innodb_buffer_pool_size; - -let $wait_timeout = 60; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE variable_name = 'INNODB_BUFFER_POOL_RESIZE_STATUS'; - ---echo # ---echo # MDEV-27891: Delayed SIGSEGV in InnoDB buffer pool resize ---echo # after or during DROP TABLE ---echo # - -select @@innodb_buffer_pool_chunk_size; -CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; -SET GLOBAL innodb_buffer_pool_size=256*1024*1024; -DROP TABLE t1; ---source include/wait_condition.inc -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size + @@innodb_buffer_pool_chunk_size; ---source include/wait_condition.inc - ---echo # End of 10.6 tests - -SET GLOBAL innodb_buffer_pool_size=@save_size; ---source include/wait_condition.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ 
---innodb-buffer-pool-size=8M --innodb-buffer-pool-chunk-size=2M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--innodb-buffer-pool-size-max=16m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,24 +1,43 @@ --source include/have_innodb.inc --source include/have_sequence.inc --source include/have_debug.inc +--source include/have_debug_sync.inc SET @save_limit=@@GLOBAL.innodb_limit_optimistic_insert_debug; SET @save_size=@@GLOBAL.innodb_buffer_pool_size; SET GLOBAL innodb_limit_optimistic_insert_debug=2; - SET GLOBAL innodb_buffer_pool_size=16777216; CREATE TEMPORARY TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t1 SELECT seq FROM seq_1_to_200; -SET GLOBAL innodb_buffer_pool_size=8388608; +# Flush the buffer pool to prevent +# "innodb_buffer_pool_size change aborted" error with ./mtr --repeat=3 +SET GLOBAL innodb_max_purge_lag_wait=0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; + +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +connect con1,localhost,root; +SET DEBUG_SYNC='buf_pool_shrink_before_wakeup SIGNAL blocked WAIT_FOR go'; 
+send SET GLOBAL innodb_buffer_pool_size=8388608; +connection default; +SET DEBUG_SYNC='now WAIT_FOR blocked'; +# adjust for 32-bit and SUX_LOCK_GENERIC +--replace_regex /(5..)\/\1/505\/505/ +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +SET DEBUG_SYNC='now SIGNAL go'; +connection con1; +reap; +disconnect con1; +connection default; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +SET DEBUG_SYNC=RESET; -let $wait_timeout = 60; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE variable_name = 'INNODB_BUFFER_POOL_RESIZE_STATUS'; ---source include/wait_condition.inc +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; SELECT COUNT(*),MIN(a),MAX(a) FROM t1; DROP TEMPORARY TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ ---innodb-buffer-pool-size=16M ---innodb-buffer-pool-chunk-size=4M ---innodb-page-size=4k diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,61 +0,0 @@ -# -# WL6117 : Resize the InnoDB Buffer Pool Online -# (innodb_buffer_pool_chunk_size used case) -# - ---source include/have_innodb.inc ---source 
include/big_test.inc - -let $wait_timeout = 180; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; - ---disable_query_log -set @old_innodb_buffer_pool_size = @@innodb_buffer_pool_size; ---enable_query_log - -select @@innodb_buffer_pool_chunk_size; - -# fill buffer pool ---disable_query_log -SET @save_innodb_read_only_compressed=@@GLOBAL.innodb_read_only_compressed; -SET GLOBAL innodb_read_only_compressed=OFF; ---enable_query_log -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view view0 as select 1 union all select 1; - -set @`v_id` := 0; -set @`v_val` := 0; - -# 2^18 == 262144 records -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; ---disable_query_log -SET GLOBAL innodb_read_only_compressed=@save_innodb_read_only_compressed; ---enable_query_log - -# Shrink buffer pool to 7MB -set global innodb_buffer_pool_size = 7340032; ---source include/wait_condition.inc - -select count(val) from t1; - -# Expand buffer pool to 16MB -set global innodb_buffer_pool_size = 16777216; ---source include/wait_condition.inc - -select count(val) from t1; - -drop table t1; -drop view view0; - -# Try to shrink buffer pool to smaller than chunk size -set global innodb_buffer_pool_size = 2*1048576; ---source include/wait_condition.inc -select @@innodb_buffer_pool_size; - ---disable_query_log -set global innodb_buffer_pool_size = @old_innodb_buffer_pool_size; ---enable_query_log ---source include/wait_condition.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_bug52663.test 
mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_bug52663.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_bug52663.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_bug52663.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ --source include/have_innodb.inc +SET @save_innodb_timeout=@@innodb_lock_wait_timeout; +SET GLOBAL innodb_lock_wait_timeout=1; set session transaction isolation level read committed; create table innodb_bug52663 (what varchar(5), id integer, count integer, primary key @@ -8,7 +10,6 @@ begin; connect (addconroot, localhost, root,,); -connection addconroot; set session transaction isolation level read committed; begin; @@ -32,3 +33,4 @@ connection default; select * from innodb_bug52663; drop table innodb_bug52663; +SET GLOBAL innodb_lock_wait_timeout=@save_innodb_timeout; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,11 +5,26 @@ INSERT INTO t VALUES (1); -SET GLOBAL innodb_monitor_reset = "module_innodb"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time_max"; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time'; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time_max'; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time_max"; BEGIN; SELECT * FROM t FOR UPDATE; +# We can't predict (innodb/lock)_row_lock_time_avg value, because it's counted +# as the whole waiting time divided by the amount of waits. The +# corresponding counters in lock_sys can't be reset with any query. 
+ +--disable_result_log +SELECT @innodb_row_lock_time_before := variable_value + FROM information_schema.global_status + WHERE LOWER(variable_name) = 'innodb_row_lock_time'; +--enable_result_log + --connect(con1,localhost,root,,) SET innodb_lock_wait_timeout = 1; --error ER_LOCK_WAIT_TIMEOUT @@ -19,24 +34,28 @@ --connection default COMMIT; -SELECT variable_value > 100 FROM information_schema.global_status +SELECT variable_value - @innodb_row_lock_time_before > 100 + FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time'; -SELECT variable_value > 100 FROM information_schema.global_status +# We can't use 'variable_value - @innodb_row_lock_time_max_before' trick for +# innodb_row_lock_time_max, because we can't reset it, and we don't know the +# initial value at the moment of the test execution. +SELECT variable_value > 100 + FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time_max'; -SELECT variable_value > 100 FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_row_lock_time_avg'; - -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS - WHERE NAME="lock_row_lock_time"; -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS - WHERE NAME="lock_row_lock_time_max"; -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS - WHERE NAME="lock_row_lock_time_avg"; +SELECT count_reset > 100 + FROM INFORMATION_SCHEMA.INNODB_METRICS + WHERE NAME='lock_row_lock_time'; +SELECT count_reset > 100 + FROM INFORMATION_SCHEMA.INNODB_METRICS + WHERE NAME='lock_row_lock_time_max'; DROP TABLE t; --disable_warnings -SET GLOBAL innodb_monitor_reset=default; +SET GLOBAL innodb_monitor_enable=default; +SET GLOBAL innodb_monitor_disable=default; +SET GLOBAL innodb_monitor_reset_all=default; --enable_warnings --source include/wait_until_count_sessions.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test 
mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,9 +17,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open and close the table SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; @@ -27,7 +25,8 @@ -- eval $check_stats1 -- eval $check_stats2 -# open the table, causing stats recalc/save +# rename and open the table, causing stats recalc/save +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; -- eval $check_stats1 @@ -43,9 +42,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open and close the table SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; @@ -53,7 +50,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open the table, causing stats recalc/save +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; -- eval $check_stats1 @@ -69,9 +66,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open and close the table SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; @@ -79,7 +74,8 @@ -- eval $check_stats1 -- eval $check_stats2 -# open the table, stats should not be present, since autorecalc is disabled +# rename the table, stats should not be present, since autorecalc is disabled +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; -- eval $check_stats1 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_fetch.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_fetch.test --- 
mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_fetch.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_fetch.test 2025-05-19 16:14:24.000000000 +0000 @@ -69,7 +69,7 @@ index_name = 'idx' AND stat_name = 'n_diff_pfx02'; -FLUSH TABLE test_ps_fetch; +RENAME TABLE test_ps_fetch TO tmp, tmp TO test_ps_fetch; SELECT seq_in_index, column_name, cardinality FROM information_schema.statistics WHERE table_name = 'test_ps_fetch' diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/instant_alter_debug.test mariadb-10.11.13/mysql-test/suite/innodb/t/instant_alter_debug.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/instant_alter_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/instant_alter_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -657,11 +657,19 @@ SET DEBUG_SYNC=RESET; --echo # End of 10.5 tests - SELECT variable_value-@old_instant instants FROM information_schema.global_status WHERE variable_name = 'innodb_instant_alter_column'; -SET GLOBAL innodb_stats_persistent = @save_stats_persistent; +CREATE TABLE t1(f1 INT, f2 TEXT)ENGINE=InnoDB; +INSERT INTO t1 VALUES(1, 'a'); +ALTER TABLE t1 ADD COLUMN f3 TEXT FIRST; +--error ER_INTERNAL_ERROR +SET STATEMENT DEBUG_DBUG="+d,instant_insert_fail" FOR +ALTER TABLE t1 DROP COLUMN f1; +ALTER TABLE t1 DROP COLUMN f1; +CHECK TABLE t1; +DROP TABLE t1; +SET GLOBAL innodb_stats_persistent = @save_stats_persistent; --echo # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/lock_isolation.test mariadb-10.11.13/mysql-test/suite/innodb/t/lock_isolation.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/lock_isolation.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/lock_isolation.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,16 @@ --source include/have_innodb.inc +--source include/count_sessions.inc +--source include/have_debug.inc +--source 
include/have_debug_sync.inc --disable_query_log call mtr.add_suppression("InnoDB: Transaction was aborted due to "); --enable_query_log +--connect disable_purging,localhost,root +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default --echo # --echo # MDEV-26642 Weird SELECT view when a record is --echo # modified to the same value by two transactions @@ -41,22 +48,18 @@ --echo # READ UNCOMMITTED and READ COMMITTED isolation level --echo # -CREATE TABLE t(a INT, b INT) ENGINE=InnoDB; +CREATE TABLE t(a INT, b INT) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t VALUES(NULL, 1), (2, 2); SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; BEGIN; UPDATE t SET a = 10; --connection consistent SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; --send UPDATE t SET b = 20 WHERE a --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Updating' - and info = 'UPDATE t SET b = 20 WHERE a'; ---source include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; --connection consistent @@ -70,14 +73,11 @@ --connection consistent SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; --send UPDATE t SET b = 20 WHERE a --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where info = 'UPDATE t SET b = 20 WHERE a'; ---source include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; --connection consistent @@ -91,15 +91,11 @@ --connection con_weird SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; send UPDATE t SET b = 20 WHERE a; --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Updating' - and info = 'UPDATE t SET b = 20 WHERE a'; ---source 
include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; SELECT * FROM t; COMMIT; @@ -123,14 +119,11 @@ BEGIN; # As semi-consistent read is disabled for innodb_snapshot_isolation=ON, the # following UPDATE must be blocked on the first record. +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; --send UPDATE t SET b = 2 WHERE a --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Updating' and info = 'UPDATE t SET b = 2 WHERE a'; ---source include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; UPDATE t SET a = 1; COMMIT; --connection consistent @@ -149,13 +142,15 @@ --echo # MDEV-33802 Weird read view after ROLLBACK of other transactions --echo # -CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB; -INSERT INTO t SET a=1; - -BEGIN; INSERT INTO t SET a=2; +CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB STATS_PERSISTENT=0; --connection consistent START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default +INSERT INTO t SET a=1; + +--connection consistent SAVEPOINT sp1; --disable_ps2_protocol --error ER_CHECKREAD @@ -163,29 +158,100 @@ --enable_ps2_protocol SAVEPOINT sp1; +--connection default +BEGIN; INSERT INTO t SET a=2; + --connection con_weird START TRANSACTION WITH CONSISTENT SNAPSHOT; -send -SELECT * FROM t FORCE INDEX (b) FOR UPDATE; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +--send SELECT * FROM t FORCE INDEX (b) FOR UPDATE --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Sending data' - and info LIKE 'SELECT * FROM t %'; ---source include/wait_condition.inc +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; ROLLBACK; --connection con_weird --reap SELECT * FROM t FORCE INDEX (b) FOR UPDATE; +COMMIT; --disconnect con_weird --connection consistent SELECT * FROM t FORCE INDEX (b) FOR UPDATE; +COMMIT; + +--connection default 
+TRUNCATE TABLE t; + +--echo # +--echo # MDEV-36639 innodb_snapshot_isolation=1 gives error for not comitted row changes +--echo # +INSERT INTO t VALUES (1,1),(2,2); + +--connection default +--echo # Case 1: Transaction A modifies a record, transaction B with snapshot +--echo # isolation level is blocked by A, then A is committed. +--echo # Expected behaviour: B gets ER_CHECKREAD. +BEGIN; +UPDATE t SET b=3 WHERE a = 1; + +--connection consistent +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +--send SELECT * FROM t WHERE a=1 FOR UPDATE + +--connection default +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +COMMIT; + +--connection consistent +--error ER_CHECKREAD +--reap + +--echo # Case 2: Transaction A modifies a record, transaction B with snapshot +--echo # isolation level is blocked by A, then A is rolled back. +--echo # Expected behaviour: B continues execution. + +--connection default +BEGIN; +UPDATE t SET b=4 WHERE a=1; + +--connection consistent +BEGIN; +SELECT * FROM t; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +--send SELECT * FROM t WHERE a=1 FOR UPDATE + +--connection default +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +ROLLBACK; + +--connection consistent +--reap +ROLLBACK; + +--echo # Case 3: Transaction B with snapshot isolation level started with +--echo # consistent snapshot. Transaction A modifies a record and is committed. +--echo # Both B tries to read modified by A record. +--echo # Expected behavior: B gets ER_CHECKREAD. 
+ +--connection consistent +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default +UPDATE t SET b=4 WHERE a=1; + +--connection consistent +--error ER_CHECKREAD +SELECT * FROM t WHERE a=1 FOR UPDATE; --disconnect consistent +--disconnect disable_purging --connection default +SET DEBUG_SYNC="RESET"; DROP TABLE t; +--source include/wait_until_count_sessions.inc --echo # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.opt mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---innodb_buffer_pool_size=5M +--innodb_buffer_pool_size=6M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.test mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -15,7 +15,7 @@ --error ER_LOCK_TABLE_FULL SET STATEMENT debug_dbug='+d,innodb_skip_lock_bitmap' FOR -INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g LIMIT 45000; +INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g; SELECT COUNT(*) FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/log_upgrade_101_flags.test mariadb-10.11.13/mysql-test/suite/innodb/t/log_upgrade_101_flags.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/log_upgrade_101_flags.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/log_upgrade_101_flags.test 2025-05-19 16:14:24.000000000 +0000 @@ -73,7 +73,7 @@ close OUT or die; EOF ---let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m --innodb_page_size=32k 
--innodb_buffer_pool_size=10M +--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m --innodb_page_size=32k --innodb_buffer_pool_size=11M --source include/start_mysqld.inc SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/mdev-15707.opt mariadb-10.11.13/mysql-test/suite/innodb/t/mdev-15707.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/mdev-15707.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/mdev-15707.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---innodb --innodb-buffer-pool-size=5MB --innodb-read-io-threads=1 --innodb-doublewrite=0 --innodb-flush-log-at-trx-commit=0 \ No newline at end of file +--innodb --innodb-buffer-pool-size=6MB --innodb-read-io-threads=1 --innodb-doublewrite=0 --innodb-flush-log-at-trx-commit=0 \ No newline at end of file diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.opt mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,3 @@ +--loose-innodb-buffer-pool-size-auto-min=17m +--innodb-buffer-pool-size-max=17m +--innodb-buffer-pool-size=17m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.test mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,8 @@ --source include/have_debug.inc ---source include/have_cgroupv2.inc --source include/not_embedded.inc --source include/have_innodb.inc --source include/have_sequence.inc +--source include/word_size.inc --echo # --echo # MDEV-24670 avoid OOM by linux kernel co-operative memory 
management @@ -15,6 +15,13 @@ # This is not an actual parameter, so there is no need to restore it. set GLOBAL innodb_max_purge_lag_wait=0; +SET @innodb_buffer_pool_size= @@GLOBAL.innodb_buffer_pool_size; +SET @innodb_buffer_pool_size_min= @@GLOBAL.innodb_buffer_pool_size_auto_min; +SELECT +@@GLOBAL.innodb_buffer_pool_size, +@@GLOBAL.innodb_buffer_pool_size_auto_min, +@@GLOBAL.innodb_buffer_pool_size_max; + CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; SET GLOBAL innodb_limit_optimistic_insert_debug=2; SET STATEMENT unique_checks=0, foreign_key_checks=0 FOR @@ -24,32 +31,31 @@ DROP TABLE t1; ---disable_cursor_protocol -SELECT CAST(VARIABLE_VALUE AS INTEGER) INTO @dirty_prev -FROM INFORMATION_SCHEMA.GLOBAL_STATUS -WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; ---enable_cursor_protocol - -set debug_dbug="d,trigger_garbage_collection"; -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size; +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size=@innodb_buffer_pool_size; let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; -# either a fail or the pressure event -let SEARCH_PATTERN= [Mm]emory pressure.*; +let SEARCH_PATTERN= Memory pressure event disregarded.*; +let SEARCH_WAIT= FOUND; --source include/search_pattern_in_file.inc +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size_auto_min= +CAST(@innodb_buffer_pool_size/2 AS UNSIGNED), +innodb_buffer_pool_size=@innodb_buffer_pool_size; + # The garbage collection happens asynchronously after trigger, in a background # thread. So wait for it to happen to avoid sporadic failure. 
let $wait_condition= - SELECT CAST(VARIABLE_VALUE AS INTEGER) < @dirty_prev AS LESS_DIRTY_IS_GOOD - FROM INFORMATION_SCHEMA.GLOBAL_STATUS - WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; + select @@global.innodb_buffer_pool_size < @innodb_buffer_pool_size; --source include/wait_condition.inc eval $wait_condition; -let SEARCH_PATTERN= InnoDB: Memory pressure event freed.*; +let SEARCH_PATTERN= InnoDB: Memory pressure event shrunk.*; let SEARCH_WAIT= FOUND; --source include/search_pattern_in_file.inc set debug_dbug=@save_dbug; +SET GLOBAL innodb_buffer_pool_size= @innodb_buffer_pool_size; +SET GLOBAL innodb_buffer_pool_size_auto_min=@innodb_buffer_pool_size_min; --echo # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/page_cleaner.test mariadb-10.11.13/mysql-test/suite/innodb/t/page_cleaner.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/page_cleaner.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/page_cleaner.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,12 @@ SET GLOBAL innodb_max_dirty_pages_pct_lwm=0.0; SET GLOBAL innodb_max_dirty_pages_pct=0.0; +CREATE TABLE t(a INT) ENGINE=InnoDB STATS_PERSISTENT=0; +--connect (prevent_purge,localhost,root) +START TRANSACTION WITH CONSISTENT SNAPSHOT; +--connection default +SET GLOBAL innodb_max_purge_lag_wait=0; + let $wait_condition = SELECT variable_value = 0 FROM information_schema.global_status @@ -15,7 +21,24 @@ SET GLOBAL innodb_max_dirty_pages_pct=90.0; -CREATE TABLE t ENGINE=InnoDB SELECT * FROM seq_1_to_10000; +--disable_cursor_protocol +SELECT variable_value INTO @log_writes FROM information_schema.global_status +WHERE variable_name='innodb_log_writes'; +--enable_cursor_protocol + +BEGIN; +--disable_query_log +let $N=500; +while ($N) { + INSERT INTO t SELECT * FROM seq_1_to_10; + dec $N; +} +--enable_query_log +ROLLBACK; + +SELECT if(variable_value-@log_writes<500,'ok',variable_value-@log_writes) +FROM information_schema.global_status 
WHERE variable_name='innodb_log_writes'; +--disconnect prevent_purge SELECT variable_value>0 FROM information_schema.global_status WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/purge_secondary.opt mariadb-10.11.13/mysql-test/suite/innodb/t/purge_secondary.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/purge_secondary.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/purge_secondary.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ --innodb-sys-tablestats ---innodb_buffer_pool_size=5M +--innodb_buffer_pool_size=6M --innodb_monitor_enable=module_buffer --skip-innodb-stats-persistent diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/recovery_memory.test mariadb-10.11.13/mysql-test/suite/innodb/t/recovery_memory.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/recovery_memory.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/recovery_memory.test 2025-05-19 16:14:24.000000000 +0000 @@ -22,7 +22,7 @@ connection default; sleep 10; let $shutdown_timeout=0; -let $restart_parameters=--innodb_buffer_pool_size=5242880; +let $restart_parameters=--innodb_buffer_pool_size=6m; --source include/restart_mysqld.inc DROP TABLE t1; DROP PROCEDURE dorepeat; @@ -33,11 +33,11 @@ --echo # if ($have_debug) { SET DEBUG_DBUG="+d,ib_log_checkpoint_avoid_hard"; -let $restart_parameters=--innodb_buffer_pool_size=5242880 --debug_dbug=+d,ibuf_init_corrupt; +let $restart_parameters=--innodb_buffer_pool_size=6m --debug_dbug=+d,ibuf_init_corrupt; } if (!$have_debug) { --echo SET DEBUG_DBUG="+d,ib_log_checkpoint_avoid_hard"; -let $restart_parameters=--innodb_buffer_pool_size=5242880; +let $restart_parameters=--innodb_buffer_pool_size=6m; } CREATE TABLE t1(f1 INT NOT NULL)ENGINE=InnoDB; INSERT INTO t1 SELECT * FROM seq_1_to_65536; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/restart.opt mariadb-10.11.13/mysql-test/suite/innodb/t/restart.opt 
--- mariadb-10.11.11/mysql-test/suite/innodb/t/restart.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/restart.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ ---loose-innodb_disable_resize_buffer_pool_debug=0 ---innodb-buffer-pool-chunk-size=1M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/restart.test mariadb-10.11.13/mysql-test/suite/innodb/t/restart.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/restart.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/restart.test 2025-05-19 16:14:24.000000000 +0000 @@ -93,31 +93,6 @@ DROP TABLE tr,tc,td; --echo # ---echo # MDEV-27467 innodb to enfore the minimum innodb_buffer_pool_size in SET (resize) the same as startup ---echo # - -let $wait_timeout = 180; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; - ---disable_cursor_protocol -SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; -SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; ---enable_cursor_protocol ---error ER_WRONG_VALUE_FOR_VAR -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); - -SHOW WARNINGS; - -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - ---source include/wait_condition.inc - -SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; - ---echo # --echo # MDEV-27882 Innodb - recognise MySQL-8.0 innodb flags and give a specific error message --echo # diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/stat_tables.test mariadb-10.11.13/mysql-test/suite/innodb/t/stat_tables.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/stat_tables.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/stat_tables.test 2025-05-19 16:14:24.000000000 +0000 @@ -110,3 +110,12 @@ DROP TABLE t1; --echo # End of 10.6 tests + +--echo # +--echo # MDEV-36373 Warning: ... persistent statistics storage is corrupted +--echo # +CREATE TABLE t1 (c INT) ENGINE=InnoDB; +SET STATEMENT tx_read_only=1 FOR ANALYZE TABLE t1; +DROP TABLE t1; + +--echo # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/stats_persistent.test mariadb-10.11.13/mysql-test/suite/innodb/t/stats_persistent.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/stats_persistent.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/stats_persistent.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/have_innodb.inc +--source include/have_sequence.inc --source include/have_debug.inc --source include/have_debug_sync.inc --source include/count_sessions.inc @@ -26,3 +27,14 @@ DROP TABLE t1; --source include/wait_until_count_sessions.inc + +--echo # +--echo # MDEV-36649 dict_acquire_mdl_shared() aborts when table +--echo # mode is DICT_TABLE_OP_OPEN_ONLY_IF_CACHED +--echo # +set @old_defragment_stats_accuracy= @@innodb_defragment_stats_accuracy; +SET GLOBAL innodb_defragment_stats_accuracy=1; +CREATE TABLE t (a INT ) ENGINE=INNODB; +INSERT INTO t SELECT * FROM seq_1_to_1000; +DROP TABLE t; +set global innodb_defragment_stats_accuracy= @old_defragment_stats_accuracy; diff -Nru 
mariadb-10.11.11/mysql-test/suite/innodb/t/update_time-master.opt mariadb-10.11.13/mysql-test/suite/innodb/t/update_time-master.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/update_time-master.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/update_time-master.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-buffer-pool-size=5M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/r/index_table.result mariadb-10.11.13/mysql-test/suite/innodb_fts/r/index_table.result --- mariadb-10.11.11/mysql-test/suite/innodb_fts/r/index_table.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/r/index_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -5,6 +5,9 @@ title VARCHAR(200), content TEXT ) ENGINE= InnoDB; +SET STATEMENT debug_dbug='+d,innodb_report_deadlock' FOR +CREATE FULLTEXT INDEX idx ON articles (title, content); +ERROR HY000: Got error 11 "Resource temporarily unavailable" from storage engine InnoDB CREATE FULLTEXT INDEX idx ON articles (title, content); INSERT INTO articles (title, content) VALUES ('MySQL Tutorial','DBMS stands for MySQL DataBase ...'), diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result mariadb-10.11.13/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result --- mariadb-10.11.11/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -118,4 +118,13 @@ SELECT @@GLOBAL.innodb_ft_aux_table; @@GLOBAL.innodb_ft_aux_table test/t1 +CREATE TABLE t(a INT) ENGINE=InnoDB; +SET GLOBAL innodb_ft_aux_table='test/t'; +ERROR 42000: Variable 'innodb_ft_aux_table' can't be set to the value of 'test/t' +DROP TABLE t; +SET GLOBAL innodb_ft_aux_table='test/t'; +ERROR 42000: Variable 'innodb_ft_aux_table' can't be set to the value of 'test/t' +SELECT @@GLOBAL.innodb_ft_aux_table; 
+@@GLOBAL.innodb_ft_aux_table +test/t1 SET GLOBAL innodb_ft_aux_table = @save_ft_aux_table; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/t/index_table.test mariadb-10.11.13/mysql-test/suite/innodb_fts/t/index_table.test --- mariadb-10.11.11/mysql-test/suite/innodb_fts/t/index_table.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/t/index_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,9 @@ -- source include/have_innodb.inc -- source include/have_debug.inc +--disable_query_log +call mtr.add_suppression("InnoDB: \\(Deadlock\\) writing `use_stopword'"); +--enable_query_log SET @optimize=@@GLOBAL.INNODB_OPTIMIZE_FULLTEXT_ONLY; SET GLOBAL INNODB_OPTIMIZE_FULLTEXT_ONLY=1; @@ -14,6 +17,9 @@ content TEXT ) ENGINE= InnoDB; +--error ER_GET_ERRNO +SET STATEMENT debug_dbug='+d,innodb_report_deadlock' FOR +CREATE FULLTEXT INDEX idx ON articles (title, content); CREATE FULLTEXT INDEX idx ON articles (title, content); INSERT INTO articles (title, content) VALUES diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test mariadb-10.11.13/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test --- mariadb-10.11.11/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,4 +41,13 @@ SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE; SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_CONFIG; SELECT @@GLOBAL.innodb_ft_aux_table; + +CREATE TABLE t(a INT) ENGINE=InnoDB; +--error ER_WRONG_VALUE_FOR_VAR +SET GLOBAL innodb_ft_aux_table='test/t'; +DROP TABLE t; +--error ER_WRONG_VALUE_FOR_VAR +SET GLOBAL innodb_ft_aux_table='test/t'; +SELECT @@GLOBAL.innodb_ft_aux_table; + SET GLOBAL innodb_ft_aux_table = @save_ft_aux_table; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_gis/r/rollback.result mariadb-10.11.13/mysql-test/suite/innodb_gis/r/rollback.result --- 
mariadb-10.11.11/mysql-test/suite/innodb_gis/r/rollback.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_gis/r/rollback.result 2025-05-19 16:14:25.000000000 +0000 @@ -412,3 +412,16 @@ ERROR HY000: Lost connection to server during query insert into t1 values(5, point(5,5), point(5,5), 5); drop table t1; +# +# MDEV-35420 Server aborts while deleting the record +# in spatial index +# +CREATE TABLE t1 (c POINT NOT NULL, SPATIAL(c)) engine=InnoDB; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +SET STATEMENT unique_checks=0,foreign_key_checks=0 FOR +START TRANSACTION; +INSERT INTO t1 SELECT ST_GeomFromText('POINT(114368751 656950466)') FROM seq_1_to_512; +ROLLBACK; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rollback.test mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rollback.test --- mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rollback.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rollback.test 2025-05-19 16:14:25.000000000 +0000 @@ -8,6 +8,7 @@ # Avoid CrashReporter popup on Mac --source include/not_crashrep.inc --source include/have_innodb_16k.inc +--source include/have_sequence.inc CREATE TABLE t4 (id bigint(12) unsigned NOT NULL auto_increment, c2 varchar(15) collate utf8_bin default NULL, @@ -475,3 +476,15 @@ insert into t1 values(5, point(5,5), point(5,5), 5); drop table t1; + +--echo # +--echo # MDEV-35420 Server aborts while deleting the record +--echo # in spatial index +--echo # +CREATE TABLE t1 (c POINT NOT NULL, SPATIAL(c)) engine=InnoDB; +CHECK TABLE t1; +SET STATEMENT unique_checks=0,foreign_key_checks=0 FOR +START TRANSACTION; +INSERT INTO t1 SELECT ST_GeomFromText('POINT(114368751 656950466)') FROM seq_1_to_512; +ROLLBACK; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rtree_purge.test mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rtree_purge.test --- 
mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rtree_purge.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rtree_purge.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,6 @@ # This test case will test R-tree purge. +--source include/long_test.inc --source include/innodb_page_size.inc --source include/have_sequence.inc --source include/not_valgrind.inc diff -Nru mariadb-10.11.11/mysql-test/suite/json/r/json_no_table.result mariadb-10.11.13/mysql-test/suite/json/r/json_no_table.result --- mariadb-10.11.11/mysql-test/suite/json/r/json_no_table.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/json/r/json_no_table.result 2025-05-19 16:14:25.000000000 +0000 @@ -2886,7 +2886,7 @@ ["a", "b", "c"] select charset(json_unquote('"abc"')); charset(json_unquote('"abc"')) -utf8mb3 +utf8mb4 select json_quote(convert(X'e68891' using utf8)); json_quote(convert(X'e68891' using utf8)) "我" diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.result mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,7 +1,12 @@ CREATE TABLE t(i INT) ENGINE INNODB; INSERT INTO t VALUES(1); +SET GLOBAL innodb_max_purge_lag_wait=0; # xtrabackup backup NOT FOUND /InnoDB: Allocated tablespace ID/ in backup.log +SELECT variable_value FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; +variable_value +0 INSERT INTO t VALUES(2); # xtrabackup prepare # shutdown server diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.test mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.test 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.test 2025-05-19 16:14:25.000000000 +0000 @@ -2,6 +2,7 @@ CREATE TABLE t(i INT) ENGINE INNODB; INSERT INTO t VALUES(1); +SET GLOBAL innodb_max_purge_lag_wait=0; echo # xtrabackup backup; let $targetdir=$MYSQLTEST_VARDIR/tmp/backup; --let $backup_log=$MYSQLTEST_VARDIR/tmp/backup.log @@ -18,6 +19,8 @@ --source include/search_pattern_in_file.inc --remove_file $backup_log +SELECT variable_value FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; INSERT INTO t VALUES(2); diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.result mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.result 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,9 @@ # CREATE TABLE t (pk INT PRIMARY KEY) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; ALTER TABLE t PARTITION BY KEY(pk); +# Incremental backup +# Prepare fullbackup +# Prepare incremental backup # shutdown server # remove datadir # xtrabackup move back diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.test mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.test 2025-05-19 16:14:25.000000000 +0000 @@ -16,12 +16,18 @@ ALTER TABLE t PARTITION BY KEY(pk); +--echo # Incremental backup --exec $XTRABACKUP --backup --target-dir=$incremental_dir --incremental-basedir=$basedir --protocol=tcp --port=$MASTER_MYPORT --user=root > $incremental_dir.log 2>&1 +--echo # Prepare fullbackup --exec $XTRABACKUP --prepare --target-dir=$basedir --user=root > $MYSQL_TMP_DIR/backup_prepare_0.log 2>&1 
---exec $XTRABACKUP --prepare --target-dir=$basedir --incremental-dir=$incremental_dir --user=root > $MYSQL_TMP_DIR/backup_prepare_1.log ---cat_file $MYSQL_TMP_DIR/backup_prepare_1.log +--echo # Prepare incremental backup +--exec $XTRABACKUP --prepare --target-dir=$basedir --incremental-dir=$incremental_dir --user=root > $MYSQL_TMP_DIR/backup_prepare_1.log 2>&1 let $targetdir=$basedir; -- source include/restart_and_restore.inc - SHOW CREATE TABLE t; DROP TABLE t; +remove_file $incremental_dir.log; +remove_file $MYSQL_TMP_DIR/backup_prepare_0.log; +remove_file $MYSQL_TMP_DIR/backup_prepare_1.log; +rmdir $basedir; +rmdir $incremental_dir; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/log_page_corruption.test mariadb-10.11.13/mysql-test/suite/mariabackup/log_page_corruption.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/log_page_corruption.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/log_page_corruption.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,5 @@ +--source include/long_test.inc --source include/have_debug.inc ---source include/no_valgrind_without_big.inc --source include/innodb_undo_tablespaces.inc --echo ######## diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partial.result mariadb-10.11.13/mysql-test/suite/mariabackup/partial.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/partial.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/partial.result 2025-05-19 16:14:25.000000000 +0000 @@ -4,8 +4,8 @@ INSERT INTO t21 VALUES(1); CREATE TABLE t2(i int) ENGINE INNODB; # xtrabackup backup -t1.new -t21.new +t1.ibd +t21.ibd # xtrabackup prepare t1.cfg t21.cfg diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partial_exclude.result mariadb-10.11.13/mysql-test/suite/mariabackup/partial_exclude.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/partial_exclude.result 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/mariabackup/partial_exclude.result 2025-05-19 16:14:25.000000000 +0000 @@ -14,7 +14,7 @@ INSERT INTO test.t2 VALUES(20); # xtrabackup backup COMMIT; -t1.new +t1.ibd DROP TABLE t1; DROP TABLE t2; DROP DATABASE db2; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.result mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,11 @@ +# +# MDEV-36437 mariabackup - confusing error message when running out of file handles with partitioned MyISAM +# +create table t1 ( +id bigint(20) not null auto_increment, +primary key (id) +) engine=myisam +partition by hash (id) +partitions 600; +FOUND 1 /Error 24 on file ./test/t1#P#p\d+\.MY[DI] open during `test`.`t1` table copy: Too many open files/ in backup.log +drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.test mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,25 @@ +source include/not_windows.inc; +source include/have_partition.inc; +let $targetdir=$MYSQLTEST_VARDIR/tmp/backup; +let $log=$MYSQL_TMP_DIR/backup.log; + +--echo # +--echo # MDEV-36437 mariabackup - confusing error message when running out of file handles with partitioned MyISAM +--echo # + +create table t1 ( + id bigint(20) not null auto_increment, + primary key (id) +) engine=myisam + partition by hash (id) + partitions 600; + +error 1; +exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir > $log 2>&1; +let SEARCH_FILE=$log; +let SEARCH_PATTERN=Error 
24 on file ./test/t1#P#p\d+\.MY[DI] open during `test`.`t1` table copy: Too many open files; +source include/search_pattern_in_file.inc; + +rmdir $targetdir; +#remove_file $log; +drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/unsupported_redo.result mariadb-10.11.13/mysql-test/suite/mariabackup/unsupported_redo.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/unsupported_redo.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/unsupported_redo.result 2025-05-19 16:14:25.000000000 +0000 @@ -22,8 +22,8 @@ ALTER TABLE t21 FORCE, ALGORITHM=INPLACE; # Create partial backup (excluding table t21), Ignore the # unsupported redo log for the table t21. -t1.new -t2.new +t1.ibd +t2.ibd # Prepare the full backup t1.ibd t2.ibd diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.opt mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.opt --- mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--master-info-file=$MYSQL_TMP_DIR/master_info_file.txt diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.result mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.result --- mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,18 @@ +CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_1; +CHANGE MASTER 'named' TO master_host='localhost', master_user='test', master_port=SERVER_MYPORT_2; +--list_files @@datadir *.info +relay-log-named.info +relay-log.info +--list_files MYSQL_TMP_DIR *.txt +master_info_file-named.txt +master_info_file.txt +multi-master_info_file.txt 
+--cat_file MYSQL_TMP_DIR/multi-master_info_file.txt +named +FOUND 1 matches in master_info_file.txt +FOUND 1 matches in master_info_file.txt +FOUND 1 matches in master_info_file.txt +FOUND 1 matches in master_info_file-named.txt +FOUND 1 matches in master_info_file-named.txt +FOUND 1 matches in master_info_file-named.txt +RESET REPLICA 'named' ALL; diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.test mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.test --- mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,38 @@ +# MDEV-36238: Test `--master-info-file` +# +# Other tests (such as `info_logs`) work explicitly with `(multi-)master.info`. +# This test sees that `--master-info-file` moves/renames this file. + +--source include/not_embedded.inc +--replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1 +--eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=$SERVER_MYPORT_1 +--replace_result $SERVER_MYPORT_2 SERVER_MYPORT_2 +--eval CHANGE MASTER 'named' TO master_host='localhost', master_user='test', master_port=$SERVER_MYPORT_2 + +--let $datadir = `SELECT @@datadir` +--echo --list_files @@datadir *.info +--list_files $datadir *.info +--echo --list_files MYSQL_TMP_DIR *.txt +--list_files $MYSQL_TMP_DIR *.txt + +--echo --cat_file MYSQL_TMP_DIR/multi-master_info_file.txt +--cat_file $MYSQL_TMP_DIR/multi-master_info_file.txt +--let SEARCH_OUTPUT= count + +--let SEARCH_FILE= $MYSQL_TMP_DIR/master_info_file.txt +--let SEARCH_PATTERN= \\n127.0.0.1\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\nroot\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\n$SERVER_MYPORT_1\\n +--source include/search_pattern_in_file.inc + +--let SEARCH_FILE= $MYSQL_TMP_DIR/master_info_file-named.txt +--let SEARCH_PATTERN= 
\\nlocalhost\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\ntest\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\n$SERVER_MYPORT_2\\n +--source include/search_pattern_in_file.inc + +RESET REPLICA 'named' ALL; diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.cnf mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.cnf --- mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,13 @@ +!include ./my.cnf + +[mysqld.1] +show-slave-auth-info + +[mysqld.4] +server-id=4 +log-warnings=2 +report-user=my_user +report-password=my_password + +[ENV] +SERVER_MYPORT_4= @mysqld.4.port diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.result mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.result --- mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,45 @@ +# Setup +connect master1,127.0.0.1,root,,,$SERVER_MYPORT_1; +connect master2,127.0.0.1,root,,,$SERVER_MYPORT_2; +connect slave1,127.0.0.1,root,,,$SERVER_MYPORT_3; +connect slave2,127.0.0.1,root,,,$SERVER_MYPORT_4; +connection slave2; +CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_1; +CHANGE MASTER 'control sample' TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_2; +START ALL SLAVES; +connection slave1; +CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_1; +CHANGE MASTER 'control sample' TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_2; +START ALL SLAVES; +# Test +connection master2; +SHOW SLAVE HOSTS; 
+Server_id Host Port Master_id +3 localhost SERVER_MYPORT_3 2 +4 localhost SERVER_MYPORT_4 2 +connection master1; +SHOW SLAVE HOSTS; +Server_id Host User Password Port Master_id +3 localhost SERVER_MYPORT_3 1 +4 localhost my_user my_password SERVER_MYPORT_4 1 +SHOW REPLICA HOSTS; +Server_id Host User Password Port Master_id +3 localhost SERVER_MYPORT_3 1 +4 localhost my_user my_password SERVER_MYPORT_4 1 +# Cleanup +connection slave2; +STOP ALL SLAVES; +include/wait_for_slave_to_stop.inc +SET @@SESSION.default_master_connection= 'control sample'; +include/wait_for_slave_to_stop.inc +RESET SLAVE ALL; +connection slave1; +STOP ALL SLAVES; +include/wait_for_slave_to_stop.inc +SET @@SESSION.default_master_connection= 'control sample'; +include/wait_for_slave_to_stop.inc +RESET SLAVE ALL; +disconnect master1; +disconnect master2; +disconnect slave1; +disconnect slave2; diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.test mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.test --- mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,83 @@ +# MDEV-36238: Test `--show-slave-auth-info` (and `--report-user`/`password`) +# +# `rpl.rpl_show_slave_hosts` and `rpl.rpl_slave_alias_replica` +# (and several others) test SHOW SLAVE HOSTS without `--show-slave-auth-info`. +# This test supplements them with a comparison between with and without. + +# SHOW SLAVE HOSTS is agnostic of binlog formats +--source include/have_binlog_format_mixed.inc + +--echo # Setup + +# This server has `--show-slave-auth-info`. +--connect (master1,127.0.0.1,root,,,$SERVER_MYPORT_1) +# This `--show-slave-auth-info`-less server asserts that it is per-master. +--connect (master2,127.0.0.1,root,,,$SERVER_MYPORT_2) +# This is a non-reporting slave. 
+--connect (slave1,127.0.0.1,root,,,$SERVER_MYPORT_3) +# This is a self-reporting slave. +--connect (slave2,127.0.0.1,root,,,$SERVER_MYPORT_4) + +--let $rpl_server_number= 2 +while ($rpl_server_number) +{ + --connection slave$rpl_server_number + + --replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1 + --eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=$SERVER_MYPORT_1 + --replace_result $SERVER_MYPORT_2 SERVER_MYPORT_2 + --eval CHANGE MASTER 'control sample' TO master_host='127.0.0.1', master_user='root', master_port=$SERVER_MYPORT_2 + --disable_warnings + START ALL SLAVES; + --enable_warnings + + --dec $rpl_server_number +} + +--echo # Test + +--let $rpl_server_number= 2 +while ($rpl_server_number) +{ + --connection master$rpl_server_number + + # Make sure the master's synced up + --let $show_statement= SHOW SLAVE HOSTS + --let $field= Server_id + --let $condition= =3 + --source include/wait_show_condition.inc + --let $condition= =4 + --source include/wait_show_condition.inc + + --replace_result $SERVER_MYPORT_3 SERVER_MYPORT_3 $SERVER_MYPORT_4 SERVER_MYPORT_4 + SHOW SLAVE HOSTS; + + --dec $rpl_server_number +} + +# MDEV-20601 Make REPLICA a synonym for SLAVE in SQL statements +--replace_result $SERVER_MYPORT_3 SERVER_MYPORT_3 $SERVER_MYPORT_4 SERVER_MYPORT_4 +SHOW REPLICA HOSTS; + +--echo # Cleanup + +--let $rpl_server_number= 2 +while ($rpl_server_number) +{ + --connection slave$rpl_server_number + + --disable_warnings + STOP ALL SLAVES; + --enable_warnings + --source include/wait_for_slave_to_stop.inc + SET @@SESSION.default_master_connection= 'control sample'; + --source include/wait_for_slave_to_stop.inc + RESET SLAVE ALL; + + --dec $rpl_server_number +} + +--disconnect master1 +--disconnect master2 +--disconnect slave1 +--disconnect slave2 diff -Nru mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_innodb.test mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_innodb.test --- 
mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_innodb.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_innodb.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_innodb.inc --source include/have_partition.inc --source include/have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_memory.test mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_memory.test --- mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_memory.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_memory.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_partition.inc --source include/have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_myisam.test mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_myisam.test --- mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_myisam.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_myisam.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_partition.inc --source include/have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/perfschema/r/threads_innodb.result mariadb-10.11.13/mysql-test/suite/perfschema/r/threads_innodb.result --- mariadb-10.11.11/mysql-test/suite/perfschema/r/threads_innodb.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/perfschema/r/threads_innodb.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,10 +1,10 @@ SELECT name, type, processlist_user, processlist_host, processlist_db, -processlist_command, processlist_time, processlist_state, processlist_info, +processlist_command, processlist_time, processlist_info, parent_thread_id, role, 
instrumented FROM performance_schema.threads WHERE name LIKE 'thread/innodb/%' GROUP BY name; -name type processlist_user processlist_host processlist_db processlist_command processlist_time processlist_state processlist_info parent_thread_id role instrumented -thread/innodb/page_cleaner_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL NULL YES -thread/innodb/page_encrypt_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL NULL YES -thread/innodb/thread_pool_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL NULL YES +name type processlist_user processlist_host processlist_db processlist_command processlist_time processlist_info parent_thread_id role instrumented +thread/innodb/page_cleaner_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL YES +thread/innodb/page_encrypt_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL YES +thread/innodb/thread_pool_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL YES diff -Nru mariadb-10.11.11/mysql-test/suite/perfschema/t/threads_innodb.test mariadb-10.11.13/mysql-test/suite/perfschema/t/threads_innodb.test --- mariadb-10.11.11/mysql-test/suite/perfschema/t/threads_innodb.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/perfschema/t/threads_innodb.test 2025-05-19 16:14:25.000000000 +0000 @@ -14,7 +14,7 @@ # We suppress here duplicates rows with the goal to avoid that the test fails # in case some defaults are changed. 
SELECT name, type, processlist_user, processlist_host, processlist_db, - processlist_command, processlist_time, processlist_state, processlist_info, + processlist_command, processlist_time, processlist_info, parent_thread_id, role, instrumented FROM performance_schema.threads WHERE name LIKE 'thread/innodb/%' diff -Nru mariadb-10.11.11/mysql-test/suite/plugins/r/server_audit.result mariadb-10.11.13/mysql-test/suite/plugins/r/server_audit.result --- mariadb-10.11.11/mysql-test/suite/plugins/r/server_audit.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/plugins/r/server_audit.result 2025-05-19 16:14:25.000000000 +0000 @@ -20,6 +20,9 @@ set global server_audit_incl_users=null; set global server_audit_file_path='server_audit.log'; set global server_audit_output_type=file; +set global server_audit_file_path=REPEAT(REPEAT('new_file_name', 50), 50); +Warnings: +Warning 1 server_audit_file_path can't exceed FN_LEN characters. set global server_audit_logging=on; set global server_audit_incl_users= repeat("'root',", 10000); ERROR 42000: Variable 'server_audit_incl_users' can't be set to the value of ''root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','...' 
diff -Nru mariadb-10.11.11/mysql-test/suite/plugins/t/server_audit.test mariadb-10.11.13/mysql-test/suite/plugins/t/server_audit.test --- mariadb-10.11.11/mysql-test/suite/plugins/t/server_audit.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/plugins/t/server_audit.test 2025-05-19 16:14:25.000000000 +0000 @@ -20,6 +20,10 @@ set global server_audit_incl_users=null; set global server_audit_file_path='server_audit.log'; set global server_audit_output_type=file; + +--replace_regex /[1-9][0-9][0-9]+/FN_LEN/ +set global server_audit_file_path=REPEAT(REPEAT('new_file_name', 50), 50); + set global server_audit_logging=on; --error ER_WRONG_VALUE_FOR_VAR diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result mariadb-10.11.13/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,7 @@ CREATE TABLE t (a INT) ENGINE = innodb; connection slave; include/stop_slave.inc +SET STATEMENT sql_log_bin= 0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads; SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode; SET @@global.slave_parallel_threads= 2; @@ -19,6 +20,7 @@ connection slave; SET @@global.debug_dbug="+d,hold_worker_on_schedule"; start slave; +SET debug_sync = 'now WAIT_FOR reached_pause'; connection slave1; backup stage start; backup stage block_commit; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_create_select_row.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_create_select_row.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_create_select_row.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_create_select_row.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,158 @@ 
+include/master-slave.inc +[connection master] +connection master; +set @max_binlog_cache_size = @@global.max_binlog_cache_size; +set @binlog_cache_size = @@global.binlog_cache_size; +set @@global.max_binlog_cache_size = 4096; +set @@global. binlog_cache_size = 4096; +# +# MDEV-35207 ignored error at binlogging by CREATE-TABLE-SELECT leads to assert +# +connect conn_err,localhost,root,,; +call mtr.add_suppression("Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage"); +create table t engine=myisam select repeat ('a',4096*3) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +create table t engine=innodb select repeat ('a',4096*3) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +create table t (a int unique, b char) select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +ERROR 23000: Duplicate entry '1' for key 'a' +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +disconnect conn_err; +connection master; + +# +# MDEV-35499 errored CREATE-OR-REPLACE-SELECT does not DROP table in binlog +# +# +# Engine = innodb +# +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a int primary key, b char) engine=innodb select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement 
binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=innodb select repeat ('a',1024) AS a union select repeat ('a',3*4096) AS a union select repeat ('a',3*4096) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=innodb select repeat ('a',4096*3) AS a;; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +# +# Engine = myisam +# +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a int primary key, b char) engine=myisam select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' 
+include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=myisam select repeat ('a',1024) AS a union select repeat ('a',3*4096) AS a union select repeat ('a',3*4096) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=myisam select repeat ('a',4096*3) AS a;; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +create table ti_pk (a int primary key) engine=innodb; +create table ta (a int) engine=aria; +create function f_ia(arg int) 
+returns integer +begin +insert into ti_pk set a=1; +insert into ta set a=1; +insert into ti_pk set a=arg; +return 1; +end | +set statement binlog_format = ROW for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t_y; +ERROR 42S02: Table 'test.t_y' doesn't exist +# correct execution: `ta` is modified and its new record is binlogged +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Table_map # # table_id: # (test.ta) +master-bin.000001 # Write_rows_v1 # # table_id: # flags: STMT_END_F +master-bin.000001 # Query # # COMMIT +select * from ta; +a +1 +select * from ti_pk; +a +connection slave; +include/diff_tables.inc [master:ta,slave:ta] +connection master; +delete from ta; +connection slave; +connection master; +set statement binlog_format = STATEMENT for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t_y; +ERROR 42S02: Table 'test.t_y' doesn't exist +# ***TODO: fix MDEV-36027***. As of now `ta` is modified but that's not binlogged +include/show_binlog_events.inc +select *,'on_master' from ta; +a on_master +1 on_master +select * from ti_pk; +a +connection slave; +select *,'on_slave' from ta; +a on_slave +connection master; +drop function f_ia; +drop table ti_pk, ta; +SET @@global.max_binlog_cache_size = @max_binlog_cache_size; +SET @@global. 
binlog_cache_size = @binlog_cache_size; +connection slave; +End of the tests +include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_gtid_crash.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_gtid_crash.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_gtid_crash.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_gtid_crash.result 2025-05-19 16:14:25.000000000 +0000 @@ -12,6 +12,8 @@ connection server_2; SET sql_log_bin=0; call mtr.add_suppression('Master command COM_REGISTER_SLAVE failed: failed registering on master, reconnecting to try again'); +call mtr.add_suppression('Slave I/O: .*Lost connection to server during query'); +call mtr.add_suppression("Slave I/O thread couldn't register on master"); SET sql_log_bin=1; include/stop_slave.inc CHANGE MASTER TO master_host = '127.0.0.1', master_port = MASTER_PORT, diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_master_pos_wait.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_master_pos_wait.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_master_pos_wait.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_master_pos_wait.result 2025-05-19 16:14:25.000000000 +0000 @@ -43,6 +43,9 @@ select master_pos_wait('master-bin.000001',1000000,1,"my_slave"); master_pos_wait('master-bin.000001',1000000,1,"my_slave") -1 +select master_pos_wait('master-bin.000001',1000000,1,"MY_SLAVE"); +master_pos_wait('master-bin.000001',1000000,1,"MY_SLAVE") +-1 STOP SLAVE 'my_slave'; RESET SLAVE 'my_slave' ALL; change master to master_port=MASTER_MYPORT, master_host='127.0.0.1', master_user='root'; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,16 +1,15 @@ ***MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts *** include/master-slave.inc [connection master] -connection server_2; -SET sql_log_bin=0; +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CALL mtr.add_suppression("InnoDB: Transaction was aborted due to "); CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); -SET sql_log_bin=1; +connection server_2; SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; include/stop_slave.inc SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_use_gtid=slave_pos; connection server_1; -ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; CREATE TABLE t4 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB; INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6); connect con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,41 @@ +# Set up Semi-Sync with rpl_semi_sync_master_wait_no_slave=0 +include/master-slave.inc +[connection master] +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; +SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= 0; +connection slave; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; +include/start_slave.inc +connection master; +connection slave; +connection master; +SELECT ID INTO @binlog_dump_tid +FROM information_schema.PROCESSLIST WHERE COMMAND = 'Binlog Dump'; +# Control State +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = 
@binlog_dump_tid; +STATE +Master has sent all binlog to slave; waiting for more updates +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 1 +# Disable Semi-Sync while the dump thread is still connected to its slave +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid; +STATE +Master has sent all binlog to slave; waiting for more updates +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 1 +# Disconnect the slave and wait until the master's dump thread is gone +connection slave; +STOP SLAVE; +connection master; +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 0 +# Cleanup +SET @@GLOBAL.rpl_semi_sync_master_enabled= 0; +SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= 1; +connection slave; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,53 @@ +# Skip starting the slave because we manually start with SSL later +include/master-slave.inc +[connection master] +# +# Setup +connection master; +CREATE USER replssl@localhost; +GRANT REPLICATION SLAVE on *.* to replssl@localhost REQUIRE SSL; +set @orig_master_enabled= @@GLOBAL.rpl_semi_sync_master_enabled; +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; +connection slave; +CHANGE MASTER TO +master_user='replssl', +master_password='', +master_ssl=1, +master_ssl_ca='MYSQL_TEST_DIR/std_data/cacert.pem', +master_ssl_cert='MYSQL_TEST_DIR/std_data/client-cert.pem', 
+master_ssl_key='MYSQL_TEST_DIR/std_data/client-key.pem'; +set @orig_slave_enabled= @@GLOBAL.rpl_semi_sync_slave_enabled; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; +include/start_slave.inc +connection master; +# Verify Semi-Sync is active +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 1 +# Create some table so slave can be seen as up-to-date and working +connection master; +CREATE TABLE t1 (a INT); +connection slave; +# Disconnect the slave and wait until the master's dump thread is gone +connection slave; +STOP SLAVE; +connection master; +# MDEV-36663: Verifying dump thread connection is killed.. +# ..done +# Cleanup +connection master; +SET @@GLOBAL.rpl_semi_sync_master_enabled= @orig_master_enabled; +DROP USER replssl@localhost; +DROP TABLE t1; +connection slave; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= @orig_slave_enabled; +CHANGE MASTER TO +master_user='root', +master_ssl=0, +master_ssl_ca='', +master_ssl_cert='', +master_ssl_key=''; +connection slave; +include/start_slave.inc +include/rpl_end.inc +# End of rpl_semi_sync_ssl_stop.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,26 @@ +include/master-slave.inc +[connection master] +connection master; +create table t1 (a int primary key, b int) engine=innodb; +insert t1 values (1,1),(3,3),(5,5),(7,7); +create table t2 (m int) engine=aria; +# Create multi-engine, two-phase XA transaction (T1) +xa start '1'; +insert t2 values (1); +update t1 set b=50 where b=5; +xa end '1'; +xa prepare '1'; +# Create T2 +connection server_1; +update t1 set b=10 where a=5; +connection master; +xa commit '1'; +connection server_1; 
+include/save_master_gtid.inc +# This would hang prior to MDEV-21117 +connection slave; +include/sync_with_master_gtid.inc +connection master; +drop table t1, t2; +include/rpl_end.inc +# End of rpl_xa_2pc_multi_engine.test diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test mariadb-10.11.13/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test 2025-05-19 16:14:25.000000000 +0000 @@ -11,6 +11,7 @@ --sync_slave_with_master --source include/stop_slave.inc +SET STATEMENT sql_log_bin= 0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads; SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode; SET @@global.slave_parallel_threads= 2; @@ -28,20 +29,21 @@ --connection slave SET @@global.debug_dbug="+d,hold_worker_on_schedule"; start slave; +SET debug_sync = 'now WAIT_FOR reached_pause'; --let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to commit" --source include/wait_condition.inc --connection slave1 backup stage start; ---send backup stage block_commit +backup stage block_commit; --connection slave --let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for backup lock" SET debug_sync = 'now SIGNAL continue_worker'; +--source include/wait_condition.inc SET debug_sync = RESET; --connection slave1 -reap; backup stage end; --connection master diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_create_select_row.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_create_select_row.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_create_select_row.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_create_select_row.test 2025-05-19 
16:14:25.000000000 +0000 @@ -0,0 +1,161 @@ +--source include/have_binlog_format_row.inc +--source include/have_innodb.inc +--source include/master-slave.inc + +--connection master +set @max_binlog_cache_size = @@global.max_binlog_cache_size; +set @binlog_cache_size = @@global.binlog_cache_size; +set @@global.max_binlog_cache_size = 4096; +set @@global. binlog_cache_size = 4096; + +--echo # +--echo # MDEV-35207 ignored error at binlogging by CREATE-TABLE-SELECT leads to assert +--echo # +# fix the current (write) binlog position +--let $binlog_file_0= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start_0 = query_get_value(SHOW MASTER STATUS, Position, 1) + +# use a separate connection also to validate its close will be clean +connect (conn_err,localhost,root,,); + +call mtr.add_suppression("Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage"); +--error ER_TRANS_CACHE_FULL +create table t engine=myisam select repeat ('a',4096*3) AS a; + +--error ER_TRANS_CACHE_FULL +create table t engine=innodb select repeat ('a',4096*3) AS a; + +--error ER_DUP_ENTRY +create table t (a int unique, b char) select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +--error ER_NO_SUCH_TABLE +select * from t; + +--disconnect conn_err + +--connection master +--let $binlog_file_1= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start_1= query_get_value(SHOW MASTER STATUS, Position, 1) + +--let $cmp = `select strcmp('$binlog_file_1', '$binlog_file_0') <> 0 OR $binlog_start_1 <> $binlog_start_0` +if (!$cmp) +{ + --echo *** Error: unexpected advance of binlog position + --die +} + +--echo +--echo # +--echo # MDEV-35499 errored CREATE-OR-REPLACE-SELECT does not DROP table in binlog +--echo # +--let $i = 2 +while ($i) +{ + --let $engine=`select if($i % 2, "myisam", "innodb")` + --echo # + --echo # Engine = $engine + --echo # + set statement binlog_format=statement for create table t (a int) select 1 as a; + --let $binlog_file= 
query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + --error ER_DUP_ENTRY + --eval set statement binlog_format=row for create or replace table t (a int primary key, b char) engine=$engine select 1 AS a, 'b' as b union select 1 as a, 'c' as b + --error ER_NO_SUCH_TABLE + select * from t; + --echo # + --echo # Prove an expected lonely `DROP table t' + --source include/show_binlog_events.inc + + # error before stmt commit + set statement binlog_format=statement for create table t (a int) select 1 as a; + --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + --error ER_TRANS_CACHE_FULL + --eval set statement binlog_format=row for create or replace table t (a text) engine=$engine select repeat ('a',1024) AS a union select repeat ('a',3*4096) AS a union select repeat ('a',3*4096) AS a + --error ER_NO_SUCH_TABLE + select * from t; + --echo # + --echo # Prove an expected lonely `DROP table t' + --source include/show_binlog_events.inc + + # error at stmt commit + set statement binlog_format=statement for create table t (a int) select 1 as a; + --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + --error ER_TRANS_CACHE_FULL + --eval set statement binlog_format=row for create or replace table t (a text) engine=$engine select repeat ('a',4096*3) AS a; + --error ER_NO_SUCH_TABLE + select * from t; + --echo # + --echo # Prove an expected lonely `DROP table t' + --source include/show_binlog_events.inc + +--dec $i +} + +# Tests of mixed engines to demonstrate non-transaction table updates +# are binlogged or otherwise MDEV-36027. 
+create table ti_pk (a int primary key) engine=innodb; +create table ta (a int) engine=aria; +delimiter |; +create function f_ia(arg int) +returns integer +begin + insert into ti_pk set a=1; + insert into ta set a=1; + insert into ti_pk set a=arg; + return 1; +end | +delimiter ;| + +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + +--error ER_DUP_ENTRY +set statement binlog_format = ROW for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +--error ER_NO_SUCH_TABLE +select * from t_y; + +--echo # correct execution: `ta` is modified and its new record is binlogged +--source include/show_binlog_events.inc +select * from ta; +select * from ti_pk; + +--sync_slave_with_master +--let $diff_tables=master:ta,slave:ta +--source include/diff_tables.inc + +--connection master +delete from ta; +--sync_slave_with_master + +--connection master +# MDEV-36027 Errored-out CREATE-SELECT does not binlog results of any function modifying non-transactional table +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) +--error ER_DUP_ENTRY +set statement binlog_format = STATEMENT for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +--error ER_NO_SUCH_TABLE +select * from t_y; + +--echo # ***TODO: fix MDEV-36027***. As of now `ta` is modified but that's not binlogged +--source include/show_binlog_events.inc +select *,'on_master' from ta; +select * from ti_pk; + +--sync_slave_with_master +select *,'on_slave' from ta; + +# Cleanup +--connection master +drop function f_ia; +drop table ti_pk, ta; + +SET @@global.max_binlog_cache_size = @max_binlog_cache_size; +SET @@global. 
binlog_cache_size = @binlog_cache_size; + +# test that binlog replicates correctly to slave +# --connection slave +--sync_slave_with_master + +--echo End of the tests +--source include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt 2025-05-19 16:14:25.000000000 +0000 @@ -1 +1 @@ ---master-retry-count=100 --slave-net-timeout=10 +--master-retry-count=500 --slave-net-timeout=10 diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash.test 2025-05-19 16:14:25.000000000 +0000 @@ -24,6 +24,8 @@ --sync_with_master SET sql_log_bin=0; call mtr.add_suppression('Master command COM_REGISTER_SLAVE failed: failed registering on master, reconnecting to try again'); +call mtr.add_suppression('Slave I/O: .*Lost connection to server during query'); +call mtr.add_suppression("Slave I/O thread couldn't register on master"); SET sql_log_bin=1; --source include/stop_slave.inc --replace_result $MASTER_MYPORT MASTER_PORT diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test 2025-05-19 16:14:25.000000000 +0000 @@ -9,6 +9,7 @@ # * Various states of master and heartbeat # * Circular replication ############################################################# +--source include/long_test.inc --source include/master-slave.inc # # The test 
runs long and does not have any specifics to diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_master_pos_wait.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_master_pos_wait.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_master_pos_wait.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_master_pos_wait.test 2025-05-19 16:14:25.000000000 +0000 @@ -48,6 +48,7 @@ --echo # Call with a valid connection name -- hangs before MDEV-7130 fix (expected -1) select master_pos_wait('master-bin.000001',1000000,1,"my_slave"); +select master_pos_wait('master-bin.000001',1000000,1,"MY_SLAVE"); STOP SLAVE 'my_slave'; RESET SLAVE 'my_slave' ALL; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test 2025-05-19 16:14:25.000000000 +0000 @@ -5,21 +5,19 @@ --source include/have_debug_sync.inc --source include/master-slave.inc ---disable_query_log -call mtr.add_suppression("InnoDB: Transaction was aborted due to "); ---enable_query_log +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CALL mtr.add_suppression("InnoDB: Transaction was aborted due to "); +CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); +--save_master_pos --connection server_2 -SET sql_log_bin=0; -CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); -SET sql_log_bin=1; +--sync_with_master SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; --source include/stop_slave.inc SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_use_gtid=slave_pos; --connection server_1 -ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; CREATE TABLE t4 (a INT PRIMARY 
KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB; INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6); --connect (con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,) diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test 2025-05-19 16:14:25.000000000 +0000 @@ -3,6 +3,7 @@ # tables. Specifically when drop temporary tables and create temporary tables # are used. ################################################################################### +--source include/long_test.inc --source include/have_binlog_format_row.inc --source include/have_innodb.inc --source include/master-slave.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync.test 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,7 @@ # Please check all dependent tests after modifying it # +source include/long_test.inc; source include/not_embedded.inc; source include/have_innodb.inc; source include/master-slave.inc; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_binlog_format_statement.inc set global rpl_semi_sync_master_wait_point=AFTER_SYNC; source rpl_semi_sync.test; diff -Nru 
mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_binlog_format_row.inc set global rpl_semi_sync_master_wait_point=AFTER_SYNC; source rpl_semi_sync.test; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,68 @@ +# MDEV-36359: Master crashes when reverting to async after Semi-Sync disabled. 
+# +# Assert behavior of turning Semi-Sync off on +# the master when still connected to a slave + +--source include/have_binlog_format_mixed.inc # format-agnostic + +--echo # Set up Semi-Sync with rpl_semi_sync_master_wait_no_slave=0 +--let $rpl_skip_start_slave= 1 +--source include/master-slave.inc + +--let $orig_master_enabled=`SELECT @@GLOBAL.rpl_semi_sync_master_enabled` +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; +--let $orig_wait_no_slave=`SELECT @@GLOBAL.rpl_semi_sync_master_wait_no_slave` +SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= 0; + +--connection slave +--let $orig_slave_enabled=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled` +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; +--source include/start_slave.inc + +--connection master +# Make sure Semi-Sync is active +--let $status_var= Rpl_semi_sync_master_status +--let $status_var_value= ON +--source include/wait_for_status_var.inc + +--sync_slave_with_master +--connection master + +--disable_cursor_protocol +SELECT ID INTO @binlog_dump_tid + FROM information_schema.PROCESSLIST WHERE COMMAND = 'Binlog Dump'; +--enable_cursor_protocol + +--echo # Control State +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid; +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Disable Semi-Sync while the dump thread is still connected to its slave +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +--let $status_var_value= OFF +--source include/wait_for_status_var.inc + +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid; +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Disconnect the slave and wait until the master's dump thread is gone +--connection slave +STOP SLAVE; +# Starting with MDEV-13073, +# Semi-Sync STOP SLAVE also terminates its dump thread on the master. 
+--connection master + +# MDEV-36359: The disconnection would crash the master and leave the wait with +# error 2013 'Lost connection to server during query' +--let $wait_condition= SELECT COUNT(*)=0 FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid +--source include/wait_condition.inc +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Cleanup +--eval SET @@GLOBAL.rpl_semi_sync_master_enabled= $orig_master_enabled +--eval SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= $orig_wait_no_slave +--connection slave +--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled= $orig_slave_enabled + +--let $rpl_only_running_threads= 1 +--source include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,100 @@ +# +# This test verifies that semi-sync setups configured to use SSL can kill +# the replication connection when the IO thread is stopped (e.g. from +# STOP SLAVE). The way it should happen, is that the IO thread creates a new +# connection to the primary which issues KILL on the connection id of the +# replication connection. MDEV-36663 reported an issue where this new +# kill-oriented connection could not connect to a primary when it requires +# connections to use SSL. +# +# This test sets up a semi-sync SSL master-slave topology, and stops the +# slave IO thread. It then validates that the connection was killed by using +# the wait_condition.inc utility to wait for the binlog dump thread to die, +# and also validates that the status variable Rpl_semi_sync_master_clients +# reports as 0. 
+# +# References: +# MDEV-36663: Semi-sync Replica Can't Kill Dump Thread When Using SSL +# +--source include/have_binlog_format_mixed.inc # format-agnostic +--source include/have_ssl_communication.inc + +--echo # Skip starting the slave because we manually start with SSL later +--let $rpl_skip_start_slave= 1 +--source include/master-slave.inc + +--echo # +--echo # Setup +--connection master +CREATE USER replssl@localhost; +GRANT REPLICATION SLAVE on *.* to replssl@localhost REQUIRE SSL; + +set @orig_master_enabled= @@GLOBAL.rpl_semi_sync_master_enabled; +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; + +--connection slave +--replace_result $MYSQL_TEST_DIR MYSQL_TEST_DIR +eval CHANGE MASTER TO + master_user='replssl', + master_password='', + master_ssl=1, + master_ssl_ca='$MYSQL_TEST_DIR/std_data/cacert.pem', + master_ssl_cert='$MYSQL_TEST_DIR/std_data/client-cert.pem', + master_ssl_key='$MYSQL_TEST_DIR/std_data/client-key.pem'; + +set @orig_slave_enabled= @@GLOBAL.rpl_semi_sync_slave_enabled; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; + +--source include/start_slave.inc + +--connection master +--echo # Verify Semi-Sync is active +--let $status_var= Rpl_semi_sync_master_clients +--let $status_var_value= 1 +--source include/wait_for_status_var.inc +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Create some table so slave can be seen as up-to-date and working +--connection master +CREATE TABLE t1 (a INT); +--sync_slave_with_master + +--echo # Disconnect the slave and wait until the master's dump thread is gone +--connection slave +STOP SLAVE; +--connection master + +--echo # MDEV-36663: Verifying dump thread connection is killed.. +# Prior to MDEV-36663 fixes, this would time out and +# Rpl_semi_sync_master_clients would remain 1. 
+--let $wait_condition= SELECT COUNT(*)=0 FROM information_schema.PROCESSLIST WHERE USER = 'replssl' +--source include/wait_condition.inc + +--let $n_master_clients= query_get_value(SHOW STATUS LIKE 'Rpl_semi_sync_master_clients', Value, 1) +if ($n_master_clients) +{ + --echo # Rpl_semi_sync_master_clients: $n_master_clients + --die Semi-sync dump thread connection not killed +} +--echo # ..done + +--echo # Cleanup +--connection master +SET @@GLOBAL.rpl_semi_sync_master_enabled= @orig_master_enabled; +DROP USER replssl@localhost; +DROP TABLE t1; + +--connection slave +SET @@GLOBAL.rpl_semi_sync_slave_enabled= @orig_slave_enabled; +CHANGE MASTER TO + master_user='root', + master_ssl=0, + master_ssl_ca='', + master_ssl_cert='', + master_ssl_key=''; + +--connection slave +--source include/start_slave.inc + +--source include/rpl_end.inc +--echo # End of rpl_semi_sync_ssl_stop.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_typeconv.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_typeconv.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_typeconv.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_typeconv.test 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,7 @@ # Please check all dependent tests after modifying it # +--source include/long_test.inc --source include/have_binlog_format_row.inc --source include/master-slave.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,63 @@ +# +# This test ensures binlog order is correct for multi-engine, two-phase XA +# transactions. 
MDEV-26652 exposed a race condition which would allow +# concurrent transactions which modify the same table record to binlog in +# the "opposite" order, i.e. what _should_ be: +# T1 XA PREPARE +# T1 XA COMMIT +# T2 +# +# was binlogged as +# T1 XA PREPARE +# T2 +# T1 XA COMMIT +# +# which would break replication. +# +# Note that the actual fix for this issue was done with MDEV-21117. +# +# References: +# MDEV-26652: xa transactions binlogged in wrong order +# MDEV-21117: refine the server binlog-based recovery for semisync +# +source include/have_binlog_format_row.inc; +source include/have_innodb.inc; +source include/master-slave.inc; + +--connection master +create table t1 (a int primary key, b int) engine=innodb; +insert t1 values (1,1),(3,3),(5,5),(7,7); +create table t2 (m int) engine=aria; + + +--echo # Create multi-engine, two-phase XA transaction (T1) +xa start '1'; +insert t2 values (1); +update t1 set b=50 where b=5; +xa end '1'; + +# Aria doesn't support XA PREPARE, so disable warnings +--disable_warnings +xa prepare '1'; +--enable_warnings + +--echo # Create T2 +--connection server_1 +--send update t1 set b=10 where a=5 + +--connection master +xa commit '1'; + +--connection server_1 +--reap +--source include/save_master_gtid.inc + +--echo # This would hang prior to MDEV-21117 +--connection slave +--source include/sync_with_master_gtid.inc + +--connection master +drop table t1, t2; + +--source include/rpl_end.inc +--echo # End of rpl_xa_2pc_multi_engine.test diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.opt mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.opt --- mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--innodb-sys-tables diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.result mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.result --- 
mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.result 2025-05-19 16:14:25.000000000 +0000 @@ -166,6 +166,32 @@ select next value for t1; next value for t1 11 +$check_innodb_flags; +is_sequence +12288 +alter table t1 sequence=0; +begin; +delete from t1; +rollback; +$check_innodb_flags; +is_sequence +0 +alter table t1 sequence=1; +$check_innodb_flags; +is_sequence +12288 +alter table t1 sequence=0, algorithm=copy; +$check_innodb_flags; +is_sequence +0 +alter table t1 sequence=1, algorithm=inplace; +ERROR 0A000: ALGORITHM=INPLACE is not supported. Reason: SEQUENCE. Try ALGORITHM=COPY +alter table t1 sequence=1, algorithm=copy; +$check_innodb_flags; +is_sequence +12288 +alter table t1 sequence=0, algorithm=inplace; +ERROR 0A000: ALGORITHM=INPLACE is not supported. Reason: SEQUENCE. Try ALGORITHM=COPY drop sequence t1; # # ALTER TABLE diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.test mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.test 2025-05-19 16:14:25.000000000 +0000 @@ -80,6 +80,25 @@ show create sequence t1; select * from t1; select next value for t1; +let $check_innodb_flags = +select flag & 12288 is_sequence from information_schema.innodb_sys_tables +where name='test/t1'; +evalp $check_innodb_flags; +alter table t1 sequence=0; +begin; +delete from t1; +rollback; +evalp $check_innodb_flags; +alter table t1 sequence=1; +evalp $check_innodb_flags; +alter table t1 sequence=0, algorithm=copy; +evalp $check_innodb_flags; +--error ER_ALTER_OPERATION_NOT_SUPPORTED_REASON +alter table t1 sequence=1, algorithm=inplace; +alter table t1 sequence=1, algorithm=copy; +evalp $check_innodb_flags; +--error ER_ALTER_OPERATION_NOT_SUPPORTED_REASON +alter table t1 sequence=0, algorithm=inplace; drop 
sequence t1; --echo # diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.result mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.result 2025-05-19 16:14:25.000000000 +0000 @@ -47,14 +47,57 @@ 11 1 9223372036854775806 1 1 1000 0 0 connection only_alter; select next value for s1; -ERROR 42000: INSERT command denied to user 'only_alter'@'localhost' for table `mysqltest_1`.`s1` +ERROR 42000: SELECT, INSERT command denied to user 'only_alter'@'localhost' for table `mysqltest_1`.`s1` alter sequence s1 restart= 11; select * from s1; ERROR 42000: SELECT command denied to user 'only_alter'@'localhost' for table `mysqltest_1`.`s1` connection default; -drop database mysqltest_1; drop user 'normal'@'%'; drop user 'read_only'@'%'; drop user 'read_write'@'%'; drop user 'alter'@'%'; drop user 'only_alter'@'%'; +drop sequence s1; +# +# MDEV-36413 User without any privileges to a sequence can read from +# it and modify it via column default +# +create sequence s1; +create sequence s2; +select * from s2; +next_not_cached_value minimum_value maximum_value start_value increment cache_size cycle_option cycle_count +1 1 9223372036854775806 1 1 1000 0 0 +create table t2 (a int not null default(nextval(s1))); +insert into t2 values(); +create user u; +grant create, insert, select, drop on mysqltest_1.t1 to u; +grant insert, select on mysqltest_1.s1 to u; +grant select on mysqltest_1.t2 to u; +connect con1,localhost,u,,mysqltest_1; +select nextval(s2); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +show create sequence s2; +ERROR 42000: SHOW command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +create table t1 (a int not null default(nextval(s1))); +drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from t2; +insert into 
t1 values(); +select * from t1; +a +1 +2 +drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from (select t2.a from t2,t2 as t3 where t2.a=t3.a) as t4; +drop table t1; +create table t1 (a int not null default(nextval(s2))); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +create table t1 (a int not null default(nextval(s1)), +b int not null default(nextval(s2))); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +disconnect con1; +connection default; +drop user u; +drop database mysqltest_1; +# +# End of 10.11 tests +# diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.test mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.test 2025-05-19 16:14:25.000000000 +0000 @@ -60,10 +60,58 @@ # connection default; -drop database mysqltest_1; drop user 'normal'@'%'; drop user 'read_only'@'%'; drop user 'read_write'@'%'; drop user 'alter'@'%'; drop user 'only_alter'@'%'; +drop sequence s1; + +--echo # +--echo # MDEV-36413 User without any privileges to a sequence can read from +--echo # it and modify it via column default +--echo # + +create sequence s1; +create sequence s2; +select * from s2; +create table t2 (a int not null default(nextval(s1))); +insert into t2 values(); + +create user u; +grant create, insert, select, drop on mysqltest_1.t1 to u; +grant insert, select on mysqltest_1.s1 to u; +grant select on mysqltest_1.t2 to u; + +--connect(con1,localhost,u,,mysqltest_1) +--error ER_TABLEACCESS_DENIED_ERROR +select nextval(s2); +--error ER_TABLEACCESS_DENIED_ERROR +show create sequence s2; + +create table t1 (a int not null default(nextval(s1))); +drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from t2; +insert into t1 values(); +select * from t1; 
+drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from (select t2.a from t2,t2 as t3 where t2.a=t3.a) as t4; +drop table t1; +--error ER_TABLEACCESS_DENIED_ERROR +create table t1 (a int not null default(nextval(s2))); +--error ER_TABLEACCESS_DENIED_ERROR +create table t1 (a int not null default(nextval(s1)), + b int not null default(nextval(s2))); +--disconnect con1 +--connection default +drop user u; + +# +# Cleanup +# + +drop database mysqltest_1; +--echo # +--echo # End of 10.11 tests +--echo # diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/gtid.result mariadb-10.11.13/mysql-test/suite/sql_sequence/gtid.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/gtid.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/gtid.result 2025-05-19 16:14:25.000000000 +0000 @@ -174,7 +174,7 @@ drop sequence s_db.s2; connection m_normal_2; select next value for s_db.s1; -ERROR 42000: INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` +ERROR 42000: SELECT, INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` create sequence s_db.s2; ERROR 42000: CREATE command denied to user 'normal_2'@'localhost' for table `s_db`.`s2` connection m_normal_1; diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/other.result mariadb-10.11.13/mysql-test/suite/sql_sequence/other.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/other.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/other.result 2025-05-19 16:14:25.000000000 +0000 @@ -48,7 +48,6 @@ insert into s1 (next_not_cached_value, minimum_value) values (100,1000); ERROR HY000: Field 'maximum_value' doesn't have a default value insert into s1 values (next value for s1, 1,9223372036854775806,1,1,1000,0,0); -ERROR HY000: Table 's1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into s1 
values(1000,9223372036854775806,1,1,1,1000,0,0); ERROR HY000: Sequence 'test.s1' has out of range value for options insert into s1 values(0,9223372036854775806,1,1,1,1000,0,0); diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/other.test mariadb-10.11.13/mysql-test/suite/sql_sequence/other.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/other.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/other.test 2025-05-19 16:14:25.000000000 +0000 @@ -38,7 +38,6 @@ create sequence s2; --error ER_NO_DEFAULT_FOR_FIELD insert into s1 (next_not_cached_value, minimum_value) values (100,1000); ---error ER_UPDATE_TABLE_USED insert into s1 values (next value for s1, 1,9223372036854775806,1,1,1000,0,0); --error ER_SEQUENCE_INVALID_DATA insert into s1 values(1000,9223372036854775806,1,1,1,1000,0,0); diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/replication.result mariadb-10.11.13/mysql-test/suite/sql_sequence/replication.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/replication.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/replication.result 2025-05-19 16:14:25.000000000 +0000 @@ -285,7 +285,7 @@ drop sequence s_db.s2; connection m_normal_2; select NEXT VALUE for s_db.s1; -ERROR 42000: INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` +ERROR 42000: SELECT, INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` create sequence s_db.s2; ERROR 42000: CREATE command denied to user 'normal_2'@'localhost' for table `s_db`.`s2` connection m_normal_1; diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/view.test mariadb-10.11.13/mysql-test/suite/sql_sequence/view.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/view.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/view.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,4 @@ --source include/have_sequence.inc ---source 
include/have_innodb.inc # # Test sequences with views diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,16 +1,17 @@ SET @start_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size; -'#---------------------BS_STVARS_022_01----------------------#' -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); -COUNT(@@GLOBAL.innodb_buffer_pool_size) -1 -1 Expected '#---------------------BS_STVARS_022_02----------------------#' -SET @@GLOBAL.innodb_buffer_pool_size=10485760; -Expected succeeded -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); -COUNT(@@GLOBAL.innodb_buffer_pool_size) +SELECT @@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size_max +8388608 +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max +1 +SET GLOBAL innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max + 1048576; +Warnings: +Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '9437184' +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max 1 -1 Expected '#---------------------BS_STVARS_022_03----------------------#' SELECT @@GLOBAL.innodb_buffer_pool_size = VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES @@ -18,10 +19,6 @@ @@GLOBAL.innodb_buffer_pool_size = VARIABLE_VALUE 1 1 Expected -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); -COUNT(@@GLOBAL.innodb_buffer_pool_size) -1 -1 Expected SELECT COUNT(VARIABLE_VALUE) FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE 
VARIABLE_NAME='innodb_buffer_pool_size'; @@ -50,4 +47,7 @@ 1 Expected SELECT innodb_buffer_pool_size = @@SESSION.innodb_buffer_pool_size; ERROR 42S22: Unknown column 'innodb_buffer_pool_size' in 'SELECT' -# restart +SET GLOBAL innodb_buffer_pool_size = @start_buffer_pool_size; +SELECT @@innodb_buffer_pool_size = @start_buffer_pool_size; +@@innodb_buffer_pool_size = @start_buffer_pool_size +1 diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff 2025-05-19 16:14:25.000000000 +0000 @@ -9,7 +9,7 @@ VARIABLE_COMMENT Number of InnoDB Adaptive Hash Index Partitions (default 8) NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 512 -@@ -71,7 +71,7 @@ +@@ -83,7 +83,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -18,20 +18,20 @@ VARIABLE_COMMENT The AUTOINC lock modes supported by InnoDB: 0 => Old style AUTOINC locking (for backward compatibility); 1 => New style AUTOINC locking; 2 => No AUTOINC locking (unsafe for SBR) NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -83,10 +83,10 @@ +@@ -95,10 +95,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED - VARIABLE_COMMENT Size of a single memory chunk for resizing buffer pool. Online buffer pool resizing happens at this granularity. 0 means autosize this variable based on buffer pool size. 
+ VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 0 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -119,7 +119,7 @@ +@@ -131,7 +131,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 25 VARIABLE_SCOPE GLOBAL @@ -40,7 +40,50 @@ VARIABLE_COMMENT Dump only the hottest N% of each buffer pool, defaults to 25 NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 100 -@@ -203,7 +203,7 @@ +@@ -203,10 +203,10 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 134217728 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT The size of the memory buffer InnoDB uses to cache data and indexes of its tables. + NUMERIC_MIN_VALUE 2097152 +-NUMERIC_MAX_VALUE 18446744073701163008 ++NUMERIC_MAX_VALUE 4292870144 + NUMERIC_BLOCK_SIZE 1048576 + ENUM_VALUE_LIST NULL + READ_ONLY NO +@@ -215,11 +215,11 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 0 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT Minimum innodb_buffer_pool_size for dynamic shrinking on memory pressure + NUMERIC_MIN_VALUE 0 +-NUMERIC_MAX_VALUE 18446744073701163008 +-NUMERIC_BLOCK_SIZE 8388608 ++NUMERIC_MAX_VALUE 4292870144 ++NUMERIC_BLOCK_SIZE 2097152 + ENUM_VALUE_LIST NULL + READ_ONLY NO + COMMAND_LINE_ARGUMENT REQUIRED +@@ -227,11 +227,11 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 0 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT Maximum innodb_buffer_pool_size + NUMERIC_MIN_VALUE 0 +-NUMERIC_MAX_VALUE 18446744073701163008 +-NUMERIC_BLOCK_SIZE 8388608 ++NUMERIC_MAX_VALUE 4292870144 ++NUMERIC_BLOCK_SIZE 2097152 + ENUM_VALUE_LIST NULL + READ_ONLY YES + COMMAND_LINE_ARGUMENT REQUIRED +@@ -239,7 +239,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -49,7 +92,7 @@ VARIABLE_COMMENT A number between [0, 100] that tells how oftern buffer pool dump status in percentages should be printed. E.g. 
10 means that buffer pool dump status is printed when every 10% of number of buffer pool pages are dumped. Default is 0 (only start and end status is printed). NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -323,7 +323,7 @@ +@@ -359,7 +359,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 5 VARIABLE_SCOPE GLOBAL @@ -58,7 +101,7 @@ VARIABLE_COMMENT If the compression failure rate of a table is greater than this number more padding is added to the pages to reduce the failures. A value of zero implies no padding NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -347,7 +347,7 @@ +@@ -383,7 +383,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 50 VARIABLE_SCOPE GLOBAL @@ -67,7 +110,7 @@ VARIABLE_COMMENT Percentage of empty space on a data page that can be reserved to make the page compressible. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 75 -@@ -623,7 +623,7 @@ +@@ -671,7 +671,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 600 VARIABLE_SCOPE GLOBAL @@ -76,7 +119,7 @@ VARIABLE_COMMENT Maximum number of seconds that semaphore times out in InnoDB. NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 4294967295 -@@ -671,7 +671,7 @@ +@@ -719,7 +719,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL @@ -85,7 +128,7 @@ VARIABLE_COMMENT Number of iterations over which the background flushing is averaged. NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 1000 -@@ -695,7 +695,7 @@ +@@ -743,7 +743,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -94,7 +137,7 @@ VARIABLE_COMMENT Controls the durability/speed trade-off for commits. Set to 0 (write and flush redo log to disk only once per second), 1 (flush to disk at each commit), 2 (write to log at commit but flush to disk only once per second) or 3 (flush to disk at prepare and at commit, slower and usually redundant). 1 and 3 guarantees that after a crash, committed transactions will not be lost and will be consistent with the binlog and other transactional engines. 
2 can get inconsistent and lose transactions if there is a power failure or kernel crash but not if mysqld crashes. 0 has no guarantees in case of crash. 0 and 2 can be faster than 1 or 3. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 3 -@@ -719,7 +719,7 @@ +@@ -767,7 +767,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -103,7 +146,7 @@ VARIABLE_COMMENT Set to 0 (don't flush neighbors from buffer pool), 1 (flush contiguous neighbors from buffer pool) or 2 (flush neighbors from buffer pool), when flushing a block NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -755,7 +755,7 @@ +@@ -803,7 +803,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -112,7 +155,7 @@ VARIABLE_COMMENT Helps to save your data in case the disk image of the database becomes corrupt. Value 5 can return bogus data, and 6 can permanently corrupt data. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 6 -@@ -779,10 +779,10 @@ +@@ -827,10 +827,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 8000000 VARIABLE_SCOPE GLOBAL @@ -125,7 +168,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -815,7 +815,7 @@ +@@ -863,7 +863,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 84 VARIABLE_SCOPE GLOBAL @@ -134,7 +177,7 @@ VARIABLE_COMMENT InnoDB Fulltext search maximum token size in characters NUMERIC_MIN_VALUE 10 NUMERIC_MAX_VALUE 84 -@@ -827,7 +827,7 @@ +@@ -875,7 +875,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 3 VARIABLE_SCOPE GLOBAL @@ -143,7 +186,7 @@ VARIABLE_COMMENT InnoDB Fulltext search minimum token size in characters NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 16 -@@ -839,7 +839,7 @@ +@@ -887,7 +887,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000 VARIABLE_SCOPE GLOBAL @@ -152,7 +195,7 @@ VARIABLE_COMMENT InnoDB Fulltext search number of words to optimize for each optimize table call NUMERIC_MIN_VALUE 1000 NUMERIC_MAX_VALUE 10000 -@@ -851,10 +851,10 @@ +@@ -899,10 +899,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000000000 VARIABLE_SCOPE GLOBAL @@ -165,7 +208,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ 
-875,7 +875,7 @@ +@@ -923,7 +923,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2 VARIABLE_SCOPE GLOBAL @@ -174,7 +217,7 @@ VARIABLE_COMMENT InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 16 -@@ -887,10 +887,10 @@ +@@ -935,10 +935,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 640000000 VARIABLE_SCOPE GLOBAL @@ -187,7 +230,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -935,22 +935,22 @@ +@@ -983,7 +983,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 200 VARIABLE_SCOPE GLOBAL @@ -195,27 +238,17 @@ +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Number of IOPs the server can do. Tunes the background IO rate NUMERIC_MIN_VALUE 100 --NUMERIC_MAX_VALUE 18446744073709551615 -+NUMERIC_MAX_VALUE 4294967295 - NUMERIC_BLOCK_SIZE 0 - ENUM_VALUE_LIST NULL - READ_ONLY NO - COMMAND_LINE_ARGUMENT REQUIRED - VARIABLE_NAME INNODB_IO_CAPACITY_MAX + NUMERIC_MAX_VALUE 4294967295 +@@ -995,7 +995,7 @@ SESSION_VALUE NULL --DEFAULT_VALUE 18446744073709551615 -+DEFAULT_VALUE 4294967295 + DEFAULT_VALUE 4294967295 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Limit to which innodb_io_capacity can be inflated. 
NUMERIC_MIN_VALUE 100 --NUMERIC_MAX_VALUE 18446744073709551615 -+NUMERIC_MAX_VALUE 4294967295 - NUMERIC_BLOCK_SIZE 0 - ENUM_VALUE_LIST NULL - READ_ONLY NO -@@ -1043,10 +1043,10 @@ + NUMERIC_MAX_VALUE 4294967295 +@@ -1115,10 +1115,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 32 VARIABLE_SCOPE GLOBAL @@ -228,7 +261,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1055,10 +1055,10 @@ +@@ -1127,10 +1127,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 1536 VARIABLE_SCOPE GLOBAL @@ -241,7 +274,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1091,10 +1091,10 @@ +@@ -1163,10 +1163,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -254,7 +287,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1103,7 +1103,7 @@ +@@ -1175,7 +1175,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -263,7 +296,7 @@ VARIABLE_COMMENT Maximum delay of user threads in micro-seconds NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 10000000 -@@ -1235,10 +1235,10 @@ +@@ -1307,10 +1307,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -276,7 +309,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -1259,7 +1259,7 @@ +@@ -1331,7 +1331,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 16384 VARIABLE_SCOPE GLOBAL @@ -285,16 +318,16 @@ VARIABLE_COMMENT Page size to use for all InnoDB tablespaces. NUMERIC_MIN_VALUE 4096 NUMERIC_MAX_VALUE 65536 -@@ -1295,7 +1295,7 @@ +@@ -1367,7 +1367,7 @@ SESSION_VALUE NULL - DEFAULT_VALUE 1000 + DEFAULT_VALUE 127 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Number of UNDO log pages to purge in one batch from the history list. 
NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 5000 -@@ -1307,7 +1307,7 @@ +@@ -1379,7 +1379,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 128 VARIABLE_SCOPE GLOBAL @@ -303,7 +336,7 @@ VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 128 -@@ -1343,7 +1343,7 @@ +@@ -1415,7 +1415,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 56 VARIABLE_SCOPE GLOBAL @@ -312,7 +345,7 @@ VARIABLE_COMMENT Number of pages that must be accessed sequentially for InnoDB to trigger a readahead. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 64 -@@ -1427,7 +1427,7 @@ +@@ -1499,7 +1499,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1048576 VARIABLE_SCOPE GLOBAL @@ -321,7 +354,7 @@ VARIABLE_COMMENT Memory buffer size for index creation NUMERIC_MIN_VALUE 65536 NUMERIC_MAX_VALUE 67108864 -@@ -1595,10 +1595,10 @@ +@@ -1667,10 +1667,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb.result 2025-05-19 16:14:25.000000000 +0000 @@ -96,7 +96,7 @@ DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT Size of a single memory chunk for resizing buffer pool. Online buffer pool resizing happens at this granularity. 0 means autosize this variable based on buffer pool size. +VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 18446744073709551615 NUMERIC_BLOCK_SIZE 1048576 @@ -206,11 +206,35 @@ VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT The size of the memory buffer InnoDB uses to cache data and indexes of its tables. 
NUMERIC_MIN_VALUE 2097152 -NUMERIC_MAX_VALUE 9223372036854775807 +NUMERIC_MAX_VALUE 18446744073701163008 NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_BUFFER_POOL_SIZE_AUTO_MIN +SESSION_VALUE NULL +DEFAULT_VALUE 0 +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT Minimum innodb_buffer_pool_size for dynamic shrinking on memory pressure +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 18446744073701163008 +NUMERIC_BLOCK_SIZE 8388608 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_BUFFER_POOL_SIZE_MAX +SESSION_VALUE NULL +DEFAULT_VALUE 0 +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT Maximum innodb_buffer_pool_size +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 18446744073701163008 +NUMERIC_BLOCK_SIZE 8388608 +ENUM_VALUE_LIST NULL +READ_ONLY YES +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_BUF_DUMP_STATUS_FREQUENCY SESSION_VALUE NULL DEFAULT_VALUE 0 @@ -962,19 +986,19 @@ VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT Number of IOPs the server can do. Tunes the background IO rate NUMERIC_MIN_VALUE 100 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_IO_CAPACITY_MAX SESSION_VALUE NULL -DEFAULT_VALUE 18446744073709551615 +DEFAULT_VALUE 4294967295 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT Limit to which innodb_io_capacity can be inflated. 
NUMERIC_MIN_VALUE 100 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO @@ -1020,7 +1044,7 @@ DEFAULT_VALUE OFF VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BOOLEAN -VARIABLE_COMMENT Force checkpoint now +VARIABLE_COMMENT Write back dirty pages from the buffer pool and update the log checkpoint NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL @@ -1068,7 +1092,7 @@ DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE INT UNSIGNED -VARIABLE_COMMENT Delay between log buffer spin lock polls (0 to use a blocking latch) +VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 6000 NUMERIC_BLOCK_SIZE 0 @@ -1571,10 +1595,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 20 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT The number of leaf index pages to sample when calculating persistent statistics (by ANALYZE, default 20) NUMERIC_MIN_VALUE 1 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO @@ -1595,10 +1619,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 8 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT The number of leaf index pages to sample when calculating transient statistics (if persistent statistics are not used, default 8) NUMERIC_MIN_VALUE 1 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result 2025-05-19 16:14:25.000000000 +0000 @@ -2325,11 +2325,11 @@ VARIABLE_NAME 
OPTIMIZER_ADJUST_SECONDARY_KEY_COSTS VARIABLE_SCOPE SESSION VARIABLE_TYPE SET -VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. selectivity_multiplier. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. +VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. fix_derived_table_read_cost = Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. 
NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier +ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier,fix_derived_table_read_cost READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME OPTIMIZER_EXTRA_PRUNING_DEPTH diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result 2025-05-19 16:14:25.000000000 +0000 @@ -2495,11 +2495,11 @@ VARIABLE_NAME OPTIMIZER_ADJUST_SECONDARY_KEY_COSTS VARIABLE_SCOPE SESSION VARIABLE_TYPE SET -VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. selectivity_multiplier. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. +VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. 
disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. fix_derived_table_read_cost = Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier +ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier,fix_derived_table_read_cost READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME OPTIMIZER_EXTRA_PRUNING_DEPTH diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -# -# wsrep_forced_binlog_format -# -# save the initial value -SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; -# default -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -NONE - -# scope -SELECT @@session.wsrep_forced_binlog_format; -ERROR HY000: Variable 'wsrep_forced_binlog_format' is a GLOBAL variable -SET 
@@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -STATEMENT - -# valid values -SET @@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -STATEMENT -SET @@global.wsrep_forced_binlog_format=ROW; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -ROW -SET @@global.wsrep_forced_binlog_format=MIXED; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -MIXED -SET @@global.wsrep_forced_binlog_format=NONE; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -NONE -SET @@global.wsrep_forced_binlog_format=default; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -NONE - -# invalid values -SET @@global.wsrep_forced_binlog_format=NULL; -ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'NULL' -SET @@global.wsrep_forced_binlog_format='junk'; -ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'junk' -SET @@global.wsrep_forced_binlog_format=ON; -ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'ON' - -# restore the initial value -SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; -# End of test diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,15 @@ +# +# wsrep_replicate_myisam +# +# save the initial value +SET @wsrep_mode_saved = @@global.wsrep_mode; + +# scope and valid values +SET 
@@global.wsrep_mode=REPLICATE_MYISAM; +SELECT @@global.wsrep_mode; +@@global.wsrep_mode +REPLICATE_MYISAM + +# restore the initial value +SET @@global.wsrep_mode = @wsrep_mode_saved; +# End of test diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-buffer-pool-chunk-size=2M diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--innodb-buffer-pool-size-max=8m diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test 2025-05-19 16:14:25.000000000 +0000 @@ -24,35 +24,19 @@ --source include/have_innodb.inc -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; - SET @start_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size; ---echo '#---------------------BS_STVARS_022_01----------------------#' 
-#################################################################### -# Displaying default value # -#################################################################### -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); ---echo 1 Expected - - --echo '#---------------------BS_STVARS_022_02----------------------#' #################################################################### # Check if Value can set # #################################################################### -SET @@GLOBAL.innodb_buffer_pool_size=10485760; ---echo Expected succeeded ---source include/wait_condition.inc - -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); ---echo 1 Expected - - - +--enable_warnings +SELECT @@GLOBAL.innodb_buffer_pool_size_max; +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +SET GLOBAL innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max + 1048576; +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +--disable_warnings --echo '#---------------------BS_STVARS_022_03----------------------#' ################################################################# @@ -66,9 +50,6 @@ --enable_warnings --echo 1 Expected -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); ---echo 1 Expected - --disable_warnings SELECT COUNT(VARIABLE_VALUE) FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES @@ -76,8 +57,6 @@ --enable_warnings --echo 1 Expected - - --echo '#---------------------BS_STVARS_022_04----------------------#' ################################################################################ # Check if accessing variable with and without GLOBAL point to same variable # @@ -111,4 +90,6 @@ # Restore the original buffer pool size. 
---source include/restart_mysqld.inc +SET GLOBAL innodb_buffer_pool_size = @start_buffer_pool_size; + +SELECT @@innodb_buffer_pool_size = @start_buffer_pool_size; diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.opt mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.opt --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.opt 2025-05-19 16:14:25.000000000 +0000 @@ -1,2 +1,4 @@ ---loose-innodb-flush-log-at-timeout=3 +--innodb +--innodb-purge-rseg-truncate-frequency=64 +--innodb-flush-log-at-timeout=3 --table_open_cache=200 diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.test 2025-05-19 16:14:25.000000000 +0000 @@ -3,6 +3,10 @@ --source include/not_valgrind.inc --source include/word_size.inc +--disable_query_log +call mtr.add_suppression("'innodb-purge-rseg-truncate-frequency' was removed"); +--enable_query_log + --vertical_results --replace_regex /^\/\S+/PATH/ /\.\//PATH/ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYPE, VARIABLE_COMMENT, NUMERIC_MIN_VALUE, NUMERIC_MAX_VALUE, NUMERIC_BLOCK_SIZE, ENUM_VALUE_LIST, READ_ONLY, COMMAND_LINE_ARGUMENT from information_schema.system_variables diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,46 +0,0 @@ ---source 
include/have_wsrep.inc - ---echo # ---echo # wsrep_forced_binlog_format ---echo # - ---echo # save the initial value -SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; - ---echo # default -SELECT @@global.wsrep_forced_binlog_format; - ---echo ---echo # scope ---error ER_INCORRECT_GLOBAL_LOCAL_VAR -SELECT @@session.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; - ---echo ---echo # valid values -SET @@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=ROW; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=MIXED; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=NONE; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=default; -SELECT @@global.wsrep_forced_binlog_format; - ---echo ---echo # invalid values ---error ER_WRONG_VALUE_FOR_VAR -SET @@global.wsrep_forced_binlog_format=NULL; ---error ER_WRONG_VALUE_FOR_VAR -SET @@global.wsrep_forced_binlog_format='junk'; ---error ER_WRONG_VALUE_FOR_VAR -SET @@global.wsrep_forced_binlog_format=ON; - ---echo ---echo # restore the initial value -SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; - ---echo # End of test diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,19 @@ +--source include/have_wsrep.inc + +--echo # +--echo # wsrep_replicate_myisam +--echo # + +--echo # save the initial value +SET @wsrep_mode_saved = @@global.wsrep_mode; + 
+--echo +--echo # scope and valid values +SET @@global.wsrep_mode=REPLICATE_MYISAM; +SELECT @@global.wsrep_mode; + +--echo +--echo # restore the initial value +SET @@global.wsrep_mode = @wsrep_mode_saved; + +--echo # End of test diff -Nru mariadb-10.11.11/mysql-test/suite/versioning/r/partition.result mariadb-10.11.13/mysql-test/suite/versioning/r/partition.result --- mariadb-10.11.11/mysql-test/suite/versioning/r/partition.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/versioning/r/partition.result 2025-05-19 16:14:25.000000000 +0000 @@ -3445,6 +3445,20 @@ create table t (a int) with system versioning partition by system_time partitions 3; ERROR HY000: Maybe missing parameters: no rotation condition for multiple HISTORY partitions. # +# MDEV-36115 InnoDB: assertion: node->pcur->rel_pos == BTR_PCUR_ON +# in row_update_for_mysql +# +create table t (a int key) engine=innodb +with system versioning +partition by key() partitions 3; +start transaction; +insert into t values (1),(2),(3),(4),(5),(6),(7),(8); +set timestamp=+1; +delete from t; +insert into t values (1),(2); +DELETE from t; +drop table t; +# # End of 10.5 tests # # @@ -3470,4 +3484,25 @@ # # End of 10.9 tests # +# +# MDEV-34775 Wrong reopen of already open routine due to auto-create in SP +# +create table t (a int) with system versioning +partition by system_time +interval 1 minute auto; +create function f() +returns int +begin +replace into t select * from t; +return 0; +end $ +set timestamp= @@timestamp + 61; +select f(); +f() +0 +drop table t; +drop function f; +# +# End of 10.11 tests +# set global innodb_stats_persistent= @save_persistent; diff -Nru mariadb-10.11.11/mysql-test/suite/versioning/t/partition.test mariadb-10.11.13/mysql-test/suite/versioning/t/partition.test --- mariadb-10.11.11/mysql-test/suite/versioning/t/partition.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/versioning/t/partition.test 2025-05-19 16:14:25.000000000 
+0000 @@ -2676,6 +2676,22 @@ create table t (a int) with system versioning partition by system_time partitions 3; --echo # +--echo # MDEV-36115 InnoDB: assertion: node->pcur->rel_pos == BTR_PCUR_ON +--echo # in row_update_for_mysql +--echo # +create table t (a int key) engine=innodb +with system versioning +partition by key() partitions 3; + +start transaction; +insert into t values (1),(2),(3),(4),(5),(6),(7),(8); +set timestamp=+1; +delete from t; +insert into t values (1),(2); +DELETE from t; +drop table t; + +--echo # --echo # End of 10.5 tests --echo # @@ -2717,5 +2733,32 @@ --echo # End of 10.9 tests --echo # +--echo # +--echo # MDEV-34775 Wrong reopen of already open routine due to auto-create in SP +--echo # + +create table t (a int) with system versioning +partition by system_time +interval 1 minute auto; + +--delimiter $ +create function f() +returns int +begin + replace into t select * from t; + return 0; +end $ +--delimiter ; + +set timestamp= @@timestamp + 61; +select f(); + +drop table t; +drop function f; + +--echo # +--echo # End of 10.11 tests +--echo # + set global innodb_stats_persistent= @save_persistent; --source suite/versioning/common_finish.inc diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/README mariadb-10.11.13/mysql-test/suite/wsrep/README --- mariadb-10.11.11/mysql-test/suite/wsrep/README 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/README 2025-05-19 16:14:25.000000000 +0000 @@ -4,4 +4,3 @@ * As these tests are specific to wsrep-related functionalities, they must skip on server built without wsrep patch (vanilla). 
(-DWITH_WSREP=OFF) See : include/have_wsrep.inc, include/have_wsrep_enabled.inc, not_wsrep.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/include/check_galera_version.inc mariadb-10.11.13/mysql-test/suite/wsrep/include/check_galera_version.inc --- mariadb-10.11.11/mysql-test/suite/wsrep/include/check_galera_version.inc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/include/check_galera_version.inc 2025-05-19 16:14:25.000000000 +0000 @@ -44,4 +44,3 @@ } --echo # Correct Galera library found - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/plugin.result mariadb-10.11.13/mysql-test/suite/wsrep/r/plugin.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/plugin.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/plugin.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,3 @@ -SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins where plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; +SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins WHERE plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; plugin_name plugin_version plugin_maturity wsrep 1.0 Stable diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,18 @@ +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; +# Case 1: Server goes through graceful shutdown and is restarted +connection default; +INSERT INTO t1 VALUES (1); +Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... 
+Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +SELECT * FROM t1; +f1 +1 +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,65 @@ +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; +# Case 1: Server goes through graceful shutdown and is restarted +connection default; +INSERT INTO t1 VALUES (1); +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... +Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +SELECT @@GLOBAL.gtid_binlog_pos; +@@GLOBAL.gtid_binlog_pos +100-10-2 +SELECT * FROM t1; +f1 +1 +# Case 2: Server is killed after the transaction gets prepared +# but before it is written into binlog. As there is not GTID assigned, +# the transaction must be rolled back during recovery. +connect con, localhost, root; +SET DEBUG_SYNC = "ha_commit_trans_after_prepare SIGNAL reached WAIT_FOR continue"; +INSERT INTO t1 VALUES (2); +connection default; +SET DEBUG_SYNC = "now WAIT_FOR reached"; +# Kill the server +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... +Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +SELECT @@GLOBAL.gtid_binlog_pos; +@@GLOBAL.gtid_binlog_pos +100-10-2 +Expect 1 +SELECT * FROM t1; +f1 +1 +disconnect con; +# Case 3: Server is killed after the transaction gets written into binlog +# but before it is committed in storage engine. In this case the +# transaction must be committed during recovery as it had a valid +# GTID assigned. 
+connect con, localhost, root; +SET DEBUG_SYNC = "commit_before_get_LOCK_commit_ordered SIGNAL reached WAIT_FOR continue"; +INSERT INTO t1 VALUES (3); +connection default; +SET DEBUG_SYNC = "now WAIT_FOR reached"; +# Kill the server +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... +Expect 100-10-3 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-3 +SELECT @@GLOBAL.gtid_binlog_pos; +@@GLOBAL.gtid_binlog_pos +100-10-3 +Expect 1 3 +SELECT * FROM t1; +f1 +1 +3 +disconnect con; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,5 @@ --- r/wsrep-recover-v25.result -+++ r/wsrep-recover-v25.reject ++++ r/wsrep-recover-v25,binlogoin.reject @@ -12,4 +12,16 @@ SELECT VARIABLE_VALUE `expect 6` FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'; expect 6 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,51 @@ +# +# wsrep_forced_binlog_format +# +# save the initial value +SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; +# default +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +NONE + +# scope +SELECT @@session.wsrep_forced_binlog_format; +ERROR HY000: Variable 'wsrep_forced_binlog_format' is a GLOBAL variable +SET 
@@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +STATEMENT + +# valid values +SET @@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +STATEMENT +SET @@global.wsrep_forced_binlog_format=ROW; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +ROW +SET @@global.wsrep_forced_binlog_format=MIXED; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +MIXED +SET @@global.wsrep_forced_binlog_format=NONE; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +NONE +SET @@global.wsrep_forced_binlog_format=default; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +NONE + +# invalid values +SET @@global.wsrep_forced_binlog_format=NULL; +ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'NULL' +SET @@global.wsrep_forced_binlog_format='junk'; +ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'junk' +SET @@global.wsrep_forced_binlog_format=ON; +ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'ON' + +# restore the initial value +SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; +# End of test diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,8 @@ +# +# MDEV-27126: my_getopt compares option names case sensitively +# +# Check if the variable is set correctly from options +SELECT @@GLOBAL.wsrep_slave_uk_checks; +@@GLOBAL.wsrep_slave_uk_checks 
+1 +# End of test. diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/suite.pm mariadb-10.11.13/mysql-test/suite/wsrep/suite.pm --- mariadb-10.11.11/mysql-test/suite/wsrep/suite.pm 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/suite.pm 2025-05-19 16:14:25.000000000 +0000 @@ -9,9 +9,9 @@ push @::global_suppressions, ( - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, ); bless { }; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/binlog_format.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/binlog_format.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/binlog_format.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/binlog_format.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -5,4 +5,3 @@ wsrep-provider=@ENV.WSREP_PROVIDER wsrep-cluster-address=gcomm:// innodb_autoinc_lock_mode=2 - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/foreign_key.test mariadb-10.11.13/mysql-test/suite/wsrep/t/foreign_key.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/foreign_key.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/foreign_key.test 2025-05-19 16:14:25.000000000 +0000 @@ -17,4 +17,3 @@ # Cleanup DROP TABLE c; DROP TABLE p; - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_10186.test mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_10186.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_10186.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_10186.test 2025-05-19 16:14:25.000000000 +0000 @@ -9,4 +9,3 @@ SELECT @@wsrep_on; SET @@GLOBAL.wsrep_cluster_address='gcomm://'; - diff 
-Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_7798.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_7798.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_7798.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_7798.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -4,4 +4,3 @@ wsrep-on=ON wsrep-provider=@ENV.WSREP_PROVIDER wsrep-cluster-address=gcomm:// - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/plugin.test mariadb-10.11.13/mysql-test/suite/wsrep/t/plugin.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/plugin.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/plugin.test 2025-05-19 16:14:25.000000000 +0000 @@ -5,4 +5,4 @@ # MDEV-7604: wsrep plugin lists its status as Unknown # -SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins where plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; \ No newline at end of file +SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins WHERE plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/pool_of_threads.test mariadb-10.11.13/mysql-test/suite/wsrep/t/pool_of_threads.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/pool_of_threads.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/pool_of_threads.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/have_innodb.inc --source include/have_wsrep_enabled.inc --source include/have_binlog_format_row.inc diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/variables.test mariadb-10.11.13/mysql-test/suite/wsrep/t/variables.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/variables.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/variables.test 2025-05-19 16:14:25.000000000 +0000 @@ -23,4 +23,3 @@ --echo # variables SELECT VARIABLE_NAME FROM 
INFORMATION_SCHEMA.SESSION_VARIABLES WHERE VARIABLE_NAME LIKE "wsrep%" ORDER BY VARIABLE_NAME; - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/variables_debug.test mariadb-10.11.13/mysql-test/suite/wsrep/t/variables_debug.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/variables_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/variables_debug.test 2025-05-19 16:14:25.000000000 +0000 @@ -8,7 +8,7 @@ --let $galera_version=26.4.21 source include/check_galera_version.inc; -source include/galera_variables_ok.inc; +source include/galera_variables_ok_debug.inc; --replace_column 2 # SHOW GLOBAL STATUS LIKE 'wsrep%'; @@ -25,4 +25,3 @@ --echo # variables SELECT VARIABLE_NAME FROM INFORMATION_SCHEMA.SESSION_VARIABLES WHERE VARIABLE_NAME LIKE "wsrep%" ORDER BY VARIABLE_NAME; - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,10 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-provider=@ENV.WSREP_PROVIDER +wsrep-cluster-address=gcomm:// +binlog-format=ROW +wsrep-gtid-domain-id=100 +server-id=10 +innodb-autoinc-lock-mode=2 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,28 @@ +# Test wsrep GTID recovery with binlog off. 
The test restarts the server +# and verifies that the GTID returned by SELECT WSREP_LAST_SEEN_GTID() +# gets initialized properly during server restart. +# +--source include/have_wsrep.inc +--source include/have_wsrep_provider.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc + +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; + +--echo # Case 1: Server goes through graceful shutdown and is restarted +--connection default +INSERT INTO t1 VALUES (1); + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); + +--source include/shutdown_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT * FROM t1; + +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,14 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-provider=@ENV.WSREP_PROVIDER +wsrep-cluster-address=gcomm:// +binlog-format=ROW +log-bin +log-slave-updates +gtid-domain-id=10 +gtid-strict-mode=ON +wsrep-gtid-mode=ON +wsrep-gtid-domain-id=100 +server-id=10 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,73 @@ +# Test wsrep recovery with gtid_mode=ON. 
The test crashes the server +# in different commit stages and verifies that the GTID returned by +# SELECT WSREP_LAST_SEEN_GTID() and @@GLOBAL.gtid_binlog_pos get +# initialized properly during server restart. +# +--source include/have_wsrep.inc +--source include/have_wsrep_provider.inc +--source include/have_innodb.inc +--source include/have_log_bin.inc +--source include/have_debug_sync.inc + +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; + +--echo # Case 1: Server goes through graceful shutdown and is restarted +--connection default +INSERT INTO t1 VALUES (1); +--source include/shutdown_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT @@GLOBAL.gtid_binlog_pos; +SELECT * FROM t1; + +--echo # Case 2: Server is killed after the transaction gets prepared +--echo # but before it is written into binlog. As there is not GTID assigned, +--echo # the transaction must be rolled back during recovery. +--connect con, localhost, root +SET DEBUG_SYNC = "ha_commit_trans_after_prepare SIGNAL reached WAIT_FOR continue"; +--send INSERT INTO t1 VALUES (2) + +--connection default +SET DEBUG_SYNC = "now WAIT_FOR reached"; +--source include/kill_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc +--source include/wait_wsrep_ready.inc + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT @@GLOBAL.gtid_binlog_pos; +--echo Expect 1 +SELECT * FROM t1; +--disconnect con + +--echo # Case 3: Server is killed after the transaction gets written into binlog +--echo # but before it is committed in storage engine. In this case the +--echo # transaction must be committed during recovery as it had a valid +--echo # GTID assigned. 
+ +--connect con, localhost, root +SET DEBUG_SYNC = "commit_before_get_LOCK_commit_ordered SIGNAL reached WAIT_FOR continue"; +--send INSERT INTO t1 VALUES (3) + +--connection default +SET DEBUG_SYNC = "now WAIT_FOR reached"; +--source include/kill_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc +--source include/wait_wsrep_ready.inc +--echo Expect 100-10-3 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT @@GLOBAL.gtid_binlog_pos; +--echo Expect 1 3 +SELECT * FROM t1; + +--disconnect con + +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -6,4 +6,4 @@ innodb-flush-log-at-trx-commit=1 wsrep-cluster-address=gcomm:// wsrep-provider=@ENV.WSREP_PROVIDER -innodb-autoinc-lock-mode=2 \ No newline at end of file +innodb-autoinc-lock-mode=2 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,7 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-cluster-address=gcomm:// +wsrep-provider=@ENV.WSREP_PROVIDER +binlog-format=ROW diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,48 @@ +--source include/have_innodb.inc +--source include/have_wsrep_provider.inc +--source include/have_binlog_format_row.inc + +--echo # +--echo # wsrep_forced_binlog_format +--echo # + +--echo # save the initial value +SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; + +--echo # default +SELECT @@global.wsrep_forced_binlog_format; + +--echo +--echo # scope +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +SELECT @@session.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; + +--echo +--echo # valid values +SET @@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=ROW; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=MIXED; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=NONE; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=default; +SELECT @@global.wsrep_forced_binlog_format; + +--echo +--echo # invalid values +--error ER_WRONG_VALUE_FOR_VAR +SET @@global.wsrep_forced_binlog_format=NULL; +--error ER_WRONG_VALUE_FOR_VAR +SET @@global.wsrep_forced_binlog_format='junk'; +--error ER_WRONG_VALUE_FOR_VAR +SET @@global.wsrep_forced_binlog_format=ON; + +--echo +--echo # restore the initial value +SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; + +--echo # End of test diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf 2025-05-19 16:14:25.000000000 
+0000 @@ -0,0 +1,6 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-provider=@ENV.WSREP_PROVIDER +wsrep-cluster-address=gcomm:// diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--wsrep-slave-uk-checks=1 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,11 @@ +--source include/have_innodb.inc +--source include/have_wsrep_provider.inc +--source include/have_binlog_format_row.inc +--echo # +--echo # MDEV-27126: my_getopt compares option names case sensitively +--echo # + +--echo # Check if the variable is set correctly from options +SELECT @@GLOBAL.wsrep_slave_uk_checks; + +--echo # End of test. diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_rpl.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_rpl.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_rpl.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_rpl.test 2025-05-19 16:14:25.000000000 +0000 @@ -41,4 +41,3 @@ --source include/rpl_end.inc --echo # End of test. 
- diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test 2025-05-19 16:14:25.000000000 +0000 @@ -44,7 +44,6 @@ SELECT @@global.wsrep_sst_method; SHOW WARNINGS; - --disable_query_log SET @@global.wsrep_sst_method = @wsrep_sst_method_saved; --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -9,4 +9,3 @@ #galera_port=@OPT.port #ist_port=@OPT.port #sst_port=@OPT.port - diff -Nru mariadb-10.11.11/mysys/CMakeLists.txt mariadb-10.11.13/mysys/CMakeLists.txt --- mariadb-10.11.11/mysys/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -46,7 +46,8 @@ my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c my_rdtsc.c psi_noop.c my_atomic_writes.c my_cpu.c my_likely.c my_largepage.c - file_logger.c my_dlerror.c crc32/crc32c.cc) + file_logger.c my_dlerror.c crc32/crc32c.cc + my_virtual_mem.c) IF (WIN32) SET (MYSYS_SOURCES ${MYSYS_SOURCES} @@ -170,7 +171,7 @@ ENDIF(HAVE_BFD_H) IF (WIN32) - TARGET_LINK_LIBRARIES(mysys iphlpapi dbghelp) + TARGET_LINK_LIBRARIES(mysys iphlpapi dbghelp ws2_32 synchronization) ENDIF(WIN32) # Need explicit pthread for gcc -fsanitize=address diff -Nru mariadb-10.11.11/mysys/mf_keycache.c mariadb-10.11.13/mysys/mf_keycache.c --- mariadb-10.11.11/mysys/mf_keycache.c 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysys/mf_keycache.c 2025-05-19 16:14:25.000000000 +0000 @@ -3762,10 +3762,11 @@ static int cmp_sec_link(const void *_a, const void *_b) { - BLOCK_LINK *const *a= _a; - BLOCK_LINK *const *b= _b; - return (((*a)->hash_link->diskpos < (*b)->hash_link->diskpos) ? -1 : - ((*a)->hash_link->diskpos > (*b)->hash_link->diskpos) ? 1 : 0); + const BLOCK_LINK *a= *(const BLOCK_LINK **)_a; + const BLOCK_LINK *b= *(const BLOCK_LINK **)_b; + + return (a->hash_link->diskpos < b->hash_link->diskpos) ? -1 : + (a->hash_link->diskpos > b->hash_link->diskpos) ? 1 : 0; } diff -Nru mariadb-10.11.11/mysys/my_default.c mariadb-10.11.13/mysys/my_default.c --- mariadb-10.11.11/mysys/my_default.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_default.c 2025-05-19 16:14:25.000000000 +0000 @@ -318,6 +318,9 @@ } if (! my_defaults_group_suffix) + my_defaults_group_suffix= getenv("MARIADB_GROUP_SUFFIX"); + + if (! my_defaults_group_suffix) my_defaults_group_suffix= getenv("MYSQL_GROUP_SUFFIX"); if (my_defaults_extra_file && my_defaults_extra_file != extra_file_buffer) diff -Nru mariadb-10.11.11/mysys/my_getopt.c mariadb-10.11.13/mysys/my_getopt.c --- mariadb-10.11.11/mysys/my_getopt.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_getopt.c 2025-05-19 16:14:25.000000000 +0000 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1002,7 +1003,7 @@ for (;s != end ; s++, t++) { - if ((*s != '-' ? *s : '_') != (*t != '-' ? *t : '_')) + if ((*s != '-' ? tolower(*s) : '_') != (*t != '-' ? tolower(*t) : '_')) DBUG_RETURN(1); } DBUG_RETURN(0); diff -Nru mariadb-10.11.11/mysys/my_largepage.c mariadb-10.11.13/mysys/my_largepage.c --- mariadb-10.11.11/mysys/my_largepage.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_largepage.c 2025-05-19 16:14:25.000000000 +0000 @@ -35,17 +35,11 @@ #endif /* __sun__ ... 
*/ #endif /* HAVE_SOLARIS_LARGE_PAGES */ -#if defined(_WIN32) -static size_t my_large_page_size; -#define HAVE_LARGE_PAGES -#elif defined(HAVE_MMAP) -#define HAVE_LARGE_PAGES -#endif -#ifdef HAVE_LARGE_PAGES -static my_bool my_use_large_pages= 0; -#else -#define my_use_large_pages 0 +my_bool my_use_large_pages; + +#ifdef _WIN32 +static size_t my_large_page_size; #endif #if defined(HAVE_GETPAGESIZES) || defined(__linux__) @@ -172,7 +166,7 @@ @retval a large page size that is valid on this system or 0 if no large page size possible. */ -#if defined(HAVE_MMAP) && !defined(_WIN32) +#ifndef _WIN32 static size_t my_next_large_page_size(size_t sz, int *start) { DBUG_ENTER("my_next_large_page_size"); @@ -188,11 +182,12 @@ } DBUG_RETURN(0); } -#endif /* defined(MMAP) || !defined(_WIN32) */ +#endif -int my_init_large_pages(my_bool super_large_pages) +int my_init_large_pages(void) { + my_use_large_pages= 1; #ifdef _WIN32 if (!my_obtain_privilege(SE_LOCK_MEMORY_NAME)) { @@ -200,19 +195,15 @@ "Lock Pages in memory access rights required for use with" " large-pages, see https://mariadb.com/kb/en/library/" "mariadb-memory-allocation/#huge-pages", MYF(MY_WME)); + my_use_large_pages= 0; } my_large_page_size= GetLargePageMinimum(); #endif - my_use_large_pages= 1; my_get_large_page_sizes(my_large_page_sizes); -#ifndef HAVE_LARGE_PAGES - my_printf_error(EE_OUTOFMEMORY, "No large page support on this platform", - MYF(MY_WME)); -#endif - #ifdef HAVE_SOLARIS_LARGE_PAGES + extern my_bool opt_super_large_pages; /* tell the kernel that we want to use 4/256MB page for heap storage and also for the stack. We use 4 MByte as default and if the @@ -222,9 +213,15 @@ measured in a number of GBytes. We use as big pages as possible which isn't bigger than the above desired page sizes. + + Note: This refers to some implementations of the SPARC ISA, + where the supported page sizes are + 8KiB, 64KiB, 512KiB, 4MiB, 32MiB, 256MiB, 2GiB, and 16GiB. 
+ On implementations of the AMD64 ISA, the available page sizes + should be 4KiB, 2MiB, and 1GiB. */ int nelem= 0; - size_t max_desired_page_size= (super_large_pages ? 256 : 4) * 1024 * 1024; + size_t max_desired_page_size= opt_super_large_pages ? 256 << 20 : 4 << 20; size_t max_page_size= my_next_large_page_size(max_desired_page_size, &nelem); if (max_page_size > 0) @@ -426,6 +423,78 @@ DBUG_RETURN(ptr); } +#ifndef _WIN32 +/** + Special large pages allocator, with possibility to commit to allocating + more memory later. + Every implementation returns a zero filled buffer here. +*/ +char *my_large_virtual_alloc(size_t *size) +{ + char *ptr; + DBUG_ENTER("my_large_virtual_alloc"); + + if (my_use_large_pages) + { + size_t large_page_size; + int page_i= 0; + + while ((large_page_size= my_next_large_page_size(*size, &page_i)) != 0) + { + int mapflag= MAP_PRIVATE | +# ifdef MAP_POPULATE + MAP_POPULATE | +# endif +# if defined MAP_HUGETLB /* linux 2.6.32 */ + MAP_HUGETLB | +# if defined MAP_HUGE_SHIFT /* Linux-3.8+ */ + my_bit_log2_size_t(large_page_size) << MAP_HUGE_SHIFT | +# else +# warning "No explicit large page (HUGETLB pages) support in Linux < 3.8" +# endif +# elif defined MAP_ALIGNED + MAP_ALIGNED(my_bit_log2_size_t(large_page_size)) | +# if defined MAP_ALIGNED_SUPER + MAP_ALIGNED_SUPER | +# endif +# endif + OS_MAP_ANON; + + size_t aligned_size= MY_ALIGN(*size, (size_t) large_page_size); + ptr= mmap(NULL, aligned_size, PROT_READ | PROT_WRITE, mapflag, -1, 0); + if (ptr == (void*) -1) + { + ptr= NULL; + /* try next smaller memory size */ + if (errno == ENOMEM) + continue; + + /* other errors are more serious */ + break; + } + else /* success */ + { + /* + we do need to record the adjustment so that munmap gets called with + the right size. This is only the case for HUGETLB pages. 
+ */ + *size= aligned_size; + DBUG_RETURN(ptr); + } + } + } + + ptr= mmap(NULL, *size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | OS_MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + ptr= NULL; + } + + DBUG_RETURN(ptr); +} +#endif /** General large pages deallocator. @@ -482,7 +551,7 @@ #endif /* memory_sanitizer */ #else my_free_lock(ptr); -#endif /* HAVE_MMMAP */ +#endif /* HAVE_MMAP */ DBUG_VOID_RETURN; } diff -Nru mariadb-10.11.11/mysys/my_pread.c mariadb-10.11.13/mysys/my_pread.c --- mariadb-10.11.11/mysys/my_pread.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_pread.c 2025-05-19 16:14:25.000000000 +0000 @@ -158,6 +158,15 @@ #else writtenbytes= pwrite(Filedes, Buffer, Count, offset); #endif + + DBUG_EXECUTE_IF ("simulate_file_pwrite_error", + if (writtenbytes == Count && + my_seek(Filedes, 0, SEEK_END, MYF(0)) > 1024*1024L) + { + errno= ENOSPC; + writtenbytes= (size_t) -1; + }); + if (writtenbytes == Count) break; my_errno= errno; diff -Nru mariadb-10.11.11/mysys/my_virtual_mem.c mariadb-10.11.13/mysys/my_virtual_mem.c --- mariadb-10.11.11/mysys/my_virtual_mem.c 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysys/my_virtual_mem.c 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,201 @@ +/* Copyright (c) 2025, MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include +#include +#include +#include +#ifdef _AIX +# include +#endif + +/* + Functionality for handling virtual memory + + - reserve range, + - commit memory (within reserved range) + - decommit previously commited memory + - release range + + Not every OS has a "reserve" functionality, i.e it is not always + possible to reserve memory larger than swap or RAM for example. + + We try to respect use_large_pages setting, on Windows and Linux +*/ +#ifdef _WIN32 +char *my_virtual_mem_reserve(size_t *size) +{ + DWORD flags= my_use_large_pages + ? MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT + : MEM_RESERVE; + char *ptr= VirtualAlloc(NULL, *size, flags, PAGE_READWRITE); + if (!ptr && (flags & MEM_LARGE_PAGES)) + { + /* Try without large pages */ + ptr= VirtualAlloc(NULL, *size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), *size); + } + return ptr; +} +#endif + +#if defined _WIN32 && !defined DBUG_OFF +static my_bool is_memory_committed(char *ptr, size_t size) +{ + MEMORY_BASIC_INFORMATION mbi; + if (VirtualQuery(ptr, &mbi, sizeof mbi) == 0) + DBUG_ASSERT(0); + return !!(mbi.State & MEM_COMMIT); +} +#endif + +char *my_virtual_mem_commit(char *ptr, size_t size) +{ + DBUG_ASSERT(ptr); +#ifdef _WIN32 + if (my_use_large_pages) + { + DBUG_ASSERT(is_memory_committed(ptr, size)); + } + else + { + void *p= VirtualAlloc(ptr, size, MEM_COMMIT, PAGE_READWRITE); + DBUG_ASSERT(p == ptr); + if (!p) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + return NULL; + } + } +#else + if (my_use_large_pages) + /* my_large_virtual_alloc() already created a read/write mapping. */; + else + { +# ifdef _AIX + /* + MAP_FIXED does not not work on IBM AIX in the way does works elsewhere. 
+ Apparently, it is not possible to mmap(2) a range that is already in use, + at least not by default. + + mprotect(2) is the fallback, it can't communicate out-of-memory + conditions, but it looks like overcommitting is not possible on + AIX anyway. + */ + if (mprotect(ptr, size, PROT_READ | PROT_WRITE)) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + return NULL; + } +# else + void *p= 0; + const int flags= +# ifdef MAP_POPULATE + MAP_POPULATE | +# endif + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED; + p= mmap(ptr, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (p == MAP_FAILED) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + return NULL; + } + DBUG_ASSERT(p == ptr); +# if defined MADV_FREE_REUSABLE && defined MADV_FREE_REUSE /* Apple macOS */ + madvise(ptr, size, MADV_FREE_REUSE); /* cancel MADV_FREE_REUSABLE */ +# endif +# endif + } +#endif + update_malloc_size(size, 0); + return ptr; +} + +void my_virtual_mem_decommit(char *ptr, size_t size) +{ +#ifdef _WIN32 + DBUG_ASSERT(is_memory_committed(ptr, size)); +# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT +# error "VirtualFree(MEM_DECOMMIT) will not allow subsequent reads!" +# endif + if (!my_use_large_pages) + { + if (!VirtualFree(ptr, size, MEM_DECOMMIT)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, + GetLastError()); + DBUG_ASSERT(0); + } + } +#else + const int prot= +# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* + In InnoDB, buf_pool_t::page_guess() may deference pointers to + this, assuming that either the original contents or zeroed + contents is available. + */ + PROT_READ +# else + /* We will explicitly mark the memory unaccessible. 
*/ + PROT_NONE +# endif + ; +# ifdef _AIX + disclaim(ptr, size, DISCLAIM_ZEROMEM); +# elif defined __linux__ || defined __osf__ + madvise(ptr, size, MADV_DONTNEED); /* OSF/1, Linux mimicing AIX disclaim() */ +# elif defined MADV_FREE_REUSABLE && defined MADV_FREE_REUSE + /* Mac OS X 10.9; undocumented in Apple macOS */ + madvise(ptr, size, MADV_FREE_REUSABLE); /* macOS mimicing AIX disclaim() */ +# elif defined MADV_PURGE /* Illumos */ + madvise(ptr, size, MADV_PURGE); /* Illumos mimicing AIX disclaim() */ +# elif defined MADV_FREE + /* FreeBSD, NetBSD, OpenBSD, Dragonfly BSD, OpenSolaris, Apple macOS */ + madvise(ptr, size, MADV_FREE); /* allow lazy zeroing out */ +# elif defined MADV_DONTNEED +# warning "It is unclear if madvise(MADV_DONTNEED) works as intended" + madvise(ptr, size, MADV_DONTNEED); +# else +# warning "Do not know how to decommit memory" +# endif + if (mprotect(ptr, size, prot)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno); + DBUG_ASSERT(0); + } +#endif + update_malloc_size(-(longlong) size, 0); +} + +void my_virtual_mem_release(char *ptr, size_t size) +{ +#ifdef _WIN32 + DBUG_ASSERT(my_use_large_pages || !is_memory_committed(ptr, size)); + if (!VirtualFree(ptr, 0, MEM_RELEASE)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, + GetLastError()); + DBUG_ASSERT(0); + } +#else + if (munmap(ptr, size)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno); + DBUG_ASSERT(0); + } +#endif +} diff -Nru mariadb-10.11.11/plugin/auth_examples/auth_0x0100.c mariadb-10.11.13/plugin/auth_examples/auth_0x0100.c --- mariadb-10.11.11/plugin/auth_examples/auth_0x0100.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/auth_examples/auth_0x0100.c 2025-05-19 16:14:25.000000000 +0000 @@ -56,6 +56,10 @@ }; #endif +/* function-type-mismatch ignore */ +#if defined(__clang__) +__attribute__((no_sanitize("undefined"))) +#endif static int do_auth_0x0100(MYSQL_PLUGIN_VIO *vio, 
MYSQL_SERVER_AUTH_INFO *info) { info->password_used= 1; diff -Nru mariadb-10.11.11/plugin/server_audit/server_audit.c mariadb-10.11.13/plugin/server_audit/server_audit.c --- mariadb-10.11.11/plugin/server_audit/server_audit.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/server_audit/server_audit.c 2025-05-19 16:14:25.000000000 +0000 @@ -2855,6 +2855,18 @@ { char *new_name= (*(char **) save) ? *(char **) save : empty_str; + if (strlen(new_name) + 4 > FN_REFLEN) + { + error_header(); + fprintf(stderr, + "server_audit_file_path can't exceed %d characters.\n", + FN_REFLEN - 4); + fprintf(stderr, "Log filename remains unchanged '%s'.\n", file_path); + CLIENT_ERROR(1, "server_audit_file_path can't exceed %d characters.", + MYF(ME_WARNING), FN_REFLEN - 4); + return; + } + ADD_ATOMIC(internal_stop_logging, 1); error_header(); fprintf(stderr, "Log file name was changed to '%s'.\n", new_name); diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.result 2025-05-19 16:14:25.000000000 +0000 @@ -2407,3 +2407,26 @@ DROP TABLE t1; SET max_sort_length=DEFAULT; # End of 10.8 tests +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE OR REPLACE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000001); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000002); +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +CAST(c1 AS INET6) +::1 +::2 +SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +CAST(c1 AS INET6) +::1 +::2 +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +CAST(c1 AS INET6) +::1 +::2 
+SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +CAST(c1 AS INET6) +::1 +::2 +DROP TABLE t1; diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.test mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.test --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.test 2025-05-19 16:14:25.000000000 +0000 @@ -1741,3 +1741,15 @@ SET max_sort_length=DEFAULT; --echo # End of 10.8 tests + +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +CREATE OR REPLACE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000001); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000002); +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +DROP TABLE t1; diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc 2025-05-19 16:14:25.000000000 +0000 @@ -36,3 +36,16 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=CAST('::ff' AS INET6); DROP TABLE t1; + +--echo # +--echo # MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +--echo # Type_handler_fbt::Field_fbt::store_native, +--echo # Assertion `item->null_value' failed in Type_handler::Item_send_str +--echo # 
+ +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +DROP TABLE t1; + +--echo # End of 10.5 tests diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result 2025-05-19 16:14:25.000000000 +0000 @@ -88,6 +88,18 @@ Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = INET6'::ff' DROP TABLE t1; # +# MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +# Type_handler_fbt::Field_fbt::store_native, +# Assertion `item->null_value' failed in Type_handler::Item_send_str +# +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +cast('::' AS INET6) min(1) +:: NULL +DROP TABLE t1; +# End of 10.5 tests +# # MDEV-26742 Assertion `field->type_handler() == this' failed in FixedBinTypeBundle::Type_handler_fbt::stored_field_cmp_to_item # CREATE TABLE t1 (pk inet6, c text) engine=myisam; diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result 2025-05-19 16:14:25.000000000 +0000 @@ -155,5 +155,17 @@ Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = INET6'::ff' DROP TABLE t1; # +# MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +# 
Type_handler_fbt::Field_fbt::store_native, +# Assertion `item->null_value' failed in Type_handler::Item_send_str +# +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +cast('::' AS INET6) min(1) +:: NULL +DROP TABLE t1; +# End of 10.5 tests +# # End of 10.5 tests # diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result 2025-05-19 16:14:25.000000000 +0000 @@ -88,6 +88,18 @@ Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = INET6'::ff' DROP TABLE t1; # +# MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +# Type_handler_fbt::Field_fbt::store_native, +# Assertion `item->null_value' failed in Type_handler::Item_send_str +# +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +cast('::' AS INET6) min(1) +:: NULL +DROP TABLE t1; +# End of 10.5 tests +# # MDEV-26742 Assertion `field->type_handler() == this' failed in FixedBinTypeBundle::Type_handler_fbt::stored_field_cmp_to_item # CREATE TABLE t1 (c varchar(64), key(c)) engine=myisam; diff -Nru mariadb-10.11.11/plugin/userstat/client_stats.cc mariadb-10.11.13/plugin/userstat/client_stats.cc --- mariadb-10.11.11/plugin/userstat/client_stats.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/userstat/client_stats.cc 2025-05-19 16:14:25.000000000 +0000 @@ -45,8 +45,8 @@ table->field[j++]->store((longlong)user_stats->total_connections,TRUE); table->field[j++]->store((longlong)user_stats->concurrent_connections, TRUE); 
table->field[j++]->store((longlong)user_stats->connected_time, TRUE); - table->field[j++]->store((double)user_stats->busy_time); - table->field[j++]->store((double)user_stats->cpu_time); + table->field[j++]->store((double)user_stats->busy_time/1e6); + table->field[j++]->store((double)user_stats->cpu_time/1e6); table->field[j++]->store((longlong)user_stats->bytes_received, TRUE); table->field[j++]->store((longlong)user_stats->bytes_sent, TRUE); table->field[j++]->store((longlong)user_stats->binlog_bytes_written, TRUE); diff -Nru mariadb-10.11.11/plugin/versioning/versioning.cc mariadb-10.11.13/plugin/versioning/versioning.cc --- mariadb-10.11.11/plugin/versioning/versioning.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/versioning/versioning.cc 2025-05-19 16:14:25.000000000 +0000 @@ -150,7 +150,6 @@ { { C_STRING_WITH_LEN("TRT_TRX_ID") }, BUILDER(Create_func_trt)}, { { C_STRING_WITH_LEN("TRT_TRX_SEES") }, BUILDER(Create_func_trt_trx_sees)}, { { C_STRING_WITH_LEN("TRT_TRX_SEES_EQ") }, BUILDER(Create_func_trt_trx_sees)}, - { {0, 0}, NULL} }; diff -Nru mariadb-10.11.11/scripts/mysqlhotcopy.sh mariadb-10.11.13/scripts/mysqlhotcopy.sh --- mariadb-10.11.11/scripts/mysqlhotcopy.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/mysqlhotcopy.sh 2025-05-19 16:14:25.000000000 +0000 @@ -208,7 +208,7 @@ else { $dsn .= "host=" . $opt{host}; - if ($opt{host} ne "localhost") + if ($opt{host} ne "localhost" and $opt{port}) { $dsn .= ";port=". 
$opt{port}; } diff -Nru mariadb-10.11.11/scripts/wsrep_sst_common.sh mariadb-10.11.13/scripts/wsrep_sst_common.sh --- mariadb-10.11.11/scripts/wsrep_sst_common.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_common.sh 2025-05-19 16:14:25.000000000 +0000 @@ -1910,4 +1910,17 @@ SST_PID="$DATA/wsrep_sst.pid" +if [ -n "${MTR_SST_JOINER_DELAY:-}" ]; then + MTR_SST_JOINER_DELAY=$(trim_string "$MTR_SST_JOINER_DELAY") +fi + +simulate_long_sst() +{ + # Delay for MTR tests if needed to simulate long SST/IST: + if [ ${MTR_SST_JOINER_DELAY:-0} -gt 0 ]; then + wsrep_log_info "Sleeping $MTR_SST_JOINER_DELAY seconds for MTR test" + sleep $MTR_SST_JOINER_DELAY + fi +} + wsrep_log_info "$WSREP_METHOD $WSREP_TRANSFER_TYPE started on $WSREP_SST_OPT_ROLE" diff -Nru mariadb-10.11.11/scripts/wsrep_sst_mariabackup.sh mariadb-10.11.13/scripts/wsrep_sst_mariabackup.sh --- mariadb-10.11.11/scripts/wsrep_sst_mariabackup.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_mariabackup.sh 2025-05-19 16:14:25.000000000 +0000 @@ -1513,6 +1513,8 @@ exit 2 fi + simulate_long_sst + # use donor magic file, if present # if IST was used, donor magic file was not created # Remove special tags from the magic file, and from the output: diff -Nru mariadb-10.11.11/scripts/wsrep_sst_mysqldump.sh mariadb-10.11.13/scripts/wsrep_sst_mysqldump.sh --- mariadb-10.11.11/scripts/wsrep_sst_mysqldump.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_mysqldump.sh 2025-05-19 16:14:25.000000000 +0000 @@ -184,5 +184,9 @@ echo "$SET_START_POSITION" | $MYSQL || exit $? 
fi +if [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]; then + simulate_long_sst +fi + wsrep_log_info "$WSREP_METHOD $WSREP_TRANSFER_TYPE completed on $WSREP_SST_OPT_ROLE" exit 0 diff -Nru mariadb-10.11.11/scripts/wsrep_sst_rsync.sh mariadb-10.11.13/scripts/wsrep_sst_rsync.sh --- mariadb-10.11.11/scripts/wsrep_sst_rsync.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_rsync.sh 2025-05-19 16:14:25.000000000 +0000 @@ -915,6 +915,8 @@ fi fi + simulate_long_sst + # Remove special tags from the magic file, and from the output: coords=$(head -n1 "$MAGIC_FILE") wsrep_log_info "Galera co-ords from recovery: $coords" diff -Nru mariadb-10.11.11/sql/filesort.cc mariadb-10.11.13/sql/filesort.cc --- mariadb-10.11.11/sql/filesort.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/filesort.cc 2025-05-19 16:14:25.000000000 +0000 @@ -640,26 +640,16 @@ } #ifndef DBUG_OFF -/* - Print table's current row into a buffer and return a pointer to it. - This is intended to be used from gdb: - - (gdb) p dbug_print_table_row(table) - $33 = "SUBQUERY2_t1(col_int_key,col_varchar_nokey)=(7,c)" - (gdb) +static char dbug_row_print_buf[4096]; - Only columns in table->read_set are printed -*/ -const char* dbug_print_row(TABLE *table, const uchar *rec, bool print_names) +String dbug_format_row(TABLE *table, const uchar *rec, bool print_names) { Field **pfield; - const size_t alloc_size= 512; - char *row_buff= (char *) alloc_root(&table->mem_root, alloc_size); - char *row_buff_tmp= (char *) alloc_root(&table->mem_root, alloc_size); - String tmp(row_buff_tmp, alloc_size, &my_charset_bin); - String output(row_buff, alloc_size, &my_charset_bin); + char row_buff_tmp[512]; + String tmp(row_buff_tmp, sizeof(row_buff_tmp), &my_charset_bin); + String output(dbug_row_print_buf, sizeof(dbug_row_print_buf), &my_charset_bin); auto move_back_lambda= [table, rec]() mutable { table->move_fields(table->field, table->record[0], rec); @@ -672,7 +662,7 @@ move_back_guard.engage(); } - 
SCOPE_VALUE(table->read_set, (table->read_set && table->write_set) ? + SCOPE_VALUE(table->read_set, (table->reginfo.lock_type >= TL_WRITE_ALLOW_WRITE) ? table->write_set : table->read_set); output.length(0); @@ -724,10 +714,35 @@ } output.append(')'); - return output.c_ptr_safe(); + return output; } +/** + A function to display a row in debugger. + + Example usage: + (gdb) p dbug_print_row(table, table->record[1]) +*/ +const char *dbug_print_row(TABLE *table, const uchar *rec) +{ + String row= dbug_format_row(table, table->record[0]); + if (row.length() > sizeof dbug_row_print_buf - 1) + return "Couldn't fit into buffer"; + memcpy(dbug_row_print_buf, row.c_ptr(), row.length()); + return dbug_row_print_buf; +} +/** + Print table's current row into a buffer and return a pointer to it. + + This is intended to be used from gdb: + + (gdb) p dbug_print_table_row(table) + $33 = "SUBQUERY2_t1(col_int_key,col_varchar_nokey)=(7,c)" + (gdb) + + Only columns in table->read_set are printed +*/ const char* dbug_print_table_row(TABLE *table) { return dbug_print_row(table, table->record[0]); diff -Nru mariadb-10.11.11/sql/ha_partition.cc mariadb-10.11.13/sql/ha_partition.cc --- mariadb-10.11.11/sql/ha_partition.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/ha_partition.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2141,7 +2141,9 @@ m_added_file[i]->extra(HA_EXTRA_BEGIN_ALTER_COPY); error= copy_partitions(copied, deleted); for (i= 0; i < part_count; i++) - m_added_file[i]->extra(HA_EXTRA_END_ALTER_COPY); + m_added_file[i]->extra(error + ? HA_EXTRA_ABORT_ALTER_COPY + : HA_EXTRA_END_ALTER_COPY); if (unlikely(error)) { /* @@ -4404,31 +4406,19 @@ DBUG_ENTER("ha_partition::store_lock"); DBUG_ASSERT(thd == current_thd); - /* - This can be called from get_lock_data() in mysql_lock_abort_for_thread(), - even when thd != table->in_use. In that case don't use partition pruning, - but use all partitions instead to avoid using another threads structures. 
- */ - if (thd != table->in_use) + MY_BITMAP *used_partitions= lock_type == TL_UNLOCK || + lock_type == TL_IGNORE ? + &m_locked_partitions : + &m_part_info->lock_partitions; + + for (i= bitmap_get_first_set(used_partitions); + i < m_tot_parts; + i= bitmap_get_next_set(used_partitions, i)) { - for (i= 0; i < m_tot_parts; i++) - to= m_file[i]->store_lock(thd, to, lock_type); + DBUG_PRINT("info", ("store lock %u iteration", i)); + to= m_file[i]->store_lock(thd, to, lock_type); } - else - { - MY_BITMAP *used_partitions= lock_type == TL_UNLOCK || - lock_type == TL_IGNORE ? - &m_locked_partitions : - &m_part_info->lock_partitions; - for (i= bitmap_get_first_set(used_partitions); - i < m_tot_parts; - i= bitmap_get_next_set(used_partitions, i)) - { - DBUG_PRINT("info", ("store lock %u iteration", i)); - to= m_file[i]->store_lock(thd, to, lock_type); - } - } DBUG_RETURN(to); } @@ -4755,7 +4745,6 @@ } - m_last_part= new_part_id; start_part_bulk_insert(thd, new_part_id); DBUG_ASSERT(!m_file[new_part_id]->row_logging); if (new_part_id == old_part_id) @@ -4790,6 +4779,8 @@ goto exit; } + m_last_part= new_part_id; + exit: /* if updating an auto_increment column, update @@ -9478,6 +9469,7 @@ case HA_EXTRA_STARTING_ORDERED_INDEX_SCAN: case HA_EXTRA_BEGIN_ALTER_COPY: case HA_EXTRA_END_ALTER_COPY: + case HA_EXTRA_ABORT_ALTER_COPY: DBUG_RETURN(loop_partitions(extra_cb, &operation)); default: { diff -Nru mariadb-10.11.11/sql/ha_sequence.cc mariadb-10.11.13/sql/ha_sequence.cc --- mariadb-10.11.11/sql/ha_sequence.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/ha_sequence.cc 2025-05-19 16:14:25.000000000 +0000 @@ -353,6 +353,12 @@ return(COMPATIBLE_DATA_YES); } +enum_alter_inplace_result +ha_sequence::check_if_supported_inplace_alter(TABLE *altered_table, + Alter_inplace_info *ai) +{ + return file->check_if_supported_inplace_alter(altered_table, ai); +} int ha_sequence::external_lock(THD *thd, int lock_type) { diff -Nru mariadb-10.11.11/sql/ha_sequence.h 
mariadb-10.11.13/sql/ha_sequence.h --- mariadb-10.11.11/sql/ha_sequence.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/ha_sequence.h 2025-05-19 16:14:25.000000000 +0000 @@ -94,6 +94,9 @@ /* For ALTER ONLINE TABLE */ bool check_if_incompatible_data(HA_CREATE_INFO *create_info, uint table_changes) override; + enum_alter_inplace_result + check_if_supported_inplace_alter(TABLE *altered_table, + Alter_inplace_info *ai) override; void write_lock() { write_locked= 1;} void unlock() { write_locked= 0; } bool is_locked() { return write_locked; } diff -Nru mariadb-10.11.11/sql/handle_connections_win.cc mariadb-10.11.13/sql/handle_connections_win.cc --- mariadb-10.11.11/sql/handle_connections_win.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/handle_connections_win.cc 2025-05-19 16:14:25.000000000 +0000 @@ -595,11 +595,8 @@ void handle_connections_win() { - int n_waits; - create_shutdown_event(); wait_events.push_back(hEventShutdown); - n_waits= 1; for (size_t i= 0; i < all_listeners.size(); i++) { diff -Nru mariadb-10.11.11/sql/handler.cc mariadb-10.11.13/sql/handler.cc --- mariadb-10.11.11/sql/handler.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/handler.cc 2025-05-19 16:14:25.000000000 +0000 @@ -499,7 +499,7 @@ SETMSG(HA_ERR_INDEX_COL_TOO_LONG, ER_DEFAULT(ER_INDEX_COLUMN_TOO_LONG)); SETMSG(HA_ERR_INDEX_CORRUPT, ER_DEFAULT(ER_INDEX_CORRUPT)); SETMSG(HA_FTS_INVALID_DOCID, "Invalid InnoDB FTS Doc ID"); - SETMSG(HA_ERR_DISK_FULL, ER_DEFAULT(ER_DISK_FULL)); + SETMSG(HA_ERR_DISK_FULL, "Disk got full writing '%s'"); SETMSG(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE, "Too many words in a FTS phrase or proximity search"); SETMSG(HA_ERR_FK_DEPTH_EXCEEDED, "Foreign key cascade delete/update exceeds"); SETMSG(HA_ERR_TABLESPACE_MISSING, ER_DEFAULT(ER_TABLESPACE_MISSING)); @@ -672,6 +672,8 @@ DBUG_EXECUTE_IF("unstable_db_type", { static int i= (int) DB_TYPE_FIRST_DYNAMIC; + while (installed_htons[i]) + i++; hton->db_type= (enum 
legacy_db_type)++i; }); @@ -1899,6 +1901,8 @@ } #endif /* WITH_WSREP */ error= ha_commit_one_phase(thd, all); + if (error) + goto err; #ifdef WITH_WSREP // Here in case of error we must return 2 for inconsistency if (run_wsrep_hooks && !error) @@ -2139,7 +2143,7 @@ if (ha_info) { - int err; + int err= 0; if (has_binlog_hton(ha_info) && (err= binlog_commit(thd, all, @@ -2147,6 +2151,8 @@ { my_error(ER_ERROR_DURING_COMMIT, MYF(0), err); error= 1; + + goto err; } for (; ha_info; ha_info= ha_info_next) { @@ -2182,7 +2188,7 @@ if (count >= 2) statistic_increment(transactions_multi_engine, LOCK_status); } - + err: DBUG_RETURN(error); } @@ -2291,7 +2297,7 @@ "conf %d wsrep_err %s SQL %s", thd->thread_id, thd->query_id, thd->wsrep_trx().state(), wsrep::to_c_string(thd->wsrep_cs().current_error()), - thd->query()); + wsrep_thd_query(thd)); } #endif /* WITH_WSREP */ } @@ -2307,7 +2313,7 @@ if (WSREP(thd) && thd->is_error()) { WSREP_DEBUG("ha_rollback_trans(%lld, %s) rolled back: msg %s is_real %d wsrep_err %s", - thd->thread_id, all? "TRUE" : "FALSE", + thd->thread_id, all ? 
"TRUE" : "FALSE", thd->get_stmt_da()->message(), is_real_trans, wsrep::to_c_string(thd->wsrep_cs().current_error())); } @@ -2800,6 +2806,7 @@ } if (IF_WSREP((wsrep_emulate_bin_log && wsrep_is_wsrep_xid(info->list + i) && + !wsrep_is_xid_gtid_undefined(info->list + i) && x <= wsrep_limit), false) || tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT) { @@ -4455,8 +4462,12 @@ break; case ENOSPC: case HA_ERR_DISK_FULL: - textno= ER_DISK_FULL; SET_FATAL_ERROR; // Ensure error is logged + my_printf_error(ER_DISK_FULL, "Disk got full writing '%s.%s' (Errcode: %M)", + MYF(errflag | ME_ERROR_LOG), + table_share->db.str, table_share->table_name.str, + error); + DBUG_VOID_RETURN; break; case HA_ERR_KEY_NOT_FOUND: case HA_ERR_NO_ACTIVE_RECORD: @@ -7718,7 +7729,10 @@ }); #endif /* WITH_WSREP */ if ((error= ha_check_overlaps(NULL, buf))) + { + DEBUG_SYNC_C("ha_write_row_end"); DBUG_RETURN(error); + } /* NOTE: this != table->file is true in 3 cases: @@ -7739,6 +7753,7 @@ if (table->next_number_field && buf == table->record[0]) if (int err= update_auto_increment()) error= err; + DEBUG_SYNC_C("ha_write_row_end"); DBUG_RETURN(error); } } @@ -7749,7 +7764,8 @@ TABLE_IO_WAIT(tracker, PSI_TABLE_WRITE_ROW, MAX_KEY, error, { error= write_row(buf); }) - DBUG_PRINT("dml", ("INSERT: %s = %d", dbug_print_row(table, buf, false), error)); + DBUG_PRINT("dml", ("INSERT: %s = %d", + dbug_format_row(table, buf, false).c_ptr_safe(), error)); MYSQL_INSERT_ROW_DONE(error); if (likely(!error)) @@ -7760,14 +7776,12 @@ Log_func *log_func= Write_rows_log_event::binlog_row_logging_function; error= binlog_log_row(table, 0, buf, log_func); } + #ifdef WITH_WSREP - if (WSREP_NNULL(ha_thd()) && table_share->tmp_table == NO_TMP_TABLE && - ht->flags & HTON_WSREP_REPLICATION && - !error && (error= wsrep_after_row(ha_thd()))) - { - DEBUG_SYNC_C("ha_write_row_end"); - DBUG_RETURN(error); - } + THD *thd= ha_thd(); + if (WSREP_NNULL(thd) && table_share->tmp_table == NO_TMP_TABLE && + ht->flags & 
HTON_WSREP_REPLICATION && !error) + error= wsrep_after_row(thd); #endif /* WITH_WSREP */ } @@ -7811,8 +7825,10 @@ TABLE_IO_WAIT(tracker, PSI_TABLE_UPDATE_ROW, active_index, 0, { error= update_row(old_data, new_data);}) - DBUG_PRINT("dml", ("UPDATE: %s => %s = %d", dbug_print_row(table, old_data, false), - dbug_print_row(table, new_data, false), error)); + DBUG_PRINT("dml", ("UPDATE: %s => %s = %d", + dbug_format_row(table, old_data, false).c_ptr_safe(), + dbug_format_row(table, new_data, false).c_ptr_safe(), + error)); MYSQL_UPDATE_ROW_DONE(error); if (likely(!error)) @@ -7892,7 +7908,8 @@ TABLE_IO_WAIT(tracker, PSI_TABLE_DELETE_ROW, active_index, error, { error= delete_row(buf);}) - DBUG_PRINT("dml", ("DELETE: %s = %d", dbug_print_row(table, buf, false), error)); + DBUG_PRINT("dml", ("DELETE: %s = %d", + dbug_format_row(table, buf, false).c_ptr_safe(), error)); MYSQL_DELETE_ROW_DONE(error); if (likely(!error)) { @@ -8236,16 +8253,6 @@ VERSIONING functions ******************************************************************************/ -bool Vers_parse_info::is_start(const char *name) const -{ - DBUG_ASSERT(name); - return as_row.start && as_row.start.streq(name); -} -bool Vers_parse_info::is_end(const char *name) const -{ - DBUG_ASSERT(name); - return as_row.end && as_row.end.streq(name); -} bool Vers_parse_info::is_start(const Create_field &f) const { return f.flags & VERS_ROW_START; @@ -8300,8 +8307,8 @@ return false; } -const Lex_ident Vers_parse_info::default_start= "row_start"; -const Lex_ident Vers_parse_info::default_end= "row_end"; +const Lex_ident Vers_parse_info::default_start= { STRING_WITH_LEN("row_start")}; +const Lex_ident Vers_parse_info::default_end= { STRING_WITH_LEN("row_end")}; bool Vers_parse_info::fix_implicit(THD *thd, Alter_info *alter_info) { @@ -8560,7 +8567,7 @@ if (alter_info->flags & ALTER_ADD_SYSTEM_VERSIONING) { - if (check_sys_fields(table_name, share->db, alter_info)) + if (check_sys_fields(share->table_name, share->db, alter_info)) 
return true; } @@ -8866,8 +8873,8 @@ } } - bool res= period_info.check_field(row_start, period.start.str) - || period_info.check_field(row_end, period.end.str); + bool res= period_info.check_field(row_start, period.start) + || period_info.check_field(row_end, period.end); if (res) return true; diff -Nru mariadb-10.11.11/sql/handler.h mariadb-10.11.13/sql/handler.h --- mariadb-10.11.11/sql/handler.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/handler.h 2025-05-19 16:14:25.000000000 +0000 @@ -2117,8 +2117,6 @@ } protected: - bool is_start(const char *name) const; - bool is_end(const char *name) const; bool is_start(const Create_field &f) const; bool is_end(const Create_field &f) const; bool fix_implicit(THD *thd, Alter_info *alter_info); @@ -5444,6 +5442,6 @@ bool versioned); #ifndef DBUG_OFF -const char* dbug_print_row(TABLE *table, const uchar *rec, bool print_names= true); +String dbug_format_row(TABLE *table, const uchar *rec, bool print_names= true); #endif /* DBUG_OFF */ #endif /* HANDLER_INCLUDED */ diff -Nru mariadb-10.11.11/sql/item.cc mariadb-10.11.13/sql/item.cc --- mariadb-10.11.11/sql/item.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item.cc 2025-05-19 16:14:25.000000000 +0000 @@ -5321,6 +5321,7 @@ double Item_copy_string::val_real() { + DBUG_ASSERT(copied_in); int err_not_used; char *end_not_used; return (null_value ? 0.0 : @@ -5331,6 +5332,7 @@ longlong Item_copy_string::val_int() { + DBUG_ASSERT(copied_in); int err; return null_value ? 
0 : str_value.charset()->strntoll(str_value.ptr(), str_value.length(), 10, @@ -5340,6 +5342,7 @@ int Item_copy_string::save_in_field(Field *field, bool no_conversions) { + DBUG_ASSERT(copied_in); return save_str_value_in_field(field, &str_value); } @@ -5350,11 +5353,15 @@ if (res && res != &str_value) str_value.copy(*res); null_value=item->null_value; +#ifndef DBUG_OFF + copied_in= 1; +#endif } /* ARGSUSED */ String *Item_copy_string::val_str(String *str) { + DBUG_ASSERT(copied_in); // Item_copy_string is used without fix_fields call if (null_value) return (String*) 0; @@ -5364,6 +5371,7 @@ my_decimal *Item_copy_string::val_decimal(my_decimal *decimal_value) { + DBUG_ASSERT(copied_in); // Item_copy_string is used without fix_fields call if (null_value) return (my_decimal *) 0; @@ -11067,8 +11075,8 @@ {} /** - Wrapper of hide_view_error call for Name_resolution_context error - processor. + Wrapper of replace_view_error_with_generic call for Name_resolution_context + error processor. @note hide view underlying tables details in error messages @@ -11076,7 +11084,7 @@ void view_error_processor(THD *thd, void *data) { - ((TABLE_LIST *)data)->hide_view_error(thd); + ((TABLE_LIST *)data)->replace_view_error_with_generic(thd); } diff -Nru mariadb-10.11.11/sql/item.h mariadb-10.11.13/sql/item.h --- mariadb-10.11.11/sql/item.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item.h 2025-05-19 16:14:25.000000000 +0000 @@ -757,6 +757,17 @@ virtual const String *const_ptr_string() const { return NULL; } }; +struct subselect_table_finder_param +{ + THD *thd; + /* + We're searching for different TABLE_LIST objects referring to the same + table as this one + */ + const TABLE_LIST *find; + /* NUL - not found, ERROR_TABLE - search error, or the found table reference */ + TABLE_LIST *dup; +}; /****************************************************************************/ @@ -1954,6 +1965,19 @@ */ virtual Item *clone_item(THD *thd) const { return nullptr; } + /* + @detail + 
The meaning of this function seems to be: + Check what the item would return if it was provided with two identical + non-NULL arguments. + It is not clear why it is defined for generic class Item or what its other + uses are. + + @return + COND_TRUE Would return true + COND_FALSE Would return false + COND_OK May return either, depending on the argument type. + */ virtual cond_result eq_cmp_result() const { return COND_OK; } inline uint float_length(uint decimals_par) const { return decimals < FLOATING_POINT_DECIMALS ? (DBL_DIG+2+decimals_par) : DBL_DIG+8;} @@ -2292,6 +2316,7 @@ set_extraction_flag(*(int16*)arg); return 0; } + virtual bool subselect_table_finder_processor(void *arg) { return 0; }; /* TRUE if the expression depends only on the table indicated by tab_map @@ -6673,8 +6698,15 @@ Type_std_attributes::set(item); name= item->name; set_handler(item->type_handler()); +#ifndef DBUG_OFF + copied_in= 0; +#endif } +#ifndef DBUG_OFF + bool copied_in; +#endif + public: /** @@ -6740,7 +6772,10 @@ double val_real() override; longlong val_int() override; bool get_date(THD *thd, MYSQL_TIME *ltime, date_mode_t fuzzydate) override - { return get_date_from_string(thd, ltime, fuzzydate); } + { + DBUG_ASSERT(copied_in); + return get_date_from_string(thd, ltime, fuzzydate); + } void copy() override; int save_in_field(Field *field, bool no_conversions) override; Item *do_get_copy(THD *thd) const override @@ -6770,9 +6805,13 @@ null_value= tmp.is_null(); m_value= tmp.is_null() ? Timestamp_or_zero_datetime() : Timestamp_or_zero_datetime(tmp); +#ifndef DBUG_OFF + copied_in=1; +#endif } int save_in_field(Field *field, bool) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); if (null_value) return set_field_to_null(field); @@ -6781,30 +6820,35 @@ } longlong val_int() override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? 
0 : m_value.to_datetime(current_thd).to_longlong(); } double val_real() override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? 0e0 : m_value.to_datetime(current_thd).to_double(); } String *val_str(String *to) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? NULL : m_value.to_datetime(current_thd).to_string(to, decimals); } my_decimal *val_decimal(my_decimal *to) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? NULL : m_value.to_datetime(current_thd).to_decimal(to); } bool get_date(THD *thd, MYSQL_TIME *ltime, date_mode_t fuzzydate) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); bool res= m_value.to_TIME(thd, ltime, fuzzydate); DBUG_ASSERT(!res); @@ -6812,6 +6856,7 @@ } bool val_native(THD *thd, Native *to) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value || m_value.to_native(to, decimals); } diff -Nru mariadb-10.11.11/sql/item_cmpfunc.h mariadb-10.11.13/sql/item_cmpfunc.h --- mariadb-10.11.11/sql/item_cmpfunc.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_cmpfunc.h 2025-05-19 16:14:25.000000000 +0000 @@ -1003,6 +1003,23 @@ class Item_func_between :public Item_func_opt_neg { + /* + If the types of the arguments to BETWEEN permit, then: + + WHERE const1 BETWEEN expr2 AND field1 + can be optimized as if it was just: + WHERE const1 <= field1 + + as expr2 could be an arbitrary expression. More generally, + this optimization is permitted if aggregation for comparison + for three expressions (const1,const2,field1) and for two + expressions (const1,field1) return the same type handler. + + @param [IN] field_item - This is a field from the right side + of the BETWEEN operator. 
+ */ + bool can_optimize_range_const(Item_field *field_item) const; + protected: SEL_TREE *get_func_mm_tree(RANGE_OPT_PARAM *param, Field *field, Item *value) override; @@ -2945,9 +2962,18 @@ TODO: We could still replace "expr1" to "const" in "expr1 LIKE expr2" in case of a "PAD SPACE" collation, but only if "expr2" has '%' - at the end. + at the end. */ - return compare_collation() == &my_charset_bin ? COND_TRUE : COND_OK; + if (compare_collation() == &my_charset_bin) + { + /* + 'foo' NOT LIKE 'foo' is false, + 'foo' LIKE 'foo' is true. + */ + return negated? COND_FALSE : COND_TRUE; + } + + return COND_OK; } void add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level, table_map usable_tables, SARGABLE_PARAM **sargables) diff -Nru mariadb-10.11.11/sql/item_func.cc mariadb-10.11.13/sql/item_func.cc --- mariadb-10.11.11/sql/item_func.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_func.cc 2025-05-19 16:14:25.000000000 +0000 @@ -7068,6 +7068,16 @@ /***************************************************************************** SEQUENCE functions *****************************************************************************/ +bool Item_func_nextval::check_access_and_fix_fields(THD *thd, Item **ref, + privilege_t want_access) +{ + table_list->sequence= false; + bool error= check_single_table_access(thd, want_access, table_list, false); + table_list->sequence= true; + if (error && table_list->belong_to_view) + table_list->replace_view_error_with_generic(thd); + return error || Item_longlong_func::fix_fields(thd, ref); +} longlong Item_func_nextval::val_int() { diff -Nru mariadb-10.11.11/sql/item_func.h mariadb-10.11.13/sql/item_func.h --- mariadb-10.11.11/sql/item_func.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_func.h 2025-05-19 16:14:25.000000000 +0000 @@ -4234,6 +4234,7 @@ protected: TABLE_LIST *table_list; TABLE *table; + bool check_access_and_fix_fields(THD *, Item **ref, privilege_t); public: 
Item_func_nextval(THD *thd, TABLE_LIST *table_list_arg): Item_longlong_func(thd), table_list(table_list_arg) {} @@ -4243,6 +4244,8 @@ static LEX_CSTRING name= {STRING_WITH_LEN("nextval") }; return name; } + bool fix_fields(THD *thd, Item **ref) override + { return check_access_and_fix_fields(thd, ref, INSERT_ACL | SELECT_ACL); } bool fix_length_and_dec(THD *thd) override { unsigned_flag= 0; @@ -4284,6 +4287,8 @@ public: Item_func_lastval(THD *thd, TABLE_LIST *table_list_arg): Item_func_nextval(thd, table_list_arg) {} + bool fix_fields(THD *thd, Item **ref) override + { return check_access_and_fix_fields(thd, ref, SELECT_ACL); } longlong val_int() override; LEX_CSTRING func_name_cstring() const override { @@ -4308,6 +4313,8 @@ : Item_func_nextval(thd, table_list_arg), nextval(nextval_arg), round(round_arg), is_used(is_used_arg) {} + bool fix_fields(THD *thd, Item **ref) override + { return check_access_and_fix_fields(thd, ref, INSERT_ACL); } longlong val_int() override; LEX_CSTRING func_name_cstring() const override { diff -Nru mariadb-10.11.11/sql/item_geofunc.cc mariadb-10.11.13/sql/item_geofunc.cc --- mariadb-10.11.11/sql/item_geofunc.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_geofunc.cc 2025-05-19 16:14:25.000000000 +0000 @@ -91,6 +91,15 @@ { String *str_ret= args[0]->val_str(str); null_value= args[0]->null_value; + if (!null_value && arg_count == 2 && !args[1]->null_value) { + srid= (uint32)args[1]->val_int(); + + if (str->copy(*str_ret)) + return 0; + + int4store(str->ptr(), srid); + return str; + } return str_ret; } @@ -2524,7 +2533,7 @@ String *arg2= args[1]->val_str(&bak2); double distance= 0.0; double sphere_radius= 6370986.0; // Default radius equals Earth radius - + null_value= (args[0]->null_value || args[1]->null_value); if (null_value) { @@ -2542,7 +2551,7 @@ } if (sphere_radius <= 0) { - my_error(ER_INTERNAL_ERROR, MYF(0), "Radius must be greater than zero."); + my_error(ER_GIS_UNSUPPORTED_ARGUMENT, MYF(0), func_name()); 
return 1; } } @@ -2554,26 +2563,27 @@ my_error(ER_GIS_INVALID_DATA, MYF(0), "ST_Distance_Sphere"); goto handle_errors; } -// Method allowed for points and multipoints + // Method allowed for points and multipoints if (!(g1->get_class_info()->m_type_id == Geometry::wkb_point || g1->get_class_info()->m_type_id == Geometry::wkb_multipoint) || !(g2->get_class_info()->m_type_id == Geometry::wkb_point || g2->get_class_info()->m_type_id == Geometry::wkb_multipoint)) { - // Generate error message in case different geometry is used? - my_error(ER_INTERNAL_ERROR, MYF(0), func_name()); + // Generate error message in case of unexpected geometry. + my_error(ER_GIS_UNSUPPORTED_ARGUMENT, MYF(0), func_name()); return 0; } distance= spherical_distance_points(g1, g2, sphere_radius); if (distance < 0) { - my_error(ER_INTERNAL_ERROR, MYF(0), "Returned distance cannot be negative."); + my_error(ER_INTERNAL_ERROR, MYF(0), + "Returned distance cannot be negative."); return 1; } return distance; - handle_errors: - return 0; +handle_errors: + return 0; } diff -Nru mariadb-10.11.11/sql/item_jsonfunc.cc mariadb-10.11.13/sql/item_jsonfunc.cc --- mariadb-10.11.11/sql/item_jsonfunc.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_jsonfunc.cc 2025-05-19 16:14:25.000000000 +0000 @@ -74,7 +74,8 @@ } -static bool append_simple(String *s, const char *a, size_t a_len) +static bool __attribute__((warn_unused_result)) +append_simple(String *s, const char *a, size_t a_len) { if (!s->realloc_with_extra_if_needed(s->length() + a_len)) { @@ -86,7 +87,8 @@ } -static inline bool append_simple(String *s, const uchar *a, size_t a_len) +static inline bool __attribute__((warn_unused_result)) +append_simple(String *s, const uchar *a, size_t a_len) { return append_simple(s, (const char *) a, a_len); } @@ -300,8 +302,10 @@ nice_js->length(0); nice_js->set_charset(je->s.cs); - nice_js->alloc(je->s.str_end - je->s.c_str + 32); + if (nice_js->alloc(je->s.str_end - je->s.c_str + 32)) + goto error; + 
DBUG_ASSERT(mode != Item_func_json_format::DETAILED || (tab_size >= 0 && tab_size <= TAB_SIZE_LIMIT)); @@ -347,7 +351,8 @@ goto error; nice_js->append('"'); - append_simple(nice_js, key_start, key_end - key_start); + if (append_simple(nice_js, key_start, key_end - key_start)) + goto error; nice_js->append(colon, colon_len); } /* now we have key value to handle, so no 'break'. */ @@ -851,7 +856,7 @@ bool Item_func_json_unquote::fix_length_and_dec(THD *thd) { - collation.set(&my_charset_utf8mb3_general_ci, + collation.set(&my_charset_utf8mb4_bin, DERIVATION_COERCIBLE, MY_REPERTOIRE_ASCII); max_length= args[0]->max_char_length() * collation.collation->mbmaxlen; set_maybe_null(); @@ -894,12 +899,12 @@ return js; str->length(0); - str->set_charset(&my_charset_utf8mb3_general_ci); + str->set_charset(&my_charset_utf8mb4_bin); if (str->realloc_with_extra_if_needed(je.value_len) || (c_len= json_unescape(js->charset(), je.value, je.value + je.value_len, - &my_charset_utf8mb3_general_ci, + &my_charset_utf8mb4_bin, (uchar *) str->ptr(), (uchar *) (str->ptr() + je.value_len))) < 0) goto error; @@ -2248,24 +2253,67 @@ str->set_charset(js->charset()); if (item_pos) { - if (append_simple(str, js->ptr(), item_pos - js->ptr()) || - (n_item > 0 && str->append(" ", 1)) || - append_json_value(str, args[n_arg+1], &tmp_val) || - str->append(",", 1) || - (n_item == 0 && str->append(" ", 1)) || - append_simple(str, item_pos, js->end() - item_pos)) + my_ptrdiff_t size= item_pos - js->ptr(); + if (append_simple(str, js->ptr(), size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); goto return_null; /* Out of memory. */ + } + if (n_item > 0 && str->append(" ", 1)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 1); + goto return_null; /* Out of memory. */ + } + if (append_json_value(str, args[n_arg+1], &tmp_val)) + { + my_error(ER_OUTOFMEMORY, MYF(0), tmp_val.length()); + goto return_null; /* Out of memory. 
*/ + } + if (str->append(",", 1)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 1); + goto return_null; /* Out of memory. */ + } + if (n_item == 0 && str->append(" ", 1)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 1); + goto return_null; /* Out of memory. */ + } + size= js->end() - item_pos; + if (append_simple(str, item_pos, size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); + goto return_null; /* Out of memory. */ + } } else { + my_ptrdiff_t size; /* Insert position wasn't found - append to the array. */ DBUG_ASSERT(je.state == JST_ARRAY_END); item_pos= (const char *) (je.s.c_str - je.sav_c_len); - if (append_simple(str, js->ptr(), item_pos - js->ptr()) || - (n_item > 0 && str->append(", ", 2)) || - append_json_value(str, args[n_arg+1], &tmp_val) || - append_simple(str, item_pos, js->end() - item_pos)) + size= item_pos - js->ptr(); + if (append_simple(str, js->ptr(), size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); + goto return_null; /* Out of memory. */ + } + if (n_item > 0 && str->append(", ", 2)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 2); goto return_null; /* Out of memory. */ + } + if (append_json_value(str, args[n_arg+1], &tmp_val)) + { + my_error(ER_OUTOFMEMORY, MYF(0), tmp_val.length()); + goto return_null; /* Out of memory. */ + } + size= js->end() - item_pos; + if (append_simple(str, item_pos, size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); + goto return_null; /* Out of memory. 
*/ + } } { @@ -4117,13 +4165,23 @@ goto error; if (je.value_type == JSON_VALUE_STRING) { - if (value2.realloc_with_extra_if_needed(je.value_len) || - (c_len= json_unescape(js->charset(), je.value, + if (value2.realloc_with_extra_if_needed(je.value_len)) + { + my_error(ER_OUTOFMEMORY, MYF(0), je.value_len); + goto error; + } + if ((c_len= json_unescape(js->charset(), je.value, je.value + je.value_len, - &my_charset_utf8mb3_general_ci, + &my_charset_utf8mb4_bin, (uchar *) value2.ptr(), (uchar *) (value2.ptr() + je.value_len))) < 0) + { + if (current_thd) + push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN, + ER_JSON_BAD_CHR, ER_THD(current_thd, ER_JSON_BAD_CHR), + 0, "comparison", (int)((const char *) je.s.c_str - js->ptr())); goto error; + } value2.length(c_len); js= &value2; @@ -4166,13 +4224,23 @@ if (type == JSON_VALUE_STRING) { - if (value1.realloc_with_extra_if_needed(value_len) || - (c_len= json_unescape(value1.charset(), (uchar *) value, + if (value1.realloc_with_extra_if_needed(value_len)) + { + my_error(ER_OUTOFMEMORY, MYF(0), value_len); + return 1; + } + if ((c_len= json_unescape(value1.charset(), (uchar *) value, (uchar *) value+value_len, - &my_charset_utf8mb3_general_ci, + &my_charset_utf8mb4_bin, (uchar *) value1.ptr(), (uchar *) (value1.ptr() + value_len))) < 0) + { + if (current_thd) + push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN, + ER_JSON_BAD_CHR, ER_THD(current_thd, ER_JSON_BAD_CHR), + 0, "equality comparison", 0); return 1; + } value1.length(c_len); res1= &value1; } diff -Nru mariadb-10.11.11/sql/item_strfunc.cc mariadb-10.11.13/sql/item_strfunc.cc --- mariadb-10.11.11/sql/item_strfunc.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_strfunc.cc 2025-05-19 16:14:25.000000000 +0000 @@ -56,7 +56,6 @@ #include "sql_statistics.h" /* fmtlib include (https://fmt.dev/). 
*/ -#define FMT_STATIC_THOUSANDS_SEPARATOR ',' #define FMT_HEADER_ONLY 1 #include "fmt/args.h" @@ -1403,6 +1402,13 @@ }; }; +struct fmt_locale_comma : std::numpunct +{ + char do_thousands_sep() const override { return ','; } + std::string do_grouping() const override { return "\3"; } +}; +static std::locale fmt_locale(std::locale(), new fmt_locale_comma); + /* SFORMAT(format_string, ...) This function receives a formatting specification string and N parameters @@ -1455,7 +1461,7 @@ /* Create the string output */ try { - auto text = fmt::vformat(fmt_arg->c_ptr_safe(), arg_store); + auto text = fmt::vformat(fmt_locale, fmt_arg->c_ptr_safe(), arg_store); res->length(0); res->set_charset(collation.collation); res->append(text.c_str(), text.size(), fmt_arg->charset()); diff -Nru mariadb-10.11.11/sql/item_subselect.cc mariadb-10.11.13/sql/item_subselect.cc --- mariadb-10.11.11/sql/item_subselect.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_subselect.cc 2025-05-19 16:14:25.000000000 +0000 @@ -7147,3 +7147,27 @@ for (uint i= 0; i < merge_keys_count; i++) partial_match_array_sizes[i]= merge_keys[i]->get_key_buff_elements(); } + + +/* + Check if somewhere inside this subselect we read the table. This means a + full read "(SELECT ... 
FROM tbl)", outside reference to tbl.column does not + count +*/ + +bool +Item_subselect::subselect_table_finder_processor(void *arg) +{ + subselect_table_finder_param *param= (subselect_table_finder_param *)arg; + for (SELECT_LEX *sl= unit->first_select(); sl; sl= sl->next_select()) + { + TABLE_LIST *dup; + if ((dup= sl->find_table(param->thd, ¶m->find->db, + ¶m->find->table_name))) + { + param->dup= dup; + return TRUE; + } + } + return FALSE; +}; diff -Nru mariadb-10.11.11/sql/item_subselect.h mariadb-10.11.13/sql/item_subselect.h --- mariadb-10.11.11/sql/item_subselect.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_subselect.h 2025-05-19 16:14:25.000000000 +0000 @@ -273,6 +273,7 @@ { return TRUE; } + bool subselect_table_finder_processor(void *arg) override; void register_as_with_rec_ref(With_element *with_elem); void init_expr_cache_tracker(THD *thd); diff -Nru mariadb-10.11.11/sql/lex_string.h mariadb-10.11.13/sql/lex_string.h --- mariadb-10.11.11/sql/lex_string.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/lex_string.h 2025-05-19 16:14:25.000000000 +0000 @@ -110,7 +110,7 @@ class Lex_cstring_strlen: public Lex_cstring { public: - Lex_cstring_strlen(const char *from) + explicit Lex_cstring_strlen(const char *from) :Lex_cstring(from, from ? strlen(from) : 0) { } }; diff -Nru mariadb-10.11.11/sql/log.cc mariadb-10.11.13/sql/log.cc --- mariadb-10.11.11/sql/log.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/log.cc 2025-05-19 16:14:25.000000000 +0000 @@ -322,6 +322,11 @@ incident= TRUE; } + void clear_incident(void) + { + incident= FALSE; + } + bool has_incident(void) { return(incident); @@ -1932,6 +1937,16 @@ if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE)) DBUG_RETURN(1); +#ifdef WITH_WSREP + /* Wsrep transaction was BF aborted but it must replay because certification + succeeded. The transaction must not be written into binlog yet, it will + be done during commit after the replay. 
*/ + if (WSREP(thd) && wsrep_must_replay(thd)) + { + DBUG_RETURN(0); + } +#endif /* WITH_WSREP */ + /* Doing a commit or a rollback including non-transactional tables, i.e., ending a transaction where we might write the transaction @@ -2530,6 +2545,18 @@ } +void binlog_clear_incident(THD *thd) +{ + binlog_cache_mngr *const cache_mngr= opt_bin_log ? + (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton) : 0; + if (cache_mngr) + { + cache_mngr->stmt_cache.clear_incident(); + cache_mngr->trx_cache.clear_incident(); + } +} + + void MYSQL_BIN_LOG::set_write_error(THD *thd, bool is_transactional) { DBUG_ENTER("MYSQL_BIN_LOG::set_write_error"); @@ -7971,7 +7998,12 @@ { DBUG_RETURN(0); } - else if (!(thd->variables.option_bits & OPTION_BIN_LOG)) + + if (!(thd->variables.option_bits & OPTION_BIN_LOG) +#ifdef WITH_WSREP + && !WSREP(thd) +#endif + ) { cache_mngr->need_unlog= false; DBUG_RETURN(0); @@ -8878,6 +8910,13 @@ bool has_xid= entry->end_event->get_type_code() == XID_EVENT; DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_or_stmt"); +#ifdef WITH_WSREP + if (WSREP(entry->thd) && + !(entry->thd->variables.option_bits & OPTION_BIN_LOG)) + { + DBUG_RETURN(0); + } +#endif /* WITH_WSREP */ /* An error in the trx_cache will truncate the cache to the last good diff -Nru mariadb-10.11.11/sql/log.h mariadb-10.11.13/sql/log.h --- mariadb-10.11.11/sql/log.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/log.h 2025-05-19 16:14:25.000000000 +0000 @@ -1186,6 +1186,7 @@ void make_default_log_name(char **out, const char* log_ext, bool once); void binlog_reset_cache(THD *thd); +void binlog_clear_incident(THD *thd); bool write_annotated_row(THD *thd); extern MYSQL_PLUGIN_IMPORT MYSQL_BIN_LOG mysql_bin_log; diff -Nru mariadb-10.11.11/sql/mysql_install_db.cc mariadb-10.11.13/sql/mysql_install_db.cc --- mariadb-10.11.11/sql/mysql_install_db.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysql_install_db.cc 2025-05-19 16:14:25.000000000 +0000 @@ -336,7 +336,7 @@ 
" --bootstrap" " --datadir=." " --tmpdir=." - " --loose-innodb-buffer-pool-size=20M" + " --loose-innodb-buffer-pool-size=21M" "\"" , mysqld_path, opt_verbose_bootstrap ? "--console" : ""); return cmdline; @@ -344,10 +344,29 @@ static char my_ini_path[MAX_PATH]; +/** + Wrapper for WritePrivateProfileStringA, with retries and sleeps + if file is locked by another process. +*/ +static BOOL write_private_profile_string_with_retries(const char *appname, + const char *key, const char *val, const char *filename) +{ + static constexpr int RETRIES=50; + static constexpr int SLEEP_MS=10; + for (int n= RETRIES;; n--) + { + if (WritePrivateProfileStringA(appname, key, val, filename)) + return TRUE; + if (GetLastError() != ERROR_ACCESS_DENIED || !n) + return FALSE; + Sleep(SLEEP_MS); + } +} + static void write_myini_str(const char *key, const char* val, const char *section="mysqld") { DBUG_ASSERT(my_ini_path[0]); - if (!WritePrivateProfileString(section, key, val, my_ini_path)) + if (!write_private_profile_string_with_retries(section, key, val, my_ini_path)) { die("Can't write to ini file key=%s, val=%s, section=%s, Windows error %u",key,val,section, GetLastError()); diff -Nru mariadb-10.11.11/sql/mysql_upgrade_service.cc mariadb-10.11.13/sql/mysql_upgrade_service.cc --- mariadb-10.11.11/sql/mysql_upgrade_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysql_upgrade_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -45,7 +45,6 @@ "OPTIONS:" static char mysqld_path[MAX_PATH]; -static char mysqladmin_path[MAX_PATH]; static char mysqlupgrade_path[MAX_PATH]; static char defaults_file_param[MAX_PATH + 16]; /*--defaults-file= */ @@ -302,13 +301,29 @@ our --skip-grant-tables do not work anymore after mysql_upgrade that does "flush privileges". Instead, the shutdown event is set. 
*/ +#define OPEN_EVENT_RETRY_SLEEP_MS 100 +#define OPEN_EVENT_MAX_RETRIES 50 + void initiate_mysqld_shutdown() { char event_name[32]; DWORD pid= GetProcessId(mysqld_process); sprintf_s(event_name, "MySQLShutdown%d", pid); - HANDLE shutdown_handle= OpenEvent(EVENT_MODIFY_STATE, FALSE, event_name); - if(!shutdown_handle) + + HANDLE shutdown_handle; + for (int i= 0;; i++) + { + shutdown_handle= OpenEvent(EVENT_MODIFY_STATE, FALSE, event_name); + if(shutdown_handle != nullptr || i == OPEN_EVENT_MAX_RETRIES) + break; + if (WaitForSingleObject(mysqld_process, OPEN_EVENT_RETRY_SLEEP_MS) != + WAIT_TIMEOUT) + { + die("server process exited before shutdown event was created"); + break; + } + } + if (!shutdown_handle) { die("OpenEvent() failed for shutdown event"); } @@ -403,6 +418,26 @@ } +/** + Waits until starting server can be connected to, via given named pipe, with timeout + Dies if either server process exited meanwhile, or when timeout was exceeded. +*/ +static void wait_for_server_startup(HANDLE process, const char *named_pipe, DWORD timeout_sec) +{ + unsigned long long end_time= GetTickCount64() + 1000ULL*timeout_sec; + for (;;) + { + if (WaitNamedPipe(named_pipe, 0)) + return; + + if (GetTickCount64() >= end_time) + die("Server did not startup after %lu seconds", timeout_sec); + + if (WaitForSingleObject(process, 100) != WAIT_TIMEOUT) + die("Server did not start"); + } +} + int main(int argc, char **argv) { @@ -419,8 +454,9 @@ /* Get full path to mysqld, we need it when changing service configuration. - Assume installation layout, i.e mysqld.exe, mysqladmin.exe, mysqlupgrade.exe - and mysql_upgrade_service.exe are in the same directory. + Assume mysqld.exe in the same directory as this program. 
+ mysql_upgrade.exe is either in the same directory, or pointed to by + MARIADB_UPGRADE_EXE environment variable (in case of MTR running it) */ GetModuleFileName(NULL, bindir, FN_REFLEN); p= strrchr(bindir, FN_LIBCHAR); @@ -429,15 +465,19 @@ *p= 0; } sprintf_s(mysqld_path, "%s\\mysqld.exe", bindir); - sprintf_s(mysqladmin_path, "%s\\mysqladmin.exe", bindir); sprintf_s(mysqlupgrade_path, "%s\\mysql_upgrade.exe", bindir); - char *paths[]= {mysqld_path, mysqladmin_path, mysqlupgrade_path}; - for(int i= 0; i< 3;i++) - { - if(GetFileAttributes(paths[i]) == INVALID_FILE_ATTRIBUTES) - die("File %s does not exist", paths[i]); + if (access(mysqld_path, 0)) + die("File %s does not exist", mysqld_path); + if (access(mysqlupgrade_path, 0)) + { + /* Try to get path from environment variable, set by MTR */ + char *alt_mysqlupgrade_path= getenv("MARIADB_UPGRADE_EXE"); + if (alt_mysqlupgrade_path) + sprintf_s(mysqlupgrade_path, "%s", alt_mysqlupgrade_path); } + if (access(mysqlupgrade_path, 0)) + die("File %s does not exist", mysqld_path); /* Messages written on stdout should not be buffered, GUI upgrade program @@ -482,6 +522,10 @@ DWORD start_duration_ms = 0; + char pipe_name[64]; + snprintf(pipe_name, sizeof(pipe_name), + "\\\\.\\pipe\\mysql_upgrade_service_%lu", GetCurrentProcessId()); + if (do_start_stop_server) { /* Start/stop server with --loose-innodb-fast-shutdown=1 */ @@ -493,37 +537,23 @@ { die("Cannot start mysqld.exe process, last error =%u", GetLastError()); } - char pipe_name[64]; - snprintf(pipe_name, sizeof(pipe_name), "\\\\.\\pipe\\mysql_upgrade_service_%lu", - GetCurrentProcessId()); - for (;;) - { - if (WaitForSingleObject(mysqld_process, 0) != WAIT_TIMEOUT) - die("mysqld.exe did not start"); - - if (WaitNamedPipe(pipe_name, 0)) - { - // Server started, shut it down. 
- initiate_mysqld_shutdown(); - if (WaitForSingleObject((HANDLE)mysqld_process, shutdown_timeout * 1000) != WAIT_OBJECT_0) - { - die("Could not shutdown server started with '--innodb-fast-shutdown=0'"); - } - DWORD exit_code; - if (!GetExitCodeProcess((HANDLE)mysqld_process, &exit_code)) - { - die("Could not get mysqld's exit code"); - } - if (exit_code) - { - die("Could not get successfully shutdown mysqld"); - } - CloseHandle(mysqld_process); - break; - } - Sleep(500); - start_duration_ms += 500; + wait_for_server_startup(mysqld_process, pipe_name, startup_timeout); + // Server started, shut it down. + initiate_mysqld_shutdown(); + if (WaitForSingleObject((HANDLE)mysqld_process, shutdown_timeout * 1000) != WAIT_OBJECT_0) + { + die("Could not shutdown server"); + } + DWORD exit_code; + if (!GetExitCodeProcess((HANDLE)mysqld_process, &exit_code)) + { + die("Could not get server's exit code"); + } + if (exit_code) + { + die("Could not get successfully shutdown server (exit code %u)",exit_code); } + CloseHandle(mysqld_process); } log("Phase %d/%d: Fixing server config file%s", ++phase, max_phases, @@ -550,22 +580,7 @@ } log("Phase %d/%d: Waiting for startup to complete",++phase,max_phases); - start_duration_ms= 0; - for(;;) - { - if (WaitForSingleObject(mysqld_process, 0) != WAIT_TIMEOUT) - die("mysqld.exe did not start"); - - if (run_tool(P_WAIT, mysqladmin_path, "--protocol=pipe", socket_param, - "ping", "--no-beep", NULL) == 0) - { - break; - } - if (start_duration_ms > startup_timeout*1000) - die("Server did not come up in %d seconds",startup_timeout); - Sleep(500); - start_duration_ms+= 500; - } + wait_for_server_startup(mysqld_process, pipe_name, startup_timeout); log("Phase %d/%d: Running mysql_upgrade",++phase,max_phases); int upgrade_err= (int) run_tool(P_WAIT, mysqlupgrade_path, diff -Nru mariadb-10.11.11/sql/mysqld.cc mariadb-10.11.13/sql/mysqld.cc --- mariadb-10.11.11/sql/mysqld.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysqld.cc 
2025-05-19 16:14:25.000000000 +0000 @@ -420,7 +420,9 @@ char* opt_secure_file_priv; my_bool lower_case_file_system= 0; my_bool opt_large_pages= 0; +#ifdef HAVE_SOLARIS_LARGE_PAGES my_bool opt_super_large_pages= 0; +#endif my_bool opt_myisam_use_mmap= 0; uint opt_large_page_size= 0; #if defined(ENABLED_DEBUG_SYNC) @@ -1396,11 +1398,6 @@ static int systemd_sock_activation; /* systemd socket activation */ - -/** wakeup listening(main) thread by writing to this descriptor */ -static int termination_event_fd= -1; - - C_MODE_START #ifdef WITH_PERFSCHEMA_STORAGE_ENGINE /** @@ -1453,9 +1450,14 @@ #endif /* OS specific variables */ - +#ifndef EMBEDDED_LIBRARY #ifdef _WIN32 +/** wakeup main thread by signaling this event */ HANDLE hEventShutdown; +#else +/** wakeup listening(main) thread by writing to this descriptor */ +static int termination_event_fd= -1; +#endif #endif @@ -3744,12 +3746,12 @@ #endif /* - When thread specific is set, both mysqld_server_initialized and thd - must be set, and we check that with DBUG_ASSERT. - - However, do not crash, if current_thd is NULL, in release version. + is_thread_specific is only relevant when a THD exist and the server + has fully started. is_thread_specific can be set during recovery by + Aria for functions that are normally only run in one thread. + However InnoDB sets thd early, so we can use it. 
*/ - DBUG_ASSERT(!is_thread_specific || (mysqld_server_initialized && thd)); + DBUG_ASSERT(!is_thread_specific || thd || !plugins_are_initialized); if (is_thread_specific && likely(thd)) /* If thread specific memory */ { @@ -4118,7 +4120,7 @@ if (opt_large_pages) { DBUG_PRINT("info", ("Large page set")); - if (my_init_large_pages(opt_super_large_pages)) + if (my_init_large_pages()) { return 1; } @@ -5337,7 +5339,7 @@ MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"), MARIADB_REMOVED_OPTION("innodb-lru-flush-size"), MARIADB_REMOVED_OPTION("innodb-page-cleaners"), - MARIADB_REMOVED_OPTION("innodb-purge-truncate-frequency"), + MARIADB_REMOVED_OPTION("innodb-purge-rseg-truncate-frequency"), MARIADB_REMOVED_OPTION("innodb-replication-delay"), MARIADB_REMOVED_OPTION("innodb-scrub-log"), MARIADB_REMOVED_OPTION("innodb-scrub-log-speed"), @@ -7872,7 +7874,9 @@ bzero((char*) &global_status_var, offsetof(STATUS_VAR, last_cleared_system_status_var)); opt_large_pages= 0; +#ifdef HAVE_SOLARIS_LARGE_PAGES opt_super_large_pages= 0; +#endif #if defined(ENABLED_DEBUG_SYNC) opt_debug_sync_timeout= 0; #endif /* defined(ENABLED_DEBUG_SYNC) */ @@ -8872,15 +8876,22 @@ bool is_log= opt_log || global_system_variables.sql_log_slow || opt_bin_log; bool is_debug= IF_DBUG(!strstr(MYSQL_SERVER_SUFFIX_STR, "-debug"), 0); const char *is_valgrind= -#ifdef HAVE_VALGRIND +#ifdef HAVE_valgrind !strstr(MYSQL_SERVER_SUFFIX_STR, "-valgrind") ? "-valgrind" : #endif ""; + const char *is_asan= +#ifdef __SANITIZE_ADDRESS__ + !strstr(MYSQL_SERVER_SUFFIX_STR, "-asan") ? "-asan" : +#endif + ""; + return strxnmov(buf, size - 1, MYSQL_SERVER_VERSION, MYSQL_SERVER_SUFFIX_STR, IF_EMBEDDED("-embedded", ""), is_valgrind, + is_asan, is_debug ? "-debug" : "", is_log ? 
"-log" : "", NullS); @@ -9303,6 +9314,7 @@ PSI_stage_info stage_purging_old_relay_logs= { 0, "Purging old relay logs", 0}; PSI_stage_info stage_query_end= { 0, "Query end", 0}; PSI_stage_info stage_starting_cleanup= { 0, "Starting cleanup", 0}; +PSI_stage_info stage_slave_sql_cleanup= { 0, "Slave SQL thread ending", 0}; PSI_stage_info stage_rollback= { 0, "Rollback", 0}; PSI_stage_info stage_rollback_implicit= { 0, "Rollback_implicit", 0}; PSI_stage_info stage_commit= { 0, "Commit", 0}; @@ -9544,6 +9556,7 @@ & stage_preparing, & stage_purging_old_relay_logs, & stage_starting_cleanup, + & stage_slave_sql_cleanup, & stage_query_end, & stage_queueing_master_event_to_the_relay_log, & stage_reading_event_from_the_relay_log, diff -Nru mariadb-10.11.11/sql/mysqld.h mariadb-10.11.13/sql/mysqld.h --- mariadb-10.11.11/sql/mysqld.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysqld.h 2025-05-19 16:14:25.000000000 +0000 @@ -612,6 +612,7 @@ extern PSI_stage_info stage_purging_old_relay_logs; extern PSI_stage_info stage_query_end; extern PSI_stage_info stage_starting_cleanup; +extern PSI_stage_info stage_slave_sql_cleanup; extern PSI_stage_info stage_rollback; extern PSI_stage_info stage_rollback_implicit; extern PSI_stage_info stage_commit; diff -Nru mariadb-10.11.11/sql/net_serv.cc mariadb-10.11.13/sql/net_serv.cc --- mariadb-10.11.11/sql/net_serv.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/net_serv.cc 2025-05-19 16:14:25.000000000 +0000 @@ -773,18 +773,22 @@ } #endif /* !defined(MYSQL_SERVER) */ net->error= 2; /* Close socket */ - net->last_errno= (interrupted ? 
ER_NET_WRITE_INTERRUPTED : - ER_NET_ERROR_ON_WRITE); -#ifdef MYSQL_SERVER - if (global_system_variables.log_warnings > 3) + + if (net->vio->state != VIO_STATE_SHUTDOWN || net->last_errno == 0) { - sql_print_warning("Could not write packet: fd: %lld state: %d " - "errno: %d vio_errno: %d length: %ld", - (longlong) vio_fd(net->vio), (int) net->vio->state, - vio_errno(net->vio), net->last_errno, - (ulong) (end-pos)); - } + net->last_errno= (interrupted ? ER_NET_WRITE_INTERRUPTED : + ER_NET_ERROR_ON_WRITE); +#ifdef MYSQL_SERVER + if (global_system_variables.log_warnings > 3) + { + sql_print_warning("Could not write packet: fd: %lld state: %d " + "errno: %d vio_errno: %d length: %ld", + (longlong) vio_fd(net->vio), (int) net->vio->state, + vio_errno(net->vio), net->last_errno, + (ulong) (end-pos)); + } #endif + } MYSQL_SERVER_my_error(net->last_errno, MYF(0)); break; } @@ -1097,6 +1101,7 @@ ER_NET_READ_INTERRUPTED : ER_NET_READ_ERROR); #ifdef MYSQL_SERVER + strmake_buf(net->last_error, ER(net->last_errno)); if (global_system_variables.log_warnings > 3) { /* Log things as a warning */ diff -Nru mariadb-10.11.11/sql/opt_range.cc mariadb-10.11.13/sql/opt_range.cc --- mariadb-10.11.11/sql/opt_range.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/opt_range.cc 2025-05-19 16:14:25.000000000 +0000 @@ -8351,56 +8351,58 @@ /* Build conjunction of all SEL_TREEs for a simple predicate applying equalities - + SYNOPSIS get_full_func_mm_tree() param PARAM from SQL_SELECT::test_quick_select field_item field in the predicate - value constant in the predicate (or a field already read from + value constant in the predicate (or a field already read from a table in the case of dynamic range access) (for BETWEEN it contains the number of the field argument, - for IN it's always 0) + for IN it's always 0) inv TRUE <> NOT cond_func is considered (makes sense only when cond_func is BETWEEN or IN) DESCRIPTION - For a simple SARGable predicate of the form (f op c), where f is a 
field and - c is a constant, the function builds a conjunction of all SEL_TREES that can - be obtained by the substitution of f for all different fields equal to f. + For a simple SARGable predicate of the form (f op c), where f is a field + and c is a constant, the function builds a conjunction of all SEL_TREES that + can be obtained by the substitution of f for all different fields equal to f. - NOTES + NOTES If the WHERE condition contains a predicate (fi op c), then not only SELL_TREE for this predicate is built, but the trees for the results of substitution of fi for each fj belonging to the same multiple equality as fi are built as well. - E.g. for WHERE t1.a=t2.a AND t2.a > 10 + E.g. for WHERE t1.a=t2.a AND t2.a > 10 a SEL_TREE for t2.a > 10 will be built for quick select from t2 - and + and a SEL_TREE for t1.a > 10 will be built for quick select from t1. - A BETWEEN predicate of the form (fi [NOT] BETWEEN c1 AND c2) is treated - in a similar way: we build a conjuction of trees for the results - of all substitutions of fi for equal fj. + A BETWEEN predicate of the form (fi [NOT] BETWEEN c1 AND c2), where fi + is some field, is treated in a similar way: we build a conjuction of + trees for the results of all substitutions of fi equal fj. + Yet a predicate of the form (c BETWEEN f1i AND f2i) is processed differently. It is considered as a conjuction of two SARGable - predicates (f1i <= c) and (f2i <=c) and the function get_full_func_mm_tree - is called for each of them separately producing trees for - AND j (f1j <=c ) and AND j (f2j <= c) + predicates (f1i <= c) and (c <= f2i) and the function get_full_func_mm_tree + is called for each of them separately producing trees for + AND j (f1j <= c) and AND j (c <= f2j) After this these two trees are united in one conjunctive tree. 
It's easy to see that the same tree is obtained for - AND j,k (f1j <=c AND f2k<=c) - which is equivalent to + AND j,k (f1j <= c AND c <= f2k) + which is equivalent to AND j,k (c BETWEEN f1j AND f2k). + The validity of the processing of the predicate (c NOT BETWEEN f1i AND f2i) which equivalent to (f1i > c OR f2i < c) is not so obvious. Here the - function get_full_func_mm_tree is called for (f1i > c) and (f2i < c) - producing trees for AND j (f1j > c) and AND j (f2j < c). Then this two - trees are united in one OR-tree. The expression + function get_full_func_mm_tree is called for (f1i > c) and called for + (f2i < c) producing trees for AND j (f1j > c) and AND j (f2j < c). Then + this two trees are united in one OR-tree. The expression (AND j (f1j > c) OR AND j (f2j < c) is equivalent to the expression - AND j,k (f1j > c OR f2k < c) - which is just a translation of + AND j,k (f1j > c OR f2k < c) + which is just a translation of AND j,k (c NOT BETWEEN f1j AND f2k) In the cases when one of the items f1, f2 is a constant c1 we do not create @@ -8413,9 +8415,9 @@ As to IN predicates only ones of the form (f IN (c1,...,cn)), where f1 is a field and c1,...,cn are constant, are considered as SARGable. We never try to narrow the index scan using predicates of - the form (c IN (c1,...,f,...,cn)). - - RETURN + the form (c IN (c1,...,f,...,cn)). 
+ + RETURN Pointer to the tree representing the built conjunction of SEL_TREEs */ @@ -8513,6 +8515,11 @@ SEL_TREE *tree= li.ref()[0]->get_mm_tree(param, li.ref()); if (param->statement_should_be_aborted()) DBUG_RETURN(NULL); + bool orig_disable_index_merge= param->disable_index_merge_plans; + + if (list.elements > MAX_OR_ELEMENTS_FOR_INDEX_MERGE) + param->disable_index_merge_plans= true; + if (tree) { if (tree->type == SEL_TREE::IMPOSSIBLE && @@ -8529,7 +8536,10 @@ { SEL_TREE *new_tree= li.ref()[0]->get_mm_tree(param, li.ref()); if (new_tree == NULL || param->statement_should_be_aborted()) + { + param->disable_index_merge_plans= orig_disable_index_merge; DBUG_RETURN(NULL); + } tree= tree_or(param, tree, new_tree); if (tree == NULL || tree->type == SEL_TREE::ALWAYS) { @@ -8561,6 +8571,7 @@ if (replace_cond) *cond_ptr= replacement_item; } + param->disable_index_merge_plans= orig_disable_index_merge; DBUG_RETURN(tree); } @@ -8614,6 +8625,19 @@ } +bool +Item_func_between::can_optimize_range_const(Item_field *field_item) const +{ + const Type_handler *fi_handler= field_item->type_handler_for_comparison(); + Type_handler_hybrid_field_type cmp(fi_handler); + if (cmp.aggregate_for_comparison(args[0]->type_handler_for_comparison()) || + cmp.type_handler() != m_comparator.type_handler()) + return false; // Cannot optimize range because of type mismatch. 
+ + return true; +} + + SEL_TREE * Item_func_between::get_mm_tree(RANGE_OPT_PARAM *param, Item **cond_ptr) { @@ -8639,6 +8663,8 @@ if (arguments()[i]->real_item()->type() == Item::FIELD_ITEM) { Item_field *field_item= (Item_field*) (arguments()[i]->real_item()); + if (!can_optimize_range_const(field_item)) + continue; SEL_TREE *tmp= get_full_func_mm_tree(param, field_item, (Item*)(intptr) i); if (negated) @@ -9952,6 +9978,8 @@ { bool must_be_ored= sel_trees_must_be_ored(param, tree1, tree2, ored_keys); no_imerge_from_ranges= must_be_ored; + if (param->disable_index_merge_plans) + no_imerge_from_ranges= true; if (no_imerge_from_ranges && no_merges1 && no_merges2) { @@ -16006,7 +16034,7 @@ Remember this key, and continue looking for a non-NULL key that satisfies some other condition. */ - memcpy(tmp_record, record, head->s->rec_buff_length); + memcpy(tmp_record, record, head->s->reclength); found_null= TRUE; continue; } @@ -16046,7 +16074,7 @@ */ if (found_null && result) { - memcpy(record, tmp_record, head->s->rec_buff_length); + memcpy(record, tmp_record, head->s->reclength); result= 0; } return result; @@ -16079,7 +16107,7 @@ ha_rkey_function find_flag; key_part_map keypart_map; QUICK_RANGE *cur_range; - int result; + int result= HA_ERR_KEY_NOT_FOUND; DBUG_ASSERT(min_max_ranges.elements > 0); @@ -16088,10 +16116,11 @@ get_dynamic(&min_max_ranges, (uchar*)&cur_range, range_idx - 1); /* - If the current value for the min/max argument is smaller than the left - boundary of cur_range, there is no need to check this range. + If the key has already been "moved" by a successful call to + ha_index_read_map, and the current value for the max argument + comes before the range, there is no need to check this range. 
*/ - if (range_idx != min_max_ranges.elements && + if (!result && !(cur_range->flag & NO_MIN_RANGE) && (key_cmp(min_max_arg_part, (const uchar*) cur_range->min_key, min_max_arg_len) == -1)) diff -Nru mariadb-10.11.11/sql/opt_range.h mariadb-10.11.13/sql/opt_range.h --- mariadb-10.11.11/sql/opt_range.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/opt_range.h 2025-05-19 16:14:25.000000000 +0000 @@ -39,6 +39,32 @@ class JOIN; class Item_sum; +/* + When processing an OR clause with more than MAX_OR_ELEMENTS_FOR_INDEX_MERGE + disjuncts (i.e. OR-parts), do not construct index_merge plans from it. + + Some users have OR clauses with extremely large number of disjuncts, like: + + (key1=1 AND key2=10) OR + (key1=2 AND key2=20) OR + (key1=3 AND key2=30) OR + ... + + When processing this, the optimizer would try to build a lot of potential + index_merge plans. Hypothetically this could be useful as the cheapest plan + could be to pick a specific index for each disjunct and build: + + index_merge(key1 IN (1,3,8,15...), key2 IN (20, 40, 50 ...)) + + In practice this causes combinatorial amount of time to be spent in the range + analyzer, and most variants will be discarded when the range optimizer tries + to avoid this combinatorial explosion (which may or may not work depending on + the form of the WHERE clause). + In practice, very long ORs are served well enough by just considering range + accesses on individual indexes. +*/ +const int MAX_OR_ELEMENTS_FOR_INDEX_MERGE=100; + struct KEY_PART { uint16 key,part; /* See KEY_PART_INFO for meaning of the next two: */ @@ -889,6 +915,9 @@ */ bool remove_false_where_parts; + /* If TRUE, do not construct index_merge plans */ + bool disable_index_merge_plans; + /* Which functions should give SQL notes for unusable keys. 
*/ diff -Nru mariadb-10.11.11/sql/rpl_injector.h mariadb-10.11.13/sql/rpl_injector.h --- mariadb-10.11.11/sql/rpl_injector.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_injector.h 2025-05-19 16:14:25.000000000 +0000 @@ -146,7 +146,6 @@ }; transaction() : m_thd(NULL) { } - transaction(transaction const&); ~transaction(); /* Clear transaction, i.e., make calls to 'good()' return false. */ diff -Nru mariadb-10.11.11/sql/rpl_mi.cc mariadb-10.11.13/sql/rpl_mi.cc --- mariadb-10.11.11/sql/rpl_mi.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_mi.cc 2025-05-19 16:14:25.000000000 +0000 @@ -21,6 +21,7 @@ #include "slave.h" #include "strfunc.h" #include "sql_repl.h" +#include #ifdef HAVE_REPLICATION @@ -1369,27 +1370,21 @@ Sql_condition::enum_warning_level warning) { Master_info *mi; - char buff[MAX_CONNECTION_NAME+1], *res; - size_t buff_length; DBUG_ENTER("get_master_info"); DBUG_PRINT("enter", ("connection_name: '%.*s'", (int) connection_name->length, connection_name->str)); - /* Make name lower case for comparison */ - res= strmake(buff, connection_name->str, connection_name->length); - my_casedn_str(system_charset_info, buff); - buff_length= (size_t) (res-buff); - + if (!connection_name->str) + connection_name= &empty_clex_str; mi= (Master_info*) my_hash_search(&master_info_hash, - (uchar*) buff, buff_length); + (uchar*) connection_name->str, + connection_name->length); if (!mi && warning != Sql_condition::WARN_LEVEL_NOTE) { my_error(WARN_NO_MASTER_INFO, - MYF(warning == Sql_condition::WARN_LEVEL_WARN ? ME_WARNING : - 0), - (int) connection_name->length, - connection_name->str); + MYF(warning == Sql_condition::WARN_LEVEL_WARN ? 
ME_WARNING : 0), + (int) connection_name->length, connection_name->str); } DBUG_RETURN(mi); } @@ -2074,4 +2069,52 @@ DBUG_RETURN(result); } +void setup_mysql_connection_for_master(MYSQL *mysql, Master_info *mi, + uint timeout) +{ + DBUG_ASSERT(mi); + DBUG_ASSERT(mi->mysql); + mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, (char *) &timeout); + mysql_options(mysql, MYSQL_OPT_READ_TIMEOUT, (char *) &timeout); + +#ifdef HAVE_OPENSSL + if (mi->ssl) + { + mysql_ssl_set(mysql, + mi->ssl_key[0]?mi->ssl_key:0, + mi->ssl_cert[0]?mi->ssl_cert:0, + mi->ssl_ca[0]?mi->ssl_ca:0, + mi->ssl_capath[0]?mi->ssl_capath:0, + mi->ssl_cipher[0]?mi->ssl_cipher:0); + mysql_options(mysql, MYSQL_OPT_SSL_CRL, + mi->ssl_crl[0] ? mi->ssl_crl : 0); + mysql_options(mysql, MYSQL_OPT_SSL_CRLPATH, + mi->ssl_crlpath[0] ? mi->ssl_crlpath : 0); + mysql_options(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, + &mi->ssl_verify_server_cert); + } +#endif + + /* + If server's default charset is not supported (like utf16, utf32) as client + charset, then set client charset to 'latin1' (default client charset). + */ + if (is_supported_parser_charset(default_charset_info)) + mysql_options(mysql, MYSQL_SET_CHARSET_NAME, default_charset_info->cs_name.str); + else + { + sql_print_information("'%s' can not be used as client character set. 
" + "'%s' will be used as default client character set " + "while connecting to master.", + default_charset_info->cs_name.str, + default_client_charset_info->cs_name.str); + mysql_options(mysql, MYSQL_SET_CHARSET_NAME, + default_client_charset_info->cs_name.str); + } + + /* Set MYSQL_PLUGIN_DIR in case master asks for an external authentication plugin */ + if (opt_plugin_dir_ptr && *opt_plugin_dir_ptr) + mysql_options(mysql, MYSQL_PLUGIN_DIR, opt_plugin_dir_ptr); +} + #endif /* HAVE_REPLICATION */ diff -Nru mariadb-10.11.11/sql/rpl_mi.h mariadb-10.11.13/sql/rpl_mi.h --- mariadb-10.11.11/sql/rpl_mi.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_mi.h 2025-05-19 16:14:25.000000000 +0000 @@ -487,5 +487,16 @@ uint any_slave_sql_running(bool already_locked); bool give_error_if_slave_running(bool already_lock); +/* + Sets up the basic options for a MYSQL connection, mysql, to connect to the + primary server described by the Master_info parameter, mi. The timeout must + be passed explicitly, as different types of connections created by the slave + will use different values. + + Assumes mysql_init() has already been called on the mysql connection object. 
+*/ +void setup_mysql_connection_for_master(MYSQL *mysql, Master_info *mi, + uint timeout); + #endif /* HAVE_REPLICATION */ #endif /* RPL_MI_H */ diff -Nru mariadb-10.11.11/sql/rpl_parallel.cc mariadb-10.11.13/sql/rpl_parallel.cc --- mariadb-10.11.11/sql/rpl_parallel.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_parallel.cc 2025-05-19 16:14:25.000000000 +0000 @@ -124,8 +124,8 @@ else if (cmp == 0 && rli->group_master_log_pos < qev->future_event_master_log_pos) rli->group_master_log_pos= qev->future_event_master_log_pos; - mysql_mutex_unlock(&rli->data_lock); mysql_cond_broadcast(&rli->data_cond); + mysql_mutex_unlock(&rli->data_lock); } @@ -153,14 +153,12 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id, rpl_parallel_entry *entry, rpl_group_info *rgi) { - THD *thd= rpt->thd; - wait_for_commit *wfc= &rgi->commit_orderer; - int err; - if (rgi->get_finish_event_group_called()) return; - thd->get_stmt_da()->set_overwrite_status(true); + THD *thd= rpt->thd; + wait_for_commit *wfc= &rgi->commit_orderer; + int err; if (unlikely(rgi->worker_error)) { @@ -320,10 +318,6 @@ wait_for_pending_deadlock_kill(thd, rgi); thd->clear_error(); thd->reset_killed(); - /* - Would do thd->get_stmt_da()->set_overwrite_status(false) here, but - reset_diagnostics_area() already does that. 
- */ thd->get_stmt_da()->reset_diagnostics_area(); wfc->wakeup_subsequent_commits(rgi->worker_error); rgi->did_mark_start_commit= false; @@ -1597,9 +1591,7 @@ else { delete qev->ev; - thd->get_stmt_da()->set_overwrite_status(true); err= thd->wait_for_prior_commit(); - thd->get_stmt_da()->set_overwrite_status(false); } end_of_group= diff -Nru mariadb-10.11.11/sql/semisync_master.cc mariadb-10.11.13/sql/semisync_master.cc --- mariadb-10.11.11/sql/semisync_master.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/semisync_master.cc 2025-05-19 16:14:25.000000000 +0000 @@ -565,12 +565,14 @@ { lock(); DBUG_ASSERT(rpl_semi_sync_master_clients > 0); - if (!(--rpl_semi_sync_master_clients) && !rpl_semi_sync_master_wait_no_slave) + if (!(--rpl_semi_sync_master_clients) && !rpl_semi_sync_master_wait_no_slave + && get_master_enabled()) { /* Signal transactions waiting in commit_trx() that they do not have to wait anymore. */ + DBUG_ASSERT(m_active_tranxs); m_active_tranxs->clear_active_tranx_nodes(NULL, 0, signal_waiting_transaction); } diff -Nru mariadb-10.11.11/sql/semisync_slave.cc mariadb-10.11.13/sql/semisync_slave.cc --- mariadb-10.11.11/sql/semisync_slave.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/semisync_slave.cc 2025-05-19 16:14:25.000000000 +0000 @@ -141,7 +141,7 @@ DBUG_ASSERT(!debug_sync_set_action(mi->io_thd, STRING_WITH_LEN(act))); };); #endif - kill_connection(mi->mysql); + kill_connection(mi); } set_slave_enabled(0); @@ -158,8 +158,9 @@ } -void Repl_semi_sync_slave::kill_connection(MYSQL *mysql) +void Repl_semi_sync_slave::kill_connection(Master_info *mi) { + MYSQL *mysql= mi->mysql; if (!mysql) return; @@ -168,8 +169,8 @@ size_t kill_buffer_length; kill_mysql = mysql_init(kill_mysql); - mysql_options(kill_mysql, MYSQL_OPT_CONNECT_TIMEOUT, &m_kill_conn_timeout); - mysql_options(kill_mysql, MYSQL_OPT_READ_TIMEOUT, &m_kill_conn_timeout); + + setup_mysql_connection_for_master(kill_mysql, mi, m_kill_conn_timeout); 
mysql_options(kill_mysql, MYSQL_OPT_WRITE_TIMEOUT, &m_kill_conn_timeout); bool ret= (!mysql_real_connect(kill_mysql, mysql->host, diff -Nru mariadb-10.11.11/sql/semisync_slave.h mariadb-10.11.13/sql/semisync_slave.h --- mariadb-10.11.11/sql/semisync_slave.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/semisync_slave.h 2025-05-19 16:14:25.000000000 +0000 @@ -92,7 +92,7 @@ void slave_stop(Master_info *mi); void slave_reconnect(Master_info *mi); int request_transmit(Master_info *mi); - void kill_connection(MYSQL *mysql); + void kill_connection(Master_info *mi); private: /* True when init_object has been called */ diff -Nru mariadb-10.11.11/sql/signal_handler.cc mariadb-10.11.13/sql/signal_handler.cc --- mariadb-10.11.11/sql/signal_handler.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/signal_handler.cc 2025-05-19 16:14:25.000000000 +0000 @@ -277,7 +277,7 @@ my_safe_printf_stderr("Status: %s\n", kreason); my_safe_printf_stderr("Query (%p): ", thd->query()); my_safe_print_str(thd->query(), MY_MIN(65536U, thd->query_length())); - my_safe_printf_stderr("%s", "Optimizer switch: "); + my_safe_printf_stderr("%s", "\nOptimizer switch: "); ulonglong optsw= thd->variables.optimizer_switch; for (uint i= 0; optimizer_switch_names[i+1]; i++, optsw >>= 1) { diff -Nru mariadb-10.11.11/sql/slave.cc mariadb-10.11.13/sql/slave.cc --- mariadb-10.11.11/sql/slave.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/slave.cc 2025-05-19 16:14:25.000000000 +0000 @@ -3213,21 +3213,23 @@ if (full) protocol->store(mi->connection_name.str, mi->connection_name.length, &my_charset_bin); + mysql_mutex_lock(&mi->run_lock); + THD *sql_thd= mi->rli.sql_driver_thd; + const char *slave_sql_running_state= + sql_thd ? sql_thd->get_proc_info() : ""; + THD *io_thd= mi->io_thd; + const char *slave_io_running_state= io_thd ? 
io_thd->get_proc_info() : ""; + mysql_mutex_unlock(&mi->run_lock); + if (full) - { /* Show what the sql driver replication thread is doing This is only meaningful if there is only one slave thread. */ - msg= (mi->rli.sql_driver_thd ? - mi->rli.sql_driver_thd->get_proc_info() : ""); - protocol->store_string_or_null(msg, &my_charset_bin); - } - msg= mi->io_thd ? mi->io_thd->get_proc_info() : ""; - protocol->store_string_or_null(msg, &my_charset_bin); + protocol->store_string_or_null(slave_sql_running_state, &my_charset_bin); - mysql_mutex_unlock(&mi->run_lock); + protocol->store_string_or_null(slave_io_running_state, &my_charset_bin); mysql_mutex_lock(&mi->data_lock); mysql_mutex_lock(&mi->rli.data_lock); @@ -3401,10 +3403,6 @@ protocol->store((uint32) mi->rli.get_sql_delay()); // SQL_Remaining_Delay - // THD::proc_info is not protected by any lock, so we read it once - // to ensure that we use the same value throughout this function. - const char *slave_sql_running_state= - mi->rli.sql_driver_thd ? mi->rli.sql_driver_thd->proc_info : ""; if (slave_sql_running_state == stage_sql_thd_waiting_until_delay.m_name) { time_t t= my_time(0), sql_delay_end= mi->rli.get_sql_delay_end(); @@ -5485,6 +5483,7 @@ THD *thd; /* needs to be first for thread_stack */ char saved_log_name[FN_REFLEN]; char saved_master_log_name[FN_REFLEN]; + bool thd_initialized= 0; my_off_t UNINIT_VAR(saved_log_pos); my_off_t UNINIT_VAR(saved_master_log_pos); String saved_skip_gtid_pos; @@ -5587,6 +5586,7 @@ thd->variables.alter_algorithm= (ulong) Alter_info::ALTER_TABLE_ALGORITHM_DEFAULT; server_threads.insert(thd); + thd_initialized= 1; /* We are going to set slave_running to 1. 
Assuming slave I/O thread is alive and connected, this is going to make Seconds_Behind_Master be 0 @@ -5966,7 +5966,7 @@ } THD_STAGE_INFO(thd, stage_waiting_for_slave_mutex_on_exit); thd->add_status_to_global(); - server_threads.erase(thd); + THD_STAGE_INFO(thd, stage_slave_sql_cleanup); mysql_mutex_lock(&rli->run_lock); err_during_init: @@ -5980,9 +5980,9 @@ rli->relay_log.description_event_for_exec= 0; rli->reset_inuse_relaylog(); /* Wake up master_pos_wait() */ - mysql_mutex_unlock(&rli->data_lock); DBUG_PRINT("info",("Signaling possibly waiting master_pos_wait() functions")); mysql_cond_broadcast(&rli->data_cond); + mysql_mutex_unlock(&rli->data_lock); rli->ignore_log_space_limit= 0; /* don't need any lock */ /* we die so won't remember charset - re-update them on next thread start */ thd->system_thread_info.rpl_sql_info->cached_charset_invalidate(); @@ -6037,6 +6037,8 @@ rpl_parallel_resize_pool_if_no_slaves(); delete serial_rgi; + if (thd_initialized) + server_threads.erase(thd); delete thd; DBUG_LEAVE; // Must match DBUG_ENTER() @@ -7616,50 +7618,10 @@ if (opt_slave_compressed_protocol) client_flag|= CLIENT_COMPRESS; /* We will use compression */ - mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, (char *) &slave_net_timeout); - mysql_options(mysql, MYSQL_OPT_READ_TIMEOUT, (char *) &slave_net_timeout); + setup_mysql_connection_for_master(mi->mysql, mi, slave_net_timeout); mysql_options(mysql, MYSQL_OPT_USE_THREAD_SPECIFIC_MEMORY, (char*) &my_true); -#ifdef HAVE_OPENSSL - if (mi->ssl) - { - mysql_ssl_set(mysql, - mi->ssl_key[0]?mi->ssl_key:0, - mi->ssl_cert[0]?mi->ssl_cert:0, - mi->ssl_ca[0]?mi->ssl_ca:0, - mi->ssl_capath[0]?mi->ssl_capath:0, - mi->ssl_cipher[0]?mi->ssl_cipher:0); - mysql_options(mysql, MYSQL_OPT_SSL_CRL, - mi->ssl_crl[0] ? mi->ssl_crl : 0); - mysql_options(mysql, MYSQL_OPT_SSL_CRLPATH, - mi->ssl_crlpath[0] ? 
mi->ssl_crlpath : 0); - mysql_options(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, - &mi->ssl_verify_server_cert); - } -#endif - - /* - If server's default charset is not supported (like utf16, utf32) as client - charset, then set client charset to 'latin1' (default client charset). - */ - if (is_supported_parser_charset(default_charset_info)) - mysql_options(mysql, MYSQL_SET_CHARSET_NAME, default_charset_info->cs_name.str); - else - { - sql_print_information("'%s' can not be used as client character set. " - "'%s' will be used as default client character set " - "while connecting to master.", - default_charset_info->cs_name.str, - default_client_charset_info->cs_name.str); - mysql_options(mysql, MYSQL_SET_CHARSET_NAME, - default_client_charset_info->cs_name.str); - } - - /* Set MYSQL_PLUGIN_DIR in case master asks for an external authentication plugin */ - if (opt_plugin_dir_ptr && *opt_plugin_dir_ptr) - mysql_options(mysql, MYSQL_PLUGIN_DIR, opt_plugin_dir_ptr); - /* we disallow empty users */ if (mi->user[0] == 0) { diff -Nru mariadb-10.11.11/sql/sp_head.cc mariadb-10.11.13/sql/sp_head.cc --- mariadb-10.11.11/sql/sp_head.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sp_head.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1531,7 +1531,7 @@ thd->wsrep_cs().reset_error(); /* Reset also thd->killed if it has been set during BF abort. */ if (killed_mask_hard(thd->killed) == KILL_QUERY) - thd->killed= NOT_KILLED; + thd->reset_killed(); /* if failed transaction was not replayed, must return with error from here */ if (!must_replay) err_status = 1; } @@ -2552,6 +2552,16 @@ if (!spvar) DBUG_RETURN(FALSE); + if (!spvar->field_def.type_handler()->is_scalar_type() && + dynamic_cast(arg_item)) + { + // Item_param cannot store values of non-scalar data types yet + my_error(ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION, MYF(0), + spvar->field_def.type_handler()->name().ptr(), + "EXECUTE ... 
USING ?"); + DBUG_RETURN(true); + } + if (spvar->mode != sp_variable::MODE_IN) { Settable_routine_parameter *srp= diff -Nru mariadb-10.11.11/sql/sql_acl.cc mariadb-10.11.13/sql/sql_acl.cc --- mariadb-10.11.11/sql/sql_acl.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_acl.cc 2025-05-19 16:14:25.000000000 +0000 @@ -8433,19 +8433,13 @@ /* If sequence is used as part of NEXT VALUE, PREVIOUS VALUE or SELECT, - we need to modify the requested access rights depending on how the - sequence is used. + the privilege will be checked in ::fix_fields(). + Direct SELECT of a sequence table doesn't set t_ref->sequence, so + privileges will be checked normally, as for any table. */ if (t_ref->sequence && !(want_access & ~(SELECT_ACL | INSERT_ACL | UPDATE_ACL | DELETE_ACL))) - { - /* - We want to have either SELECT or INSERT rights to sequences depending - on how they are accessed - */ - orig_want_access= ((t_ref->lock_type >= TL_FIRST_WRITE) ? - INSERT_ACL : SELECT_ACL); - } + continue; const ACL_internal_table_access *access= get_cached_table_access(&t_ref->grant.m_internal, @@ -13111,6 +13105,9 @@ return dup; } + if (!initialized) + return dup; + if (lock) mysql_mutex_lock(&acl_cache->lock); if (find_acl_role(dup->user.str, false)) diff -Nru mariadb-10.11.11/sql/sql_base.cc mariadb-10.11.13/sql/sql_base.cc --- mariadb-10.11.11/sql/sql_base.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_base.cc 2025-05-19 16:14:25.000000000 +0000 @@ -19,6 +19,7 @@ #include "mariadb.h" #include "sql_base.h" // setup_table_map +#include "sql_list.h" #include "sql_priv.h" #include "unireg.h" #include "debug_sync.h" @@ -781,6 +782,7 @@ } } +#ifdef DBUG_ASSERT_EXISTS static inline bool check_field_pointers(const TABLE *table) { for (Field **pf= table->field; *pf; pf++) @@ -796,6 +798,7 @@ } return true; } +#endif int close_thread_tables_for_query(THD *thd) @@ -1173,7 +1176,6 @@ t_name= &table->table_name; t_alias= &table->alias; -retry: DBUG_PRINT("info", 
("real table: %s.%s", d_name->str, t_name->str)); for (TABLE_LIST *tl= table_list; tl ; tl= tl->next_global, res= 0) { @@ -1235,28 +1237,53 @@ DBUG_PRINT("info", ("found same copy of table or table which we should skip")); } - if (res && res->belong_to_derived) - { - /* - We come here for queries of type: - INSERT INTO t1 (SELECT tmp.a FROM (select * FROM t1) as tmp); + DBUG_RETURN(res); +} - Try to fix by materializing the derived table - */ - TABLE_LIST *derived= res->belong_to_derived; - if (derived->is_merged_derived() && !derived->derived->is_excluded()) + +TABLE_LIST* unique_table_in_select_list(THD *thd, TABLE_LIST *table, SELECT_LEX *sel) +{ + subselect_table_finder_param param= {thd, table, NULL}; + List_iterator_fast it(sel->item_list); + Item *item; + while ((item= it++)) + { + if (item->walk(&Item::subselect_table_finder_processor, FALSE, ¶m)) { - DBUG_PRINT("info", - ("convert merged to materialization to resolve the conflict")); - derived->change_refs_to_fields(); - derived->set_materialized_derived(); - goto retry; + if (param.dup == NULL) + return ERROR_TABLE; + return param.dup; } + DBUG_ASSERT(param.dup == NULL); } - DBUG_RETURN(res); + return NULL; } +typedef TABLE_LIST* (*find_table_callback)(THD *thd, TABLE_LIST *table, + TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel); + +static +TABLE_LIST* +find_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel, find_table_callback callback ); + +TABLE_LIST* unique_table_callback(THD *thd, TABLE_LIST *table, + TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel) +{ + return find_dup_table(thd, table, table_list, check_flag); +} + + +TABLE_LIST* unique_in_sel_table_callback(THD *thd, TABLE_LIST *table, + TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel) +{ + return unique_table_in_select_list(thd, table, sel); +} + /** Test that the subject table of INSERT/UPDATE/DELETE/CREATE or (in case of MyISAMMRG) one of its children are not used 
later @@ -1276,6 +1303,25 @@ unique_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, uint check_flag) { + return find_table(thd, table, table_list, check_flag, NULL, + &unique_table_callback); +} + + +TABLE_LIST* +unique_table_in_insert_returning_subselect(THD *thd, TABLE_LIST *table, SELECT_LEX *sel) +{ + return find_table(thd, table, NULL, 0, sel, + &unique_in_sel_table_callback); + +} + + +static +TABLE_LIST* +find_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel, find_table_callback callback ) +{ TABLE_LIST *dup; table= table->find_table_for_update(); @@ -1306,12 +1352,12 @@ if (!tmp_parent) break; - if ((dup= find_dup_table(thd, child, child->next_global, check_flag))) + if ((dup= (*callback)(thd, child, child->next_global, check_flag, sel))) break; } } else - dup= find_dup_table(thd, table, table_list, check_flag); + dup= (*callback)(thd, table, table_list, check_flag, sel); return dup; } @@ -4561,6 +4607,7 @@ } thd->current_tablenr= 0; + sroutine_to_open= &thd->lex->sroutines_list.first; restart: /* @@ -4576,7 +4623,6 @@ has_prelocking_list= thd->lex->requires_prelocking(); table_to_open= start; - sroutine_to_open= &thd->lex->sroutines_list.first; *counter= 0; THD_STAGE_INFO(thd, stage_opening_tables); prelocking_strategy->reset(thd); @@ -4673,7 +4719,7 @@ elements from the table list (if MERGE tables are involved), */ close_tables_for_reopen(thd, start, ot_ctx.start_of_statement_svp(), - ot_ctx.remove_implicitly_used_deps()); + false); /* Here we rely on the fact that 'tables' still points to the valid @@ -4741,10 +4787,9 @@ /* F.ex. 
deadlock happened */ if (ot_ctx.can_recover_from_failed_open()) { - DBUG_ASSERT(ot_ctx.remove_implicitly_used_deps()); close_tables_for_reopen(thd, start, ot_ctx.start_of_statement_svp(), - ot_ctx.remove_implicitly_used_deps()); + true); if (ot_ctx.recover_from_failed_open()) goto error; @@ -4753,6 +4798,7 @@ goto error; error= FALSE; + sroutine_to_open= &thd->lex->sroutines_list.first; goto restart; } /* @@ -6034,19 +6080,19 @@ trying to reopen tables. NULL if no metadata locks were held and thus all metadata locks should be released. - @param[in] remove_implicit_deps True in case routines and tables implicitly + @param[in] remove_indirect True in case routines and tables implicitly used by a statement should be removed. */ void close_tables_for_reopen(THD *thd, TABLE_LIST **tables, const MDL_savepoint &start_of_statement_svp, - bool remove_implicit_deps) + bool remove_indirect) { - TABLE_LIST *first_not_own_table= thd->lex->first_not_own_table(); TABLE_LIST *tmp; - if (remove_implicit_deps) + if (remove_indirect) { + TABLE_LIST *first_not_own_table= thd->lex->first_not_own_table(); /* If table list consists only from tables from prelocking set, table list for new attempt should be empty, so we have to update list's root pointer. @@ -7412,82 +7458,83 @@ if (!found) continue; // No matching field + /* Restore field_2 to point to the field which was a match for field_1. */ + field_2= nj_col_2->field(); + /* field_1 and field_2 have the same names. Check if they are in the USING clause (if present), mark them as common fields, and add a new equi-join condition to the ON clause. */ - if (nj_col_2) - { - /* - Create non-fixed fully qualified field and let fix_fields to - resolve it. 
- */ - Item *item_1= nj_col_1->create_item(thd); - Item *item_2= nj_col_2->create_item(thd); - Item_ident *item_ident_1, *item_ident_2; - Item_func_eq *eq_cond; - if (!item_1 || !item_2) - goto err; // out of memory + /* + Create non-fixed fully qualified field and let fix_fields to + resolve it. + */ + Item *item_1= nj_col_1->create_item(thd); + Item *item_2= nj_col_2->create_item(thd); + Item_ident *item_ident_1, *item_ident_2; + Item_func_eq *eq_cond; - /* - The following assert checks that the two created items are of - type Item_ident. - */ - DBUG_ASSERT(!thd->lex->current_select->no_wrap_view_item); - /* - In the case of no_wrap_view_item == 0, the created items must be - of sub-classes of Item_ident. - */ - DBUG_ASSERT(item_1->type() == Item::FIELD_ITEM || - item_1->type() == Item::REF_ITEM); - DBUG_ASSERT(item_2->type() == Item::FIELD_ITEM || - item_2->type() == Item::REF_ITEM); + if (!item_1 || !item_2) + goto err; // out of memory - /* - We need to cast item_1,2 to Item_ident, because we need to hook name - resolution contexts specific to each item. - */ - item_ident_1= (Item_ident*) item_1; - item_ident_2= (Item_ident*) item_2; - /* - Create and hook special name resolution contexts to each item in the - new join condition . We need this to both speed-up subsequent name - resolution of these items, and to enable proper name resolution of - the items during the execute phase of PS. - */ - if (set_new_item_local_context(thd, item_ident_1, nj_col_1->table_ref) || - set_new_item_local_context(thd, item_ident_2, nj_col_2->table_ref)) - goto err; + /* + The following assert checks that the two created items are of + type Item_ident. + */ + DBUG_ASSERT(!thd->lex->current_select->no_wrap_view_item); + /* + In the case of no_wrap_view_item == 0, the created items must be + of sub-classes of Item_ident. 
+ */ + DBUG_ASSERT(item_1->type() == Item::FIELD_ITEM || + item_1->type() == Item::REF_ITEM); + DBUG_ASSERT(item_2->type() == Item::FIELD_ITEM || + item_2->type() == Item::REF_ITEM); - if (!(eq_cond= new (thd->mem_root) Item_func_eq(thd, item_ident_1, item_ident_2))) - goto err; /* Out of memory. */ + /* + We need to cast item_1,2 to Item_ident, because we need to hook name + resolution contexts specific to each item. + */ + item_ident_1= (Item_ident*) item_1; + item_ident_2= (Item_ident*) item_2; + /* + Create and hook special name resolution contexts to each item in the + new join condition . We need this to both speed-up subsequent name + resolution of these items, and to enable proper name resolution of + the items during the execute phase of PS. + */ + if (set_new_item_local_context(thd, item_ident_1, nj_col_1->table_ref) || + set_new_item_local_context(thd, item_ident_2, nj_col_2->table_ref)) + goto err; - /* - Add the new equi-join condition to the ON clause. Notice that - fix_fields() is applied to all ON conditions in setup_conds() - so we don't do it here. - */ - add_join_on(thd, (table_ref_1->outer_join & JOIN_TYPE_RIGHT ? - table_ref_1 : table_ref_2), - eq_cond); - - nj_col_1->is_common= nj_col_2->is_common= TRUE; - DBUG_PRINT ("info", ("%s.%s and %s.%s are common", - nj_col_1->safe_table_name(), - nj_col_1->name()->str, - nj_col_2->safe_table_name(), - nj_col_2->name()->str)); - - if (field_1) - update_field_dependencies(thd, field_1, field_1->table); - if (field_2) - update_field_dependencies(thd, field_2, field_2->table); + if (!(eq_cond= new (thd->mem_root) Item_func_eq(thd, item_ident_1, item_ident_2))) + goto err; /* Out of memory. */ - if (using_fields != NULL) - ++(*found_using_fields); - } + /* + Add the new equi-join condition to the ON clause. Notice that + fix_fields() is applied to all ON conditions in setup_conds() + so we don't do it here. + */ + add_join_on(thd, (table_ref_1->outer_join & JOIN_TYPE_RIGHT ? 
+ table_ref_1 : table_ref_2), + eq_cond); + + nj_col_1->is_common= nj_col_2->is_common= TRUE; + DBUG_PRINT ("info", ("%s.%s and %s.%s are common", + nj_col_1->safe_table_name(), + nj_col_1->name()->str, + nj_col_2->safe_table_name(), + nj_col_2->name()->str)); + + if (field_1) + update_field_dependencies(thd, field_1, field_1->table); + if (field_2) + update_field_dependencies(thd, field_2, field_2->table); + + if (using_fields != NULL) + ++(*found_using_fields); } if (leaf_1) leaf_1->is_join_columns_complete= TRUE; @@ -8392,7 +8439,7 @@ if (table_list->belong_to_view && !table_list->view && check_single_table_access(thd, access, table_list, FALSE)) { - tables->hide_view_error(thd); + tables->replace_view_error_with_generic(thd); DBUG_RETURN(TRUE); } access= want_access; @@ -8897,14 +8944,15 @@ } -static void unwind_stored_field_offsets(const List &fields, Field *end) +static void unwind_stored_field_offsets(const List &fields, Item_field *end) { - for (Item &item_field: fields) + for (Item &item: fields) { - Field *f= item_field.field_for_view_update()->field; - if (f == end) + Item_field *item_field= item.field_for_view_update(); + if (item_field == end) break; + Field *f= item_field->field; if (f->stored_in_db()) { TABLE *table= f->table; @@ -8948,7 +8996,7 @@ { List_iterator_fast f(fields),v(values); Item *value, *fld; - Item_field *field; + Item_field *field= NULL; Field *rfield; TABLE *table; bool only_unvers_fields= update && table_arg->versioned(); @@ -8966,11 +9014,8 @@ while ((fld= f++)) { - if (!(field= fld->field_for_view_update())) - { - my_error(ER_NONUPDATEABLE_COLUMN, MYF(0), fld->name.str); - goto err_unwind_fields; - } + field= fld->field_for_view_update(); + DBUG_ASSERT(field); // ensured by check_fields or check_view_insertability. 
value=v++; DBUG_ASSERT(value); rfield= field->field; @@ -9038,7 +9083,7 @@ DBUG_RETURN(thd->is_error()); err_unwind_fields: if (update && thd->variables.sql_mode & MODE_SIMULTANEOUS_ASSIGNMENT) - unwind_stored_field_offsets(fields, rfield); + unwind_stored_field_offsets(fields, field); err: DBUG_PRINT("error",("got error")); thd->abort_on_warning= save_abort_on_warning; @@ -9407,9 +9452,11 @@ memcpy(path_copy, path, path_len - ext_len); path_copy[path_len - ext_len]= 0; init_tmp_table_share(thd, &share, "", 0, "", path_copy); - handlerton *ht= share.db_type(); if (!open_table_def(thd, &share)) - ht->drop_table(share.db_type(), path_copy); + { + handlerton *ht= share.db_type(); + ht->drop_table(ht, path_copy); + } free_table_share(&share); } /* diff -Nru mariadb-10.11.11/sql/sql_base.h mariadb-10.11.13/sql/sql_base.h --- mariadb-10.11.11/sql/sql_base.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_base.h 2025-05-19 16:14:25.000000000 +0000 @@ -157,7 +157,7 @@ my_bool mysql_rm_tmp_tables(void); void close_tables_for_reopen(THD *thd, TABLE_LIST **tables, const MDL_savepoint &start_of_statement_svp, - bool remove_implicit_dependencies); + bool remove_indirect); bool table_already_fk_prelocked(TABLE_LIST *tl, LEX_CSTRING *db, LEX_CSTRING *table, thr_lock_type lock_type); TABLE_LIST *find_table_in_list(TABLE_LIST *table, @@ -296,6 +296,8 @@ bool lock_tables(THD *thd, TABLE_LIST *tables, uint counter, uint flags); int decide_logging_format(THD *thd, TABLE_LIST *tables); void close_thread_table(THD *thd, TABLE **table_ptr); +TABLE_LIST* +unique_table_in_insert_returning_subselect(THD *thd, TABLE_LIST *table, SELECT_LEX *sel); TABLE_LIST *unique_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, uint check_flag); bool is_equal(const LEX_CSTRING *a, const LEX_CSTRING *b); @@ -568,23 +570,6 @@ return m_timeout; } - /** - Return true in case tables and routines the statement implicilty - dependent on should be removed, else return false. 
- - @note The use case when routines and tables the statement implicitly - dependent on shouldn't be removed is the one when a new partition be - created on handling the INSERT statement against a versioning partitioned - table. For this case re-opening a versioning table would result in adding - implicitly dependent routines (e.g. table's triggers) that lead to - allocation of memory on PS mem_root and so leaking a memory until the PS - statement be deallocated. - */ - bool remove_implicitly_used_deps() const - { - return m_action != OT_ADD_HISTORY_PARTITION; - } - uint get_flags() const { return m_flags; } /** diff -Nru mariadb-10.11.11/sql/sql_cache.cc mariadb-10.11.13/sql/sql_cache.cc --- mariadb-10.11.11/sql/sql_cache.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_cache.cc 2025-05-19 16:14:25.000000000 +0000 @@ -3553,6 +3553,7 @@ if (table_block == 0) { DBUG_PRINT("qcache", ("Can't write table name to cache")); + node->parent= NULL; DBUG_RETURN(0); } Query_cache_table *header= table_block->table(); @@ -3576,6 +3577,7 @@ DBUG_PRINT("qcache", ("Can't insert table to hash")); // write_block_data return locked block free_memory_block(table_block); + node->parent= NULL; DBUG_RETURN(0); } char *db= header->db(); diff -Nru mariadb-10.11.11/sql/sql_class.cc mariadb-10.11.13/sql/sql_class.cc --- mariadb-10.11.11/sql/sql_class.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_class.cc 2025-05-19 16:14:25.000000000 +0000 @@ -8381,6 +8381,24 @@ } +void +wait_for_commit::prior_commit_error(THD *thd) +{ + /* + Only raise a "prior commit failed" error if we didn't already raise + an error. + + The ER_PRIOR_COMMIT_FAILED is just an internal mechanism to ensure that a + transaction does not commit successfully if a prior commit failed, so that + the parallel replication worker threads stop in an orderly fashion when + one of them get an error. 
Thus, if another worker already got another real + error, overriding it with ER_PRIOR_COMMIT_FAILED is not useful. + */ + if (!thd->get_stmt_da()->is_set()) + my_error(ER_PRIOR_COMMIT_FAILED, MYF(0)); +} + + /* Wakeup anyone waiting for us to have committed. diff -Nru mariadb-10.11.11/sql/sql_class.h mariadb-10.11.13/sql/sql_class.h --- mariadb-10.11.11/sql/sql_class.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_class.h 2025-05-19 16:14:25.000000000 +0000 @@ -2383,8 +2383,8 @@ return wait_for_prior_commit2(thd, allow_kill); else { - if (wakeup_error) - my_error(ER_PRIOR_COMMIT_FAILED, MYF(0)); + if (unlikely(wakeup_error)) + prior_commit_error(thd); return wakeup_error; } } @@ -2435,6 +2435,7 @@ void wakeup(int wakeup_error); int wait_for_prior_commit2(THD *thd, bool allow_kill); + void prior_commit_error(THD *thd); void wakeup_subsequent_commits2(int wakeup_error); void unregister_wait_for_prior_commit2(); diff -Nru mariadb-10.11.11/sql/sql_cmd.h mariadb-10.11.13/sql/sql_cmd.h --- mariadb-10.11.11/sql/sql_cmd.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_cmd.h 2025-05-19 16:14:25.000000000 +0000 @@ -141,6 +141,7 @@ handlerton **ha, bool tmp_table); bool is_set() { return m_storage_engine_name.str != NULL; } + const LEX_CSTRING *name() const { return &m_storage_engine_name; } }; diff -Nru mariadb-10.11.11/sql/sql_db.cc mariadb-10.11.13/sql/sql_db.cc --- mariadb-10.11.11/sql/sql_db.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_db.cc 2025-05-19 16:14:25.000000000 +0000 @@ -536,36 +536,53 @@ DESCRIPTION + create->default_table_charset is guaranteed to be alway set + Required by some callers + RETURN VALUES 0 File found - 1 No database file or could not open it - + -1 No database file (file was not found or 'empty' file was cached) + 1 Could not open it */ -bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create) +int load_db_opt(THD *thd, const char *path, Schema_specification_st 
*create) { File file; char buf[256+DATABASE_COMMENT_MAXLEN]; DBUG_ENTER("load_db_opt"); - bool error=1; + int error= 0; size_t nbytes; myf utf8_flag= thd->get_utf8_flag(); bzero((char*) create,sizeof(*create)); - create->default_table_charset= thd->variables.collation_server; /* Check if options for this database are already in the hash */ if (!get_dbopt(thd, path, create)) - DBUG_RETURN(0); + { + if (!create->default_table_charset) + error= -1; // db.opt did not exists + goto err1; + } /* Otherwise, load options from the .opt file */ if ((file= mysql_file_open(key_file_dbopt, path, O_RDONLY | O_SHARE, MYF(0))) < 0) + { + /* + Create an empty entry, to avoid doing an extra file open for every create + table. + */ + put_dbopt(path, create); + error= -1; goto err1; + } IO_CACHE cache; if (init_io_cache(&cache, file, IO_SIZE, READ_CACHE, 0, 0, MYF(0))) - goto err2; + { + error= 1; + goto err2; // Not cached + } while ((int) (nbytes= my_b_gets(&cache, (char*) buf, sizeof(buf))) > 0) { @@ -586,7 +603,7 @@ default-collation commands. */ if (!(create->default_table_charset= - get_charset_by_csname(pos+1, MY_CS_PRIMARY, MYF(utf8_flag))) && + get_charset_by_csname(pos+1, MY_CS_PRIMARY, MYF(utf8_flag))) && !(create->default_table_charset= get_charset_by_name(pos+1, MYF(utf8_flag)))) { @@ -621,10 +638,11 @@ err2: mysql_file_close(file, MYF(0)); err1: + if (!create->default_table_charset) // In case of error + create->default_table_charset= thd->variables.collation_server; DBUG_RETURN(error); } - /* Retrieve database options by name. Load database options file or fetch from cache. @@ -651,11 +669,12 @@ db_create_info right after that. RETURN VALUES (read NOTE!) 
- FALSE Success - TRUE Failed to retrieve options + 0 File found + -1 No database file (file was not found or 'empty' file was cached) + 1 Could not open it */ -bool load_db_opt_by_name(THD *thd, const char *db_name, +int load_db_opt_by_name(THD *thd, const char *db_name, Schema_specification_st *db_create_info) { char db_opt_path[FN_REFLEN + 1]; @@ -1951,8 +1970,7 @@ build_table_filename(path, sizeof(path)-1, old_db->str, "", MY_DB_OPT_FILE, 0); - if ((load_db_opt(thd, path, &create_info))) - create_info.default_table_charset= thd->variables.collation_server; + load_db_opt(thd, path, &create_info); length= build_table_filename(path, sizeof(path)-1, old_db->str, "", "", 0); if (length && path[length-1] == FN_LIBCHAR) diff -Nru mariadb-10.11.11/sql/sql_db.h mariadb-10.11.13/sql/sql_db.h --- mariadb-10.11.11/sql/sql_db.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_db.h 2025-05-19 16:14:25.000000000 +0000 @@ -37,8 +37,8 @@ bool my_dboptions_cache_init(void); void my_dboptions_cache_free(void); bool check_db_dir_existence(const char *db_name); -bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create); -bool load_db_opt_by_name(THD *thd, const char *db_name, +int load_db_opt(THD *thd, const char *path, Schema_specification_st *create); +int load_db_opt_by_name(THD *thd, const char *db_name, Schema_specification_st *db_create_info); CHARSET_INFO *get_default_db_collation(THD *thd, const char *db_name); bool my_dbopt_init(void); diff -Nru mariadb-10.11.11/sql/sql_error.cc mariadb-10.11.13/sql/sql_error.cc --- mariadb-10.11.11/sql/sql_error.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_error.cc 2025-05-19 16:14:25.000000000 +0000 @@ -318,18 +318,16 @@ #endif get_warning_info()->clear_error_condition(); set_is_sent(false); - /** Tiny reset in debug mode to see garbage right away */ - if (!is_bulk_op()) - /* - For BULK DML operations (e.g. UPDATE) the data member m_status - has the value DA_OK_BULK. 
Keep this value in order to handle - m_affected_rows, m_statement_warn_count in correct way. Else, - the number of rows and the number of warnings affected by - the last statement executed as part of a trigger fired by the dml - (e.g. UPDATE statement fires a trigger on AFTER UPDATE) would counts - rows modified by trigger's statement. - */ - m_status= DA_EMPTY; + /* + For BULK DML operations (e.g. UPDATE) the data member m_status + has the value DA_OK_BULK. Keep this value in order to handle + m_affected_rows, m_statement_warn_count in correct way. Else, + the number of rows and the number of warnings affected by + the last statement executed as part of a trigger fired by the dml + (e.g. UPDATE statement fires a trigger on AFTER UPDATE) would counts + rows modified by trigger's statement. + */ + m_status= is_bulk_op() ? DA_OK_BULK : DA_EMPTY; DBUG_VOID_RETURN; } diff -Nru mariadb-10.11.11/sql/sql_insert.cc mariadb-10.11.13/sql/sql_insert.cc --- mariadb-10.11.11/sql/sql_insert.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_insert.cc 2025-05-19 16:14:25.000000000 +0000 @@ -57,6 +57,7 @@ */ #include "mariadb.h" /* NO_EMBEDDED_ACCESS_CHECKS */ +#include "sql_list.h" #include "sql_priv.h" #include "sql_insert.h" #include "sql_update.h" // compare_record @@ -728,6 +729,8 @@ Name_resolution_context_state ctx_state; SELECT_LEX *returning= thd->lex->has_returning() ? 
thd->lex->returning() : 0; unsigned char *readbuff= NULL; + List insert_values_cache; + bool cache_insert_values= FALSE; #ifndef EMBEDDED_LIBRARY char *query= thd->query(); @@ -785,7 +788,7 @@ if ((res= mysql_prepare_insert(thd, table_list, fields, values, update_fields, update_values, duplic, ignore, - &unused_conds, FALSE))) + &unused_conds, FALSE, &cache_insert_values))) { retval= thd->is_error(); if (res < 0) @@ -1033,8 +1036,41 @@ if (returning) fix_rownum_pointers(thd, thd->lex->returning(), &info.accepted_rows); + if (cache_insert_values) + { + insert_values_cache.empty(); + while ((values= its++)) + { + List *caches= new (thd->mem_root) List_item; + List_iterator_fast iv(*values); + Item *item; + if (caches == 0) + { + error= 1; + goto values_loop_end; + } + caches->empty(); + while((item= iv++)) + { + Item_cache *cache= item->get_cache(thd); + if (!cache) + { + error= 1; + goto values_loop_end; + } + cache->setup(thd, item); + caches->push_back(cache); + } + insert_values_cache.push_back(caches); + } + its.rewind(); + } + do { + List_iterator_fast itc(insert_values_cache); + List_iterator_fast *itr; + DBUG_PRINT("info", ("iteration %llu", iteration)); if (iteration && bulk_parameters_set(thd)) { @@ -1042,7 +1078,24 @@ goto values_loop_end; } - while ((values= its++)) + if (cache_insert_values) + { + List_item *caches; + while ((caches= itc++)) + { + List_iterator_fast ic(*caches); + Item_cache *cache; + while((cache= (Item_cache*) ic++)) + { + cache->cache_value(); + } + } + itc.rewind(); + itr= &itc; + } + else + itr= &its; + while ((values= (*itr)++)) { thd->get_stmt_da()->inc_current_row_for_warning(); if (fields.elements || !value_count) @@ -1146,7 +1199,7 @@ break; info.accepted_rows++; } - its.rewind(); + itr->rewind(); iteration++; } while (bulk_parameters_iterations(thd)); @@ -1657,6 +1710,7 @@ table_list Global/local table list where Where clause (for insert ... select) select_insert TRUE if INSERT ... 
SELECT statement + cache_insert_values insert's VALUES(...) has to be pre-computed TODO (in far future) In cases of: @@ -1679,7 +1733,7 @@ List &update_fields, List &update_values, enum_duplicates duplic, bool ignore, COND **where, - bool select_insert) + bool select_insert, bool * const cache_insert_values) { SELECT_LEX *select_lex= thd->lex->first_select_lex(); Name_resolution_context *context= &select_lex->context; @@ -1783,6 +1837,15 @@ thd->vers_insert_history(row_start); // check privileges } + /* + Check if we read from the same table we're inserting into. + Queries like INSERT INTO t1 VALUES ((SELECT ... FROM t1...)) have + to pre-compute the VALUES part. + Reading from the same table in the RETURNING clause is not allowed. + + INSERT...SELECT detects this case in select_insert::prepare and also + uses buffering to handle it correcly. + */ if (!select_insert) { Item *fake_conds= 0; @@ -1790,10 +1853,30 @@ if ((duplicate= unique_table(thd, table_list, table_list->next_global, CHECK_DUP_ALLOW_DIFFERENT_ALIAS))) { - update_non_unique_table_error(table_list, "INSERT", duplicate); - DBUG_RETURN(1); + /* + This is INSERT INTO ... VALUES (...) and it must pre-compute the + values to be inserted. 
+ */ + (*cache_insert_values)= true; } + else + (*cache_insert_values)= false; + select_lex->fix_prepare_information(thd, &fake_conds, &fake_conds); + + if ((*cache_insert_values) && thd->lex->has_returning()) + { + // Check if the table we're inserting into is also in RETURNING clause + TABLE_LIST *dup= + unique_table_in_insert_returning_subselect(thd, table_list, + thd->lex->returning()); + if (dup) + { + if (dup != ERROR_TABLE) + update_non_unique_table_error(table_list, "INSERT", duplicate); + DBUG_RETURN(1); + } + } } /* Only call prepare_for_posistion() if we are not performing a DELAYED @@ -3930,6 +4013,7 @@ int res; LEX *lex= thd->lex; SELECT_LEX *select_lex= lex->first_select_lex(); + bool cache_insert_values= false; DBUG_ENTER("mysql_insert_select_prepare"); /* @@ -3940,7 +4024,7 @@ if ((res= mysql_prepare_insert(thd, lex->query_tables, lex->field_list, 0, lex->update_list, lex->value_list, lex->duplicates, lex->ignore, - &select_lex->where, TRUE))) + &select_lex->where, TRUE, &cache_insert_values))) DBUG_RETURN(res); /* @@ -4227,6 +4311,7 @@ int select_insert::prepare2(JOIN *) { DBUG_ENTER("select_insert::prepare2"); + switch_to_nullable_trigger_fields(*fields, table); if (table->validate_default_values_of_unset_fields(thd)) DBUG_RETURN(1); if (thd->lex->describe) @@ -4348,7 +4433,11 @@ bool select_insert::prepare_eof() { int error; - bool const trans_table= table->file->has_transactions_and_rollback(); + // make sure any ROW format pending event is logged in the same binlog cache + bool const trans_table= (thd->is_current_stmt_binlog_format_row() && + table->file->row_logging) ? 
+ table->file->row_logging_has_trans : + table->file->has_transactions_and_rollback(); bool changed; bool binary_logged= 0; killed_state killed_status= thd->killed; @@ -4527,7 +4616,7 @@ table->file->ha_rnd_end(); table->file->extra(HA_EXTRA_NO_IGNORE_DUP_KEY); table->file->extra(HA_EXTRA_WRITE_CANNOT_REPLACE); - + table->file->extra(HA_EXTRA_ABORT_ALTER_COPY); /* If at least one row has been inserted/modified and will stay in the table (the table doesn't have transactions) we must write to @@ -4573,7 +4662,8 @@ query_cache_invalidate3(thd, table, 1); } DBUG_ASSERT(transactional_table || !changed || - thd->transaction->stmt.modified_non_trans_table); + (thd->transaction->stmt.modified_non_trans_table || + thd->transaction->all.modified_non_trans_table)); table->s->table_creation_was_logged|= binary_logged; table->file->ha_release_auto_increment(); @@ -5266,9 +5356,14 @@ /* Remember xid's for the case of row based logging */ ddl_log_update_xid(&ddl_log_state_create, thd->binlog_xid); ddl_log_update_xid(&ddl_log_state_rm, thd->binlog_xid); - trans_commit_stmt(thd); - if (!(thd->variables.option_bits & OPTION_GTID_BEGIN)) - trans_commit_implicit(thd); + if (trans_commit_stmt(thd) || + (!(thd->variables.option_bits & OPTION_GTID_BEGIN) && + trans_commit_implicit(thd))) + { + abort_result_set(); + DBUG_RETURN(true); + } + thd->binlog_xid= 0; #ifdef WITH_WSREP @@ -5388,7 +5483,13 @@ /* possible error of writing binary log is ignored deliberately */ (void) thd->binlog_flush_pending_rows_event(TRUE, TRUE); + /* + In the error case, we remove any partially created table. So clear any + incident event generates due to cache error, as it no longer relevant. 
+ */ + binlog_clear_incident(thd); + bool drop_table_was_logged= false; if (table) { bool tmp_table= table->s->tmp_table; @@ -5435,6 +5536,7 @@ create_info->db_type == partition_hton, &create_info->tabledef_version, tmp_table); + drop_table_was_logged= true; debug_crash_here("ddl_log_create_after_binlog"); thd->binlog_xid= 0; } @@ -5459,8 +5561,21 @@ if (create_info->table_was_deleted) { - /* Unlock locked table that was dropped by CREATE. */ - (void) trans_rollback_stmt(thd); + if (drop_table_was_logged) + { + /* for DROP binlogging the error status has to be canceled first */ + Diagnostics_area new_stmt_da(thd->query_id, false, true); + Diagnostics_area *old_stmt_da= thd->get_stmt_da(); + + thd->set_stmt_da(&new_stmt_da); + (void) trans_rollback_stmt(thd); + thd->set_stmt_da(old_stmt_da); + } + else + { + /* Unlock locked table that was dropped by CREATE. */ + (void) trans_rollback_stmt(thd); + } thd->locked_tables_list.unlock_locked_table(thd, create_info->mdl_ticket); } diff -Nru mariadb-10.11.11/sql/sql_insert.h mariadb-10.11.13/sql/sql_insert.h --- mariadb-10.11.11/sql/sql_insert.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_insert.h 2025-05-19 16:14:25.000000000 +0000 @@ -28,7 +28,7 @@ List &update_fields, List &update_values, enum_duplicates duplic, bool ignore, - COND **where, bool select_insert); + COND **where, bool select_insert, bool * const cache_results); bool mysql_insert(THD *thd,TABLE_LIST *table,List &fields, List &values, List &update_fields, List &update_values, enum_duplicates flag, diff -Nru mariadb-10.11.11/sql/sql_lex.cc mariadb-10.11.13/sql/sql_lex.cc --- mariadb-10.11.11/sql/sql_lex.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_lex.cc 2025-05-19 16:14:25.000000000 +0000 @@ -11179,7 +11179,8 @@ Field_pair *get_corresponding_field_pair(Item *item, List pair_list) { - DBUG_ASSERT(item->type() == Item::FIELD_ITEM || + DBUG_ASSERT(item->type() == Item::DEFAULT_VALUE_ITEM || + item->type() == 
Item::FIELD_ITEM || (item->type() == Item::REF_ITEM && ((((Item_ref *) item)->ref_type() == Item_ref::VIEW_REF) || (((Item_ref *) item)->ref_type() == Item_ref::REF)))); @@ -12244,6 +12245,48 @@ false; } +/** + Find the real table in prepared SELECT tree + + NOTE: all SELECT must be prepared (to have leaf table list). + + NOTE: it looks only for real tables (not view or derived) + + @param thd the current thread handle + @param db_name name of db of the table to look for + @param db_name name of db of the table to look for + + @return first found table, NULL or ERROR_TABLE +*/ + +TABLE_LIST *SELECT_LEX::find_table(THD *thd, + const LEX_CSTRING *db_name, + const LEX_CSTRING *table_name) +{ + uchar buff[STACK_BUFF_ALLOC]; // Max argument in function + if (check_stack_overrun(thd, STACK_MIN_SIZE, buff)) + return NULL; + + List_iterator_fast ti(leaf_tables); + TABLE_LIST *table; + while ((table= ti++)) + { + if (cmp(&table->db, db_name) == 0 && + cmp(&table->table_name, table_name) == 0) + return table; + } + + for (SELECT_LEX_UNIT *u= first_inner_unit(); u; u= u->next_unit()) + { + for (st_select_lex *sl= u->first_select(); sl; sl=sl->next_select()) + { + if ((table= sl->find_table(thd, db_name, table_name))) + return table; + } + } + return NULL; +} + bool st_select_lex::is_query_topmost(THD *thd) { diff -Nru mariadb-10.11.11/sql/sql_lex.h mariadb-10.11.13/sql/sql_lex.h --- mariadb-10.11.11/sql/sql_lex.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_lex.h 2025-05-19 16:14:25.000000000 +0000 @@ -1690,6 +1690,10 @@ void lex_start(LEX *plex); bool is_unit_nest() { return (nest_flags & UNIT_NEST_FL); } void mark_as_unit_nest() { nest_flags= UNIT_NEST_FL; } + + TABLE_LIST *find_table(THD *thd, + const LEX_CSTRING *db_name, + const LEX_CSTRING *table_name); }; typedef class st_select_lex SELECT_LEX; @@ -4681,7 +4685,7 @@ int add_period(Lex_ident name, Lex_ident_sys_st start, Lex_ident_sys_st end) { - if (check_period_name(name.str)) { + if 
(check_column_name(name)) { my_error(ER_WRONG_COLUMN_NAME, MYF(0), name.str); return 1; } diff -Nru mariadb-10.11.11/sql/sql_parse.cc mariadb-10.11.13/sql/sql_parse.cc --- mariadb-10.11.11/sql/sql_parse.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_parse.cc 2025-05-19 16:14:25.000000000 +0000 @@ -7296,18 +7296,9 @@ DBUG_PRINT("info", ("derived: %d view: %d", table_ref->derived != 0, table_ref->view != 0)); - if (table_ref->is_anonymous_derived_table()) + if (table_ref->is_anonymous_derived_table() || table_ref->sequence) continue; - if (table_ref->sequence) - { - /* We want to have either SELECT or INSERT rights to sequences depending - on how they are accessed - */ - want_access= ((table_ref->lock_type >= TL_FIRST_WRITE) ? - INSERT_ACL : SELECT_ACL); - } - if (check_access(thd, want_access, table_ref->get_db_name().str, &table_ref->grant.privilege, &table_ref->grant.m_internal, @@ -10389,7 +10380,13 @@ bool check_ident_length(const LEX_CSTRING *ident) { - if (check_string_char_length(ident, 0, NAME_CHAR_LEN, system_charset_info, 1)) + /* + string_char_length desite the names, goes into Well_formed_prefix_status + so this is more than just a length comparison. Things like a primary key + doesn't have a name, therefore no length. Also the ident grammar allows + empty backtick. Check quickly the length, and if 0, accept that. 
+ */ + if (ident->length && check_string_char_length(ident, 0, NAME_CHAR_LEN, system_charset_info, 1)) { my_error(ER_TOO_LONG_IDENT, MYF(0), ident->str); return 1; diff -Nru mariadb-10.11.11/sql/sql_prepare.cc mariadb-10.11.13/sql/sql_prepare.cc --- mariadb-10.11.11/sql/sql_prepare.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_prepare.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1304,6 +1304,7 @@ THD *thd= stmt->thd; List_iterator_fast its(values_list); List_item *values; + bool cache_results= FALSE; DBUG_ENTER("mysql_test_insert_common"); if (insert_precheck(thd, table_list)) @@ -1336,7 +1337,8 @@ if (mysql_prepare_insert(thd, table_list, fields, values, update_fields, update_values, duplic, ignore, - &unused_conds, FALSE)) + &unused_conds, FALSE, + &cache_results)) goto error; value_count= values->elements; diff -Nru mariadb-10.11.11/sql/sql_priv.h mariadb-10.11.13/sql/sql_priv.h --- mariadb-10.11.11/sql/sql_priv.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_priv.h 2025-05-19 16:14:25.000000000 +0000 @@ -281,6 +281,7 @@ #define OPTIMIZER_FIX_INNODB_CARDINALITY (8) #define OPTIMIZER_ADJ_FIX_REUSE_RANGE_FOR_REF (16) #define OPTIMIZER_ADJ_FIX_CARD_MULT (32) +#define OPTIMIZER_ADJ_FIX_DERIVED_TABLE_READ_COST (64) #define OPTIMIZER_ADJ_DEFAULT (OPTIMIZER_ADJ_FIX_REUSE_RANGE_FOR_REF | \ OPTIMIZER_ADJ_FIX_CARD_MULT) diff -Nru mariadb-10.11.11/sql/sql_reload.cc mariadb-10.11.13/sql/sql_reload.cc --- mariadb-10.11.11/sql/sql_reload.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_reload.cc 2025-05-19 16:14:25.000000000 +0000 @@ -618,7 +618,7 @@ if (table_list->belong_to_view && check_single_table_access(thd, PRIV_LOCK_TABLES, table_list, FALSE)) { - table_list->hide_view_error(thd); + table_list->replace_view_error_with_generic(thd); goto error_reset_bits; } if (table_list->is_view_or_derived()) diff -Nru mariadb-10.11.11/sql/sql_select.cc mariadb-10.11.13/sql/sql_select.cc --- mariadb-10.11.11/sql/sql_select.cc 
2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_select.cc 2025-05-19 16:14:25.000000000 +0000 @@ -3581,7 +3581,14 @@ continue; Item *item= new (thd->mem_root) Item_temptable_rowid(tab->table); item->fix_fields(thd, 0); - table_fields->push_back(item, thd->mem_root); + /* + table_fields points to JOIN::all_fields or JOIN::tmp_all_fields_*. + These lists start with "added" fields and then their suffix is shared + with JOIN::fields_list or JOIN::tmp_fields_list*. + Because of that, new elements can only be added to the front of the list, + not to the back. + */ + table_fields->push_front(item, thd->mem_root); cur->tmp_table_param->func_count++; } return 0; @@ -5994,7 +6001,10 @@ s->table->opt_range_condition_rows=s->records; } else + { + /* Update s->records and s->read_time */ s->scan_time(); + } if (s->table->is_splittable()) s->add_keyuses_for_splitting(); @@ -14049,6 +14059,36 @@ } +/* + Procedure of keys generation for result tables of materialized derived + tables/views. + + A key is generated for each equi-join pair {derived_table, some_other_table}. + Each generated key consists of fields of derived table used in equi-join. + Example: + + SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN + t1 ON tt.f1=t1.f3 and tt.f2.=t1.f4; + In this case for the derived table tt one key will be generated. It will + consist of two parts f1 and f2. + Example: + + SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN + t1 ON tt.f1=t1.f3 JOIN + t2 ON tt.f2=t2.f4; + In this case for the derived table tt two keys will be generated. + One key over f1 field, and another key over f2 field. + Currently optimizer may choose to use only one such key, thus the second + one will be dropped after range optimizer is finished. + See also JOIN::drop_unused_derived_keys function. + Example: + + SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN + t1 ON tt.f1=a_function(t1.f3); + In this case for the derived table tt one key will be generated. 
It will + consist of one field - f1. +*/ + static bool generate_derived_keys(DYNAMIC_ARRAY *keyuse_array) { @@ -14759,7 +14799,7 @@ } goto no_join_cache; } - if (cache_level > 4 && no_bka_cache) + if (cache_level < 5 || no_bka_cache) goto no_join_cache; if ((flags & HA_MRR_NO_ASSOCIATION) && @@ -15461,6 +15501,7 @@ double JOIN_TAB::scan_time() { double res; + THD *thd= join->thd; if (table->is_created()) { if (table->is_filled_at_execution()) @@ -15481,10 +15522,53 @@ } res= read_time; } - else + else if (!(thd->variables.optimizer_adjust_secondary_key_costs & + OPTIMIZER_ADJ_FIX_DERIVED_TABLE_READ_COST)) { + /* + Old code, do not merge into 11.0+: + */ found_records= records=table->stat_records(); - read_time= found_records ? (double)found_records: 10.0;// TODO:fix this stub + read_time= found_records ? (double)found_records: 10.0; + res= read_time; + } + else + { + bool using_heap= 0; + TABLE_SHARE *share= table->s; + found_records= records= table->stat_records(); + + if (share->db_type() == heap_hton) + { + /* Check that the rows will fit into the heap table */ + ha_rows max_rows; + max_rows= (ha_rows) ((MY_MIN(thd->variables.tmp_memory_table_size, + thd->variables.max_heap_table_size)) / + MY_ALIGN(share->reclength, sizeof(char*))); + if (records <= max_rows) + { + /* The rows will fit into the heap table */ + using_heap= 1; + } + } + + /* + Code for the following is taken from the heap and aria storage engine. 
+ In 11.# this is done without explict engine code + */ + if (using_heap) + read_time= (records / 20.0) + 1; + else + { + handler *file= table->file; + file->stats.data_file_length= share->reclength * records; + /* + Call the default scan_time() method as this is the cost for the + scan when heap is converted to Aria + */ + read_time= file->handler::scan_time(); + file->stats.data_file_length= 0; + } res= read_time; } return res; @@ -18544,6 +18628,8 @@ prev_table->dep_tables|= used_tables; if (prev_table->on_expr) { + /* If the ON expression is still there, it's an outer join */ + DBUG_ASSERT(prev_table->outer_join); prev_table->dep_tables|= table->on_expr_dep_tables; table_map prev_used_tables= prev_table->nested_join ? prev_table->nested_join->used_tables : @@ -18558,11 +18644,59 @@ prevents update of inner table dependences. For example it might happen if RAND() function is used in JOIN ON clause. - */ - if (!((prev_table->on_expr->used_tables() & - ~(OUTER_REF_TABLE_BIT | RAND_TABLE_BIT)) & - ~prev_used_tables)) + */ + table_map prev_on_expr_deps= prev_table->on_expr->used_tables() & + ~(OUTER_REF_TABLE_BIT | RAND_TABLE_BIT); + prev_on_expr_deps&= ~prev_used_tables; + + if (!prev_on_expr_deps) prev_table->dep_tables|= used_tables; + else + { + /* + Another possible case is when prev_on_expr_deps!=0 but it depends + on a table outside this join nest. SQL name resolution don't allow + this but it is possible when LEFT JOIN is inside a subquery which + is converted into a semi-join nest, Example: + + t1 SEMI JOIN ( + t2 + LEFT JOIN (t3 LEFT JOIN t4 ON t4.col=t1.col) ON expr + ) ON ... + + here, we would have prev_table=t4, table=t3. The condition + "ON t4.col=t1.col" depends on tables {t1, t4}. To make sure the + optimizer puts t3 before t4 we need to make sure t4.dep_tables + includes t3. + */ + + DBUG_ASSERT(table->embedding == prev_table->embedding); + if (table->embedding) + { + /* + Find what are the "peers" of "table" in the join nest. 
Normally, + it is table->embedding->nested_join->used_tables, but here we are + in the process of recomputing that value. + So, we walk the join list and collect the bitmap of peers: + */ + table_map peers= 0; + List_iterator_fast li(*join_list); + TABLE_LIST *peer; + while ((peer= li++)) + { + table_map curmap= peer->nested_join + ? peer->nested_join->used_tables + : peer->get_map(); + peers|= curmap; + } + /* + If prev_table doesn't depend on any of its peers, add a + dependency on nearest peer, that is, on 'table'. + */ + if (!(prev_on_expr_deps & peers)) + prev_table->dep_tables|= used_tables; + } + } } } prev_table= table; @@ -22354,6 +22488,8 @@ */ clear_tables(join, &cleared_tables); } + if (join->tmp_table_param.copy_funcs.elements) + copy_fields(&join->tmp_table_param); if (!join->having || join->having->val_bool()) { List *columns_list= (procedure ? &join->procedure_fields_list : @@ -27021,9 +27157,13 @@ original field name, we should additionally check if we have conflict for this name (in case if we would perform lookup in all tables). */ - if (resolution == RESOLVED_BEHIND_ALIAS && - order_item->fix_fields_if_needed_for_order_by(thd, order->item)) - return TRUE; + if (resolution == RESOLVED_BEHIND_ALIAS) + { + if (order_item->fix_fields_if_needed_for_order_by(thd, order->item)) + return TRUE; + // fix_fields may have replaced order->item, reset local variable. + order_item= *order->item; + } /* Lookup the current GROUP field in the FROM clause. 
*/ order_item_type= order_item->type(); @@ -30489,7 +30629,7 @@ */ if (top_level || item->is_explicit_name() || - !check_column_name(item->name.str)) + !check_column_name(item->name)) item->print_item_w_name(str, query_type); else item->print(str, query_type); diff -Nru mariadb-10.11.11/sql/sql_show.cc mariadb-10.11.13/sql/sql_show.cc --- mariadb-10.11.11/sql/sql_show.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_show.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1435,7 +1435,14 @@ DBUG_RETURN(TRUE); } - load_db_opt_by_name(thd, dbname->str, &create); + if (load_db_opt_by_name(thd, dbname->str, &create) < 0) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_UNKNOWN_ERROR, + "Database '%.192s' does not have a db.opt file. " + "You can create one with ALTER DATABASE if needed", + dbname->str); + } } mysqld_show_create_db_get_fields(thd, &field_list); @@ -2943,25 +2950,27 @@ while (thread_info *thd_info= arg.thread_infos.get()) { + const char *str; + ulonglong start_time; + CSET_STRING query; + protocol->prepare_for_resend(); protocol->store(thd_info->thread_id); protocol->store(thd_info->user, strlen(thd_info->user), system_charset_info); protocol->store(thd_info->host, strlen(thd_info->host), system_charset_info); protocol->store_string_or_null(thd_info->db, system_charset_info); - if (thd_info->proc_info) - protocol->store(thd_info->proc_info, strlen(thd_info->proc_info), - system_charset_info); + if ((str= thd_info->proc_info)) + protocol->store(str, strlen(str), system_charset_info); else protocol->store(&command_name[thd_info->command], system_charset_info); - if (thd_info->start_time && now > thd_info->start_time) - protocol->store_long((now - thd_info->start_time) / HRTIME_RESOLUTION); + if ((start_time= thd_info->start_time) && now > start_time) + protocol->store_long((now - start_time) / HRTIME_RESOLUTION); else protocol->store_null(); protocol->store_string_or_null(thd_info->state_info, system_charset_info); - if 
(thd_info->query_string.length()) - protocol->store(thd_info->query_string.str(), - thd_info->query_string.length(), - thd_info->query_string.charset()); + query= thd_info->query_string; + if (query.length() && query.str()) + protocol->store(query.str(), query.length(), query.charset()); else protocol->store_null(); if (!(thd->variables.old_behavior & OLD_MODE_NO_PROGRESS_INFO)) @@ -4339,7 +4348,7 @@ break; } - if (lower_case_table_names && !rc) + if (lower_case_table_names == 1 && !rc) { /* We can safely do in-place upgrades here since all of the above cases diff -Nru mariadb-10.11.11/sql/sql_statistics.cc mariadb-10.11.13/sql/sql_statistics.cc --- mariadb-10.11.11/sql/sql_statistics.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_statistics.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2077,12 +2077,9 @@ for (i= 0, state= calc_state; i < prefixes; i++, state++) { - if (i < prefixes) - { - double val= state->prefix_count == 0 ? - 0 : (double) state->entry_count / state->prefix_count; - index_info->collected_stats->set_avg_frequency(i, val); - } + double val= state->prefix_count == 0 ? + 0 : (double) state->entry_count / state->prefix_count; + index_info->collected_stats->set_avg_frequency(i, val); } } }; @@ -3142,7 +3139,7 @@ double avg_frequency= pk_read_stats->get_avg_frequency(j-1); set_if_smaller(avg_frequency, 1); double val= (pk_read_stats->get_avg_frequency(j) / - avg_frequency); + avg_frequency > 0 ? 
avg_frequency : 1); index_statistics->set_avg_frequency (l, val); } } diff -Nru mariadb-10.11.11/sql/sql_string.h mariadb-10.11.13/sql/sql_string.h --- mariadb-10.11.11/sql/sql_string.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_string.h 2025-05-19 16:14:25.000000000 +0000 @@ -909,6 +909,8 @@ :Charset(cs), Binary_string(str, len) { } String(const String &str) = default; + String(String &&str) noexcept + :Charset(std::move(str)), Binary_string(std::move(str)){} void set(String &str,size_t offset,size_t arg_length) { diff -Nru mariadb-10.11.11/sql/sql_table.cc mariadb-10.11.13/sql/sql_table.cc --- mariadb-10.11.11/sql/sql_table.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_table.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1587,12 +1587,19 @@ else { #ifdef WITH_WSREP - if (WSREP(thd) && hton && !wsrep_should_replicate_ddl(thd, hton)) + if (WSREP(thd) && hton) { - error= 1; - goto err; + handlerton *ht= hton; + // For partitioned tables resolve underlying handlerton + if (table->table && table->table->file->partition_ht()) + ht= table->table->file->partition_ht(); + if (!wsrep_should_replicate_ddl(thd, ht)) + { + error= 1; + goto err; + } } -#endif +#endif /* WITH_WSREP */ if (thd->locked_tables_mode == LTM_LOCK_TABLES || thd->locked_tables_mode == LTM_PRELOCKED_UNDER_LOCK_TABLES) @@ -1863,18 +1870,6 @@ if (non_temp_tables_count) query_cache_invalidate3(thd, tables, 0); - /* - We are always logging drop of temporary tables. - The reason is to handle the following case: - - Use statement based replication - - CREATE TEMPORARY TABLE foo (logged) - - set row based replication - - DROP TEMPORARY TABLE foo (needs to be logged) - This should be fixed so that we remember if creation of the - temporary table was logged and only log it if the creation was - logged. 
- */ - if (non_trans_tmp_table_deleted || trans_tmp_table_deleted || non_tmp_table_deleted) { @@ -3112,7 +3107,7 @@ DBUG_ASSERT(sql_field->charset); - if (check_column_name(sql_field->field_name.str)) + if (check_column_name(sql_field->field_name)) { my_error(ER_WRONG_COLUMN_NAME, MYF(0), sql_field->field_name.str); DBUG_RETURN(TRUE); @@ -3750,7 +3745,7 @@ key_part_info++; } - if (!key_info->name.str || check_column_name(key_info->name.str)) + if (!key_info->name.str || check_column_name(key_info->name)) { my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key_info->name.str); DBUG_RETURN(TRUE); @@ -4989,9 +4984,26 @@ // In Galera cluster we support only InnoDB sequences if (db_type != DB_TYPE_INNODB) { - my_error(ER_NOT_SUPPORTED_YET, MYF(0), - "non-InnoDB sequences in Galera cluster"); - return(true); + // Currently any dynamic storage engine is not possible to identify + // using DB_TYPE_XXXX and ENGINE=SEQUENCE is one of them. + // Therefore, we get storage engine name from lex. + const LEX_CSTRING *tb_name= thd->lex->m_sql_cmd->option_storage_engine_name()->name(); + // (1) CREATE TABLE ... ENGINE=SEQUENCE OR + // (2) ALTER TABLE ... ENGINE= OR + // Note in ALTER TABLE table->s->sequence != nullptr + // (3) CREATE SEQUENCE ... 
ENGINE= + if ((thd->lex->sql_command == SQLCOM_CREATE_TABLE && + lex_string_eq(tb_name, STRING_WITH_LEN("SEQUENCE"))) || + (thd->lex->sql_command == SQLCOM_ALTER_TABLE) || + (thd->lex->sql_command == SQLCOM_CREATE_SEQUENCE)) + { + my_error(ER_NOT_SUPPORTED_YET, MYF(0), + "non-InnoDB sequences in Galera cluster"); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_NOT_SUPPORTED_YET, + "ENGINE=%s not supported by Galera", tb_name->str); + return(true); + } } // In Galera cluster it is best to use INCREMENT BY 0 with CACHE @@ -6223,7 +6235,7 @@ } else if (drop->type == Alter_drop::PERIOD) { - if (table->s->period.name.streq(drop->name)) + if (table->s->period.name.streq(Lex_ident(drop->name))) remove_drop= FALSE; } else /* Alter_drop::KEY and Alter_drop::FOREIGN_KEY */ @@ -9215,7 +9227,7 @@ for (bool found= false; !found && (drop= drop_it++); ) { found= drop->type == Alter_drop::PERIOD && - table->s->period.name.streq(drop->name); + table->s->period.name.streq(Lex_ident(drop->name)); } if (drop) @@ -9258,7 +9270,7 @@ } } - if (share->period.constr_name.streq(check->name.str)) + if (share->period.constr_name.streq(check->name)) { if (!drop_period && !keep) { @@ -10514,10 +10526,21 @@ if (WSREP(thd) && table && (thd->lex->sql_command == SQLCOM_ALTER_TABLE || thd->lex->sql_command == SQLCOM_CREATE_INDEX || - thd->lex->sql_command == SQLCOM_DROP_INDEX) && - !wsrep_should_replicate_ddl(thd, table->s->db_type())) - DBUG_RETURN(true); -#endif /* WITH_WSREP */ + thd->lex->sql_command == SQLCOM_DROP_INDEX)) + { + handlerton *ht= table->s->db_type(); + + // If alter used ENGINE= we use that + if (create_info->used_fields & HA_CREATE_USED_ENGINE) + ht= create_info->db_type; + // For partitioned tables resolve underlying handlerton + else if (table->file->partition_ht()) + ht= table->file->partition_ht(); + + if (!wsrep_should_replicate_ddl(thd, ht)) + DBUG_RETURN(true); + } +#endif DEBUG_SYNC(thd, "alter_table_after_open_tables"); @@ -11609,7 +11632,8 @@ - Neither 
old or new engine uses files from another engine The above is mainly true for the sequence and the partition engine. */ - engine_changed= ((new_table->file->ht != table->file->ht) && + engine_changed= ((new_table->file->storage_ht() != + table->file->storage_ht()) && ((!(new_table->file->ha_table_flags() & HA_FILE_BASED) || !(table->file->ha_table_flags() & HA_FILE_BASED))) && !(table->file->ha_table_flags() & HA_REUSES_FILE_NAMES) && @@ -11644,7 +11668,7 @@ debug_crash_here("ddl_log_alter_after_copy"); // Use old table /* - We are new ready to use the new table. Update the state in the + We are now ready to use the new table. Update the state in the ddl log so that we recovery know that the new table is ready and in case of crash it should use the new one and log the query to the binary log. @@ -12354,6 +12378,7 @@ if (alt_error > 0) { error= alt_error; + to->file->extra(HA_EXTRA_ABORT_ALTER_COPY); copy_data_error_ignore(error, false, to, thd, alter_ctx); } } diff -Nru mariadb-10.11.11/sql/sql_trigger.cc mariadb-10.11.13/sql/sql_trigger.cc --- mariadb-10.11.11/sql/sql_trigger.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_trigger.cc 2025-05-19 16:14:25.000000000 +0000 @@ -622,7 +622,12 @@ table= tables->table; #ifdef WITH_WSREP - if (WSREP(thd) && !wsrep_should_replicate_ddl(thd, table->s->db_type())) + /* Resolve should we replicate creation of the trigger. + It should be replicated if storage engine(s) associated + to trigger are replicated by Galera. 
+ */ + if (WSREP(thd) && + !wsrep_should_replicate_ddl_iterate(thd, tables)) goto end; #endif diff -Nru mariadb-10.11.11/sql/sql_truncate.cc mariadb-10.11.13/sql/sql_truncate.cc --- mariadb-10.11.11/sql/sql_truncate.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_truncate.cc 2025-05-19 16:14:25.000000000 +0000 @@ -303,7 +303,7 @@ bool Sql_cmd_truncate_table::lock_table(THD *thd, TABLE_LIST *table_ref, bool *hton_can_recreate) { - handlerton *hton; + const handlerton *hton; bool versioned; bool sequence= false; TABLE *table= NULL; @@ -336,8 +336,15 @@ versioned= table->versioned(); hton= table->file->ht; #ifdef WITH_WSREP + /* Resolve should we replicate truncate. It should + be replicated if storage engine(s) associated + are replicated by Galera. If this is partitioned + table we need to find out default partition + handlerton. + */ if (WSREP(thd) && - !wsrep_should_replicate_ddl(thd, hton)) + !wsrep_should_replicate_ddl(thd, table->file->partition_ht() ? + table->file->partition_ht() : hton)) DBUG_RETURN(TRUE); #endif @@ -359,12 +366,26 @@ sequence= share->table_type == TABLE_TYPE_SEQUENCE; hton= share->db_type(); #ifdef WITH_WSREP - if (WSREP(thd) && - hton != view_pseudo_hton && - !wsrep_should_replicate_ddl(thd, hton)) + if (WSREP(thd) && hton != view_pseudo_hton) { - tdc_release_share(share); - DBUG_RETURN(TRUE); + /* Resolve should we replicate truncate. It should + be replicated if storage engine(s) associated + are replicated by Galera. If this is partitioned + table we need to find out default partition + handlerton. + */ + const handlerton* const ht= +#ifdef WITH_PARTITION_STORAGE_ENGINE + share->default_part_plugin ? 
+ plugin_hton(share->default_part_plugin) : +#endif + hton; + + if (ht && !wsrep_should_replicate_ddl(thd, ht)) + { + tdc_release_share(share); + DBUG_RETURN(TRUE); + } } #endif diff -Nru mariadb-10.11.11/sql/sql_update.cc mariadb-10.11.13/sql/sql_update.cc --- mariadb-10.11.11/sql/sql_update.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_update.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1704,7 +1704,7 @@ if (multi_update_check_table_access(thd, tbl, tables_for_update, &updated)) { - tbl->hide_view_error(thd); + tbl->replace_view_error_with_generic(thd); return true; } } @@ -2356,7 +2356,8 @@ if (unlikely((thd->variables.option_bits & OPTION_SAFE_UPDATES) && error_if_full_join(join))) DBUG_RETURN(1); - if (join->implicit_grouping) + if (join->implicit_grouping || + join->select_lex->have_window_funcs()) { my_error(ER_INVALID_GROUP_FUNC_USE, MYF(0)); DBUG_RETURN(1); diff -Nru mariadb-10.11.11/sql/sql_view.cc mariadb-10.11.13/sql/sql_view.cc --- mariadb-10.11.11/sql/sql_view.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_view.cc 2025-05-19 16:14:25.000000000 +0000 @@ -183,7 +183,7 @@ for (uint column_no= 1; (item= it++); column_no++) { - if (item->is_explicit_name() || !check_column_name(item->name.str)) + if (item->is_explicit_name() || !check_column_name(item->name)) continue; name_len= my_snprintf(buff, NAME_LEN, "Name_exp_%u", column_no); item->orig_name= item->name.str; @@ -341,7 +341,7 @@ { if (check_single_table_access(thd, SELECT_ACL, tbl, FALSE)) { - tbl->hide_view_error(thd); + tbl->replace_view_error_with_generic(thd); goto err; } } @@ -452,8 +452,6 @@ lex->link_first_table_back(view, link_to_local); view->open_type= OT_BASE_ONLY; - WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); - /* ignore lock specs for CREATE statement */ @@ -471,13 +469,20 @@ } #ifdef WITH_WSREP - if(!wsrep_should_replicate_ddl_iterate(thd, static_cast(tables))) + /* Resolve should we replicate creation of the view. 
+ It should be replicated if storage engine(s) associated + to view are replicated by Galera. + */ + if (WSREP(thd) && + !wsrep_should_replicate_ddl_iterate(thd, tables)) { res= TRUE; goto err_no_relink; } #endif + WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); + view= lex->unlink_first_table(&link_to_local); if (check_db_dir_existence(view->db.str)) diff -Nru mariadb-10.11.11/sql/sql_yacc.yy mariadb-10.11.13/sql/sql_yacc.yy --- mariadb-10.11.11/sql/sql_yacc.yy 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_yacc.yy 2025-05-19 16:14:25.000000000 +0000 @@ -9107,7 +9107,7 @@ if ($4.str) { if (unlikely(Lex->sql_command == SQLCOM_CREATE_VIEW && - check_column_name($4.str))) + check_column_name($4))) my_yyabort_error((ER_WRONG_COLUMN_NAME, MYF(0), $4.str)); $2->base_flags|= item_base_t::IS_EXPLICIT_NAME; $2->set_name(thd, $4); diff -Nru mariadb-10.11.11/sql/structs.h mariadb-10.11.13/sql/structs.h --- mariadb-10.11.11/sql/structs.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/structs.h 2025-05-19 16:14:25.000000000 +0000 @@ -236,7 +236,7 @@ LEX_CSTRING user, host; void init() { memset(this, 0, sizeof(*this)); } void copy(MEM_ROOT *root, const LEX_CSTRING *usr, const LEX_CSTRING *host); - bool is_role() const { return user.str[0] && !host.str[0]; } + bool is_role() const { return user.str[0] && (!host.str || !host.str[0]); } void set_lex_string(LEX_CSTRING *l, char *buf) { if (is_role()) diff -Nru mariadb-10.11.11/sql/sys_vars.cc mariadb-10.11.13/sql/sys_vars.cc --- mariadb-10.11.11/sql/sys_vars.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sys_vars.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2982,7 +2982,7 @@ { "adjust_secondary_key_cost", "disable_max_seek", "disable_forced_index_in_group_by", "fix_innodb_cardinality", "fix_reuse_range_for_ref", - "fix_card_multiplier", 0 + "fix_card_multiplier", "fix_derived_table_read_cost", 0 }; @@ -2999,8 +2999,9 @@ "secondary keys. 
" "fix_reuse_range_for_ref = Do a better job at reusing range access estimates " "when estimating ref access. " - "fix_card_multiplier = Fix the computation in selectivity_for_indexes." - " selectivity_multiplier. " + "fix_card_multiplier = Fix the computation in selectivity_for_indexes. " + "fix_derived_table_read_cost = Fix the cost of reading materialized " + "derived table. " "This variable will be deleted in MariaDB 11.0 as it is not needed with the " "new 11.0 optimizer.", @@ -6309,7 +6310,9 @@ static Sys_var_enum Sys_wsrep_forced_binlog_format( "wsrep_forced_binlog_format", "binlog format to take effect over user's choice", GLOBAL_VAR(wsrep_forced_binlog_format), CMD_LINE(REQUIRED_ARG), - wsrep_binlog_format_names, DEFAULT(BINLOG_FORMAT_UNSPEC)); + wsrep_binlog_format_names, DEFAULT(BINLOG_FORMAT_UNSPEC), + NO_MUTEX_GUARD, NOT_IN_BINLOG, + ON_CHECK(wsrep_forced_binlog_format_check)); static Sys_var_mybool Sys_wsrep_recover_datadir( "wsrep_recover", "Recover database state after crash and exit", diff -Nru mariadb-10.11.11/sql/table.cc mariadb-10.11.13/sql/table.cc --- mariadb-10.11.11/sql/table.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/table.cc 2025-05-19 16:14:25.000000000 +0000 @@ -5320,9 +5320,10 @@ } -bool check_column_name(const char *name) +bool check_column_name(const Lex_ident &ident) { // name length in symbols + const char *name= ident.str, *end= ident.str + ident.length; size_t name_length= 0; bool last_char_is_space= TRUE; @@ -5332,9 +5333,7 @@ last_char_is_space= my_isspace(system_charset_info, *name); if (system_charset_info->use_mb()) { - int len=my_ismbchar(system_charset_info, name, - name+system_charset_info->mbmaxlen); - if (len) + if (int len= my_ismbchar(system_charset_info, name, end)) { name += len; name_length++; @@ -5354,12 +5353,6 @@ } -bool check_period_name(const char *name) -{ - return check_column_name(name); -} - - /** Checks whether a table is intact. Should be done *just* after the table has been opened. 
@@ -6360,9 +6353,9 @@ @pre This method can be called only if there is an error. */ -void TABLE_LIST::hide_view_error(THD *thd) +void TABLE_LIST::replace_view_error_with_generic(THD *thd) { - if ((thd->killed && !thd->is_error())|| thd->get_internal_handler()) + if ((thd->killed && !thd->is_error()) || thd->get_internal_handler()) return; /* Hide "Unknown column" or "Unknown function" error */ DBUG_ASSERT(thd->is_error()); @@ -9956,37 +9949,6 @@ return error; } -/* - Procedure of keys generation for result tables of materialized derived - tables/views. - - A key is generated for each equi-join pair derived table-another table. - Each generated key consists of fields of derived table used in equi-join. - Example: - - SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN - t1 ON tt.f1=t1.f3 and tt.f2.=t1.f4; - In this case for the derived table tt one key will be generated. It will - consist of two parts f1 and f2. - Example: - - SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN - t1 ON tt.f1=t1.f3 JOIN - t2 ON tt.f2=t2.f4; - In this case for the derived table tt two keys will be generated. - One key over f1 field, and another key over f2 field. - Currently optimizer may choose to use only one such key, thus the second - one will be dropped after range optimizer is finished. - See also JOIN::drop_unused_derived_keys function. - Example: - - SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN - t1 ON tt.f1=a_function(t1.f3); - In this case for the derived table tt one key will be generated. It will - consist of one field - f1. 
-*/ - - /* @brief diff -Nru mariadb-10.11.11/sql/table.h mariadb-10.11.13/sql/table.h --- mariadb-10.11.11/sql/table.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/table.h 2025-05-19 16:14:25.000000000 +0000 @@ -2192,7 +2192,7 @@ void init(vers_system_time_t _type, Vers_history_point _start= Vers_history_point(), Vers_history_point _end= Vers_history_point(), - Lex_ident _name= "SYSTEM_TIME") + Lex_ident _name= { STRING_WITH_LEN("SYSTEM_TIME") }) { type= _type; orig_type= _type; @@ -2207,7 +2207,7 @@ void set_all() { type= SYSTEM_TIME_ALL; - name= "SYSTEM_TIME"; + name= { STRING_WITH_LEN("SYSTEM_TIME") }; } void print(String *str, enum_query_type query_type) const; @@ -2572,7 +2572,7 @@ List *view_tables; /* most upper view this table belongs to */ TABLE_LIST *belong_to_view; - /* A derived table this table belongs to */ + /* A merged derived table this table belongs to */ TABLE_LIST *belong_to_derived; /* The view directly referencing this table @@ -2830,7 +2830,7 @@ bool check_single_table(TABLE_LIST **table, table_map map, TABLE_LIST *view); bool set_insert_values(MEM_ROOT *mem_root); - void hide_view_error(THD *thd); + void replace_view_error_with_generic(THD *thd); TABLE_LIST *find_underlying_table(TABLE *table); TABLE_LIST *first_leaf_for_name_resolution(); TABLE_LIST *last_leaf_for_name_resolution(); @@ -3078,6 +3078,8 @@ ulonglong m_table_ref_version; }; +#define ERROR_TABLE ((TABLE_LIST*) 0x1) + class Item; /* @@ -3388,8 +3390,7 @@ int db_errno); void update_create_info_from_table(HA_CREATE_INFO *info, TABLE *form); bool check_db_name(LEX_STRING *db); -bool check_column_name(const char *name); -bool check_period_name(const char *name); +bool check_column_name(const Lex_ident &name); bool check_table_name(const char *name, size_t length, bool check_for_path_chars); int rename_file_ext(const char * from,const char * to,const char * ext); char *get_field(MEM_ROOT *mem, Field *field); diff -Nru mariadb-10.11.11/sql/vers_string.h 
mariadb-10.11.13/sql/vers_string.h --- mariadb-10.11.11/sql/vers_string.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/vers_string.h 2025-05-19 16:14:25.000000000 +0000 @@ -62,7 +62,7 @@ { } Lex_cstring_with_compare(const LEX_CSTRING src) : Lex_cstring(src.str, src.length) { } - Lex_cstring_with_compare(const char *_str) : Lex_cstring(_str, strlen(_str)) + explicit Lex_cstring_with_compare(const char *_str) : Lex_cstring(_str, strlen(_str)) { } bool streq(const Lex_cstring_with_compare& b) const { diff -Nru mariadb-10.11.11/sql/wsrep_applier.cc mariadb-10.11.13/sql/wsrep_applier.cc --- mariadb-10.11.11/sql/wsrep_applier.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_applier.cc 2025-05-19 16:14:25.000000000 +0000 @@ -203,6 +203,21 @@ } } + if (LOG_EVENT_IS_WRITE_ROW(typ) || + LOG_EVENT_IS_UPDATE_ROW(typ) || + LOG_EVENT_IS_DELETE_ROW(typ)) + { + Rows_log_event* rle = static_cast(ev); + if (thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS)) + { + rle->set_flags(Rows_log_event::RELAXED_UNIQUE_CHECKS_F); + } + if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) + { + rle->set_flags(Rows_log_event::NO_FOREIGN_KEY_CHECKS_F); + } + } + /* Use the original server id for logging. */ thd->set_server_id(ev->server_id); thd->lex->current_select= 0; diff -Nru mariadb-10.11.11/sql/wsrep_client_service.cc mariadb-10.11.13/sql/wsrep_client_service.cc --- mariadb-10.11.11/sql/wsrep_client_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_client_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -304,6 +304,12 @@ replayer_service.replay_status(ret); } + // In Galera we allow only InnoDB sequences, thus + // sequence table updates are in writeset. + // Binlog cache needs reset so that binlog_close + // does not write cache to binlog file yet. 
+ binlog_reset_cache(m_thd); + replayer_thd->main_security_ctx = old_ctx; delete replayer_thd; DBUG_RETURN(ret); diff -Nru mariadb-10.11.11/sql/wsrep_high_priority_service.cc mariadb-10.11.13/sql/wsrep_high_priority_service.cc --- mariadb-10.11.11/sql/wsrep_high_priority_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_high_priority_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -610,7 +610,7 @@ int ret= apply_events(thd, m_rli, data, err, true); thd->close_temporary_tables(); - if (!ret && !(ws_meta.flags() & wsrep::provider::flag::commit)) + if (!ret && !wsrep::commits_transaction(ws_meta.flags())) { thd->wsrep_cs().fragment_applied(ws_meta.seqno()); } @@ -778,7 +778,7 @@ } ret= ret || apply_events(thd, m_rli, data, err, true); thd->close_temporary_tables(); - if (!ret && !(ws_meta.flags() & wsrep::provider::flag::commit)) + if (!ret && !wsrep::commits_transaction(ws_meta.flags())) { thd->wsrep_cs().fragment_applied(ws_meta.seqno()); } diff -Nru mariadb-10.11.11/sql/wsrep_mysqld.cc mariadb-10.11.13/sql/wsrep_mysqld.cc --- mariadb-10.11.11/sql/wsrep_mysqld.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_mysqld.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,5 @@ -/* Copyright (c) 2008, 2023 Codership Oy - Copyright (c) 2020, 2022, MariaDB +/* Copyright (c) 2008, 2025, Codership Oy + Copyright (c) 2020, 2025, MariaDB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -834,7 +834,8 @@ wsrep_server_gtid_t new_gtid; new_gtid.domain_id= wsrep_gtid_domain_id; new_gtid.server_id= global_system_variables.server_id; - new_gtid.seqno= 0; + /* Use seqno which was recovered in wsrep_init_gtid() */ + new_gtid.seqno= wsrep_gtid_server.seqno(); /* Try to search for domain_id and server_id combination in binlog if found continue from last seqno */ wsrep_get_binlog_gtid_seqno(new_gtid); wsrep_gtid_server.gtid(new_gtid); @@ -867,12 
+868,13 @@ wsrep_init_position(); wsrep_sst_auth_init(); - if (strlen(wsrep_provider)== 0 || - !strcmp(wsrep_provider, WSREP_NONE)) + if (!*wsrep_provider || + !strcasecmp(wsrep_provider, WSREP_NONE)) { // enable normal operation in case no provider is specified global_system_variables.wsrep_on= 0; - int err= Wsrep_server_state::instance().load_provider(wsrep_provider, wsrep_provider_options ? wsrep_provider_options : ""); + int err= Wsrep_server_state::instance().load_provider( + wsrep_provider, wsrep_provider_options ? wsrep_provider_options : ""); if (err) { DBUG_PRINT("wsrep",("wsrep::init() failed: %d", err)); @@ -1603,7 +1605,12 @@ This allows autocommit SELECTs and a first SELECT after SET AUTOCOMMIT=0 TODO: modify to check if thd has locked any rows. */ - return thd->wsrep_cs().sync_wait(-1); + if (thd->wsrep_cs().sync_wait(-1)) + { + wsrep_override_error(thd, thd->wsrep_cs().current_error(), + thd->wsrep_cs().current_error_status()); + return true; + } } return false; @@ -2489,50 +2496,48 @@ /* Forward declarations. */ int wsrep_create_trigger_query(THD *thd, uchar** buf, size_t* buf_len); -bool wsrep_should_replicate_ddl_iterate(THD* thd, const TABLE_LIST* table_list) -{ - if (WSREP(thd)) - { - for (const TABLE_LIST* it= table_list; it; it= it->next_global) - { - if (it->table && - !wsrep_should_replicate_ddl(thd, it->table->s->db_type())) - return false; - } - } - return true; -} +/*! Should DDL be replicated by Galera + * + * @param thd thread handle + * @param hton real storage engine handlerton + * + * @retval true if we should replicate DDL, false if not */ bool wsrep_should_replicate_ddl(THD* thd, const handlerton *hton) { if (!wsrep_check_mode(WSREP_MODE_STRICT_REPLICATION)) return true; - if (!hton) - return true; + DBUG_ASSERT(hton != nullptr); switch (hton->db_type) { + case DB_TYPE_UNKNOWN: + /* Special pseudo-handlertons (such as 10.6+ JSON tables). 
*/ + return true; + break; case DB_TYPE_INNODB: return true; break; case DB_TYPE_MYISAM: if (wsrep_check_mode(WSREP_MODE_REPLICATE_MYISAM)) return true; - else - WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); break; case DB_TYPE_ARIA: if (wsrep_check_mode(WSREP_MODE_REPLICATE_ARIA)) - return true; - else - WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); + return true; + break; + case DB_TYPE_PARTITION_DB: + /* In most cases this means we could not find out + table->file->partition_ht() */ + return true; break; default: - WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); break; } + WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); + /* wsrep_mode = STRICT_REPLICATION, treat as error */ my_error(ER_GALERA_REPLICATION_NOT_SUPPORTED, MYF(0)); push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, @@ -2542,6 +2547,26 @@ ha_resolve_storage_engine_name(hton)); return false; } + +bool wsrep_should_replicate_ddl_iterate(THD* thd, const TABLE_LIST* table_list) +{ + for (const TABLE_LIST* it= table_list; it; it= it->next_global) + { + const TABLE* table= it->table; + if (table && !it->table_function) + { + /* If this is partitioned table we need to find out + implementing storage engine handlerton. + */ + const handlerton *ht= table->file->partition_ht(); + if (!ht) ht= table->s->db_type(); + if (!wsrep_should_replicate_ddl(thd, ht)) + return false; + } + } + return true; +} + /* Decide if statement should run in TOI. 
@@ -2650,9 +2675,8 @@ if (create_info) { const handlerton *hton= create_info->db_type; - if (!hton) - hton= ha_default_handlerton(thd); + hton= ha_default_handlerton(thd); if (!wsrep_should_replicate_ddl(thd, hton)) return false; } @@ -2787,7 +2811,6 @@ unireg_abort(1); } - /* returns: 0: statement was replicated as TOI @@ -2803,6 +2826,7 @@ DBUG_ASSERT(wsrep_OSU_method_get(thd) == WSREP_OSU_TOI); WSREP_DEBUG("TOI Begin: %s", wsrep_thd_query(thd)); + DEBUG_SYNC(thd, "wsrep_before_toi_begin"); if (wsrep_can_run_in_toi(thd, db, table, table_list, create_info) == false) { @@ -3043,12 +3067,13 @@ const wsrep::key_array *fk_tables, const HA_CREATE_INFO *create_info) { + DEBUG_SYNC(thd, "wsrep_kill_thd_before_enter_toi"); mysql_mutex_lock(&thd->LOCK_thd_kill); const killed_state killed = thd->killed; mysql_mutex_unlock(&thd->LOCK_thd_kill); if (killed) { - DBUG_ASSERT(FALSE); + /* The thread may have been killed as a result of memory pressure. */ return -1; } @@ -3217,29 +3242,28 @@ @param requestor_ctx The MDL context of the requestor @param ticket MDL ticket for the requested lock + @param key The key of the object (data) being protected - @retval TRUE Lock request can be granted - @retval FALSE Lock request cannot be granted */ - void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, const MDL_ticket *ticket, const MDL_key *key) { THD *request_thd= requestor_ctx->get_thd(); - THD *granted_thd= ticket->get_ctx()->get_thd(); /* Fallback to the non-wsrep behaviour */ if (!WSREP(request_thd)) return; - const char* schema= key->db_name(); - int schema_len= key->db_name_length(); - mysql_mutex_lock(&request_thd->LOCK_thd_data); if (wsrep_thd_is_toi(request_thd) || wsrep_thd_is_applying(request_thd)) { + THD *granted_thd= ticket->get_ctx()->get_thd(); + + const char* schema= key->db_name(); + int schema_len= key->db_name_length(); + WSREP_DEBUG("wsrep_handle_mdl_conflict request TOI/APPLY for %s", wsrep_thd_query(request_thd)); THD_STAGE_INFO(request_thd, 
stage_waiting_isolation); @@ -3259,7 +3283,6 @@ /* Here we will call wsrep_abort_transaction so we should hold THD::LOCK_thd_data to protect victim from concurrent usage and THD::LOCK_thd_kill to protect from disconnect or delete. - */ mysql_mutex_lock(&granted_thd->LOCK_thd_kill); mysql_mutex_lock(&granted_thd->LOCK_thd_data); @@ -3303,16 +3326,21 @@ (granted_thd->system_thread != NON_SYSTEM_THREAD && granted_thd->mdl_context.has_explicit_locks())) { - WSREP_DEBUG("BF thread waiting for FLUSH for %s", - wsrep_thd_query(request_thd)); - THD_STAGE_INFO(request_thd, stage_waiting_ddl); + WSREP_DEBUG("BF thread waiting for %s", + granted_thd->lex->sql_command == SQLCOM_FLUSH ? "FLUSH" : "BACKUP"); ticket->wsrep_report(wsrep_debug); + if (granted_thd->current_backup_stage != BACKUP_FINISHED && wsrep_check_mode(WSREP_MODE_BF_MARIABACKUP)) { wsrep_abort_thd(request_thd, granted_thd, 1); } } + else if (granted_thd->lex->sql_command == SQLCOM_LOCK_TABLES) + { + WSREP_DEBUG("BF thread waiting for LOCK TABLES"); + ticket->wsrep_report(wsrep_debug); + } else if (request_thd->lex->sql_command == SQLCOM_DROP_TABLE) { WSREP_DEBUG("DROP caused BF abort, conf %s for %s", diff -Nru mariadb-10.11.11/sql/wsrep_mysqld.h mariadb-10.11.13/sql/wsrep_mysqld.h --- mariadb-10.11.11/sql/wsrep_mysqld.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_mysqld.h 2025-05-19 16:14:25.000000000 +0000 @@ -356,7 +356,7 @@ const wsrep::key_array *fk_tables= nullptr, const HA_CREATE_INFO* create_info= nullptr); -bool wsrep_should_replicate_ddl(THD* thd, const handlerton *db_type); +bool wsrep_should_replicate_ddl(THD* thd, const handlerton *hton); bool wsrep_should_replicate_ddl_iterate(THD* thd, const TABLE_LIST* table_list); void wsrep_to_isolation_end(THD *thd); @@ -615,7 +615,6 @@ #define wsrep_thr_deinit() do {} while(0) #define wsrep_init_globals() do {} while(0) #define wsrep_create_appliers(X) do {} while(0) -#define wsrep_should_replicate_ddl(X,Y) (1) #define 
wsrep_cluster_address_exists() (false) #define WSREP_MYSQL_DB (0) #define WSREP_TO_ISOLATION_BEGIN(db_, table_, table_list_) do { } while(0) diff -Nru mariadb-10.11.11/sql/wsrep_server_service.cc mariadb-10.11.13/sql/wsrep_server_service.cc --- mariadb-10.11.11/sql/wsrep_server_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_server_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -192,6 +192,7 @@ break; case wsrep::log::unknown: WSREP_UNKNOWN("%s", message); + assert(0); break; } } diff -Nru mariadb-10.11.11/sql/wsrep_sst.cc mariadb-10.11.13/sql/wsrep_sst.cc --- mariadb-10.11.11/sql/wsrep_sst.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_sst.cc 2025-05-19 16:14:25.000000000 +0000 @@ -464,7 +464,7 @@ if (WSREP_ON) { int const rcode(seqno < 0 ? seqno : 0); - error= wsrep_sst_complete(thd,rcode, sst_gtid); + error= wsrep_sst_complete(thd, rcode, sst_gtid); } return error; @@ -1977,6 +1977,15 @@ wsrep::seqno(err ? wsrep::seqno::undefined() : wsrep::seqno(ret_seqno))); +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("sync.wsrep_sst_donor_after_donation", { + const char act[]= "now " + "SIGNAL sync.wsrep_sst_donor_after_donation_reached " + "WAIT_FOR signal.wsrep_sst_donor_after_donation_continue"; + DBUG_ASSERT(!debug_sync_set_action(thd.ptr, STRING_WITH_LEN(act))); + }); +#endif /* ENABLED_DEBUG_SYNC */ + Wsrep_server_state::instance().sst_sent(gtid, err); proc.wait(); diff -Nru mariadb-10.11.11/sql/wsrep_thd.h mariadb-10.11.13/sql/wsrep_thd.h --- mariadb-10.11.11/sql/wsrep_thd.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_thd.h 2025-05-19 16:14:25.000000000 +0000 @@ -237,25 +237,13 @@ wsrep::client_error ce, enum wsrep::provider::status status) { - DBUG_ASSERT(ce != wsrep::e_success); - switch (ce) - { - case wsrep::e_error_during_commit: - if (status == wsrep::provider::error_size_exceeded) - wsrep_override_error(thd, ER_UNKNOWN_ERROR, "Maximum writeset size exceeded"); - else - 
wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, 0, status); - break; - case wsrep::e_deadlock_error: - wsrep_override_error(thd, ER_LOCK_DEADLOCK); - break; - case wsrep::e_interrupted_error: - wsrep_override_error(thd, ER_QUERY_INTERRUPTED); - break; - case wsrep::e_size_exceeded_error: + DBUG_ASSERT(ce != wsrep::e_success); + switch (ce) + { + case wsrep::e_error_during_commit: + if (status == wsrep::provider::error_size_exceeded) wsrep_override_error(thd, ER_UNKNOWN_ERROR, "Maximum writeset size exceeded"); - break; - case wsrep::e_append_fragment_error: + else /* TODO: Figure out better error number */ if (status) wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, @@ -265,17 +253,45 @@ else wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, "Error while appending streaming replication fragment"); - break; - case wsrep::e_not_supported_error: - wsrep_override_error(thd, ER_NOT_SUPPORTED_YET); - break; - case wsrep::e_timeout_error: - wsrep_override_error(thd, ER_LOCK_WAIT_TIMEOUT); + break; + case wsrep::e_deadlock_error: + switch (thd->lex->sql_command) + { + case SQLCOM_XA_END: + case SQLCOM_XA_PREPARE: + wsrep_override_error(thd, ER_XA_RBDEADLOCK); break; default: - wsrep_override_error(thd, ER_UNKNOWN_ERROR); + wsrep_override_error(thd, ER_LOCK_DEADLOCK); break; } + break; + case wsrep::e_interrupted_error: + wsrep_override_error(thd, ER_QUERY_INTERRUPTED); + break; + case wsrep::e_size_exceeded_error: + wsrep_override_error(thd, ER_UNKNOWN_ERROR, "Maximum writeset size exceeded"); + break; + case wsrep::e_append_fragment_error: + /* TODO: Figure out better error number */ + if (status) + wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, + "Error while appending streaming replication fragment" + "(provider status: %s)", + wsrep::provider::to_string(status).c_str()); + else + wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, + "Error while appending streaming replication fragment"); + break; + case wsrep::e_not_supported_error: + wsrep_override_error(thd, 
ER_NOT_SUPPORTED_YET); + break; + case wsrep::e_timeout_error: + wsrep_override_error(thd, ER_LOCK_WAIT_TIMEOUT); + break; + default: + wsrep_override_error(thd, ER_UNKNOWN_ERROR); + } } /** diff -Nru mariadb-10.11.11/sql/wsrep_trans_observer.h mariadb-10.11.13/sql/wsrep_trans_observer.h --- mariadb-10.11.11/sql/wsrep_trans_observer.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_trans_observer.h 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright 2016-2023 Codership Oy +/* Copyright 2016-2025 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -265,12 +265,17 @@ { DBUG_RETURN(ret); } + if ((ret= thd->wsrep_cs().before_prepare()) == 0) { DBUG_ASSERT(!thd->wsrep_trx().ws_meta().gtid().is_undefined()); + /* Here we init xid with UUID and wsrep seqno. GTID is + set to undefined because commit order is decided later + in wsrep_before_commit(). wsrep_before_prepare() is + executed out of order. */ wsrep_xid_init(&thd->wsrep_xid, thd->wsrep_trx().ws_meta().gtid(), - wsrep_gtid_server.gtid()); + wsrep_gtid_server.undefined()); } mysql_mutex_lock(&thd->LOCK_thd_kill); @@ -472,12 +477,6 @@ int wsrep_after_statement(THD* thd) { DBUG_ENTER("wsrep_after_statement"); - WSREP_DEBUG("wsrep_after_statement for %lu client_state %s " - " client_mode %s trans_state %s", - thd_get_thread_id(thd), - wsrep::to_c_string(thd->wsrep_cs().state()), - wsrep::to_c_string(thd->wsrep_cs().mode()), - wsrep::to_c_string(thd->wsrep_cs().transaction().state())); int ret= ((thd->wsrep_cs().state() != wsrep::client_state::s_none && thd->wsrep_cs().mode() == Wsrep_client_state::m_local) && !thd->internal_transaction() ? 
diff -Nru mariadb-10.11.11/sql/wsrep_var.cc mariadb-10.11.13/sql/wsrep_var.cc --- mariadb-10.11.11/sql/wsrep_var.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_var.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright 2008-2022 Codership Oy +/* Copyright 2008-2023 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -353,14 +353,12 @@ var->save_result.string_value.length); start_pos_buf[var->save_result.string_value.length]= 0; - WSREP_DEBUG("SST wsrep_start_position check for new position %s old %s", - start_pos_buf, wsrep_start_position); + start_pos_buf, wsrep_start_position); // Verify the format. if (wsrep_start_position_verify(start_pos_buf)) return true; - // Give error if position is updated when wsrep is not enabled or // provider is not loaded. if ((!WSREP_ON || !Wsrep_server_state::instance().is_provider_loaded()) @@ -667,7 +665,7 @@ { wsrep_create_rollbacker(); WSREP_DEBUG("Cluster address update creating %ld applier threads running %lu", - wsrep_slave_threads, wsrep_running_applier_threads); + wsrep_slave_threads, wsrep_running_applier_threads); wsrep_create_appliers(wsrep_slave_threads); } mysql_mutex_unlock(&LOCK_wsrep_cluster_config); @@ -771,7 +769,7 @@ { wsrep_slave_count_change = (wsrep_slave_threads - wsrep_running_applier_threads); WSREP_DEBUG("Change on slave threads: New %ld old %lu difference %d", - wsrep_slave_threads, wsrep_running_applier_threads, wsrep_slave_count_change); + wsrep_slave_threads, wsrep_running_applier_threads, wsrep_slave_count_change); } bool wsrep_slave_threads_update (sys_var *self, THD* thd, enum_var_type type) @@ -796,9 +794,9 @@ // Thread creation and execution is asyncronous, therefore we need // wait them to be started or error produced while (wsrep_running_applier_threads != (ulong)wsrep_slave_threads && - !wsrep_thread_create_failed.load(std::memory_order_relaxed)) + 
!wsrep_thread_create_failed.load(std::memory_order_relaxed)) { - my_sleep(1000); + my_sleep(1000); } mysql_mutex_lock(&LOCK_global_system_variables); @@ -987,6 +985,22 @@ bool wsrep_mode_check(sys_var *self, THD* thd, set_var* var) { + ulonglong new_wsrep_mode= var->save_result.ulonglong_value; + ulonglong old_wsrep_mode= wsrep_mode; + wsrep_mode= new_wsrep_mode; + if (wsrep_check_mode(WSREP_MODE_REPLICATE_MYISAM) || + wsrep_check_mode(WSREP_MODE_REPLICATE_ARIA)) + { + if (!(wsrep_forced_binlog_format == BINLOG_FORMAT_UNSPEC || + wsrep_forced_binlog_format == BINLOG_FORMAT_ROW)) + { + my_message(ER_WRONG_ARGUMENTS, "wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] " + "can't be enabled if wsrep_forced_binlog != [NONE|ROW]", MYF(0)); + wsrep_mode= old_wsrep_mode; + return true; + } + } + wsrep_mode= old_wsrep_mode; return false; } @@ -1130,3 +1144,28 @@ return false; } +bool wsrep_forced_binlog_format_check(sys_var *self, THD* thd, set_var* var) +{ + ulonglong new_forced_binlog_format= var->save_result.ulonglong_value; + + if (!(new_forced_binlog_format == BINLOG_FORMAT_UNSPEC || + new_forced_binlog_format == BINLOG_FORMAT_ROW)) + { + if (wsrep_check_mode(WSREP_MODE_BINLOG_ROW_FORMAT_ONLY)) + { + my_message(ER_WRONG_ARGUMENTS, "wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set " + "if wsrep_mode=BINLOG_ROW_FORMAT_ONLY", MYF(0)); + return true; + } + + if (wsrep_check_mode(WSREP_MODE_REPLICATE_MYISAM) || + wsrep_check_mode(WSREP_MODE_REPLICATE_ARIA)) + { + my_message(ER_WRONG_ARGUMENTS, "wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set " + "if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA]", MYF(0)); + return true; + } + } + + return false; +} diff -Nru mariadb-10.11.11/sql/wsrep_var.h mariadb-10.11.13/sql/wsrep_var.h --- mariadb-10.11.11/sql/wsrep_var.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_var.h 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2021 Codership Oy +/* Copyright (C) 2013-2023 Codership 
Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -110,6 +110,7 @@ extern bool wsrep_gtid_domain_id_update UPDATE_ARGS; extern bool wsrep_mode_check CHECK_ARGS; +extern bool wsrep_forced_binlog_format_check CHECK_ARGS; #else /* WITH_WSREP */ #define wsrep_provider_init(X) diff -Nru mariadb-10.11.11/sql/wsrep_xid.cc mariadb-10.11.13/sql/wsrep_xid.cc --- mariadb-10.11.11/sql/wsrep_xid.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_xid.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright 2015 Codership Oy +/* Copyright 2015-2025 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -24,6 +24,8 @@ #include #include /* std::sort() */ +#include /* std::string */ +#include /* std::stringstream */ /* * WSREPXid */ @@ -119,11 +121,7 @@ if (hton->set_checkpoint) { - const unsigned char* uuid= wsrep_xid_uuid(xid); - char uuid_str[40]= {0, }; - wsrep_uuid_print((const wsrep_uuid_t*)uuid, uuid_str, sizeof(uuid_str)); - WSREP_DEBUG("Set WSREPXid for InnoDB: %s:%lld", - uuid_str, (long long)wsrep_xid_seqno(xid)); + WSREP_DEBUG("Set WSREPXid for InnoDB: %s", wsrep_xid_print(xid).c_str()); hton->set_checkpoint(hton, xid); } return FALSE; @@ -150,12 +148,7 @@ if (hton->get_checkpoint) { hton->get_checkpoint(hton, xid); - wsrep_uuid_t uuid; - memcpy(&uuid, wsrep_xid_uuid(xid), sizeof(uuid)); - char uuid_str[40]= {0, }; - wsrep_uuid_print(&uuid, uuid_str, sizeof(uuid_str)); - WSREP_DEBUG("Read WSREPXid from InnoDB: %s:%lld", - uuid_str, (long long)wsrep_xid_seqno(xid)); + WSREP_DEBUG("Read WSREPXid from InnoDB: %s", wsrep_xid_print(xid).c_str()); } return FALSE; } @@ -252,3 +245,29 @@ { std::sort(array, array + len, Wsrep_xid_cmp()); } + +std::string wsrep_xid_print(const XID *xid) +{ + std::stringstream ss; + const unsigned char* uuid= 
wsrep_xid_uuid(xid); + char uuid_str[40]= {0, }; + wsrep_uuid_print((const wsrep_uuid_t*)uuid, uuid_str, sizeof(uuid_str)); + wsrep_server_gtid_t gtid= {0,0,0}; + memcpy(>id, &xid->data[WSREP_XID_RPL_GTID_OFFSET], sizeof(wsrep_server_gtid_t)); + ss << uuid_str << ":" << wsrep_xid_seqno(xid) << " " << gtid.domain_id << "-" + << gtid.server_id << "-" << gtid.seqno; + return ss.str(); +} + +bool wsrep_is_xid_gtid_undefined(const XID *xid) +{ + wsrep_server_gtid_t gtid= {0,0,0}; + + if (wsrep_is_wsrep_xid(xid) && + xid->data[WSREP_XID_VERSION_OFFSET] == WSREP_XID_VERSION_3) + { + memcpy(>id, &xid->data[WSREP_XID_RPL_GTID_OFFSET], sizeof(wsrep_server_gtid_t)); + } + + return (gtid.seqno == 0 && gtid.server_id == 0 && gtid.domain_id == 0); +} diff -Nru mariadb-10.11.11/sql/wsrep_xid.h mariadb-10.11.13/sql/wsrep_xid.h --- mariadb-10.11.11/sql/wsrep_xid.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_xid.h 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright (C) 2015 Codership Oy +/* Copyright (C) 2015-2025 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,8 @@ //void wsrep_set_SE_checkpoint(XID&); /* uncomment if needed */ void wsrep_sort_xid_array(XID *array, int len); +std::string wsrep_xid_print(const XID *xid); +bool wsrep_is_xid_gtid_undefined(const XID *xid); #endif /* WITH_WSREP */ #endif /* WSREP_UTILS_H */ diff -Nru mariadb-10.11.11/sql/yy_mariadb.cc mariadb-10.11.13/sql/yy_mariadb.cc --- mariadb-10.11.11/sql/yy_mariadb.cc 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/sql/yy_mariadb.cc 2025-05-19 16:14:28.000000000 +0000 @@ -39851,7 +39851,7 @@ if ((yyvsp[0].lex_str).str) { if (unlikely(Lex->sql_command == SQLCOM_CREATE_VIEW && - check_column_name((yyvsp[0].lex_str).str))) + check_column_name((yyvsp[0].lex_str)))) my_yyabort_error((ER_WRONG_COLUMN_NAME, MYF(0), (yyvsp[0].lex_str).str)); 
(yyvsp[-2].item)->base_flags|= item_base_t::IS_EXPLICIT_NAME; (yyvsp[-2].item)->set_name(thd, (yyvsp[0].lex_str)); diff -Nru mariadb-10.11.11/sql/yy_oracle.cc mariadb-10.11.13/sql/yy_oracle.cc --- mariadb-10.11.11/sql/yy_oracle.cc 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/sql/yy_oracle.cc 2025-05-19 16:14:28.000000000 +0000 @@ -39022,7 +39022,7 @@ if ((yyvsp[0].lex_str).str) { if (unlikely(Lex->sql_command == SQLCOM_CREATE_VIEW && - check_column_name((yyvsp[0].lex_str).str))) + check_column_name((yyvsp[0].lex_str)))) my_yyabort_error((ER_WRONG_COLUMN_NAME, MYF(0), (yyvsp[0].lex_str).str)); (yyvsp[-2].item)->base_flags|= item_base_t::IS_EXPLICIT_NAME; (yyvsp[-2].item)->set_name(thd, (yyvsp[0].lex_str)); diff -Nru mariadb-10.11.11/storage/connect/CMakeLists.txt mariadb-10.11.13/storage/connect/CMakeLists.txt --- mariadb-10.11.11/storage/connect/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -413,14 +413,16 @@ RETURN() ENDIF() -IF(MSVC AND (CMAKE_CXX_FLAGS MATCHES "/MP")) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") # domdoc.cpp uses compiler directive #import which is not compatible # with the /MP option, resulting in compiler error C2813. # Remove /MP for this file. 
+ GET_TARGET_PROPERTY(CURRENT_COMPILE_OPTIONS connect COMPILE_OPTIONS) + LIST(REMOVE_ITEM CURRENT_COMPILE_OPTIONS "$<$:/MP>") + SET_TARGET_PROPERTIES(connect PROPERTIES COMPILE_OPTIONS "${CURRENT_COMPILE_OPTIONS}") SET(src_list ${CONNECT_SOURCES}) LIST(FIND src_list domdoc.cpp idx) IF(idx GREATER -1) - STRING(REPLACE "/MP" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") LIST(REMOVE_AT src_list ${idx}) SET_SOURCE_FILES_PROPERTIES(${src_list} PROPERTIES COMPILE_FLAGS "/MP") ENDIF() diff -Nru mariadb-10.11.11/storage/connect/connect.cc mariadb-10.11.13/storage/connect/connect.cc --- mariadb-10.11.11/storage/connect/connect.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/connect.cc 2025-05-19 16:14:25.000000000 +0000 @@ -92,11 +92,11 @@ free(dbuserp); - if (trace(1)) - htrc("CntEndDB: Freeing Dup\n"); + if (trace(1)) + htrc("CntEndDB: Freeing Dup\n"); - g->Activityp->Aptr = NULL; - } // endif dbuserp + g->Activityp->Aptr = NULL; // Free PlgGetUser() data + } // endif dbuserp } // end of CntEndDB diff -Nru mariadb-10.11.11/storage/connect/plgxml.h mariadb-10.11.13/storage/connect/plgxml.h --- mariadb-10.11.11/storage/connect/plgxml.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/plgxml.h 2025-05-19 16:14:25.000000000 +0000 @@ -5,7 +5,7 @@ /******************************************************************/ /* Dual XML implementation base classes defines. 
*/ /******************************************************************/ -#if !defined(BASE_BUFFER_SIZE) +#ifndef LIBXML2_SUPPORT enum ElementType { // libxml2 XML_ELEMENT_NODE = 1, XML_ATTRIBUTE_NODE = 2, @@ -28,7 +28,7 @@ XML_XINCLUDE_START = 19, XML_XINCLUDE_END = 20, XML_DOCB_DOCUMENT_NODE = 21}; -#endif // !BASE_BUFFER_SIZE +#endif //#if !defined(NODE_TYPE_LIST) #ifdef NOT_USED diff -Nru mariadb-10.11.11/storage/connect/tabxml.cpp mariadb-10.11.13/storage/connect/tabxml.cpp --- mariadb-10.11.11/storage/connect/tabxml.cpp 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/tabxml.cpp 2025-05-19 16:14:25.000000000 +0000 @@ -25,6 +25,9 @@ #include #include //#include +#ifdef LIBXML2_SUPPORT +#include +#endif #include "osutil.h" #define _O_RDONLY O_RDONLY #endif // !_WIN32 diff -Nru mariadb-10.11.11/storage/connect/user_connect.cc mariadb-10.11.13/storage/connect/user_connect.cc --- mariadb-10.11.11/storage/connect/user_connect.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/user_connect.cc 2025-05-19 16:14:25.000000000 +0000 @@ -101,9 +101,6 @@ PACTIVITY ap= NULL; PDBUSER dup= NULL; - // Areasize= 64M because of VEC tables. 
Should be parameterisable -//g= PlugInit(NULL, 67108864); -//g= PlugInit(NULL, 134217728); // 128M was because of old embedded tests g= PlugInit(NULL, (size_t)worksize); // Check whether the initialization is complete @@ -113,12 +110,13 @@ printf("%s\n", g->Message); (void) PlugExit(g); + g= 0; - if (dup) - free(dup); + if (dup) + free(dup); return true; - } // endif g-> + } // endif g-> dup->Catalog= new MYCAT(NULL); @@ -128,17 +126,16 @@ g->Activityp= ap; g->Activityp->Aptr= dup; - pthread_mutex_lock(&usrmut); + pthread_mutex_lock(&usrmut); next= to_users; to_users= this; if (next) next->previous= this; - count = 1; - pthread_mutex_unlock(&usrmut); - - last_query_id= thdp->query_id; + count = 1; + pthread_mutex_unlock(&usrmut); + last_query_id= thdp->query_id; return false; } // end of user_init diff -Nru mariadb-10.11.11/storage/federatedx/federatedx_io.cc mariadb-10.11.13/storage/federatedx/federatedx_io.cc --- mariadb-10.11.11/storage/federatedx/federatedx_io.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/federatedx/federatedx_io.cc 2025-05-19 16:14:25.000000000 +0000 @@ -51,6 +51,7 @@ static const io_schemes_st federated_io_schemes[] = { { "mysql", &instantiate_io_mysql }, + { "mariadb", &instantiate_io_mysql }, { "null", instantiate_io_null } /* must be last element */ }; diff -Nru mariadb-10.11.11/storage/federatedx/ha_federatedx.cc mariadb-10.11.13/storage/federatedx/ha_federatedx.cc --- mariadb-10.11.11/storage/federatedx/ha_federatedx.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/federatedx/ha_federatedx.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1484,20 +1484,20 @@ sizeof(int) + 8); key.append(scheme); key.q_append('\0'); - server->hostname= (const char *) (intptr) key.length(); + size_t hostname_pos= key.length(); key.append(hostname); key.q_append('\0'); - server->database= (const char *) (intptr) key.length(); + size_t database_pos= key.length(); key.append(database); key.q_append('\0'); 
key.q_append((uint32) share->port); - server->socket= (const char *) (intptr) key.length(); + size_t socket_pos= key.length(); key.append(socket); key.q_append('\0'); - server->username= (const char *) (intptr) key.length(); + size_t username_pos= key.length(); key.append(username); key.q_append('\0'); - server->password= (const char *) (intptr) key.length(); + size_t password_pos= key.length(); key.append(password); key.c_ptr_safe(); // Ensure we have end \0 @@ -1505,13 +1505,12 @@ /* Copy and add end \0 */ server->key= (uchar *) strmake_root(mem_root, key.ptr(), key.length()); - /* pointer magic */ - server->scheme+= (intptr) server->key; - server->hostname+= (intptr) server->key; - server->database+= (intptr) server->key; - server->username+= (intptr) server->key; - server->password+= (intptr) server->key; - server->socket+= (intptr) server->key; + server->scheme= (const char *)server->key; + server->hostname= (const char *)server->key + hostname_pos; + server->database= (const char *)server->key + database_pos; + server->username= (const char *)server->key + username_pos; + server->password= (const char *)server->key + password_pos; + server->socket= (const char*)server->key + socket_pos; server->port= share->port; if (!share->socket) diff -Nru mariadb-10.11.11/storage/innobase/CMakeLists.txt mariadb-10.11.13/storage/innobase/CMakeLists.txt --- mariadb-10.11.11/storage/innobase/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -226,7 +226,6 @@ include/dict0pagecompress.h include/dict0pagecompress.inl include/dict0stats.h - include/dict0stats.inl include/dict0stats_bg.h include/dict0types.h include/dyn0buf.h diff -Nru mariadb-10.11.11/storage/innobase/btr/btr0sea.cc mariadb-10.11.13/storage/innobase/btr/btr0sea.cc --- mariadb-10.11.11/storage/innobase/btr/btr0sea.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/btr/btr0sea.cc 2025-05-19 
16:14:25.000000000 +0000 @@ -195,7 +195,7 @@ } /** Lazily free detached metadata when removing the last reference. */ -ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index) +ATTRIBUTE_COLD void btr_search_lazy_free(dict_index_t *index) { ut_ad(index->freed()); dict_table_t *table= index->table; @@ -219,8 +219,7 @@ table->autoinc_mutex.wr_unlock(); } -/** Disable the adaptive hash search system and empty the index. */ -void btr_search_disable() +ATTRIBUTE_COLD bool btr_search_disable() { dict_table_t* table; @@ -231,7 +230,7 @@ if (!btr_search_enabled) { dict_sys.unfreeze(); btr_search_x_unlock_all(); - return; + return false; } btr_search_enabled = false; @@ -259,23 +258,25 @@ btr_search_sys.clear(); btr_search_x_unlock_all(); + + return true; } /** Enable the adaptive hash search system. @param resize whether buf_pool_t::resize() is the caller */ -void btr_search_enable(bool resize) +ATTRIBUTE_COLD void btr_search_enable(bool resize) { if (!resize) { mysql_mutex_lock(&buf_pool.mutex); - bool changed = srv_buf_pool_old_size != srv_buf_pool_size; + const auto is_shrinking = buf_pool.is_shrinking(); mysql_mutex_unlock(&buf_pool.mutex); - if (changed) { + if (is_shrinking) { return; } } btr_search_x_lock_all(); - ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64; + ulint hash_size = buf_pool.curr_pool_size() / sizeof(void *) / 64; if (btr_search_sys.parts[0].heap) { ut_ad(btr_search_enabled); @@ -939,88 +940,6 @@ info->last_hash_succ = FALSE; } -/** Clear the adaptive hash index on all pages in the buffer pool. 
*/ -inline void buf_pool_t::clear_hash_index() noexcept -{ - ut_ad(!resizing); - ut_ad(!btr_search_enabled); - - std::set garbage; - - for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; ) - { - for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size; - block != end; block++) - { - dict_index_t *index= block->index; - assert_block_ahi_valid(block); - - /* We can clear block->index and block->n_pointers when - holding all AHI latches exclusively; see the comments in buf0buf.h */ - - if (!index) - { -# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - ut_a(!block->n_pointers); -# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - continue; - } - - ut_d(const auto s= block->page.state()); - /* Another thread may have set the state to - REMOVE_HASH in buf_LRU_block_remove_hashed(). - - The state change in buf_pool_t::realloc() is not observable - here, because in that case we would have !block->index. - - In the end, the entire adaptive hash index will be removed. */ - ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH); -# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - block->n_pointers= 0; -# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - if (index->freed()) - garbage.insert(index); - block->index= nullptr; - } - } - - for (dict_index_t *index : garbage) - btr_search_lazy_free(index); -} - -/** Get a buffer block from an adaptive hash index pointer. -This function does not return if the block is not identified. -@param ptr pointer to within a page frame -@return pointer to block, never NULL */ -inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const noexcept -{ - chunk_t::map *chunk_map = chunk_t::map_ref; - ut_ad(chunk_t::map_ref == chunk_t::map_reg); - ut_ad(!resizing); - - chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr); - ut_a(it != chunk_map->begin()); - - chunk_t *chunk= it == chunk_map->end() - ? 
chunk_map->rbegin()->second - : (--it)->second; - - const size_t offs= size_t(ptr - chunk->blocks->page.frame) >> - srv_page_size_shift; - ut_a(offs < chunk->size); - - buf_block_t *block= &chunk->blocks[offs]; - /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that - block[n].frame == block->page.frame + n * srv_page_size. Check it. */ - ut_ad(block->page.frame == page_align(ptr)); - /* Read the state of the block without holding hash_lock. - A state transition to REMOVE_HASH is possible during - this execution. */ - ut_ad(block->page.state() >= buf_page_t::REMOVE_HASH); - - return block; -} - /** Tries to guess the right search position based on the hash search info of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, and the function returns TRUE, then cursor->up_match and cursor->low_match @@ -1103,7 +1022,8 @@ return false; } - buf_block_t* block = buf_pool.block_from_ahi(rec); + buf_block_t* block = buf_pool.block_from(rec); + ut_ad(block->page.frame == page_align(rec)); buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get( block->page.id().fold()); @@ -2196,7 +2116,7 @@ for (; node != NULL; node = node->next) { const buf_block_t* block - = buf_pool.block_from_ahi((byte*) node->data); + = buf_pool.block_from(node->data); index_id_t page_index_id; if (UNIV_LIKELY(block->page.in_file())) { diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0buddy.cc mariadb-10.11.13/storage/innobase/buf/buf0buddy.cc --- mariadb-10.11.11/storage/innobase/buf/buf0buddy.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0buddy.cc 2025-05-19 16:14:25.000000000 +0000 @@ -162,6 +162,20 @@ } #ifdef UNIV_DEBUG +const buf_block_t *buf_pool_t::contains_zip(const void *data, size_t shift) + const noexcept +{ + const size_t d= size_t(data) >> shift; + + for (size_t i= 0; i < n_blocks; i++) + { + const buf_block_t *block= get_nth_page(i); + if (size_t(block->page.zip.data) >> shift == d) + return block; + } + return 
nullptr; +} + /** Validate a given zip_free list. */ struct CheckZipFree { CheckZipFree(ulint i) : m_i(i) {} @@ -257,13 +271,10 @@ /** Add a block to the head of the appropriate buddy free list. @param[in,out] buf block to be freed @param[in] i index of buf_pool.zip_free[] */ -UNIV_INLINE -void -buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i) +static void buf_buddy_add_to_free(buf_buddy_free_t *buf, ulint i) { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(buf_pool.zip_free[i].start != buf); - buf_buddy_stamp_free(buf, i); UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf); ut_d(buf_buddy_list_validate(i)); @@ -272,9 +283,7 @@ /** Remove a block from the appropriate buddy free list. @param[in,out] buf block to be freed @param[in] i index of buf_pool.zip_free[] */ -UNIV_INLINE -void -buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i) +static void buf_buddy_remove_from_free(buf_buddy_free_t *buf, ulint i) { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(buf_buddy_check_free(buf, i)); @@ -298,13 +307,10 @@ buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]); - if (buf_pool.is_shrinking() - && UT_LIST_GET_LEN(buf_pool.withdraw) - < buf_pool.withdraw_target) { - + if (size_t size = buf_pool.shrinking_size()) { while (buf != NULL && buf_pool.will_be_withdrawn( - reinterpret_cast(buf))) { + reinterpret_cast(buf), size)) { /* This should be withdrawn, not to be allocated */ buf = UT_LIST_GET_NEXT(list, buf); } @@ -312,6 +318,7 @@ if (buf) { buf_buddy_remove_from_free(buf, i); + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + i)); } else if (i + 1 < BUF_BUDDY_SIZES) { /* Attempt to split. 
*/ buf = buf_buddy_alloc_zip(i + 1); @@ -321,7 +328,6 @@ reinterpret_cast( reinterpret_cast(buf) + (BUF_BUDDY_LOW << i)); - ut_ad(!buf_pool.contains_zip(buddy)); buf_buddy_add_to_free(buddy, i); } } @@ -340,74 +346,52 @@ return(buf); } +#ifdef UNIV_DEBUG +/** number of blocks allocated to the buddy system */ +static size_t buf_buddy_n_frames; +#endif + /** Deallocate a buffer frame of srv_page_size. @param buf buffer frame to deallocate */ static void buf_buddy_block_free(void *buf) noexcept { mysql_mutex_assert_owner(&buf_pool.mutex); - ut_a(!ut_align_offset(buf, srv_page_size)); - - const ulint fold= BUF_POOL_ZIP_FOLD_PTR(buf); - buf_page_t **prev= buf_pool.zip_hash.cell_get(fold)-> - search(&buf_page_t::hash, [buf](const buf_page_t *b) - { - ut_ad(b->in_zip_hash); - ut_ad(b->state() == buf_page_t::MEMORY); - return b->frame == buf; - }); - - buf_page_t *bpage= *prev; - ut_a(bpage); - ut_a(bpage->frame == buf); - ut_d(bpage->in_zip_hash= false); - *prev= bpage->hash; - bpage->hash= nullptr; - + buf_block_t *block= buf_pool.block_from(buf); + ut_ad(block->page.state() == buf_page_t::MEMORY); + ut_ad(block->page.frame == buf); + ut_ad(!buf_pool.contains_zip(buf, srv_page_size_shift)); ut_d(memset(buf, 0, srv_page_size)); MEM_UNDEFINED(buf, srv_page_size); - - buf_LRU_block_free_non_file_page(reinterpret_cast(bpage)); - ut_ad(buf_pool.buddy_n_frames > 0); - ut_d(buf_pool.buddy_n_frames--); + buf_LRU_block_free_non_file_page(block); + ut_ad(buf_buddy_n_frames > 0); + ut_d(buf_buddy_n_frames--); } /** Allocate a buffer block to the buddy allocator. 
@param block buffer block to register */ static void buf_buddy_block_register(buf_block_t *block) noexcept { - const ulint fold= BUF_POOL_ZIP_FOLD(block); + ut_ad(buf_pool.is_uncompressed_current(block)); ut_ad(block->page.state() == buf_page_t::MEMORY); - - ut_a(block->page.frame); - ut_a(!ut_align_offset(block->page.frame, srv_page_size)); - - ut_ad(!block->page.in_zip_hash); - ut_d(block->page.in_zip_hash= true); - buf_pool.zip_hash.cell_get(fold)->append(block->page, &buf_page_t::hash); - ut_d(buf_pool.buddy_n_frames++); + ut_d(buf_buddy_n_frames++); } /** Allocate a block from a bigger object. @param[in] buf a block that is free to use @param[in] i index of buf_pool.zip_free[] -@param[in] j size of buf as an index of buf_pool.zip_free[] @return allocated block */ -static -void* -buf_buddy_alloc_from(void* buf, ulint i, ulint j) +static void *buf_buddy_alloc_from(void *buf, ulint i) { - ulint offs = BUF_BUDDY_LOW << j; - ut_ad(j <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); - ut_ad(j >= i); - ut_ad(!ut_align_offset(buf, offs)); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(!ut_align_offset(buf, srv_page_size)); + ut_ad(!buf_pool.contains_zip(buf, srv_page_size_shift)); /* Add the unused parts of the block to the free lists. */ - while (j > i) { + for (ulint j = BUF_BUDDY_SIZES, offs = srv_page_size; j-- > i; ) { buf_buddy_free_t* zip_buf; offs >>= 1; - j--; zip_buf = reinterpret_cast( reinterpret_cast(buf) + offs); @@ -422,7 +406,7 @@ @param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES @param lru assigned to true if buf_pool.mutex was temporarily released @return allocated block, never NULL */ -byte *buf_buddy_alloc_low(ulint i, bool *lru) +byte *buf_buddy_alloc_low(ulint i, bool *lru) noexcept { buf_block_t* block; @@ -439,7 +423,7 @@ } /* Try allocating from the buf_pool.free list. 
*/ - block = buf_LRU_get_free_only(); + block = buf_pool.allocate(); if (block) { goto alloc_big; @@ -455,21 +439,21 @@ buf_buddy_block_register(block); block = reinterpret_cast( - buf_buddy_alloc_from(block->page.frame, i, BUF_BUDDY_SIZES)); + buf_buddy_alloc_from(block->page.frame, i)); func_exit: buf_pool.buddy_stat[i].used++; return reinterpret_cast(block); } -/** Try to relocate a block. The caller must hold zip_free_mutex, and this -function will release and lock it again. +/** Try to relocate a block. @param[in] src block to relocate @param[in] dst free block to relocated to @param[in] i index of buf_pool.zip_free[] @param[in] force true if we must relocated always @return true if relocated */ -static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) +static bool buf_buddy_relocate(void *src, void *dst, ulint i, bool force) + noexcept { buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; @@ -575,7 +559,7 @@ @param[in] buf block to be freed, must not be pointed to by the buffer pool @param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ -void buf_buddy_free_low(void* buf, ulint i) +void buf_buddy_free_low(void* buf, ulint i) noexcept { buf_buddy_free_t* buddy; @@ -595,13 +579,12 @@ ut_ad(i < BUF_BUDDY_SIZES); ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); - ut_ad(!buf_pool.contains_zip(buf)); + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + i)); /* Do not recombine blocks if there are few free blocks. 
We may waste up to 15360*max_len bytes to free blocks (1024 + 2048 + 4096 + 8192 = 15360) */ - if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16 - && !buf_pool.is_shrinking()) { + if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16) { goto func_exit; } @@ -615,10 +598,9 @@ /* The buddy is free: recombine */ buf_buddy_remove_from_free(buddy, i); buddy_is_free: - ut_ad(!buf_pool.contains_zip(buddy)); i++; buf = ut_align_down(buf, BUF_BUDDY_LOW << i); - + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + i)); goto recombine; case BUF_BUDDY_STATE_USED: @@ -655,107 +637,120 @@ buf_buddy_add_to_free(reinterpret_cast(buf), i); } -/** Try to reallocate a block. -@param[in] buf buf_pool block to be reallocated -@param[in] size block size, up to srv_page_size -@return whether the reallocation succeeded */ -bool -buf_buddy_realloc(void* buf, ulint size) -{ - buf_block_t* block = NULL; - ulint i = buf_buddy_get_slot(size); - - mysql_mutex_assert_owner(&buf_pool.mutex); - ut_ad(i <= BUF_BUDDY_SIZES); - ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); - - if (i < BUF_BUDDY_SIZES) { - /* Try to allocate from the buddy system. */ - block = reinterpret_cast(buf_buddy_alloc_zip(i)); - } - - if (block == NULL) { - /* Try allocating from the buf_pool.free list. */ - block = buf_LRU_get_free_only(); - - if (block == NULL) { - return(false); /* free_list was not enough */ - } +/** Reallocate a ROW_FORMAT=COMPRESSED page frame during buf_pool_t::shrink(). 
+@param bpage page descriptor covering a ROW_FORMAT=COMPRESSED page +@param block uncompressed block for storage +@return block +@retval nullptr if the block was consumed */ +ATTRIBUTE_COLD +buf_block_t *buf_buddy_shrink(buf_page_t *bpage, buf_block_t *block) noexcept +{ + ut_ad(bpage->zip.data); + + void *dst= nullptr; + ulint size= page_zip_get_size(&bpage->zip); + ulint i= buf_buddy_get_slot(size); + + ut_ad(buf_pool.will_be_withdrawn(bpage->zip.data, size)); + ut_ad(bpage->can_relocate()); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + if (UNIV_LIKELY(i < BUF_BUDDY_SIZES)) + dst= buf_buddy_alloc_zip(i); + + if (!dst) + { + buf_buddy_block_register(block); + dst= buf_buddy_alloc_from(block->page.frame, i); + ut_ad(dst); + block= nullptr; + } + + void *src= bpage->zip.data; + memcpy_aligned(dst, src, size); + bpage->zip.data= static_cast(dst); + buf_pool.buddy_stat[i].relocated++; + + while (i < BUF_BUDDY_SIZES) + { + MEM_UNDEFINED(src, BUF_BUDDY_LOW << i); + /* Try to combine adjacent blocks. */ + buf_buddy_free_t *buddy= reinterpret_cast + (buf_buddy_get(static_cast(src), BUF_BUDDY_LOW << i)); - buf_buddy_block_register(block); - - block = reinterpret_cast( - buf_buddy_alloc_from( - block->page.frame, i, BUF_BUDDY_SIZES)); - } - - buf_pool.buddy_stat[i].used++; - - /* Try to relocate the buddy of buf to the free block. */ - if (buf_buddy_relocate(buf, block, i, true)) { - /* succeeded */ - buf_buddy_free_low(buf, i); - } else { - /* failed */ - buf_buddy_free_low(block, i); - } - - return(true); /* free_list was enough */ -} - -/** Combine all pairs of free buddies. 
*/ -void buf_buddy_condense_free() -{ - mysql_mutex_assert_owner(&buf_pool.mutex); - ut_ad(buf_pool.is_shrinking()); - - for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) { - buf_buddy_free_t* buf = - UT_LIST_GET_FIRST(buf_pool.zip_free[i]); - - /* seek to withdraw target */ - while (buf != NULL - && !buf_pool.will_be_withdrawn( - reinterpret_cast(buf))) { - buf = UT_LIST_GET_NEXT(list, buf); - } - - while (buf != NULL) { - buf_buddy_free_t* next = - UT_LIST_GET_NEXT(list, buf); - - buf_buddy_free_t* buddy = - reinterpret_cast( - buf_buddy_get( - reinterpret_cast(buf), - BUF_BUDDY_LOW << i)); - - /* seek to the next withdraw target */ - while (true) { - while (next != NULL - && !buf_pool.will_be_withdrawn( - reinterpret_cast(next))) { - next = UT_LIST_GET_NEXT(list, next); - } - - if (buddy != next) { - break; - } - - next = UT_LIST_GET_NEXT(list, next); - } - - if (buf_buddy_is_free(buddy, i) - == BUF_BUDDY_STATE_FREE) { - /* Both buf and buddy are free. - Try to combine them. */ - buf_buddy_remove_from_free(buf, i); - buf_pool.buddy_stat[i].used++; + if (buf_buddy_is_free(buddy, i) != BUF_BUDDY_STATE_FREE) + { + ut_ad(!buf_pool.contains_zip(src, BUF_BUDDY_LOW_SHIFT + i)); + buf_buddy_add_to_free(static_cast(src), i); + return block; + } + + /* The buddy is free: recombine */ + buf_buddy_remove_from_free(buddy, i); + i++; + src= ut_align_down(src, BUF_BUDDY_LOW << i); + } + + buf_buddy_block_free(src); + return block; +} + +/** Combine all pairs of free buddies. 
+@param size the target innodb_buffer_pool_size */ +ATTRIBUTE_COLD void buf_buddy_condense_free(size_t size) noexcept +{ + ut_ad(size); + ut_ad(size == buf_pool.shrinking_size()); + + for (ulint i= 0; i < array_elements(buf_pool.zip_free); i++) + { + buf_buddy_free_t *buf= UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + + /* seek to withdraw target */ + while (buf && + !buf_pool.will_be_withdrawn(reinterpret_cast(buf), size)) + buf= UT_LIST_GET_NEXT(list, buf); - buf_buddy_free_low(buf, i); - } + for (buf_buddy_free_t *next= buf; buf; buf= next) + { + buf_buddy_free_t *buddy= reinterpret_cast + (buf_buddy_get(reinterpret_cast(buf), BUF_BUDDY_LOW << i)); - buf = next; - } - } + /* seek to the next withdraw target */ + do + { + while ((next= UT_LIST_GET_NEXT(list, next)) && + !buf_pool.will_be_withdrawn(reinterpret_cast(next), + size)) {} + } + while (buddy == next); + + if (buf_buddy_is_free(buddy, i) != BUF_BUDDY_STATE_FREE) + continue; + + buf_buddy_remove_from_free(buf, i); + ulint j= i; + recombine: + buf_buddy_remove_from_free(buddy, j); + j++; + buf= static_cast + (ut_align_down(buf, BUF_BUDDY_LOW << j)); + MEM_UNDEFINED(buf, BUF_BUDDY_LOW << j); + + if (j == BUF_BUDDY_SIZES) + { + buf_buddy_block_free(buf); + continue; + } + + buddy= reinterpret_cast + (buf_buddy_get(reinterpret_cast(buf), BUF_BUDDY_LOW << j)); + if (buf_buddy_is_free(buddy, j) == BUF_BUDDY_STATE_FREE) + goto recombine; + + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + j)); + buf_buddy_add_to_free(buf, j); + } + } } diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0buf.cc mariadb-10.11.13/storage/innobase/buf/buf0buf.cc --- mariadb-10.11.11/storage/innobase/buf/buf0buf.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0buf.cc 2025-05-19 16:14:25.000000000 +0000 @@ -47,8 +47,6 @@ #include "lock0lock.h" #include "btr0sea.h" #include "ibuf0ibuf.h" -#include "trx0undo.h" -#include "trx0purge.h" #include "log0log.h" #include "dict0stats_bg.h" #include 
"srv0srv.h" @@ -64,6 +62,7 @@ #include #include #include "log.h" +#include "my_virtual_mem.h" using st_::span; @@ -277,6 +276,56 @@ */ #ifndef UNIV_INNOCHECKSUM +/** Compute the number of page frames needed for buf_block_t, +per innodb_buffer_pool_extent_size. +@param ps innodb_page_size +@return number of buf_block_t frames per extent */ +static constexpr uint8_t first_page(size_t ps) +{ + return uint8_t(innodb_buffer_pool_extent_size / ps - + innodb_buffer_pool_extent_size / (ps + sizeof(buf_block_t))); +} + +/** Compute the number of bytes needed for buf_block_t, +per innodb_buffer_pool_extent_size. +@param ps innodb_page_size +@return number of buf_block_t frames per extent */ +static constexpr size_t first_frame(size_t ps) +{ + return first_page(ps) * ps; +} + +/** Compute the number of pages per innodb_buffer_pool_extent_size. +@param ps innodb_page_size +@return number of buf_block_t frames per extent */ +static constexpr uint16_t pages(size_t ps) +{ + return uint16_t(innodb_buffer_pool_extent_size / ps - first_page(ps)); +} + +/** The byte offset of the first page frame in a buffer pool extent +of innodb_buffer_pool_extent_size bytes */ +static constexpr size_t first_frame_in_extent[]= +{ + first_frame(4096), first_frame(8192), first_frame(16384), + first_frame(32768), first_frame(65536) +}; + +/** The position offset of the first page frame in a buffer pool extent +of innodb_buffer_pool_extent_size bytes */ +static constexpr uint8_t first_page_in_extent[]= +{ + first_page(4096), first_page(8192), first_page(16384), + first_page(32768), first_page(65536) +}; + +/** Number of pages per buffer pool extent +of innodb_buffer_pool_extent_size bytes */ +static constexpr size_t pages_in_extent[]= +{ + pages(4096), pages(8192), pages(16384), pages(32768), pages(65536) +}; + # ifdef SUX_LOCK_GENERIC void page_hash_latch::read_lock_wait() noexcept { @@ -326,8 +375,6 @@ /** The InnoDB buffer pool */ buf_pool_t buf_pool; -buf_pool_t::chunk_t::map 
*buf_pool_t::chunk_t::map_reg; -buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; #ifdef UNIV_DEBUG /** This is used to insert validation operations in execution @@ -511,16 +558,18 @@ } #ifndef UNIV_INNOCHECKSUM -/** Checks whether the lsn present in the page is lesser than the -peek current lsn. -@param check_lsn lsn to check +/** Check whether a page is newer than the durable LSN. +@param check_lsn whether to check the LSN @param read_buf page frame -@return whether the FIL_PAGE_LSN is invalid */ -static bool buf_page_check_lsn(bool check_lsn, const byte *read_buf) +@return whether the FIL_PAGE_LSN is invalid (ahead of the durable LSN) */ +static bool buf_page_check_lsn(bool check_lsn, const byte *read_buf) noexcept { if (!check_lsn) return false; - lsn_t current_lsn= log_sys.get_lsn(); + /* A page may not be read before it is written, and it may not be + written before the corresponding log has been durably written. + Hence, we refer to the current durable LSN here */ + lsn_t current_lsn= log_sys.get_flushed_lsn(std::memory_order_relaxed); if (UNIV_UNLIKELY(current_lsn == log_sys.FIRST_LSN) && srv_force_recovery == SRV_FORCE_NO_LOG_REDO) return false; @@ -797,6 +846,11 @@ bool setup() { + m_num_fds= 0; + + if (my_use_large_pages) + return false; + static_assert(array_elements(m_fds) == (array_elements(m_triggers) + 1), "insufficient fds"); std::string memcgroup{"/sys/fs/cgroup"}; @@ -809,7 +863,6 @@ cgroup.erase(0, 3); // Remove "0::" memcgroup+= cgroup + "/memory.pressure"; - m_num_fds= 0; for (auto trig= std::begin(m_triggers); trig!= std::end(m_triggers); ++trig) { if ((m_fds[m_num_fds].fd= @@ -958,29 +1011,121 @@ } /** Initialize mem pressure. 
*/ -ATTRIBUTE_COLD void buf_mem_pressure_detect_init() +ATTRIBUTE_COLD static void buf_mem_pressure_detect_init() noexcept { mem_pressure_obj.setup(); } -ATTRIBUTE_COLD void buf_mem_pressure_shutdown() +ATTRIBUTE_COLD void buf_mem_pressure_shutdown() noexcept { mem_pressure_obj.join(); } -#endif /* __linux__ */ +#endif + +#if defined __linux__ || !defined DBUG_OFF +inline void buf_pool_t::garbage_collect() noexcept +{ + mysql_mutex_lock(&mutex); + const size_t old_size{size_in_bytes}, min_size{size_in_bytes_auto_min}; + const size_t reduce_size= + std::max(innodb_buffer_pool_extent_size, + ut_calc_align((old_size - min_size) / 2, + innodb_buffer_pool_extent_size)); + if (old_size < min_size + reduce_size || + first_to_withdraw || old_size != size_in_bytes_requested) + { + mysql_mutex_unlock(&mutex); + sql_print_information("InnoDB: Memory pressure event disregarded;" + " innodb_buffer_pool_size=%zum," + " innodb_buffer_pool_size_min=%zum", + old_size >> 20, min_size >> 20); + return; + } + + size_t size= old_size - reduce_size; + size_t n_blocks_new= get_n_blocks(size); + + ut_ad(UT_LIST_GET_LEN(withdrawn) == 0); + ut_ad(n_blocks_to_withdraw == 0); + + n_blocks_to_withdraw= n_blocks - n_blocks_new; + first_to_withdraw= &get_nth_page(n_blocks_new)->page; + + size_in_bytes_requested= size; + mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&flush_list_mutex); + page_cleaner_wakeup(true); + my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + mysql_mutex_unlock(&flush_list_mutex); +# ifdef BTR_CUR_HASH_ADAPT + bool ahi_disabled= btr_search_disable(); +# endif /* BTR_CUR_HASH_ADAPT */ + time_t start= time(nullptr); + mysql_mutex_lock(&mutex); + + do + { + if (shrink(size)) + { + const size_t old_blocks{n_blocks}; + n_blocks= n_blocks_new; + + size_t s= n_blocks_new / BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(uint32(s)); + + os_total_large_mem_allocated-= reduce_size; + shrunk(size, reduce_size); + ibuf_max_size_update(srv_change_buffer_max_size); +# ifdef BTR_CUR_HASH_ADAPT + if (ahi_disabled) + btr_search_enable(true); +# endif + mysql_mutex_unlock(&mutex); + sql_print_information("InnoDB: Memory pressure event shrunk" + " innodb_buffer_pool_size=%zum (%zu pages)" + " from %zum (%zu pages)", + size >> 20, n_blocks_new, old_size >> 20, + old_blocks); + ut_d(validate()); + return; + } + } + while (time(nullptr) - start < 15); + + ut_ad(size_in_bytes > size_in_bytes_requested); + n_blocks_to_withdraw= 0; + first_to_withdraw= nullptr; + size_in_bytes_requested= size_in_bytes; + + while (buf_page_t *b= UT_LIST_GET_FIRST(withdrawn)) + { + UT_LIST_REMOVE(withdrawn, b); + UT_LIST_ADD_LAST(free, b); + ut_d(b->in_free_list= true); + ut_ad(b->state() == buf_page_t::NOT_USED); + b->lock.init(); + } + + mysql_mutex_unlock(&mutex); + sql_print_information("InnoDB: Memory pressure event failed to shrink" + " innodb_buffer_pool_size=%zum", old_size); + ut_d(validate()); +} +#endif #if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) -/** Enable buffers to be dumped to core files +/** Enable buffers to be dumped to core files. -A convience function, not called anyhwere directly however +A convenience function, not called anyhwere directly however it is left available for gdb or any debugger to call in the event that you want all of the memory to be dumped to a core file. -Returns number of errors found in madvise calls. 
*/ +@return number of errors found in madvise() calls */ MY_ATTRIBUTE((used)) -int -buf_madvise_do_dump() +int buf_pool_t::madvise_do_dump() noexcept { int ret= 0; @@ -991,20 +1136,13 @@ MADV_DODUMP); } - mysql_mutex_lock(&buf_pool.mutex); - auto chunk = buf_pool.chunks; - - for (ulint n = buf_pool.n_chunks; n--; chunk++) { - ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP); - } - - mysql_mutex_unlock(&buf_pool.mutex); + ret+= madvise(buf_pool.memory, buf_pool.size_in_bytes, MADV_DODUMP); return ret; } #endif #ifndef UNIV_DEBUG -static inline byte hex_to_ascii(byte hex_digit) +static inline byte hex_to_ascii(byte hex_digit) noexcept { const int offset= hex_digit <= 9 ? '0' : 'a' - 10; return byte(hex_digit + offset); @@ -1040,163 +1178,80 @@ #endif } -/** Initialize a buffer page descriptor. -@param[in,out] block buffer page descriptor -@param[in] frame buffer page frame */ -static -void -buf_block_init(buf_block_t* block, byte* frame) +IF_DBUG(,inline) byte *buf_block_t::frame_address() const noexcept { - /* This function should only be executed at database startup or by - buf_pool.resize(). Either way, adaptive hash index must not exist. 
*/ - assert_block_ahi_empty_on_init(block); - - block->page.frame = frame; + static_assert(ut_is_2pow(innodb_buffer_pool_extent_size), ""); - MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock); - ut_ad(!block->modify_clock); - MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock); - block->page.lock.init(); - block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL)); -#ifdef BTR_CUR_HASH_ADAPT - MEM_MAKE_DEFINED(&block->index, sizeof block->index); - ut_ad(!block->index); -#endif /* BTR_CUR_HASH_ADAPT */ - ut_d(block->in_unzip_LRU_list = false); - ut_d(block->in_withdraw_list = false); - - page_zip_des_init(&block->page.zip); - - MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash); - ut_ad(!block->page.hash); + byte *frame_= reinterpret_cast + ((reinterpret_cast(this) & ~(innodb_buffer_pool_extent_size - 1)) | + first_frame_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]); + ut_ad(reinterpret_cast(this) + sizeof(*this) <= frame_); + frame_+= + (((reinterpret_cast(this) & (innodb_buffer_pool_extent_size - 1)) / + sizeof(*this)) << srv_page_size_shift); + return frame_; } -/** Allocate a chunk of buffer frames. -@param bytes requested size -@return whether the allocation succeeded */ -inline bool buf_pool_t::chunk_t::create(size_t bytes) noexcept +buf_block_t *buf_pool_t::block_from(const void *ptr) noexcept { - DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;); - /* Round down to a multiple of page size, although it already should be. 
*/ - bytes= ut_2pow_round(bytes, srv_page_size); - - mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx); - - if (UNIV_UNLIKELY(!mem)) - return false; - - MEM_UNDEFINED(mem, mem_size()); - -#ifdef HAVE_LIBNUMA - if (srv_numa_interleave) - { - struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); - MEM_MAKE_DEFINED(numa_mems_allowed, sizeof *numa_mems_allowed); - if (mbind(mem, mem_size(), MPOL_INTERLEAVE, - numa_mems_allowed->maskp, numa_mems_allowed->size, - MPOL_MF_MOVE)) - { - ib::warn() << "Failed to set NUMA memory policy of" - " buffer pool page frames to MPOL_INTERLEAVE" - " (error: " << strerror(errno) << ")."; - } - numa_bitmask_free(numa_mems_allowed); - } -#endif /* HAVE_LIBNUMA */ - - - /* Allocate the block descriptors from - the start of the memory block. */ - blocks= reinterpret_cast(mem); - - /* Align a pointer to the first frame. Note that when - opt_large_page_size is smaller than srv_page_size, - (with max srv_page_size at 64k don't think any hardware - makes this true), - we may allocate one fewer block than requested. When - it is bigger, we may allocate more blocks than requested. */ - static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); - - byte *frame= reinterpret_cast((reinterpret_cast(mem) + - srv_page_size - 1) & - ~ulint{srv_page_size - 1}); - size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem); - - /* Subtract the space needed for block descriptors. */ - { - ulint s= size; - - while (frame < reinterpret_cast(blocks + s)) - { - frame+= srv_page_size; - s--; - } - - size= s; - } - - /* Init block structs and assign frames for them. Then we assign the - frames to the first blocks (we already mapped the memory above). 
*/ - - buf_block_t *block= blocks; + static_assert(ut_is_2pow(innodb_buffer_pool_extent_size), ""); + ut_ad(static_cast(ptr) >= buf_pool.memory); - for (auto i= size; i--; ) { - buf_block_init(block, frame); - MEM_UNDEFINED(block->page.frame, srv_page_size); - /* Add the block to the free list */ - UT_LIST_ADD_LAST(buf_pool.free, &block->page); + byte *first_block= reinterpret_cast + (reinterpret_cast(ptr) & ~(innodb_buffer_pool_extent_size - 1)); + const size_t first_frame= + first_frame_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; - ut_d(block->page.in_free_list = TRUE); - block++; - frame+= srv_page_size; - } - - reg(); - - return true; + ut_ad(static_cast(ptr) >= first_block + first_frame); + return reinterpret_cast(first_block) + + (((size_t(ptr) & (innodb_buffer_pool_extent_size - 1)) - first_frame) >> + srv_page_size_shift); } -#ifdef UNIV_DEBUG -/** Check that all file pages in the buffer chunk are in a replaceable state. -@return address of a non-free block -@retval nullptr if all freed */ -inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const noexcept +/** Determine the address of the first invalid block descriptor +@param n_blocks buf_pool.n_blocks +@return offset of the first invalid buf_block_t, relative to buf_pool.memory */ +static size_t block_descriptors_in_bytes(size_t n_blocks) noexcept { - buf_block_t *block= blocks; - for (auto i= size; i--; block++) - { - if (block->page.in_file()) - { - /* The uncompressed buffer pool should never - contain ROW_FORMAT=COMPRESSED block descriptors. */ - ut_ad(block->page.frame); - const lsn_t lsn= block->page.oldest_modification(); + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; + const size_t extent_size= pages_in_extent[ssize]; + return n_blocks / extent_size * innodb_buffer_pool_extent_size + + (n_blocks % extent_size) * sizeof(buf_block_t); +} - if (srv_read_only_mode) - { - /* The page cleaner is disabled in read-only mode. 
No pages - can be dirtied, so all of them must be clean. */ - ut_ad(lsn == 0 || lsn == recv_sys.lsn || - srv_force_recovery == SRV_FORCE_NO_LOG_REDO); - break; - } +buf_block_t *buf_pool_t::get_nth_page(size_t pos) const noexcept +{ + mysql_mutex_assert_owner(&mutex); + ut_ad(pos < n_blocks); + return reinterpret_cast + (memory + block_descriptors_in_bytes(pos)); +} - if (fsp_is_system_temporary(block->page.id().space())) - { - ut_ad(lsn == 0 || lsn == 2); - break; - } +buf_block_t *buf_pool_t::allocate() noexcept +{ + mysql_mutex_assert_owner(&mutex); - if (lsn > 1 || !block->page.can_relocate()) - return block; + while (buf_page_t *b= UT_LIST_GET_FIRST(free)) + { + ut_ad(b->in_free_list); + ut_d(b->in_free_list = FALSE); + ut_ad(!b->oldest_modification()); + ut_ad(!b->in_LRU_list); + ut_a(!b->in_file()); + UT_LIST_REMOVE(free, b); - break; + if (UNIV_LIKELY(!n_blocks_to_withdraw) || !withdraw(*b)) + { + /* No adaptive hash index entries may point to a free block. */ + assert_block_ahi_empty(reinterpret_cast(b)); + b->set_state(buf_page_t::MEMORY); + b->set_os_used(); + return reinterpret_cast(b); } } return nullptr; } -#endif /* UNIV_DEBUG */ /** Create the hash table. 
@param n the lower bound of n_cells */ @@ -1210,96 +1265,189 @@ array= static_cast(v); } +size_t buf_pool_t::get_n_blocks(size_t size_in_bytes) noexcept +{ + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; + size_t n_blocks_alloc= size_in_bytes / innodb_buffer_pool_extent_size * + pages_in_extent[ssize]; + + if (const size_t incomplete_extent_pages= + (size_in_bytes & (innodb_buffer_pool_extent_size - 1)) >> + srv_page_size_shift) + { + ssize_t d= incomplete_extent_pages - first_page_in_extent[ssize]; + ut_ad(d > 0); + n_blocks_alloc+= d; + } + + return n_blocks_alloc; +} + +size_t buf_pool_t::blocks_in_bytes(size_t n_blocks) noexcept +{ + const size_t shift{srv_page_size_shift}; + const size_t ssize{shift - UNIV_PAGE_SIZE_SHIFT_MIN}; + const size_t extent_size= pages_in_extent[ssize]; + size_t size_in_bytes= n_blocks / extent_size * + innodb_buffer_pool_extent_size; + if (size_t remainder= n_blocks % extent_size) + size_in_bytes+= (remainder + first_page_in_extent[ssize]) << shift; + ut_ad(get_n_blocks(size_in_bytes) == n_blocks); + return size_in_bytes; +} + /** Create the buffer pool. 
@return whether the creation failed */ -bool buf_pool_t::create() +bool buf_pool_t::create() noexcept { ut_ad(this == &buf_pool); - ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0); ut_ad(!is_initialised()); - ut_ad(srv_buf_pool_size > 0); - ut_ad(!resizing); - ut_ad(!chunks_old); + ut_ad(size_in_bytes_requested > 0); + ut_ad(!(size_in_bytes_max & (innodb_buffer_pool_extent_size - 1))); + ut_ad(!(size_in_bytes_requested & ((1U << 20) - 1))); + ut_ad(size_in_bytes_requested <= size_in_bytes_max); /* mariabackup loads tablespaces, and it requires field_ref_zero to be allocated before innodb initialization */ ut_ad(srv_operation >= SRV_OPERATION_RESTORE || !field_ref_zero); - NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; - - if (!field_ref_zero) { + if (!field_ref_zero) + { if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) + { field_ref_zero= static_cast (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); - else - return true; + goto init; + } + + oom: + ut_ad(!is_initialised()); + sql_print_error("InnoDB: Cannot map innodb_buffer_pool_size_max=%zum", + size_in_bytes_max >> 20); + return true; + } + + init: + DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", goto oom;); + size_t size= size_in_bytes_max; + sql_print_information("InnoDB: innodb_buffer_pool_size_max=%zum," + " innodb_buffer_pool_size=%zum", + size >> 20, size_in_bytes_requested >> 20); + + retry: + { + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; +#ifdef _WIN32 + memory_unaligned= my_virtual_mem_reserve(&size); +#else + memory_unaligned= my_large_virtual_alloc(&size); +#endif } - chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map()); + if (!memory_unaligned) + goto oom; - new(&allocator) ut_allocator(mem_key_buf_buf_pool); + const size_t alignment_waste= + ((~size_t(memory_unaligned) & (innodb_buffer_pool_extent_size - 1)) + 1) & + (innodb_buffer_pool_extent_size - 1); - n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; - const size_t chunk_size= srv_buf_pool_chunk_unit; + if (size < size_in_bytes_max + alignment_waste) + 
{ + my_virtual_mem_release(memory_unaligned, size); + size+= 1 + + (~size_t(memory_unaligned) & (innodb_buffer_pool_extent_size - 1)); + goto retry; + } - chunks= static_cast(ut_zalloc_nokey(n_chunks * sizeof *chunks)); - UT_LIST_INIT(free, &buf_page_t::list); - curr_size= 0; - auto chunk= chunks; + MEM_UNDEFINED(memory_unaligned, size); + ut_dontdump(memory_unaligned, size, true); + memory= memory_unaligned + alignment_waste; + size_unaligned= size; + size-= alignment_waste; + size&= ~(innodb_buffer_pool_extent_size - 1); - do + const size_t actual_size= size_in_bytes_requested; + ut_ad(actual_size <= size); + + size_in_bytes= actual_size; + os_total_large_mem_allocated+= actual_size; + +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_alloc)(mem_key_buf_buf_pool, actual_size, &owner); +#endif +#ifdef _WIN32 + if (!my_virtual_mem_commit(memory, actual_size)) { - if (!chunk->create(chunk_size)) - { - while (--chunk >= chunks) - { - buf_block_t* block= chunk->blocks; + my_virtual_mem_release(memory_unaligned, size_unaligned); + memory= nullptr; + memory_unaligned= nullptr; + goto oom; + } +#else + update_malloc_size(actual_size, 0); +#endif - for (auto i= chunk->size; i--; block++) - block->page.lock.free(); +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) + { + struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + MEM_MAKE_DEFINED(numa_mems_allowed, sizeof *numa_mems_allowed); + if (mbind(memory_unaligned, size_unaligned, MPOL_INTERLEAVE, + numa_mems_allowed->maskp, numa_mems_allowed->size, + MPOL_MF_MOVE)) + sql_print_warning("InnoDB: Failed to set NUMA memory policy of" + " buffer pool page frames to MPOL_INTERLEAVE" + " (error: %s).", strerror(errno)); + numa_bitmask_free(numa_mems_allowed); + } +#endif /* HAVE_LIBNUMA */ - allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); - } - ut_free(chunks); - chunks= nullptr; - UT_DELETE(chunk_t::map_reg); - chunk_t::map_reg= nullptr; - aligned_free(const_cast(field_ref_zero)); - field_ref_zero= nullptr; 
- ut_ad(!is_initialised()); - return true; - } + n_blocks= get_n_blocks(actual_size); + n_blocks_to_withdraw= 0; + UT_LIST_INIT(free, &buf_page_t::list); + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; - curr_size+= chunk->size; + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + { + buf_block_t *block= reinterpret_cast(extent); + const buf_block_t *extent_end= block + pages_in_extent[ssize]; + if (reinterpret_cast(extent_end) > end) + extent_end= reinterpret_cast(end); + MEM_MAKE_DEFINED(block, (extent_end - block) * sizeof *block); + for (byte *frame= reinterpret_cast(extent) + + first_frame_in_extent[ssize]; + block < extent_end; block++, frame+= srv_page_size) + { + ut_ad(!memcmp(block, field_ref_zero, sizeof *block)); + block->page.frame= frame; + block->page.lock.init(); + UT_LIST_ADD_LAST(free, &block->page); + ut_d(block->page.in_free_list= true); + } } - while (++chunk < chunks + n_chunks); - ut_ad(is_initialised()); #if defined(__aarch64__) mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); #else mysql_mutex_init(buf_pool_mutex_key, &mutex, nullptr); #endif + UT_LIST_INIT(withdrawn, &buf_page_t::list); UT_LIST_INIT(LRU, &buf_page_t::LRU); - UT_LIST_INIT(withdraw, &buf_page_t::list); - withdraw_target= 0; UT_LIST_INIT(flush_list, &buf_page_t::list); UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); - ulint s= curr_size; + ulint s= n_blocks; s/= BUF_READ_AHEAD_PORTION; read_ahead_area= s >= READ_AHEAD_PAGES ? 
READ_AHEAD_PAGES : my_round_up_to_next_power(static_cast(s)); - curr_pool_size= srv_buf_pool_size; - n_chunks_new= n_chunks; - - page_hash.create(2 * curr_size); - zip_hash.create(2 * curr_size); - last_printout_time= time(NULL); + page_hash.create(2 * n_blocks); + last_printout_time= time(nullptr); mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex, MY_MUTEX_INIT_FAST); @@ -1318,14 +1466,8 @@ io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * OS_AIO_N_PENDING_IOS_PER_THREAD); - /* FIXME: remove some of these variables */ - srv_buf_pool_curr_size= curr_pool_size; - srv_buf_pool_old_size= srv_buf_pool_size; - srv_buf_pool_base_size= srv_buf_pool_size; - last_activity_count= srv_get_activity_count(); - chunk_t::map_ref= chunk_t::map_reg; buf_LRU_old_ratio_update(100 * 3 / 8, false); btr_search_sys_create(); @@ -1334,6 +1476,7 @@ buf_mem_pressure_detect_init(); #endif ut_ad(is_initialised()); + sql_print_information("InnoDB: Completed initialization of buffer pool"); return false; } @@ -1368,14 +1511,31 @@ } } - for (auto chunk= chunks + n_chunks; --chunk >= chunks; ) { - buf_block_t *block= chunk->blocks; + const size_t size{size_in_bytes}; - for (auto i= chunk->size; i--; block++) - block->page.lock.free(); + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + for (buf_block_t *block= reinterpret_cast(extent), + *extent_end= block + + pages_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; + block < extent_end && reinterpret_cast(block) < end; block++) + { + MEM_MAKE_DEFINED(&block->page.lock, sizeof &block->page.lock); + block->page.lock.free(); + } - allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + ut_dodump(memory_unaligned, size_unaligned); +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_free)(mem_key_buf_buf_pool, size, owner); + owner= nullptr; +#endif + os_total_large_mem_allocated-= size; + my_virtual_mem_decommit(memory, 
size); + my_virtual_mem_release(memory_unaligned, size_unaligned); + memory= nullptr; + memory_unaligned= nullptr; } pthread_cond_destroy(&done_flush_LRU); @@ -1383,137 +1543,13 @@ pthread_cond_destroy(&do_flush_list); pthread_cond_destroy(&done_free); - ut_free(chunks); - chunks= nullptr; page_hash.free(); - zip_hash.free(); io_buf.close(); - UT_DELETE(chunk_t::map_reg); - chunk_t::map_reg= chunk_t::map_ref= nullptr; aligned_free(const_cast(field_ref_zero)); field_ref_zero= nullptr; } -/** Try to reallocate a control block. -@param block control block to reallocate -@return whether the reallocation succeeded */ -inline bool buf_pool_t::realloc(buf_block_t *block) noexcept -{ - buf_block_t* new_block; - - mysql_mutex_assert_owner(&mutex); - ut_ad(block->page.in_file()); - ut_ad(block->page.frame); - - new_block = buf_LRU_get_free_only(); - - if (new_block == NULL) { - mysql_mutex_lock(&buf_pool.flush_list_mutex); - page_cleaner_wakeup(); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - return(false); /* free list was not enough */ - } - - const page_id_t id{block->page.id()}; - hash_chain& chain = page_hash.cell_get(id.fold()); - page_hash_latch& hash_lock = page_hash.lock_get(chain); - /* It does not make sense to use transactional_lock_guard - here, because copying innodb_page_size (4096 to 65536) bytes - as well as other changes would likely make the memory - transaction too large. 
*/ - hash_lock.lock(); - - if (block->page.can_relocate()) { - memcpy_aligned( - new_block->page.frame, block->page.frame, - srv_page_size); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const auto frame = new_block->page.frame; - new_block->page.lock.free(); - new (&new_block->page) buf_page_t(block->page); - new_block->page.frame = frame; - - /* relocate LRU list */ - if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) { - UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page); - } else { - UT_LIST_ADD_FIRST(LRU, &new_block->page); - } - - if (LRU_old == &block->page) { - LRU_old = &new_block->page; - } - - ut_ad(new_block->page.in_LRU_list); - - /* relocate unzip_LRU list */ - if (block->page.zip.data != NULL) { - ut_ad(block->in_unzip_LRU_list); - ut_d(new_block->in_unzip_LRU_list = true); - - buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); - UT_LIST_REMOVE(unzip_LRU, block); - - ut_d(block->in_unzip_LRU_list = false); - block->page.zip.data = NULL; - page_zip_set_size(&block->page.zip, 0); - - if (prev_block != NULL) { - UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block); - } else { - UT_LIST_ADD_FIRST(unzip_LRU, new_block); - } - } else { - ut_ad(!block->in_unzip_LRU_list); - ut_d(new_block->in_unzip_LRU_list = false); - } - - /* relocate page_hash */ - hash_chain& chain = page_hash.cell_get(id.fold()); - ut_ad(&block->page == page_hash.get(id, chain)); - buf_pool.page_hash.replace(chain, &block->page, - &new_block->page); - buf_block_modify_clock_inc(block); - static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); - memset_aligned<4>(block->page.frame - + FIL_PAGE_OFFSET, 0xff, 4); - static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, - "not perfect alignment"); - memset_aligned<2>(block->page.frame - + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); - MEM_UNDEFINED(block->page.frame, srv_page_size); - block->page.set_state(buf_page_t::REMOVE_HASH); - if (!fsp_is_system_temporary(id.space())) { - 
buf_flush_relocate_on_flush_list(&block->page, - &new_block->page); - } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - block->page.set_corrupt_id(); - - /* set other flags of buf_block_t */ - -#ifdef BTR_CUR_HASH_ADAPT - /* This code should only be executed by resize(), - while the adaptive hash index is disabled. */ - assert_block_ahi_empty(block); - assert_block_ahi_empty_on_init(new_block); - ut_ad(!block->index); - new_block->index = NULL; - new_block->n_hash_helps = 0; - new_block->n_fields = 1; - new_block->left_side = TRUE; -#endif /* BTR_CUR_HASH_ADAPT */ - ut_d(block->page.set_state(buf_page_t::MEMORY)); - /* free block */ - new_block = block; - } - - hash_lock.unlock(); - buf_LRU_block_free_non_file_page(new_block); - return(true); /* free_list was enough */ -} - void buf_pool_t::io_buf_t::create(ulint n_slots) noexcept { this->n_slots= n_slots; @@ -1552,720 +1588,528 @@ } } -/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status -to the specified string. The format and the following parameters are the -same as the ones used for printf(3). -@param[in] fmt format -@param[in] ... extra parameters according to fmt */ -static -void -buf_resize_status( - const char* fmt, - ...) +ATTRIBUTE_COLD bool buf_pool_t::withdraw(buf_page_t &bpage) noexcept { - va_list ap; - - va_start(ap, fmt); - - vsnprintf( - export_vars.innodb_buffer_pool_resize_status, - sizeof(export_vars.innodb_buffer_pool_resize_status), - fmt, ap); - - va_end(ap); - - ib::info() << export_vars.innodb_buffer_pool_resize_status; + mysql_mutex_assert_owner(&mutex); + ut_ad(n_blocks_to_withdraw); + ut_ad(first_to_withdraw); + ut_ad(!bpage.zip.data); + if (&bpage < first_to_withdraw) + return false; + n_blocks_to_withdraw--; + bpage.lock.free(); + UT_LIST_ADD_LAST(withdrawn, &bpage); + return true; } -/** Withdraw blocks from the buffer pool until meeting withdraw_target. 
-@return whether retry is needed */ -inline bool buf_pool_t::withdraw_blocks() noexcept +ATTRIBUTE_COLD buf_pool_t::shrink_status buf_pool_t::shrink(size_t size) + noexcept { - buf_block_t* block; - ulint loop_count = 0; - - ib::info() << "Start to withdraw the last " - << withdraw_target << " blocks."; - - while (UT_LIST_GET_LEN(withdraw) < withdraw_target) { - - /* try to withdraw from free_list */ - ulint count1 = 0; + mysql_mutex_assert_owner(&mutex); + buf_load_abort(); - mysql_mutex_lock(&mutex); - buf_buddy_condense_free(); - block = reinterpret_cast( - UT_LIST_GET_FIRST(free)); - while (block != NULL - && UT_LIST_GET_LEN(withdraw) < withdraw_target) { - ut_ad(block->page.in_free_list); - ut_ad(!block->page.oldest_modification()); - ut_ad(!block->page.in_LRU_list); - ut_a(!block->page.in_file()); - - buf_block_t* next_block; - next_block = reinterpret_cast( - UT_LIST_GET_NEXT( - list, &block->page)); - - if (will_be_withdrawn(block->page)) { - /* This should be withdrawn */ - UT_LIST_REMOVE(free, &block->page); - UT_LIST_ADD_LAST(withdraw, &block->page); - ut_d(block->in_withdraw_list = true); - count1++; - } - - block = next_block; - } - - /* reserve free_list length */ - if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { - try_LRU_scan = false; - mysql_mutex_unlock(&mutex); - mysql_mutex_lock(&flush_list_mutex); - page_cleaner_wakeup(true); - my_cond_wait(&done_flush_list, - &flush_list_mutex.m_mutex); - mysql_mutex_unlock(&flush_list_mutex); - mysql_mutex_lock(&mutex); - } - - /* relocate blocks/buddies in withdrawn area */ - ulint count2 = 0; - - buf_pool_mutex_exit_forbid(); - for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; - bpage; bpage = next_bpage) { - ut_ad(bpage->in_file()); - next_bpage = UT_LIST_GET_NEXT(LRU, bpage); - if (UNIV_LIKELY_NULL(bpage->zip.data) - && will_be_withdrawn(bpage->zip.data) - && bpage->can_relocate()) { - if (!buf_buddy_realloc( - bpage->zip.data, - page_zip_get_size(&bpage->zip))) { - /* failed to allocate 
block */ - break; - } - count2++; - if (bpage->frame) { - goto realloc_frame; - } - } - - if (bpage->frame && will_be_withdrawn(*bpage) - && bpage->can_relocate()) { -realloc_frame: - if (!realloc(reinterpret_cast( - bpage))) { - /* failed to allocate block */ - break; - } - count2++; - } - } - buf_pool_mutex_exit_allow(); - mysql_mutex_unlock(&mutex); - - buf_resize_status( - "Withdrawing blocks. (" ULINTPF "/" ULINTPF ").", - UT_LIST_GET_LEN(withdraw), - withdraw_target); - - ib::info() << "Withdrew " - << count1 << " blocks from free list." - << " Tried to relocate " << count2 << " blocks (" - << UT_LIST_GET_LEN(withdraw) << "/" - << withdraw_target << ")."; - - if (++loop_count >= 10) { - /* give up for now. - retried after user threads paused. */ - - ib::info() << "will retry to withdraw later"; - - /* need retry later */ - return(true); - } - } - - /* confirm withdrawn enough */ - for (const chunk_t* chunk = chunks + n_chunks_new, - * const echunk = chunks + n_chunks; chunk != echunk; chunk++) { - block = chunk->blocks; - for (ulint j = chunk->size; j--; block++) { - ut_a(block->page.state() == buf_page_t::NOT_USED); - ut_ad(block->in_withdraw_list); - } - } - - ib::info() << "Withdrawn target: " << UT_LIST_GET_LEN(withdraw) - << " blocks."; - - return(false); -} - - - -inline void buf_pool_t::page_hash_table::write_lock_all() noexcept -{ - for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + if (!n_blocks_to_withdraw) { - reinterpret_cast(array[n]).lock(); - if (!n) - break; + withdraw_done: + first_to_withdraw= nullptr; + while (buf_page_t *b= UT_LIST_GET_FIRST(withdrawn)) + { + UT_LIST_REMOVE(withdrawn, b); + /* satisfy the check in lazy_allocate() */ + ut_d(memset((void*) b, 0, sizeof(buf_block_t))); + } + return SHRINK_DONE; } -} + buf_buddy_condense_free(size); -inline void buf_pool_t::page_hash_table::write_unlock_all() noexcept -{ - for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + for 
(buf_page_t *b= UT_LIST_GET_FIRST(free), *next; b; b= next) { - reinterpret_cast(array[n]).unlock(); - if (!n) - break; - } -} + ut_ad(b->in_free_list); + ut_ad(!b->in_LRU_list); + ut_ad(!b->zip.data); + ut_ad(!b->oldest_modification()); + ut_a(b->state() == buf_page_t::NOT_USED); + next= UT_LIST_GET_NEXT(list, b); -namespace -{ - -struct find_interesting_trx -{ - void operator()(const trx_t &trx) - { - if (!trx.is_started()) - return; - if (trx.mysql_thd == nullptr) - return; - if (withdraw_started <= trx.start_time_micro) - return; - - if (!found) + if (b >= first_to_withdraw) { - sql_print_warning("InnoDB: The following trx might hold " - "the blocks in buffer pool to " - "be withdrawn. Buffer pool " - "resizing can complete only " - "after all the transactions " - "below release the blocks."); - found= true; + UT_LIST_REMOVE(free, b); + b->lock.free(); + UT_LIST_ADD_LAST(withdrawn, b); + if (!--n_blocks_to_withdraw) + goto withdraw_done; } - - lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time); } - bool &found; - /** microsecond_interval_timer() */ - const ulonglong withdraw_started; - const my_hrtime_t current_time; -}; - -} // namespace - -/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. 
*/ -inline void buf_pool_t::resize() -{ - ut_ad(this == &buf_pool); - ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); - - bool warning = false; - - NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; - - ut_ad(!resize_in_progress()); - ut_ad(srv_buf_pool_chunk_unit > 0); - - ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift; - std::ostringstream str_old_size, str_new_size, str_chunk_size; - str_old_size << ib::bytes_iec{srv_buf_pool_old_size}; - str_new_size << ib::bytes_iec{srv_buf_pool_size}; - str_chunk_size << ib::bytes_iec{srv_buf_pool_chunk_unit}; + buf_block_t *block= allocate(); + size_t scanned= 0; + for (buf_page_t *b= lru_scan_itr.start(), *prev; block && b; b= prev) + { + ut_ad(b->in_LRU_list); + ut_a(b->in_file()); - buf_resize_status("Resizing buffer pool from %s to %s (unit = %s).", - str_old_size.str().c_str(), - str_new_size.str().c_str(), - str_chunk_size.str().c_str()); + prev= UT_LIST_GET_PREV(LRU, b); -#ifdef BTR_CUR_HASH_ADAPT - /* disable AHI if needed */ - buf_resize_status("Disabling adaptive hash index."); + if (!b->can_relocate()) + { + next: + if (++scanned & 31) + continue; + /* Avoid starvation by periodically releasing buf_pool.mutex. 
*/ + lru_scan_itr.set(prev); + mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&mutex); + prev= lru_scan_itr.get(); + continue; + } - btr_search_s_lock_all(); - const bool btr_search_disabled = btr_search_enabled; - btr_search_s_unlock_all(); + const page_id_t id{b->id()}; + hash_chain &chain= page_hash.cell_get(id.fold()); + page_hash_latch &hash_lock= page_hash.lock_get(chain); + hash_lock.lock(); - btr_search_disable(); + { + /* relocate flush_list and b->page.zip */ + bool have_flush_list_mutex= false; - if (btr_search_disabled) { - ib::info() << "disabled adaptive hash index."; - } -#endif /* BTR_CUR_HASH_ADAPT */ + switch (b->oldest_modification()) { + case 2: + ut_ad(fsp_is_system_temporary(id.space())); + /* fall through */ + case 0: + break; + default: + mysql_mutex_lock(&flush_list_mutex); + switch (ut_d(lsn_t om=) b->oldest_modification()) { + case 1: + delete_from_flush_list(b); + /* fall through */ + case 0: + mysql_mutex_unlock(&flush_list_mutex); + break; + default: + ut_ad(om != 2); + have_flush_list_mutex= true; + } + } - mysql_mutex_lock(&mutex); - ut_ad(n_chunks_new == n_chunks); - ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + if (!b->can_relocate()) + { + next_quick: + if (have_flush_list_mutex) + mysql_mutex_unlock(&flush_list_mutex); + hash_lock.unlock(); + continue; + } - n_chunks_new = (new_instance_size << srv_page_size_shift) - / srv_buf_pool_chunk_unit; - curr_size = n_chunks_new * chunks->size; - mysql_mutex_unlock(&mutex); + if (UNIV_UNLIKELY(will_be_withdrawn(b->zip.data, size))) + { + block= buf_buddy_shrink(b, block); + ut_ad(mach_read_from_4(b->zip.data + FIL_PAGE_OFFSET) == id.page_no()); + if (UNIV_UNLIKELY(!n_blocks_to_withdraw)) + { + if (have_flush_list_mutex) + mysql_mutex_unlock(&flush_list_mutex); + hash_lock.unlock(); + if (block) + buf_LRU_block_free_non_file_page(block); + goto withdraw_done; + } + if (!block && !(block= allocate())) + goto next_quick; + } - if (is_shrinking()) { - /* set withdraw target */ - size_t w = 0; + if 
(!b->frame || b < first_to_withdraw) + goto next_quick; - for (const chunk_t* chunk = chunks + n_chunks_new, - * const echunk = chunks + n_chunks; - chunk != echunk; chunk++) - w += chunk->size; + ut_ad(is_uncompressed_current(b)); - ut_ad(withdraw_target == 0); - withdraw_target = w; - } + byte *const frame= block->page.frame; + memcpy_aligned<4096>(frame, b->frame, srv_page_size); + b->lock.free(); + block->page.lock.free(); + new(&block->page) buf_page_t(*b); + block->page.frame= frame; - buf_resize_status("Withdrawing blocks to be shrunken."); + if (have_flush_list_mutex) + { + buf_flush_relocate_on_flush_list(b, &block->page); + mysql_mutex_unlock(&flush_list_mutex); + } + } - ulonglong withdraw_started = microsecond_interval_timer(); - ulonglong message_interval = 60ULL * 1000 * 1000; - ulint retry_interval = 1; + /* relocate LRU list */ + if (buf_page_t *prev_b= LRU_remove(b)) + UT_LIST_INSERT_AFTER(LRU, prev_b, &block->page); + else + UT_LIST_ADD_FIRST(LRU, &block->page); -withdraw_retry: - /* wait for the number of blocks fit to the new size (if needed)*/ - bool should_retry_withdraw = is_shrinking() - && withdraw_blocks(); + if (LRU_old == b) + LRU_old= &block->page; - if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { - /* abort to resize for shutdown. 
*/ - return; - } + ut_ad(block->page.in_LRU_list); - /* abort buffer pool load */ - buf_load_abort(); + /* relocate page_hash */ + ut_ad(b == page_hash.get(id, chain)); + page_hash.replace(chain, b, &block->page); - const ulonglong current_time = microsecond_interval_timer(); + if (b->zip.data) + { + ut_ad(mach_read_from_4(b->zip.data + FIL_PAGE_OFFSET) == id.page_no()); + b->zip.data= nullptr; + /* relocate unzip_LRU list */ + buf_block_t *old_block= reinterpret_cast(b); + ut_ad(old_block->in_unzip_LRU_list); + ut_d(old_block->in_unzip_LRU_list= false); + ut_d(block->in_unzip_LRU_list= true); - if (should_retry_withdraw - && current_time - withdraw_started >= message_interval) { + buf_block_t *prev= UT_LIST_GET_PREV(unzip_LRU, old_block); + UT_LIST_REMOVE(unzip_LRU, old_block); - if (message_interval > 900000000) { - message_interval = 1800000000; - } else { - message_interval *= 2; - } + if (prev) + UT_LIST_INSERT_AFTER(unzip_LRU, prev, block); + else + UT_LIST_ADD_FIRST(unzip_LRU, block); + } - bool found= false; - find_interesting_trx f - {found, withdraw_started, my_hrtime_coarse()}; - withdraw_started = current_time; - - /* This is going to exceed the maximum size of a - memory transaction. 
*/ - LockMutexGuard g{SRW_LOCK_CALL}; - trx_sys.trx_list.for_each(f); - } - - if (should_retry_withdraw) { - ib::info() << "Will retry to withdraw " << retry_interval - << " seconds later."; - std::this_thread::sleep_for( - std::chrono::seconds(retry_interval)); + buf_block_modify_clock_inc(block); - if (retry_interval > 5) { - retry_interval = 10; - } else { - retry_interval *= 2; - } +#ifdef BTR_CUR_HASH_ADAPT + assert_block_ahi_empty_on_init(block); + block->index= nullptr; + block->n_hash_helps= 0; + block->n_fields= 1; + block->left_side= true; +#endif /* BTR_CUR_HASH_ADAPT */ + hash_lock.unlock(); - goto withdraw_retry; - } + ut_d(b->in_LRU_list= false); - buf_resize_status("Latching entire buffer pool."); + b->set_state(buf_page_t::NOT_USED); + UT_LIST_ADD_LAST(withdrawn, b); + if (!--n_blocks_to_withdraw) + goto withdraw_done; -#ifndef DBUG_OFF - { - bool should_wait = true; + block= allocate(); + goto next; + } - while (should_wait) { - should_wait = false; - DBUG_EXECUTE_IF( - "ib_buf_pool_resize_wait_before_resize", - should_wait = true; - std::this_thread::sleep_for( - std::chrono::milliseconds(10));); - } - } -#endif /* !DBUG_OFF */ + if (UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < usable_size() / 20) + return SHRINK_ABORT; - if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { - return; - } + mysql_mutex_lock(&flush_list_mutex); - /* Indicate critical path */ - resizing.store(true, std::memory_order_relaxed); + if (LRU_warned && !UT_LIST_GET_FIRST(free)) + { + LRU_warned_clear(); + mysql_mutex_unlock(&flush_list_mutex); + return SHRINK_ABORT; + } - mysql_mutex_lock(&mutex); - page_hash.write_lock_all(); + try_LRU_scan= false; + mysql_mutex_unlock(&mutex); + page_cleaner_wakeup(true); + my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + mysql_mutex_unlock(&flush_list_mutex); + mysql_mutex_lock(&mutex); - chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); + if (!n_blocks_to_withdraw) + goto withdraw_done; - /* add/delete chunks */ + return 
SHRINK_IN_PROGRESS; +} - buf_resize_status("Resizing buffer pool from " - ULINTPF " chunks to " ULINTPF " chunks.", - n_chunks, n_chunks_new); - - if (is_shrinking()) { - /* delete chunks */ - chunk_t* chunk = chunks + n_chunks_new; - const chunk_t* const echunk = chunks + n_chunks; - - ulint sum_freed = 0; - - while (chunk < echunk) { - /* buf_LRU_block_free_non_file_page() invokes - MEM_NOACCESS() on any buf_pool.free blocks. - We must cancel the effect of that. In - MemorySanitizer, MEM_NOACCESS() is no-op, so - we must not do anything special for it here. */ -#ifdef HAVE_valgrind -# if !__has_feature(memory_sanitizer) - MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size()); +inline void buf_pool_t::shrunk(size_t size, size_t reduced) noexcept +{ + ut_ad(size + reduced == size_in_bytes); + size_in_bytes_requested= size; + size_in_bytes= size; +# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* Only page_guess() may read this memory, which after + my_virtual_mem_decommit() may be zeroed out or preserve its original + contents. Try to catch any unintended reads outside page_guess(). */ + MEM_UNDEFINED(memory + size, size_in_bytes_max - size); +# else + for (size_t n= page_hash.pad(page_hash.n_cells), i= 0; i < n; + i+= page_hash.ELEMENTS_PER_LATCH + 1) + { + auto &latch= reinterpret_cast(page_hash.array[i]); + latch.lock(); + /* We already shrunk size_in_bytes. The exclusive lock here + ensures that any page_guess() will detect an out-of-bounds + guess before we invoke my_virtual_mem_decommit() below. 
*/ + latch.unlock(); + } # endif -#else - MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size); + my_virtual_mem_decommit(memory + size, reduced); +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_free)(mem_key_buf_buf_pool, reduced, owner); #endif +} - buf_block_t* block = chunk->blocks; - - for (ulint j = chunk->size; j--; block++) { - block->page.lock.free(); - } - - allocator.deallocate_large_dodump( - chunk->mem, &chunk->mem_pfx); - sum_freed += chunk->size; - ++chunk; - } - - /* discard withdraw list */ - UT_LIST_INIT(withdraw, &buf_page_t::list); - withdraw_target = 0; - - ib::info() << n_chunks - n_chunks_new - << " Chunks (" << sum_freed - << " blocks) were freed."; - - n_chunks = n_chunks_new; - } - - { - /* reallocate chunks */ - const size_t new_chunks_size - = n_chunks_new * sizeof(chunk_t); - - chunk_t* new_chunks = static_cast( - ut_zalloc_nokey_nofatal(new_chunks_size)); - - DBUG_EXECUTE_IF("buf_pool_resize_chunk_null", - ut_free(new_chunks); new_chunks= nullptr; ); - - if (!new_chunks) { - ib::error() << "failed to allocate" - " the chunk array."; - n_chunks_new = n_chunks; - warning = true; - chunks_old = NULL; - goto calc_buf_pool_size; - } - - ulint n_chunks_copy = ut_min(n_chunks_new, n_chunks); - - memcpy(new_chunks, chunks, - n_chunks_copy * sizeof *new_chunks); +ATTRIBUTE_COLD void buf_pool_t::resize(size_t size, THD *thd) noexcept +{ + ut_ad(this == &buf_pool); + mysql_mutex_assert_owner(&LOCK_global_system_variables); + ut_ad(size <= size_in_bytes_max); + if (my_use_large_pages) + { + my_error(ER_VARIABLE_IS_READONLY, MYF(0), "InnoDB", + "innodb_buffer_pool_size", "large_pages=0"); + return; + } - for (ulint j = 0; j < n_chunks_copy; j++) { - new_chunks[j].reg(); - } + size_t n_blocks_new= get_n_blocks(size); - chunks_old = chunks; - chunks = new_chunks; - } + mysql_mutex_lock(&mutex); - if (n_chunks_new > n_chunks) { - /* add chunks */ - ulint sum_added = 0; - ulint n = n_chunks; - const size_t unit = srv_buf_pool_chunk_unit; - - for (chunk_t* 
chunk = chunks + n_chunks, - * const echunk = chunks + n_chunks_new; - chunk != echunk; chunk++) { - if (!chunk->create(unit)) { - ib::error() << "failed to allocate" - " memory for buffer pool chunk"; + const size_t old_size= size_in_bytes; + if (first_to_withdraw || old_size != size_in_bytes_requested) + { + mysql_mutex_unlock(&mutex); + my_printf_error(ER_WRONG_USAGE, + "innodb_buffer_pool_size change is already in progress", + MYF(0)); + return; + } - warning = true; - n_chunks_new = n_chunks; - break; - } + ut_ad(UT_LIST_GET_LEN(withdrawn) == 0); + ut_ad(n_blocks_to_withdraw == 0); +#ifdef __linux__ + DBUG_EXECUTE_IF("trigger_garbage_collection", + mem_pressure_obj.trigger_collection();); +#endif - sum_added += chunk->size; - ++n; - } + if (size == old_size) + { + mysql_mutex_unlock(&mutex); + DBUG_EXECUTE_IF("trigger_garbage_collection", + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + garbage_collect();); + return; + } - ib::info() << n_chunks_new - n_chunks - << " chunks (" << sum_added - << " blocks) were added."; - - n_chunks = n; - } -calc_buf_pool_size: - /* recalc curr_size */ - ulint new_size = 0; +#ifdef BTR_CUR_HASH_ADAPT + bool ahi_disabled= false; +#endif - { - chunk_t* chunk = chunks; - const chunk_t* const echunk = chunk + n_chunks; - do { - new_size += chunk->size; - } while (++chunk != echunk); - } + const bool significant_change= + n_blocks_new > n_blocks * 2 || n_blocks > n_blocks_new * 2; + const ssize_t n_blocks_removed= n_blocks - n_blocks_new; - curr_size = new_size; - n_chunks_new = n_chunks; + if (n_blocks_removed <= 0) + { + if (!my_virtual_mem_commit(memory + old_size, size - old_size)) + { + mysql_mutex_unlock(&mutex); + sql_print_error("InnoDB: Cannot commit innodb_buffer_pool_size=%zum;" + " retaining innodb_buffer_pool_size=%zum", + size >> 20, old_size >> 20); + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + return; + } - if (chunks_old) { - ut_free(chunks_old); - chunks_old = NULL; - } + size_in_bytes_requested= size; 
+ size_in_bytes= size; - chunk_t::map* chunk_map_old = chunk_t::map_ref; - chunk_t::map_ref = chunk_t::map_reg; + { + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; + const size_t pages= pages_in_extent[ssize]; + const size_t first_extent= n_blocks / pages; - /* set size */ - ut_ad(UT_LIST_GET_LEN(withdraw) == 0); - ulint s= curr_size; - s/= BUF_READ_AHEAD_PORTION; - read_ahead_area= s >= READ_AHEAD_PAGES - ? READ_AHEAD_PAGES - : my_round_up_to_next_power(static_cast(s)); - curr_pool_size= n_chunks * srv_buf_pool_chunk_unit; - srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/ - extern ulonglong innobase_buffer_pool_size; - innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size); - - const bool new_size_too_diff - = srv_buf_pool_base_size > srv_buf_pool_size * 2 - || srv_buf_pool_base_size * 2 < srv_buf_pool_size; + char *extent= memory + first_extent * innodb_buffer_pool_extent_size; - mysql_mutex_unlock(&mutex); - page_hash.write_unlock_all(); + buf_block_t *block= reinterpret_cast(extent); + if (const size_t first_blocks= n_blocks % pages) + { + /* Extend the last (partial) extent until its end */ + const buf_block_t *extent_end= block + + (first_extent == (n_blocks_new / pages) + ? 
(n_blocks_new % pages) + : pages); + block+= first_blocks; + memset((void*) block, 0, (extent_end - block) * sizeof *block); + + for (byte *frame= reinterpret_cast(extent) + + first_frame_in_extent[ssize] + + (first_blocks << srv_page_size_shift); block < extent_end; + block++, frame+= srv_page_size) + { + block->page.frame= frame; + block->page.lock.init(); + UT_LIST_ADD_LAST(free, &block->page); + ut_d(block->page.in_free_list= true); + } + extent+= innodb_buffer_pool_extent_size; + } - UT_DELETE(chunk_map_old); + /* Fill in further extents; @see buf_pool_t::create() */ + for (const char *const end_new= memory + + block_descriptors_in_bytes(n_blocks_new); + extent < end_new; extent+= innodb_buffer_pool_extent_size) + { + block= reinterpret_cast(extent); + const buf_block_t *extent_end= block + pages; + if (reinterpret_cast(extent_end) > end_new) + extent_end= reinterpret_cast(end_new); + + memset((void*) block, 0, (extent_end - block) * sizeof *block); + for (byte *frame= reinterpret_cast(extent) + + first_frame_in_extent[ssize]; + block < extent_end; block++, frame+= srv_page_size) + { + block->page.frame= frame; + block->page.lock.init(); + UT_LIST_ADD_LAST(free, &block->page); + ut_d(block->page.in_free_list= true); + } + } + } - resizing.store(false, std::memory_order_relaxed); + mysql_mutex_unlock(&LOCK_global_system_variables); + resized: + ut_ad(UT_LIST_GET_LEN(withdrawn) == 0); + ut_ad(n_blocks_to_withdraw == 0); + ut_ad(!first_to_withdraw); + const size_t old_blocks{n_blocks}; + n_blocks= n_blocks_new; + + size_t s= n_blocks_new / BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(uint32(s)); - /* Normalize other components, if the new size is too different */ - if (!warning && new_size_too_diff) { - srv_buf_pool_base_size = srv_buf_pool_size; + if (ssize_t d= size - old_size) + { + os_total_large_mem_allocated+= d; + if (d > 0) + { + /* Already committed memory earlier */ + ut_ad(n_blocks_removed <= 0); +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_alloc)(mem_key_buf_buf_pool, d, &owner); +#endif + } + else + shrunk(size, size_t(-d)); + } - buf_resize_status("Resizing other hash tables."); + mysql_mutex_unlock(&mutex); - srv_lock_table_size = 5 - * (srv_buf_pool_size >> srv_page_size_shift); - lock_sys.resize(srv_lock_table_size); - dict_sys.resize(); + if (significant_change) + { + sql_print_information("InnoDB: Resizing hash tables"); + srv_lock_table_size= 5 * n_blocks_new; + lock_sys.resize(srv_lock_table_size); + dict_sys.resize(); + } - ib::info() << "Resized hash tables: lock_sys," + ibuf_max_size_update(srv_change_buffer_max_size); #ifdef BTR_CUR_HASH_ADAPT - " adaptive hash index," -#endif /* BTR_CUR_HASH_ADAPT */ - " and dictionary."; - } - - /* normalize ibuf.max_size */ - ibuf_max_size_update(srv_change_buffer_max_size); - - if (srv_buf_pool_old_size != srv_buf_pool_size) { + if (ahi_disabled) + btr_search_enable(true); +#endif + mysql_mutex_lock(&LOCK_global_system_variables); + bool resized= n_blocks_removed < 0; + if (n_blocks_removed > 0) + { + mysql_mutex_lock(&mutex); + resized= size_in_bytes == old_size; + if (resized) + { + size_in_bytes_requested= size; + size_in_bytes= size; + } + mysql_mutex_unlock(&mutex); + } - buf_resize_status("Completed resizing buffer pool from %zu to %zu bytes." 
- ,srv_buf_pool_old_size, srv_buf_pool_size); - srv_buf_pool_old_size = srv_buf_pool_size; - } + if (resized) + sql_print_information("InnoDB: innodb_buffer_pool_size=%zum (%zu pages)" + " resized from %zum (%zu pages)", + size >> 20, n_blocks_new, old_size >> 20, + old_blocks); + } + else + { + size_t to_withdraw= size_t(n_blocks_removed); + n_blocks_to_withdraw= to_withdraw; + first_to_withdraw= &get_nth_page(n_blocks_new)->page; + size_in_bytes_requested= size; + mysql_mutex_unlock(&LOCK_global_system_variables); + mysql_mutex_unlock(&mutex); + DEBUG_SYNC_C("buf_pool_shrink_before_wakeup"); + mysql_mutex_lock(&flush_list_mutex); + page_cleaner_wakeup(true); + my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + mysql_mutex_unlock(&flush_list_mutex); #ifdef BTR_CUR_HASH_ADAPT - /* enable AHI if needed */ - if (btr_search_disabled) { - btr_search_enable(true); - ib::info() << "Re-enabled adaptive hash index."; - } + ahi_disabled= btr_search_disable(); #endif /* BTR_CUR_HASH_ADAPT */ + mysql_mutex_lock(&mutex); - if (warning) - buf_resize_status("Resizing buffer pool failed"); - - ut_d(validate()); - - return; -} + time_t last_message= 0; -#ifdef __linux__ -inline void buf_pool_t::garbage_collect() -{ - mysql_mutex_lock(&mutex); - size_t freed= 0; - -#ifdef BTR_CUR_HASH_ADAPT - /* buf_LRU_free_page() will temporarily release and reacquire - buf_pool.mutex for invoking btr_search_drop_page_hash_index(). Thus, - we must protect ourselves with the hazard pointer. */ -rescan: -#else - lru_hp.set(nullptr); -#endif - for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev; bpage; bpage= prev) - { - prev= UT_LIST_GET_PREV(LRU, bpage); -#ifdef BTR_CUR_HASH_ADAPT - lru_hp.set(prev); -#endif - auto state= bpage->state(); - ut_ad(state >= buf_page_t::FREED); - ut_ad(bpage->in_LRU_list); - - /* We try to free any pages that can be freed without writing out - anything. 
*/ - switch (bpage->oldest_modification()) { - case 0: - try_to_evict: - if (buf_LRU_free_page(bpage, true)) + do + { + time_t now= time(nullptr); + if (now - last_message > 15) { - evicted: - freed++; -#ifdef BTR_CUR_HASH_ADAPT - bpage= prev; - prev= lru_hp.get(); - if (!prev && bpage) - goto rescan; -#endif + if (last_message != 0 && to_withdraw == n_blocks_to_withdraw) + break; + to_withdraw= n_blocks_to_withdraw; + last_message= now; + sql_print_information("InnoDB: Trying to shrink" + " innodb_buffer_pool_size=%zum (%zu pages)" + " from %zum (%zu pages, to withdraw %zu)", + size >> 20, n_blocks_new, + old_size >> 20, n_blocks, to_withdraw); } - continue; - case 1: - break; - default: - if (state >= buf_page_t::UNFIXED) - continue; + shrink_status s{shrink(size)}; + if (s == SHRINK_DONE) + goto resized; + if (s != SHRINK_IN_PROGRESS) + break; } + while (!thd_kill_level(thd)); + + ut_ad(size_in_bytes > size_in_bytes_requested); + n_blocks_to_withdraw= 0; + first_to_withdraw= nullptr; + size_in_bytes_requested= size_in_bytes; - if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) + while (buf_page_t *b= UT_LIST_GET_FIRST(withdrawn)) { - ut_ad(!bpage->is_io_fixed()); - lsn_t oldest_modification= bpage->oldest_modification(); - switch (oldest_modification) { - case 1: - mysql_mutex_lock(&flush_list_mutex); - oldest_modification= bpage->oldest_modification(); - if (oldest_modification) - { - ut_ad(oldest_modification == 1); - delete_from_flush_list(bpage); - } - mysql_mutex_unlock(&flush_list_mutex); - /* fall through */ - case 0: - bpage->lock.u_unlock(true); - goto try_to_evict; - default: - if (bpage->state() < buf_page_t::UNFIXED && - oldest_modification <= log_sys.get_flushed_lsn()) - { - release_freed_page(bpage); - goto evicted; - } - else - bpage->lock.u_unlock(true); - } + UT_LIST_REMOVE(withdrawn, b); + UT_LIST_ADD_LAST(free, b); + ut_d(b->in_free_list= true); + ut_ad(b->state() == buf_page_t::NOT_USED); + b->lock.init(); } - } - -#if defined 
MADV_FREE - /* FIXME: Issue fewer calls for larger contiguous blocks of - memory. For now, we assume that this is acceptable, because this - code should be executed rarely. */ - for (buf_page_t *bpage= UT_LIST_GET_FIRST(free); bpage; - bpage= UT_LIST_GET_NEXT(list, bpage)) - madvise(bpage->frame, srv_page_size, MADV_FREE); -#endif - mysql_mutex_unlock(&mutex); - sql_print_information("InnoDB: Memory pressure event freed %zu pages", - freed); - return; -} -#endif /* __linux__ */ - -/** Thread pool task invoked by innodb_buffer_pool_size changes. */ -static void buf_resize_callback(void *) -{ - DBUG_ENTER("buf_resize_callback"); - ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); - mysql_mutex_lock(&buf_pool.mutex); - const auto size= srv_buf_pool_size; - const bool work= srv_buf_pool_old_size != size; - mysql_mutex_unlock(&buf_pool.mutex); - - if (work) - buf_pool.resize(); - else - { - std::ostringstream sout; - sout << "Size did not change: old size = new size = " << size; - buf_resize_status(sout.str().c_str()); - } - DBUG_VOID_RETURN; -} -/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */ -static tpool::task_group single_threaded_group(1); -static tpool::waitable_task buf_resize_task(buf_resize_callback, - nullptr, &single_threaded_group); - -void buf_resize_start() -{ -#if !defined(DBUG_OFF) && defined(__linux__) - DBUG_EXECUTE_IF("trigger_garbage_collection", - { - mem_pressure_obj.trigger_collection(); + mysql_mutex_unlock(&mutex); + my_printf_error(ER_WRONG_USAGE, "innodb_buffer_pool_size change aborted", + MYF(ME_ERROR_LOG)); + mysql_mutex_lock(&LOCK_global_system_variables); } - ); -#endif - - srv_thread_pool->submit_task(&buf_resize_task); -} -void buf_resize_shutdown() -{ -#ifdef __linux__ - buf_mem_pressure_shutdown(); -#endif - buf_resize_task.wait(); + ut_d(validate()); } - /** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and buf_pool.page_hash. The caller must relocate bpage->list. 
@param bpage ROW_FORMAT=COMPRESSED only block @param dpage destination control block */ -static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) +static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) noexcept { const page_id_t id{bpage->id()}; buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); ut_ad(!bpage->frame); mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(mach_read_from_4(bpage->zip.data + FIL_PAGE_OFFSET) == id.page_no()); ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); ut_ad(bpage == buf_pool.page_hash.get(id, chain)); ut_ad(!buf_pool.watch_is_sentinel(*bpage)); @@ -2274,6 +2118,7 @@ ut_ad(state <= buf_page_t::READ_FIX); ut_ad(bpage->lock.is_write_locked()); const auto frame= dpage->frame; + ut_ad(frame == reinterpret_cast(dpage)->frame_address()); dpage->lock.free(); new (dpage) buf_page_t(*bpage); @@ -2345,7 +2190,6 @@ ut_ad(w->access_time == 0); ut_ad(!w->oldest_modification()); ut_ad(!w->zip.data); - ut_ad(!w->in_zip_hash); static_assert(buf_page_t::NOT_USED == 0, "efficiency"); if (ut_d(auto s=) w->state()) { @@ -2625,6 +2469,8 @@ ut_ad(block->zip_size()); ut_a(block->page.id().space() != 0); + ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) + == block->page.id().page_no()); if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { @@ -2863,7 +2709,6 @@ if (b && !watch_is_sentinel(*b)) { uint32_t state= b->fix() + 1; - ut_ad(!b->in_zip_hash); hash_lock.unlock_shared(); if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) @@ -2893,7 +2738,8 @@ return reinterpret_cast(-1); } - if (UNIV_LIKELY(b->frame != nullptr)); + if (UNIV_LIKELY(b->frame != nullptr)) + ut_ad(b->frame==reinterpret_cast(b)->frame_address()); else if (state < buf_page_t::READ_FIX) goto unzip; else @@ -2959,6 +2805,49 @@ } } +TRANSACTIONAL_TARGET +uint32_t buf_pool_t::page_guess(buf_block_t *b, page_hash_latch &latch, + const page_id_t id) noexcept +{ + transactional_shared_lock_guard g{latch}; +#ifndef 
HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* shrunk() and my_virtual_mem_decommit() could retain the original + contents of the virtual memory range or zero it out immediately or + with a delay. Any zeroing out may lead to a false positive for + b->page.id() == id but never for b->page.state(). At the time of + the shrunk() call, shrink() and buf_LRU_block_free_non_file_page() + should guarantee that b->page.state() is equal to + buf_page_t::NOT_USED (0) for all to-be-freed blocks. */ +#else + /* shrunk() made the memory inaccessible. */ + if (UNIV_UNLIKELY(reinterpret_cast(b) >= memory + size_in_bytes)) + return 0; +#endif + const page_id_t block_id{b->page.id()}; +#ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* shrunk() may have invoked MEM_UNDEFINED() on this memory to be able + to catch any unintended access elsewhere in our code. */ + MEM_MAKE_DEFINED(&block_id, sizeof block_id); +#endif + + if (id == block_id) + { + uint32_t state= b->page.state(); +#ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* shrunk() may have invoked MEM_UNDEFINED() on this memory to be able + to catch any unintended access elsewhere in our code. */ + MEM_MAKE_DEFINED(&state, sizeof state); +#endif + /* Ignore guesses that point to read-fixed blocks. We can only + avoid a race condition by looking up the block via page_hash. */ + if ((state >= buf_page_t::FREED && state < buf_page_t::READ_FIX) || + state >= buf_page_t::WRITE_FIX) + return b->page.fix(); + ut_ad(b->page.frame); + } + return 0; +} + /** Low level function used to get access to a database page. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @@ -3023,22 +2912,9 @@ buf_block_t* block = guess; uint32_t state; - if (block) { - transactional_shared_lock_guard g{hash_lock}; - if (buf_pool.is_uncompressed(block) - && page_id == block->page.id()) { - ut_ad(!block->page.in_zip_hash); - state = block->page.state(); - /* Ignore guesses that point to read-fixed blocks. 
- We can only avoid a race condition by - looking up the block via buf_pool.page_hash. */ - if ((state >= buf_page_t::FREED - && state < buf_page_t::READ_FIX) - || state >= buf_page_t::WRITE_FIX) { - state = block->page.fix(); - goto got_block; - } - } + if (block + && (state = buf_pool.page_guess(block, hash_lock, page_id))) { + goto got_block; } guess = nullptr; @@ -3108,7 +2984,6 @@ goto loop; got_block: - ut_ad(!block->page.in_zip_hash); state++; got_block_fixed: ut_ad(state > buf_page_t::FREED); @@ -3313,6 +3188,7 @@ btr_search_drop_page_hash_index(block, true); #endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(block->page.frame == block->frame_address()); ut_ad(page_id_t(page_get_space_id(block->page.frame), page_get_page_no(block->page.frame)) == page_id); return block; @@ -3418,21 +3294,19 @@ return block; } -TRANSACTIONAL_TARGET buf_block_t *buf_page_optimistic_fix(buf_block_t *block, page_id_t id) noexcept { buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); - transactional_shared_lock_guard g - {buf_pool.page_hash.lock_get(chain)}; - if (UNIV_UNLIKELY(!buf_pool.is_uncompressed(block) || - id != block->page.id() || !block->page.frame)) - return nullptr; - const auto state= block->page.state(); - if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || - state >= buf_page_t::READ_FIX)) - return nullptr; - block->page.fix(); - return block; + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + if (uint32_t state= buf_pool.page_guess(block, hash_lock, id)) + { + if (UNIV_LIKELY(state >= buf_page_t::UNFIXED)) + return block; + else + /* Refuse access to pages that are marked as freed in the data file. 
*/ + block->page.unfix(); + } + return nullptr; } buf_block_t *buf_page_optimistic_get(buf_block_t *block, @@ -3635,6 +3509,7 @@ { mysql_mutex_unlock(&buf_pool.mutex); buf_block_t *block= reinterpret_cast(bpage); + ut_ad(bpage->frame == block->frame_address()); mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); #ifdef BTR_CUR_HASH_ADAPT drop_hash_entry= block->index; @@ -3670,7 +3545,8 @@ else { mysql_mutex_unlock(&buf_pool.mutex); - ut_ad(bpage->frame); + ut_ad(bpage->frame == + reinterpret_cast(bpage)->frame_address()); #ifdef BTR_CUR_HASH_ADAPT ut_ad(!reinterpret_cast(bpage)->index); #endif @@ -4064,10 +3940,9 @@ if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) { release_page: - if (node.space->full_crc32() && node.space->crypt_data && - recv_recovery_is_on() && - recv_sys.dblwr.find_encrypted_page(node, id().page_no(), - const_cast(read_frame))) + if (node.space->full_crc32() && recv_recovery_is_on() && + recv_sys.dblwr.find_deferred_page(node, id().page_no(), + const_cast(read_frame))) { /* Recover from doublewrite buffer */ err= DB_SUCCESS; @@ -4127,6 +4002,61 @@ return DB_SUCCESS; } +#ifdef BTR_CUR_HASH_ADAPT +/** Clear the adaptive hash index on all pages in the buffer pool. 
*/ +ATTRIBUTE_COLD void buf_pool_t::clear_hash_index() noexcept +{ + std::set garbage; + + mysql_mutex_lock(&mutex); + ut_ad(!btr_search_enabled); + + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + for (buf_block_t *block= reinterpret_cast(extent), + *extent_end= block + + pages_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; + block < extent_end && reinterpret_cast(block) < end; block++) + { + dict_index_t *index= block->index; + assert_block_ahi_valid(block); + + /* We can clear block->index and block->n_pointers when + holding all AHI latches exclusively; see the comments in buf0buf.h */ + + if (!index) + { +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(!block->n_pointers); +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + continue; + } + + ut_d(const auto s= block->page.state()); + /* Another thread may have set the state to + REMOVE_HASH in buf_LRU_block_remove_hashed(). + + The state change in buf_pool_t::resize() is not observable + here, because in that case we would have !block->index. + + In the end, the entire adaptive hash index will be removed. */ + ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH); +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers= 0; +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + if (index->freed()) + garbage.insert(index); + block->index= nullptr; + } + + mysql_mutex_unlock(&mutex); + + for (dict_index_t *index : garbage) + btr_search_lazy_free(index); +} +#endif /* BTR_CUR_HASH_ADAPT */ + #ifdef UNIV_DEBUG /** Check that all blocks are in a replaceable state. 
@return address of a non-free block @@ -4134,10 +4064,44 @@ void buf_pool_t::assert_all_freed() noexcept { mysql_mutex_lock(&mutex); - const chunk_t *chunk= chunks; - for (auto i= n_chunks; i--; chunk++) - if (const buf_block_t* block= chunk->not_freed()) - ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + for (buf_block_t *block= reinterpret_cast(extent), + *extent_end= block + + pages_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; + block < extent_end && reinterpret_cast(block) < end; block++) + { + if (!block->page.in_file()) + continue; + switch (const lsn_t lsn= block->page.oldest_modification()) { + case 0: + case 1: + break; + + case 2: + ut_ad(fsp_is_system_temporary(block->page.id().space())); + break; + + default: + if (srv_read_only_mode) + { + /* The page cleaner is disabled in read-only mode. No pages + can be dirtied, so all of them must be clean. */ + ut_ad(lsn == recv_sys.lsn || + srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + break; + } + + goto fixed_or_dirty; + } + + if (!block->page.can_relocate()) + fixed_or_dirty: + ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + } + mysql_mutex_unlock(&mutex); } #endif /* UNIV_DEBUG */ @@ -4187,40 +4151,35 @@ mysql_mutex_lock(&mutex); - chunk_t* chunk = chunks; - /* Check the uncompressed blocks. 
*/ - for (auto i = n_chunks; i--; chunk++) { - buf_block_t* block = chunk->blocks; - - for (auto j = chunk->size; j--; block++) { - ut_ad(block->page.frame); - switch (const auto f = block->page.state()) { - case buf_page_t::NOT_USED: - n_free++; - break; + for (ulint i = 0; i < n_blocks; i++) { + const buf_block_t* block = get_nth_page(i); + ut_ad(block->page.frame == block->frame_address()); - case buf_page_t::MEMORY: - case buf_page_t::REMOVE_HASH: - /* do nothing */ + switch (const auto f = block->page.state()) { + case buf_page_t::NOT_USED: + ut_ad(!block->page.in_LRU_list); + n_free++; + break; + case buf_page_t::MEMORY: + case buf_page_t::REMOVE_HASH: + /* do nothing */ + break; + default: + if (f >= buf_page_t::READ_FIX + && f < buf_page_t::WRITE_FIX) { + /* A read-fixed block is not + necessarily in the page_hash yet. */ break; - - default: - if (f >= buf_page_t::READ_FIX - && f < buf_page_t::WRITE_FIX) { - /* A read-fixed block is not - necessarily in the page_hash yet. */ - break; - } - ut_ad(f >= buf_page_t::FREED); - const page_id_t id{block->page.id()}; - ut_ad(page_hash.get( - id, - page_hash.cell_get(id.fold())) - == &block->page); - n_lru++; } + ut_ad(f >= buf_page_t::FREED); + const page_id_t id{block->page.id()}; + ut_ad(page_hash.get( + id, + page_hash.cell_get(id.fold())) + == &block->page); + n_lru++; } } @@ -4245,24 +4204,11 @@ ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); mysql_mutex_unlock(&flush_list_mutex); - - if (n_chunks_new == n_chunks - && n_lru + n_free > curr_size + n_zip) { - - ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free - << ", pool " << curr_size - << " zip " << n_zip << ". Aborting..."; - } - + ut_ad(n_lru + n_free <= n_blocks + n_zip); ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru); - - if (n_chunks_new == n_chunks - && UT_LIST_GET_LEN(free) != n_free) { - - ib::fatal() << "Free list len " - << UT_LIST_GET_LEN(free) - << ", free blocks " << n_free << ". 
Aborting..."; - } + ut_ad(UT_LIST_GET_LEN(free) <= n_free); + ut_ad(size_in_bytes != size_in_bytes_requested + || UT_LIST_GET_LEN(free) == n_free); mysql_mutex_unlock(&mutex); @@ -4277,26 +4223,23 @@ { index_id_t* index_ids; ulint* counts; - ulint size; ulint i; - ulint j; index_id_t id; ulint n_found; - chunk_t* chunk; dict_index_t* index; - size = curr_size; + mysql_mutex_lock(&mutex); index_ids = static_cast( - ut_malloc_nokey(size * sizeof *index_ids)); + ut_malloc_nokey(n_blocks * sizeof *index_ids)); - counts = static_cast(ut_malloc_nokey(sizeof(ulint) * size)); + counts = static_cast( + ut_malloc_nokey(sizeof(ulint) * n_blocks)); - mysql_mutex_lock(&mutex); mysql_mutex_lock(&flush_list_mutex); ib::info() - << "[buffer pool: size=" << curr_size + << "[buffer pool: size=" << n_blocks << ", database pages=" << UT_LIST_GET_LEN(LRU) << ", free pages=" << UT_LIST_GET_LEN(free) << ", modified database pages=" @@ -4316,38 +4259,28 @@ n_found = 0; - chunk = chunks; - - for (i = n_chunks; i--; chunk++) { - buf_block_t* block = chunk->blocks; - ulint n_blocks = chunk->size; - - for (; n_blocks--; block++) { - const buf_frame_t* frame = block->page.frame; - - if (fil_page_index_page_check(frame)) { - - id = btr_page_get_index_id(frame); - - /* Look for the id in the index_ids array */ - j = 0; - - while (j < n_found) { - - if (index_ids[j] == id) { - counts[j]++; - - break; - } - j++; - } - - if (j == n_found) { - n_found++; - index_ids[j] = id; - counts[j] = 1; + for (size_t i = 0; i < n_blocks; i++) { + buf_block_t* block = get_nth_page(i); + const buf_frame_t* frame = block->page.frame; + ut_ad(frame == block->frame_address()); + + if (fil_page_index_page_check(frame)) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + for (ulint j = 0; j < n_found; j++) { + if (index_ids[j] == id) { + counts[j]++; + goto found; } } + + index_ids[n_found] = id; + counts[n_found] = 1; + n_found++; +found: + continue; } } @@ -4381,138 +4314,78 
@@ { ulint fixed_pages_number= 0; - mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_assert_owner(&buf_pool.mutex); for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b; b= UT_LIST_GET_NEXT(LRU, b)) if (b->state() > buf_page_t::UNFIXED) fixed_pages_number++; - mysql_mutex_unlock(&buf_pool.mutex); - return fixed_pages_number; } #endif /* UNIV_DEBUG */ -/** Collect buffer pool metadata. -@param[out] pool_info buffer pool metadata */ -void buf_stats_get_pool_info(buf_pool_info_t *pool_info) noexcept +void buf_pool_t::get_info(buf_pool_info_t *pool_info) noexcept { - time_t current_time; - double time_elapsed; - - mysql_mutex_lock(&buf_pool.mutex); - - pool_info->pool_size = buf_pool.curr_size; - - pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU); - - pool_info->old_lru_len = buf_pool.LRU_old_len; - - pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free); - - mysql_mutex_lock(&buf_pool.flush_list_mutex); - pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); - - pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); - - pool_info->n_pend_reads = os_aio_pending_reads_approx(); - - pool_info->n_pending_flush_lru = buf_pool.n_flush(); - - pool_info->n_pending_flush_list = os_aio_pending_writes(); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - - current_time = time(NULL); - time_elapsed = 0.001 + difftime(current_time, - buf_pool.last_printout_time); - - pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young; - - pool_info->n_pages_not_made_young = - buf_pool.stat.n_pages_not_made_young; - - pool_info->n_pages_read = buf_pool.stat.n_pages_read; - - pool_info->n_pages_created = buf_pool.stat.n_pages_created; - - pool_info->n_pages_written = buf_pool.stat.n_pages_written; - - pool_info->n_page_gets = buf_pool.stat.n_page_gets; - - pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd; - pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read; - - pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted; - - 
pool_info->page_made_young_rate = - static_cast(buf_pool.stat.n_pages_made_young - - buf_pool.old_stat.n_pages_made_young) - / time_elapsed; - - pool_info->page_not_made_young_rate = - static_cast(buf_pool.stat.n_pages_not_made_young - - buf_pool.old_stat.n_pages_not_made_young) - / time_elapsed; - - pool_info->pages_read_rate = - static_cast(buf_pool.stat.n_pages_read - - buf_pool.old_stat.n_pages_read) - / time_elapsed; - - pool_info->pages_created_rate = - static_cast(buf_pool.stat.n_pages_created - - buf_pool.old_stat.n_pages_created) - / time_elapsed; - - pool_info->pages_written_rate = - static_cast(buf_pool.stat.n_pages_written - - buf_pool.old_stat.n_pages_written) - / time_elapsed; - - pool_info->n_page_get_delta = buf_pool.stat.n_page_gets - - buf_pool.old_stat.n_page_gets; - - if (pool_info->n_page_get_delta) { - pool_info->page_read_delta = buf_pool.stat.n_pages_read - - buf_pool.old_stat.n_pages_read; - - pool_info->young_making_delta = - buf_pool.stat.n_pages_made_young - - buf_pool.old_stat.n_pages_made_young; - - pool_info->not_young_making_delta = - buf_pool.stat.n_pages_not_made_young - - buf_pool.old_stat.n_pages_not_made_young; - } - pool_info->pages_readahead_rnd_rate = - static_cast(buf_pool.stat.n_ra_pages_read_rnd - - buf_pool.old_stat.n_ra_pages_read_rnd) - / time_elapsed; - - - pool_info->pages_readahead_rate = - static_cast(buf_pool.stat.n_ra_pages_read - - buf_pool.old_stat.n_ra_pages_read) - / time_elapsed; - - pool_info->pages_evicted_rate = - static_cast(buf_pool.stat.n_ra_pages_evicted - - buf_pool.old_stat.n_ra_pages_evicted) - / time_elapsed; - - pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU); - - pool_info->io_sum = buf_LRU_stat_sum.io; - - pool_info->io_cur = buf_LRU_stat_cur.io; + mysql_mutex_lock(&mutex); + pool_info->pool_size= curr_size(); + pool_info->lru_len= UT_LIST_GET_LEN(LRU); + pool_info->old_lru_len= LRU_old_len; + pool_info->free_list_len= UT_LIST_GET_LEN(free); - pool_info->unzip_sum = 
buf_LRU_stat_sum.unzip; + mysql_mutex_lock(&flush_list_mutex); + pool_info->flush_list_len= UT_LIST_GET_LEN(flush_list); + pool_info->n_pend_unzip= UT_LIST_GET_LEN(unzip_LRU); + pool_info->n_pend_reads= os_aio_pending_reads_approx(); + pool_info->n_pending_flush_lru= n_flush(); + pool_info->n_pending_flush_list= os_aio_pending_writes(); + mysql_mutex_unlock(&flush_list_mutex); - pool_info->unzip_cur = buf_LRU_stat_cur.unzip; + double elapsed= 0.001 + difftime(time(nullptr), last_printout_time); - buf_refresh_io_stats(); - mysql_mutex_unlock(&buf_pool.mutex); + pool_info->n_pages_made_young= stat.n_pages_made_young; + pool_info->page_made_young_rate= + double(stat.n_pages_made_young - old_stat.n_pages_made_young) / + elapsed; + pool_info->n_pages_not_made_young= stat.n_pages_not_made_young; + pool_info->page_not_made_young_rate= + double(stat.n_pages_not_made_young - old_stat.n_pages_not_made_young) / + elapsed; + pool_info->n_pages_read= stat.n_pages_read; + pool_info->pages_read_rate= + double(stat.n_pages_read - old_stat.n_pages_read) / elapsed; + pool_info->n_pages_created= stat.n_pages_created; + pool_info->pages_created_rate= + double(stat.n_pages_created - old_stat.n_pages_created) / elapsed; + pool_info->n_pages_written= stat.n_pages_written; + pool_info->pages_written_rate= + double(stat.n_pages_written - old_stat.n_pages_written) / elapsed; + pool_info->n_page_gets= stat.n_page_gets; + pool_info->n_page_get_delta= stat.n_page_gets - old_stat.n_page_gets; + if (pool_info->n_page_get_delta) + { + pool_info->page_read_delta= stat.n_pages_read - old_stat.n_pages_read; + pool_info->young_making_delta= + stat.n_pages_made_young - old_stat.n_pages_made_young; + pool_info->not_young_making_delta= + stat.n_pages_not_made_young - old_stat.n_pages_not_made_young; + } + pool_info->n_ra_pages_read_rnd= stat.n_ra_pages_read_rnd; + pool_info->pages_readahead_rnd_rate= + double(stat.n_ra_pages_read_rnd - old_stat.n_ra_pages_read_rnd) / elapsed; + 
pool_info->n_ra_pages_read= stat.n_ra_pages_read; + pool_info->pages_readahead_rate= + double(stat.n_ra_pages_read - old_stat.n_ra_pages_read) / elapsed; + pool_info->n_ra_pages_evicted= stat.n_ra_pages_evicted; + pool_info->pages_evicted_rate= + double(stat.n_ra_pages_evicted - old_stat.n_ra_pages_evicted) / elapsed; + pool_info->unzip_lru_len= UT_LIST_GET_LEN(unzip_LRU); + pool_info->io_sum= buf_LRU_stat_sum.io; + pool_info->io_cur= buf_LRU_stat_cur.io; + pool_info->unzip_sum= buf_LRU_stat_sum.unzip; + pool_info->unzip_cur= buf_LRU_stat_cur.unzip; + buf_refresh_io_stats(); + mysql_mutex_unlock(&mutex); } /*********************************************************************//** @@ -4620,7 +4493,7 @@ { buf_pool_info_t pool_info; - buf_stats_get_pool_info(&pool_info); + buf_pool.get_info(&pool_info); buf_print_io_instance(&pool_info, file); } diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0dblwr.cc mariadb-10.11.13/storage/innobase/buf/buf0dblwr.cc --- mariadb-10.11.11/storage/innobase/buf/buf0dblwr.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0dblwr.cc 2025-05-19 16:14:25.000000000 +0000 @@ -365,7 +365,7 @@ ut_ad(log_sys.last_checkpoint_lsn); if (!is_created()) return; - const lsn_t max_lsn{log_sys.get_lsn()}; + const lsn_t max_lsn{log_sys.get_flushed_lsn(std::memory_order_relaxed)}; ut_ad(recv_sys.scanned_lsn == max_lsn); ut_ad(recv_sys.scanned_lsn >= recv_sys.lsn); @@ -374,7 +374,7 @@ srv_page_size)); byte *const buf= read_buf + srv_page_size; - std::deque encrypted_pages; + std::deque deferred_pages; for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin(); i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr) { @@ -393,11 +393,12 @@ { /* These pages does not appear to belong to any tablespace. There is a possibility that this page could be - encrypted using full_crc32 format. If innodb encounters - any corrupted encrypted page during recovery then - InnoDB should use this page to find the valid page. 
- See find_encrypted_page() */ - encrypted_pages.push_back(*i); + encrypted/compressed using full_crc32 format. + If innodb encounters any corrupted encrypted/compressed + page during recovery then InnoDB should use this page to + find the valid page. + See find_encrypted_page()/find_page_compressed() */ + deferred_pages.push_back(*i); continue; } @@ -478,7 +479,7 @@ } recv_sys.dblwr.pages.clear(); - for (byte *page : encrypted_pages) + for (byte *page : deferred_pages) recv_sys.dblwr.pages.push_back(page); fil_flush_file_spaces(); aligned_free(read_buf); @@ -599,20 +600,67 @@ } #endif /* UNIV_DEBUG */ +ATTRIBUTE_COLD void buf_dblwr_t::print_info() const noexcept +{ + mysql_mutex_assert_owner(&mutex); + const slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + + sql_print_information("InnoDB: Double Write State\n" + "-------------------\n" + "Batch running : %s\n" + "Active Slot - first_free: %zu reserved: %zu\n" + "Flush Slot - first_free: %zu reserved: %zu\n" + "-------------------", + (batch_running ? 
"true" : "false"), + active_slot->first_free, active_slot->reserved, + flush_slot->first_free, flush_slot->reserved); +} + bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept { mysql_mutex_assert_owner(&mutex); ut_ad(size == block_size()); - for (;;) + const size_t max_count= 60 * 60; + const size_t first_log_count= 30; + const size_t fatal_threshold= + static_cast(srv_fatal_semaphore_wait_threshold); + size_t log_count= first_log_count; + + for (size_t count= 0;;) { if (!active_slot->first_free) return false; if (!batch_running) break; - my_cond_wait(&cond, &mutex.m_mutex); - } + timespec abstime; + set_timespec(abstime, 1); + my_cond_timedwait(&cond, &mutex.m_mutex, &abstime); + + if (count > fatal_threshold) + { + buf_pool.print_flush_info(); + print_info(); + ib::fatal() << "InnoDB: Long wait (" << count + << " seconds) for double-write buffer flush."; + } + else if (++count < first_log_count && !(count % 5)) + { + sql_print_information("InnoDB: Long wait (%zu seconds) for double-write" + " buffer flush.", count); + buf_pool.print_flush_info(); + print_info(); + } + else if (!(count % log_count)) + { + sql_print_warning("InnoDB: Long wait (%zu seconds) for double-write" + " buffer flush.", count); + buf_pool.print_flush_info(); + print_info(); + log_count= log_count >= max_count ? 
max_count : log_count * 2; + } + } ut_ad(active_slot->reserved == active_slot->first_free); ut_ad(!flushing_buffered_writes); @@ -732,6 +780,9 @@ ut_ad(lsn); ut_ad(lsn >= bpage->oldest_modification()); log_write_up_to(lsn, true); + ut_ad(!e.request.node->space->full_crc32() || + !buf_page_is_corrupted(true, static_cast(frame), + e.request.node->space->flags)); e.request.node->space->io(e.request, bpage->physical_offset(), e_size, frame, bpage); } diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0dump.cc mariadb-10.11.13/storage/innobase/buf/buf0dump.cc --- mariadb-10.11.11/storage/innobase/buf/buf0dump.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0dump.cc 2025-05-19 16:14:25.000000000 +0000 @@ -58,7 +58,7 @@ static volatile bool buf_dump_should_start; static volatile bool buf_load_should_start; -static bool buf_load_abort_flag; +static Atomic_relaxed buf_load_abort_flag; /** Start the buffer pool dump/load task and instructs it to start a dump. */ void buf_dump_start() @@ -295,7 +295,7 @@ /* limit the number of total pages dumped to X% of the total number of pages */ - t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100; + t_pages = buf_pool.curr_size() * srv_buf_pool_dump_pct / 100; if (n_pages > t_pages) { buf_dump_status(STATUS_INFO, "Restricted to " ULINTPF @@ -477,10 +477,10 @@ return; } - /* If dump is larger than the buffer pool(s), then we ignore the + /* If the dump is larger than the buffer pool, then we ignore the extra trailing. This could happen if a dump is made, then buffer pool is shrunk and then load is attempted. 
*/ - dump_n = std::min(dump_n, buf_pool.get_n_pages()); + dump_n = std::min(dump_n, buf_pool.curr_size()); if (dump_n != 0) { dump = static_cast(ut_malloc_nokey( diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0flu.cc mariadb-10.11.13/storage/innobase/buf/buf0flu.cc --- mariadb-10.11.11/storage/innobase/buf/buf0flu.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0flu.cc 2025-05-19 16:14:25.000000000 +0000 @@ -281,6 +281,8 @@ { ut_ad(!persistent == fsp_is_system_temporary(id().space())); ut_ad(state >= WRITE_FIX); + ut_ad(!frame || + frame == reinterpret_cast(this)->frame_address()); if (UNIV_LIKELY(!error)) { @@ -692,7 +694,6 @@ { static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); mach_write_to_4(tmp + len - 4, my_crc32c(0, tmp, len - 4)); - ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); } d= tmp; @@ -795,6 +796,7 @@ size_t orig_size; #endif buf_tmp_buffer_t *slot= nullptr; + byte *page= frame; if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ { @@ -810,7 +812,6 @@ } else { - byte *page= frame; size= block->physical_size(); #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 orig_size= size; @@ -852,6 +853,8 @@ if (!space->is_temporary() && !space->is_being_imported() && lsn > log_sys.get_flushed_lsn()) log_write_up_to(lsn, true); + ut_ad(space->is_temporary() || !space->full_crc32() || + !buf_page_is_corrupted(true, write_frame, space->flags)); space->io(IORequest{type, this, slot}, physical_offset(), size, write_frame, this); } @@ -891,7 +894,7 @@ : space.physical_size() == 1024 ? 3 : 0)); /* When flushed, dirty blocks are searched in neighborhoods of this size, and flushed along with the original page. */ - const ulint s= buf_pool.curr_size / 16; + const ulint s= buf_pool.curr_size() / 16; const uint32_t read_ahead= buf_pool.read_ahead_area; const uint32_t buf_flush_area= read_ahead > s ? 
static_cast(s) : read_ahead; @@ -1209,18 +1212,34 @@ buf_LRU_free_page(bpage, true); } +/** Adjust to_withdraw during buf_pool_t::shrink() */ +ATTRIBUTE_COLD static size_t buf_flush_LRU_to_withdraw(size_t to_withdraw, + const buf_page_t &bpage) + noexcept +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + if (!buf_pool.is_shrinking()) + return 0; + const size_t size{buf_pool.size_in_bytes_requested}; + if (buf_pool.will_be_withdrawn(bpage.frame, size) || + buf_pool.will_be_withdrawn(bpage.zip.data, size)) + to_withdraw--; + return to_withdraw; +} + /** Flush dirty blocks from the end buf_pool.LRU, and move clean blocks to buf_pool.free. -@param max maximum number of blocks to flush -@param n counts of flushed and evicted pages */ -static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) noexcept +@param max maximum number of blocks to flush +@param n counts of flushed and evicted pages +@param to_withdraw buf_pool.to_withdraw() */ +static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, + size_t to_withdraw) noexcept { - ulint scanned= 0; + size_t scanned= 0; mysql_mutex_assert_owner(&buf_pool.mutex); - ulint free_limit{buf_pool.LRU_scan_depth}; - if (buf_pool.withdraw_target && buf_pool.is_shrinking()) - free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw); - + size_t free_limit{buf_pool.LRU_scan_depth}; + if (UNIV_UNLIKELY(to_withdraw > free_limit)) + to_withdraw= free_limit; const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 0 : buf_pool.flush_neighbors; fil_space_t *space= nullptr; @@ -1230,20 +1249,21 @@ /* BUF_LRU_MIN_LEN (256) is too high value for low buffer pool(BP) size. For example, for BP size lower than 80M and 16 K page size, the limit is more than - 5% of total BP and for lowest BP 5M, it is 80% of the BP. Non-data objects + 5% of total BP and for lowest BP 6M, it is 80% of the BP. 
Non-data objects like explicit locks could occupy part of the BP pool reducing the pages available for LRU. If LRU reaches minimum limit and if no free pages are available, server would hang with page cleaner not able to free any more pages. To avoid such hang, we adjust the LRU limit lower than the limit for data objects as checked in buf_LRU_check_size_of_non_data_objects() i.e. one page less than 5% of BP. */ - size_t pool_limit= buf_pool.curr_size / 20 - 1; - auto buf_lru_min_len= std::min(pool_limit, BUF_LRU_MIN_LEN); + const size_t buf_lru_min_len= + std::min((buf_pool.usable_size()) / 20 - 1, size_t{BUF_LRU_MIN_LEN}); for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && ((UT_LIST_GET_LEN(buf_pool.LRU) > buf_lru_min_len && UT_LIST_GET_LEN(buf_pool.free) < free_limit) || + to_withdraw || recv_recovery_is_on()); ++scanned, bpage= buf_pool.lru_hp.get()) { @@ -1259,6 +1279,8 @@ if (state != buf_page_t::FREED && (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state))) continue; + if (UNIV_UNLIKELY(to_withdraw != 0)) + to_withdraw= buf_flush_LRU_to_withdraw(to_withdraw, *bpage); buf_LRU_free_page(bpage, true); ++n->evicted; if (UNIV_LIKELY(scanned & 31)) @@ -1330,20 +1352,32 @@ continue; } + if (state < buf_page_t::UNFIXED) + goto flush; + if (n->flushed >= max && !recv_recovery_is_on()) { bpage->lock.u_unlock(true); break; } - if (neighbors && space->is_rotational()) + if (neighbors && space->is_rotational() && UNIV_LIKELY(!to_withdraw) && + /* Skip neighbourhood flush from LRU list if we haven't yet reached + half of the free page target. 
*/ + UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit) n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, neighbors == 1, n->flushed, max); - else if (bpage->flush(space)) - ++n->flushed; else - continue; + { + flush: + if (UNIV_UNLIKELY(to_withdraw != 0)) + to_withdraw= buf_flush_LRU_to_withdraw(to_withdraw, *bpage); + if (bpage->flush(space)) + ++n->flushed; + else + continue; + } goto reacquire_mutex; } @@ -1372,11 +1406,12 @@ @param n counts of flushed and evicted pages */ static void buf_do_LRU_batch(ulint max, flush_counters_t *n) noexcept { - if (buf_LRU_evict_from_unzip_LRU()) + const size_t to_withdraw= buf_pool.to_withdraw(); + if (!to_withdraw && buf_LRU_evict_from_unzip_LRU()) buf_free_from_unzip_LRU_list_batch(); n->evicted= 0; n->flushed= 0; - buf_flush_LRU_list_batch(max, n); + buf_flush_LRU_list_batch(max, n, to_withdraw); mysql_mutex_assert_owner(&buf_pool.mutex); buf_lru_freed_page_count+= n->evicted; @@ -1725,14 +1760,22 @@ buf_do_LRU_batch(max_n, &n); ulint pages= n.flushed; + ulint evicted= n.evicted; + + /* If we have exhausted flush quota, it is likely we exited before + generating enough free pages. Call once more with 0 flush to generate + free pages immediately as required. */ + if (pages >= max_n) + buf_do_LRU_batch(0, &n); - if (n.evicted) + evicted+= n.evicted; + if (evicted) { buf_pool.try_LRU_scan= true; pthread_cond_broadcast(&buf_pool.done_free); } else if (!pages && !buf_pool.try_LRU_scan) - /* For example, with the minimum innodb_buffer_pool_size=5M and + /* For example, with the minimum innodb_buffer_pool_size=6M and the default innodb_page_size=16k there are only a little over 316 pages in the buffer pool. The buffer pool can easily be exhausted by a workload of some dozen concurrent connections. 
The system could @@ -1760,8 +1803,9 @@ { ut_ad(!srv_read_only_mode); ut_ad(end_lsn >= next_checkpoint_lsn); - ut_ad(end_lsn <= get_lsn()); - ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= get_lsn() || + ut_d(const lsn_t current_lsn{get_lsn()}); + ut_ad(end_lsn <= current_lsn); + ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= current_lsn || srv_shutdown_state > SRV_SHUTDOWN_INITIATED); DBUG_PRINT("ib_log", @@ -1890,7 +1934,8 @@ ut_ad(!is_opened()); my_munmap(buf, file_size); buf= resize_buf; - set_buf_free(START_OFFSET + (get_lsn() - resizing)); + buf_size= unsigned(std::min(resize_target - START_OFFSET, + buf_size_max)); } else #endif @@ -1912,7 +1957,8 @@ resize_flush_buf= nullptr; resize_target= 0; resize_lsn.store(0, std::memory_order_relaxed); - writer_update(); + resize_initiator= nullptr; + writer_update(false); } log_resize_release(); @@ -1999,6 +2045,14 @@ if (recv_recovery_is_on()) recv_sys.apply(true); +#if defined HAVE_valgrind && !__has_feature(memory_sanitizer) + /* The built-in scheduler in Valgrind may neglect some threads for a + long time. Under Valgrind, let us explicitly wait for page write + completion in order to avoid a result difference in the test + innodb.page_cleaner. */ + os_aio_wait_until_no_pending_writes(false); +#endif + switch (srv_file_flush_method) { case SRV_NOSYNC: case SRV_O_DIRECT_NO_FSYNC: @@ -2016,9 +2070,9 @@ } /** Make a checkpoint. */ -ATTRIBUTE_COLD void log_make_checkpoint() +ATTRIBUTE_COLD void log_make_checkpoint() noexcept { - buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire)); + buf_flush_wait_flushed(log_get_lsn()); while (!log_checkpoint()); } @@ -2026,8 +2080,6 @@ NOTE: The calling thread is not allowed to hold any buffer page latches! 
*/ static void buf_flush_wait(lsn_t lsn) noexcept { - ut_ad(lsn <= log_sys.get_lsn()); - lsn_t oldest_lsn; while ((oldest_lsn= buf_pool.get_oldest_modification(lsn)) < lsn) @@ -2192,6 +2244,8 @@ MONITOR_FLUSH_SYNC_PAGES, n_flushed); } + os_aio_wait_until_no_pending_writes(false); + switch (srv_file_flush_method) { case SRV_NOSYNC: case SRV_O_DIRECT_NO_FSYNC: @@ -2234,13 +2288,13 @@ mysql_mutex_unlock(&buf_pool.flush_list_mutex); } -/** Check if the adpative flushing threshold is recommended based on +/** Check if the adaptive flushing threshold is recommended based on redo log capacity filled threshold. @param oldest_lsn buf_pool.get_oldest_modification() @return true if adaptive flushing is recommended. */ static bool af_needed_for_redo(lsn_t oldest_lsn) noexcept { - lsn_t age= (log_sys.get_lsn() - oldest_lsn); + lsn_t age= log_sys.get_lsn_approx() - oldest_lsn; lsn_t af_lwm= static_cast(srv_adaptive_flushing_lwm * static_cast(log_sys.log_capacity) / 100); @@ -2300,7 +2354,7 @@ lsn_t lsn_rate; ulint n_pages = 0; - const lsn_t cur_lsn = log_sys.get_lsn(); + const lsn_t cur_lsn = log_sys.get_lsn_approx(); ut_ad(oldest_lsn <= cur_lsn); ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn); time_t curr_time = time(nullptr); @@ -2309,13 +2363,23 @@ if (!prev_lsn || !pct_for_lsn) { prev_time = curr_time; prev_lsn = cur_lsn; - if (max_pct > 0.0) { - dirty_pct /= max_pct; - } - n_pages = ulint(dirty_pct * double(srv_io_capacity)); - if (n_pages < dirty_blocks) { - n_pages= std::min(srv_io_capacity, dirty_blocks); + if (srv_io_capacity >= dirty_blocks) { + n_pages = dirty_blocks; + } else { + if (max_pct > 1.0) { + dirty_pct/= max_pct; + } + n_pages= ulint(dirty_pct * double(srv_io_capacity)); + + if (n_pages < dirty_blocks) { + n_pages= srv_io_capacity; + + } else { + /* Set maximum IO capacity upper bound. 
*/ + n_pages= std::min(srv_max_io_capacity, + dirty_blocks); + } } func_exit: @@ -2412,6 +2476,13 @@ } TPOOL_SUPPRESS_TSAN +bool buf_pool_t::running_out() const noexcept +{ + return !recv_recovery_is_on() && + UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < n_blocks / 4; +} + +TPOOL_SUPPRESS_TSAN bool buf_pool_t::need_LRU_eviction() const noexcept { /* try_LRU_scan==false means that buf_LRU_get_free_block() is waiting @@ -2448,6 +2519,11 @@ DBUG_EXECUTE_IF("ib_page_cleaner_sleep", { std::this_thread::sleep_for(std::chrono::seconds(1)); + /* Cover the logging code in debug mode. */ + buf_pool.print_flush_info(); + buf_dblwr.lock(); + buf_dblwr.print_info(); + buf_dblwr.unlock(); }); lsn_limit= buf_flush_sync_lsn; @@ -2470,7 +2546,7 @@ (!UT_LIST_GET_LEN(buf_pool.flush_list) || srv_max_dirty_pages_pct_lwm == 0.0)) { - buf_pool.LRU_warned.clear(std::memory_order_release); + buf_pool.LRU_warned_clear(); /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex); @@ -2545,6 +2621,7 @@ buf_pool.n_flush_inc(); mysql_mutex_unlock(&buf_pool.flush_list_mutex); n= srv_max_io_capacity; + os_aio_wait_until_no_pending_writes(false); mysql_mutex_lock(&buf_pool.mutex); LRU_flush: n= buf_flush_LRU(n); @@ -2648,10 +2725,17 @@ !buf_pool.need_LRU_eviction()) goto check_oldest_and_set_idle; else + { mysql_mutex_lock(&buf_pool.mutex); + os_aio_wait_until_no_pending_writes(false); + } n= srv_max_io_capacity; n= n >= n_flushed ? n - n_flushed : 0; + /* It is critical to generate free pages to keep the system alive. Make + sure we are not hindered by dirty pages in LRU tail. 
*/ + n= std::max(n, std::min(srv_max_io_capacity, + buf_pool.LRU_scan_depth)); goto LRU_flush; } @@ -2689,11 +2773,13 @@ { mysql_mutex_assert_owner(&mutex); try_LRU_scan= false; - if (!LRU_warned.test_and_set(std::memory_order_acquire)) + if (!LRU_warned) + { + LRU_warned= true; sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!" - " %zu blocks are in use and %zu free." - " Consider increasing innodb_buffer_pool_size.", - UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free)); + " Consider increasing innodb_buffer_pool_size."); + print_flush_info(); + } } /** Initialize page_cleaner. */ @@ -2740,7 +2826,7 @@ NOTE: The calling thread is not allowed to hold any buffer page latches! */ void buf_flush_sync_batch(lsn_t lsn) noexcept { - lsn= std::max(lsn, log_sys.get_lsn()); + lsn= std::max(lsn, log_get_lsn()); mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_wait(lsn); mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2759,24 +2845,77 @@ thd_wait_begin(nullptr, THD_WAIT_DISKIO); tpool::tpool_wait_begin(); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - for (;;) + log_sys.latch.wr_lock(SRW_LOCK_CALL); + + for (lsn_t lsn= log_sys.get_lsn();;) { - const lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_wait(lsn); /* Wait for the page cleaner to be idle (for log resizing at startup) */ while (buf_flush_sync_lsn) my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); - if (lsn == log_sys.get_lsn()) + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + lsn_t new_lsn= log_sys.get_lsn(); + if (lsn == new_lsn) break; + lsn= new_lsn; } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_sys.latch.wr_unlock(); tpool::tpool_wait_end(); thd_wait_end(nullptr); } +ATTRIBUTE_COLD void buf_pool_t::print_flush_info() const noexcept +{ + /* We do dirty read of UT_LIST count variable. 
*/ + size_t lru_size= UT_LIST_GET_LEN(LRU); + size_t dirty_size= UT_LIST_GET_LEN(flush_list); + size_t free_size= UT_LIST_GET_LEN(free); + size_t dirty_pct= lru_size ? dirty_size * 100 / (lru_size + free_size) : 0; + sql_print_information("InnoDB: Buffer Pool pages\n" + "-------------------\n" + "LRU Pages : %zu\n" + "Free Pages : %zu\n" + "Dirty Pages: %zu : %zu%%\n" + "-------------------", + lru_size, free_size, dirty_size, dirty_pct); + + lsn_t lsn= log_get_lsn(); + lsn_t clsn= log_sys.last_checkpoint_lsn; + sql_print_information("InnoDB: LSN flush parameters\n" + "-------------------\n" + "System LSN : %" PRIu64 "\n" + "Checkpoint LSN: %" PRIu64 "\n" + "Flush ASync LSN: %" PRIu64 "\n" + "Flush Sync LSN: %" PRIu64 "\n" + "-------------------", + lsn, clsn, buf_flush_async_lsn.load(), buf_flush_sync_lsn.load()); + + lsn_t age= lsn - clsn; + lsn_t age_pct= log_sys.max_checkpoint_age + ? age * 100 / log_sys.max_checkpoint_age : 0; + sql_print_information("InnoDB: LSN age parameters\n" + "-------------------\n" + "Current Age : %" PRIu64 " : %" PRIu64 "%%\n" + "Max Age(Async): %" PRIu64 "\n" + "Max Age(Sync) : %" PRIu64 "\n" + "Capacity : %" PRIu64 "\n" + "-------------------", + age, age_pct, log_sys.max_modified_age_async, log_sys.max_checkpoint_age, + log_sys.log_capacity); + + sql_print_information("InnoDB: Pending IO count\n" + "-------------------\n" + "Pending Read : %zu\n" + "Pending Write: %zu\n" + "-------------------", + os_aio_pending_reads_approx(), os_aio_pending_writes_approx()); +} + #ifdef UNIV_DEBUG /** Functor to validate the flush list. 
*/ struct Check { diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0lru.cc mariadb-10.11.13/storage/innobase/buf/buf0lru.cc --- mariadb-10.11.11/storage/innobase/buf/buf0lru.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0lru.cc 2025-05-19 16:14:25.000000000 +0000 @@ -38,6 +38,7 @@ #include "srv0srv.h" #include "srv0mon.h" #include "my_cpu.h" +#include "log.h" /** The number of blocks from the LRU_old pointer onward, including the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV @@ -133,7 +134,7 @@ buf_pool.stat.LRU_bytes += bpage->physical_size(); - ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size); + ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size()); } /** @return whether the unzip_LRU list should be used for evicting a victim @@ -259,89 +260,55 @@ return(freed); } -/** @return a buffer block from the buf_pool.free list -@retval NULL if the free list is empty */ -buf_block_t* buf_LRU_get_free_only() -{ - buf_block_t* block; - - mysql_mutex_assert_owner(&buf_pool.mutex); - - block = reinterpret_cast( - UT_LIST_GET_FIRST(buf_pool.free)); - - while (block != NULL) { - ut_ad(block->page.in_free_list); - ut_d(block->page.in_free_list = FALSE); - ut_ad(!block->page.oldest_modification()); - ut_ad(!block->page.in_LRU_list); - ut_a(!block->page.in_file()); - UT_LIST_REMOVE(buf_pool.free, &block->page); - - if (!buf_pool.is_shrinking() - || UT_LIST_GET_LEN(buf_pool.withdraw) - >= buf_pool.withdraw_target - || !buf_pool.will_be_withdrawn(block->page)) { - /* No adaptive hash index entries may point to - a free block. 
*/ - assert_block_ahi_empty(block); - - block->page.set_state(buf_page_t::MEMORY); - block->page.set_os_used(); - break; - } - - /* This should be withdrawn */ - UT_LIST_ADD_LAST(buf_pool.withdraw, &block->page); - ut_d(block->in_withdraw_list = true); - - block = reinterpret_cast( - UT_LIST_GET_FIRST(buf_pool.free)); - } - - return(block); -} - /******************************************************************//** Checks how much of buf_pool is occupied by non-data objects like AHI, lock heaps etc. Depending on the size of non-data objects this function will either assert or issue a warning and switch on the status monitor. */ -static void buf_LRU_check_size_of_non_data_objects() +static void buf_LRU_check_size_of_non_data_objects() noexcept { mysql_mutex_assert_owner(&buf_pool.mutex); - if (recv_recovery_is_on() || buf_pool.n_chunks_new != buf_pool.n_chunks) + if (recv_recovery_is_on()) return; - const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU); + const size_t curr_size{buf_pool.usable_size()}; - if (s < buf_pool.curr_size / 20) - ib::fatal() << "Over 95 percent of the buffer pool is" - " occupied by lock heaps" + auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU); + + if (s >= curr_size / 20); + else if (buf_pool.is_shrinking()) + buf_pool.LRU_warn(); + else + { + sql_print_error("[FATAL] InnoDB: Over 95 percent of the buffer pool is" + " occupied by lock heaps" #ifdef BTR_CUR_HASH_ADAPT - " or the adaptive hash index" + " or the adaptive hash index" #endif /* BTR_CUR_HASH_ADAPT */ - "! Check that your transactions do not set too many" - " row locks, or review if innodb_buffer_pool_size=" - << (buf_pool.curr_size >> (20U - srv_page_size_shift)) - << "M could be bigger."; + "! 
Check that your transactions do not set too many" + " row locks, or review if innodb_buffer_pool_size=%zuM" + " could be bigger", + curr_size >> (20 - srv_page_size_shift)); + abort(); + } - if (s < buf_pool.curr_size / 3) + if (s < curr_size / 3) { if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer) { /* Over 67 % of the buffer pool is occupied by lock heaps or the adaptive hash index. This may be a memory leak! */ - ib::warn() << "Over 67 percent of the buffer pool is" - " occupied by lock heaps" + sql_print_warning("InnoDB: Over 67 percent of the buffer pool is" + " occupied by lock heaps" #ifdef BTR_CUR_HASH_ADAPT - " or the adaptive hash index" + " or the adaptive hash index" #endif /* BTR_CUR_HASH_ADAPT */ - "! Check that your transactions do not set too many row locks." - " innodb_buffer_pool_size=" - << (buf_pool.curr_size >> (20U - srv_page_size_shift)) - << "M. Starting the InnoDB Monitor to print diagnostics."; + "! Check that your transactions do not set too many" + " row locks. innodb_buffer_pool_size=%zuM." 
+ " Starting the InnoDB Monitor to print diagnostics.", + curr_size >> (20 - srv_page_size_shift)); + buf_lru_switched_on_innodb_mon= true; srv_print_innodb_monitor= TRUE; srv_monitor_timer_schedule_now(); @@ -389,15 +356,15 @@ retry: /* If there is a block in the free list, take it */ - block= buf_LRU_get_free_only(); + block= buf_pool.allocate(); if (block) { got_block: const ulint LRU_size= UT_LIST_GET_LEN(buf_pool.LRU); const ulint available= UT_LIST_GET_LEN(buf_pool.free); - const ulint scan_depth= buf_pool.LRU_scan_depth / 2; - ut_ad(LRU_size <= BUF_LRU_MIN_LEN || - available >= scan_depth || buf_pool.need_LRU_eviction()); + const size_t scan_depth{buf_pool.LRU_scan_depth / 2}; + ut_ad(LRU_size <= BUF_LRU_MIN_LEN || available >= scan_depth || + buf_pool.is_shrinking() || buf_pool.need_LRU_eviction()); ut_d(bool signalled = false); @@ -446,7 +413,7 @@ waited= true; - while (!(block= buf_LRU_get_free_only())) + while (!(block= buf_pool.allocate())) { buf_pool.stat.LRU_waits++; @@ -811,10 +778,10 @@ if (zip || !bpage->zip.data || !bpage->frame) { break; } + mysql_mutex_lock(&buf_pool.flush_list_mutex); relocate_compressed: b = static_cast(ut_zalloc_nokey(sizeof *b)); ut_a(b); - mysql_mutex_lock(&buf_pool.flush_list_mutex); new (b) buf_page_t(*bpage); b->frame = nullptr; { @@ -833,7 +800,12 @@ hash_lock.unlock(); return(false); } - goto relocate_compressed; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (bpage->can_relocate()) { + goto relocate_compressed; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + goto func_exit; } mysql_mutex_assert_owner(&buf_pool.mutex); @@ -872,7 +844,6 @@ /* The fields of bpage were copied to b before buf_LRU_block_remove_hashed() was invoked. 
*/ - ut_ad(!b->in_zip_hash); ut_ad(b->in_LRU_list); ut_ad(b->in_page_hash); ut_d(b->in_page_hash = false); @@ -988,24 +959,12 @@ if (data != NULL) { block->page.zip.data = NULL; - buf_pool_mutex_exit_forbid(); - ut_ad(block->zip_size()); - buf_buddy_free(data, block->zip_size()); - - buf_pool_mutex_exit_allow(); page_zip_set_size(&block->page.zip, 0); } - if (buf_pool.is_shrinking() - && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target - && buf_pool.will_be_withdrawn(block->page)) { - /* This should be withdrawn */ - UT_LIST_ADD_LAST( - buf_pool.withdraw, - &block->page); - ut_d(block->in_withdraw_list = true); + if (buf_pool.to_withdraw() && buf_pool.withdraw(block->page)) { } else { UT_LIST_ADD_FIRST(buf_pool.free, &block->page); ut_d(block->page.in_free_list = true); @@ -1106,7 +1065,6 @@ MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size()); } - ut_ad(!bpage->in_zip_hash); buf_pool.page_hash.remove(chain, bpage); page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); @@ -1118,11 +1076,7 @@ ut_ad(!bpage->oldest_modification()); hash_lock.unlock(); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(bpage->zip.data, bpage->zip_size()); - - buf_pool_mutex_exit_allow(); bpage->lock.free(); ut_free(bpage); return false; @@ -1151,12 +1105,7 @@ ut_ad(!bpage->in_free_list); ut_ad(!bpage->oldest_modification()); ut_ad(!bpage->in_LRU_list); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, bpage->zip_size()); - - buf_pool_mutex_exit_allow(); - page_zip_set_size(&bpage->zip, 0); } @@ -1327,7 +1276,7 @@ ut_ad(!bpage->frame || reinterpret_cast(bpage) ->in_unzip_LRU_list - == bpage->belongs_to_unzip_LRU()); + == !!bpage->zip.data); if (bpage->is_old()) { const buf_page_t* prev diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0rea.cc mariadb-10.11.13/storage/innobase/buf/buf0rea.cc --- mariadb-10.11.11/storage/innobase/buf/buf0rea.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0rea.cc 2025-05-19 
16:14:25.000000000 +0000 @@ -44,7 +44,7 @@ #include "log.h" #include "mariadb_stats.h" -/** If there are buf_pool.curr_size per the number below pending reads, then +/** If there are buf_pool.curr_size() per the number below pending reads, then read-ahead is not done: this is to prevent flooding the buffer pool with i/o-fixed buffer blocks */ #define BUF_READ_AHEAD_PEND_LIMIT 2 @@ -63,7 +63,6 @@ ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked()); ut_ad(w >= &watch[0]); ut_ad(w < &watch[array_elements(watch)]); - ut_ad(!w->in_zip_hash); ut_ad(!w->zip.data); uint32_t s{w->state()}; @@ -372,7 +371,7 @@ return 0; if (os_aio_pending_reads_approx() > - buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + buf_pool.curr_size() / BUF_READ_AHEAD_PEND_LIMIT) return 0; fil_space_t* space= fil_space_t::get(page_id.space()); @@ -525,7 +524,7 @@ return 0; if (os_aio_pending_reads_approx() > - buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + buf_pool.curr_size() / BUF_READ_AHEAD_PEND_LIMIT) return 0; const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0defrag_bg.cc mariadb-10.11.13/storage/innobase/dict/dict0defrag_bg.cc --- mariadb-10.11.11/storage/innobase/dict/dict0defrag_bg.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0defrag_bg.cc 2025-05-19 16:14:25.000000000 +0000 @@ -196,7 +196,7 @@ ? 
dict_table_find_index_on_id(table, index_id) : nullptr) if (index->is_btree()) dict_stats_save_defrag_stats(index); - dict_table_close(table, false, thd, mdl); + dict_table_close(table, thd, mdl); } } @@ -217,47 +217,17 @@ if (index->is_ibuf()) return DB_SUCCESS; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; - dict_table_t *table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) - { -release_and_exit: - if (table_stats) - dict_table_close(table_stats, false, thd, mdl_table); + dict_stats stats; + if (stats.open(thd)) return DB_STATS_DO_NOT_EXIST; - } - - dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) - goto release_and_exit; - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) - { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - trx_t *trx= trx_create(); trx->mysql_thd= thd; trx_start_internal(trx); dberr_t ret= trx->read_only ? 
DB_READ_ONLY - : lock_table_for_trx(table_stats, trx, LOCK_X); + : lock_table_for_trx(stats.table(), trx, LOCK_X); if (ret == DB_SUCCESS) - ret= lock_table_for_trx(index_stats, trx, LOCK_X); + ret= lock_table_for_trx(stats.index(), trx, LOCK_X); row_mysql_lock_data_dictionary(trx); if (ret == DB_SUCCESS) ret= dict_stats_save_index_stat(index, time(nullptr), "n_pages_freed", @@ -271,31 +241,27 @@ else trx->rollback(); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); - row_mysql_unlock_data_dictionary(trx); trx->free(); + stats.close(); return ret; } /**************************************************************//** Gets the number of reserved and used pages in a B-tree. -@return number of pages reserved, or ULINT_UNDEFINED if the index -is unavailable */ +@return number of pages reserved +@retval 0 if the index is unavailable */ static -ulint +uint32_t btr_get_size_and_reserved( dict_index_t* index, /*!< in: index */ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ - ulint* used, /*!< out: number of pages used (<= reserved) */ + uint32_t* used, /*!< out: number of pages used (<= reserved) */ mtr_t* mtr) /*!< in/out: mini-transaction where index is s-latched */ { - ulint dummy; + uint32_t dummy; ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK)); ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); @@ -304,19 +270,19 @@ || dict_index_is_online_ddl(index) || !index->is_committed() || !index->table->space) { - return(ULINT_UNDEFINED); + return 0; } dberr_t err; buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); *used = 0; if (!root) { - return ULINT_UNDEFINED; + return 0; } mtr->x_lock_space(index->table->space); - ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + auto n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + root->page.frame, used, mtr); if (flag == BTR_TOTAL_SIZE) { n += 
fseg_n_reserved_pages(*root, @@ -343,59 +309,28 @@ const time_t now= time(nullptr); mtr_t mtr; - ulint n_leaf_pages; + uint32_t n_leaf_pages; mtr.start(); mtr_sx_lock_index(index, &mtr); - ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, - &n_leaf_pages, &mtr); + uint32_t n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); mtr.commit(); - if (n_leaf_reserved == ULINT_UNDEFINED) + if (!n_leaf_reserved) return DB_SUCCESS; - THD *thd= current_thd; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; - dict_table_t* table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) - { -release_and_exit: - if (table_stats) - dict_table_close(table_stats, false, thd, mdl_table); + THD *const thd= current_thd; + dict_stats stats; + if (stats.open(thd)) return DB_STATS_DO_NOT_EXIST; - } - - dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) - goto release_and_exit; - - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) - { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - trx_t *trx= trx_create(); trx->mysql_thd= thd; trx_start_internal(trx); dberr_t ret= trx->read_only ? 
DB_READ_ONLY - : lock_table_for_trx(table_stats, trx, LOCK_X); + : lock_table_for_trx(stats.table(), trx, LOCK_X); if (ret == DB_SUCCESS) - ret= lock_table_for_trx(index_stats, trx, LOCK_X); + ret= lock_table_for_trx(stats.index(), trx, LOCK_X); row_mysql_lock_data_dictionary(trx); @@ -423,12 +358,9 @@ else trx->rollback(); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); trx->free(); + stats.close(); return ret; } diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0dict.cc mariadb-10.11.13/storage/innobase/dict/dict0dict.cc --- mariadb-10.11.11/storage/innobase/dict/dict0dict.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0dict.cc 2025-05-19 16:14:25.000000000 +0000 @@ -44,6 +44,7 @@ #include "btr0cur.h" #include "btr0sea.h" #include "buf0buf.h" +#include "buf0flu.h" #include "data0type.h" #include "dict0boot.h" #include "dict0load.h" @@ -195,71 +196,6 @@ return(FALSE); } -/** Decrement the count of open handles */ -void dict_table_close(dict_table_t *table) -{ - if (table->get_ref_count() == 1 && - dict_stats_is_persistent_enabled(table) && - strchr(table->name.m_name, '/')) - { - /* It looks like we are closing the last handle. The user could - have executed FLUSH TABLES in order to have the statistics reloaded - from the InnoDB persistent statistics tables. We must acquire - exclusive dict_sys.latch to prevent a race condition with another - thread concurrently acquiring a handle on the table. */ - dict_sys.lock(SRW_LOCK_CALL); - if (table->release()) - { - table->stats_mutex_lock(); - if (table->get_ref_count() == 0) - dict_stats_deinit(table); - table->stats_mutex_unlock(); - } - dict_sys.unlock(); - } - else - table->release(); -} - -/** Decrements the count of open handles of a table. 
-@param[in,out] table table -@param[in] dict_locked whether dict_sys.latch is being held -@param[in] thd thread to release MDL -@param[in] mdl metadata lock or NULL if the thread - is a foreground one. */ -void -dict_table_close( - dict_table_t* table, - bool dict_locked, - THD* thd, - MDL_ticket* mdl) -{ - if (!dict_locked) - dict_table_close(table); - else - { - if (table->release() && dict_stats_is_persistent_enabled(table) && - strchr(table->name.m_name, '/')) - { - /* Force persistent stats re-read upon next open of the table so - that FLUSH TABLE can be used to forcibly fetch stats from disk if - they have been manually modified. */ - table->stats_mutex_lock(); - if (table->get_ref_count() == 0) - dict_stats_deinit(table); - table->stats_mutex_unlock(); - } - - ut_ad(dict_lru_validate()); - ut_ad(dict_sys.find(table)); - } - - if (!thd || !mdl); - else if (MDL_context *mdl_context= static_cast - (thd_mdl_context(thd))) - mdl_context->release_lock(mdl); -} - /** Check if the table has a given (non_virtual) column. @param[in] table table object @param[in] col_name column name @@ -586,6 +522,14 @@ return(ULINT_UNDEFINED); } +void mdl_release(THD *thd, MDL_ticket *mdl) noexcept +{ + if (!thd || !mdl); + else if (MDL_context *mdl_context= static_cast + (thd_mdl_context(thd))) + mdl_context->release_lock(mdl); +} + /** Parse the table file name into table name and database name. 
@tparam dict_frozen whether the caller holds dict_sys.latch @param[in,out] db_name database name buffer @@ -694,32 +638,28 @@ MDL_context *mdl_context, MDL_ticket **mdl, dict_table_op_t table_op) { - table_id_t table_id= table->id; char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1]; char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1]; size_t db_len, tbl_len; - bool unaccessible= false; if (!table->parse_name(db_buf, tbl_buf, &db_len, &tbl_len)) /* The name of an intermediate table starts with #sql */ return table; retry: - if (!unaccessible && (!table->is_readable() || table->corrupted)) + ut_ad(!trylock == dict_sys.frozen()); + + if (!table->is_readable() || table->corrupted) { if (*mdl) { mdl_context->release_lock(*mdl); *mdl= nullptr; } - unaccessible= true; + return nullptr; } - if (!trylock) - table->release(); - - if (unaccessible) - return nullptr; + const table_id_t table_id{table->id}; if (!trylock) dict_sys.unfreeze(); @@ -748,11 +688,38 @@ } } + size_t db1_len, tbl1_len; +lookup: dict_sys.freeze(SRW_LOCK_CALL); table= dict_sys.find_table(table_id); if (table) - table->acquire(); - if (!table && table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED) + { + if (!table->is_accessible()) + { + table= nullptr; + unlock_and_return_without_mdl: + if (trylock) + dict_sys.unfreeze(); + return_without_mdl: + if (*mdl) + { + mdl_context->release_lock(*mdl); + *mdl= nullptr; + } + return table; + } + + if (trylock) + table->acquire(); + + if (!table->parse_name(db_buf1, tbl_buf1, &db1_len, &tbl1_len)) + { + /* The table was renamed to #sql prefix. + Release MDL (if any) for the old name and return. */ + goto unlock_and_return_without_mdl; + } + } + else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED) { dict_sys.unfreeze(); dict_sys.lock(SRW_LOCK_CALL); @@ -760,33 +727,19 @@ table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK : DICT_ERR_IGNORE_FK_NOKEY); - if (table) - table->acquire(); dict_sys.unlock(); - dict_sys.freeze(SRW_LOCK_CALL); - } - - if (!table || !table->is_accessible()) - { -return_without_mdl: - if (trylock) - dict_sys.unfreeze(); - if (*mdl) - { - mdl_context->release_lock(*mdl); - *mdl= nullptr; - } - return nullptr; - } - - size_t db1_len, tbl1_len; - - if (!table->parse_name(db_buf1, tbl_buf1, &db1_len, &tbl1_len)) - { - /* The table was renamed to #sql prefix. - Release MDL (if any) for the old name and return. */ + /* At this point, the freshly loaded table may already have been evicted. + We must look it up again while holding a shared dict_sys.latch. We keep + trying this until the table is found in the cache or it cannot be found + in the dictionary (because the table has been dropped or rebuilt). */ + if (table) + goto lookup; + if (!trylock) + dict_sys.freeze(SRW_LOCK_CALL); goto return_without_mdl; } + else + goto return_without_mdl; if (*mdl) { @@ -873,6 +826,7 @@ dict_table_op_t table_op, THD *thd, MDL_ticket **mdl) { +retry: if (!dict_locked) dict_sys.freeze(SRW_LOCK_CALL); @@ -880,9 +834,21 @@ if (table) { - table->acquire(); - if (thd && !dict_locked) - table= dict_acquire_mdl_shared(table, thd, mdl, table_op); + if (!dict_locked) + { + if (thd) + { + table= dict_acquire_mdl_shared(table, thd, mdl, table_op); + if (table) + goto acquire; + } + else + acquire: + table->acquire(); + dict_sys.unfreeze(); + } + else + table->acquire(); } else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED) { @@ -895,24 +861,16 @@ table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK : DICT_ERR_IGNORE_FK_NOKEY); - if (table) - table->acquire(); if (!dict_locked) { dict_sys.unlock(); - if (table && thd) - { - dict_sys.freeze(SRW_LOCK_CALL); - table= dict_acquire_mdl_shared(table, thd, mdl, table_op); - dict_sys.unfreeze(); - } - return table; + if (table) + goto retry; } + else if (table) + table->acquire(); } - if (!dict_locked) - dict_sys.unfreeze(); - return table; } @@ -975,7 +933,7 @@ UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU); UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU); - const ulint hash_size = buf_pool_get_curr_size() + const ulint hash_size = buf_pool.curr_pool_size() / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); table_hash.create(hash_size); @@ -1012,7 +970,10 @@ const ulong threshold= srv_fatal_semaphore_wait_threshold; if (waited >= threshold) + { + buf_pool.print_flush_info(); ib::fatal() << fatal_msg; + } if (waited > threshold / 4) ib::warn() << "A long wait (" << waited @@ -1129,6 +1090,55 @@ DBUG_RETURN(table); } +bool dict_stats::open(THD *thd) noexcept +{ + ut_ad(!mdl_table); + ut_ad(!mdl_index); + ut_ad(!table_stats); + ut_ad(!index_stats); + ut_ad(!mdl_context); + + mdl_context= static_cast(thd_mdl_context(thd)); + if (!mdl_context) + return true; + /* FIXME: use compatible type, and maybe remove this parameter altogether! 
*/ + const double timeout= double(global_system_variables.lock_wait_timeout); + MDL_request request; + MDL_REQUEST_INIT(&request, MDL_key::TABLE, "mysql", "innodb_table_stats", + MDL_SHARED, MDL_EXPLICIT); + if (UNIV_UNLIKELY(mdl_context->acquire_lock(&request, timeout))) + return true; + mdl_table= request.ticket; + MDL_REQUEST_INIT(&request, MDL_key::TABLE, "mysql", "innodb_index_stats", + MDL_SHARED, MDL_EXPLICIT); + if (UNIV_UNLIKELY(mdl_context->acquire_lock(&request, timeout))) + goto release_mdl; + mdl_index= request.ticket; + table_stats= dict_table_open_on_name("mysql/innodb_table_stats", false, + DICT_ERR_IGNORE_NONE); + if (!table_stats) + goto release_mdl; + index_stats= dict_table_open_on_name("mysql/innodb_index_stats", false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + return false; + + table_stats->release(); +release_mdl: + if (mdl_index) + mdl_context->release_lock(mdl_index); + mdl_context->release_lock(mdl_table); + return true; +} + +void dict_stats::close() noexcept +{ + table_stats->release(); + index_stats->release(); + mdl_context->release_lock(mdl_table); + mdl_context->release_lock(mdl_index); +} + /**********************************************************************//** Adds system columns to a table object. */ void @@ -4389,7 +4399,7 @@ table_id_hash.free(); temp_id_hash.free(); - const ulint hash_size = buf_pool_get_curr_size() + const ulint hash_size = buf_pool.curr_pool_size() / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); table_hash.create(hash_size); table_id_hash.create(hash_size); diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0load.cc mariadb-10.11.13/storage/innobase/dict/dict0load.cc --- mariadb-10.11.11/storage/innobase/dict/dict0load.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0load.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2513,10 +2513,12 @@ if (!table->is_readable()) { /* Don't attempt to load the indexes from disk. 
*/ } else if (err == DB_SUCCESS) { + auto i = fk_tables.size(); err = dict_load_foreigns(table->name.m_name, nullptr, 0, true, ignore_err, fk_tables); if (err != DB_SUCCESS) { + fk_tables.erase(fk_tables.begin() + i, fk_tables.end()); ib::warn() << "Load table " << table->name << " failed, the table has missing" " foreign key indexes. Turn off" diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0stats.cc mariadb-10.11.13/storage/innobase/dict/dict0stats.cc --- mariadb-10.11.11/storage/innobase/dict/dict0stats.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0stats.cc 2025-05-19 16:14:25.000000000 +0000 @@ -359,7 +359,7 @@ if (!table) { if (opt_bootstrap) - return DB_TABLE_NOT_FOUND; + return DB_STATS_DO_NOT_EXIST; if (req_schema == &table_stats_schema) { if (innodb_table_stats_not_found_reported) { return DB_STATS_DO_NOT_EXIST; @@ -377,10 +377,10 @@ snprintf(errstr, errstr_sz, "Table %s not found.", req_schema->table_name_sql); - return DB_TABLE_NOT_FOUND; + return DB_STATS_DO_NOT_EXIST; } - if (!table->is_readable() && !table->space) { + if (!table->is_readable() || !table->space) { /* missing tablespace */ snprintf(errstr, errstr_sz, "Tablespace for table %s is missing.", @@ -491,11 +491,8 @@ return DB_SUCCESS; } -/*********************************************************************//** -Checks whether the persistent statistics storage exists and that all -tables have the proper structure. 
-@return true if exists and all tables are ok */ -static bool dict_stats_persistent_storage_check(bool dict_already_locked) +dict_stats_schema_check +dict_stats_persistent_storage_check(bool dict_already_locked) noexcept { char errstr[512]; dberr_t ret; @@ -521,14 +518,14 @@ switch (ret) { case DB_SUCCESS: - return true; + return SCHEMA_OK; + case DB_STATS_DO_NOT_EXIST: + return SCHEMA_NOT_EXIST; default: if (!opt_bootstrap) { - ib::error() << errstr; + sql_print_error("InnoDB: %s", errstr); } - /* fall through */ - case DB_STATS_DO_NOT_EXIST: - return false; + return SCHEMA_INVALID; } } @@ -544,13 +541,16 @@ { ut_ad(dict_sys.locked()); - if (!dict_stats_persistent_storage_check(true)) - { - pars_info_free(pinfo); - return DB_STATS_DO_NOT_EXIST; + switch (dict_stats_persistent_storage_check(true)) { + case SCHEMA_OK: + return que_eval_sql(pinfo, sql, trx); + case SCHEMA_INVALID: + case SCHEMA_NOT_EXIST: + break; } - return que_eval_sql(pinfo, sql, trx); + pars_info_free(pinfo); + return DB_STATS_DO_NOT_EXIST; } @@ -599,7 +599,7 @@ table->stat_clustered_index_size = 1; /* 1 page for each index, not counting the clustered */ table->stat_sum_of_other_index_sizes - = UT_LIST_GET_LEN(table->indexes) - 1; + = uint32_t(UT_LIST_GET_LEN(table->indexes) - 1); table->stat_modified_counter = 0; dict_index_t* index; @@ -617,7 +617,7 @@ dict_stats_empty_index(index, empty_defrag_stats); } - table->stat_initialized = TRUE; + table->stat = table->stat | dict_table_t::STATS_INITIALIZED; table->stats_mutex_unlock(); } @@ -658,16 +658,10 @@ /*==========================*/ const dict_table_t* table) /*!< in: table */ { - ut_a(table->stat_initialized); - MEM_CHECK_DEFINED(&table->stats_last_recalc, sizeof table->stats_last_recalc); - MEM_CHECK_DEFINED(&table->stat_persistent, - sizeof table->stat_persistent); - - MEM_CHECK_DEFINED(&table->stats_auto_recalc, - sizeof table->stats_auto_recalc); + MEM_CHECK_DEFINED(&table->stat, sizeof table->stat); 
MEM_CHECK_DEFINED(&table->stats_sample_pages, sizeof table->stats_sample_pages); @@ -844,8 +838,8 @@ ulint n_cols; ib_uint64_t* n_diff; ib_uint64_t* n_not_null; - ibool stats_null_not_equal; - uintmax_t n_sample_pages=1; /* number of pages to sample */ + bool stats_null_not_equal; + uint32_t n_sample_pages=1; /* number of pages to sample */ ulint not_empty_flag = 0; ulint total_external_size = 0; uintmax_t add_on; @@ -883,11 +877,11 @@ case SRV_STATS_NULLS_UNEQUAL: /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL case, we will treat NULLs as unequal value */ - stats_null_not_equal = TRUE; + stats_null_not_equal = true; break; case SRV_STATS_NULLS_EQUAL: - stats_null_not_equal = FALSE; + stats_null_not_equal = false; break; default: @@ -938,19 +932,21 @@ so taking all case2 paths is I, our expression is: n_pages = S < I? min(I,L) : I - */ - if (index->stat_index_size > 1) { - n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) - ? ut_min(index->stat_index_size, - static_cast( - log2(double(index->stat_index_size)) - * double(srv_stats_transient_sample_pages))) - : index->stat_index_size; + */ + if (uint32_t I = index->stat_index_size) { + const uint32_t S{srv_stats_transient_sample_pages}; + n_sample_pages = S < I + ? std::min(I, + uint32_t(log2(double(I)) + * double(S))) + : I; } } /* Sanity check */ - ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size)); + ut_ad(n_sample_pages); + ut_ad(n_sample_pages <= (index->stat_index_size <= 1 + ? 
1 : index->stat_index_size)); /* We sample some pages in the index to get an estimate */ btr_cur_t cursor; @@ -1169,7 +1165,7 @@ mtr.x_lock_space(index->table->space); - ulint dummy, size; + uint32_t dummy, size; index->stat_index_size = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF @@ -1209,24 +1205,12 @@ return err; } -/*********************************************************************//** -Calculates new estimates for table and index statistics. This function -is relatively quick and is used to calculate transient statistics that -are not saved on disk. -This was the only way to calculate statistics before the -Persistent Statistics feature was introduced. -@return error code -@retval DB_SUCCESS_LOCKED REC if the table under bulk insert operation */ -static -dberr_t -dict_stats_update_transient( -/*========================*/ - dict_table_t* table) /*!< in/out: table */ +dberr_t dict_stats_update_transient(dict_table_t *table) noexcept { ut_ad(!table->stats_mutex_is_owner()); dict_index_t* index; - ulint sum_of_index_sizes = 0; + uint32_t sum_of_index_sizes = 0; dberr_t err = DB_SUCCESS; /* Find out the sizes of the indexes and how many different values @@ -1234,17 +1218,16 @@ index = dict_table_get_first_index(table); - if (!table->space) { - /* Nothing to do. */ -empty_table: + if (!index || !table->space) { dict_stats_empty_table(table, true); - return err; - } else if (index == NULL) { - /* Table definition is corrupt */ + return DB_SUCCESS; + } - ib::warn() << "Table " << table->name - << " has no indexes. 
Cannot calculate statistics."; - goto empty_table; + if (trx_id_t bulk_trx_id = table->bulk_trx_id) { + if (trx_sys.find(nullptr, bulk_trx_id, false)) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } } for (; index != NULL; index = dict_table_get_next_index(index)) { @@ -1285,7 +1268,7 @@ table->stat_modified_counter = 0; - table->stat_initialized = TRUE; + table->stat = table->stat | dict_table_t::STATS_INITIALIZED; table->stats_mutex_unlock(); @@ -2225,8 +2208,8 @@ struct index_stats_t { std::vector stats; - ulint index_size; - ulint n_leaf_pages; + uint32_t index_size; + uint32_t n_leaf_pages; index_stats_t(ulint n_uniq) : index_size(1), n_leaf_pages(1) { @@ -2365,7 +2348,7 @@ uint16_t root_level = btr_page_get_level(root->page.frame); mtr.x_lock_space(index->table->space); - ulint dummy, size; + uint32_t dummy, size; result.index_size = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + root->page.frame, &size, &mtr) @@ -2635,17 +2618,7 @@ DBUG_RETURN(result); } -/*********************************************************************//** -Calculates new estimates for table and index statistics. This function -is relatively slow and is used to calculate persistent statistics that -will be saved on disk. 
-@return DB_SUCCESS or error code -@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ -static -dberr_t -dict_stats_update_persistent( -/*=========================*/ - dict_table_t* table) /*!< in/out: table */ +dberr_t dict_stats_update_persistent(dict_table_t *table) noexcept { dict_index_t* index; @@ -2653,6 +2626,13 @@ DEBUG_SYNC_C("dict_stats_update_persistent"); + if (trx_id_t bulk_trx_id = table->bulk_trx_id) { + if (trx_sys.find(nullptr, bulk_trx_id, false)) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + } + /* analyze the clustered index first */ index = dict_table_get_first_index(table); @@ -2742,7 +2722,7 @@ table->stat_modified_counter = 0; - table->stat_initialized = TRUE; + table->stat = table->stat | dict_table_t::STATS_INITIALIZED; dict_stats_assert_initialized(table); @@ -2751,6 +2731,18 @@ return(DB_SUCCESS); } +dberr_t dict_stats_update_persistent_try(dict_table_t *table) +{ + if (table->stats_is_persistent() && + dict_stats_persistent_storage_check(false) == SCHEMA_OK) + { + if (dberr_t err= dict_stats_update_persistent(table)) + return err; + return dict_stats_save(table); + } + return DB_SUCCESS; +} + #include "mysql_com.h" /** Save an individual index's statistic into the persistent statistics storage. @@ -2829,14 +2821,14 @@ "END;", trx); if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { - if (innodb_index_stats_not_found == false && - index->stats_error_printed == false) { + if (innodb_index_stats_not_found == false + && !index->table->stats_error_printed) { + index->table->stats_error_printed = true; ib::error() << "Cannot save index statistics for table " << index->table->name << ", index " << index->name << ", stat name \"" << stat_name << "\": " << ret; - index->stats_error_printed = true; } } @@ -2878,27 +2870,29 @@ return err; } -/** Save the table's statistics into the persistent statistics storage. 
-@param[in] table table whose stats to save -@param[in] only_for_index if this is non-NULL, then stats for indexes -that are not equal to it will not be saved, if NULL, then all indexes' stats -are saved +/** Save the persistent statistics of a table or an index. +@param table table whose stats to save +@param only_for_index the index ID to save statistics for (0=all) @return DB_SUCCESS or error code */ -static -dberr_t -dict_stats_save( - dict_table_t* table, - const index_id_t* only_for_index) +dberr_t dict_stats_save(dict_table_t* table, index_id_t index_id) { pars_info_t* pinfo; char db_utf8[MAX_DB_UTF8_LEN]; char table_utf8[MAX_TABLE_UTF8_LEN]; + THD* const thd = current_thd; #ifdef ENABLED_DEBUG_SYNC DBUG_EXECUTE_IF("dict_stats_save_exit_notify", + SCOPE_EXIT([thd] { + debug_sync_set_action(thd, + STRING_WITH_LEN("now SIGNAL dict_stats_save_finished")); + }); + ); + DBUG_EXECUTE_IF("dict_stats_save_exit_notify_and_wait", SCOPE_EXIT([] { debug_sync_set_action(current_thd, - STRING_WITH_LEN("now SIGNAL dict_stats_save_finished")); + STRING_WITH_LEN("now SIGNAL dict_stats_save_finished" + " WAIT_FOR dict_stats_save_unblock")); }); ); #endif /* ENABLED_DEBUG_SYNC */ @@ -2911,41 +2905,10 @@ return (dict_stats_report_error(table)); } - THD* thd = current_thd; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; - dict_table_t* table_stats = dict_table_open_on_name( - TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared(table_stats, thd, - &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats - || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { -release_and_exit: - if (table_stats) { - dict_table_close(table_stats, false, thd, mdl_table); - } + dict_stats stats; + if (stats.open(thd)) { return DB_STATS_DO_NOT_EXIST; } - - dict_table_t* index_stats = dict_table_open_on_name( - INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (index_stats) { - 
dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared(index_stats, thd, - &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) { - goto release_and_exit; - } - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), table_utf8, sizeof(table_utf8)); const time_t now = time(NULL); @@ -2954,9 +2917,9 @@ trx_start_internal(trx); dberr_t ret = trx->read_only ? DB_READ_ONLY - : lock_table_for_trx(table_stats, trx, LOCK_X); + : lock_table_for_trx(stats.table(), trx, LOCK_X); if (ret == DB_SUCCESS) { - ret = lock_table_for_trx(index_stats, trx, LOCK_X); + ret = lock_table_for_trx(stats.index(), trx, LOCK_X); } if (ret != DB_SUCCESS) { if (trx->state != TRX_STATE_NOT_STARTED) { @@ -3002,8 +2965,14 @@ "END;", trx); if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { - ib::error() << "Cannot save table statistics for table " - << table->name << ": " << ret; + sql_print_error("InnoDB: Cannot save table statistics for" +#ifdef EMBEDDED_LIBRARY + " table %.*s.%s: %s", +#else + " table %`.*s.%`s: %s", +#endif + int(table->name.dblen()), table->name.m_name, + table->name.basename(), ut_strerr(ret)); rollback_and_exit: trx->rollback(); free_and_exit: @@ -3011,8 +2980,7 @@ dict_sys.unlock(); unlocked_free_and_exit: trx->free(); - dict_table_close(table_stats, false, thd, mdl_table); - dict_table_close(index_stats, false, thd, mdl_index); + stats.close(); return ret; } @@ -3046,7 +3014,7 @@ index = it->second; - if (only_for_index != NULL && index->id != *only_for_index) { + if (index_id != 0 && index->id != index_id) { continue; } @@ -3116,6 +3084,14 @@ goto free_and_exit; } +void dict_stats_empty_table_and_save(dict_table_t *table) +{ + dict_stats_empty_table(table, true); + if (table->stats_is_persistent() && + dict_stats_persistent_storage_check(false) == SCHEMA_OK) + dict_stats_save(table); +} + 
/*********************************************************************//** Called for the row that is selected by SELECT ... FROM mysql.innodb_table_stats WHERE table='...' @@ -3164,8 +3140,7 @@ ut_a(len == 8); table->stat_clustered_index_size - = std::max( - (ulint) mach_read_from_8(data), 1); + = std::max(mach_read_from_4(data + 4), 1U); break; } @@ -3174,18 +3149,9 @@ ut_a(dtype_get_mtype(type) == DATA_INT); ut_a(len == 8); - ulint stat_other_idx_size - = (ulint) mach_read_from_8(data); - if (!stat_other_idx_size - && UT_LIST_GET_LEN(table->indexes) > 1) { - stat_other_idx_size - = UT_LIST_GET_LEN(table->indexes) - 1; - } - table->stat_sum_of_other_index_sizes - = std::max( - (ulint) mach_read_from_8(data), - UT_LIST_GET_LEN(table->indexes) - 1); - + table->stat_sum_of_other_index_sizes = std::max( + mach_read_from_4(data + 4), + uint32_t(UT_LIST_GET_LEN(table->indexes) - 1)); break; } default: @@ -3370,14 +3336,12 @@ if (stat_name_len == 4 /* strlen("size") */ && strncasecmp("size", stat_name, stat_name_len) == 0) { - index->stat_index_size - = std::max((ulint) stat_value, 1); + index->stat_index_size = std::max(uint32_t(stat_value), 1U); arg->stats_were_modified = true; } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */ && strncasecmp("n_leaf_pages", stat_name, stat_name_len) == 0) { - index->stat_n_leaf_pages - = std::max((ulint) stat_value, 1); + index->stat_n_leaf_pages = std::max(uint32_t(stat_value), 1U); arg->stats_were_modified = true; } else if (stat_name_len == 12 /* strlen("n_page_split") */ && strncasecmp("n_page_split", stat_name, stat_name_len) @@ -3477,19 +3441,11 @@ return(TRUE); } -/*********************************************************************//** -Read table's statistics from the persistent statistics storage. -@return DB_SUCCESS or error code */ -static -dberr_t -dict_stats_fetch_from_ps( -/*=====================*/ - dict_table_t* table) /*!< in/out: table */ +/** Read the stored persistent statistics of a table. 
*/ +dberr_t dict_stats_fetch_from_ps(dict_table_t *table) { index_fetch_t index_fetch_arg; - trx_t* trx; pars_info_t* pinfo; - dberr_t ret; char db_utf8[MAX_DB_UTF8_LEN]; char table_utf8[MAX_TABLE_UTF8_LEN]; @@ -3499,49 +3455,16 @@ stats. */ dict_stats_empty_table(table, true); - THD* thd = current_thd; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; - dict_table_t* table_stats = dict_table_open_on_name( - TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared(table_stats, thd, - &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats - || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { -release_and_exit: - if (table_stats) { - dict_table_close(table_stats, false, thd, mdl_table); - } + THD* const thd = current_thd; + dict_stats stats; + if (stats.open(thd)) { return DB_STATS_DO_NOT_EXIST; } - dict_table_t* index_stats = dict_table_open_on_name( - INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (index_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared(index_stats, thd, - &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) { - goto release_and_exit; - } - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - #ifdef ENABLED_DEBUG_SYNC DEBUG_SYNC(thd, "dict_stats_mdl_acquired"); #endif /* ENABLED_DEBUG_SYNC */ - trx = trx_create(); - - trx_start_internal_read_only(trx); - dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), table_utf8, sizeof(table_utf8)); @@ -3562,76 +3485,77 @@ "fetch_index_stats_step", dict_stats_fetch_index_stats_step, &index_fetch_arg); - dict_sys.lock(SRW_LOCK_CALL); /* FIXME: remove this */ - ret = que_eval_sql(pinfo, - "PROCEDURE FETCH_STATS () IS\n" - "found INT;\n" - "DECLARE FUNCTION fetch_table_stats_step;\n" - "DECLARE FUNCTION fetch_index_stats_step;\n" - "DECLARE CURSOR table_stats_cur IS\n" - 
" SELECT\n" - /* if you change the selected fields, be - sure to adjust - dict_stats_fetch_table_stats_step() */ - " n_rows,\n" - " clustered_index_size,\n" - " sum_of_other_index_sizes\n" - " FROM \"" TABLE_STATS_NAME "\"\n" - " WHERE\n" - " database_name = :database_name AND\n" - " table_name = :table_name;\n" - "DECLARE CURSOR index_stats_cur IS\n" - " SELECT\n" - /* if you change the selected fields, be - sure to adjust - dict_stats_fetch_index_stats_step() */ - " index_name,\n" - " stat_name,\n" - " stat_value,\n" - " sample_size\n" - " FROM \"" INDEX_STATS_NAME "\"\n" - " WHERE\n" - " database_name = :database_name AND\n" - " table_name = :table_name;\n" - - "BEGIN\n" - - "OPEN table_stats_cur;\n" - "FETCH table_stats_cur INTO\n" - " fetch_table_stats_step();\n" - "IF (SQL % NOTFOUND) THEN\n" - " CLOSE table_stats_cur;\n" - " RETURN;\n" - "END IF;\n" - "CLOSE table_stats_cur;\n" - - "OPEN index_stats_cur;\n" - "found := 1;\n" - "WHILE found = 1 LOOP\n" - " FETCH index_stats_cur INTO\n" - " fetch_index_stats_step();\n" - " IF (SQL % NOTFOUND) THEN\n" - " found := 0;\n" - " END IF;\n" - "END LOOP;\n" - "CLOSE index_stats_cur;\n" + dict_sys.lock(SRW_LOCK_CALL); + que_t* graph = pars_sql( + pinfo, + "PROCEDURE FETCH_STATS () IS\n" + "found INT;\n" + "DECLARE FUNCTION fetch_table_stats_step;\n" + "DECLARE FUNCTION fetch_index_stats_step;\n" + "DECLARE CURSOR table_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_table_stats_step() */ + " n_rows,\n" + " clustered_index_size,\n" + " sum_of_other_index_sizes\n" + " FROM \"" TABLE_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + "DECLARE CURSOR index_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_index_stats_step() */ + " index_name,\n" + " stat_name,\n" + " stat_value,\n" + " sample_size\n" + " FROM \"" INDEX_STATS_NAME "\"\n" + " WHERE\n" 
+ " database_name = :database_name AND\n" + " table_name = :table_name;\n" + + "BEGIN\n" + + "OPEN table_stats_cur;\n" + "FETCH table_stats_cur INTO\n" + " fetch_table_stats_step();\n" + "IF (SQL % NOTFOUND) THEN\n" + " CLOSE table_stats_cur;\n" + " RETURN;\n" + "END IF;\n" + "CLOSE table_stats_cur;\n" + + "OPEN index_stats_cur;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_stats_cur INTO\n" + " fetch_index_stats_step();\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_stats_cur;\n" - "END;", trx); - /* pinfo is freed by que_eval_sql() */ + "END;"); dict_sys.unlock(); - dict_table_close(table_stats, false, thd, mdl_table); - dict_table_close(index_stats, false, thd, mdl_index); + trx_t* trx = trx_create(); + trx->graph = nullptr; + graph->trx = trx; + trx_start_internal_read_only(trx); + que_run_threads(que_fork_start_command(graph)); + que_graph_free(graph); trx_commit_for_mysql(trx); - + dberr_t ret = index_fetch_arg.stats_were_modified + ? 
trx->error_state : DB_STATS_DO_NOT_EXIST; trx->free(); - - if (!index_fetch_arg.stats_were_modified) { - return(DB_STATS_DO_NOT_EXIST); - } - - return(ret); + stats.close(); + return ret; } /*********************************************************************//** @@ -3641,250 +3565,46 @@ /*========================*/ dict_index_t* index) /*!< in/out: index */ { - DBUG_ENTER("dict_stats_update_for_index"); - - if (dict_stats_is_persistent_enabled(index->table)) { - - if (dict_stats_persistent_storage_check(false)) { - index_stats_t stats = dict_stats_analyze_index(index); - index->table->stats_mutex_lock(); - index->stat_index_size = stats.index_size; - index->stat_n_leaf_pages = stats.n_leaf_pages; - for (size_t i = 0; i < stats.stats.size(); ++i) { - index->stat_n_diff_key_vals[i] - = stats.stats[i].n_diff_key_vals; - index->stat_n_sample_sizes[i] - = stats.stats[i].n_sample_sizes; - index->stat_n_non_null_key_vals[i] - = stats.stats[i].n_non_null_key_vals; - } - index->table->stat_sum_of_other_index_sizes - += index->stat_index_size; - index->table->stats_mutex_unlock(); - - dict_stats_save(index->table, &index->id); - DBUG_VOID_RETURN; - } - /* else */ - - if (innodb_index_stats_not_found == false && - index->stats_error_printed == false) { - /* Fall back to transient stats since the persistent - storage is not present or is corrupted */ - - ib::info() << "Recalculation of persistent statistics" - " requested for table " << index->table->name - << " index " << index->name - << " but the required" - " persistent statistics storage is not present or is" - " corrupted. Using transient stats instead."; - index->stats_error_printed = false; - } - } - - dict_stats_update_transient_for_index(index); - - DBUG_VOID_RETURN; -} - -/*********************************************************************//** -Calculates new estimates for table and index statistics. The statistics -are used in query optimization. 
-@return DB_SUCCESS or error code -@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ -dberr_t -dict_stats_update( -/*==============*/ - dict_table_t* table, /*!< in/out: table */ - dict_stats_upd_option_t stats_upd_option) - /*!< in: whether to (re) calc - the stats or to fetch them from - the persistent statistics - storage */ -{ - ut_ad(!table->stats_mutex_is_owner()); - - if (!table->is_readable()) { - return (dict_stats_report_error(table)); - } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { - /* If we have set a high innodb_force_recovery level, do - not calculate statistics, as a badly corrupted index can - cause a crash in it. */ - dict_stats_empty_table(table, false); - return(DB_SUCCESS); - } - - if (trx_id_t bulk_trx_id = table->bulk_trx_id) { - if (trx_sys.find(nullptr, bulk_trx_id, false)) { - dict_stats_empty_table(table, false); - return DB_SUCCESS_LOCKED_REC; - } - } - - switch (stats_upd_option) { - case DICT_STATS_RECALC_PERSISTENT: - - if (srv_read_only_mode) { - goto transient; - } - - /* Persistent recalculation requested, called from - 1) ANALYZE TABLE, or - 2) the auto recalculation background thread, or - 3) open table if stats do not exist on disk and auto recalc - is enabled */ - - /* InnoDB internal tables (e.g. 
SYS_TABLES) cannot have - persistent stats enabled */ - ut_a(strchr(table->name.m_name, '/') != NULL); - - /* check if the persistent statistics storage exists - before calling the potentially slow function - dict_stats_update_persistent(); that is a - prerequisite for dict_stats_save() succeeding */ - if (dict_stats_persistent_storage_check(false)) { - - dberr_t err; - - err = dict_stats_update_persistent(table); - - if (err != DB_SUCCESS) { - return(err); - } - - err = dict_stats_save(table, NULL); - - return(err); - } + dict_table_t *const table= index->table; + ut_ad(table->stat_initialized()); - /* Fall back to transient stats since the persistent - storage is not present or is corrupted */ - - if (innodb_table_stats_not_found == false && - table->stats_error_printed == false) { - ib::warn() << "Recalculation of persistent statistics" - " requested for table " - << table->name - << " but the required persistent" - " statistics storage is not present or is corrupted." - " Using transient stats instead."; - table->stats_error_printed = true; - } - - goto transient; - - case DICT_STATS_RECALC_TRANSIENT: - - goto transient; - - case DICT_STATS_EMPTY_TABLE: - - dict_stats_empty_table(table, true); - - /* If table is using persistent stats, - then save the stats on disk */ - - if (dict_stats_is_persistent_enabled(table)) { - - if (dict_stats_persistent_storage_check(false)) { - - return(dict_stats_save(table, NULL)); - } - - return(DB_STATS_DO_NOT_EXIST); - } - - return(DB_SUCCESS); - - case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: - - /* fetch requested, either fetch from persistent statistics - storage or use the old method */ - - if (table->stat_initialized) { - return(DB_SUCCESS); - } - - /* InnoDB internal tables (e.g. 
SYS_TABLES) cannot have - persistent stats enabled */ - ut_a(strchr(table->name.m_name, '/') != NULL); - - if (!dict_stats_persistent_storage_check(false)) { - /* persistent statistics storage does not exist - or is corrupted, calculate the transient stats */ - - if (innodb_table_stats_not_found == false && - table->stats_error_printed == false && - !opt_bootstrap) { - ib::error() << "Fetch of persistent statistics" - " requested for table " - << table->name - << " but the required system tables " - << TABLE_STATS_NAME_PRINT - << " and " << INDEX_STATS_NAME_PRINT - << " are not present or have unexpected" - " structure. Using transient stats instead."; - table->stats_error_printed = true; - } - - goto transient; - } - - dberr_t err = dict_stats_fetch_from_ps(table); - - switch (err) { - case DB_SUCCESS: - return(DB_SUCCESS); - case DB_STATS_DO_NOT_EXIST: - - if (srv_read_only_mode) { - goto transient; - } -#ifdef WITH_WSREP - if (wsrep_thd_skip_locking(current_thd)) { - goto transient; - } + if (table->stats_is_persistent()) + switch (dict_stats_persistent_storage_check(false)) { + case SCHEMA_NOT_EXIST: + break; + case SCHEMA_INVALID: + if (table->stats_error_printed) + break; + table->stats_error_printed= true; + sql_print_information("InnoDB: Recalculation of persistent statistics" +#ifdef EMBEDDED_LIBRARY + " requested for table %.*s.%s index %s but" +#else + " requested for table %`.*s.%`s index %`s but" #endif - if (dict_stats_auto_recalc_is_enabled(table)) { - return(dict_stats_update( - table, - DICT_STATS_RECALC_PERSISTENT)); - } - - ib::info() << "Trying to use table " << table->name - << " which has persistent statistics enabled," - " but auto recalculation turned off and the" - " statistics do not exist in " - TABLE_STATS_NAME_PRINT - " and " INDEX_STATS_NAME_PRINT - ". Please either run \"ANALYZE TABLE " - << table->name << ";\" manually or enable the" - " auto recalculation with \"ALTER TABLE " - << table->name << " STATS_AUTO_RECALC=1;\"." 
- " InnoDB will now use transient statistics for " - << table->name << "."; - - goto transient; - default: - - if (innodb_table_stats_not_found == false && - table->stats_error_printed == false) { - ib::error() << "Error fetching persistent statistics" - " for table " - << table->name - << " from " TABLE_STATS_NAME_PRINT " and " - INDEX_STATS_NAME_PRINT ": " << err - << ". Using transient stats method instead."; - } - - goto transient; - } - /* no "default:" in order to produce a compilation warning - about unhandled enumeration value */ - } + " the required persistent statistics storage" + " is corrupted. Using transient stats instead.", + int(table->name.dblen()), table->name.m_name, + table->name.basename(), index->name()); + break; + case SCHEMA_OK: + index_stats_t stats{dict_stats_analyze_index(index)}; + table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) + { + index->stat_n_diff_key_vals[i]= stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i]= stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i]= stats.stats[i].n_non_null_key_vals; + } + table->stat_sum_of_other_index_sizes+= index->stat_index_size; + table->stats_mutex_unlock(); + dict_stats_save(table, index->id); + return; + } -transient: - return dict_stats_update_transient(table); + dict_stats_update_transient_for_index(index); } /** Execute DELETE FROM mysql.innodb_table_stats @@ -4034,7 +3754,7 @@ const char *old_name, const char *new_name, trx_t *trx) { - if (!dict_stats_persistent_storage_check(true)) + if (dict_stats_persistent_storage_check(true) != SCHEMA_OK) return DB_STATS_DO_NOT_EXIST; pars_info_t *pinfo= pars_info_create(); @@ -4170,7 +3890,7 @@ index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; - ret = dict_stats_save(&table, NULL); + ret = dict_stats_save(&table); ut_a(ret 
== DB_SUCCESS); diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0stats_bg.cc mariadb-10.11.13/storage/innobase/dict/dict0stats_bg.cc --- mariadb-10.11.11/storage/innobase/dict/dict0stats_bg.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0stats_bg.cc 2025-05-19 16:14:25.000000000 +0000 @@ -135,7 +135,9 @@ void dict_stats_update_if_needed_func(dict_table_t *table) #endif { - if (UNIV_UNLIKELY(!table->stat_initialized)) { + uint32_t stat{table->stat}; + + if (UNIV_UNLIKELY(!table->stat_initialized(stat))) { /* The table may have been evicted from dict_sys and reloaded internally by InnoDB for FOREIGN KEY processing, but not reloaded by the SQL layer. @@ -154,13 +156,9 @@ ulonglong counter = table->stat_modified_counter++; ulonglong n_rows = dict_table_get_n_rows(table); - if (dict_stats_is_persistent_enabled(table)) { - if (table->name.is_temporary()) { - return; - } - if (counter > n_rows / 10 /* 10% */ - && dict_stats_auto_recalc_is_enabled(table)) { - + if (table->stats_is_persistent(stat)) { + if (table->stats_is_auto_recalc(stat) + && counter > n_rows / 10 && !table->name.is_temporary()) { #ifdef WITH_WSREP /* Do not add table to background statistic calculation if this thread is not a @@ -203,7 +201,7 @@ if (counter > threshold) { /* this will reset table->stat_modified_counter to 0 */ - dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT); + dict_stats_update_transient(table); } } @@ -331,7 +329,7 @@ if (!mdl || !table->is_accessible()) { - dict_table_close(table, false, thd, mdl); + dict_table_close(table, thd, mdl); goto invalid_table_id; } @@ -345,10 +343,10 @@ difftime(time(nullptr), table->stats_last_recalc) >= MIN_RECALC_INTERVAL; const dberr_t err= update_now - ? dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT) + ? 
dict_stats_update_persistent_try(table) : DB_SUCCESS_LOCKED_REC; - dict_table_close(table, false, thd, mdl); + dict_table_close(table, thd, mdl); mysql_mutex_lock(&recalc_pool_mutex); auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(), diff -Nru mariadb-10.11.11/storage/innobase/fsp/fsp0fsp.cc mariadb-10.11.13/storage/innobase/fsp/fsp0fsp.cc --- mariadb-10.11.11/storage/innobase/fsp/fsp0fsp.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fsp/fsp0fsp.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1644,12 +1644,11 @@ /** Calculate reserved fragment page slots. @param inode file segment index @return number of fragment pages */ -static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode) +static uint32_t fseg_get_n_frag_pages(const fseg_inode_t *inode) noexcept { - ulint i; - ulint count = 0; + uint32_t count = 0; - for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + for (ulint i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) { count++; } @@ -1794,21 +1793,24 @@ currently used. 
@return number of reserved pages */ static -ulint +uint32_t fseg_n_reserved_pages_low( /*======================*/ const fseg_inode_t* inode, /*!< in: segment inode */ - ulint* used) /*!< out: number of pages used (not + uint32_t* used) /*!< out: number of pages used (not more than reserved) */ + noexcept { + const uint32_t extent_size = FSP_EXTENT_SIZE; + *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL) + + extent_size * flst_get_len(inode + FSEG_FULL) + fseg_get_n_frag_pages(inode); return fseg_get_n_frag_pages(inode) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL); + + extent_size * flst_get_len(inode + FSEG_FREE) + + extent_size * flst_get_len(inode + FSEG_NOT_FULL) + + extent_size * flst_get_len(inode + FSEG_FULL); } /** Calculate the number of pages reserved by a segment, @@ -1818,9 +1820,9 @@ @param[out] used number of pages that are used (not more than reserved) @param[in,out] mtr mini-transaction @return number of reserved pages */ -ulint fseg_n_reserved_pages(const buf_block_t &block, - const fseg_header_t *header, ulint *used, - mtr_t *mtr) +uint32_t fseg_n_reserved_pages(const buf_block_t &block, + const fseg_header_t *header, uint32_t *used, + mtr_t *mtr) noexcept { ut_ad(page_align(header) == block.page.frame); buf_block_t *iblock; @@ -1845,7 +1847,7 @@ buf_block_t *iblock, fil_space_t *space, uint32_t hint, mtr_t *mtr) { - ulint used; + uint32_t used; ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); ut_d(space->modify_check(*mtr)); @@ -1996,8 +1998,7 @@ dberr_t* err) { ib_id_t seg_id; - ulint used; - ulint reserved; + uint32_t used, reserved; xdes_t* descr; /*!< extent of the hinted page */ uint32_t ret_page; /*!< the allocated page offset, FIL_NULL if could not be allocated */ diff -Nru mariadb-10.11.11/storage/innobase/fts/fts0config.cc 
mariadb-10.11.13/storage/innobase/fts/fts0config.cc --- mariadb-10.11.11/storage/innobase/fts/fts0config.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fts/fts0config.cc 2025-05-19 16:14:25.000000000 +0000 @@ -231,7 +231,7 @@ n_rows_updated = trx->undo_no - undo_no; /* Check if we need to do an insert. */ - if (n_rows_updated == 0) { + if (error == DB_SUCCESS && n_rows_updated == 0) { info = pars_info_create(); pars_info_bind_varchar_literal( diff -Nru mariadb-10.11.11/storage/innobase/fts/fts0fts.cc mariadb-10.11.13/storage/innobase/fts/fts0fts.cc --- mariadb-10.11.11/storage/innobase/fts/fts0fts.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fts/fts0fts.cc 2025-05-19 16:14:25.000000000 +0000 @@ -37,6 +37,7 @@ #include "fts0plugin.h" #include "dict0stats.h" #include "btr0pcur.h" +#include "log.h" static const ulint FTS_MAX_ID_LEN = 32; @@ -1870,8 +1871,10 @@ } } - ib::warn() << "Failed to create FTS common table " << fts_table_name; - trx->error_state = error; + ut_ad(trx->state == TRX_STATE_NOT_STARTED + || trx->error_state == error); + sql_print_warning("InnoDB: Failed to create FTS common table %s: %s", + fts_table_name, ut_strerr(error)); return NULL; } @@ -2055,8 +2058,10 @@ } } - ib::warn() << "Failed to create FTS index table " << table_name; - trx->error_state = error; + ut_ad(trx->state == TRX_STATE_NOT_STARTED + || trx->error_state == error); + sql_print_warning("InnoDB: Failed to create FTS index table %s: %s", + table_name, ut_strerr(error)); return NULL; } diff -Nru mariadb-10.11.11/storage/innobase/fts/fts0opt.cc mariadb-10.11.13/storage/innobase/fts/fts0opt.cc --- mariadb-10.11.11/storage/innobase/fts/fts0opt.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fts/fts0opt.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2809,7 +2809,7 @@ std::this_thread::sleep_for(std::chrono::seconds(6));); if (mdl_ticket) - dict_table_close(sync_table, false, fts_opt_thd, mdl_ticket); 
+ dict_table_close(sync_table, fts_opt_thd, mdl_ticket); } /**********************************************************************//** diff -Nru mariadb-10.11.11/storage/innobase/gis/gis0sea.cc mariadb-10.11.13/storage/innobase/gis/gis0sea.cc --- mariadb-10.11.11/storage/innobase/gis/gis0sea.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/gis/gis0sea.cc 2025-05-19 16:14:25.000000000 +0000 @@ -504,10 +504,10 @@ rtr_rec_t rec; rec = rtr_info->matches->matched_recs->back(); rtr_info->matches->matched_recs->pop_back(); + cursor->btr_cur.page_cur.block = rtr_info->matches->block; mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); cursor->btr_cur.page_cur.rec = rec.r_rec; - cursor->btr_cur.page_cur.block = rtr_info->matches->block; DEBUG_SYNC_C("rtr_pcur_move_to_next_return"); return(true); @@ -1565,7 +1565,10 @@ if (auto matches = rtr_info->matches) { mysql_mutex_lock(&matches->rtr_match_mutex); - if (matches->block->page.id() == id) { + /* matches->block could be nullptr when cursor + encounters empty table */ + if (rtr_info->matches->block + && matches->block->page.id() == id) { matches->matched_recs->clear(); matches->valid = false; } @@ -2201,6 +2204,15 @@ ut_ad(orig_mode != PAGE_CUR_RTREE_LOCATE); + /* Collect matched records on page */ + offsets = rec_get_offsets( + rec, index, offsets, + index->n_fields, + ULINT_UNDEFINED, &heap); + + mysql_mutex_lock( + &rtr_info->matches->rtr_match_mutex); + if (!match_init) { rtr_init_match( rtr_info->matches, @@ -2208,14 +2220,12 @@ match_init = true; } - /* Collect matched records on page */ - offsets = rec_get_offsets( - rec, index, offsets, - index->n_fields, - ULINT_UNDEFINED, &heap); rtr_leaf_push_match_rec( rec, rtr_info, offsets, page_is_comp(page)); + + mysql_mutex_unlock( + &rtr_info->matches->rtr_match_mutex); } last_match_rec = rec; diff -Nru mariadb-10.11.11/storage/innobase/handler/ha_innodb.cc mariadb-10.11.13/storage/innobase/handler/ha_innodb.cc --- 
mariadb-10.11.11/storage/innobase/handler/ha_innodb.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/ha_innodb.cc 2025-05-19 16:14:25.000000000 +0000 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -154,11 +155,6 @@ #include "wsrep_sst.h" #endif /* WITH_WSREP */ -#ifdef HAVE_URING -/** The Linux kernel version if io_uring() is considered unsafe */ -const char *io_uring_may_be_unsafe; -#endif - #define INSIDE_HA_INNOBASE_CC #define EQ_CURRENT_THD(thd) ((thd) == current_thd) @@ -169,13 +165,9 @@ static const long AUTOINC_NEW_STYLE_LOCKING = 1; static const long AUTOINC_NO_LOCKING = 2; -static constexpr size_t buf_pool_chunk_min_size= 1U << 20; - static ulong innobase_open_files; static long innobase_autoinc_lock_mode; -ulonglong innobase_buffer_pool_size; - /** Percentage of the buffer pool to reserve for 'old' blocks. Connected to buf_LRU_old_ratio. */ static uint innobase_old_blocks_pct; @@ -246,11 +238,11 @@ if (thd_kill_level(thd)) break; /* Adjust for purge_coordinator_state::refresh() */ - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); const lsn_t last= log_sys.last_checkpoint_lsn, max_age= log_sys.max_checkpoint_age; - log_sys.latch.rd_unlock(); const lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); if ((lsn - last) / 4 >= max_age / 5) buf_flush_ahead(last + max_age / 5, false); purge_sys.wake_if_not_active(); @@ -1158,7 +1150,7 @@ be rolled back to savepoint */ /** Request notification of log writes */ -static void innodb_log_flush_request(void *cookie); +static void innodb_log_flush_request(void *cookie) noexcept; /** Requests for log flushes */ struct log_flush_request @@ -1330,38 +1322,17 @@ dict_sys.unlock(); - dict_table_t *table_stats, *index_stats; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; - table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - 
dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, - thd, &mdl_table); - dict_sys.unfreeze(); - } - index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, - thd, &mdl_index); - dict_sys.unfreeze(); - } - + dict_stats stats; + const bool stats_failed{stats.open(thd)}; trx_start_for_ddl(trx); uint errors= 0; char db[NAME_LEN + 1]; strconvert(&my_charset_filename, namebuf, len, system_charset_info, db, sizeof db, &errors); - if (!errors && table_stats && index_stats && - !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && - !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && - lock_table_for_trx(table_stats, trx, LOCK_X) == DB_SUCCESS && - lock_table_for_trx(index_stats, trx, LOCK_X) == DB_SUCCESS) + if (!errors && !stats_failed && + lock_table_for_trx(stats.table(), trx, LOCK_X) == DB_SUCCESS && + lock_table_for_trx(stats.index(), trx, LOCK_X) == DB_SUCCESS) { row_mysql_lock_data_dictionary(trx); if (dict_stats_delete(db, trx)) @@ -1457,19 +1428,16 @@ if (err != DB_SUCCESS) { trx->rollback(); - namebuf[len] = '\0'; - ib::error() << "DROP DATABASE " << namebuf << ": " << err; + sql_print_error("InnoDB: DROP DATABASE %.*s: %s", + int(len), namebuf, ut_strerr(err)); } else trx->commit(); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); - trx->free(); + if (!stats_failed) + stats.close(); if (err == DB_SUCCESS) { @@ -1620,9 +1588,9 @@ if (dict_table_t *table= m_prebuilt ? 
m_prebuilt->table : nullptr) { if (table->is_readable()) - dict_stats_init(table); + statistics_init(table, true); else - table->stat_initialized= 1; + table->stat.fetch_or(dict_table_t::STATS_INITIALIZED); } } @@ -1932,7 +1900,7 @@ { const trx_id_t trx_id= table->def_trx_id; DBUG_ASSERT(trx_id <= create_id); - dict_table_close(table); + table->release(); DBUG_PRINT("info", ("create_id: %llu trx_id: %" PRIu64, create_id, trx_id)); DBUG_RETURN(create_id != trx_id); } @@ -2978,6 +2946,45 @@ return XAER_NOTA; } +/** Initialize the InnoDB persistent statistics attributes. +@param table InnoDB table +@param table_options MariaDB table options +@param sar the value of STATS_AUTO_RECALC +@param initialized whether the InnoDB statistics were already initialized +@return whether table->stats_sample_pages needs to be initialized */ +static bool innodb_copy_stat_flags(dict_table_t *table, + ulong table_options, + enum_stats_auto_recalc sar, + bool initialized) noexcept +{ + if (table->is_temporary() || table->no_rollback()) + { + table->stat= dict_table_t::STATS_INITIALIZED | + dict_table_t::STATS_PERSISTENT_OFF | dict_table_t::STATS_AUTO_RECALC_OFF; + table->stats_sample_pages= 1; + return false; + } + + static_assert(HA_OPTION_STATS_PERSISTENT == + dict_table_t::STATS_PERSISTENT_ON << 11, ""); + static_assert(HA_OPTION_NO_STATS_PERSISTENT == + dict_table_t::STATS_PERSISTENT_OFF << 11, ""); + uint32_t stat= + uint32_t(table_options & + (HA_OPTION_STATS_PERSISTENT | + HA_OPTION_NO_STATS_PERSISTENT)) >> 11; + static_assert(uint32_t{HA_STATS_AUTO_RECALC_ON} << 3 == + dict_table_t::STATS_AUTO_RECALC_ON, ""); + static_assert(uint32_t{HA_STATS_AUTO_RECALC_OFF} << 3 == + dict_table_t::STATS_AUTO_RECALC_OFF, ""); + static_assert(true == dict_table_t::STATS_INITIALIZED, ""); + stat|= (sar & (HA_STATS_AUTO_RECALC_ON | HA_STATS_AUTO_RECALC_OFF)) << 3 | + uint32_t(initialized); + + table->stat= stat; + return true; +} + 
/*********************************************************************//** Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. Those flags are stored in .frm file and end up in the MySQL table object, @@ -2990,29 +2997,9 @@ dict_table_t* innodb_table, /*!< in/out: InnoDB table */ const HA_CREATE_INFO* create_info) /*!< in: create info */ { - ibool ps_on; - ibool ps_off; - - if (innodb_table->is_temporary() - || innodb_table->no_rollback()) { - /* Temp tables do not use persistent stats. */ - ps_on = FALSE; - ps_off = TRUE; - } else { - ps_on = create_info->table_options - & HA_OPTION_STATS_PERSISTENT; - ps_off = create_info->table_options - & HA_OPTION_NO_STATS_PERSISTENT; - } - - dict_stats_set_persistent(innodb_table, ps_on, ps_off); - - dict_stats_auto_recalc_set( - innodb_table, - create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, - create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); - - innodb_table->stats_sample_pages = create_info->stats_sample_pages; + if (innodb_copy_stat_flags(innodb_table, create_info->table_options, + create_info->stats_auto_recalc, false)) + innodb_table->stats_sample_pages= create_info->stats_sample_pages; } /*********************************************************************//** @@ -3026,28 +3013,10 @@ dict_table_t* innodb_table, /*!< in/out: InnoDB table */ const TABLE_SHARE* table_share) /*!< in: table share */ { - ibool ps_on; - ibool ps_off; - - if (innodb_table->is_temporary()) { - /* Temp tables do not use persistent stats */ - ps_on = FALSE; - ps_off = TRUE; - } else { - ps_on = table_share->db_create_options - & HA_OPTION_STATS_PERSISTENT; - ps_off = table_share->db_create_options - & HA_OPTION_NO_STATS_PERSISTENT; - } - - dict_stats_set_persistent(innodb_table, ps_on, ps_off); - - dict_stats_auto_recalc_set( - innodb_table, - table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, - table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); - - innodb_table->stats_sample_pages = 
table_share->stats_sample_pages; + if (innodb_copy_stat_flags(innodb_table, table_share->db_create_options, + table_share->stats_auto_recalc, + innodb_table->stat_initialized())) + innodb_table->stats_sample_pages= table_share->stats_sample_pages; } /*********************************************************************//** @@ -3288,7 +3257,7 @@ bool allow = innobase_query_caching_table_check_low(table, trx); - dict_table_close(table); + table->release(); if (allow) { /* If the isolation level is high, assign a read view for the @@ -3678,7 +3647,7 @@ m_prebuilt->used_in_HANDLER = TRUE; reset_template(); - m_prebuilt->trx->bulk_insert = false; + m_prebuilt->trx->bulk_insert &= TRX_DDL_BULK; } /*********************************************************************//** @@ -3701,53 +3670,44 @@ DBUG_RETURN(1); } -/** Return the minimum buffer pool size based on page size */ -static inline ulint min_buffer_pool_size() +static void innodb_buffer_pool_size_update(THD* thd,st_mysql_sys_var*,void*, + const void *save) noexcept { - ulint s= (BUF_LRU_MIN_LEN + BUF_LRU_MIN_LEN / 4) * srv_page_size; - /* buf_pool_chunk_size minimum is 1M, so round up to a multiple */ - ulint alignment= 1U << 20; - return UT_CALC_ALIGN(s, alignment); + buf_pool.resize(*static_cast(save), thd); } -/** Validate the requested buffer pool size. Also, reserve the necessary -memory needed for buffer pool resize. -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] save immediate result for update function -@param[in] value incoming string -@return 0 on success, 1 on failure. -*/ -static -int -innodb_buffer_pool_size_validate( - THD* thd, - struct st_mysql_sys_var* var, - void* save, - struct st_mysql_value* value); - -/** Update the system variable innodb_buffer_pool_size using the "saved" -value. This function is registered as a callback with MySQL. 
-@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes -@param[in] save immediate result from check function */ -static -void -innodb_buffer_pool_size_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save); +static MYSQL_SYSVAR_SIZE_T(buffer_pool_size, buf_pool.size_in_bytes_requested, + PLUGIN_VAR_RQCMDARG, + "The size of the memory buffer InnoDB uses to cache data" + " and indexes of its tables.", + nullptr, innodb_buffer_pool_size_update, 128U << 20, 2U << 20, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), 1U << 20); + +#if defined __linux__ || !defined DBUG_OFF +static void innodb_buffer_pool_size_auto_min_update(THD*,st_mysql_sys_var*, + void*, const void *save) + noexcept +{ + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.size_in_bytes_auto_min= *static_cast(save); + mysql_mutex_unlock(&buf_pool.mutex); +} -static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size, +static MYSQL_SYSVAR_SIZE_T(buffer_pool_size_auto_min, + buf_pool.size_in_bytes_auto_min, PLUGIN_VAR_RQCMDARG, - "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", - innodb_buffer_pool_size_validate, - innodb_buffer_pool_size_update, - 128ULL << 20, - 2ULL << 20, - LLONG_MAX, 1024*1024L); + "Minimum innodb_buffer_pool_size for dynamic shrinking on memory pressure", + nullptr, innodb_buffer_pool_size_auto_min_update, 0, 0, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), + innodb_buffer_pool_extent_size); +#endif + +static MYSQL_SYSVAR_SIZE_T(buffer_pool_size_max, buf_pool.size_in_bytes_max, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Maximum innodb_buffer_pool_size", + nullptr, nullptr, 0, 0, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), + innodb_buffer_pool_extent_size); static MYSQL_SYSVAR_UINT(log_write_ahead_size, log_sys.write_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -3799,29 +3759,6 @@ return 0; } -/** Initialize 
and normalize innodb_buffer_pool_{chunk_,}size. */ -static void innodb_buffer_pool_size_init() -{ - if (srv_buf_pool_chunk_unit > srv_buf_pool_size) - { - /* Size unit of buffer pool is larger than srv_buf_pool_size. - adjust srv_buf_pool_chunk_unit for srv_buf_pool_size. */ - srv_buf_pool_chunk_unit = srv_buf_pool_size; - } - else if (srv_buf_pool_chunk_unit == 0) - { - srv_buf_pool_chunk_unit = srv_buf_pool_size / 64; - my_large_page_truncate(&srv_buf_pool_chunk_unit); - } - - if (srv_buf_pool_chunk_unit < buf_pool_chunk_min_size) - srv_buf_pool_chunk_unit = buf_pool_chunk_min_size; - - srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size); - innobase_buffer_pool_size = srv_buf_pool_size; -} - - static bool compression_algorithm_is_not_loaded(ulong compression_algorithm, myf flags) { @@ -3847,323 +3784,298 @@ @retval HA_ERR_INITIALIZATION when some parameters are out of range */ static int innodb_init_params() { - DBUG_ENTER("innodb_init_params"); + DBUG_ENTER("innodb_init_params"); - ulong num_pll_degree; + srv_page_size_shift= innodb_page_size_validate(srv_page_size); + if (!srv_page_size_shift) + { + sql_print_error("InnoDB: Invalid page size=%lu.\n", srv_page_size); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - /* Check that values don't overflow on 32-bit systems. */ - if (sizeof(ulint) == 4) { - if (innobase_buffer_pool_size > UINT_MAX32) { - sql_print_error( - "innodb_buffer_pool_size can't be over 4GB" - " on 32-bit systems"); - DBUG_RETURN(HA_ERR_OUT_OF_MEM); - } - } + size_t &min= MYSQL_SYSVAR_NAME(buffer_pool_size).min_val; + min= ut_calc_align + (buf_pool.blocks_in_bytes(BUF_LRU_MIN_LEN + BUF_LRU_MIN_LEN / 4), + 1U << 20); + size_t innodb_buffer_pool_size= buf_pool.size_in_bytes_requested; + + /* With large pages, buffer pool can't grow or shrink. 
*/ + if (!buf_pool.size_in_bytes_max || my_use_large_pages || + innodb_buffer_pool_size > buf_pool.size_in_bytes_max) + buf_pool.size_in_bytes_max= ut_calc_align(innodb_buffer_pool_size, + innodb_buffer_pool_extent_size); + + MYSQL_SYSVAR_NAME(buffer_pool_size).max_val= buf_pool.size_in_bytes_max; +#if defined __linux__ || !defined DBUG_OFF + if (!buf_pool.size_in_bytes_auto_min || + buf_pool.size_in_bytes_auto_min > buf_pool.size_in_bytes_max) + buf_pool.size_in_bytes_auto_min= buf_pool.size_in_bytes_max; + MYSQL_SYSVAR_NAME(buffer_pool_size_auto_min).max_val= + buf_pool.size_in_bytes_max; +#endif - /* The buffer pool needs to be able to accommodate enough many - pages, even for larger pages */ - MYSQL_SYSVAR_NAME(buffer_pool_size).min_val= min_buffer_pool_size(); - - if (innobase_buffer_pool_size < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) { - ib::error() << "innodb_page_size=" - << srv_page_size << " requires " - << "innodb_buffer_pool_size >= " - << (MYSQL_SYSVAR_NAME(buffer_pool_size).min_val >> 20) - << "MiB current " << (innobase_buffer_pool_size >> 20) - << "MiB"; - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - if (!ut_is_2pow(log_sys.write_size)) { - sql_print_error("InnoDB: innodb_log_write_ahead_size=%u" - " is not a power of two", - log_sys.write_size); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG)) - DBUG_RETURN(HA_ERR_INITIALIZATION); - - if ((srv_encrypt_tables || srv_encrypt_log - || innodb_encrypt_temporary_tables) - && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) { - sql_print_error("InnoDB: cannot enable encryption, " - "encryption plugin is not available"); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } + if (innodb_buffer_pool_size < min) + { + sql_print_error("InnoDB: innodb_page_size=%lu requires " + "innodb_buffer_pool_size >= %zu MiB current %zu MiB", + srv_page_size, min >> 20, innodb_buffer_pool_size >> 20); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } 
+ + if (!ut_is_2pow(log_sys.write_size)) + { + sql_print_error("InnoDB: innodb_log_write_ahead_size=%u" + " is not a power of two", + log_sys.write_size); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } + + if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG)) + DBUG_RETURN(HA_ERR_INITIALIZATION); + + if ((srv_encrypt_tables || srv_encrypt_log || + innodb_encrypt_temporary_tables) && + !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) + { + sql_print_error("InnoDB: cannot enable encryption, " + "encryption plugin is not available"); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } #ifdef _WIN32 - if (!is_filename_allowed(srv_buf_dump_filename, - strlen(srv_buf_dump_filename), FALSE)) { - sql_print_error("InnoDB: innodb_buffer_pool_filename" - " cannot have colon (:) in the file name."); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } + if (!is_filename_allowed(srv_buf_dump_filename, + strlen(srv_buf_dump_filename), false)) + { + sql_print_error("InnoDB: innodb_buffer_pool_filename" + " cannot have colon (:) in the file name."); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } #endif - /* First calculate the default path for innodb_data_home_dir etc., - in case the user has not given any value. + /* First calculate the default path for innodb_data_home_dir etc., + in case the user has not given any value. - Note that when using the embedded server, the datadirectory is not - necessarily the current directory of this program. */ + Note that when using the embedded server, the datadirectory is not + necessarily the current directory of this program. */ - fil_path_to_mysql_datadir = + fil_path_to_mysql_datadir = #ifndef HAVE_REPLICATION - mysqld_embedded ? mysql_real_data_home : + mysqld_embedded ? 
mysql_real_data_home : #endif - "./"; + "./"; - /* Set InnoDB initialization parameters according to the values - read from MySQL .cnf file */ + /* Set InnoDB initialization parameters according to the values + read from MySQL .cnf file */ - /* The default dir for data files is the datadir of MySQL */ + /* The default dir for data files is the datadir of MySQL */ - srv_data_home = innobase_data_home_dir - ? innobase_data_home_dir - : const_cast(fil_path_to_mysql_datadir); + srv_data_home= innobase_data_home_dir + ? innobase_data_home_dir + : const_cast(fil_path_to_mysql_datadir); #ifdef WITH_WSREP - /* If we use the wsrep API, then we need to tell the server - the path to the data files (for passing it to the SST scripts): */ - wsrep_set_data_home_dir(srv_data_home); + /* If we use the wsrep API, then we need to tell the server + the path to the data files (for passing it to the SST scripts): */ + wsrep_set_data_home_dir(srv_data_home); #endif /* WITH_WSREP */ - /*--------------- Shared tablespaces -------------------------*/ - - /* Check that the value of system variable innodb_page_size was - set correctly. Its value was put into srv_page_size. If valid, - return the associated srv_page_size_shift. 
*/ - srv_page_size_shift = innodb_page_size_validate(srv_page_size); - if (!srv_page_size_shift) { - sql_print_error("InnoDB: Invalid page size=%lu.\n", - srv_page_size); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - srv_sys_space.set_space_id(TRX_SYS_SPACE); - - switch (srv_checksum_algorithm) { - case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: - case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: - srv_sys_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER - | FSP_FLAGS_FCRC32_PAGE_SSIZE()); - break; - default: - srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); - } - - srv_sys_space.set_path(srv_data_home); - - /* Supports raw devices */ - if (!srv_sys_space.parse_params(innobase_data_file_path, true)) { - ib::error() << "Unable to parse innodb_data_file_path=" - << innobase_data_file_path; - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - srv_tmp_space.set_path(srv_data_home); - - /* Temporary tablespace is in full crc32 format. */ - srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER - | FSP_FLAGS_FCRC32_PAGE_SSIZE()); - - if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) { - ib::error() << "Unable to parse innodb_temp_data_file_path=" - << innobase_temp_data_file_path; - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - /* Perform all sanity check before we take action of deleting files*/ - if (srv_sys_space.intersection(&srv_tmp_space)) { - sql_print_error("innodb_temporary and innodb_system" - " file names seem to be the same."); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - srv_sys_space.normalize_size(); - srv_tmp_space.normalize_size(); - - /* ------------ UNDO tablespaces files ---------------------*/ - if (!srv_undo_dir) { - srv_undo_dir = const_cast(fil_path_to_mysql_datadir); - } - - if (strchr(srv_undo_dir, ';')) { - sql_print_error("syntax error in innodb_undo_directory"); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } + /*--------------- Shared tablespaces -------------------------*/ - /* -------------- All log files ---------------------------*/ - - /* The 
default dir for log files is the datadir of MySQL */ + /* Check that the value of system variable innodb_page_size was + set correctly. Its value was put into srv_page_size. If valid, + return the associated srv_page_size_shift. */ + + srv_sys_space.set_space_id(TRX_SYS_SPACE); + /* Temporary tablespace is in full crc32 format. */ + srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER | + FSP_FLAGS_FCRC32_PAGE_SSIZE()); + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + srv_sys_space.set_flags(srv_tmp_space.flags()); + break; + default: + srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); + } - if (!srv_log_group_home_dir) { - srv_log_group_home_dir - = const_cast(fil_path_to_mysql_datadir); - } + srv_sys_space.set_path(srv_data_home); - if (strchr(srv_log_group_home_dir, ';')) { - sql_print_error("syntax error in innodb_log_group_home_dir"); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL); + if (!srv_sys_space.parse_params(innobase_data_file_path, true)) + { + sql_print_error("InnoDB: Unable to parse innodb_data_file_path=%s", + innobase_data_file_path); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - /* Check that interdependent parameters have sane values. 
*/ - if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) { - sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" - " cannot be set higher than" - " innodb_max_dirty_pages_pct.\n" - "InnoDB: Setting" - " innodb_max_dirty_pages_pct_lwm to %lf\n", - srv_max_buf_pool_modified_pct); + srv_tmp_space.set_path(srv_data_home); - srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; - } + if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) + { + sql_print_error("InnoDB: Unable to parse innodb_temp_data_file_path=%s", + innobase_temp_data_file_path); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) { + /* Perform all sanity check before we take action of deleting files*/ + if (srv_sys_space.intersection(&srv_tmp_space)) + { + sql_print_error("innodb_temporary and innodb_system" + " file names seem to be the same."); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) { - /* Avoid overflow. */ - srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT; - } else { - /* The user has not set the value. We should - set it based on innodb_io_capacity. */ - srv_max_io_capacity = - ut_max(2 * srv_io_capacity, 2000UL); - } + srv_sys_space.normalize_size(); + srv_tmp_space.normalize_size(); - } else if (srv_max_io_capacity < srv_io_capacity) { - sql_print_warning("InnoDB: innodb_io_capacity" - " cannot be set higher than" - " innodb_io_capacity_max." 
- "Setting innodb_io_capacity=%lu", - srv_max_io_capacity); + /* ------------ UNDO tablespaces files ---------------------*/ + if (!srv_undo_dir) + srv_undo_dir= const_cast(fil_path_to_mysql_datadir); - srv_io_capacity = srv_max_io_capacity; - } + if (strchr(srv_undo_dir, ';')) + { + sql_print_error("syntax error in innodb_undo_directory"); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - if (UNIV_PAGE_SIZE_DEF != srv_page_size) { - ib::info() << "innodb_page_size=" << srv_page_size; + if (!srv_log_group_home_dir) + srv_log_group_home_dir= const_cast(fil_path_to_mysql_datadir); - srv_max_undo_log_size = std::max( - srv_max_undo_log_size, - ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) - << srv_page_size_shift); - } + if (strchr(srv_log_group_home_dir, ';')) + { + sql_print_error("syntax error in innodb_log_group_home_dir"); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - srv_buf_pool_size = ulint(innobase_buffer_pool_size); + DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL); - if (innobase_open_files < 10) { - innobase_open_files = 300; - if (srv_file_per_table && tc_size > 300 && tc_size < open_files_limit) { - innobase_open_files = tc_size; - } - } + /* Check that interdependent parameters have sane values. 
*/ + if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) + { + sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct.\n" + "InnoDB: Setting" + " innodb_max_dirty_pages_pct_lwm to %lf\n", + srv_max_buf_pool_modified_pct); + srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; + } - if (innobase_open_files > open_files_limit) { - ib::warn() << "innodb_open_files " << innobase_open_files - << " should not be greater" - << " than the open_files_limit " << open_files_limit; - if (innobase_open_files > tc_size) { - innobase_open_files = tc_size; - } - } + if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) + { + if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) + /* Avoid overflow. */ + srv_max_io_capacity= SRV_MAX_IO_CAPACITY_LIMIT; + else + /* The user has not set the value. We should set it based on + innodb_io_capacity. */ + srv_max_io_capacity= std::max(2 * srv_io_capacity, 2000UL); + } + else if (srv_max_io_capacity < srv_io_capacity) + { + sql_print_warning("InnoDB: innodb_io_capacity cannot be set higher than" + " innodb_io_capacity_max." 
+ "Setting innodb_io_capacity=%lu", srv_max_io_capacity); + srv_io_capacity= srv_max_io_capacity; + } - ulint min_open_files_limit = srv_undo_tablespaces - + srv_sys_space.m_files.size() - + srv_tmp_space.m_files.size() + 1; - if (min_open_files_limit > innobase_open_files) { - sql_print_warning( - "InnoDB: innodb_open_files=%lu is not greater " - "than the number of system tablespace files, " - "temporary tablespace files, " - "innodb_undo_tablespaces=%u; adjusting " - "to innodb_open_files=%zu", - innobase_open_files, srv_undo_tablespaces, - min_open_files_limit); - innobase_open_files = (ulong) min_open_files_limit; - } + if (UNIV_PAGE_SIZE_DEF != srv_page_size) + { + sql_print_information("InnoDB: innodb_page_size=%lu", srv_page_size); + srv_max_undo_log_size= + std::max(srv_max_undo_log_size, + ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) << + srv_page_size_shift); + } - srv_max_n_open_files = innobase_open_files; - srv_innodb_status = (ibool) innobase_create_status_file; + if (innobase_open_files < 10) + innobase_open_files= (srv_file_per_table && tc_size > 300 && + tc_size < open_files_limit) + ? tc_size + : 300; - srv_print_verbose_log = mysqld_embedded ? 
0 : 1; + if (innobase_open_files > open_files_limit) + { + sql_print_warning("InnoDB: innodb_open_files %lu" + " should not be greater than the open_files_limit %lu", + innobase_open_files, open_files_limit); + if (innobase_open_files > tc_size) + innobase_open_files= tc_size; + } - /* Round up fts_sort_pll_degree to nearest power of 2 number */ - for (num_pll_degree = 1; - num_pll_degree < fts_sort_pll_degree; - num_pll_degree <<= 1) { + const size_t min_open_files_limit= srv_undo_tablespaces + + srv_sys_space.m_files.size() + srv_tmp_space.m_files.size() + 1; + if (min_open_files_limit > innobase_open_files) + { + sql_print_warning("InnoDB: innodb_open_files=%lu is not greater " + "than the number of system tablespace files, " + "temporary tablespace files, " + "innodb_undo_tablespaces=%lu; adjusting " + "to innodb_open_files=%zu", + innobase_open_files, srv_undo_tablespaces, + min_open_files_limit); + innobase_open_files= ulong(min_open_files_limit); + } - /* No op */ - } + srv_max_n_open_files= innobase_open_files; + srv_innodb_status = (ibool) innobase_create_status_file; - fts_sort_pll_degree = num_pll_degree; + srv_print_verbose_log= !mysqld_embedded; - /* Store the default charset-collation number of this MySQL - installation */ + if (!ut_is_2pow(fts_sort_pll_degree)) + { + ulong n; + for (n= 1; n < fts_sort_pll_degree; n<<= 1) {} + fts_sort_pll_degree= n; + } - data_mysql_default_charset_coll = (ulint) default_charset_info->number; + /* Store the default charset-collation number of this installation */ + data_mysql_default_charset_coll = (ulint) default_charset_info->number; #if !defined _WIN32 && defined O_DIRECT - if (srv_use_atomic_writes && my_may_have_atomic_write) { - /* - Force O_DIRECT on Unixes (on Windows writes are always - unbuffered) - */ - switch (srv_file_flush_method) { - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: - break; - default: - srv_file_flush_method = SRV_O_DIRECT; - fprintf(stderr, "InnoDB: using O_DIRECT due to atomic 
writes.\n"); - } - } + if (srv_use_atomic_writes && my_may_have_atomic_write) + { + /* Force O_DIRECT on Unixes (on Windows writes are always unbuffered) */ + switch (srv_file_flush_method) { + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + srv_file_flush_method= SRV_O_DIRECT; + fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n"); + } + } #endif #if defined __linux__ || defined _WIN32 - if (srv_flush_log_at_trx_commit == 2) { - /* Do not disable the file system cache if - innodb_flush_log_at_trx_commit=2. */ - log_sys.log_buffered = true; - } + if (srv_flush_log_at_trx_commit == 2) + /* Do not disable the file system cache if + innodb_flush_log_at_trx_commit=2. */ + log_sys.log_buffered= true; #endif #if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32 - /* Currently native AIO is supported only on windows and linux - and that also when the support is compiled in. In all other - cases, we ignore the setting of innodb_use_native_aio. */ - srv_use_native_aio = FALSE; -#endif -#ifdef HAVE_URING - if (srv_use_native_aio && io_uring_may_be_unsafe) { - sql_print_warning("innodb_use_native_aio may cause " - "hangs with this kernel %s; see " - "https://jira.mariadb.org/browse/MDEV-26674", - io_uring_may_be_unsafe); - } + /* Currently native AIO is supported only on windows and linux + and that also when the support is compiled in. In all other + cases, we ignore the setting of innodb_use_native_aio. 
*/ + srv_use_native_aio= FALSE; #endif #ifdef _WIN32 - switch (srv_file_flush_method) { - case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */: - srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC; - break; - case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */: - srv_file_flush_method = SRV_FSYNC; - break; - default: - ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC); - } + switch (srv_file_flush_method) { + case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */: + srv_file_flush_method= SRV_ALL_O_DIRECT_FSYNC; + break; + case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */: + srv_file_flush_method= SRV_FSYNC; + break; + default: + ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC); + } #else - ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC); + ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC); #endif - innodb_buffer_pool_size_init(); - - srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); - DBUG_RETURN(0); + DBUG_RETURN(0); } /** Initialize the InnoDB storage engine plugin. @@ -4576,7 +4488,7 @@ undo_no_t savept= 0; trx->rollback(&savept); /* MariaDB will roll back the entire transaction. */ - trx->bulk_insert= false; + trx->bulk_insert&= TRX_DDL_BULK; trx->last_stmt_start= 0; return true; } @@ -4620,10 +4532,9 @@ ut_ad("invalid state" == 0); /* fall through */ case TRX_STATE_PREPARED: - ut_ad(commit_trx || trx->is_wsrep()); - ut_ad(thd_test_options(thd, OPTION_NOT_AUTOCOMMIT - | OPTION_BEGIN) - || trx->is_wsrep()); + ut_ad(commit_trx || + !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT + | OPTION_BEGIN)); /* fall through */ case TRX_STATE_ACTIVE: /* Transaction is deregistered only in a commit or a @@ -4825,11 +4736,13 @@ We put the request in a queue, so that we can notify upper layer about checkpoint complete when we have flushed the redo log. 
If we have already flushed all relevant redo log, we notify immediately.*/ -static void innodb_log_flush_request(void *cookie) +static void innodb_log_flush_request(void *cookie) noexcept { + log_sys.latch.wr_lock(SRW_LOCK_CALL); lsn_t flush_lsn= log_sys.get_flushed_lsn(); /* Load lsn relaxed after flush_lsn was loaded from the same cache line */ const lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); if (flush_lsn >= lsn) /* All log is already persistent. */; @@ -5837,6 +5750,70 @@ table->autoinc_mutex.wr_unlock(); } +dberr_t ha_innobase::statistics_init(dict_table_t *table, bool recalc) +{ + ut_ad(table->is_readable()); + ut_ad(!table->stats_mutex_is_owner()); + + uint32_t stat= table->stat; + dberr_t err= DB_SUCCESS; + + if (!recalc && dict_table_t::stat_initialized(stat)); + else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) + dict_stats_empty_table(table, false); + else + { + if (dict_table_t::stats_is_persistent(stat) && !srv_read_only_mode +#ifdef WITH_WSREP + && !wsrep_thd_skip_locking(m_user_thd) +#endif + ) + { + switch (dict_stats_persistent_storage_check(false)) { + case SCHEMA_OK: + if (recalc) + { + recalc: + err= dict_stats_update_persistent(table); + if (err == DB_SUCCESS) + err= dict_stats_save(table); + } + else + { + err= dict_stats_fetch_from_ps(table); + if (err == DB_STATS_DO_NOT_EXIST && table->stats_is_auto_recalc()) + goto recalc; + } + if (err == DB_SUCCESS || err == DB_READ_ONLY) + return err; + if (!recalc) + break; + /* fall through */ + case SCHEMA_INVALID: + if (table->stats_error_printed) + break; + table->stats_error_printed = true; + if (opt_bootstrap) + break; + sql_print_warning("InnoDB: %s of persistent statistics requested" + " for table %`.*s.%`s" + " but the required persistent statistics storage" + " is corrupted.", + recalc ? 
"Recalculation" : "Fetch", + int(table->name.dblen()), table->name.m_name, + table->name.basename()); + /* fall through */ + case SCHEMA_NOT_EXIST: + err= DB_STATS_DO_NOT_EXIST; + } + } + + dict_stats_update_transient(table); + } + + return err; +} + /** Open an InnoDB table @param[in] name table name @return error code @@ -7958,6 +7935,17 @@ error, m_prebuilt->table->flags, m_user_thd); #ifdef WITH_WSREP +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("sync.wsrep_after_write_row", + { + const char act[]= + "now " + "SIGNAL sync.wsrep_after_write_row_reached " + "WAIT_FOR signal.wsrep_after_write_row"; + DBUG_ASSERT(!debug_sync_set_action(m_user_thd, STRING_WITH_LEN(act))); + };); +#endif /* ENABLED_DEBUG_SYNC */ + if (!error_result && trx->is_wsrep() && !trx->is_bulk_insert() && wsrep_thd_is_local(m_user_thd) @@ -13338,7 +13326,7 @@ if (!error) { - dict_stats_update(info.table(), DICT_STATS_EMPTY_TABLE); + dict_stats_empty_table_and_save(info.table()); if (!info.table()->is_temporary()) log_write_up_to(trx->commit_lsn, true); info.table()->release(); @@ -13387,6 +13375,8 @@ DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); } + ut_ad(m_prebuilt->table->stat_initialized()); + if (m_prebuilt->table->space == fil_system.sys_space) { ib_senderrf( m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, @@ -13460,23 +13450,17 @@ err, m_prebuilt->table->flags, NULL)); } - if (dict_stats_is_persistent_enabled(m_prebuilt->table)) { - dberr_t ret; - - /* Adjust the persistent statistics. 
*/ - ret = dict_stats_update(m_prebuilt->table, - DICT_STATS_RECALC_PERSISTENT); + dict_table_t* t = m_prebuilt->table; - if (ret != DB_SUCCESS) { - push_warning_printf( - ha_thd(), - Sql_condition::WARN_LEVEL_WARN, - ER_ALTER_INFO, - "Error updating stats for table '%s'" - " after table rebuild: %s", - m_prebuilt->table->name.m_name, - ut_strerr(ret)); - } + if (dberr_t ret = dict_stats_update_persistent_try(t)) { + push_warning_printf( + ha_thd(), + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Error updating stats after" + " ALTER TABLE %`.*s.%`s IMPORT TABLESPACE: %s", + int(t->name.dblen()), t->name.m_name, + t->name.basename(), ut_strerr(ret)); } DBUG_RETURN(0); @@ -13619,8 +13603,6 @@ err= lock_table_children(table, trx); } - dict_table_t *table_stats= nullptr, *index_stats= nullptr; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; if (err == DB_SUCCESS) err= lock_table_for_trx(table, trx, LOCK_X); @@ -13645,7 +13627,7 @@ /* This looks like the rollback of ALTER TABLE...ADD PARTITION that was caused by MDL timeout. We could have written undo log for inserting the data into the new partitions. */ - if (table->stat_persistent != DICT_STATS_PERSISTENT_OFF) + if (!(table->stat & dict_table_t::STATS_PERSISTENT_OFF)) { /* We do not really know if we are holding MDL_EXCLUSIVE. 
Even though this code is handling the case that we are not holding @@ -13659,37 +13641,18 @@ #endif DEBUG_SYNC(thd, "before_delete_table_stats"); + dict_stats stats; + bool stats_failed= true; - if (err == DB_SUCCESS && dict_stats_is_persistent_enabled(table) && + if (err == DB_SUCCESS && table->stats_is_persistent() && !table->is_stats_table()) { - table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, - thd, &mdl_table); - dict_sys.unfreeze(); - } - - index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, - thd, &mdl_index); - dict_sys.unfreeze(); - } - + stats_failed= stats.open(thd); const bool skip_wait{table->name.is_temporary()}; - if (table_stats && index_stats && - !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && - !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && - !(err= lock_table_for_trx(table_stats, trx, LOCK_X, skip_wait))) - err= lock_table_for_trx(index_stats, trx, LOCK_X, skip_wait); + if (!stats_failed && + !(err= lock_table_for_trx(stats.table(), trx, LOCK_X, skip_wait))) + err= lock_table_for_trx(stats.index(), trx, LOCK_X, skip_wait); if (err != DB_SUCCESS && skip_wait) { @@ -13698,10 +13661,8 @@ ut_ad(err == DB_LOCK_WAIT); ut_ad(trx->error_state == DB_SUCCESS); err= DB_SUCCESS; - dict_table_close(table_stats, false, thd, mdl_table); - dict_table_close(index_stats, false, thd, mdl_index); - table_stats= nullptr; - index_stats= nullptr; + stats.close(); + stats_failed= true; } } @@ -13772,13 +13733,11 @@ else if (rollback_add_partition) purge_sys.resume_FTS(); #endif - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); if 
(trx != parent_trx) trx->free(); + if (!stats_failed) + stats.close(); DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); } @@ -13793,7 +13752,7 @@ err= trx->drop_table_foreign(table->name); } - if (err == DB_SUCCESS && table_stats && index_stats) + if (err == DB_SUCCESS && !stats_failed) err= trx->drop_table_statistics(table->name); if (err != DB_SUCCESS) goto err_exit; @@ -13804,11 +13763,9 @@ std::vector deleted; trx->commit(deleted); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); + if (!stats_failed) + stats.close(); for (pfs_os_file_t d : deleted) os_file_close(d); log_write_up_to(trx->commit_lsn, true); @@ -14004,9 +13961,6 @@ ib_table->name.m_name, ib_table->id); const char *name= mem_heap_strdup(heap, ib_table->name.m_name); - dict_table_t *table_stats = nullptr, *index_stats = nullptr; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; - dberr_t error= lock_table_children(ib_table, trx); if (error == DB_SUCCESS) @@ -14014,6 +13968,7 @@ const bool fts= error == DB_SUCCESS && ib_table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + const bool pause_purge= error == DB_SUCCESS && ib_table->get_ref_count() > 1; if (fts) { @@ -14021,45 +13976,33 @@ purge_sys.stop_FTS(*ib_table); error= fts_lock_tables(trx, *ib_table); } + else if (pause_purge) + purge_sys.stop_FTS(); - /* Wait for purge threads to stop using the table. */ - for (uint n = 15; ib_table->get_ref_count() > 1; ) + if (error == DB_SUCCESS) { - if (!--n) + /* Wait for purge threads to stop using the table. 
*/ + for (uint n = 15; ib_table->get_ref_count() > 1; ) { - error= DB_LOCK_WAIT_TIMEOUT; - break; + if (!--n) + { + error= DB_LOCK_WAIT_TIMEOUT; + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); } - std::this_thread::sleep_for(std::chrono::milliseconds(50)); } - if (error == DB_SUCCESS && dict_stats_is_persistent_enabled(ib_table) && + dict_stats stats; + bool stats_failed= true; + + if (error == DB_SUCCESS && ib_table->stats_is_persistent() && !ib_table->is_stats_table()) { - table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, m_user_thd, - &mdl_table); - dict_sys.unfreeze(); - } - index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, m_user_thd, - &mdl_index); - dict_sys.unfreeze(); - } - - if (table_stats && index_stats && - !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && - !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && - !(error= lock_table_for_trx(table_stats, trx, LOCK_X))) - error= lock_table_for_trx(index_stats, trx, LOCK_X); + stats_failed= stats.open(m_user_thd); + if (!stats_failed && + !(error= lock_table_for_trx(stats.table(), trx, LOCK_X))) + error= lock_table_for_trx(stats.index(), trx, LOCK_X); } if (error == DB_SUCCESS) @@ -14123,7 +14066,7 @@ if (!err) { - dict_stats_update(m_prebuilt->table, DICT_STATS_EMPTY_TABLE); + dict_stats_empty_table_and_save(m_prebuilt->table); log_write_up_to(trx->commit_lsn, true); row_prebuilt_t *prebuilt= m_prebuilt; uchar *upd_buf= m_upd_buf; @@ -14151,15 +14094,46 @@ } trx->free(); - + if (!stats_failed) + stats.close(); mem_heap_free(heap); + DBUG_RETURN(err); +} - if (table_stats) - dict_table_close(table_stats, false, m_user_thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, 
false, m_user_thd, mdl_index); +/** Deinitialize InnoDB persistent statistics, forcing them +to be reloaded on subsequent ha_innobase::open(). +@param t table for which the cached STATS_PERSISTENT are to be evicted */ +static void stats_deinit(dict_table_t *t) noexcept +{ + ut_ad(dict_sys.frozen()); + ut_ad(t->get_ref_count() == 0); - DBUG_RETURN(err); + if (t->is_temporary() || t->no_rollback()) + return; + + t->stats_mutex_lock(); + t->stat= t->stat & ~dict_table_t::STATS_INITIALIZED; + MEM_UNDEFINED(&t->stat_n_rows, sizeof t->stat_n_rows); + MEM_UNDEFINED(&t->stat_clustered_index_size, + sizeof t->stat_clustered_index_size); + MEM_UNDEFINED(&t->stat_sum_of_other_index_sizes, + sizeof t->stat_sum_of_other_index_sizes); + MEM_UNDEFINED(&t->stat_modified_counter, sizeof t->stat_modified_counter); +#ifdef HAVE_valgrind + for (dict_index_t *i= dict_table_get_first_index(t); i; + i= dict_table_get_next_index(i)) + { + MEM_UNDEFINED(i->stat_n_diff_key_vals, + i->n_uniq * sizeof *i->stat_n_diff_key_vals); + MEM_UNDEFINED(i->stat_n_sample_sizes, + i->n_uniq * sizeof *i->stat_n_sample_sizes); + MEM_UNDEFINED(i->stat_n_non_null_key_vals, + i->n_uniq * sizeof *i->stat_n_non_null_key_vals); + MEM_UNDEFINED(&i->stat_index_size, sizeof i->stat_index_size); + MEM_UNDEFINED(&i->stat_n_leaf_pages, sizeof i->stat_n_leaf_pages); + } +#endif /* HAVE_valgrind */ + t->stats_mutex_unlock(); } /*********************************************************************//** @@ -14184,8 +14158,6 @@ trx_t* trx = innobase_trx_allocate(thd); trx_start_for_ddl(trx); - dict_table_t *table_stats = nullptr, *index_stats = nullptr; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; char norm_from[MAX_FULL_NAME_LEN]; char norm_to[MAX_FULL_NAME_LEN]; @@ -14195,45 +14167,49 @@ dberr_t error = DB_SUCCESS; const bool from_temp = dict_table_t::is_temporary_name(norm_from); + dict_table_t* t; + bool pause_purge = false, fts_exist = false; + if (from_temp) { /* There is no need to lock any FOREIGN KEY 
child tables. */ - } else if (dict_table_t *table = dict_table_open_on_name( - norm_from, false, DICT_ERR_IGNORE_FK_NOKEY)) { - error = lock_table_children(table, trx); - if (error == DB_SUCCESS) { - error = lock_table_for_trx(table, trx, LOCK_X); + t = nullptr; + } else { + t = dict_table_open_on_name( + norm_from, false, DICT_ERR_IGNORE_FK_NOKEY); + if (t) { + error = lock_table_children(t, trx); + if (error == DB_SUCCESS) { + error = lock_table_for_trx(t, trx, LOCK_X); + } + fts_exist = error == DB_SUCCESS && t->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + pause_purge = error == DB_SUCCESS + && t->get_ref_count() > 1; + if (fts_exist) { + fts_optimize_remove_table(t); + purge_sys.stop_FTS(*t); + if (error == DB_SUCCESS) { + error = fts_lock_tables(trx, *t); + } + } else if (pause_purge) { + purge_sys.stop_FTS(); + } } - table->release(); } + dict_stats stats; + bool stats_fail = true; + if (strcmp(norm_from, TABLE_STATS_NAME) && strcmp(norm_from, INDEX_STATS_NAME) && strcmp(norm_to, TABLE_STATS_NAME) && strcmp(norm_to, INDEX_STATS_NAME)) { - table_stats = dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared( - table_stats, thd, &mdl_table); - dict_sys.unfreeze(); - } - index_stats = dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared( - index_stats, thd, &mdl_index); - dict_sys.unfreeze(); - } - - if (error == DB_SUCCESS && table_stats && index_stats - && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) - && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { - error = lock_table_for_trx(table_stats, trx, LOCK_X, - from_temp); + stats_fail = stats.open(thd); + if (!stats_fail && error == DB_SUCCESS) { + error = lock_table_for_trx(stats.table(), trx, + LOCK_X, from_temp); if (error == DB_SUCCESS) { - error = 
lock_table_for_trx(index_stats, trx, + error = lock_table_for_trx(stats.index(), trx, LOCK_X, from_temp); } if (error != DB_SUCCESS && from_temp) { @@ -14244,12 +14220,8 @@ we cannot lock the tables, when the table is being renamed from from a temporary name. */ - dict_table_close(table_stats, false, thd, - mdl_table); - dict_table_close(index_stats, false, thd, - mdl_index); - table_stats = nullptr; - index_stats = nullptr; + stats.close(); + stats_fail = true; } } } @@ -14276,7 +14248,7 @@ DEBUG_SYNC(thd, "after_innobase_rename_table"); - if (error == DB_SUCCESS && table_stats && index_stats) { + if (error == DB_SUCCESS && !stats_fail) { error = dict_stats_rename_table(norm_from, norm_to, trx); if (error == DB_DUPLICATE_KEY) { /* The duplicate may also occur in @@ -14289,33 +14261,52 @@ if (error == DB_SUCCESS) { trx->flush_log_later = true; + if (t) { + ut_ad(dict_sys.locked()); + if (fts_exist) { + fts_optimize_add_table(t); + } + if (UNIV_LIKELY(t->release())) { + stats_deinit(t); + } else { + ut_ad("unexpected references" == 0); + } + } innobase_commit_low(trx); } else { + if (t) { + if (fts_exist) { + fts_optimize_add_table(t); + } + t->release(); + } trx->rollback(); } - if (table_stats) { - dict_table_close(table_stats, true, thd, mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, true, thd, mdl_index); - } row_mysql_unlock_data_dictionary(trx); + + if (fts_exist || pause_purge) { + purge_sys.resume_FTS(); + } + if (error == DB_SUCCESS) { log_write_up_to(trx->commit_lsn, true); } trx->flush_log_later = false; trx->free(); + if (!stats_fail) { + stats.close(); + } if (error == DB_DUPLICATE_KEY) { /* We are not able to deal with handler::get_dup_key() during DDL operations, because the duplicate key would exist in metadata tables, not in the user table. 
*/ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to); - error = DB_ERROR; + DBUG_RETURN(HA_ERR_GENERIC); } else if (error == DB_LOCK_WAIT_TIMEOUT) { my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0)); - error = DB_LOCK_WAIT; + DBUG_RETURN(HA_ERR_GENERIC); } DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); @@ -14529,7 +14520,7 @@ ulint stat_clustered_index_size; - ut_a(m_prebuilt->table->stat_initialized); + ut_ad(m_prebuilt->table->stat_initialized()); stat_clustered_index_size = m_prebuilt->table->stat_clustered_index_size; @@ -14656,7 +14647,7 @@ rec_per_key_t rec_per_key; ib_uint64_t n_diff; - ut_a(index->table->stat_initialized); + ut_ad(index->table->stat_initialized()); ut_ad(i < dict_index_get_n_unique(index)); ut_ad(!dict_index_is_spatial(index)); @@ -14794,63 +14785,82 @@ ib_table = m_prebuilt->table; DBUG_ASSERT(ib_table->get_ref_count() > 0); - if (!ib_table->is_readable()) { + if (!ib_table->is_readable() + || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { dict_stats_empty_table(ib_table, true); - } - - if (flag & HA_STATUS_TIME) { - if (is_analyze || innobase_stats_on_metadata) { + } else if (flag & HA_STATUS_TIME) { + stats.update_time = ib_table->update_time; + if (!is_analyze && !innobase_stats_on_metadata) { + goto stats_fetch; + } - dict_stats_upd_option_t opt; - dberr_t ret; + dberr_t ret; + m_prebuilt->trx->op_info = "updating table statistics"; - m_prebuilt->trx->op_info = "updating table statistics"; + if (ib_table->stats_is_persistent() + && !srv_read_only_mode + && dict_stats_persistent_storage_check(false) + == SCHEMA_OK) { + if (is_analyze) { + dict_stats_recalc_pool_del(ib_table->id, + false); +recalc: + ret = statistics_init(ib_table, is_analyze); + } else { + /* This is e.g. 
'SHOW INDEXES' */ + ret = statistics_init(ib_table, is_analyze); + switch (ret) { + case DB_SUCCESS: + case DB_READ_ONLY: + break; + default: + goto error; + case DB_STATS_DO_NOT_EXIST: + if (!ib_table + ->stats_is_auto_recalc()) { + break; + } - if (dict_stats_is_persistent_enabled(ib_table)) { - if (is_analyze) { - if (!srv_read_only_mode) { - dict_stats_recalc_pool_del( - ib_table->id, false); + if (opt_bootstrap) { + break; } - opt = DICT_STATS_RECALC_PERSISTENT; - } else { - /* This is e.g. 'SHOW INDEXES', fetch - the persistent stats from disk. */ - opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; +#ifdef WITH_WSREP + if (wsrep_thd_skip_locking( + m_user_thd)) { + break; + } +#endif + is_analyze = true; + goto recalc; } - } else { - opt = DICT_STATS_RECALC_TRANSIENT; } - - ret = dict_stats_update(ib_table, opt); - + } else { + ret = dict_stats_update_transient(ib_table); if (ret != DB_SUCCESS) { +error: m_prebuilt->trx->op_info = ""; DBUG_RETURN(HA_ERR_GENERIC); } - - m_prebuilt->trx->op_info = - "returning various info to MariaDB"; } - - stats.update_time = (ulong) ib_table->update_time; + m_prebuilt->trx->op_info = "returning various info to MariaDB"; + } else { +stats_fetch: + statistics_init(ib_table, false); } - dict_stats_init(ib_table); - if (flag & HA_STATUS_VARIABLE) { ulint stat_clustered_index_size; ulint stat_sum_of_other_index_sizes; - ut_a(ib_table->stat_initialized); - #if !defined NO_ELISION && !defined SUX_LOCK_GENERIC if (xbegin()) { if (ib_table->stats_mutex_is_locked()) xabort(); + ut_ad(ib_table->stat_initialized()); + n_rows = ib_table->stat_n_rows; stat_clustered_index_size @@ -14865,6 +14875,8 @@ { ib_table->stats_shared_lock(); + ut_ad(ib_table->stat_initialized()); + n_rows = ib_table->stat_n_rows; stat_clustered_index_size @@ -14998,7 +15010,7 @@ auto _ = make_scope_exit([ib_table]() { ib_table->stats_shared_unlock(); }); - ut_a(ib_table->stat_initialized); + ut_ad(ib_table->stat_initialized()); for (uint i = 0; i < table->s->keys; 
i++) { ulong j; @@ -15694,7 +15706,7 @@ << foreign->foreign_table_name; } } else { - dict_table_close(ref_table, true); + ref_table->release(); } } @@ -15852,7 +15864,7 @@ stmt_boundary: trx->bulk_insert_apply(); trx->end_bulk_insert(*m_prebuilt->table); - trx->bulk_insert = false; + trx->bulk_insert &= TRX_DDL_BULK; break; case HA_EXTRA_NO_KEYREAD: (void)check_trx_exists(ha_thd()); @@ -15911,32 +15923,47 @@ break; case HA_EXTRA_END_ALTER_COPY: trx = check_trx_exists(ha_thd()); - if (m_prebuilt->table->skip_alter_undo) { - if (dberr_t err= trx->bulk_insert_apply()) { - m_prebuilt->table->skip_alter_undo = 0; - return convert_error_code_to_mysql( - err, - m_prebuilt->table->flags, - trx->mysql_thd); - } - - trx->end_bulk_insert(*m_prebuilt->table); - trx->bulk_insert = false; - /* During copy alter operation, InnoDB - updates the stats only for non-persistent - tables. */ - if (!dict_stats_is_persistent_enabled( - m_prebuilt->table)) { - dict_stats_update_if_needed( - m_prebuilt->table, *trx); - } + if (!m_prebuilt->table->skip_alter_undo) { + /* This could be invoked inside INSERT...SELECT. + We do not want any extra log writes, because + they could cause a severe performance regression. */ + break; } m_prebuilt->table->skip_alter_undo = 0; + if (dberr_t err= trx->bulk_insert_apply()) { + m_prebuilt->table->skip_alter_undo = 0; + return convert_error_code_to_mysql( + err, m_prebuilt->table->flags, + trx->mysql_thd); + } + + trx->end_bulk_insert(*m_prebuilt->table); + trx->bulk_insert &= TRX_DML_BULK; if (!m_prebuilt->table->is_temporary() && !high_level_read_only) { + /* During copy_data_between_tables(), InnoDB only + updates transient statistics. */ + if (!m_prebuilt->table->stats_is_persistent()) { + dict_stats_update_if_needed(m_prebuilt->table, + *trx); + } + /* The extra log write is necessary for + ALTER TABLE...ALGORITHM=COPY, because + a normal transaction commit would be a no-op + because no undo log records were generated. 
+ This log write will also be unnecessarily executed + during CREATE...SELECT, which is the other caller of + handler::extra(HA_EXTRA_BEGIN_ALTER_COPY). */ log_buffer_flush_to_disk(); } break; + case HA_EXTRA_ABORT_ALTER_COPY: + if (m_prebuilt->table->skip_alter_undo) { + trx = check_trx_exists(ha_thd()); + m_prebuilt->table->skip_alter_undo = 0; + trx->rollback(); + } + break; default:/* Do nothing */ ; } @@ -16031,7 +16058,8 @@ break; } - trx->bulk_insert = false; + ut_ad(trx->bulk_insert != TRX_DDL_BULK); + trx->bulk_insert = TRX_NO_BULK; trx->last_stmt_start = trx->undo_no; } @@ -16239,7 +16267,7 @@ if (!trx->bulk_insert) { break; } - trx->bulk_insert = false; + trx->bulk_insert &= TRX_DDL_BULK; trx->last_stmt_start = trx->undo_no; } @@ -17294,7 +17322,12 @@ param_new = info->option_struct; param_old = table->s->option_struct; - innobase_copy_frm_flags_from_create_info(m_prebuilt->table, info); + m_prebuilt->table->stats_mutex_lock(); + if (!m_prebuilt->table->stat_initialized()) { + innobase_copy_frm_flags_from_create_info( + m_prebuilt->table, info); + } + m_prebuilt->table->stats_mutex_unlock(); if (table_changes != IS_EQUAL_YES) { @@ -17383,7 +17416,8 @@ " higher than innodb_io_capacity_max %lu", in_val, srv_max_io_capacity); - srv_max_io_capacity = (in_val & ~(~0UL >> 1)) + /* Avoid overflow. */ + srv_max_io_capacity = (in_val >= SRV_MAX_IO_CAPACITY_LIMIT / 2) ? in_val : in_val * 2; push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, @@ -17546,22 +17580,6 @@ return(ret); } -extern void buf_resize_start(); - -/** Update the system variable innodb_buffer_pool_size using the "saved" -value. This function is registered as a callback with MySQL. 
-@param[in] save immediate result from check function */ -static -void -innodb_buffer_pool_size_update(THD*,st_mysql_sys_var*,void*, const void* save) -{ - snprintf(export_vars.innodb_buffer_pool_resize_status, - sizeof(export_vars.innodb_buffer_pool_resize_status), - "Buffer pool resize requested"); - - buf_resize_start(); -} - /** The latest assigned innodb_ft_aux_table name */ static char* innodb_ft_aux_table; @@ -17576,11 +17594,16 @@ int len = sizeof buf; if (const char* table_name = value->val_str(value, buf, &len)) { + /* Because we are not acquiring MDL on the table name, + we must contiguously hold dict_sys.latch while we are + examining the table, to protect us against concurrent DDL. */ + dict_sys.lock(SRW_LOCK_CALL); if (dict_table_t* table = dict_table_open_on_name( - table_name, false, DICT_ERR_IGNORE_NONE)) { + table_name, true, DICT_ERR_IGNORE_NONE)) { + table->release(); const table_id_t id = dict_table_has_fts_index(table) ? table->id : 0; - dict_table_close(table); + dict_sys.unlock(); if (id) { innodb_ft_aux_table_id = id; if (table_name == buf) { @@ -17591,12 +17614,12 @@ len); } - *static_cast(save) = table_name; return 0; } + } else { + dict_sys.unlock(); } - return 1; } else { *static_cast(save) = NULL; @@ -18385,14 +18408,14 @@ mysql_mutex_unlock(&buf_pool.mutex); } +static my_bool innodb_log_checkpoint_now; #ifdef UNIV_DEBUG -static my_bool innodb_log_checkpoint_now = TRUE; static my_bool innodb_buf_flush_list_now = TRUE; static uint innodb_merge_threshold_set_all_debug = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; +#endif /** Force an InnoDB log checkpoint. */ -/** Force an InnoDB log checkpoint. */ static void checkpoint_now_set(THD* thd, st_mysql_sys_var*, void*, const void *save) @@ -18416,14 +18439,21 @@ const auto size= log_sys.is_encrypted() ? 
SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT; mysql_mutex_unlock(&LOCK_global_system_variables); - lsn_t lsn; - while (log_sys.last_checkpoint_lsn.load(std::memory_order_acquire) + size < - (lsn= log_sys.get_lsn(std::memory_order_acquire))) + while (!thd_kill_level(thd)) + { + log_sys.latch.wr_lock(SRW_LOCK_CALL); + lsn_t cp= log_sys.last_checkpoint_lsn.load(std::memory_order_relaxed), + lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); + if (cp + size >= lsn) + break; log_make_checkpoint(); + } mysql_mutex_lock(&LOCK_global_system_variables); } +#ifdef UNIV_DEBUG /****************************************************************//** Force a dirty pages flush now. */ static @@ -18605,7 +18635,7 @@ " innodb_log_buffer_size=%u", MYF(0), log_sys.buf_size); else { - switch (log_sys.resize_start(*static_cast(save))) { + switch (log_sys.resize_start(*static_cast(save), thd)) { case log_t::RESIZE_NO_CHANGE: break; case log_t::RESIZE_IN_PROGRESS: @@ -18617,12 +18647,11 @@ ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_CANT_CREATE_HANDLER_FILE); break; case log_t::RESIZE_STARTED: - const lsn_t start{log_sys.resize_in_progress()}; for (timespec abstime;;) { if (thd_kill_level(thd)) { - log_sys.resize_abort(); + log_sys.resize_abort(thd); break; } @@ -18637,37 +18666,25 @@ resizing= log_sys.resize_in_progress(); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (start > log_sys.get_lsn()) + if (!resizing || !log_sys.resize_running(thd)) + break; + log_sys.latch.wr_lock(SRW_LOCK_CALL); + while (resizing > log_sys.get_lsn()) { ut_ad(!log_sys.is_mmap()); /* The server is almost idle. Write dummy FILE_CHECKPOINT records to ensure that the log resizing will complete. 
*/ - log_sys.latch.wr_lock(SRW_LOCK_CALL); - while (start > log_sys.get_lsn()) - { - mtr_t mtr; - mtr.start(); - mtr.commit_files(log_sys.last_checkpoint_lsn); - } - log_sys.latch.wr_unlock(); + mtr_t mtr; + mtr.start(); + mtr.commit_files(log_sys.last_checkpoint_lsn); } - if (!resizing || resizing > start /* only wait for our resize */) - break; + log_sys.latch.wr_unlock(); } } } mysql_mutex_lock(&LOCK_global_system_variables); } -static void innodb_log_spin_wait_delay_update(THD *, st_mysql_sys_var*, - void *, const void *save) -{ - log_sys.latch.wr_lock(SRW_LOCK_CALL); - mtr_t::spin_wait_delay= *static_cast(save); - mtr_t::finisher_update(); - log_sys.latch.wr_unlock(); -} - /** Update innodb_status_output or innodb_status_output_locks, which control InnoDB "status monitor" output to the error log. @param[out] var current value @@ -18987,7 +19004,7 @@ static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, PLUGIN_VAR_RQCMDARG, "Number of IOPs the server can do. Tunes the background IO rate", - NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0); + NULL, innodb_io_capacity_update, 200, 100, SRV_MAX_IO_CAPACITY_LIMIT, 0); static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity, PLUGIN_VAR_RQCMDARG, @@ -18996,12 +19013,12 @@ SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100, SRV_MAX_IO_CAPACITY_LIMIT, 0); -#ifdef UNIV_DEBUG static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now, PLUGIN_VAR_OPCMDARG, - "Force checkpoint now", + "Write back dirty pages from the buffer pool and update the log checkpoint", NULL, checkpoint_now_set, FALSE); +#ifdef UNIV_DEBUG static MYSQL_SYSVAR_BOOL(buf_flush_list_now, innodb_buf_flush_list_now, PLUGIN_VAR_OPCMDARG, "Force dirty page flush now", @@ -19157,12 +19174,12 @@ " SHOW TABLE STATUS for tables that use transient statistics (off by default)", NULL, NULL, FALSE); -static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages, +static MYSQL_SYSVAR_UINT(stats_transient_sample_pages, srv_stats_transient_sample_pages, 
PLUGIN_VAR_RQCMDARG, "The number of leaf index pages to sample when calculating transient" " statistics (if persistent statistics are not used, default 8)", - NULL, NULL, 8, 1, ~0ULL, 0); + NULL, NULL, 8, 1, ~0U, 0); static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent, PLUGIN_VAR_OPCMDARG, @@ -19178,12 +19195,12 @@ " new statistics)", NULL, NULL, TRUE); -static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages, +static MYSQL_SYSVAR_UINT(stats_persistent_sample_pages, srv_stats_persistent_sample_pages, PLUGIN_VAR_RQCMDARG, "The number of leaf index pages to sample when calculating persistent" " statistics (by ANALYZE, default 20)", - NULL, NULL, 20, 1, ~0ULL, 0); + NULL, NULL, 20, 1, ~0U, 0); static MYSQL_SYSVAR_ULONGLONG(stats_modified_counter, srv_stats_modified_counter, PLUGIN_VAR_RQCMDARG, @@ -19222,11 +19239,12 @@ "Data file autoextend increment in megabytes", NULL, NULL, 64, 1, 1000, 0); -static MYSQL_SYSVAR_SIZE_T(buffer_pool_chunk_size, srv_buf_pool_chunk_unit, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Size of a single memory chunk" - " for resizing buffer pool. Online buffer pool resizing happens at this" - " granularity. 
0 means autosize this variable based on buffer pool size.", +static size_t innodb_buffer_pool_chunk_size; + +static MYSQL_SYSVAR_SIZE_T(buffer_pool_chunk_size, + innodb_buffer_pool_chunk_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", NULL, NULL, 0, 0, SIZE_T_MAX, 1024 * 1024); @@ -19525,11 +19543,12 @@ nullptr, innodb_log_file_size_update, 96 << 20, 4 << 20, std::numeric_limits::max(), 4096); -static MYSQL_SYSVAR_UINT(log_spin_wait_delay, mtr_t::spin_wait_delay, - PLUGIN_VAR_OPCMDARG, - "Delay between log buffer spin lock polls (0 to use a blocking latch)", - nullptr, innodb_log_spin_wait_delay_update, - 0, 0, 6000, 0); +static uint innodb_log_spin_wait_delay; + +static MYSQL_SYSVAR_UINT(log_spin_wait_delay, innodb_log_spin_wait_delay, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", + nullptr, nullptr, 0, 0, 6000, 0); static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct, PLUGIN_VAR_RQCMDARG, @@ -19634,37 +19653,10 @@ AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */ AUTOINC_NO_LOCKING, 0); /* Maximum value */ -#ifdef HAVE_URING -# include -static utsname uname_for_io_uring; -#else -static -#endif -bool innodb_use_native_aio_default() -{ -#ifdef HAVE_URING - utsname &u= uname_for_io_uring; - if (!uname(&u) && u.release[0] == '5' && u.release[1] == '.' 
&& - u.release[2] == '1' && u.release[3] >= '1' && u.release[3] <= '5' && - u.release[4] == '.') - { - if (u.release[3] == '5') { - const char *s= strstr(u.version, "5.15."); - if (s || (s= strstr(u.release, "5.15."))) - if ((s[5] >= '3' || s[6] >= '0')) - return true; /* 5.15.3 and later should be fine */ - } - io_uring_may_be_unsafe= u.release; - return false; /* working around io_uring hangs (MDEV-26674) */ - } -#endif - return true; -} - static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Use native AIO if supported on this platform.", - NULL, NULL, innodb_use_native_aio_default()); + NULL, NULL, TRUE); #ifdef HAVE_LIBNUMA static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave, @@ -19953,6 +19945,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(autoextend_increment), MYSQL_SYSVAR(buffer_pool_size), +#if defined __linux__ || !defined DBUG_OFF + MYSQL_SYSVAR(buffer_pool_size_auto_min), +#endif + MYSQL_SYSVAR(buffer_pool_size_max), MYSQL_SYSVAR(buffer_pool_chunk_size), MYSQL_SYSVAR(buffer_pool_filename), MYSQL_SYSVAR(buffer_pool_dump_now), @@ -20079,8 +20075,8 @@ MYSQL_SYSVAR(monitor_reset_all), MYSQL_SYSVAR(purge_threads), MYSQL_SYSVAR(purge_batch_size), -#ifdef UNIV_DEBUG MYSQL_SYSVAR(log_checkpoint_now), +#ifdef UNIV_DEBUG MYSQL_SYSVAR(buf_flush_list_now), MYSQL_SYSVAR(merge_threshold_set_all_debug), #endif /* UNIV_DEBUG */ @@ -21057,90 +21053,6 @@ cs2, to, static_cast(len), errors))); } -/** Validate the requested buffer pool size. Also, reserve the necessary -memory needed for buffer pool resize. -@param[in] thd thread handle -@param[out] save immediate result for update function -@param[in] value incoming string -@return 0 on success, 1 on failure. 
-*/ -static -int -innodb_buffer_pool_size_validate( - THD* thd, - st_mysql_sys_var*, - void* save, - struct st_mysql_value* value) -{ - longlong intbuf; - - value->val_int(value, &intbuf); - - if (static_cast(intbuf) < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) { - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "innodb_buffer_pool_size must be at least" - " %lld for innodb_page_size=%lu", - MYSQL_SYSVAR_NAME(buffer_pool_size).min_val, - srv_page_size); - return(1); - } - - if (!srv_was_started) { - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "Cannot update innodb_buffer_pool_size," - " because InnoDB is not started."); - return(1); - } - - mysql_mutex_lock(&buf_pool.mutex); - - if (srv_buf_pool_old_size != srv_buf_pool_size) { - mysql_mutex_unlock(&buf_pool.mutex); - my_printf_error(ER_WRONG_ARGUMENTS, - "Another buffer pool resize is already in progress.", MYF(0)); - return(1); - } - - ulint requested_buf_pool_size = buf_pool_size_align(ulint(intbuf)); - - *static_cast(save) = requested_buf_pool_size; - - if (srv_buf_pool_size == ulint(intbuf)) { - mysql_mutex_unlock(&buf_pool.mutex); - /* nothing to do */ - return(0); - } - - if (srv_buf_pool_size == requested_buf_pool_size) { - mysql_mutex_unlock(&buf_pool.mutex); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "innodb_buffer_pool_size must be at least" - " innodb_buffer_pool_chunk_size=%zu", - srv_buf_pool_chunk_unit); - /* nothing to do */ - return(0); - } - - srv_buf_pool_size = requested_buf_pool_size; - mysql_mutex_unlock(&buf_pool.mutex); - - if (intbuf != static_cast(requested_buf_pool_size)) { - char buf[64]; - int len = 64; - value->val_str(value, buf, &len); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_TRUNCATED_WRONG_VALUE, - "Truncated incorrect %-.32s value: '%-.128s'", - mysql_sysvar_buffer_pool_size.name, - value->val_str(value, buf, &len)); - } - - return(0); -} - 
/*************************************************************//** Check for a valid value of innobase_compression_algorithm. @return 0 for valid innodb_compression_algorithm. */ @@ -21436,19 +21348,3 @@ if (UNIV_LIKELY_NULL(local_heap)) mem_heap_free(local_heap); } - -/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, -if needed. -@param[in] size size in bytes -@return aligned size */ -ulint buf_pool_size_align(ulint size) noexcept -{ - const size_t m = srv_buf_pool_chunk_unit; - size = ut_max(size, (size_t) MYSQL_SYSVAR_NAME(buffer_pool_size).min_val); - - if (size % m == 0) { - return(size); - } else { - return (size / m + 1) * m; - } -} diff -Nru mariadb-10.11.11/storage/innobase/handler/ha_innodb.h mariadb-10.11.13/storage/innobase/handler/ha_innodb.h --- mariadb-10.11.11/storage/innobase/handler/ha_innodb.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/ha_innodb.h 2025-05-19 16:14:25.000000000 +0000 @@ -101,6 +101,9 @@ int open(const char *name, int mode, uint test_if_locked) override; + /** Fetch or recalculate InnoDB table statistics */ + dberr_t statistics_init(dict_table_t *table, bool recalc); + handler* clone(const char *name, MEM_ROOT *mem_root) override; int close(void) override; diff -Nru mariadb-10.11.11/storage/innobase/handler/handler0alter.cc mariadb-10.11.13/storage/innobase/handler/handler0alter.cc --- mariadb-10.11.11/storage/innobase/handler/handler0alter.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/handler0alter.cc 2025-05-19 16:14:25.000000000 +0000 @@ -621,6 +621,16 @@ } dict_index_t* index = dict_table_get_first_index(this); + if (instant) { + instant->field_map= static_cast( + mem_heap_dup(heap, instant->field_map, + (index->n_fields - + index->first_user_field()) * + sizeof *instant->field_map)); + instant= static_cast( + mem_heap_dup(heap, instant, sizeof *instant)); + } + bool metadata_changed; { const dict_index_t& i = 
*dict_table_get_first_index(&table); @@ -2241,6 +2251,12 @@ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } + if (ha_alter_info->create_info->used_fields + & HA_CREATE_USED_SEQUENCE) { + ha_alter_info->unsupported_reason = "SEQUENCE"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + update_thd(); if (!m_prebuilt->table->space) { @@ -5525,6 +5541,12 @@ return false; } + DBUG_EXECUTE_IF("instant_insert_fail", + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Insert into SYS_COLUMNS failed"); + mem_heap_free(info->heap); + return true;); + if (DB_SUCCESS != que_eval_sql( info, "PROCEDURE ADD_COL () IS\n" @@ -6512,6 +6534,8 @@ DBUG_ASSERT(!ctx->add_index); DBUG_ASSERT(!ctx->add_key_numbers); DBUG_ASSERT(!ctx->num_to_add_index); + DBUG_ASSERT(!(ha_alter_info->create_info->used_fields + & HA_CREATE_USED_SEQUENCE)); user_table = ctx->new_table; @@ -6611,8 +6635,9 @@ mem_heap_alloc(ctx->heap, ctx->num_to_add_index * sizeof *ctx->add_key_numbers)); - const bool fts_exist = ctx->new_table->flags2 + const bool have_fts = user_table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + const bool pause_purge = have_fts || user_table->get_ref_count() > 1; /* Acquire a lock on the table before creating any indexes. */ bool table_lock_failed = false; @@ -6639,13 +6664,18 @@ user_table->lock_shared_unlock(); } - if (fts_exist) { - purge_sys.stop_FTS(*ctx->new_table); + if (pause_purge) { + purge_sys.stop_FTS(); + if (have_fts) { + purge_sys.stop_FTS(*user_table, true); + } if (error == DB_SUCCESS) { - error = fts_lock_tables(ctx->trx, *ctx->new_table); + error = fts_lock_tables(ctx->trx, *user_table); } } + ut_ad(user_table->get_ref_count() == 1); + if (error == DB_SUCCESS) { error = lock_sys_tables(ctx->trx); } @@ -7478,7 +7508,7 @@ /* fts_create_common_tables() may drop old common tables, whose files would be deleted here. 
*/ commit_unlock_and_unlink(ctx->trx); - if (fts_exist) { + if (pause_purge) { purge_sys.resume_FTS(); } @@ -7542,10 +7572,11 @@ } } - /* n_ref_count must be 1, because background threads cannot + /* n_ref_count must be 1 (+ InnoDB_share), + because background threads cannot be executing on this very table as we are holding MDL_EXCLUSIVE. */ - ut_ad(ctx->online || user_table->get_ref_count() == 1); + ut_ad(ctx->online || ((user_table->get_ref_count() - 1) <= 1)); if (new_clustered) { online_retry_drop_indexes_low(user_table, ctx->trx); @@ -7574,7 +7605,7 @@ ctx->trx->free(); } trx_commit_for_mysql(ctx->prebuilt->trx); - if (fts_exist) { + if (pause_purge) { purge_sys.resume_FTS(); } @@ -11180,7 +11211,10 @@ DBUG_ENTER("alter_stats_norebuild"); DBUG_ASSERT(!ctx->need_rebuild()); - if (!dict_stats_is_persistent_enabled(ctx->new_table)) { + auto stat = ctx->new_table->stat; + + if (!dict_table_t::stat_initialized(stat) + || !dict_table_t::stats_is_persistent(stat)) { DBUG_VOID_RETURN; } @@ -11189,7 +11223,6 @@ DBUG_ASSERT(index->table == ctx->new_table); if (!(index->type & DICT_FTS)) { - dict_stats_init(ctx->new_table); dict_stats_update_for_index(index); } } @@ -11214,12 +11247,15 @@ { DBUG_ENTER("alter_stats_rebuild"); - if (!table->space - || !dict_stats_is_persistent_enabled(table)) { + if (!table->space || !table->stats_is_persistent() + || dict_stats_persistent_storage_check(false) != SCHEMA_OK) { DBUG_VOID_RETURN; } - dberr_t ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); + dberr_t ret = dict_stats_update_persistent(table); + if (ret == DB_SUCCESS) { + ret = dict_stats_save(table); + } if (ret != DB_SUCCESS) { push_warning_printf( @@ -11332,6 +11368,13 @@ /* A rollback is being requested. So far we may at most have created stubs for ADD INDEX or a copy of the table for rebuild. */ +#if 0 /* FIXME: is there a better way for innodb.innodb-index-online? 
*/ + lock_shared_ha_data(); + auto share = static_cast(get_ha_share_ptr()); + set_ha_share_ptr(nullptr); + unlock_shared_ha_data(); + delete share; +#endif DBUG_RETURN(rollback_inplace_alter_table( ha_alter_info, table, m_prebuilt)); } @@ -11559,34 +11602,16 @@ } } - dict_table_t *table_stats = nullptr, *index_stats = nullptr; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dict_stats stats; + bool stats_failed = true; dberr_t error = DB_SUCCESS; if (!ctx0->old_table->is_stats_table() && !ctx0->new_table->is_stats_table()) { - table_stats = dict_table_open_on_name( - TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared( - table_stats, m_user_thd, &mdl_table); - dict_sys.unfreeze(); - } - index_stats = dict_table_open_on_name( - INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (index_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared( - index_stats, m_user_thd, &mdl_index); - dict_sys.unfreeze(); - } - - if (table_stats && index_stats - && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) - && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) - && !(error = lock_table_for_trx(table_stats, + stats_failed = stats.open(m_user_thd); + if (!stats_failed + && !(error = lock_table_for_trx(stats.table(), trx, LOCK_X))) { - error = lock_table_for_trx(index_stats, trx, LOCK_X); + error = lock_table_for_trx(stats.index(), trx, LOCK_X); } } @@ -11600,15 +11625,9 @@ error = lock_sys_tables(trx); } if (error != DB_SUCCESS) { - if (table_stats) { - dict_table_close(table_stats, false, m_user_thd, - mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, false, m_user_thd, - mdl_index); + if (!stats_failed) { + stats.close(); } - my_error_innodb(error, table_share->table_name.str, 0); if (fts_exist) { purge_sys.resume_FTS(); } @@ -11624,6 +11643,7 @@ trx_start_for_ddl(trx); } + my_error_innodb(error, table_share->table_name.str, 
0); DBUG_RETURN(true); } @@ -11641,15 +11661,10 @@ fail: trx->rollback(); ut_ad(!trx->fts_trx); - if (table_stats) { - dict_table_close(table_stats, true, m_user_thd, - mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, true, m_user_thd, - mdl_index); - } row_mysql_unlock_data_dictionary(trx); + if (!stats_failed) { + stats.close(); + } if (fts_exist) { purge_sys.resume_FTS(); } @@ -11669,14 +11684,14 @@ if (commit_try_rebuild(ha_alter_info, ctx, altered_table, table, - table_stats && index_stats, + !stats_failed, trx, table_share->table_name.str)) { goto fail; } } else if (commit_try_norebuild(ha_alter_info, ctx, altered_table, table, - table_stats && index_stats, + !stats_failed, trx, table_share->table_name.str)) { goto fail; @@ -11699,13 +11714,6 @@ #endif } - if (table_stats) { - dict_table_close(table_stats, true, m_user_thd, mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, true, m_user_thd, mdl_index); - } - /* Commit or roll back the changes to the data dictionary. 
*/ DEBUG_SYNC(m_user_thd, "innodb_alter_inplace_before_commit"); @@ -11854,6 +11862,9 @@ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", DBUG_SUICIDE();); trx->free(); + if (!stats_failed) { + stats.close(); + } if (fts_exist) { purge_sys.resume_FTS(); } @@ -11910,6 +11921,9 @@ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", DBUG_SUICIDE();); trx->free(); + if (!stats_failed) { + stats.close(); + } if (fts_exist) { purge_sys.resume_FTS(); } diff -Nru mariadb-10.11.11/storage/innobase/handler/i_s.cc mariadb-10.11.13/storage/innobase/handler/i_s.cc --- mariadb-10.11.11/storage/innobase/handler/i_s.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/i_s.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2230,7 +2230,7 @@ DBUG_RETURN(0); } else if (!dict_table_has_fts_index(user_table) || !user_table->is_readable()) { - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(0); } @@ -2245,7 +2245,7 @@ fts_table_fetch_doc_ids(trx, &fts_table, deleted); - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); trx->free(); @@ -2578,7 +2578,7 @@ } if (!user_table->fts || !user_table->fts->cache) { - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(0); } @@ -2603,7 +2603,7 @@ } mysql_mutex_unlock(&cache->lock); - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(ret); } @@ -3020,7 +3020,7 @@ } } - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); ut_free(conv_str.f_str); @@ -3145,7 +3145,7 @@ } if (!dict_table_has_fts_index(user_table)) { - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(0); } @@ -3202,7 +3202,7 @@ fts_sql_commit(trx); - dict_table_close(user_table, false, thd, 
mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); trx->free(); @@ -3388,7 +3388,7 @@ DBUG_RETURN(0); } - buf_stats_get_pool_info(&info); + buf_pool.get_info(&info); table = tables->table; @@ -3937,87 +3937,37 @@ @return 0 on success, 1 on failure */ static int i_s_innodb_buffer_page_fill(THD *thd, TABLE_LIST *tables, Item *) { - int status = 0; - mem_heap_t* heap; - - DBUG_ENTER("i_s_innodb_buffer_page_fill"); - - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); - - /* deny access to user without PROCESS privilege */ - if (check_global_access(thd, PROCESS_ACL)) { - DBUG_RETURN(0); - } - - heap = mem_heap_create(10000); - - for (ulint n = 0; - n < ut_min(buf_pool.n_chunks, buf_pool.n_chunks_new); n++) { - const buf_block_t* block; - ulint n_blocks; - buf_page_info_t* info_buffer; - ulint num_page; - ulint mem_size; - ulint chunk_size; - ulint num_to_process = 0; - ulint block_id = 0; - - /* Get buffer block of the nth chunk */ - block = buf_pool.chunks[n].blocks; - chunk_size = buf_pool.chunks[n].size; - num_page = 0; - - while (chunk_size > 0) { - /* we cache maximum MAX_BUF_INFO_CACHED number of - buffer page info */ - num_to_process = ut_min(chunk_size, - (ulint)MAX_BUF_INFO_CACHED); - - mem_size = num_to_process * sizeof(buf_page_info_t); - - /* For each chunk, we'll pre-allocate information - structures to cache the page information read from - the buffer pool. Doing so before obtain any mutex */ - info_buffer = (buf_page_info_t*) mem_heap_zalloc( - heap, mem_size); - - /* Obtain appropriate mutexes. 
Since this is diagnostic - buffer pool info printout, we are not required to - preserve the overall consistency, so we can - release mutex periodically */ - mysql_mutex_lock(&buf_pool.mutex); - - /* GO through each block in the chunk */ - for (n_blocks = num_to_process; n_blocks--; block++) { - i_s_innodb_buffer_page_get_info( - &block->page, block_id, - info_buffer + num_page); - block_id++; - num_page++; - } - - mysql_mutex_unlock(&buf_pool.mutex); + DBUG_ENTER("i_s_innodb_buffer_page_fill"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); - /* Fill in information schema table with information - just collected from the buffer chunk scan */ - status = i_s_innodb_buffer_page_fill( - thd, tables, info_buffer, - num_page); - - /* If something goes wrong, break and return */ - if (status) { - break; - } - - mem_heap_empty(heap); - chunk_size -= num_to_process; - num_page = 0; - } - } - - mem_heap_free(heap); - - DBUG_RETURN(status); + /* deny access to user without PROCESS privilege */ + if (check_global_access(thd, PROCESS_ACL)) + DBUG_RETURN(0); + + int status; + buf_page_info_t *b= + static_cast(my_malloc(PSI_INSTRUMENT_ME, + MAX_BUF_INFO_CACHED * sizeof *b, + MYF(MY_WME))); + if (!b) + DBUG_RETURN(1); + for (size_t j= 0;;) + { + memset((void*) b, 0, MAX_BUF_INFO_CACHED * sizeof *b); + mysql_mutex_lock(&buf_pool.mutex); + const size_t N= buf_pool.curr_size(); + const size_t n= std::min(N, MAX_BUF_INFO_CACHED); + for (size_t i= 0; i < n && j < N; i++, j++) + i_s_innodb_buffer_page_get_info(&buf_pool.get_nth_page(j)->page, j, + &b[i]); + + mysql_mutex_unlock(&buf_pool.mutex); + status= i_s_innodb_buffer_page_fill(thd, tables, b, n); + if (status || j >= N) + break; + } + my_free(b); + DBUG_RETURN(status); } /*******************************************************************//** @@ -4777,9 +4727,9 @@ OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name.m_name)); - OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized, true)); + 
OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized(), true)); - if (table->stat_initialized) + if (table->stat_initialized()) { OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, true)); diff -Nru mariadb-10.11.11/storage/innobase/ibuf/ibuf0ibuf.cc mariadb-10.11.13/storage/innobase/ibuf/ibuf0ibuf.cc --- mariadb-10.11.11/storage/innobase/ibuf/ibuf0ibuf.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/ibuf/ibuf0ibuf.cc 2025-05-19 16:14:25.000000000 +0000 @@ -375,7 +375,7 @@ ibuf.free_list_len = flst_get_len(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST); - ibuf.height = 1 + btr_page_get_level(root); + ibuf.height = uint8_t(1 + btr_page_get_level(root)); /* the '1 +' is the ibuf header page */ ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len); @@ -443,18 +443,11 @@ goto err_exit; } - /* At startup we intialize ibuf to have a maximum of - CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the - buffer pool size. Once ibuf struct is initialized this - value is updated with the user supplied size by calling - ibuf_max_size_update(). 
*/ - ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) - * CHANGE_BUFFER_DEFAULT_SIZE) / 100; - mysql_mutex_init(ibuf_mutex_key, &ibuf_mutex, nullptr); mysql_mutex_init(ibuf_pessimistic_insert_mutex_key, &ibuf_pessimistic_insert_mutex, nullptr); + ibuf_max_size_update(CHANGE_BUFFER_DEFAULT_SIZE); mysql_mutex_lock(&ibuf_mutex); ibuf_size_update(root); mysql_mutex_unlock(&ibuf_mutex); @@ -506,10 +499,10 @@ percentage of the buffer pool size */ { if (UNIV_UNLIKELY(!ibuf.index)) return; - ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) - * new_val) / 100; + ulint new_size = std::min( + buf_pool.curr_size() * new_val / 100, uint32_t(~0U)); mysql_mutex_lock(&ibuf_mutex); - ibuf.max_size = new_size; + ibuf.max_size = uint32_t(new_size); mysql_mutex_unlock(&ibuf_mutex); } @@ -2061,8 +2054,7 @@ } } - limit = ut_min(IBUF_MAX_N_PAGES_MERGED, - buf_pool_get_curr_size() / 4); + limit = std::min(IBUF_MAX_N_PAGES_MERGED, buf_pool.curr_size() / 4); first_page_no = ibuf_rec_get_page_no(mtr, rec); first_space_id = ibuf_rec_get_space(mtr, rec); @@ -4483,17 +4475,17 @@ return; } - const ulint size= ibuf.size; - const ulint free_list_len= ibuf.free_list_len; - const ulint seg_size= ibuf.seg_size; + const uint32_t size= ibuf.size; + const uint32_t free_list_len= ibuf.free_list_len; + const uint32_t seg_size= ibuf.seg_size; mysql_mutex_unlock(&ibuf_mutex); fprintf(file, "-------------\n" "INSERT BUFFER\n" "-------------\n" - "size " ULINTPF ", free list len " ULINTPF "," - " seg size " ULINTPF ", " ULINTPF " merges\n", + "size %" PRIu32 ", free list len %" PRIu32 "," + " seg size %" PRIu32 ", " ULINTPF " merges\n", size, free_list_len, seg_size, ulint{ibuf.n_merges}); ibuf_print_ops("merged operations:\n", ibuf.n_merged_ops, file); ibuf_print_ops("discarded operations:\n", ibuf.n_discarded_ops, file); diff -Nru mariadb-10.11.11/storage/innobase/include/btr0sea.h mariadb-10.11.13/storage/innobase/include/btr0sea.h --- 
mariadb-10.11.11/storage/innobase/include/btr0sea.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/btr0sea.h 2025-05-19 16:14:25.000000000 +0000 @@ -39,12 +39,16 @@ #define btr_search_sys_create() btr_search_sys.create() #define btr_search_sys_free() btr_search_sys.free() -/** Disable the adaptive hash search system and empty the index. */ -void btr_search_disable(); +/** Lazily free detached metadata when removing the last reference. */ +ATTRIBUTE_COLD void btr_search_lazy_free(dict_index_t *index); + +/** Disable the adaptive hash search system and empty the index. +@return whether the adaptive hash index was enabled */ +ATTRIBUTE_COLD bool btr_search_disable(); /** Enable the adaptive hash search system. @param resize whether buf_pool_t::resize() is the caller */ -void btr_search_enable(bool resize= false); +ATTRIBUTE_COLD void btr_search_enable(bool resize= false); /*********************************************************************//** Updates the search info. 
*/ diff -Nru mariadb-10.11.11/storage/innobase/include/buf0buddy.h mariadb-10.11.13/storage/innobase/include/buf0buddy.h --- mariadb-10.11.11/storage/innobase/include/buf0buddy.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0buddy.h 2025-05-19 16:14:25.000000000 +0000 @@ -24,17 +24,13 @@ Created December 2006 by Marko Makela *******************************************************/ -#ifndef buf0buddy_h -#define buf0buddy_h - +#pragma once #include "buf0types.h" /** @param[in] block size in bytes @return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ -inline -ulint -buf_buddy_get_slot(ulint size) +inline ulint buf_buddy_get_slot(ulint size) noexcept { ulint i; ulint s; @@ -53,13 +49,13 @@ @param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES @param lru assigned to true if buf_pool.mutex was temporarily released @return allocated block, never NULL */ -byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc)); +byte *buf_buddy_alloc_low(ulint i, bool *lru) noexcept MY_ATTRIBUTE((malloc)); /** Allocate a ROW_FORMAT=COMPRESSED block. @param size compressed page size in bytes @param lru assigned to true if buf_pool.mutex was temporarily released @return allocated block, never NULL */ -inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr) +inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr) noexcept { return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru); } @@ -68,24 +64,26 @@ @param[in] buf block to be freed, must not be pointed to by the buffer pool @param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ -void buf_buddy_free_low(void* buf, ulint i); +void buf_buddy_free_low(void* buf, ulint i) noexcept; /** Deallocate a block. 
@param[in] buf block to be freed, must not be pointed to by the buffer pool @param[in] size block size in bytes */ -inline void buf_buddy_free(void* buf, ulint size) +inline void buf_buddy_free(void* buf, ulint size) noexcept { - buf_buddy_free_low(buf, buf_buddy_get_slot(size)); + buf_buddy_free_low(buf, buf_buddy_get_slot(size)); } -/** Try to reallocate a block. -@param[in] buf block to be reallocated, must be pointed -to by the buffer pool -@param[in] size block size, up to srv_page_size -@retval false if failed because of no free blocks. */ -bool buf_buddy_realloc(void* buf, ulint size); - -/** Combine all pairs of free buddies. */ -void buf_buddy_condense_free(); -#endif /* buf0buddy_h */ +ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Reallocate a ROW_FORMAT=COMPRESSED page frame during buf_pool_t::shrink(). +@param bpage page descriptor covering a ROW_FORMAT=COMPRESSED page +@param block uncompressed block for storage +@return block +@retval nullptr if the block was consumed */ +ATTRIBUTE_COLD +buf_block_t *buf_buddy_shrink(buf_page_t *bpage, buf_block_t *block) noexcept; + +/** Combine all pairs of free buddies. +@param size the target innodb_buffer_pool_size */ +ATTRIBUTE_COLD void buf_buddy_condense_free(size_t size) noexcept; diff -Nru mariadb-10.11.11/storage/innobase/include/buf0buf.h mariadb-10.11.13/storage/innobase/include/buf0buf.h --- mariadb-10.11.11/storage/innobase/include/buf0buf.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0buf.h 2025-05-19 16:14:25.000000000 +0000 @@ -35,13 +35,16 @@ #include "assume_aligned.h" #include "buf0types.h" #ifndef UNIV_INNOCHECKSUM -#include "ut0byte.h" #include "page0types.h" #include "log0log.h" #include "srv0srv.h" #include "transactional_lock_guard.h" #include +/** The allocation granularity of innodb_buffer_pool_size */ +constexpr size_t innodb_buffer_pool_extent_size= + sizeof(size_t) < 8 ? 
2 << 20 : 8 << 20; + /** @name Modes for buf_page_get_gen */ /* @{ */ #define BUF_GET 10 /*!< get always */ @@ -71,7 +74,7 @@ ulint pool_size; /*!< Buffer Pool size in pages */ ulint lru_len; /*!< Length of buf_pool.LRU */ ulint old_lru_len; /*!< buf_pool.LRU_old_len */ - ulint free_list_len; /*!< Length of buf_pool.free list */ + ulint free_list_len; /*!< free + lazy_allocate_size() */ ulint flush_list_len; /*!< Length of buf_pool.flush_list */ ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages pending decompress */ @@ -142,10 +145,8 @@ const page_id_t page_id); #ifndef UNIV_INNOCHECKSUM -# define buf_pool_get_curr_size() srv_buf_pool_curr_size # define buf_block_free(block) buf_pool.free_block(block) - -#define buf_page_get(ID, SIZE, LA, MTR) \ +# define buf_page_get(ID, SIZE, LA, MTR) \ buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR) /** Try to buffer-fix a page. @@ -395,9 +396,6 @@ buf_print_io( /*=========*/ FILE* file); /*!< in: file where to print */ -/** Collect buffer pool metadata. -@param[out] pool_info buffer pool metadata */ -void buf_stats_get_pool_info(buf_pool_info_t *pool_info) noexcept; /** Refresh the statistics used to print per-second averages. */ void buf_refresh_io_stats() noexcept; @@ -427,12 +425,6 @@ ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read) noexcept; -/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, -if needed. -@param[in] size size in bytes -@return aligned size */ -ulint buf_pool_size_align(ulint size) noexcept; - /** Verify that post encryption checksum match with the calculated checksum. This function should be called only if tablespace contains crypt data metadata. 
@param page page frame @@ -549,7 +541,7 @@ /** buf_pool.LRU status mask in state() */ static constexpr uint32_t LRU_MASK= 7U << 29; - /** lock covering the contents of frame */ + /** lock covering the contents of frame() */ block_lock lock; /** pointer to aligned, uncompressed page frame of innodb_page_size */ byte *frame; @@ -559,8 +551,6 @@ !frame && !zip.data means an active buf_pool.watch */ page_zip_des_t zip; #ifdef UNIV_DEBUG - /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */ - bool in_zip_hash; /** whether this->LRU is in buf_pool.LRU (in_file()); protected by buf_pool.mutex */ bool in_LRU_list; @@ -574,7 +564,7 @@ /** list member in one of the lists of buf_pool; protected by buf_pool.mutex or buf_pool.flush_list_mutex - state() == NOT_USED: buf_pool.free or buf_pool.withdraw + state() == NOT_USED: buf_pool.free in_file() && oldest_modification(): buf_pool.flush_list (protected by buf_pool.flush_list_mutex) @@ -615,7 +605,7 @@ lock() /* not copied */, frame(b.frame), zip(b.zip), #ifdef UNIV_DEBUG - in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list), + in_LRU_list(b.in_LRU_list), in_page_hash(b.in_page_hash), in_free_list(b.in_free_list), #endif /* UNIV_DEBUG */ list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock), @@ -632,7 +622,6 @@ id_= id; zip.fix= state; oldest_modification_= 0; - ut_d(in_zip_hash= false); ut_d(in_free_list= false); ut_d(in_LRU_list= false); ut_d(in_page_hash= false); @@ -891,10 +880,6 @@ buf_pool.page_hash can point to buf_page_t or buf_block_t */ #ifdef UNIV_DEBUG - /** whether page.list is in buf_pool.withdraw - ((state() == NOT_USED)) and the buffer pool is being shrunk; - protected by buf_pool.mutex */ - bool in_withdraw_list; /** whether unzip_LRU is in buf_pool.unzip_LRU (in_file() && frame && zip.data); protected by buf_pool.mutex */ @@ -1022,15 +1007,10 @@ @param state initial state() */ void initialise(const page_id_t page_id, ulint zip_size, uint32_t state) noexcept; -}; 
-/**********************************************************************//** -Compute the hash fold value for blocks in buf_pool.zip_hash. */ -/* @{ */ -#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift) -#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame) -#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) -/* @} */ + /** Calculate the page frame address */ + IF_DBUG(,inline) byte *frame_address() const noexcept; +}; /** A "Hazard Pointer" class used to iterate over buf_pool.LRU or buf_pool.flush_list. A hazard pointer is a buf_page_t pointer @@ -1198,59 +1178,66 @@ /** The buffer pool */ class buf_pool_t { - /** A chunk of buffers */ - struct chunk_t - { - /** number of elements in blocks[] */ - size_t size; - /** memory allocated for the page frames */ - unsigned char *mem; - /** descriptor of mem */ - ut_new_pfx_t mem_pfx; - /** array of buffer control blocks */ - buf_block_t *blocks; - - /** Map of first page frame address to chunks[] */ - using map= std::map, - ut_allocator>>; - /** Chunk map that may be under construction by buf_resize_thread() */ - static map *map_reg; - /** Current chunk map for lookup only */ - static map *map_ref; - - /** @return the memory size bytes. */ - size_t mem_size() const noexcept { return mem_pfx.m_size; } - - /** Register the chunk */ - void reg() noexcept - { map_reg->emplace(map::value_type(blocks->page.frame, this)); } - - /** Allocate a chunk of buffer frames. 
- @param bytes requested size - @return whether the allocation succeeded */ - inline bool create(size_t bytes) noexcept; + /** arrays of buf_block_t followed by page frames; + aliged to and repeating every innodb_buffer_pool_extent_size; + each extent comprises pages_in_extent[] blocks */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) char *memory; + /** the allocation of the above memory, possibly including some + alignment loss at the beginning */ + char *memory_unaligned; + /** the virtual address range size of memory_unaligned */ + size_t size_unaligned; +#ifdef UNIV_PFS_MEMORY + /** the "owner thread" of the buffer pool allocation */ + PSI_thread *owner; +#endif + /** initialized number of block descriptors */ + size_t n_blocks; + /** number of blocks that need to be freed in shrink() */ + size_t n_blocks_to_withdraw; + /** first block to withdraw in shrink() */ + const buf_page_t *first_to_withdraw; -#ifdef UNIV_DEBUG - /** Find a block that points to a ROW_FORMAT=COMPRESSED page - @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame - @return the block - @retval nullptr if not found */ - const buf_block_t *contains_zip(const void *data) const noexcept - { - const buf_block_t *block= blocks; - for (auto i= size; i--; block++) - if (block->page.zip.data == data) - return block; - return nullptr; - } + /** amount of memory allocated to the buffer pool and descriptors; + protected by mutex */ + Atomic_relaxed size_in_bytes; - /** Check that all blocks are in a replaceable state. 
- @return address of a non-free block - @retval nullptr if all freed */ - inline const buf_block_t *not_freed() const noexcept; -#endif /* UNIV_DEBUG */ - }; public: + /** The requested innodb_buffer_pool_size */ + size_t size_in_bytes_requested; +#if defined __linux__ || !defined DBUG_OFF + /** The minimum allowed innodb_buffer_pool_size in garbage_collect() */ + size_t size_in_bytes_auto_min; +#endif + /** The maximum allowed innodb_buffer_pool_size */ + size_t size_in_bytes_max; + + /** @return the current size of the buffer pool, in bytes */ + size_t curr_pool_size() const noexcept { return size_in_bytes; } + + /** @return the current size of the buffer pool, in pages */ + TPOOL_SUPPRESS_TSAN size_t curr_size() const noexcept { return n_blocks; } + /** @return the maximum usable size of the buffer pool, in pages */ + TPOOL_SUPPRESS_TSAN size_t usable_size() const noexcept + { return n_blocks - n_blocks_to_withdraw - UT_LIST_GET_LEN(withdrawn); } + + /** Determine the used size of the buffer pool in bytes. + @param n_blocks size of the buffer pool in blocks + @return the size needed for n_blocks in bytes, for innodb_page_size */ + static size_t blocks_in_bytes(size_t n_blocks) noexcept; + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) + /** Enable buffers to be dumped to core files. + + A convenience function, not called anyhwere directly however + it is left available for gdb or any debugger to call + in the event that you want all of the memory to be dumped + to a core file. + + @return number of errors found in madvise() calls */ + static int madvise_do_dump() noexcept; +#endif + /** Hash cell chain in page_hash_table */ struct hash_chain { @@ -1258,106 +1245,58 @@ buf_page_t *first; }; private: - /** Withdraw blocks from the buffer pool until meeting withdraw_target. - @return whether retry is needed */ - inline bool withdraw_blocks() noexcept; - - /** Determine if a pointer belongs to a buf_block_t. 
It can be a pointer to - the buf_block_t itself or a member of it. - @param ptr a pointer that will not be dereferenced - @return whether the ptr belongs to a buf_block_t struct */ - bool is_block_field(const void *ptr) const noexcept - { - const chunk_t *chunk= chunks; - const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new); - - /* TODO: protect chunks with a mutex (the older pointer will - currently remain during resize()) */ - for (; chunk < echunk; chunk++) - if (ptr >= reinterpret_cast(chunk->blocks) && - ptr < reinterpret_cast(chunk->blocks + chunk->size)) - return true; - return false; - } - - /** Try to reallocate a control block. - @param block control block to reallocate - @return whether the reallocation succeeded */ - inline bool realloc(buf_block_t *block) noexcept; + /** Determine the number of blocks in a buffer pool of a particular size. + @param size_in_bytes innodb_buffer_pool_size in bytes + @return number of buffer pool pages */ + static size_t get_n_blocks(size_t size_in_bytes) noexcept; + + /** The outcome of shrink() */ + enum shrink_status{SHRINK_DONE= -1, SHRINK_IN_PROGRESS= 0, SHRINK_ABORT}; + + /** Attempt to shrink the buffer pool. + @param size requested innodb_buffer_pool_size in bytes + @retval whether the shrinking was completed */ + ATTRIBUTE_COLD shrink_status shrink(size_t size) noexcept; + + /** Finish shrinking the buffer pool. + @param size the new innodb_buffer_pool_size in bytes + @param reduced how much the innodb_buffer_pool_size was reduced */ + inline void shrunk(size_t size, size_t reduced) noexcept; public: - bool is_initialised() const noexcept { return chunks != nullptr; } + bool is_initialised() const noexcept { return memory != nullptr; } /** Create the buffer pool. @return whether the creation failed */ - bool create(); + bool create() noexcept; /** Clean up after successful create() */ void close() noexcept; - /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. 
*/ - inline void resize(); + /** Resize the buffer pool. + @param size requested innodb_buffer_pool_size in bytes + @param thd current connnection */ + ATTRIBUTE_COLD void resize(size_t size, THD *thd) noexcept; -#ifdef __linux__ /** Collect garbage (release pages from the LRU list) */ - inline void garbage_collect(); -#endif - - /** @return whether resize() is in progress */ - bool resize_in_progress() const noexcept - { - return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed)); - } - - /** @return the current size in blocks */ - size_t get_n_pages() const noexcept - { - ut_ad(is_initialised()); - size_t size= 0; - for (auto j= ut_min(n_chunks_new, n_chunks); j--; ) - size+= chunks[j].size; - return size; - } + inline void garbage_collect() noexcept; - /** Determine whether a frame is intended to be withdrawn during resize(). + /** Determine whether a frame needs to be withdrawn during resize(). @param ptr pointer within a buf_page_t::frame + @param size size_in_bytes_requested @return whether the frame will be withdrawn */ - bool will_be_withdrawn(const byte *ptr) const noexcept + bool will_be_withdrawn(const byte *ptr, size_t size) const noexcept { - ut_ad(n_chunks_new < n_chunks); -#ifdef SAFE_MUTEX - if (resize_in_progress()) - mysql_mutex_assert_owner(&mutex); -#endif /* SAFE_MUTEX */ - - for (const chunk_t *chunk= chunks + n_chunks_new, - * const echunk= chunks + n_chunks; - chunk != echunk; chunk++) - if (ptr >= chunk->blocks->page.frame && - ptr < (chunk->blocks + chunk->size - 1)->page.frame + srv_page_size) - return true; - return false; + const char *p= reinterpret_cast(ptr); + ut_ad(!p || p >= memory); + ut_ad(p < memory + size_in_bytes_max); + return p >= memory + size; } - /** Determine whether a block is intended to be withdrawn during resize(). + /** Withdraw a block if needed in case resize() is shrinking. 
@param bpage buffer pool block - @return whether the frame will be withdrawn */ - bool will_be_withdrawn(const buf_page_t &bpage) const noexcept - { - ut_ad(n_chunks_new < n_chunks); -#ifdef SAFE_MUTEX - if (resize_in_progress()) - mysql_mutex_assert_owner(&mutex); -#endif /* SAFE_MUTEX */ - - for (const chunk_t *chunk= chunks + n_chunks_new, - * const echunk= chunks + n_chunks; - chunk != echunk; chunk++) - if (&bpage >= &chunk->blocks->page && - &bpage < &chunk->blocks[chunk->size].page) - return true; - return false; - } + @return whether the block was withdrawn */ + ATTRIBUTE_COLD bool withdraw(buf_page_t &bpage) noexcept; /** Release and evict a corrupted page. @param bpage x-latched page that was found corrupted @@ -1371,31 +1310,18 @@ #ifdef UNIV_DEBUG /** Find a block that points to a ROW_FORMAT=COMPRESSED page @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame + @param shift number of least significant address bits to ignore @return the block @retval nullptr if not found */ - const buf_block_t *contains_zip(const void *data) const noexcept - { - mysql_mutex_assert_owner(&mutex); - for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks; - chunk != end; chunk++) - if (const buf_block_t *block= chunk->contains_zip(data)) - return block; - return nullptr; - } - + const buf_block_t *contains_zip(const void *data, size_t shift= 0) + const noexcept; /** Assert that all buffer pool pages are in a replaceable state */ void assert_all_freed() noexcept; #endif /* UNIV_DEBUG */ #ifdef BTR_CUR_HASH_ADAPT /** Clear the adaptive hash index on all pages in the buffer pool. */ - inline void clear_hash_index() noexcept; - - /** Get a buffer block from an adaptive hash index pointer. - This function does not return if the block is not identified. 
- @param ptr pointer to within a page frame - @return pointer to block, never NULL */ - inline buf_block_t *block_from_ahi(const byte *ptr) const noexcept; + void clear_hash_index() noexcept; #endif /* BTR_CUR_HASH_ADAPT */ /** @@ -1418,13 +1344,27 @@ return empty_lsn; } - /** Determine if a buffer block was created by chunk_t::create(). - @param block block descriptor (not dereferenced) - @return whether block has been created by chunk_t::create() */ - bool is_uncompressed(const buf_block_t *block) const noexcept + /** Look up the block descriptor for a page frame address. + @param ptr address within a valid page frame + @return the corresponding block descriptor */ + static buf_block_t *block_from(const void *ptr) noexcept; + + /** Access a block while holding the buffer pool mutex. + @param pos position between 0 and get_n_pages() + @return the block descriptor */ + buf_block_t *get_nth_page(size_t pos) const noexcept; + +#ifdef UNIV_DEBUG + /** Determine if an object is within the curr_pool_size() + and associated with an uncompressed page. + @param ptr memory object (not dereferenced) + @return whether the object is valid in the current buffer pool */ + bool is_uncompressed_current(const void *ptr) const noexcept { - return is_block_field(reinterpret_cast(block)); + const ptrdiff_t d= static_cast(ptr) - memory; + return d >= 0 && size_t(d) < curr_pool_size(); } +#endif public: /** page_fix() mode of operation */ @@ -1456,6 +1396,16 @@ buf_block_t *page_fix(const page_id_t id) noexcept { return page_fix(id, nullptr, FIX_WAIT_READ); } + /** Validate a block descriptor. 
+ @param b block descriptor that may be invalid after shrink() + @param latch page_hash latch for id + @param id page identifier + @return b->page.fix() if b->page.id() == id + @retval 0 if b is invalid */ + TRANSACTIONAL_TARGET + uint32_t page_guess(buf_block_t *b, page_hash_latch &latch, + const page_id_t id) noexcept; + /** Decompress a page and relocate the block descriptor @param b buffer-fixed compressed-only ROW_FORMAT=COMPRESSED page @param chain hash table chain for b->id().fold() @@ -1477,7 +1427,6 @@ buf_page_t *bpage= page_hash.get(page_id, chain); if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)]) { - ut_ad(!bpage->in_zip_hash); ut_ad(!bpage->zip.data); if (!allow_watch) bpage= nullptr; @@ -1498,7 +1447,6 @@ ut_ad(bpage.in_file()); if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)]) return false; - ut_ad(!bpage.in_zip_hash); ut_ad(!bpage.zip.data); return true; } @@ -1539,23 +1487,30 @@ inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain) noexcept; /** @return whether less than 1/4 of the buffer pool is available */ - TPOOL_SUPPRESS_TSAN - bool running_out() const noexcept - { - return !recv_recovery_is_on() && - UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < - (n_chunks_new * chunks->size) / 4; - } + bool running_out() const noexcept; /** @return whether the buffer pool is running low */ bool need_LRU_eviction() const noexcept; - /** @return whether the buffer pool is shrinking */ - inline bool is_shrinking() const noexcept + /** @return number of blocks resize() needs to evict from the buffer pool */ + size_t is_shrinking() const noexcept + { + mysql_mutex_assert_owner(&mutex); + return n_blocks_to_withdraw + UT_LIST_GET_LEN(withdrawn); + } + + /** @return number of blocks in resize() waiting to be withdrawn */ + size_t to_withdraw() const noexcept { - return n_chunks_new < n_chunks; + mysql_mutex_assert_owner(&mutex); + return n_blocks_to_withdraw; } + /** @return the shrinking size of the buffer pool, in bytes + 
@retval 0 if resize() is not shrinking the buffer pool */ + size_t shrinking_size() const noexcept + { return is_shrinking() ? size_in_bytes_requested : 0; } + #ifdef UNIV_DEBUG /** Validate the buffer pool. */ void validate() noexcept; @@ -1572,7 +1527,6 @@ mysql_mutex_assert_owner(&mutex); ut_ad(bpage->in_LRU_list); ut_ad(bpage->in_page_hash); - ut_ad(!bpage->in_zip_hash); ut_ad(bpage->in_file()); lru_hp.adjust(bpage); lru_scan_itr.adjust(bpage); @@ -1592,26 +1546,8 @@ /** @name General fields */ /* @{ */ - ulint curr_pool_size; /*!< Current pool size in bytes */ ulint LRU_old_ratio; /*!< Reserve this much of the buffer pool for "old" blocks */ -#ifdef UNIV_DEBUG - ulint buddy_n_frames; /*!< Number of frames allocated from - the buffer pool to the buddy system */ - ulint mutex_exit_forbidden; /*!< Forbid release mutex */ -#endif - ut_allocator allocator; /*!< Allocator used for - allocating memory for the the "chunks" - member. */ - ulint n_chunks; /*!< number of buffer pool chunks */ - ulint n_chunks_new; /*!< new number of buffer pool chunks. - both n_chunks{,new} are protected under - mutex */ - chunk_t* chunks; /*!< buffer pool chunks */ - chunk_t* chunks_old; /*!< old buffer pool chunks to be freed - after resizing buffer pool */ - /** current pool size in pages */ - Atomic_counter curr_size; /** read-ahead request size in pages */ Atomic_counter read_ahead_area; @@ -1723,12 +1659,6 @@ /** Look up a page in a hash bucket chain. */ inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const noexcept; - - /** Exclusively aqcuire all latches */ - inline void write_lock_all() noexcept; - - /** Release all latches */ - inline void write_unlock_all() noexcept; }; /** Buffer pool mutex */ @@ -1745,9 +1675,6 @@ indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). 
*/ page_hash_table page_hash; - /** map of block->frame to buf_block_t blocks that belong - to buf_buddy_alloc(); protected by buf_pool.mutex */ - hash_table_t zip_hash; /** number of pending unzip() */ Atomic_counter n_pend_unzip; @@ -1878,30 +1805,29 @@ Set whenever the free list grows, along with a broadcast of done_free. Protected by buf_pool.mutex. */ Atomic_relaxed try_LRU_scan; - /** Whether we have warned to be running out of buffer pool */ - std::atomic_flag LRU_warned; /* @} */ /** @name LRU replacement algorithm fields */ /* @{ */ - UT_LIST_BASE_NODE_T(buf_page_t) free; - /*!< base node of the free - block list */ +private: + /** Whether we have warned to be running out of buffer pool; + only modified by buf_flush_page_cleaner(): + set while holding mutex, cleared while holding flush_list_mutex */ + Atomic_relaxed LRU_warned; + + /** withdrawn blocks during resize() */ + UT_LIST_BASE_NODE_T(buf_page_t) withdrawn; + +public: + /** list of blocks available for allocate() */ + UT_LIST_BASE_NODE_T(buf_page_t) free; + /** broadcast each time when the free list grows or try_LRU_scan is set; protected by mutex */ pthread_cond_t done_free; - UT_LIST_BASE_NODE_T(buf_page_t) withdraw; - /*!< base node of the withdraw - block list. It is only used during - shrinking buffer pool size, not to - reuse the blocks will be removed */ - - ulint withdraw_target;/*!< target length of withdraw - block list, when withdrawing */ - /** "hazard pointer" used during scan of LRU while doing LRU list batch. Protected by buf_pool_t::mutex. */ LRUHp lru_hp; @@ -1942,10 +1868,22 @@ /** Sentinels to detect if pages are read into the buffer pool while a delete-buffering operation is pending. Protected by mutex. */ buf_page_t watch[innodb_purge_threads_MAX + 1]; + + /** Clear LRU_warned */ + void LRU_warned_clear() noexcept + { + mysql_mutex_assert_owner(&flush_list_mutex); + LRU_warned= false; + } + /** Reserve a buffer. 
*/ buf_tmp_buffer_t *io_buf_reserve(bool wait_for_reads) noexcept { return io_buf.reserve(wait_for_reads); } + /** Try to allocate a block. + @return a buffer block + @retval nullptr if no blocks are available */ + buf_block_t *allocate() noexcept; /** Remove a block from flush_list. @param bpage buffer pool page */ void delete_from_flush_list(buf_page_t *bpage) noexcept; @@ -1968,6 +1906,13 @@ /** Issue a warning that we could not free up buffer pool pages. */ ATTRIBUTE_COLD void LRU_warn() noexcept; + /** Print buffer pool flush state information. */ + ATTRIBUTE_COLD void print_flush_info() const noexcept; + + /** Collect buffer pool metadata. + @param pool_info buffer pool metadata */ + void get_info(buf_pool_info_t *pool_info) noexcept; + private: /** Temporary memory for page_compressed and encrypted I/O */ struct io_buf_t @@ -1984,9 +1929,6 @@ /** Reserve a buffer */ buf_tmp_buffer_t *reserve(bool wait_for_reads) noexcept; } io_buf; - - /** whether resize() is in the critical path */ - std::atomic resizing; }; /** The InnoDB buffer pool */ @@ -2135,24 +2077,6 @@ this->old= old; } -#ifdef UNIV_DEBUG -/** Forbid the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_forbid() do { \ - mysql_mutex_assert_owner(&buf_pool.mutex); \ - buf_pool.mutex_exit_forbidden++; \ -} while (0) -/** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow() do { \ - mysql_mutex_assert_owner(&buf_pool.mutex); \ - ut_ad(buf_pool.mutex_exit_forbidden--); \ -} while (0) -#else -/** Forbid the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_forbid() ((void) 0) -/** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow() ((void) 0) -#endif - /********************************************************************** Let us list the consistency conditions for different control block states. 
diff -Nru mariadb-10.11.11/storage/innobase/include/buf0buf.inl mariadb-10.11.13/storage/innobase/include/buf0buf.inl --- mariadb-10.11.11/storage/innobase/include/buf0buf.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0buf.inl 2025-05-19 16:14:25.000000000 +0000 @@ -37,7 +37,7 @@ /* FIXME: bpage->freed_page_clock is 31 bits */ return((buf_pool.freed_page_clock & ((1UL << 31) - 1)) < (bpage->freed_page_clock - + (buf_pool.curr_size + + (buf_pool.curr_size() * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio) / (BUF_LRU_OLD_RATIO_DIV * 4)))); } diff -Nru mariadb-10.11.11/storage/innobase/include/buf0dblwr.h mariadb-10.11.13/storage/innobase/include/buf0dblwr.h --- mariadb-10.11.11/storage/innobase/include/buf0dblwr.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0dblwr.h 2025-05-19 16:14:25.000000000 +0000 @@ -159,6 +159,9 @@ my_cond_wait(&cond, &mutex.m_mutex); mysql_mutex_unlock(&mutex); } + + /** Print double write state information. */ + ATTRIBUTE_COLD void print_info() const noexcept; }; /** The doublewrite buffer */ diff -Nru mariadb-10.11.11/storage/innobase/include/buf0lru.h mariadb-10.11.13/storage/innobase/include/buf0lru.h --- mariadb-10.11.11/storage/innobase/include/buf0lru.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0lru.h 2025-05-19 16:14:25.000000000 +0000 @@ -55,10 +55,6 @@ @return true if found and freed */ bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED); -/** @return a buffer block from the buf_pool.free list -@retval NULL if the free list is empty */ -buf_block_t* buf_LRU_get_free_only(); - /** Get a block from the buf_pool.free list. If the list is empty, blocks will be moved from the end of buf_pool.LRU to buf_pool.free. 
diff -Nru mariadb-10.11.11/storage/innobase/include/dict0dict.h mariadb-10.11.13/storage/innobase/include/dict0dict.h --- mariadb-10.11.11/storage/innobase/include/dict0dict.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0dict.h 2025-05-19 16:14:25.000000000 +0000 @@ -146,21 +146,21 @@ MDL_ticket **mdl= nullptr) MY_ATTRIBUTE((warn_unused_result)); -/** Decrement the count of open handles */ -void dict_table_close(dict_table_t *table); - -/** Decrements the count of open handles of a table. -@param[in,out] table table -@param[in] dict_locked whether dict_sys.latch is being held -@param[in] thd thread to release MDL -@param[in] mdl metadata lock or NULL if the thread is a - foreground one. */ -void -dict_table_close( - dict_table_t* table, - bool dict_locked, - THD* thd = NULL, - MDL_ticket* mdl = NULL); +/** Release a metadata lock. +@param thd connection that holds mdl +@param mdl metadata lock, or nullptr */ +void mdl_release(THD *thd, MDL_ticket *mdl) noexcept; + +/** Release a table reference and a metadata lock. +@param table referenced table +@param thd connection that holds mdl +@param mdl metadata lock, or nullptr */ +inline void dict_table_close(dict_table_t* table, THD *thd, MDL_ticket *mdl) + noexcept +{ + table->release(); + mdl_release(thd, mdl); +} /*********************************************************************//** Gets the minimum number of bytes per character. 
@@ -674,7 +674,7 @@ @return estimated number of rows */ inline uint64_t dict_table_get_n_rows(const dict_table_t *table) { - ut_ad(table->stat_initialized); + ut_ad(table->stat_initialized()); return table->stat_n_rows; } @@ -1657,6 +1657,27 @@ dict_table_have_virtual_index( dict_table_t* table); +/** Helper for opening the InnoDB persistent statistics tables */ +class dict_stats final +{ + MDL_context *mdl_context= nullptr; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t *table_stats= nullptr, *index_stats= nullptr; + +public: + dict_stats()= default; + + /** Open the statistics tables. + @return whether the operation failed */ + bool open(THD *thd) noexcept; + + /** Close the statistics tables after !open_tables(thd). */ + void close() noexcept; + + dict_table_t *table() const noexcept { return table_stats; } + dict_table_t *index() const noexcept { return index_stats; } +}; + #include "dict0dict.inl" #endif diff -Nru mariadb-10.11.11/storage/innobase/include/dict0dict.inl mariadb-10.11.13/storage/innobase/include/dict0dict.inl --- mariadb-10.11.11/storage/innobase/include/dict0dict.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0dict.inl 2025-05-19 16:14:25.000000000 +0000 @@ -1076,8 +1076,8 @@ /** Acquire the table handle. */ inline void dict_table_t::acquire() { - ut_ad(dict_sys.frozen()); - n_ref_count++; + ut_d(const auto old=) n_ref_count++; + ut_ad(old || dict_sys.frozen()); } /** Release the table handle. diff -Nru mariadb-10.11.11/storage/innobase/include/dict0mem.h mariadb-10.11.13/storage/innobase/include/dict0mem.h --- mariadb-10.11.11/storage/innobase/include/dict0mem.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0mem.h 2025-05-19 16:14:25.000000000 +0000 @@ -1106,15 +1106,12 @@ is indexed from 0 to n_uniq-1); This is used when innodb_stats_method is "nulls_ignored". 
*/ - ulint stat_index_size; + uint32_t stat_index_size; /*!< approximate index size in database pages */ - ulint stat_n_leaf_pages; + uint32_t stat_n_leaf_pages; /*!< approximate number of leaf pages in the index tree */ - bool stats_error_printed; - /*!< has persistent statistics error printed - for this index ? */ /* @} */ /** Statistics for defragmentation, these numbers are estimations and could be very inaccurate at certain times, e.g. right after restart, @@ -2358,63 +2355,32 @@ /** Statistics for query optimization. Mostly protected by dict_sys.latch and stats_mutex_lock(). @{ */ - /** TRUE if statistics have been calculated the first time after - database startup or table creation. */ - unsigned stat_initialized:1; - /** Timestamp of last recalc of the stats. */ time_t stats_last_recalc; - /** The two bits below are set in the 'stat_persistent' member. They - have the following meaning: - 1. _ON=0, _OFF=0, no explicit persistent stats setting for this table, - the value of the global srv_stats_persistent is used to determine - whether the table has persistent stats enabled or not - 2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this - table, regardless of the value of the global srv_stats_persistent - 3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this - table, regardless of the value of the global srv_stats_persistent - 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */ - #define DICT_STATS_PERSISTENT_ON (1 << 1) - #define DICT_STATS_PERSISTENT_OFF (1 << 2) - - /** Indicates whether the table uses persistent stats or not. See - DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */ - ib_uint32_t stat_persistent; - - /** The two bits below are set in the 'stats_auto_recalc' member. They - have the following meaning: - 1. 
_ON=0, _OFF=0, no explicit auto recalc setting for this table, the - value of the global srv_stats_persistent_auto_recalc is used to - determine whether the table has auto recalc enabled or not - 2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table, - regardless of the value of the global srv_stats_persistent_auto_recalc - 3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table, - regardless of the value of the global srv_stats_persistent_auto_recalc - 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */ - #define DICT_STATS_AUTO_RECALC_ON (1 << 1) - #define DICT_STATS_AUTO_RECALC_OFF (1 << 2) - - /** Indicates whether the table uses automatic recalc for persistent - stats or not. See DICT_STATS_AUTO_RECALC_ON and - DICT_STATS_AUTO_RECALC_OFF. */ - ib_uint32_t stats_auto_recalc; - - /** The number of pages to sample for this table during persistent - stats estimation. If this is 0, then the value of the global - srv_stats_persistent_sample_pages will be used instead. */ - ulint stats_sample_pages; + static constexpr uint32_t STATS_INITIALIZED= 1U; + static constexpr uint32_t STATS_PERSISTENT_ON= 1U << 1; + static constexpr uint32_t STATS_PERSISTENT_OFF= 1U << 2; + static constexpr uint32_t STATS_AUTO_RECALC_ON= 1U << 3; + static constexpr uint32_t STATS_AUTO_RECALC_OFF= 1U << 4; + + /** flags for index cardinality statistics */ + Atomic_relaxed stat; + /** Approximate clustered index size in database pages. */ + uint32_t stat_clustered_index_size; + /** Approximate size of other indexes in database pages. */ + uint32_t stat_sum_of_other_index_sizes; + + + /** The number of pages to sample for this table during persistent + stats estimation. If this is 0, then the value of the global + srv_stats_persistent_sample_pages will be used instead. */ + uint32_t stats_sample_pages; /** Approximate number of rows in the table. We periodically calculate new estimates. 
*/ ib_uint64_t stat_n_rows; - /** Approximate clustered index size in database pages. */ - ulint stat_clustered_index_size; - - /** Approximate size of other indexes in database pages. */ - ulint stat_sum_of_other_index_sizes; - /** How many rows are modified since last stats recalc. When a row is inserted, updated, or deleted, we add 1 to this number; we calculate new estimates for the table and the indexes if the table has changed @@ -2424,7 +2390,7 @@ ib_uint64_t stat_modified_counter; bool stats_error_printed; - /*!< Has persistent stats error beein + /*!< Has persistent stats error been already printed for this table ? */ /* @} */ @@ -2551,6 +2517,35 @@ /** @return the index for that starts with a specific column */ dict_index_t *get_index(const dict_col_t &col) const; + /** @return whether the statistics are initialized */ + static bool stat_initialized(uint32_t stat) noexcept + { return stat & STATS_INITIALIZED; } + + /** @return whether STATS_PERSISTENT is enabled */ + static bool stats_is_persistent(uint32_t stat) noexcept + { + ut_ad(~(stat & (STATS_PERSISTENT_ON | STATS_PERSISTENT_OFF))); + if (stat & STATS_PERSISTENT_ON) return true; + return !(stat & STATS_PERSISTENT_OFF) && srv_stats_persistent; + } + /** @return whether STATS_AUTO_RECALC is enabled */ + static bool stats_is_auto_recalc(uint32_t stat) noexcept + { + ut_ad(stat_initialized(stat)); + ut_ad(~(stat & (STATS_AUTO_RECALC_ON | STATS_AUTO_RECALC_OFF))); + if (stat & STATS_AUTO_RECALC_ON) return true; + return !(stat & STATS_AUTO_RECALC_OFF) && srv_stats_auto_recalc; + } + + /** @return whether the statistics are initialized */ + bool stat_initialized() const noexcept { return stat_initialized(stat); } + /** @return whether STATS_PERSISTENT is enabled */ + bool stats_is_persistent() const noexcept + { return stats_is_persistent(stat); } + /** @return whether STATS_AUTO_RECALC is enabled */ + bool stats_is_auto_recalc() const noexcept + { return stats_is_auto_recalc(stat); } + /** Create 
metadata. @param name table name @param space tablespace diff -Nru mariadb-10.11.11/storage/innobase/include/dict0stats.h mariadb-10.11.13/storage/innobase/include/dict0stats.h --- mariadb-10.11.11/storage/innobase/include/dict0stats.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0stats.h 2025-05-19 16:14:25.000000000 +0000 @@ -30,84 +30,6 @@ #include "dict0types.h" #include "trx0types.h" -enum dict_stats_upd_option_t { - DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the - statistics using a precise and slow - algo and save them to the persistent - storage, if the persistent storage is - not present then emit a warning and - fall back to transient stats */ - DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics - using an imprecise quick algo - without saving the results - persistently */ - DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense) - into a table and its indexes' statistics - members. The resulting stats correspond to an - empty table. If the table is using persistent - statistics, then they are saved on disk. */ - DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats - from the persistent storage if the in-memory - structures have not been initialized yet, - otherwise do nothing */ -}; - -/*********************************************************************//** -Set the persistent statistics flag for a given table. This is set only -in the in-memory table object and is not saved on disk. It will be read -from the .frm file upon first open from MySQL after a server restart. 
*/ -UNIV_INLINE -void -dict_stats_set_persistent( -/*======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool ps_on, /*!< in: persistent stats explicitly enabled */ - ibool ps_off) /*!< in: persistent stats explicitly disabled */ - MY_ATTRIBUTE((nonnull)); - -/** @return whether persistent statistics is enabled for a given table */ -UNIV_INLINE -bool -dict_stats_is_persistent_enabled(const dict_table_t* table) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - -/*********************************************************************//** -Set the auto recalc flag for a given table (only honored for a persistent -stats enabled table). The flag is set only in the in-memory table object -and is not saved in InnoDB files. It will be read from the .frm file upon -first open from MySQL after a server restart. */ -UNIV_INLINE -void -dict_stats_auto_recalc_set( -/*=======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool auto_recalc_on, /*!< in: explicitly enabled */ - ibool auto_recalc_off); /*!< in: explicitly disabled */ - -/** @return whether auto recalc is enabled for a given table*/ -UNIV_INLINE -bool -dict_stats_auto_recalc_is_enabled(const dict_table_t* table) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - -/*********************************************************************//** -Initialize table's stats for the first time when opening a table. */ -UNIV_INLINE -void -dict_stats_init( -/*============*/ - dict_table_t* table); /*!< in/out: table */ - -/*********************************************************************//** -Deinitialize table's stats after the last close of the table. This is -used to detect "FLUSH TABLE" and refresh the stats upon next open. 
*/ -UNIV_INLINE -void -dict_stats_deinit( -/*==============*/ - dict_table_t* table) /*!< in/out: table */ - MY_ATTRIBUTE((nonnull)); - #ifdef WITH_WSREP /** Update the table modification counter and if necessary, schedule new estimates for table and index statistics to be calculated. @@ -124,19 +46,6 @@ # define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t) #endif -/*********************************************************************//** -Calculates new estimates for table and index statistics. The statistics -are used in query optimization. -@return DB_* error code or DB_SUCCESS */ -dberr_t -dict_stats_update( -/*==============*/ - dict_table_t* table, /*!< in/out: table */ - dict_stats_upd_option_t stats_upd_option); - /*!< in: whether to (re) calc - the stats or to fetch them from - the persistent storage */ - /** Execute DELETE FROM mysql.innodb_table_stats @param database_name database name @param table_name table name @@ -173,6 +82,50 @@ dict_index_t* index) /*!< in/out: index */ MY_ATTRIBUTE((nonnull)); +enum dict_stats_schema_check { + /** The InnoDB persistent statistics tables do not exist. */ + SCHEMA_NOT_EXIST= -1, + /** The schema of the InnoDB persistent statistics tables is valid. */ + SCHEMA_OK= 0, + /** The schema is invalid. */ + SCHEMA_INVALID +}; + +/** @return whether the persistent statistics storage is usable */ +dict_stats_schema_check +dict_stats_persistent_storage_check(bool dict_already_locked= false) noexcept; + +/** Save the persistent statistics of a table or an index. +@param table table whose stats to save +@param only_for_index the index ID to save statistics for (0=all) +@return DB_SUCCESS or error code */ +dberr_t dict_stats_save(dict_table_t* table, index_id_t index_id= 0); + +/** Read the stored persistent statistics of a table. */ +dberr_t dict_stats_fetch_from_ps(dict_table_t *table); + +/** +Calculate new estimates for table and index statistics. 
This function +is relatively quick and is used to calculate non-persistent statistics. +@param table table for which the non-persistent statistics are being updated +@return error code +@retval DB_SUCCESS_LOCKED REC if the table under bulk insert operation */ +dberr_t dict_stats_update_transient(dict_table_t *table) noexcept; + +/** +Calculate new estimates for table and index statistics. This function +is slower than dict_stats_update_transient(). +@param table table for which the persistent statistics are being updated +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ +dberr_t dict_stats_update_persistent(dict_table_t *table) noexcept; + +/** +Try to calculate and save new estimates for persistent statistics. +If persistent statistics are not enabled for the table or not available, +this does nothing. */ +dberr_t dict_stats_update_persistent_try(dict_table_t *table); + /** Rename a table in InnoDB persistent stats storage. @param old_name old table name @param new_name new table name @@ -229,8 +182,6 @@ dict_stats_report_error(dict_table_t* table, bool defragment = false) MY_ATTRIBUTE((nonnull, warn_unused_result)); -#include "dict0stats.inl" - #ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS void test_dict_stats_all(); #endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ @@ -244,4 +195,8 @@ dict_stats_empty_table( dict_table_t* table, bool empty_defrag_stats); + +/** Clear the statistics for a table and save them if +persistent statistics are enabled. 
*/ +void dict_stats_empty_table_and_save(dict_table_t *table); #endif /* dict0stats_h */ diff -Nru mariadb-10.11.11/storage/innobase/include/dict0stats.inl mariadb-10.11.13/storage/innobase/include/dict0stats.inl --- mariadb-10.11.11/storage/innobase/include/dict0stats.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0stats.inl 1970-01-01 00:00:00.000000000 +0000 @@ -1,219 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/dict0stats.ic -Code used for calculating and manipulating table statistics. - -Created Jan 23, 2012 Vasil Dimov -*******************************************************/ - -#include "dict0dict.h" -#include "srv0srv.h" - -/*********************************************************************//** -Set the persistent statistics flag for a given table. This is set only -in the in-memory table object and is not saved on disk. It will be read -from the .frm file upon first open from MySQL after a server restart. 
*/ -UNIV_INLINE -void -dict_stats_set_persistent( -/*======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool ps_on, /*!< in: persistent stats explicitly enabled */ - ibool ps_off) /*!< in: persistent stats explicitly disabled */ -{ - /* Not allowed to have both flags set, but a CREATE or ALTER - statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would - end up having both set. In this case we clear the OFF flag. */ - if (ps_on && ps_off) { - ps_off = FALSE; - } - - ib_uint32_t stat_persistent = 0; - - if (ps_on) { - stat_persistent |= DICT_STATS_PERSISTENT_ON; - } - - if (ps_off) { - stat_persistent |= DICT_STATS_PERSISTENT_OFF; - } - - /* we rely on this assignment to be atomic */ - table->stat_persistent = stat_persistent; -} - -/** @return whether persistent statistics is enabled for a given table */ -UNIV_INLINE -bool -dict_stats_is_persistent_enabled(const dict_table_t* table) -{ - /* Because of the nature of this check (non-locking) it is possible - that a table becomes: - * PS-disabled immediately after this function has returned TRUE or - * PS-enabled immediately after this function has returned FALSE. - This means that it is possible that we do: - + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has - just been PS-disabled or - + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has - just been PS-enabled. - This is acceptable. Avoiding this would mean that we would have to - hold dict_sys.latch or stats_mutex_lock() like for accessing the - other ::stat_ members which would be too big performance penalty, - especially when this function is called from - dict_stats_update_if_needed(). 
*/ - - /* we rely on this read to be atomic */ - ib_uint32_t stat_persistent = table->stat_persistent; - - if (stat_persistent & DICT_STATS_PERSISTENT_ON) { - ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); - return(true); - } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { - return(false); - } else { - return(srv_stats_persistent); - } -} - -/*********************************************************************//** -Set the auto recalc flag for a given table (only honored for a persistent -stats enabled table). The flag is set only in the in-memory table object -and is not saved in InnoDB files. It will be read from the .frm file upon -first open from MySQL after a server restart. */ -UNIV_INLINE -void -dict_stats_auto_recalc_set( -/*=======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool auto_recalc_on, /*!< in: explicitly enabled */ - ibool auto_recalc_off) /*!< in: explicitly disabled */ -{ - ut_ad(!auto_recalc_on || !auto_recalc_off); - - ib_uint32_t stats_auto_recalc = 0; - - if (auto_recalc_on) { - stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; - } - - if (auto_recalc_off) { - stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; - } - - /* we rely on this assignment to be atomic */ - table->stats_auto_recalc = stats_auto_recalc; -} - -/** @return whether auto recalc is enabled for a given table*/ -UNIV_INLINE -bool -dict_stats_auto_recalc_is_enabled(const dict_table_t* table) -{ - /* we rely on this read to be atomic */ - ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; - - if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { - ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); - return(true); - } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { - return(false); - } else { - return(srv_stats_auto_recalc); - } -} - -/*********************************************************************//** -Initialize table's stats for the first time when opening a table. 
*/ -UNIV_INLINE -void -dict_stats_init( -/*============*/ - dict_table_t* table) /*!< in/out: table */ -{ - ut_ad(!table->stats_mutex_is_owner()); - - if (table->stat_initialized) { - return; - } - - dict_stats_upd_option_t opt; - - if (dict_stats_is_persistent_enabled(table)) { - opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; - } else { - opt = DICT_STATS_RECALC_TRANSIENT; - } - - dict_stats_update(table, opt); -} - -/*********************************************************************//** -Deinitialize table's stats after the last close of the table. This is -used to detect "FLUSH TABLE" and refresh the stats upon next open. */ -UNIV_INLINE -void -dict_stats_deinit( -/*==============*/ - dict_table_t* table) /*!< in/out: table */ -{ - ut_ad(table->stats_mutex_is_owner()); - ut_ad(table->get_ref_count() == 0); - -#ifdef HAVE_valgrind - if (!table->stat_initialized) { - return; - } - - MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows); - MEM_UNDEFINED(&table->stat_clustered_index_size, - sizeof table->stat_clustered_index_size); - MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes, - sizeof table->stat_sum_of_other_index_sizes); - MEM_UNDEFINED(&table->stat_modified_counter, - sizeof table->stat_modified_counter); - - dict_index_t* index; - - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - MEM_UNDEFINED( - index->stat_n_diff_key_vals, - index->n_uniq - * sizeof index->stat_n_diff_key_vals[0]); - MEM_UNDEFINED( - index->stat_n_sample_sizes, - index->n_uniq - * sizeof index->stat_n_sample_sizes[0]); - MEM_UNDEFINED( - index->stat_n_non_null_key_vals, - index->n_uniq - * sizeof index->stat_n_non_null_key_vals[0]); - MEM_UNDEFINED( - &index->stat_index_size, - sizeof(index->stat_index_size)); - MEM_UNDEFINED( - &index->stat_n_leaf_pages, - sizeof(index->stat_n_leaf_pages)); - } -#endif /* HAVE_valgrind */ - table->stat_initialized = FALSE; -} diff -Nru 
mariadb-10.11.11/storage/innobase/include/fil0fil.h mariadb-10.11.13/storage/innobase/include/fil0fil.h --- mariadb-10.11.11/storage/innobase/include/fil0fil.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/fil0fil.h 2025-05-19 16:14:25.000000000 +0000 @@ -351,7 +351,7 @@ /** fil_system.spaces chain node */ fil_space_t *hash= nullptr; /** log_sys.get_lsn() of the most recent fil_names_write_if_was_clean(). - Reset to 0 by fil_names_clear(). Protected by log_sys.mutex. + Reset to 0 by fil_names_clear(). Protected by log_sys.latch_have_wr(). If and only if this is nonzero, the tablespace will be in named_spaces. */ lsn_t max_lsn= 0; /** base node for the chain of data files; multiple entries are @@ -422,7 +422,7 @@ bool being_imported= false; /** Whether any corrupton of this tablespace has been reported */ - mutable std::atomic_flag is_corrupted{false}; + mutable std::atomic_flag is_corrupted= ATOMIC_FLAG_INIT; public: /** mutex to protect freed_ranges and last_freed_lsn */ @@ -1527,7 +1527,10 @@ inline void fil_space_t::reacquire() noexcept { - ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed); +#ifdef SAFE_MUTEX + uint32_t n= +#endif + n_pending.fetch_add(1, std::memory_order_relaxed); #ifdef SAFE_MUTEX if (mysql_mutex_is_owner(&fil_system.mutex)) return; ut_ad(n & PENDING); diff -Nru mariadb-10.11.11/storage/innobase/include/fsp0fsp.h mariadb-10.11.13/storage/innobase/include/fsp0fsp.h --- mariadb-10.11.11/storage/innobase/include/fsp0fsp.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/fsp0fsp.h 2025-05-19 16:14:25.000000000 +0000 @@ -355,9 +355,9 @@ @param[out] used number of pages that are used (not more than reserved) @param[in,out] mtr mini-transaction @return number of reserved pages */ -ulint fseg_n_reserved_pages(const buf_block_t &block, - const fseg_header_t *header, ulint *used, - mtr_t *mtr) +uint32_t fseg_n_reserved_pages(const buf_block_t &block, + const 
fseg_header_t *header, uint32_t *used, + mtr_t *mtr) noexcept MY_ATTRIBUTE((nonnull)); /**********************************************************************//** Allocates a single free page from a segment. This function implements diff -Nru mariadb-10.11.11/storage/innobase/include/ibuf0ibuf.h mariadb-10.11.13/storage/innobase/include/ibuf0ibuf.h --- mariadb-10.11.11/storage/innobase/include/ibuf0ibuf.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/ibuf0ibuf.h 2025-05-19 16:14:25.000000000 +0000 @@ -62,11 +62,11 @@ /** Insert buffer struct */ struct ibuf_t{ - Atomic_relaxed size; /*!< current size of the ibuf index + Atomic_relaxed size; /*!< current size of the ibuf index tree, in pages */ - Atomic_relaxed max_size; /*!< recommended maximum size of the + Atomic_relaxed max_size;/*!< recommended maximum size of the ibuf index tree, in pages */ - ulint seg_size; /*!< allocated pages of the file + uint32_t seg_size; /*!< allocated pages of the file segment containing ibuf header and tree */ bool empty; /*!< Protected by the page @@ -75,8 +75,8 @@ (FSP_IBUF_TREE_ROOT_PAGE_NO). true if and only if the insert buffer tree is empty. */ - ulint free_list_len; /*!< length of the free list */ - ulint height; /*!< tree height */ + uint8_t height; /*!< tree height */ + uint32_t free_list_len; /*!< length of the free list */ dict_index_t* index; /*!< insert buffer index */ /** number of pages merged */ diff -Nru mariadb-10.11.11/storage/innobase/include/log0log.h mariadb-10.11.13/storage/innobase/include/log0log.h --- mariadb-10.11.11/storage/innobase/include/log0log.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/log0log.h 2025-05-19 16:14:25.000000000 +0000 @@ -64,20 +64,19 @@ /** Write to the log file up to the last log entry. 
@param durable whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool durable= true); - +void log_buffer_flush_to_disk(bool durable= true) noexcept; /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */ -ATTRIBUTE_COLD void log_write_and_flush_prepare(); +ATTRIBUTE_COLD void log_write_and_flush_prepare() noexcept; /** Durably write the log up to log_sys.get_lsn(). */ -ATTRIBUTE_COLD void log_write_and_flush(); +ATTRIBUTE_COLD void log_write_and_flush() noexcept; /** Make a checkpoint */ -ATTRIBUTE_COLD void log_make_checkpoint(); +ATTRIBUTE_COLD void log_make_checkpoint() noexcept; /** Make a checkpoint at the latest lsn on shutdown. */ -ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown(); +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() noexcept; /******************************************************//** Prints info of the log. */ @@ -167,40 +166,35 @@ static constexpr lsn_t FIRST_LSN= START_OFFSET; private: - /** the lock bit in buf_free */ - static constexpr size_t buf_free_LOCK= ~(~size_t{0} >> 1); + /** the least significant bit of the write_to_buf buffer */ + static constexpr size_t WRITE_TO_BUF_SHIFT{34}; + /** write_lsn_offset component for incrementing write_to_buf */ + static constexpr uint64_t WRITE_TO_BUF{1ULL << WRITE_TO_BUF_SHIFT}; + /** write_lsn_offset flag to indicate that append_prepare_wait() is active */ + static constexpr uint64_t WRITE_BACKOFF{1ULL << 33}; + + /** The current log sequence number, relative to base_lsn, and flags; + may be modified while latch_have_any() */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) - /** first free offset within buf used; - the most significant bit is set by lock_lsn() to protect this field - as well as write_to_buf, waits */ - std::atomic buf_free; -public: - /** number of write requests (to buf); protected by lock_lsn() or lsn_lock */ - size_t write_to_buf; - /** log record buffer, written to by mtr_t::commit() */ - byte *buf; -private: - 
/** The log sequence number of the last change of durable InnoDB files; - protected by lock_lsn() or lsn_lock or latch.wr_lock() */ - std::atomic lsn; + Atomic_relaxed write_lsn_offset; + /** the LSN of the last write_buf() or persist(); protected by latch */ + std::atomic base_lsn; /** the first guaranteed-durable log sequence number */ std::atomic flushed_to_disk_lsn; public: - /** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */ - size_t waits; - /** innodb_log_buffer_size (size of buf,flush_buf if !is_mmap(), in bytes) */ + /** innodb_log_buffer_size (usable append_prepare() size in bytes) */ unsigned buf_size; /** log file size in bytes, including the header */ lsn_t file_size; #ifdef LOG_LATCH_DEBUG typedef srw_lock_debug log_rwlock; - typedef srw_mutex log_lsn_lock; bool latch_have_wr() const { return latch.have_wr(); } bool latch_have_rd() const { return latch.have_rd(); } bool latch_have_any() const { return latch.have_any(); } #else + typedef srw_lock log_rwlock; # ifndef UNIV_DEBUG # elif defined SUX_LOCK_GENERIC bool latch_have_wr() const { return true; } @@ -211,23 +205,23 @@ bool latch_have_rd() const { return latch.is_locked(); } bool latch_have_any() const { return latch.is_locked(); } # endif -# ifdef __aarch64__ - /* On ARM, we spin more */ - typedef srw_spin_lock log_rwlock; - typedef pthread_mutex_wrapper log_lsn_lock; -# else - typedef srw_lock log_rwlock; - typedef srw_mutex log_lsn_lock; -# endif #endif - /** exclusive latch for checkpoint, shared for mtr_t::commit() to buf */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock latch; + /** latch_have_wr() for checkpoint, latch_have_any() for append_prepare() */ + log_rwlock latch; + + /** log record buffer, written to by mtr_t::commit() */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) byte *buf; + + /** number of write requests to buf, + excluding (write_lsn_offset & WRITE_TO_BUF); + protected by latch.wr_lock() */ + size_t write_to_buf; /** number of writes from buf or flush_buf 
to log; protected by latch.wr_lock() */ - ulint write_to_log; + size_t write_to_log; - /** Last written LSN */ + /** Last written LSN; protected by latch */ lsn_t write_lsn; /** Buffer for writing data to ib_logfile0, or nullptr if is_mmap(). @@ -241,8 +235,6 @@ Atomic_relaxed checkpoint_pending; /** next checkpoint number (protected by latch.wr_lock()) */ byte next_checkpoint_no; - /** recommended maximum buf_free size, after which the buffer is flushed */ - unsigned max_buf_free; /** Log sequence number when a log file overwrite (broken crash recovery) was noticed. Protected by latch.wr_lock(). */ lsn_t overwrite_warned; @@ -266,12 +258,6 @@ /** Buffer for writing to resize_log; @see flush_buf */ byte *resize_flush_buf; - /** Special implementation of lock_lsn() for IA-32 and AMD64 */ - void lsn_lock_bts() noexcept; - /** Acquire a lock for updating buf_free and related fields. - @return the value of buf_free */ - size_t lock_lsn() noexcept; - /** log sequence number when log resizing was initiated; 0 if the log is not being resized, 1 if resize_start() is in progress */ std::atomic resize_lsn; @@ -303,7 +289,6 @@ bool log_maybe_unbuffered; # endif #endif - /** Fields involved in checkpoints @{ */ lsn_t log_capacity; /*!< capacity of the log; if the checkpoint age exceeds this, it is @@ -326,34 +311,26 @@ /* @} */ private: - /** A lock when the spin-only lock_lsn() is not being used */ - log_lsn_lock lsn_lock; + /** the thread that initiated resize_lsn() */ + Atomic_relaxed resize_initiator; +#ifdef HAVE_PMEM + /** mutex protecting wrap-around in resize_write() */ + srw_mutex resize_wrap_mutex; +#endif public: + /** number of long append_prepare_wait(); protected by latch_have_wr() */ + size_t waits; - bool is_initialised() const noexcept { return max_buf_free != 0; } - - /** whether there is capacity in the log buffer */ - bool buf_free_ok() const noexcept - { - ut_ad(!is_mmap()); - return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) < - 
max_buf_free; - } - + bool is_initialised() const noexcept + { return base_lsn.load(std::memory_order_relaxed) != 0; } inline void set_recovered() noexcept; - void set_buf_free(size_t f) noexcept - { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); } - bool is_mmap() const noexcept { return !flush_buf; } /** @return whether a handle to the log is open; is_mmap() && !is_opened() holds for PMEM */ bool is_opened() const noexcept { return log.is_opened(); } - /** @return target write LSN to react on !buf_free_ok() */ - inline lsn_t get_write_target() const; - /** @return LSN at which log resizing was started and is still in progress @retval 0 if no log resizing is in progress @retval 1 if resize_start() is in progress */ @@ -367,11 +344,17 @@ /** Start resizing the log and release the exclusive latch. @param size requested new file_size + @param thd the current thread identifier @return whether the resizing was started successfully */ - resize_start_status resize_start(os_offset_t size) noexcept; + resize_start_status resize_start(os_offset_t size, void *thd) noexcept; - /** Abort any resize_start(). */ - void resize_abort() noexcept; + /** Abort a resize_start() that we started. + @param thd thread identifier that had been passed to resize_start() */ + void resize_abort(void *thd) noexcept; + + /** @return whether a particular resize_start() is in progress */ + bool resize_running(void *thd) const noexcept + { return thd == resize_initiator; } /** Replicate a write to the log. @param lsn start LSN @@ -400,53 +383,64 @@ { return resize_buf + resize_target; } /** Initialise the redo log subsystem. */ - void create(); + void create() noexcept; /** Attach a log file. 
@return whether the memory allocation succeeded */ - bool attach(log_file_t file, os_offset_t size); + bool attach(log_file_t file, os_offset_t size) noexcept; /** Disable memory-mapped access (update log_mmap) */ - void clear_mmap(); - void close_file(bool really_close= true); + void clear_mmap() noexcept; + void close_file(bool really_close= true) noexcept; #if defined __linux__ || defined _WIN32 /** Try to enable or disable file system caching (update log_buffered) */ - void set_buffered(bool buffered); + void set_buffered(bool buffered) noexcept; #endif /** Calculate the checkpoint safety margins. */ - static void set_capacity(); + static void set_capacity() noexcept; /** Write a log file header. @param buf log header buffer @param lsn log sequence number corresponding to log_sys.START_OFFSET @param encrypted whether the log is encrypted */ - static void header_write(byte *buf, lsn_t lsn, bool encrypted); + static void header_write(byte *buf, lsn_t lsn, bool encrypted) noexcept; - lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const - { return lsn.load(order); } + /** @return a lower bound estimate of get_lsn(), + using acquire-release ordering with write_buf() or persist(); + this is exact unless append_prepare_wait() is pending */ + lsn_t get_lsn_approx() const noexcept + { + /* acquire-release ordering with write_buf() and persist() */ + lsn_t lsn= base_lsn.load(std::memory_order_acquire); + lsn += write_lsn_offset.load(std::memory_order_relaxed) & + (WRITE_BACKOFF - 1); + return lsn; + } + + /** @return the current log sequence number (logical time stamp) */ + lsn_t get_lsn() const noexcept + { + ut_ad(latch_have_wr()); + return base_lsn.load(std::memory_order_relaxed) + + (write_lsn_offset & (WRITE_BACKOFF - 1)); + } lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) const noexcept { return flushed_to_disk_lsn.load(order); } /** Initialize the LSN on initial log file creation. 
*/ - lsn_t init_lsn() noexcept - { - latch.wr_lock(SRW_LOCK_CALL); - const lsn_t lsn{get_lsn()}; - flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); - write_lsn= lsn; - latch.wr_unlock(); - return lsn; - } + inline lsn_t init_lsn() noexcept; void set_recovered_lsn(lsn_t lsn) noexcept { ut_ad(latch_have_wr()); - write_lsn= lsn; - this->lsn.store(lsn, std::memory_order_relaxed); + uint64_t lsn_offset= ((write_size - 1) & (lsn - first_lsn)); + write_lsn_offset= lsn_offset; + base_lsn.store(lsn - lsn_offset, std::memory_order_relaxed); flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; } #ifdef HAVE_PMEM @@ -481,25 +475,19 @@ private: /** Update writer and mtr_t::finisher */ - void writer_update() noexcept; + void writer_update(bool resizing) noexcept; /** Wait in append_prepare() for buffer to become available - @tparam spin whether to use the spin-only lock_lsn() - @param b the value of buf_free - @param ex whether log_sys.latch is exclusively locked - @param lsn log sequence number to write up to - @return the new value of buf_free */ - template - ATTRIBUTE_COLD size_t append_prepare_wait(size_t b, bool ex, lsn_t lsn) - noexcept; + @param late whether the WRITE_BACKOFF flag had already been set + @param ex whether log_sys.latch is exclusively locked */ + ATTRIBUTE_COLD void append_prepare_wait(bool late, bool ex) noexcept; public: /** Reserve space in the log buffer for appending data. - @tparam spin whether to use the spin-only lock_lsn() @tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ - template + template std::pair append_prepare(size_t size, bool ex) noexcept; /** Append a string of bytes to the redo log. @@ -570,7 +558,10 @@ /** Wait for a log checkpoint if needed. 
NOTE that this function may only be called while not holding any synchronization objects except dict_sys.latch. */ -void log_free_check(); +void log_free_check() noexcept; + +/** @return the current log sequence number (may be stale) */ +lsn_t log_get_lsn() noexcept; /** Release the latches that protect log resizing. */ -void log_resize_release(); +void log_resize_release() noexcept; diff -Nru mariadb-10.11.11/storage/innobase/include/log0recv.h mariadb-10.11.13/storage/innobase/include/log0recv.h --- mariadb-10.11.11/storage/innobase/include/log0recv.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/log0recv.h 2025-05-19 16:14:25.000000000 +0000 @@ -118,15 +118,17 @@ const fil_space_t *space= nullptr, byte *tmp_buf= nullptr) const noexcept; - /** Find the doublewrite copy of an encrypted page with the - smallest FIL_PAGE_LSN that is large enough for recovery. + /** Find the doublewrite copy of an encrypted/page_compressed + page with the smallest FIL_PAGE_LSN that is large enough for + recovery. @param space tablespace object @param page_no page number to find - @param buf buffer for unencrypted page + @param buf buffer for unencrypted/uncompressed page @return buf @retval nullptr if the page was not found in doublewrite buffer */ - byte *find_encrypted_page(const fil_node_t &space, uint32_t page_no, - byte *buf) noexcept; + ATTRIBUTE_COLD byte *find_deferred_page(const fil_node_t &space, + uint32_t page_no, + byte *buf) noexcept; /** Restore the first page of the given tablespace from doublewrite buffer. 
diff -Nru mariadb-10.11.11/storage/innobase/include/mtr0mtr.h mariadb-10.11.13/storage/innobase/include/mtr0mtr.h --- mariadb-10.11.11/storage/innobase/include/mtr0mtr.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/mtr0mtr.h 2025-05-19 16:14:25.000000000 +0000 @@ -700,19 +700,19 @@ @param mtr mini-transaction @param lsns {start_lsn,flush_ahead} */ template - static void commit_log(mtr_t *mtr, std::pair lsns); + static void commit_log(mtr_t *mtr, std::pair lsns) + noexcept; /** Append the redo log records to the redo log buffer. @return {start_lsn,flush_ahead} */ std::pair do_write(); /** Append the redo log records to the redo log buffer. - @tparam spin whether to use the spin-only log_sys.lock_lsn() @tparam mmap log_sys.is_mmap() @param mtr mini-transaction @param len number of bytes to write @return {start_lsn,flush_ahead} */ - template static + template static std::pair finish_writer(mtr_t *mtr, size_t len); /** The applicable variant of commit_log() */ @@ -723,9 +723,6 @@ std::pair finish_write(size_t len) { return finisher(this, len); } public: - /** Poll interval in log_sys.lock_lsn(); 0 to use log_sys.lsn_lock. - Protected by LOCK_global_system_variables and log_sys.latch. */ - static unsigned spin_wait_delay; /** Update finisher when spin_wait_delay is changing to or from 0. */ static void finisher_update(); private: diff -Nru mariadb-10.11.11/storage/innobase/include/os0file.h mariadb-10.11.13/storage/innobase/include/os0file.h --- mariadb-10.11.11/storage/innobase/include/os0file.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/os0file.h 2025-05-19 16:14:25.000000000 +0000 @@ -1003,6 +1003,8 @@ size_t os_aio_pending_reads_approx() noexcept; /** @return number of pending writes */ size_t os_aio_pending_writes() noexcept; +/** @return approximate number of pending writes */ +size_t os_aio_pending_writes_approx() noexcept; /** Wait until there are no pending asynchronous writes. 
@param declare whether the wait will be declared in tpool */ diff -Nru mariadb-10.11.11/storage/innobase/include/row0row.h mariadb-10.11.13/storage/innobase/include/row0row.h --- mariadb-10.11.11/storage/innobase/include/row0row.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/row0row.h 2025-05-19 16:14:25.000000000 +0000 @@ -328,22 +328,6 @@ mtr_t* mtr) /*!< in: mtr */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Parse the integer data from specified data, which could be -DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0 -and the type is not unsigned then we reset the value to 0 -@param[in] data data to read -@param[in] len length of data -@param[in] mtype mtype of data -@param[in] unsigned_type if the data is unsigned -@return the integer value from the data */ -inline -ib_uint64_t -row_parse_int( - const byte* data, - ulint len, - ulint mtype, - bool unsigned_type); - /** Result of row_search_index_entry */ enum row_search_result { ROW_FOUND = 0, /*!< the record was found */ diff -Nru mariadb-10.11.11/storage/innobase/include/row0row.inl mariadb-10.11.13/storage/innobase/include/row0row.inl --- mariadb-10.11.11/storage/innobase/include/row0row.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/row0row.inl 2025-05-19 16:14:25.000000000 +0000 @@ -170,52 +170,3 @@ } } } - -/** Parse the integer data from specified data, which could be -DATA_INT, DATA_FLOAT or DATA_DOUBLE. 
If the value is less than 0 -and the type is not unsigned then we reset the value to 0 -@param[in] data data to read -@param[in] len length of data -@param[in] mtype mtype of data -@param[in] unsigned_type if the data is unsigned -@return the integer value from the data */ -ib_uint64_t -row_parse_int( - const byte* data, - ulint len, - ulint mtype, - bool unsigned_type) -{ - ib_uint64_t value = 0; - - switch (mtype) { - case DATA_INT: - - ut_a(len <= sizeof value); - value = mach_read_int_type(data, len, unsigned_type); - break; - - case DATA_FLOAT: - - ut_a(len == sizeof(float)); - value = static_cast(mach_float_read(data)); - break; - - case DATA_DOUBLE: - - ut_a(len == sizeof(double)); - value = static_cast(mach_double_read(data)); - break; - - default: - ut_error; - - } - - if (!unsigned_type && static_cast(value) < 0) { - value = 0; - } - - return(value); -} - diff -Nru mariadb-10.11.11/storage/innobase/include/row0sel.h mariadb-10.11.13/storage/innobase/include/row0sel.h --- mariadb-10.11.11/storage/innobase/include/row0sel.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/row0sel.h 2025-05-19 16:14:25.000000000 +0000 @@ -182,9 +182,8 @@ @param[in] index index starting with an AUTO_INCREMENT column @return the largest AUTO_INCREMENT value @retval 0 if no records were found */ -ib_uint64_t -row_search_max_autoinc(dict_index_t* index) - MY_ATTRIBUTE((nonnull, warn_unused_result)); +uint64_t row_search_max_autoinc(dict_index_t *index) noexcept + MY_ATTRIBUTE((nonnull, warn_unused_result)); /** A structure for caching column values for prefetched rows */ struct sel_buf_t{ diff -Nru mariadb-10.11.11/storage/innobase/include/srv0srv.h mariadb-10.11.13/storage/innobase/include/srv0srv.h --- mariadb-10.11.11/storage/innobase/include/srv0srv.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/srv0srv.h 2025-05-19 16:14:25.000000000 +0000 @@ -223,17 +223,6 @@ extern my_bool srv_adaptive_flushing; extern 
my_bool srv_flush_sync; -/** Requested size in bytes */ -extern ulint srv_buf_pool_size; -/** Requested buffer pool chunk size */ -extern size_t srv_buf_pool_chunk_unit; -/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ -/** Previously requested size */ -extern ulint srv_buf_pool_old_size; -/** Current size as scaling factor for the other components */ -extern ulint srv_buf_pool_base_size; -/** Current size in bytes */ -extern ulint srv_buf_pool_curr_size; /** Dump this % of each buffer pool during BP dump */ extern ulong srv_buf_pool_dump_pct; #ifdef UNIV_DEBUG @@ -267,8 +256,8 @@ /* We use this dummy default value at startup for max_io_capacity. The real value is set based on the value of io_capacity. */ -#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL) -#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL) +#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (UINT32_MAX) +#define SRV_MAX_IO_CAPACITY_LIMIT (UINT32_MAX) extern ulong srv_max_io_capacity; /* The "innodb_stats_method" setting, decides how InnoDB is going @@ -294,9 +283,9 @@ extern ibool srv_innodb_status; -extern unsigned long long srv_stats_transient_sample_pages; +extern uint32_t srv_stats_transient_sample_pages; extern my_bool srv_stats_persistent; -extern unsigned long long srv_stats_persistent_sample_pages; +extern uint32_t srv_stats_persistent_sample_pages; extern my_bool srv_stats_auto_recalc; extern my_bool srv_stats_include_delete_marked; extern unsigned long long srv_stats_modified_counter; @@ -596,7 +585,7 @@ #endif /* BTR_CUR_HASH_ADAPT */ char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */ char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */ - char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */ + char innodb_buffer_pool_resize_status[65];/*!< Buf pool resize status */ my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool 
size */ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ diff -Nru mariadb-10.11.11/storage/innobase/include/trx0trx.h mariadb-10.11.13/storage/innobase/include/trx0trx.h --- mariadb-10.11.11/storage/innobase/include/trx0trx.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/trx0trx.h 2025-05-19 16:14:25.000000000 +0000 @@ -809,8 +809,13 @@ /** normally set; "SET unique_checks=0, foreign_key_checks=0" enables bulk insert into an empty table */ unsigned check_unique_secondary:1; - /** whether an insert into an empty table is active */ - unsigned bulk_insert:1; + /** whether an insert into an empty table is active + Possible states are + TRX_NO_BULK + TRX_DML_BULK + TRX_DDL_BULK + @see trx_bulk_insert in trx0types.h */ + unsigned bulk_insert:2; /*------------------------------*/ /* MySQL has a transaction coordinator to coordinate two phase commit between multiple storage engines and the binary log. When @@ -1117,6 +1122,7 @@ ut_ad(!is_not_inheriting_locks()); ut_ad(check_foreigns); ut_ad(check_unique_secondary); + ut_ad(bulk_insert == TRX_NO_BULK); } /** This has to be invoked on SAVEPOINT or at the end of a statement. @@ -1142,6 +1148,8 @@ rollback to the start of a statement will work. */ void end_bulk_insert() { + if (bulk_insert == TRX_DDL_BULK) + return; for (auto& t : mod_tables) t.second.end_bulk_insert(); } @@ -1149,7 +1157,15 @@ /** @return whether a bulk insert into empty table is in progress */ bool is_bulk_insert() const { - if (!bulk_insert || check_unique_secondary || check_foreigns) + switch (bulk_insert) { + case TRX_NO_BULK: + return false; + case TRX_DDL_BULK: + return true; + default: + ut_ad(bulk_insert == TRX_DML_BULK); + } + if (check_unique_secondary || check_foreigns) return false; for (const auto& t : mod_tables) if (t.second.is_bulk_insert()) @@ -1179,9 +1195,11 @@ /** Do the bulk insert for the buffered insert operation for the transaction. 
@return DB_SUCCESS or error code */ + template dberr_t bulk_insert_apply() { - return UNIV_UNLIKELY(bulk_insert) ? bulk_insert_apply_low(): DB_SUCCESS; + static_assert(type != TRX_NO_BULK, ""); + return bulk_insert == type ? bulk_insert_apply_low(): DB_SUCCESS; } private: diff -Nru mariadb-10.11.11/storage/innobase/include/trx0types.h mariadb-10.11.13/storage/innobase/include/trx0types.h --- mariadb-10.11.11/storage/innobase/include/trx0types.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/trx0types.h 2025-05-19 16:14:25.000000000 +0000 @@ -65,6 +65,15 @@ TRX_STATE_COMMITTED_IN_MEMORY }; +/** Transaction bulk insert operation @see trx_t::bulk_insert */ +enum trx_bulk_insert { + TRX_NO_BULK, + /** bulk insert is being executed during DML */ + TRX_DML_BULK, + /** bulk insert is being executed in copy_data_between_tables() */ + TRX_DDL_BULK +}; + /** Memory objects */ /* @{ */ /** Transaction */ diff -Nru mariadb-10.11.11/storage/innobase/include/ut0new.h mariadb-10.11.13/storage/innobase/include/ut0new.h --- mariadb-10.11.11/storage/innobase/include/ut0new.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/ut0new.h 2025-05-19 16:14:25.000000000 +0000 @@ -277,7 +277,6 @@ #ifdef UNIV_PFS_MEMORY /** Default constructor. */ - explicit ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED) : m_key(key) { diff -Nru mariadb-10.11.11/storage/innobase/lock/lock0lock.cc mariadb-10.11.13/storage/innobase/lock/lock0lock.cc --- mariadb-10.11.11/storage/innobase/lock/lock0lock.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/lock/lock0lock.cc 2025-05-19 16:14:25.000000000 +0000 @@ -4140,13 +4140,12 @@ children.end()) continue; /* We already acquired MDL on this child table. 
*/ MDL_ticket *mdl= nullptr; - child->acquire(); child= dict_acquire_mdl_shared(child, mdl_context, &mdl, DICT_TABLE_OP_NORMAL); if (child) { - if (!mdl) - child->release(); + if (mdl) + child->acquire(); children.emplace_back(table_mdl{child, mdl}); goto rescan; } @@ -6053,17 +6052,10 @@ for it */ trx_t *trx = thr_get_trx(thr); - if (const trx_t *owner = - lock_rec_convert_impl_to_expl(trx, *block, - rec, index, offsets)) { - if (owner == trx) { - /* We already hold an exclusive lock. */ - return DB_SUCCESS; - } - - if (trx->snapshot_isolation && trx->read_view.is_open()) { - return DB_RECORD_CHANGED; - } + if (lock_rec_convert_impl_to_expl(trx, *block, + rec, index, offsets) == trx) { + /* We already hold an exclusive lock. */ + return DB_SUCCESS; } err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP, @@ -6225,19 +6217,11 @@ return DB_SUCCESS; } - if (page_rec_is_supremum(rec)) { - } else if (const trx_t *owner = - lock_rec_convert_impl_to_expl(trx, *block, - rec, index, offsets)) { - if (owner == trx) { - if (gap_mode == LOCK_REC_NOT_GAP) { - /* We already hold an exclusive lock. */ - return DB_SUCCESS; - } - } else if (trx->snapshot_isolation - && trx->read_view.is_open()) { - return DB_RECORD_CHANGED; - } + if (!page_rec_is_supremum(rec) + && lock_rec_convert_impl_to_expl(trx, *block, rec, index, + offsets) == trx + && gap_mode == LOCK_REC_NOT_GAP) { + return DB_SUCCESS; } #ifdef WITH_WSREP @@ -6317,28 +6301,24 @@ trx_t *trx = thr_get_trx(thr); if (lock_table_has(trx, index->table, LOCK_X) || heap_no == PAGE_HEAP_NO_SUPREMUM) { - } else if (const trx_t *owner = - lock_rec_convert_impl_to_expl(trx, *block, - rec, index, offsets)) { - if (owner == trx) { - if (gap_mode == LOCK_REC_NOT_GAP) { - /* We already hold an exclusive lock. 
*/ - return DB_SUCCESS; - } - } else if (trx->snapshot_isolation - && trx->read_view.is_open()) { - return DB_RECORD_CHANGED; - } + } else if (lock_rec_convert_impl_to_expl(trx, *block, rec, index, + offsets) == trx + && gap_mode == LOCK_REC_NOT_GAP) { + /* We already hold an exclusive lock. */ + return DB_SUCCESS; } if (heap_no > PAGE_HEAP_NO_SUPREMUM && gap_mode != LOCK_GAP && trx->snapshot_isolation - && trx->read_view.is_open() - && !trx->read_view.changes_visible( - trx_read_trx_id(rec + row_trx_id_offset(rec, index))) - && IF_WSREP(!(trx->is_wsrep() + && trx->read_view.is_open()) { + trx_id_t trx_id= trx_read_trx_id(rec + + row_trx_id_offset(rec, index)); + if (!trx_sys.is_registered(trx, trx_id) + && !trx->read_view.changes_visible(trx_id) + && IF_WSREP(!(trx->is_wsrep() && wsrep_thd_skip_locking(trx->mysql_thd)), true)) { - return DB_RECORD_CHANGED; + return DB_RECORD_CHANGED; + } } dberr_t err = lock_rec_lock(false, gap_mode | mode, @@ -7109,10 +7089,6 @@ victim->lock.was_chosen_as_deadlock_victim= true; DEBUG_SYNC_C("deadlock_report_before_lock_releasing"); lock_cancel_waiting_and_release(victim->lock.wait_lock); -#ifdef WITH_WSREP - if (victim->is_wsrep() && wsrep_thd_is_SR(victim->mysql_thd)) - wsrep_handle_SR_rollback(trx->mysql_thd, victim->mysql_thd); -#endif } func_exit: diff -Nru mariadb-10.11.11/storage/innobase/log/log0crypt.cc mariadb-10.11.13/storage/innobase/log/log0crypt.cc --- mariadb-10.11.11/storage/innobase/log/log0crypt.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/log/log0crypt.cc 2025-05-19 16:14:25.000000000 +0000 @@ -566,7 +566,7 @@ alignas(8) byte iv[MY_AES_BLOCK_SIZE]; - m_commit_lsn= log_sys.get_lsn(); + m_commit_lsn= log_sys.get_flushed_lsn(); ut_ad(m_commit_lsn); byte *tmp= static_cast(alloca(srv_page_size)), *t= tmp; byte *dst= static_cast(alloca(srv_page_size)); diff -Nru mariadb-10.11.11/storage/innobase/log/log0log.cc mariadb-10.11.13/storage/innobase/log/log0log.cc --- 
mariadb-10.11.11/storage/innobase/log/log0log.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/log/log0log.cc 2025-05-19 16:14:25.000000000 +0000 @@ -68,7 +68,7 @@ #define LOG_BUF_FLUSH_MARGIN ((4 * 4096) /* cf. log_t::append_prepare() */ \ + (4U << srv_page_size_shift)) -void log_t::set_capacity() +void log_t::set_capacity() noexcept { ut_ad(log_sys.latch_have_wr()); /* Margin for the free space in the smallest log, before a new query @@ -87,13 +87,15 @@ log_sys.max_checkpoint_age = margin; } -void log_t::create() +void log_t::create() noexcept { ut_ad(this == &log_sys); ut_ad(!is_initialised()); + latch.SRW_LOCK_INIT(log_latch_key); + write_lsn_offset= 0; /* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */ - lsn.store(FIRST_LSN, std::memory_order_relaxed); + base_lsn.store(FIRST_LSN, std::memory_order_relaxed); flushed_to_disk_lsn.store(FIRST_LSN, std::memory_order_relaxed); need_checkpoint.store(true, std::memory_order_relaxed); write_lsn= FIRST_LSN; @@ -102,10 +104,10 @@ ut_ad(!buf); ut_ad(!flush_buf); ut_ad(!writer); - max_buf_free= 1; - latch.SRW_LOCK_INIT(log_latch_key); - lsn_lock.init(); +#ifdef HAVE_PMEM + resize_wrap_mutex.init(); +#endif last_checkpoint_lsn= FIRST_LSN; log_capacity= 0; @@ -114,8 +116,6 @@ next_checkpoint_lsn= 0; checkpoint_pending= false; - set_buf_free(0); - ut_ad(is_initialised()); } @@ -306,7 +306,7 @@ #if defined __linux__ || defined _WIN32 /** Display a message about opening the log */ -ATTRIBUTE_COLD static void log_file_message() +ATTRIBUTE_COLD static void log_file_message() noexcept { sql_print_information("InnoDB: %s (block size=%u bytes)", log_sys.log_mmap @@ -320,10 +320,10 @@ log_sys.write_size); } #else -static inline void log_file_message() {} +static inline void log_file_message() noexcept {} #endif -bool log_t::attach(log_file_t file, os_offset_t size) +bool log_t::attach(log_file_t file, os_offset_t size) noexcept { log= file; ut_ad(!size || size >= START_OFFSET + 
SIZE_OF_FILE_CHECKPOINT); @@ -352,8 +352,7 @@ } # endif buf= static_cast(ptr); - max_buf_free= 1; - writer_update(); + writer_update(false); # ifdef HAVE_PMEM if (is_pmem) return true; @@ -366,7 +365,7 @@ if (!buf) { alloc_fail: - max_buf_free= 0; + base_lsn.store(0, std::memory_order_relaxed); sql_print_error("InnoDB: Cannot allocate memory;" " too large innodb_log_buffer_size?"); return false; @@ -394,8 +393,7 @@ TRASH_ALLOC(buf, buf_size); TRASH_ALLOC(flush_buf, buf_size); - max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; - writer_update(); + writer_update(false); memset_aligned<512>(checkpoint_buf, 0, write_size); func_exit: @@ -407,7 +405,7 @@ @param buf log header buffer @param lsn log sequence number corresponding to log_sys.START_OFFSET @param encrypted whether the log is encrypted */ -void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted) +void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted) noexcept { mach_write_to_4(my_assume_aligned<4>(buf) + LOG_HEADER_FORMAT, log_sys.FORMAT_10_8); @@ -436,8 +434,9 @@ ut_ad(is_latest()); ut_ad(this == &log_sys); - this->lsn.store(lsn, std::memory_order_relaxed); - this->flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn_offset= 0; + base_lsn.store(lsn, std::memory_order_relaxed); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); first_lsn= lsn; write_lsn= lsn; @@ -452,14 +451,13 @@ mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); memset_aligned<4096>(buf, 0, 4096); log_sys.header_write(buf, lsn, is_encrypted()); - set_buf_free(START_OFFSET); pmem_persist(buf, 512); + buf_size= unsigned(std::min(capacity(), buf_size_max)); } else #endif { ut_ad(!is_mmap()); - set_buf_free(0); memset_aligned<4096>(flush_buf, 0, buf_size); memset_aligned<4096>(buf, 0, buf_size); log_sys.header_write(buf, lsn, is_encrypted()); @@ -468,12 +466,12 @@ } } -ATTRIBUTE_COLD static void log_close_failed(dberr_t err) +ATTRIBUTE_COLD static void log_close_failed(dberr_t 
err) noexcept { ib::fatal() << "closing ib_logfile0 failed: " << err; } -void log_t::close_file(bool really_close) +void log_t::close_file(bool really_close) noexcept { if (is_mmap()) { @@ -508,16 +506,25 @@ log_close_failed(err); } +/** @return the current log sequence number (may be stale) */ +lsn_t log_get_lsn() noexcept +{ + log_sys.latch.wr_lock(SRW_LOCK_CALL); + lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); + return lsn; +} + /** Acquire all latches that protect the log. */ -static void log_resize_acquire() +static void log_resize_acquire() noexcept { #ifdef HAVE_PMEM if (!log_sys.is_mmap()) #endif { - while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (flush_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); - while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (write_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); } @@ -525,7 +532,7 @@ } /** Release the latches that protect the log. */ -void log_resize_release() +void log_resize_release() noexcept { log_sys.latch.wr_unlock(); @@ -542,7 +549,7 @@ #if defined __linux__ || defined _WIN32 /** Try to enable or disable file system caching (update log_buffered) */ -void log_t::set_buffered(bool buffered) +void log_t::set_buffered(bool buffered) noexcept { if (!log_maybe_unbuffered || #ifdef HAVE_PMEM @@ -570,31 +577,35 @@ /** Start resizing the log and release the exclusive latch. 
@param size requested new file_size +@param thd the current thread identifier @return whether the resizing was started successfully */ -log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept +log_t::resize_start_status log_t::resize_start(os_offset_t size, void *thd) + noexcept { ut_ad(size >= 4U << 20); ut_ad(!(size & 4095)); ut_ad(!srv_read_only_mode); + ut_ad(thd); log_resize_acquire(); - resize_start_status status= RESIZE_NO_CHANGE; - lsn_t start_lsn{0}; -#ifdef HAVE_PMEM - bool is_pmem{false}; -#endif + resize_start_status status; - if (resize_in_progress()) + if (size == file_size) + status= RESIZE_NO_CHANGE; + else if (resize_in_progress()) status= RESIZE_IN_PROGRESS; - else if (size != file_size) + else { + lsn_t start_lsn; ut_ad(!resize_in_progress()); ut_ad(!resize_log.is_opened()); ut_ad(!resize_buf); ut_ad(!resize_flush_buf); + ut_ad(!resize_initiator); std::string path{get_log_file_path("ib_logfile101")}; bool success; + resize_initiator= thd; resize_lsn.store(1, std::memory_order_relaxed); resize_target= 0; resize_log.m_file= @@ -612,6 +623,7 @@ #ifdef HAVE_PMEM else if (is_mmap()) { + bool is_pmem{false}; ptr= ::log_mmap(resize_log.m_file, is_pmem, size); if (ptr == MAP_FAILED) @@ -661,34 +673,33 @@ else if (!is_opened()) resize_log.close(); - writer_update(); + resize_lsn.store(start_lsn, std::memory_order_relaxed); + writer_update(true); + log_resize_release(); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t target_lsn= buf_pool.get_oldest_modification(0); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_ahead(start_lsn < target_lsn ? target_lsn + 1 : start_lsn, + false); + return RESIZE_STARTED; } - status= success ? 
RESIZE_STARTED : RESIZE_FAILED; } - resize_lsn.store(start_lsn, std::memory_order_relaxed); + resize_initiator= nullptr; + resize_lsn.store(0, std::memory_order_relaxed); + status= RESIZE_FAILED; } log_resize_release(); - - if (start_lsn) - { - mysql_mutex_lock(&buf_pool.flush_list_mutex); - lsn_t target_lsn= buf_pool.get_oldest_modification(0); - if (start_lsn < target_lsn) - start_lsn= target_lsn + 1; - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - buf_flush_ahead(start_lsn, false); - } - return status; } -/** Abort log resizing. */ -void log_t::resize_abort() noexcept +/** Abort a resize_start() that we started. */ +void log_t::resize_abort(void *thd) noexcept { log_resize_acquire(); - if (resize_in_progress() > 1) + if (resize_running(thd)) { #ifdef HAVE_PMEM const bool is_mmap{this->is_mmap()}; @@ -715,11 +726,12 @@ resize_buf= nullptr; resize_target= 0; resize_lsn.store(0, std::memory_order_relaxed); + resize_initiator= nullptr; std::string path{get_log_file_path("ib_logfile101")}; IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + writer_update(false); } - writer_update(); log_resize_release(); } @@ -882,9 +894,7 @@ ut_ad(!is_opened()); ut_ad(!write_lock.is_owner()); ut_ad(!flush_lock.is_owner()); -#ifdef LOG_LATCH_DEBUG - ut_ad(latch_have_any()); -#endif + ut_ad(latch_have_wr()); lsn_t old= flushed_to_disk_lsn.load(std::memory_order_relaxed); @@ -902,26 +912,26 @@ else pmem_persist(buf + start, end - start); - old= flushed_to_disk_lsn.load(std::memory_order_relaxed); - - if (old < lsn) - { - while (!flushed_to_disk_lsn.compare_exchange_weak - (old, lsn, std::memory_order_release, std::memory_order_relaxed)) - if (old >= lsn) - break; - - log_flush_notify(lsn); - DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); - } + uint64_t offset{write_lsn_offset}; + const lsn_t new_base_lsn= base_lsn.load(std::memory_order_relaxed) + + (offset & (WRITE_BACKOFF - 1)); + ut_ad(new_base_lsn >= lsn); + write_to_buf+= size_t(offset >> 
WRITE_TO_BUF_SHIFT); + /* This synchronizes with get_lsn_approx(); + we must store write_lsn_offset before base_lsn. */ + write_lsn_offset.store(0, std::memory_order_relaxed); + base_lsn.store(new_base_lsn, std::memory_order_release); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + log_flush_notify(lsn); + DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); } ATTRIBUTE_NOINLINE static void log_write_persist(lsn_t lsn) noexcept { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.persist(lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); } #endif @@ -972,7 +982,7 @@ ut_ad(resizing == RETAIN_LATCH || (resizing == RESIZING) == (resize_in_progress() > 1)); - const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; + const lsn_t lsn{get_lsn()}; if (write_lsn >= lsn) { @@ -988,7 +998,8 @@ ut_ad(write_lsn >= get_flushed_lsn()); const size_t write_size_1{write_size - 1}; ut_ad(ut_is_2pow(write_size)); - size_t length{buf_free.load(std::memory_order_relaxed)}; + lsn_t base= base_lsn.load(std::memory_order_relaxed); + size_t length{size_t(lsn - base)}; lsn_t offset{calc_lsn_offset(write_lsn)}; ut_ad(length >= (offset & write_size_1)); ut_ad(write_size_1 >= 511); @@ -1010,14 +1021,8 @@ { ut_ad(!((length ^ (size_t(lsn) - size_t(first_lsn))) & write_size_1)); /* Keep filling the same buffer until we have more than one block. */ -#if 0 /* TODO: Pad the last log block with dummy records. */ - buf_free= log_pad(lsn, (write_size_1 + 1) - length, - buf + length, flush_buf); - ... /* TODO: Update the LSN and adjust other code. 
*/ -#else MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - length); buf[length]= 0; /* ensure that recovery catches EOF */ -#endif if (UNIV_LIKELY_NULL(re_write_buf)) { MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - length); @@ -1028,8 +1033,13 @@ else { const size_t new_buf_free{length & write_size_1}; + base+= length & ~write_size_1; ut_ad(new_buf_free == ((lsn - first_lsn) & write_size_1)); - buf_free.store(new_buf_free, std::memory_order_relaxed); + write_to_buf+= size_t(write_lsn_offset >> WRITE_TO_BUF_SHIFT); + /* This synchronizes with get_lsn_approx(); + we must store write_lsn_offset before base_lsn. */ + write_lsn_offset.store(new_buf_free, std::memory_order_relaxed); + base_lsn.store(base, std::memory_order_release); if (new_buf_free) { @@ -1039,12 +1049,13 @@ the current LSN are generated. */ MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - new_buf_free); buf[length]= 0; /* allow recovery to catch EOF faster */ + if (UNIV_LIKELY_NULL(re_write_buf)) + MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - + new_buf_free); length&= ~write_size_1; memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15); if (UNIV_LIKELY_NULL(re_write_buf)) { - MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - - new_buf_free); memcpy_aligned<16>(resize_flush_buf, re_write_buf + length, (new_buf_free + 15) & ~15); re_write_buf[length + new_buf_free]= 0; @@ -1057,7 +1068,9 @@ std::swap(resize_buf, resize_flush_buf); } + ut_ad(base + (write_lsn_offset & (WRITE_TO_BUF - 1)) == lsn); write_to_log++; + if (resizing != RETAIN_LATCH) latch.wr_unlock(); @@ -1101,7 +1114,7 @@ @retval 0 if there are no pending callbacks on flush_lock or there is another group commit lead. 
*/ -static lsn_t log_flush(lsn_t lsn) +static lsn_t log_flush(lsn_t lsn) noexcept { ut_ad(!log_sys.is_mmap()); ut_a(log_sys.flush(lsn)); @@ -1120,7 +1133,7 @@ void log_write_up_to(lsn_t lsn, bool durable, const completion_callback *callback) noexcept { - ut_ad(!srv_read_only_mode || log_sys.buf_free_ok()); + ut_ad(!srv_read_only_mode); ut_ad(lsn != LSN_MAX); ut_ad(lsn != 0); ut_ad(!log_sys.is_mmap() || !callback || durable); @@ -1133,8 +1146,6 @@ return; } - ut_ad(lsn <= log_sys.get_lsn()); - #ifdef HAVE_PMEM if (log_sys.is_mmap()) { @@ -1151,10 +1162,10 @@ if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) return; /* Promise to other concurrent flush_lock.acquire() that we - will durable at least up to the current LSN. The LSN may still - advance until we acquire log_sys.latch below. */ - lsn= log_sys.get_lsn(); - flush_lock.set_pending(lsn); + will be durable at least up to the current LSN. The LSN may still + advance when we acquire log_sys.latch below. */ + if (lsn > log_sys.get_flushed_lsn()) + flush_lock.set_pending(lsn); } lsn_t pending_write_lsn= 0, pending_flush_lsn= 0; @@ -1190,42 +1201,50 @@ return log_sys.write_buf(); } -void log_t::writer_update() noexcept +void log_t::writer_update(bool resizing) noexcept { ut_ad(latch_have_wr()); - writer= resize_in_progress() ? log_writer_resizing : log_writer; + ut_ad(resizing == (resize_in_progress() > 1)); + writer= resizing ? log_writer_resizing : log_writer; mtr_t::finisher_update(); } /** Write to the log file up to the last log entry. @param durable whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool durable) +void log_buffer_flush_to_disk(bool durable) noexcept { - log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable); + log_write_up_to(log_get_lsn(), durable); } /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. 
*/ -ATTRIBUTE_COLD void log_write_and_flush_prepare() +ATTRIBUTE_COLD void log_write_and_flush_prepare() noexcept { #ifdef HAVE_PMEM if (log_sys.is_mmap()) return; #endif - while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (flush_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); - while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (write_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); } -void log_t::clear_mmap() +void log_t::clear_mmap() noexcept { - if (!is_mmap() || + if (!is_mmap() || high_level_read_only) + return; #ifdef HAVE_PMEM - !is_opened() || -#endif - high_level_read_only) + if (!is_opened()) + { + ut_d(latch.wr_lock(SRW_LOCK_CALL)); + ut_ad(!resize_in_progress()); + ut_ad(get_lsn() == get_flushed_lsn(std::memory_order_relaxed)); + ut_d(latch.wr_unlock()); return; + } +#endif + log_resize_acquire(); ut_ad(!resize_in_progress()); ut_ad(write_lsn == get_lsn()); @@ -1235,10 +1254,10 @@ { alignas(16) byte log_block[4096]; const size_t bs{write_size}; - const size_t bf{buf_free.load(std::memory_order_relaxed)}; { - byte *const b= buf; - memcpy_aligned<16>(log_block, b + (bf & ~(bs - 1)), bs); + const size_t bf= + size_t(write_lsn - base_lsn.load(std::memory_order_relaxed)); + memcpy_aligned<16>(log_block, buf + (bf & ~(bs - 1)), bs); } close_file(false); @@ -1246,14 +1265,13 @@ ut_a(attach(log, file_size)); ut_ad(!is_mmap()); - set_buf_free(bf & (bs - 1)); - memcpy_aligned<16>(log_sys.buf, log_block, bs); + memcpy_aligned<16>(buf, log_block, bs); } log_resize_release(); } /** Durably write the log up to log_sys.get_lsn(). */ -ATTRIBUTE_COLD void log_write_and_flush() +ATTRIBUTE_COLD void log_write_and_flush() noexcept { ut_ad(!srv_read_only_mode); #ifdef HAVE_PMEM @@ -1273,17 +1291,17 @@ that a new log entry can be catenated without an immediate need for a checkpoint. NOTE: this function may only be called if the calling thread owns no synchronization objects! 
*/ -ATTRIBUTE_COLD static void log_checkpoint_margin() +ATTRIBUTE_COLD static void log_checkpoint_margin() noexcept { while (log_sys.check_for_checkpoint()) { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); ut_ad(!recv_no_log_write); if (!log_sys.check_for_checkpoint()) { func_exit: - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); return; } @@ -1301,7 +1319,7 @@ } DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto skip_checkpoint;); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); /* We must wait to prevent the tail of the log overwriting the head. */ buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20))); @@ -1313,7 +1331,7 @@ /** Wait for a log checkpoint if needed. NOTE that this function may only be called while not holding any synchronization objects except dict_sys.latch. */ -void log_free_check() +void log_free_check() noexcept { ut_ad(!lock_sys.is_holder()); if (log_sys.check_for_checkpoint()) @@ -1323,10 +1341,14 @@ } } -extern void buf_resize_shutdown(); +#ifdef __linux__ +extern void buf_mem_pressure_shutdown() noexcept; +#else +inline void buf_mem_pressure_shutdown() noexcept {} +#endif /** Make a checkpoint at the latest lsn on shutdown. */ -ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() noexcept { lsn_t lsn; ulint count = 0; @@ -1341,8 +1363,7 @@ srv_master_timer.reset(); } - /* Wait for the end of the buffer resize task.*/ - buf_resize_shutdown(); + buf_mem_pressure_shutdown(); dict_stats_shutdown(); btr_defragment_shutdown(); @@ -1464,7 +1485,7 @@ ? 
SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT; - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); lsn = log_sys.get_lsn(); @@ -1472,7 +1493,7 @@ && lsn != log_sys.last_checkpoint_lsn + sizeof_cp; ut_ad(lsn >= log_sys.last_checkpoint_lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); if (lsn_changed) { goto loop; @@ -1490,7 +1511,7 @@ "Free innodb buffer pool"); ut_d(buf_pool.assert_all_freed()); - ut_a(lsn == log_sys.get_lsn() + ut_a(lsn == log_get_lsn() || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); if (UNIV_UNLIKELY(lsn < recv_sys.lsn)) { @@ -1504,7 +1525,7 @@ /* Make some checks that the server really is quiet */ ut_ad(!srv_any_background_activity()); - ut_a(lsn == log_sys.get_lsn() + ut_a(lsn == log_get_lsn() || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); } @@ -1515,44 +1536,42 @@ /*======*/ FILE* file) /*!< in: file where to print */ { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); const lsn_t lsn= log_sys.get_lsn(); mysql_mutex_lock(&buf_pool.flush_list_mutex); const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn); mysql_mutex_unlock(&buf_pool.flush_list_mutex); + const lsn_t flushed_lsn{log_sys.get_flushed_lsn()}; + const lsn_t checkpoint_lsn{log_sys.last_checkpoint_lsn}; + log_sys.latch.wr_unlock(); fprintf(file, "Log sequence number " LSN_PF "\n" "Log flushed up to " LSN_PF "\n" "Pages flushed up to " LSN_PF "\n" "Last checkpoint at " LSN_PF "\n", - lsn, - log_sys.get_flushed_lsn(), - pages_flushed, - lsn_t{log_sys.last_checkpoint_lsn}); - - log_sys.latch.rd_unlock(); + lsn, flushed_lsn, pages_flushed, checkpoint_lsn); } /** Shut down the redo log subsystem. 
*/ void log_t::close() { ut_ad(this == &log_sys); - ut_ad(!(buf_free & buf_free_LOCK)); if (!is_initialised()) return; close_file(); ut_ad(!checkpoint_buf); ut_ad(!buf); ut_ad(!flush_buf); + base_lsn.store(0, std::memory_order_relaxed); latch.destroy(); - lsn_lock.destroy(); +#ifdef HAVE_PMEM + resize_wrap_mutex.destroy(); +#endif recv_sys.close(); - - max_buf_free= 0; } std::string get_log_file_path(const char *filename) diff -Nru mariadb-10.11.11/storage/innobase/log/log0recv.cc mariadb-10.11.13/storage/innobase/log/log0recv.cc --- mariadb-10.11.11/storage/innobase/log/log0recv.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/log/log0recv.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1266,6 +1266,13 @@ } else if (p.second // the first FILE_MODIFY or FILE_RENAME || f.name != fname.name) { reload: + if (f.name.size() == 0) { + /* Augment the recv_spaces.emplace_hint() for the + FILE_MODIFY record that had been added by + recv_sys_t::parse() */ + f.name = fname.name; + } + fil_space_t* space; /* Check if the tablespace file exists and contains @@ -1466,6 +1473,7 @@ mysql_mutex_lock(&mutex); recovery_on= false; + recv_needed_recovery= false; pages.clear(); pages_it= pages.end(); @@ -1473,7 +1481,6 @@ log_sys.clear_mmap(); } - /** Free a redo log snippet. @param data buffer allocated in add() */ inline void recv_sys_t::free(const void *data) @@ -1481,34 +1488,18 @@ ut_ad(!ut_align_offset(data, ALIGNMENT)); mysql_mutex_assert_owner(&mutex); - /* MDEV-14481 FIXME: To prevent race condition with buf_pool.resize(), - we must acquire and hold the buffer pool mutex here. 
*/ - ut_ad(!buf_pool.resize_in_progress()); - - auto *chunk= buf_pool.chunks; - for (auto i= buf_pool.n_chunks; i--; chunk++) + buf_block_t *block= buf_pool.block_from(data); + ut_ad(block->page.frame == page_align(data)); + ut_ad(block->page.state() == buf_page_t::MEMORY); + ut_ad(uint16_t(block->page.free_offset - 1) < srv_page_size); + ut_ad(block->page.used_records); + if (!--block->page.used_records) { - if (data < chunk->blocks->page.frame) - continue; - const size_t offs= (reinterpret_cast(data) - - chunk->blocks->page.frame) >> srv_page_size_shift; - if (offs >= chunk->size) - continue; - buf_block_t *block= &chunk->blocks[offs]; - ut_ad(block->page.frame == page_align(data)); - ut_ad(block->page.state() == buf_page_t::MEMORY); - ut_ad(uint16_t(block->page.free_offset - 1) < srv_page_size); - ut_ad(block->page.used_records); - if (!--block->page.used_records) - { - block->page.hash= nullptr; - UT_LIST_REMOVE(blocks, block); - MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); - buf_block_free(block); - } - return; + block->page.hash= nullptr; + UT_LIST_REMOVE(blocks, block); + MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); + buf_block_free(block); } - ut_ad(0); } @@ -2057,12 +2048,13 @@ { mysql_mutex_unlock(&mutex); os_aio_wait_until_no_pending_reads(false); + os_aio_wait_until_no_pending_writes(false); mysql_mutex_lock(&mutex); garbage_collect(); mysql_mutex_lock(&buf_pool.mutex); - bool need_more= UT_LIST_GET_LEN(buf_pool.free) < pages; + const size_t available= UT_LIST_GET_LEN(buf_pool.free); mysql_mutex_unlock(&buf_pool.mutex); - if (need_more) + if (available < pages) buf_flush_sync_batch(lsn); } @@ -2507,9 +2499,11 @@ ut_ad(log_sys.is_latest()); alignas(8) byte iv[MY_AES_BLOCK_SIZE]; - byte *decrypt_buf= storing != BACKUP - ? static_cast(alloca(srv_page_size)) : nullptr; - + byte *decrypt_buf= + static_cast(alloca(storing == BACKUP + ? 
1/*type,length*/ + 5/*space_id*/ + + 5/*page_no*/ + 1/*rlen*/ + : srv_page_size)); const lsn_t start_lsn{lsn}; /* Check that the entire mini-transaction is included within the buffer */ @@ -2599,7 +2593,10 @@ ut_d(std::set modified); #endif - uint32_t space_id= 0, page_no= 0, last_offset= 0; + uint32_t space_id= 0, page_no= 0; + /* The end offset the last write (always 0 in storing==BACKUP). + The value 1 means that no "same page" record is allowed. */ + uint last_offset= 0; bool got_page_op= false; for (l= begin;; l+= rlen) @@ -2712,8 +2709,7 @@ { mach_write_to_4(iv + 8, space_id); mach_write_to_4(iv + 12, page_no); - byte eb[1/*type,length*/ + 5/*space_id*/ + 5/*page_no*/ + 1/*rlen*/]; - if (*l.copy_if_needed(iv, eb, recs, 1) == TRIM_PAGES) + if (*l.copy_if_needed(iv, decrypt_buf, recs, 1) == TRIM_PAGES) undo_space_trunc(space_id); } continue; @@ -2726,8 +2722,8 @@ if (i != recv_spaces.end() && i->first == space_id); else if (lsn < file_checkpoint) /* We have not seen all records between the checkpoint and - FILE_CHECKPOINT. There should be a FILE_DELETE for this - tablespace later. */ + FILE_CHECKPOINT. There should be a FILE_DELETE or FILE_MODIFY + for this tablespace later, to be handled in fil_name_process(). 
*/ recv_spaces.emplace_hint(i, space_id, file_name_t("", false)); else { @@ -2762,10 +2758,10 @@ case FREE_PAGE: ut_ad(freed.emplace(id).second); /* the next record must not be same_page */ - last_offset= 1; + if (storing != BACKUP) last_offset= 1; goto free_or_init_page; case INIT_PAGE: - last_offset= FIL_PAGE_TYPE; + if (storing != BACKUP) last_offset= FIL_PAGE_TYPE; free_or_init_page: if (UNIV_UNLIKELY(rlen != 0)) goto record_corrupted; @@ -2797,7 +2793,8 @@ erase(r); continue; } - cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + if (storing == YES) + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); break; case EXTENDED: if (storing == NO) @@ -2811,7 +2808,8 @@ continue; if (UNIV_UNLIKELY(!rlen)) goto record_corrupted; - cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + if (storing == YES || rlen == 1) + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); if (rlen == 1 && *cl == TRIM_PAGES) { if (!srv_is_undo_tablespace(space_id) || @@ -2825,7 +2823,7 @@ truncated_undo_spaces[space_id - srv_undo_space_id_start]= { start_lsn, page_no }; /* the next record must not be same_page */ - last_offset= 1; + if (storing != BACKUP) last_offset= 1; if (undo_space_trunc) undo_space_trunc(space_id); continue; @@ -2833,7 +2831,7 @@ /* This record applies to an undo log or index page, and it may be followed by subsequent WRITE or similar records for the same page in the same mini-transaction. */ - last_offset= FIL_PAGE_TYPE; + if (storing != BACKUP) last_offset= FIL_PAGE_TYPE; break; case OPTION: /* OPTION records can be safely ignored in recovery */ @@ -2850,6 +2848,8 @@ case WRITE: case MEMMOVE: case MEMSET: + if (storing == BACKUP) + continue; if (storing == NO && UNIV_LIKELY(page_no != 0)) /* fil_space_set_recv_size_and_flags() is mandatory for storing==NO. It is only applicable to page_no == 0. 
Other than that, we can just @@ -2979,7 +2979,7 @@ l - recs + rlen))) { lsn= start_lsn; - if (lsn > log_sys.get_lsn()) + if (lsn > log_sys.get_flushed_lsn(std::memory_order_relaxed)) log_sys.set_recovered_lsn(start_lsn); l+= rlen; offset= begin.ptr - log_sys.buf; @@ -3566,13 +3566,14 @@ } else { + const lsn_t end{std::max(recv_sys.scanned_lsn, recv_sys.file_checkpoint)}; sql_print_information("InnoDB: To recover: LSN " LSN_PF "/" LSN_PF "; %zu pages", - recv_sys.lsn, recv_sys.scanned_lsn, n); + recv_sys.lsn, end, n); service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: LSN " LSN_PF "/" LSN_PF "; %zu pages", - recv_sys.lsn, recv_sys.scanned_lsn, n); + recv_sys.lsn, end, n); } } @@ -4113,8 +4114,8 @@ {log_sys.buf + recv_sys.len, size})) { mysql_mutex_unlock(&recv_sys.mutex); - ib::error() << "Failed to read log at " << source_offset - << ": " << err; + sql_print_error("InnoDB: Failed to read log at %" PRIu64 ": %s", + source_offset, ut_strerr(err)); recv_sys.set_corrupt_log(); mysql_mutex_lock(&recv_sys.mutex); } @@ -4294,7 +4295,7 @@ break; case SRV_OPERATION_RESTORE: case SRV_OPERATION_RESTORE_EXPORT: - if (i->second.name.find("/#sql") != std::string::npos) { + if (i->second.name.find("/#sql") == std::string::npos) { sql_print_warning("InnoDB: Tablespace " UINT32PF " was not found at %.*s when" " restoring a (partial?) backup." 
@@ -4588,19 +4589,19 @@ inline void log_t::set_recovered() noexcept { ut_ad(get_flushed_lsn() == get_lsn()); - ut_ad(recv_sys.lsn == get_lsn()); - size_t offset{recv_sys.offset}; + ut_ad(recv_sys.lsn == get_flushed_lsn()); if (!is_mmap()) { const size_t bs{log_sys.write_size}, bs_1{bs - 1}; - memmove_aligned<512>(buf, buf + (offset & ~bs_1), bs); - offset&= bs_1; + memmove_aligned<512>(buf, buf + (recv_sys.offset & ~bs_1), bs); } -#ifndef _WIN32 +#ifdef HAVE_PMEM else + { + buf_size= unsigned(std::min(capacity(), buf_size_max)); mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); + } #endif - set_buf_free(offset); } inline bool recv_sys_t::validate_checkpoint() const noexcept @@ -4674,7 +4675,7 @@ goto err_exit; } ut_ad(recv_sys.file_checkpoint); - ut_ad(log_sys.get_lsn() >= recv_sys.scanned_lsn); + ut_ad(log_sys.get_flushed_lsn() >= recv_sys.scanned_lsn); if (rewind) { recv_sys.lsn = log_sys.next_checkpoint_lsn; recv_sys.offset = 0; @@ -4736,7 +4737,7 @@ tablespaces (not individual pages), while retaining the initial recv_sys.pages. */ mysql_mutex_lock(&recv_sys.mutex); - ut_ad(log_sys.get_lsn() >= recv_sys.lsn); + ut_ad(log_sys.get_flushed_lsn() >= recv_sys.lsn); recv_sys.clear(); recv_sys.lsn = log_sys.next_checkpoint_lsn; mysql_mutex_unlock(&recv_sys.mutex); @@ -4744,7 +4745,8 @@ if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { mysql_mutex_lock(&recv_sys.mutex); - deferred_spaces.deferred_dblwr(log_sys.get_lsn()); + deferred_spaces.deferred_dblwr( + log_sys.get_flushed_lsn()); buf_dblwr.recover(); mysql_mutex_unlock(&recv_sys.mutex); } @@ -4777,16 +4779,6 @@ if (!srv_read_only_mode && log_sys.is_latest()) { log_sys.set_recovered(); - if (recv_needed_recovery - && srv_operation <= SRV_OPERATION_EXPORT_RESTORED - && recv_sys.lsn - log_sys.next_checkpoint_lsn - < log_sys.log_capacity) { - /* Write a FILE_CHECKPOINT marker as the first thing, - before generating any other redo log. 
This ensures - that subsequent crash recovery will be possible even - if the server were killed soon after this. */ - fil_names_clear(log_sys.next_checkpoint_lsn); - } } DBUG_EXECUTE_IF("before_final_redo_apply", goto err_exit;); @@ -4892,28 +4884,43 @@ goto check_if_corrupted; } -byte *recv_dblwr_t::find_encrypted_page(const fil_node_t &node, - uint32_t page_no, - byte *buf) noexcept +ATTRIBUTE_COLD +byte *recv_dblwr_t::find_deferred_page(const fil_node_t &node, + uint32_t page_no, + byte *buf) noexcept { - ut_ad(node.space->crypt_data); ut_ad(node.space->full_crc32()); mysql_mutex_lock(&recv_sys.mutex); byte *result_page= nullptr; + bool is_encrypted= node.space->crypt_data && + node.space->crypt_data->is_encrypted(); for (list::iterator page_it= pages.begin(); page_it != pages.end(); page_it++) { if (page_get_page_no(*page_it) != page_no || buf_page_is_corrupted(true, *page_it, node.space->flags)) continue; + + if (is_encrypted && + !mach_read_from_4(*page_it + FIL_PAGE_FCRC32_KEY_VERSION)) + continue; + memcpy(buf, *page_it, node.space->physical_size()); buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve(false); ut_a(slot); slot->allocate(); - bool invalidate= - !fil_space_decrypt(node.space, slot->crypt_buf, buf) || - (node.space->is_compressed() && - !fil_page_decompress(slot->crypt_buf, buf, node.space->flags)); + + bool invalidate= false; + if (is_encrypted) + { + invalidate= !fil_space_decrypt(node.space, slot->crypt_buf, buf); + if (!invalidate && node.space->is_compressed()) + goto decompress; + } + else +decompress: + invalidate= !fil_page_decompress(slot->crypt_buf, buf, + node.space->flags); slot->release(); if (invalidate || diff -Nru mariadb-10.11.11/storage/innobase/mtr/mtr0mtr.cc mariadb-10.11.13/storage/innobase/mtr/mtr0mtr.cc --- mariadb-10.11.11/storage/innobase/mtr/mtr0mtr.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/mtr/mtr0mtr.cc 2025-05-19 16:14:25.000000000 +0000 @@ -44,7 +44,6 @@ #endif std::pair 
(*mtr_t::finisher)(mtr_t *, size_t); -unsigned mtr_t::spin_wait_delay; void mtr_t::finisher_update() { @@ -53,15 +52,12 @@ if (log_sys.is_mmap()) { commit_logger= mtr_t::commit_log; - finisher= spin_wait_delay - ? mtr_t::finish_writer : mtr_t::finish_writer; + finisher= mtr_t::finish_writer; return; } commit_logger= mtr_t::commit_log; #endif - finisher= - (spin_wait_delay - ? mtr_t::finish_writer : mtr_t::finish_writer); + finisher= mtr_t::finish_writer; } void mtr_memo_slot_t::release() const @@ -169,7 +165,7 @@ else flush_list_bytes+= block->physical_size(); - ut_ad(flush_list_bytes <= curr_pool_size); + ut_ad(flush_list_bytes <= size_in_bytes); if (prev) UT_LIST_INSERT_AFTER(flush_list, prev, &block->page); @@ -257,7 +253,7 @@ { if (block->page.oldest_modification() <= 1) { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); /* For unlogged mtrs (MTR_LOG_NO_REDO), we use the current system LSN. The mtr that generated the LSN is either already committed or in mtr_t::commit. Shared latch and relaxed atomics should be fine here as it is guaranteed @@ -269,7 +265,7 @@ mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_pool.insert_into_flush_list (buf_pool.prepare_insert_into_flush_list(lsn), block, lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); mysql_mutex_unlock(&buf_pool.flush_list_mutex); } } @@ -339,24 +335,11 @@ m_memo.clear(); } -inline lsn_t log_t::get_write_target() const -{ - ut_ad(latch_have_any()); - if (UNIV_LIKELY(buf_free_ok())) - return 0; - /* The LSN corresponding to the end of buf is - write_lsn - (first_lsn & 4095) + buf_free, - but we use simpler arithmetics to return a smaller write target in - order to minimize waiting in log_write_up_to(). */ - ut_ad(max_buf_free >= 4096 * 4); - return write_lsn + max_buf_free / 2; -} - template void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) + noexcept { size_t modified= 0; - const lsn_t write_lsn= mmap ? 
0 : log_sys.get_write_target(); if (mtr->m_made_dirty) { @@ -475,9 +458,6 @@ if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); - - if (!mmap && UNIV_UNLIKELY(write_lsn != 0)) - log_write_up_to(write_lsn, false); } /** Commit a mini-transaction. */ @@ -690,7 +670,7 @@ /* We will not encrypt any FILE_ records, but we will reserve a nonce at the end. */ size+= 8; - m_commit_lsn= log_sys.get_lsn(); + m_commit_lsn= log_sys.get_flushed_lsn(); } else m_commit_lsn= 0; @@ -775,7 +755,7 @@ /* We will not encrypt any FILE_ records, but we will reserve a nonce at the end. */ size+= 8; - m_commit_lsn= log_sys.get_lsn(); + m_commit_lsn= log_sys.get_flushed_lsn(); } else m_commit_lsn= 0; @@ -897,181 +877,109 @@ ? ". Shutdown is in progress" : ""); } -static ATTRIBUTE_NOINLINE void lsn_delay(size_t delay, size_t mult) noexcept +ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept { - delay*= mult * 2; // GCC 13.2.0 -O2 targeting AMD64 wants to unroll twice - HMT_low(); - do - MY_RELAX_CPU(); - while (--delay); - HMT_medium(); -} - -#if defined __clang_major__ && __clang_major__ < 10 -/* Only clang-10 introduced support for asm goto */ -#elif defined __APPLE__ -/* At least some versions of Apple Xcode do not support asm goto */ -#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) -# if SIZEOF_SIZE_T == 8 -# define LOCK_TSET \ - __asm__ goto("lock btsq $63, %0\n\t" "jnc %l1" \ - : : "m"(buf_free) : "cc", "memory" : got) -# else -# define LOCK_TSET \ - __asm__ goto("lock btsl $31, %0\n\t" "jnc %l1" \ - : : "m"(buf_free) : "cc", "memory" : got) -# endif -#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) -# if SIZEOF_SIZE_T == 8 -# define LOCK_TSET \ - if (!_interlockedbittestandset64 \ - (reinterpret_cast(&buf_free), 63)) return -# else -# define LOCK_TSET \ - if (!_interlockedbittestandset \ - (reinterpret_cast(&buf_free), 31)) return -# endif -#endif - -#ifdef 
LOCK_TSET -ATTRIBUTE_NOINLINE -void log_t::lsn_lock_bts() noexcept -{ - LOCK_TSET; - { - const size_t m= mtr_t::spin_wait_delay; - constexpr size_t DELAY= 10, MAX_ITERATIONS= 10; - for (size_t delay_count= DELAY, delay_iterations= 1;; - lsn_delay(delay_iterations, m)) + if (UNIV_LIKELY(!ex)) + { + latch.rd_unlock(); + if (!late) { - if (!(buf_free.load(std::memory_order_relaxed) & buf_free_LOCK)) - LOCK_TSET; - if (!delay_count); - else if (delay_iterations < MAX_ITERATIONS) - delay_count= DELAY, delay_iterations++; - else - delay_count--; + /* Wait for all threads to back off. */ + latch.wr_lock(SRW_LOCK_CALL); + goto got_ex; } - } -# ifdef __GNUC__ - got: - return; -# endif -} + const auto delay= my_cpu_relax_multiplier / 4 * srv_spin_wait_delay; + const auto rounds= srv_n_spin_wait_rounds; -inline -#else -ATTRIBUTE_NOINLINE -#endif -size_t log_t::lock_lsn() noexcept -{ -#ifdef LOCK_TSET - lsn_lock_bts(); - return ~buf_free_LOCK & buf_free.load(std::memory_order_relaxed); -# undef LOCK_TSET -#else - size_t b= buf_free.fetch_or(buf_free_LOCK, std::memory_order_acquire); - if (b & buf_free_LOCK) - { - const size_t m= mtr_t::spin_wait_delay; - constexpr size_t DELAY= 10, MAX_ITERATIONS= 10; - for (size_t delay_count= DELAY, delay_iterations= 1; - ((b= buf_free.load(std::memory_order_relaxed)) & buf_free_LOCK) || - (buf_free_LOCK & (b= buf_free.fetch_or(buf_free_LOCK, - std::memory_order_acquire))); - lsn_delay(delay_iterations, m)) - if (!delay_count); - else if (delay_iterations < MAX_ITERATIONS) - delay_count= DELAY, delay_iterations++; - else - delay_count--; + for (;;) + { + HMT_low(); + for (auto r= rounds + 1; r--; ) + { + if (write_lsn_offset.load(std::memory_order_relaxed) & WRITE_BACKOFF) + { + for (auto d= delay; d--; ) + MY_RELAX_CPU(); + } + else + { + HMT_medium(); + goto done; + } + } + HMT_medium(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } } - return b; -#endif -} - -template -ATTRIBUTE_COLD size_t 
log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn) - noexcept -{ - waits++; - ut_ad(buf_free.load(std::memory_order_relaxed) == - (spin ? (b | buf_free_LOCK) : b)); - if (spin) - buf_free.store(b, std::memory_order_release); else - lsn_lock.wr_unlock(); - - if (ex) + { + got_ex: + const uint64_t l= write_lsn_offset.load(std::memory_order_relaxed); + const lsn_t lsn= base_lsn.load(std::memory_order_relaxed) + + (l & (WRITE_BACKOFF - 1)); + waits++; +#ifdef HAVE_PMEM + const bool is_pmem{is_mmap()}; + if (is_pmem) + { + ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity()); + persist(lsn); + } +#endif latch.wr_unlock(); - else - latch.rd_unlock(); - - log_write_up_to(lsn, is_mmap()); - - if (ex) - latch.wr_lock(SRW_LOCK_CALL); - else - latch.rd_lock(SRW_LOCK_CALL); - - if (spin) - return lock_lsn(); + /* write_buf() or persist() will clear the WRITE_BACKOFF flag, + which our caller will recheck. */ +#ifdef HAVE_PMEM + if (!is_pmem) +#endif + log_write_up_to(lsn, false); + if (ex) + { + latch.wr_lock(SRW_LOCK_CALL); + return; + } + } - lsn_lock.wr_lock(); - return buf_free.load(std::memory_order_relaxed); +done: + latch.rd_lock(SRW_LOCK_CALL); } /** Reserve space in the log buffer for appending data. -@tparam spin whether to use the spin-only lock_lsn() @tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ -template +template inline std::pair log_t::append_prepare(size_t size, bool ex) noexcept { ut_ad(ex ? latch_have_wr() : latch_have_rd()); ut_ad(mmap == is_mmap()); - if (!spin) - lsn_lock.wr_lock(); - size_t b{spin ? lock_lsn() : buf_free.load(std::memory_order_relaxed)}; - write_to_buf++; - - lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size}; - - if (UNIV_UNLIKELY(mmap - ? 
(end_lsn - - get_flushed_lsn(std::memory_order_relaxed)) > capacity() - : b + size >= buf_size)) - { - b= append_prepare_wait(b, ex, l); - /* While flushing log, we had released the lsn lock and LSN could have - progressed in the meantime. */ - l= lsn.load(std::memory_order_relaxed); - end_lsn= l + size; - } - - size_t new_buf_free= b + size; - if (mmap && new_buf_free >= file_size) - new_buf_free-= size_t(capacity()); + ut_ad(!mmap || buf_size == std::min(capacity(), buf_size_max)); + const size_t buf_size{this->buf_size - size}; + uint64_t l; + static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, ""); + while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) & + (WRITE_TO_BUF - 1)) >= buf_size)) + { + /* The following is inlined here instead of being part of + append_prepare_wait(), in order to increase the locality of reference + and to set the WRITE_BACKOFF flag as soon as possible. */ + bool late(write_lsn_offset.fetch_or(WRITE_BACKOFF) & WRITE_BACKOFF); + /* Subtract our LSN overshoot. */ + write_lsn_offset.fetch_sub(size); + append_prepare_wait(late, ex); + } - lsn.store(end_lsn, std::memory_order_relaxed); + const lsn_t lsn{l + base_lsn.load(std::memory_order_relaxed)}, + end_lsn{lsn + size}; if (UNIV_UNLIKELY(end_lsn >= last_checkpoint_lsn + log_capacity)) set_check_for_checkpoint(true); - byte *our_buf= buf; - if (spin) - buf_free.store(new_buf_free, std::memory_order_release); - else - { - buf_free.store(new_buf_free, std::memory_order_relaxed); - lsn_lock.wr_unlock(); - } - - return {l, our_buf + b}; + return {lsn, + buf + size_t(mmap ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)}; } /** Finish appending data to the log. 
@@ -1216,7 +1124,7 @@ if (!resize_flush_buf) { ut_ad(is_mmap()); - lsn_lock.wr_lock(); + resize_wrap_mutex.wr_lock(); const size_t resize_capacity{resize_target - START_OFFSET}; { const lsn_t resizing{resize_in_progress()}; @@ -1227,7 +1135,7 @@ if (UNIV_UNLIKELY(lsn < resizing)) { /* This function may execute in multiple concurrent threads - that hold a shared log_sys.latch. Before we got lsn_lock, + that hold a shared log_sys.latch. Before we got resize_wrap_mutex, another thread could have executed resize_lsn.store(lsn) below with a larger lsn than ours. @@ -1277,7 +1185,7 @@ ut_ad(resize_buf[s] <= 1); resize_buf[s]= 1; mmap_done: - lsn_lock.wr_unlock(); + resize_wrap_mutex.wr_unlock(); } else #endif @@ -1304,7 +1212,7 @@ d+= size; } -template +template std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len) { @@ -1315,7 +1223,7 @@ const size_t size{mtr->m_commit_lsn ? 5U + 8U : 5U}; std::pair start= - log_sys.append_prepare(len, mtr->m_latch_ex); + log_sys.append_prepare(len, mtr->m_latch_ex); if (!mmap) { diff -Nru mariadb-10.11.11/storage/innobase/os/os0file.cc mariadb-10.11.13/storage/innobase/os/os0file.cc --- mariadb-10.11.11/storage/innobase/os/os0file.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/os/os0file.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2314,8 +2314,20 @@ ut_ad(exists); #endif /* UNIV_DEBUG */ - if (MoveFileEx(oldpath, newpath, MOVEFILE_REPLACE_EXISTING)) { - return(true); + for (int retry= 50;; retry--){ + if (MoveFileEx(oldpath, newpath, MOVEFILE_REPLACE_EXISTING)) + return true; + + if (!retry) + break; + + if (GetLastError() != ERROR_SHARING_VIOLATION) + break; + + // oldpath was opened by someone else (antivirus?) + //without FILE_SHARE_DELETE flag. 
Retry operation + + Sleep(10); } os_file_handle_rename_error(oldpath, newpath); @@ -3357,6 +3369,12 @@ return pending; } +/** @return approximate number of pending writes */ +size_t os_aio_pending_writes_approx() noexcept +{ + return write_slots->pending_io_count(); +} + /** Wait until all pending asynchronous reads have completed. @param declare whether the wait will be declared in tpool */ void os_aio_wait_until_no_pending_reads(bool declare) noexcept diff -Nru mariadb-10.11.11/storage/innobase/pars/pars0pars.cc mariadb-10.11.13/storage/innobase/pars/pars0pars.cc --- mariadb-10.11.11/storage/innobase/pars/pars0pars.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/pars/pars0pars.cc 2025-05-19 16:14:25.000000000 +0000 @@ -783,11 +783,6 @@ { ulint count = 0; - if (sym_node == NULL) { - - return(count); - } - while (sym_node) { pars_retrieve_table_def(sym_node); diff -Nru mariadb-10.11.11/storage/innobase/row/row0ins.cc mariadb-10.11.13/storage/innobase/row/row0ins.cc --- mariadb-10.11.11/storage/innobase/row/row0ins.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0ins.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1955,7 +1955,7 @@ TRUE, foreign, table, ref_tuple, thr); if (ref_table) { - dict_table_close(ref_table); + ref_table->release(); } } } @@ -2580,12 +2580,44 @@ } } -#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ -/* Avoid GCC 4.8.5 internal compiler error due to srw_mutex::wr_unlock(). -We would only need this for row_ins_clust_index_entry_low(), -but GCC 4.8.5 does not support pop_options. */ -# pragma GCC optimize ("O0") -#endif +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. 
If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param data data to read +@param len length of data +@param mtype main type of the column +@param prtype precise type of the column +@return the integer value from the data +@retval 0 if the value is negative or the type or length invalid */ +static uint64_t row_parse_int(const byte *data, size_t len, + ulint mtype, ulint prtype) noexcept +{ + switch (mtype) { + case DATA_FLOAT: + if (len != sizeof(float)) + return 0; + { + float f= mach_float_read(data); + return f <= 0.0 ? 0 : uint64_t(f); + } + case DATA_DOUBLE: + if (len != sizeof(double)) + return 0; + { + double d= mach_double_read(data); + return d <= 0.0 ? 0 : uint64_t(d); + } + case DATA_INT: + if (len == 0 || len > 8) + return 0; + const ibool unsigned_type{prtype & DATA_UNSIGNED}; + uint64_t value= mach_read_int_type(data, len, unsigned_type); + return !unsigned_type && int64_t(value) < 0 ? 0 : value; + } + + ut_ad("invalid type" == 0); + return 0; +} /***************************************************************//** Tries to insert an entry into a clustered index, ignoring foreign key @@ -2672,8 +2704,7 @@ dfield->data), dfield->len, dfield->type.mtype, - dfield->type.prtype - & DATA_UNSIGNED); + dfield->type.prtype); if (auto_inc && mode != BTR_MODIFY_TREE) { mode = btr_latch_mode( @@ -2722,6 +2753,12 @@ DBUG_EXECUTE_IF("row_ins_row_level", goto row_level_insert;); +#ifdef WITH_WSREP + /* Appliers never execute bulk insert statements directly. 
*/ + if (trx->is_wsrep() && !wsrep_thd_is_local_transaction(trx->mysql_thd)) + goto row_level_insert; +#endif /* WITH_WSREP */ + if (!(flags & BTR_NO_UNDO_LOG_FLAG) && page_is_empty(block->page.frame) && !entry->is_metadata() && !trx->duplicates @@ -2738,28 +2775,24 @@ && !index->table->has_spatial_index()) { ut_ad(!index->table->skip_alter_undo); - trx->bulk_insert = true; + trx->bulk_insert = TRX_DML_BULK; err = lock_table(index->table, NULL, LOCK_X, thr); if (err != DB_SUCCESS) { trx->error_state = err; - trx->bulk_insert = false; + trx->bulk_insert = TRX_NO_BULK; goto err_exit; } if (index->table->n_rec_locks) { avoid_bulk: - trx->bulk_insert = false; + trx->bulk_insert = TRX_NO_BULK; goto row_level_insert; } #ifdef WITH_WSREP - if (trx->is_wsrep()) + if (trx->is_wsrep() && + wsrep_append_table_key(trx->mysql_thd, *index->table)) { - if (!wsrep_thd_is_local_transaction(trx->mysql_thd)) - goto row_level_insert; - if (wsrep_append_table_key(trx->mysql_thd, *index->table)) - { - trx->error_state = DB_ROLLBACK; - goto err_exit; - } + trx->error_state = DB_ROLLBACK; + goto err_exit; } #endif /* WITH_WSREP */ @@ -2811,7 +2844,7 @@ bulk buffer and doesn't check for constraint validity of foreign key relationship. 
*/ trx_start_if_not_started(trx, true); - trx->bulk_insert = true; + trx->bulk_insert = TRX_DDL_BULK; auto m = trx->mod_tables.emplace(index->table, 0); m.first->second.start_bulk_insert(index->table, true); err = m.first->second.bulk_insert_buffered( @@ -3891,3 +3924,79 @@ return(thr); } + +/** Read the AUTOINC column from an index record +@param index index of the record +@param rec the record +@return value read from the first column +@retval 0 if the value would be NULL or negative */ +static uint64_t row_read_autoinc(const dict_index_t &index, const rec_t *rec) + noexcept +{ + const dict_field_t &field= index.fields[0]; + ut_ad(!DATA_BIG_COL(field.col)); + ut_ad(!(rec_get_info_bits(rec, index.table->not_redundant()) & + (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))); + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_HEADER_SIZE + 2]; + rec_offs_init(offsets_); + rec_offs *offsets= rec_get_offsets(rec, &index, offsets_, + index.n_core_fields, 1, &heap); + ut_ad(!heap); + + size_t len; + ut_d(size_t first_offset=) rec_get_nth_field_offs(offsets, 0, &len); + ut_ad(!first_offset); + return row_parse_int(rec, len, field.col->mtype, field.col->prtype); +} + +/** Get the maximum and non-delete-marked record in an index. 
+@param index index B-tree +@param mtr mini-transaction (may be committed and restarted) +@return maximum record, page s-latched in mtr +@retval nullptr if there are no records, or if all of them are delete-marked */ +static +const rec_t *row_search_get_max_rec(dict_index_t *index, mtr_t *mtr) noexcept +{ + btr_pcur_t pcur; + const bool desc= index->fields[0].descending; + + /* Open at the high/right end (false), and init cursor */ + if (pcur.open_leaf(desc, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) + return nullptr; + + if (desc) + { + const bool comp= index->table->not_redundant(); + while (btr_pcur_move_to_next_user_rec(&pcur, mtr)) + { + const rec_t *rec= btr_pcur_get_rec(&pcur); + if (!rec_is_metadata(rec, comp) && !rec_get_deleted_flag(rec, comp)) + return rec; + } + return nullptr; + } + + do + { + const page_t *page= btr_pcur_get_page(&pcur); + const rec_t *rec= page_find_rec_last_not_deleted(page); + if (page_rec_is_user_rec_low(rec - page)) + return rec; + btr_pcur_move_before_first_on_page(&pcur); + } + while (btr_pcur_move_to_prev(&pcur, mtr)); + + return nullptr; +} + +uint64_t row_search_max_autoinc(dict_index_t *index) noexcept +{ + uint64_t value= 0; + mtr_t mtr; + mtr.start(); + if (const rec_t *rec= row_search_get_max_rec(index, &mtr)) + value= row_read_autoinc(*index, rec); + mtr.commit(); + return value; +} diff -Nru mariadb-10.11.11/storage/innobase/row/row0log.cc mariadb-10.11.13/storage/innobase/row/row0log.cc --- mariadb-10.11.11/storage/innobase/row/row0log.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0log.cc 2025-05-19 16:14:25.000000000 +0000 @@ -4065,21 +4065,20 @@ if (!(this->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { for (ulint i = 0; i < dict_table_get_n_v_cols(table); i++) - dfield_get_type( - dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING; + dfield_get_type(dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING; } + if (table->n_v_cols) + row_upd_replace_vcol(row, table, update, false, 
nullptr, + (cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? nullptr : undo_rec); + if (is_update) { old_row= dtuple_copy(row, heap); row_upd_replace(old_row, &old_ext, clust_index, update, heap); } - if (table->n_v_cols) - row_upd_replace_vcol(row, table, update, false, nullptr, - (cmpl_info & UPD_NODE_NO_ORD_CHANGE) - ? nullptr : undo_rec); - bool success= true; dict_index_t *index= dict_table_get_next_index(clust_index); while (index) diff -Nru mariadb-10.11.11/storage/innobase/row/row0mysql.cc mariadb-10.11.13/storage/innobase/row/row0mysql.cc --- mariadb-10.11.11/storage/innobase/row/row0mysql.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0mysql.cc 2025-05-19 16:14:25.000000000 +0000 @@ -69,7 +69,7 @@ /** Delay an INSERT, DELETE or UPDATE operation if the purge is lagging. */ -static void row_mysql_delay_if_needed() +static void row_mysql_delay_if_needed() noexcept { const auto delay= srv_dml_needed_delay; if (UNIV_UNLIKELY(delay != 0)) @@ -78,8 +78,8 @@ log_sys.latch.rd_lock(SRW_LOCK_CALL); const lsn_t last= log_sys.last_checkpoint_lsn, max_age= log_sys.max_checkpoint_age; + const lsn_t lsn= log_sys.get_flushed_lsn(); log_sys.latch.rd_unlock(); - const lsn_t lsn= log_sys.get_lsn(); if ((lsn - last) / 4 >= max_age / 5) buf_flush_ahead(last + max_age / 5, false); purge_sys.wake_if_not_active(); @@ -687,8 +687,12 @@ /* MariaDB will roll back the latest SQL statement */ break; } - /* MariaDB will roll back the entire transaction. */ - trx->bulk_insert = false; + /* For DML, InnoDB does partial rollback and clear + bulk buffer in row_mysql_handle_errors(). 
+ For ALTER TABLE ALGORITHM=COPY & CREATE TABLE...SELECT, + the bulk insert transaction will be rolled back inside + ha_innobase::extra(HA_EXTRA_ABORT_ALTER_COPY) */ + trx->bulk_insert &= TRX_DDL_BULK; trx->last_stmt_start = 0; break; case DB_LOCK_WAIT: @@ -981,7 +985,7 @@ rtr_clean_rtr_info(prebuilt->rtr_info, true); } if (prebuilt->table) { - dict_table_close(prebuilt->table); + prebuilt->table->release(); } mem_heap_free(prebuilt->heap); @@ -1599,7 +1603,7 @@ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); - ut_ad(table->stat_initialized); + ut_ad(table->stat_initialized()); if (!table->is_readable()) { return row_mysql_get_table_error(trx, table); @@ -2159,11 +2163,9 @@ index = node->index; - ut_ad(!index == (err != DB_SUCCESS)); - que_graph_free((que_t*) que_node_get_parent(thr)); - if (index && (index->type & DICT_FTS)) { + if (err == DB_SUCCESS && (index->type & DICT_FTS)) { err = fts_create_index_tables(trx, index, table->id); } diff -Nru mariadb-10.11.11/storage/innobase/row/row0purge.cc mariadb-10.11.13/storage/innobase/row/row0purge.cc --- mariadb-10.11.11/storage/innobase/row/row0purge.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0purge.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1564,7 +1564,7 @@ case TRX_UNDO_DEL_MARK_REC: purged = row_purge_del_mark(node); if (purged) { - if (node->table->stat_initialized + if (node->table->stat_initialized() && srv_stats_include_delete_marked) { dict_stats_update_if_needed( node->table, *thr->graph->trx); diff -Nru mariadb-10.11.11/storage/innobase/row/row0sel.cc mariadb-10.11.13/storage/innobase/row/row0sel.cc --- mariadb-10.11.11/storage/innobase/row/row0sel.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0sel.cc 2025-05-19 16:14:25.000000000 +0000 @@ -6852,123 +6852,3 @@ goto rec_loop; } - 
-/*******************************************************************//** -Read the AUTOINC column from the current row. If the value is less than -0 and the type is not unsigned then we reset the value to 0. -@return value read from the column */ -static -ib_uint64_t -row_search_autoinc_read_column( -/*===========================*/ - dict_index_t* index, /*!< in: index to read from */ - const rec_t* rec, /*!< in: current rec */ - ulint col_no, /*!< in: column number */ - ulint mtype, /*!< in: column main type */ - ibool unsigned_type) /*!< in: signed or unsigned flag */ -{ - ulint len; - const byte* data; - ib_uint64_t value; - mem_heap_t* heap = NULL; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - - rec_offs_init(offsets_); - ut_ad(page_rec_is_leaf(rec)); - - offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, - col_no + 1, &heap); - - if (rec_offs_nth_sql_null(offsets, col_no)) { - /* There is no non-NULL value in the auto-increment column. */ - value = 0; - goto func_exit; - } - - data = rec_get_nth_field(rec, offsets, col_no, &len); - - value = row_parse_int(data, len, mtype, unsigned_type); - -func_exit: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - - return(value); -} - -/** Get the maximum and non-delete-marked record in an index. 
-@param[in] index index tree -@param[in,out] mtr mini-transaction (may be committed and restarted) -@return maximum record, page s-latched in mtr -@retval NULL if there are no records, or if all of them are delete-marked */ -static -const rec_t* -row_search_get_max_rec( - dict_index_t* index, - mtr_t* mtr) -{ - btr_pcur_t pcur; - const rec_t* rec; - const bool desc = index->fields[0].descending; - - if (pcur.open_leaf(desc, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) { - return nullptr; - } - - if (desc) { - const bool comp = index->table->not_redundant(); - while (btr_pcur_move_to_next_user_rec(&pcur, mtr)) { - rec = btr_pcur_get_rec(&pcur); - if (rec_is_metadata(rec, *index)) { - continue; - } - if (!rec_get_deleted_flag(rec, comp)) { - goto found; - } - } - } else { - do { - rec = page_find_rec_last_not_deleted( - btr_pcur_get_page(&pcur)); - if (page_rec_is_user_rec(rec)) { - goto found; - } - btr_pcur_move_before_first_on_page(&pcur); - } while (btr_pcur_move_to_prev(&pcur, mtr)); - } - - rec = nullptr; - -found: - ut_ad(!rec - || !(rec_get_info_bits(rec, dict_table_is_comp(index->table)) - & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))); - return(rec); -} - -/** Read the max AUTOINC value from an index. 
-@param[in] index index starting with an AUTO_INCREMENT column -@return the largest AUTO_INCREMENT value -@retval 0 if no records were found */ -ib_uint64_t -row_search_max_autoinc(dict_index_t* index) -{ - const dict_field_t* dfield = dict_index_get_nth_field(index, 0); - - ib_uint64_t value = 0; - - mtr_t mtr; - mtr.start(); - - if (const rec_t* rec = row_search_get_max_rec(index, &mtr)) { - value = row_search_autoinc_read_column( - index, rec, 0, - dfield->col->mtype, - dfield->col->prtype & DATA_UNSIGNED); - } - - mtr.commit(); - return(value); -} diff -Nru mariadb-10.11.11/storage/innobase/row/row0uins.cc mariadb-10.11.13/storage/innobase/row/row0uins.cc --- mariadb-10.11.11/storage/innobase/row/row0uins.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0uins.cc 2025-05-19 16:14:25.000000000 +0000 @@ -244,8 +244,7 @@ btr_pcur_commit_specify_mtr(&node->pcur, &mtr); if (UNIV_LIKELY_NULL(table)) { - dict_table_close(table, dict_locked, - node->trx->mysql_thd, mdl_ticket); + dict_table_close(table, node->trx->mysql_thd, mdl_ticket); } return(err); @@ -452,7 +451,7 @@ would probably be better to just drop all temporary tables (and temporary undo log records) of the current connection, instead of doing this rollback. 
*/ - dict_table_close(node->table, dict_locked); + node->table->release(); node->table = NULL; return false; } else { @@ -614,7 +613,7 @@ err = row_undo_ins_remove_clust_rec(node); } - if (err == DB_SUCCESS && node->table->stat_initialized) { + if (err == DB_SUCCESS && node->table->stat_initialized()) { /* Not protected by dict_sys.latch or table->stats_mutex_lock() for performance reasons, we would rather get garbage @@ -644,8 +643,7 @@ break; } - dict_table_close(node->table, dict_locked); - + node->table->release(); node->table = NULL; return(err); diff -Nru mariadb-10.11.11/storage/innobase/row/row0umod.cc mariadb-10.11.13/storage/innobase/row/row0umod.cc --- mariadb-10.11.11/storage/innobase/row/row0umod.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0umod.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1259,7 +1259,7 @@ would probably be better to just drop all temporary tables (and temporary undo log records) of the current connection, instead of doing this rollback. 
*/ - dict_table_close(node->table, dict_locked); + node->table->release(); node->table = NULL; return false; } @@ -1388,7 +1388,7 @@ bool update_statistics = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); - if (err == DB_SUCCESS && node->table->stat_initialized) { + if (err == DB_SUCCESS && node->table->stat_initialized()) { switch (node->rec_type) { case TRX_UNDO_UPD_EXIST_REC: break; @@ -1418,8 +1418,7 @@ } } - dict_table_close(node->table, dict_locked); - + node->table->release(); node->table = NULL; return(err); diff -Nru mariadb-10.11.11/storage/innobase/row/row0upd.cc mariadb-10.11.13/storage/innobase/row/row0upd.cc --- mariadb-10.11.11/storage/innobase/row/row0upd.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0upd.cc 2025-05-19 16:14:25.000000000 +0000 @@ -253,7 +253,7 @@ FALSE, foreign, table, entry, thr); if (ref_table) { - dict_table_close(ref_table); + ref_table->release(); } if (err != DB_SUCCESS) { @@ -338,7 +338,7 @@ TRUE, foreign, table, entry, thr); if (opened) { - dict_table_close(opened); + opened->release(); } if (err != DB_SUCCESS) { diff -Nru mariadb-10.11.11/storage/innobase/srv/srv0mon.cc mariadb-10.11.13/storage/innobase/srv/srv0mon.cc --- mariadb-10.11.11/storage/innobase/srv/srv0mon.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/srv/srv0mon.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1366,12 +1366,13 @@ /* innodb_buffer_pool_pages_total */ case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL: - value = buf_pool.get_n_pages(); + case MONITOR_OVLD_BUFFER_POOL_SIZE: + value = buf_pool.curr_size(); break; /* innodb_buffer_pool_pages_misc */ case MONITOR_OVLD_BUF_POOL_PAGE_MISC: - value = buf_pool.get_n_pages() + value = buf_pool.curr_size() - UT_LIST_GET_LEN(buf_pool.LRU) - UT_LIST_GET_LEN(buf_pool.free); break; @@ -1453,7 +1454,7 @@ /* innodb_os_log_written */ case MONITOR_OVLD_OS_LOG_WRITTEN: - value = log_sys.get_lsn() - recv_sys.lsn; + value = log_get_lsn() - recv_sys.lsn; break; /* 
innodb_log_waits */ @@ -1490,10 +1491,6 @@ value = srv_page_size; break; - case MONITOR_OVLD_BUFFER_POOL_SIZE: - value = srv_buf_pool_size; - break; - /* innodb_row_lock_current_waits */ case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT: // dirty read without lock_sys.wait_mutex @@ -1590,7 +1587,7 @@ break; case MONITOR_OVLD_LSN_CURRENT: - value = log_sys.get_lsn(); + value = log_get_lsn(); break; case MONITOR_OVLD_CHECKPOINTS: @@ -1598,10 +1595,10 @@ break; case MONITOR_LSN_CHECKPOINT_AGE: - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); value = static_cast(log_sys.get_lsn() - log_sys.last_checkpoint_lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); break; case MONITOR_OVLD_BUF_OLDEST_LSN: diff -Nru mariadb-10.11.11/storage/innobase/srv/srv0srv.cc mariadb-10.11.13/storage/innobase/srv/srv0srv.cc --- mariadb-10.11.11/storage/innobase/srv/srv0srv.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/srv/srv0srv.cc 2025-05-19 16:14:25.000000000 +0000 @@ -178,16 +178,6 @@ with mysql_mutex_lock(), which will wait until it gets the mutex. */ #define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT) -/** copy of innodb_buffer_pool_size */ -ulint srv_buf_pool_size; -/** Requested buffer pool chunk size */ -size_t srv_buf_pool_chunk_unit; -/** Previously requested size */ -ulint srv_buf_pool_old_size; -/** Current size as scaling factor for the other components */ -ulint srv_buf_pool_base_size; -/** Current size in bytes */ -ulint srv_buf_pool_curr_size; /** Dump this % of each buffer pool during BP dump */ ulong srv_buf_pool_dump_pct; /** Abort load after this amount of pages */ @@ -291,13 +281,13 @@ in the innodb database. 
* quick transient stats, that are used if persistent stats for the given table/index are not found in the innodb database */ -unsigned long long srv_stats_transient_sample_pages; +uint32_t srv_stats_transient_sample_pages; /** innodb_stats_persistent */ my_bool srv_stats_persistent; /** innodb_stats_include_delete_marked */ my_bool srv_stats_include_delete_marked; /** innodb_stats_persistent_sample_pages */ -unsigned long long srv_stats_persistent_sample_pages; +uint32_t srv_stats_persistent_sample_pages; /** innodb_stats_auto_recalc */ my_bool srv_stats_auto_recalc; @@ -901,6 +891,7 @@ export_vars.innodb_buffer_pool_read_requests = buf_pool.stat.n_page_gets; + mysql_mutex_lock(&buf_pool.mutex); export_vars.innodb_buffer_pool_bytes_data = buf_pool.stat.LRU_bytes + (UT_LIST_GET_LEN(buf_pool.unzip_LRU) @@ -910,12 +901,21 @@ export_vars.innodb_buffer_pool_pages_latched = buf_get_latched_pages_number(); #endif /* UNIV_DEBUG */ - export_vars.innodb_buffer_pool_pages_total = buf_pool.get_n_pages(); + export_vars.innodb_buffer_pool_pages_total = buf_pool.curr_size(); export_vars.innodb_buffer_pool_pages_misc = - buf_pool.get_n_pages() + export_vars.innodb_buffer_pool_pages_total - UT_LIST_GET_LEN(buf_pool.LRU) - UT_LIST_GET_LEN(buf_pool.free); + if (size_t shrinking = buf_pool.is_shrinking()) { + snprintf(export_vars.innodb_buffer_pool_resize_status, + sizeof export_vars.innodb_buffer_pool_resize_status, + "Withdrawing blocks. 
(%zu/%zu).", + buf_pool.to_withdraw(), shrinking); + } else { + export_vars.innodb_buffer_pool_resize_status[0] = '\0'; + } + mysql_mutex_unlock(&buf_pool.mutex); export_vars.innodb_max_trx_id = trx_sys.get_max_trx_id(); export_vars.innodb_history_list_length = trx_sys.history_size_approx(); @@ -979,13 +979,13 @@ mysql_mutex_unlock(&srv_innodb_monitor_mutex); - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); export_vars.innodb_lsn_current = log_sys.get_lsn(); export_vars.innodb_lsn_flushed = log_sys.get_flushed_lsn(); export_vars.innodb_lsn_last_checkpoint = log_sys.last_checkpoint_lsn; export_vars.innodb_checkpoint_max_age = static_cast( log_sys.max_checkpoint_age); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); export_vars.innodb_os_log_written = export_vars.innodb_lsn_current - recv_sys.lsn; @@ -1072,7 +1072,7 @@ /* Try to track a strange bug reported by Harald Fuchs and others, where the lsn seems to decrease at times */ - lsn_t new_lsn = log_sys.get_lsn(); + lsn_t new_lsn = log_get_lsn(); ut_a(new_lsn >= old_lsn); old_lsn = new_lsn; @@ -1088,6 +1088,7 @@ now -= start; ulong waited = static_cast(now / 1000000); if (waited >= threshold) { + buf_pool.print_flush_info(); ib::fatal() << dict_sys.fatal_msg; } diff -Nru mariadb-10.11.11/storage/innobase/srv/srv0start.cc mariadb-10.11.13/storage/innobase/srv/srv0start.cc --- mariadb-10.11.11/storage/innobase/srv/srv0start.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/srv/srv0start.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1057,7 +1057,7 @@ /** Prepare to delete the redo log file. Flush the dirty pages from all the buffer pools. Flush the redo log buffer to the redo log file. @return lsn upto which data pages have been flushed. 
*/ -static lsn_t srv_prepare_to_delete_redo_log_file() +static lsn_t srv_prepare_to_delete_redo_log_file() noexcept { DBUG_ENTER("srv_prepare_to_delete_redo_log_file"); @@ -1071,7 +1071,7 @@ log_sys.latch.wr_lock(SRW_LOCK_CALL); const bool latest_format{log_sys.is_latest()}; - lsn_t flushed_lsn{log_sys.get_lsn()}; + lsn_t flushed_lsn{log_sys.get_flushed_lsn(std::memory_order_relaxed)}; if (latest_format && !(log_sys.file_size & 4095) && flushed_lsn != log_sys.next_checkpoint_lsn + @@ -1079,6 +1079,11 @@ ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT)) { +#ifdef HAVE_PMEM + if (!log_sys.is_opened()) + log_sys.buf_size= unsigned(std::min(log_sys.capacity(), + log_sys.buf_size_max)); +#endif fil_names_clear(flushed_lsn); flushed_lsn= log_sys.get_lsn(); } @@ -1119,7 +1124,7 @@ if (latest_format) log_write_up_to(flushed_lsn, false); - ut_ad(flushed_lsn == log_sys.get_lsn()); + ut_ad(flushed_lsn == log_get_lsn()); ut_ad(!os_aio_pending_reads()); ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex)); ut_ad(!buf_pool.get_oldest_modification(0)); @@ -1134,6 +1139,18 @@ nullptr, &rollback_all_recovered_group); +inline lsn_t log_t::init_lsn() noexcept +{ + latch.wr_lock(SRW_LOCK_CALL); + ut_ad(!write_lsn_offset); + write_lsn_offset= 0; + const lsn_t lsn{base_lsn.load(std::memory_order_relaxed)}; + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; + latch.wr_unlock(); + return lsn; +} + /** Start InnoDB. @param[in] create_new_db whether to create a new database @return DB_SUCCESS or error code */ @@ -1288,34 +1305,13 @@ fil_system.create(srv_file_per_table ? 
50000 : 5000); - ib::info() << "Initializing buffer pool, total size = " - << ib::bytes_iec{srv_buf_pool_size} - << ", chunk size = " << ib::bytes_iec{srv_buf_pool_chunk_unit}; - if (buf_pool.create()) { - ib::error() << "Cannot allocate memory for the buffer pool"; - return(srv_init_abort(DB_ERROR)); } - ib::info() << "Completed initialization of buffer pool"; - -#ifdef UNIV_DEBUG - /* We have observed deadlocks with a 5MB buffer pool but - the actual lower limit could very well be a little higher. */ - - if (srv_buf_pool_size <= 5 * 1024 * 1024) { - - ib::info() << "Small buffer pool size (" - << ib::bytes_iec{srv_buf_pool_size} - << "), the flst_validate() debug function can cause a" - << " deadlock if the buffer pool fills up."; - } -#endif /* UNIV_DEBUG */ - log_sys.create(); recv_sys.create(); - lock_sys.create(srv_lock_table_size); + lock_sys.create(srv_lock_table_size = 5 * buf_pool.curr_size()); srv_startup_is_before_trx_rollback_phase = true; diff -Nru mariadb-10.11.11/storage/innobase/trx/trx0purge.cc mariadb-10.11.13/storage/innobase/trx/trx0purge.cc --- mariadb-10.11.11/storage/innobase/trx/trx0purge.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/trx/trx0purge.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1052,16 +1052,25 @@ /** Close all tables that were opened in a purge batch for a worker. 
@param node purge task context @param thd purge coordinator thread handle */ -static void trx_purge_close_tables(purge_node_t *node, THD *thd) +static void trx_purge_close_tables(purge_node_t *node, THD *thd) noexcept { for (auto &t : node->tables) { - if (!t.second.first); - else if (t.second.first == reinterpret_cast(-1)); - else + dict_table_t *table= t.second.first; + if (table != nullptr && table != reinterpret_cast(-1)) + table->release(); + } + + MDL_context *mdl_context= static_cast(thd_mdl_context(thd)); + + for (auto &t : node->tables) + { + dict_table_t *table= t.second.first; + if (table != nullptr && table != reinterpret_cast(-1)) { - dict_table_close(t.second.first, false, thd, t.second.second); t.second.first= reinterpret_cast(-1); + if (mdl_context != nullptr && t.second.second != nullptr) + mdl_context->release_lock(t.second.second); } } } @@ -1073,36 +1082,35 @@ } __attribute__((nonnull)) -/** Aqcuire a metadata lock on a table. +/** Acquire a metadata lock on a table. 
@param table table handle @param mdl_context metadata lock acquisition context -@param mdl metadata lcok +@param mdl metadata lock @return table handle @retval nullptr if the table is not found or accessible @retval -1 if the purge of history must be suspended due to DDL */ static dict_table_t *trx_purge_table_acquire(dict_table_t *table, MDL_context *mdl_context, - MDL_ticket **mdl) + MDL_ticket **mdl) noexcept { ut_ad(dict_sys.frozen_not_locked()); *mdl= nullptr; if (!table->is_readable() || table->corrupted) - { - table->release(); return nullptr; - } size_t db_len= dict_get_db_name_len(table->name.m_name); if (db_len == 0) - return table; /* InnoDB system tables are not covered by MDL */ + { + /* InnoDB system tables are not covered by MDL */ + got_table: + table->acquire(); + return table; + } if (purge_sys.must_wait_FTS()) - { must_wait: - table->release(); return reinterpret_cast(-1); - } char db_buf[NAME_LEN + 1]; char tbl_buf[NAME_LEN + 1]; @@ -1110,7 +1118,7 @@ if (!table->parse_name(db_buf, tbl_buf, &db_len, &tbl_len)) /* The name of an intermediate table starts with #sql */ - return table; + goto got_table; { MDL_request request; @@ -1123,37 +1131,38 @@ goto must_wait; } - return table; + goto got_table; } /** Open a table handle for the purge of committed transaction history @param table_id InnoDB table identifier @param mdl_context metadata lock acquisition context -@param mdl metadata lcok +@param mdl metadata lock @return table handle @retval nullptr if the table is not found or accessible @retval -1 if the purge of history must be suspended due to DDL */ static dict_table_t *trx_purge_table_open(table_id_t table_id, MDL_context *mdl_context, - MDL_ticket **mdl) + MDL_ticket **mdl) noexcept { - dict_sys.freeze(SRW_LOCK_CALL); - - dict_table_t *table= dict_sys.find_table(table_id); + dict_table_t *table; - if (table) - table->acquire(); - else + for (;;) { + dict_sys.freeze(SRW_LOCK_CALL); + table= dict_sys.find_table(table_id); + if (table) + break; 
dict_sys.unfreeze(); dict_sys.lock(SRW_LOCK_CALL); table= dict_load_table_on_id(table_id, DICT_ERR_IGNORE_FK_NOKEY); - if (table) - table->acquire(); dict_sys.unlock(); if (!table) return nullptr; - dict_sys.freeze(SRW_LOCK_CALL); + /* At this point, the freshly loaded table may already have been evicted. + We must look it up again while holding a shared dict_sys.latch. We keep + trying this until the table is found in the cache or it cannot be found + in the dictionary (because the table has been dropped or rebuilt). */ } table= trx_purge_table_acquire(table, mdl_context, mdl); @@ -1172,10 +1181,7 @@ for (que_thr_t *thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); thr; thr= UT_LIST_GET_NEXT(thrs, thr)) - { - purge_node_t *node= static_cast(thr->child); - trx_purge_close_tables(node, thd); - } + trx_purge_close_tables(static_cast(thr->child), thd); m_active= false; wait_FTS(false); @@ -1198,7 +1204,7 @@ if (t.second.first == reinterpret_cast(-1)) { if (table) - dict_table_close(table, false, thd, *mdl); + dict_table_close(table, thd, *mdl); goto retry; } } @@ -1231,9 +1237,6 @@ static_cast(thd_mdl_context(thd)); ut_ad(mdl_context); - const size_t max_pages= - std::min(buf_pool.curr_size * 3 / 4, size_t{srv_purge_batch_size}); - while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) { /* Track the max {trx_id, undo_no} for truncating the @@ -1283,12 +1286,12 @@ ut_ad(!table_node->in_progress); } - if (purge_sys.n_pages_handled() >= max_pages) + const size_t size{purge_sys.n_pages_handled()}; + if (size >= size_t{srv_purge_batch_size} || + size >= buf_pool.usable_size() * 3 / 4) break; } - purge_sys.m_active= false; - #ifdef UNIV_DEBUG thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); for (ulint i= 0; thr && i < *n_work_items; @@ -1337,6 +1340,8 @@ TRANSACTIONAL_INLINE void purge_sys_t::batch_cleanup(const purge_sys_t::iterator &head) { + m_active= false; + /* Release the undo pages. 
*/ for (auto p : pages) p.second->unfix(); diff -Nru mariadb-10.11.11/storage/innobase/trx/trx0rec.cc mariadb-10.11.13/storage/innobase/trx/trx0rec.cc --- mariadb-10.11.11/storage/innobase/trx/trx0rec.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/trx/trx0rec.cc 2025-05-19 16:14:25.000000000 +0000 @@ -152,7 +152,9 @@ ulint n_idx = 0; for (const auto& v_index : vcol->v_indexes) { n_idx++; - /* FIXME: index->id is 64 bits! */ + if (uint32_t hi= uint32_t(v_index.index->id >> 32)) { + size += 1 + mach_get_compressed_size(hi); + } size += mach_get_compressed_size(uint32_t(v_index.index->id)); size += mach_get_compressed_size(v_index.nth_field); } @@ -179,10 +181,14 @@ ptr += mach_write_compressed(ptr, n_idx); for (const auto& v_index : vcol->v_indexes) { - ptr += mach_write_compressed( - /* FIXME: index->id is 64 bits! */ - ptr, uint32_t(v_index.index->id)); - + /* This is compatible with + ptr += mach_u64_write_much_compressed(ptr, v_index.index-id) + (the added "if" statement is fixing an old regression). */ + if (uint32_t hi= uint32_t(v_index.index->id >> 32)) { + *ptr++ = 0xff; + ptr += mach_write_compressed(ptr, hi); + } + ptr += mach_write_compressed(ptr, uint32_t(v_index.index->id)); ptr += mach_write_compressed(ptr, v_index.nth_field); } @@ -221,7 +227,15 @@ dict_index_t* clust_index = dict_table_get_first_index(table); for (ulint i = 0; i < num_idx; i++) { - index_id_t id = mach_read_next_compressed(&ptr); + index_id_t id = 0; + /* This is like mach_u64_read_much_compressed(), + but advancing ptr to the next field. 
*/ + if (*ptr == 0xff) { + ptr++; + id = mach_read_next_compressed(&ptr); + id <<= 32; + } + id |= mach_read_next_compressed(&ptr); ulint pos = mach_read_next_compressed(&ptr); dict_index_t* index = dict_table_get_next_index(clust_index); diff -Nru mariadb-10.11.11/storage/innobase/trx/trx0trx.cc mariadb-10.11.13/storage/innobase/trx/trx0trx.cc --- mariadb-10.11.11/storage/innobase/trx/trx0trx.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/trx/trx0trx.cc 2025-05-19 16:14:25.000000000 +0000 @@ -134,8 +134,6 @@ trx->will_lock = false; - trx->bulk_insert = false; - trx->apply_online_log = false; ut_d(trx->start_file = 0); @@ -452,7 +450,7 @@ /** Transition to committed state, to release implicit locks. */ TRANSACTIONAL_INLINE inline void trx_t::commit_state() { - ut_d(auto trx_state{state}); + ut_d(auto trx_state= state); ut_ad(trx_state == TRX_STATE_PREPARED || trx_state == TRX_STATE_PREPARED_RECOVERED || trx_state == TRX_STATE_ACTIVE); @@ -1513,6 +1511,7 @@ *detailed_error= '\0'; mod_tables.clear(); + bulk_insert= TRX_NO_BULK; check_foreigns= true; check_unique_secondary= true; assert_freed(); diff -Nru mariadb-10.11.11/storage/innobase/ut/ut0rnd.cc mariadb-10.11.13/storage/innobase/ut/ut0rnd.cc --- mariadb-10.11.11/storage/innobase/ut/ut0rnd.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/ut/ut0rnd.cc 2025-05-19 16:14:25.000000000 +0000 @@ -48,6 +48,8 @@ ulint pow2; ulint i; + ut_ad(n); + n += 100; pow2 = 1; diff -Nru mariadb-10.11.11/storage/maria/ma_control_file.c mariadb-10.11.13/storage/maria/ma_control_file.c --- mariadb-10.11.11/storage/maria/ma_control_file.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/maria/ma_control_file.c 2025-05-19 16:14:25.000000000 +0000 @@ -276,7 +276,7 @@ int open_flags) { uchar buffer[CF_MAX_SIZE]; - char name[FN_REFLEN], errmsg_buff[256]; + char name[FN_REFLEN], errmsg_buff[512]; const char *errmsg, *lock_failed_errmsg= "Could not get an exclusive 
lock;" " file is probably in use by another process"; uint new_cf_create_time_size, new_cf_changeable_size, new_block_size; @@ -399,10 +399,14 @@ if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE || new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE || - new_cf_create_time_size + new_cf_changeable_size != file_size) + new_cf_create_time_size + new_cf_changeable_size > file_size) { error= CONTROL_FILE_INCONSISTENT_INFORMATION; - errmsg= "Sizes stored in control file are inconsistent"; + sprintf(errmsg_buff, + "Sizes stored in control file are inconsistent. " + "create_time_size: %u changeable_size: %u file_size: %llu", + new_cf_create_time_size, new_cf_changeable_size, (ulonglong) file_size); + errmsg= errmsg_buff; goto err; } @@ -622,6 +626,20 @@ return (control_file_fd >= 0); } + + +static int check_zerofill(uchar *buffer, ulonglong offset, ulonglong length) +{ + uchar *pos= buffer + offset, *end= buffer+length; + while (pos < end) + { + if (*pos++) + return 1; + } + return 0; +} + + /** Print content of aria_log_control file */ @@ -629,6 +647,7 @@ my_bool print_aria_log_control() { uchar buffer[CF_MAX_SIZE]; + char errmsg_buff[512]; char name[FN_REFLEN], uuid_str[MY_UUID_STRING_LENGTH+1]; const char *errmsg; uint new_cf_create_time_size, new_cf_changeable_size; @@ -705,10 +724,14 @@ if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE || new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE || - new_cf_create_time_size + new_cf_changeable_size != file_size) + new_cf_create_time_size + new_cf_changeable_size > file_size) { error= CONTROL_FILE_INCONSISTENT_INFORMATION; - errmsg= "Sizes stored in control file are inconsistent"; + sprintf(errmsg_buff, + "Sizes stored in control file are inconsistent. 
" + "create_time_size: %u changeable_size: %u file_size: %llu", + new_cf_create_time_size, new_cf_changeable_size, (ulonglong) file_size); + errmsg= errmsg_buff; goto err; } checkpoint_lsn= lsn_korr(buffer + new_cf_create_time_size + @@ -732,6 +755,18 @@ (buffer + new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0]; printf("recovery_failures: %u\n", recovery_fails); } + if (check_zerofill(buffer, new_cf_create_time_size + new_cf_changeable_size, file_size)) + { + printf("Warning: %s file_size is %llu (should be %llu) and contains unknown data.\n" + "It will still work but should be examined.\n", + name, (ulonglong) file_size, + (ulonglong) (new_cf_create_time_size + new_cf_changeable_size)); + } + else if (new_cf_create_time_size + new_cf_changeable_size < file_size) + printf("Note: file_size (%llu) is bigger than the expected file size %llu.\n" + "This is unexpected but will not cause any issues.\n", + (ulonglong) file_size, + (ulonglong) (new_cf_create_time_size + new_cf_changeable_size)); mysql_file_close(file, MYF(0)); DBUG_RETURN(0); diff -Nru mariadb-10.11.11/storage/maria/ma_pagecache.c mariadb-10.11.13/storage/maria/ma_pagecache.c --- mariadb-10.11.11/storage/maria/ma_pagecache.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/maria/ma_pagecache.c 2025-05-19 16:14:25.000000000 +0000 @@ -4726,10 +4726,10 @@ static int cmp_sec_link(const void *a_, const void *b_) { - PAGECACHE_BLOCK_LINK *const *a= a_; - PAGECACHE_BLOCK_LINK *const *b= b_; - return (((*a)->hash_link->pageno < (*b)->hash_link->pageno) ? -1 : - ((*a)->hash_link->pageno > (*b)->hash_link->pageno) ? 1 : 0); + const PAGECACHE_BLOCK_LINK *a= *(const PAGECACHE_BLOCK_LINK **) a_; + const PAGECACHE_BLOCK_LINK *b= *(const PAGECACHE_BLOCK_LINK **) b_; + return ((a->hash_link->pageno < b->hash_link->pageno) ? -1 : + (a->hash_link->pageno > b->hash_link->pageno) ? 
1 : 0); } diff -Nru mariadb-10.11.11/storage/maria/ma_unique.c mariadb-10.11.13/storage/maria/ma_unique.c --- mariadb-10.11.11/storage/maria/ma_unique.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/maria/ma_unique.c 2025-05-19 16:14:25.000000000 +0000 @@ -139,6 +139,8 @@ { uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos); memcpy((void*) &pos,pos+keyseg->bit_start,sizeof(char*)); + if (!pos) + pos= (const uchar*) ""; /* hash_sort does not support NULL ptr */ if (!length || length > tmp_length) length=tmp_length; /* The whole blob */ } @@ -236,6 +238,10 @@ } memcpy((void*) &pos_a, pos_a+keyseg->bit_start, sizeof(char*)); memcpy((void*) &pos_b, pos_b+keyseg->bit_start, sizeof(char*)); + if (pos_a == 0) + pos_a= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ + if (pos_b == 0) + pos_b= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ } if (type == HA_KEYTYPE_TEXT/* the CHAR data type*/) { diff -Nru mariadb-10.11.11/storage/mroonga/CMakeLists.txt mariadb-10.11.13/storage/mroonga/CMakeLists.txt --- mariadb-10.11.11/storage/mroonga/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -17,7 +17,7 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 2.8...3.12) project(mroonga) if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") diff -Nru mariadb-10.11.11/storage/mroonga/ha_mroonga.cpp mariadb-10.11.13/storage/mroonga/ha_mroonga.cpp --- mariadb-10.11.11/storage/mroonga/ha_mroonga.cpp 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/ha_mroonga.cpp 2025-05-19 16:14:25.000000000 +0000 @@ -558,6 +558,9 @@ case HA_EXTRA_END_ALTER_COPY: inspected = "HA_EXTRA_END_ALTER_COPY"; break; + case HA_EXTRA_ABORT_ALTER_COPY: + inspected = 
"HA_EXTRA_ABORT_ALTER_COPY"; + break; #ifdef MRN_HAVE_HA_EXTRA_EXPORT case HA_EXTRA_EXPORT: inspected = "HA_EXTRA_EXPORT"; @@ -593,6 +596,11 @@ inspected = "HA_EXTRA_END_ALTER_COPY"; break; #endif +#ifdef MRN_HAVE_HA_EXTRA_ABORT_ALTER_COPY + case HA_EXTRA_ABORT_ALTER_COPY: + inspected = "HA_EXTRA_ABORT_ALTER_COPY"; + break; +#endif #ifdef MRN_HAVE_HA_EXTRA_NO_AUTOINC_LOCKING case HA_EXTRA_NO_AUTOINC_LOCKING: inspected = "HA_EXTRA_NO_AUTOINC_LOCKING"; diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/CMakeLists.txt mariadb-10.11.13/storage/mroonga/vendor/groonga/CMakeLists.txt --- mariadb-10.11.11/storage/mroonga/vendor/groonga/CMakeLists.txt 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/CMakeLists.txt 2025-05-19 16:14:26.000000000 +0000 @@ -15,7 +15,7 @@ # https://buildbot.askmonty.org/buildbot/builders/work-amd64-valgrind/builds/5263/steps/compile/logs/stdio # says CMake 2.6.2... We want to drop old software support... -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 2.8...3.12) # cmake_minimum_required(VERSION 2.6.4) # CentOS 5 set(GRN_PROJECT_NAME "groonga") set(GRN_PROJECT_LABEL "Groonga") diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/db.c mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/db.c --- mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/db.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/db.c 2025-05-19 16:14:26.000000000 +0000 @@ -969,8 +969,8 @@ *subrec_size = range_size + sizeof(uint32_t) + sizeof(uint32_t); break; } - *value_size = (uintptr_t)GRN_RSET_SUBRECS_NTH((((grn_rset_recinfo *)0)->subrecs), - *subrec_size, max_n_subrecs); + *value_size = (uintptr_t) GRN_RSET_SUBRECS_NTH(offsetof(grn_rset_recinfo, subrecs), + *subrec_size, max_n_subrecs); } else { *value_size = range_size; } diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/hash.c mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/hash.c --- 
mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/hash.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/hash.c 2025-05-19 16:14:26.000000000 +0000 @@ -1727,15 +1727,15 @@ { if (flags & GRN_OBJ_KEY_VAR_SIZE) { if (flags & GRN_OBJ_KEY_LARGE) { - return (uintptr_t)((grn_io_hash_entry_large *)0)->value + value_size; + return offsetof(grn_io_hash_entry_large, value) + value_size; } else { - return (uintptr_t)((grn_io_hash_entry_normal *)0)->value + value_size; + return offsetof(grn_io_hash_entry_normal, value) + value_size; } } else { if (key_size == sizeof(uint32_t)) { - return (uintptr_t)((grn_plain_hash_entry *)0)->value + value_size; + return offsetof(grn_plain_hash_entry, value) + value_size; } else { - return (uintptr_t)((grn_rich_hash_entry *)0)->key_and_value + return offsetof(grn_rich_hash_entry, key_and_value) + key_size + value_size; } } @@ -1865,12 +1865,12 @@ { uint32_t entry_size; if (flags & GRN_OBJ_KEY_VAR_SIZE) { - entry_size = (uintptr_t)((grn_tiny_hash_entry *)0)->value + value_size; + entry_size = offsetof(grn_tiny_hash_entry, value) + value_size; } else { if (key_size == sizeof(uint32_t)) { - entry_size = (uintptr_t)((grn_plain_hash_entry *)0)->value + value_size; + entry_size = offsetof(grn_plain_hash_entry, value) + value_size; } else { - entry_size = (uintptr_t)((grn_rich_hash_entry *)0)->key_and_value + entry_size = offsetof(grn_rich_hash_entry, key_and_value) + key_size + value_size; } } diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/ii.c mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/ii.c --- mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/ii.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/ii.c 2025-05-19 16:14:26.000000000 +0000 @@ -2049,7 +2049,7 @@ if ((df & 1)) { df >>= 1; size = nreq == dvlen ? 
data_size : df * nreq; - if (dv[dvlen].data < dv[0].data + size) { + if (!dv[0].data || dv[dvlen].data < dv[0].data + size) { if (dv[0].data) { GRN_FREE(dv[0].data); } if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } dv[dvlen].data = rp + size; @@ -10653,7 +10653,7 @@ } #define GRN_II_BUILDER_TERM_INPLACE_SIZE\ - (sizeof(grn_ii_builder_term) - (uintptr_t)&((grn_ii_builder_term *)0)->dummy) + (sizeof(grn_ii_builder_term) - offsetof(grn_ii_builder_term, dummy)) typedef struct { grn_id rid; /* Last record ID */ diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt mariadb-10.11.13/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt --- mariadb-10.11.11/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt 2025-05-19 16:14:26.000000000 +0000 @@ -15,7 +15,7 @@ # Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1335 USA -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 2.8...3.12) if(NOT DEFINED GROONGA_NORMALIZER_MYSQL_PROJECT_NAME) set(GROONGA_NORMALIZER_MYSQL_PROJECT_NAME "groonga-normalizer-mysql") endif() diff -Nru mariadb-10.11.11/storage/myisam/mi_unique.c mariadb-10.11.13/storage/myisam/mi_unique.c --- mariadb-10.11.11/storage/myisam/mi_unique.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/myisam/mi_unique.c 2025-05-19 16:14:26.000000000 +0000 @@ -115,6 +115,8 @@ { uint tmp_length=_mi_calc_blob_length(keyseg->bit_start,pos); memcpy((char**) &pos, pos+keyseg->bit_start, sizeof(char*)); + if (!pos) + pos= (const uchar*) ""; /* hash_sort does not support NULL ptr */ if (!length || length > tmp_length) length=tmp_length; /* The whole blob */ } @@ -211,6 +213,10 @@ } memcpy((char**) &pos_a, pos_a+keyseg->bit_start, 
sizeof(char*)); memcpy((char**) &pos_b, pos_b+keyseg->bit_start, sizeof(char*)); + if (pos_a == 0) + pos_a= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ + if (pos_b == 0) + pos_b= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ } if (type == HA_KEYTYPE_TEXT/*The CHAR data type*/) { diff -Nru mariadb-10.11.11/storage/rocksdb/build_rocksdb.cmake mariadb-10.11.13/storage/rocksdb/build_rocksdb.cmake --- mariadb-10.11.11/storage/rocksdb/build_rocksdb.cmake 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/build_rocksdb.cmake 2025-05-19 16:14:26.000000000 +0000 @@ -176,35 +176,53 @@ # - *_test.cc # - *_bench.cc set(ROCKSDB_SOURCES + cache/cache.cc + cache/cache_entry_roles.cc + cache/cache_key.cc + cache/cache_reservation_manager.cc cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc db/arena_wrapped_db_iter.cc + db/blob/blob_fetcher.cc + db/blob/blob_file_addition.cc + db/blob/blob_file_builder.cc + db/blob/blob_file_builder.cc + db/blob/blob_file_cache.cc + db/blob/blob_file_garbage.cc + db/blob/blob_file_meta.cc + db/blob/blob_file_reader.cc + db/blob/blob_garbage_meter.cc + db/blob/blob_log_format.cc + db/blob/blob_log_sequential_reader.cc + db/blob/blob_log_writer.cc + db/blob/prefetch_buffer_collection.cc db/builder.cc db/c.cc db/column_family.cc - db/compacted_db_impl.cc db/compaction/compaction.cc db/compaction/compaction_iterator.cc - db/compaction/compaction_picker.cc db/compaction/compaction_job.cc + db/compaction/compaction_picker.cc db/compaction/compaction_picker_fifo.cc db/compaction/compaction_picker_level.cc db/compaction/compaction_picker_universal.cc + db/compaction/sst_partitioner.cc db/convenience.cc db/db_filesnapshot.cc + db/dbformat.cc + db/db_impl/compacted_db_impl.cc db/db_impl/db_impl.cc - db/db_impl/db_impl_write.cc db/db_impl/db_impl_compaction_flush.cc - db/db_impl/db_impl_files.cc - db/db_impl/db_impl_open.cc db/db_impl/db_impl_debug.cc db/db_impl/db_impl_experimental.cc + 
db/db_impl/db_impl_files.cc + db/db_impl/db_impl_open.cc db/db_impl/db_impl_readonly.cc db/db_impl/db_impl_secondary.cc + db/db_impl/db_impl_write.cc db/db_info_dumper.cc db/db_iter.cc - db/dbformat.cc db/error_handler.cc db/event_helpers.cc db/experimental.cc @@ -215,14 +233,16 @@ db/forward_iterator.cc db/import_column_family_job.cc db/internal_stats.cc - db/logs_with_prep_tracker.cc db/log_reader.cc + db/logs_with_prep_tracker.cc db/log_writer.cc db/malloc_stats.cc db/memtable.cc db/memtable_list.cc db/merge_helper.cc db/merge_operator.cc + db/output_validator.cc + db/periodic_work_scheduler.cc db/range_del_aggregator.cc db/range_tombstone_fragmenter.cc db/repair.cc @@ -233,25 +253,32 @@ db/trim_history_scheduler.cc db/version_builder.cc db/version_edit.cc + db/version_edit_handler.cc db/version_set.cc + db/wal_edit.cc db/wal_manager.cc - db/write_batch.cc db/write_batch_base.cc + db/write_batch.cc db/write_controller.cc db/write_thread.cc + env/composite_env.cc env/env.cc env/env_chroot.cc env/env_encryption.cc env/env_hdfs.cc env/file_system.cc + env/file_system_tracer.cc + env/fs_remap.cc env/mock_env.cc + env/unique_id_gen.cc file/delete_scheduler.cc + file/filename.cc file/file_prefetch_buffer.cc file/file_util.cc - file/filename.cc + file/line_file_reader.cc file/random_access_file_reader.cc - file/read_write_util.cc file/readahead_raf.cc + file/read_write_util.cc file/sequence_file_reader.cc file/sst_file_manager_impl.cc file/writable_file_writer.cc @@ -281,29 +308,38 @@ monitoring/thread_status_util.cc monitoring/thread_status_util_debug.cc options/cf_options.cc + options/configurable.cc + options/customizable.cc options/db_options.cc options/options.cc options/options_helper.cc options/options_parser.cc - options/options_sanity_check.cc port/stack_trace.cc table/adaptive/adaptive_table_factory.cc - table/block_based/block.cc + table/block_based/binary_search_index_reader.cc table/block_based/block_based_filter_block.cc 
table/block_based/block_based_table_builder.cc table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_iterator.cc table/block_based/block_based_table_reader.cc table/block_based/block_builder.cc + table/block_based/block.cc + table/block_based/block_prefetcher.cc table/block_based/block_prefix_index.cc - table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc + table/block_based/data_block_hash_index.cc table/block_based/filter_block_reader_common.cc table/block_based/filter_policy.cc table/block_based/flush_block_policy.cc table/block_based/full_filter_block.cc + table/block_based/hash_index_reader.cc table/block_based/index_builder.cc + table/block_based/index_reader_common.cc table/block_based/parsed_full_filter_block.cc table/block_based/partitioned_filter_block.cc + table/block_based/partitioned_index_iterator.cc + table/block_based/partitioned_index_reader.cc + table/block_based/reader_common.cc table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc table/cuckoo/cuckoo_table_builder.cc @@ -321,10 +357,13 @@ table/plain/plain_table_index.cc table/plain/plain_table_key_coding.cc table/plain/plain_table_reader.cc + table/sst_file_dumper.cc table/sst_file_reader.cc table/sst_file_writer.cc + table/table_factory.cc table/table_properties.cc table/two_level_iterator.cc + table/unique_id.cc test_util/sync_point.cc test_util/sync_point_impl.cc test_util/testutil.cc @@ -335,8 +374,12 @@ tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - trace_replay/trace_replay.cc trace_replay/block_cache_tracer.cc + trace_replay/io_tracer.cc + trace_replay/trace_record.cc + trace_replay/trace_record_handler.cc + trace_replay/trace_record_result.cc + trace_replay/trace_replay.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc @@ -344,17 +387,8 @@ util/concurrent_task_limiter_impl.cc util/crc32c.cc util/dynamic_bloom.cc - util/hash.cc - util/murmurhash.cc - util/random.cc - 
util/rate_limiter.cc - util/slice.cc util/file_checksum_helper.cc - util/status.cc - util/string_util.cc - util/thread_local.cc - util/threadpool_imp.cc - util/xxhash.cc + util/hash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_compaction_filter.cc utilities/blob_db/blob_db.cc @@ -362,10 +396,8 @@ utilities/blob_db/blob_db_impl_filesnapshot.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc - utilities/blob_db/blob_log_reader.cc - utilities/blob_db/blob_log_writer.cc - utilities/blob_db/blob_log_format.cc utilities/checkpoint/checkpoint_impl.cc + utilities/compaction_filters.cc utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc utilities/debug.cc utilities/env_mirror.cc @@ -373,11 +405,12 @@ utilities/leveldb_options/leveldb_options.cc utilities/memory/memory_util.cc utilities/merge_operators/bytesxor.cc + utilities/merge_operators.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc utilities/merge_operators/sortlist.cc - utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc + utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/uint64add.cc utilities/object_registry.cc utilities/option_change_migration/option_change_migration.cc @@ -391,22 +424,37 @@ utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc - utilities/transactions/optimistic_transaction_db_impl.cc + utilities/trace/replayer_impl.cc + utilities/transactions/lock/lock_manager.cc + utilities/transactions/lock/point/point_lock_manager.cc + utilities/transactions/lock/point/point_lock_tracker.cc utilities/transactions/optimistic_transaction.cc + utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/pessimistic_transaction.cc utilities/transactions/pessimistic_transaction_db.cc utilities/transactions/snapshot_checker.cc 
utilities/transactions/transaction_base.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc utilities/transactions/write_prepared_txn_db.cc utilities/transactions/write_unprepared_txn.cc utilities/transactions/write_unprepared_txn_db.cc utilities/ttl/db_ttl_impl.cc + utilities/wal_filter.cc utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc + util/murmurhash.cc + util/random.cc + util/rate_limiter.cc + util/regex.cc + util/ribbon_config.cc + util/slice.cc + util/status.cc + util/string_util.cc + util/thread_local.cc + util/threadpool_imp.cc + util/xxhash.cc ) @@ -484,8 +532,10 @@ STRING(TIMESTAMP GIT_DATE_TIME "%Y-%m-%d %H:%M:%S") ENDIF() +# psergey-added: +SET(GIT_MOD 0) CONFIGURE_FILE(${ROCKSDB_SOURCE_DIR}/util/build_version.cc.in build_version.cc @ONLY) -INCLUDE_DIRECTORIES(${ROCKSDB_SOURCE_DIR}/util) + list(APPEND SOURCES ${CMAKE_CURRENT_BINARY_DIR}/build_version.cc) ADD_CONVENIENCE_LIBRARY(rocksdblib ${SOURCES}) diff -Nru mariadb-10.11.11/storage/rocksdb/ha_rocksdb.cc mariadb-10.11.13/storage/rocksdb/ha_rocksdb.cc --- mariadb-10.11.11/storage/rocksdb/ha_rocksdb.cc 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/ha_rocksdb.cc 2025-05-19 16:14:26.000000000 +0000 @@ -1250,7 +1250,7 @@ "Statistics Level for RocksDB. 
Default is 0 (kExceptHistogramOrTimers)", nullptr, rocksdb_set_rocksdb_stats_level, /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers, - /* min */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers, + /* min */ (uint)rocksdb::StatsLevel::kDisableAll, /* max */ (uint)rocksdb::StatsLevel::kAll, 0); static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size, @@ -1596,7 +1596,7 @@ "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr, rocksdb_tbl_options->no_block_cache); -static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size, +static MYSQL_SYSVAR_UINT64_T(block_size, rocksdb_tbl_options->block_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "BlockBasedTableOptions::block_size for RocksDB", nullptr, nullptr, rocksdb_tbl_options->block_size, @@ -3992,7 +3992,7 @@ DBUG_ASSERT(xid != nullptr); DBUG_ASSERT(commit_latency_stats != nullptr); - rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); + rocksdb::StopWatchNano timer(rocksdb::SystemClock::Default().get(), true); const auto name = rdb_xid_to_string(*xid); DBUG_ASSERT(!name.empty()); @@ -4187,7 +4187,7 @@ DBUG_ASSERT(thd != nullptr); DBUG_ASSERT(commit_latency_stats != nullptr); - rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); + rocksdb::StopWatchNano timer(rocksdb::SystemClock::Default().get(), true); /* note: h->external_lock(F_UNLCK) is called after this function is called) */ Rdb_transaction *tx = get_tx_from_thd(thd); @@ -4732,8 +4732,7 @@ if (tf_name.find("BlockBasedTable") != std::string::npos) { const rocksdb::BlockBasedTableOptions *const bbt_opt = - reinterpret_cast( - table_factory->GetOptions()); + table_factory->GetOptions(); if (bbt_opt != nullptr) { if (bbt_opt->block_cache.get() != nullptr) { diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result --- 
mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result 2025-05-19 16:14:26.000000000 +0000 @@ -20,7 +20,7 @@ set rocksdb_verify_row_debug_checksums=1; set session debug_dbug= "+d,myrocks_simulate_bad_row_read1"; select * from t1 where pk=1; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_row_read1"; set rocksdb_verify_row_debug_checksums=@tmp1; select * from t1 where pk=1; @@ -28,11 +28,11 @@ 1 1 set session debug_dbug= "+d,myrocks_simulate_bad_row_read2"; select * from t1 where pk=1; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_row_read2"; set session debug_dbug= "+d,myrocks_simulate_bad_row_read3"; select * from t1 where pk=1; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_row_read3"; insert into t1 values(4,'0123456789'); select * from t1; @@ -56,7 +56,7 @@ ABCD 1 set session debug_dbug= "+d,myrocks_simulate_bad_pk_read1"; select * from t2; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_pk_read1"; drop table t2; create table t2 ( @@ -69,6 +69,6 @@ ABCD 1 set session debug_dbug= "+d,myrocks_simulate_bad_pk_read1"; select * from t2; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' 
from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_pk_read1"; drop table t2; diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result 2025-05-19 16:14:26.000000000 +0000 @@ -1,12 +1,6 @@ -call mtr.add_suppression("Column family 'cf1' not found"); -call mtr.add_suppression("Column family 'rev:cf2' not found"); DROP TABLE IF EXISTS t1; call mtr.add_suppression("Column family 'cf1' not found"); call mtr.add_suppression("Column family 'rev:cf2' not found"); -set global rocksdb_compact_cf = 'cf1'; -set global rocksdb_compact_cf = 'rev:cf2'; -set global rocksdb_signal_drop_index_thread = 1; -# restart CREATE TABLE t1 ( a int not null, b int not null, @@ -15,6 +9,10 @@ key (b) comment 'rev:cf2' ) ENGINE=RocksDB; DELETE FROM t1; +set global rocksdb_compact_cf = 'cf1'; +set global rocksdb_compact_cf = 'rev:cf2'; +set global rocksdb_signal_drop_index_thread = 1; +# restart select variable_value into @a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; drop table t1; select case when variable_value-@a < 500000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result 2025-05-19 16:14:26.000000000 +0000 @@ -982,7 +982,7 @@ rocksdb_skip_unique_check_tables .* rocksdb_sst_mgr_rate_bytes_per_sec 0 rocksdb_stats_dump_period_sec 600 -rocksdb_stats_level 0 
+rocksdb_stats_level 1 rocksdb_stats_recalc_rate 0 rocksdb_store_row_debug_checksums OFF rocksdb_strict_collation_check OFF diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result 2025-05-19 16:14:26.000000000 +0000 @@ -1,2 +1,2 @@ Check for MANIFEST files -MANIFEST-000006 +MANIFEST-000004 diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result 2025-05-19 16:14:26.000000000 +0000 @@ -1,12 +1,6 @@ -call mtr.add_suppression("Column family 'cf1' not found"); -call mtr.add_suppression("Column family 'rev:cf2' not found"); DROP TABLE IF EXISTS t1; call mtr.add_suppression("Column family 'cf1' not found"); call mtr.add_suppression("Column family 'rev:cf2' not found"); -set global rocksdb_compact_cf = 'cf1'; -set global rocksdb_compact_cf = 'rev:cf2'; -set global rocksdb_signal_drop_index_thread = 1; -# restart CREATE TABLE t1 ( a int not null, b int not null, @@ -15,6 +9,10 @@ key (b) comment 'rev:cf2' ) ENGINE=RocksDB; DELETE FROM t1; +set global rocksdb_compact_cf = 'cf1'; +set global rocksdb_compact_cf = 'rev:cf2'; +set global rocksdb_signal_drop_index_thread = 1; +# restart select variable_value into @a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; truncate table t1; select case when variable_value-@a < 500000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; 
diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc 2025-05-19 16:14:26.000000000 +0000 @@ -1,8 +1,5 @@ --source include/have_rocksdb.inc -call mtr.add_suppression("Column family 'cf1' not found"); -call mtr.add_suppression("Column family 'rev:cf2' not found"); - --disable_warnings DROP TABLE IF EXISTS t1; --enable_warnings @@ -10,11 +7,6 @@ call mtr.add_suppression("Column family 'cf1' not found"); call mtr.add_suppression("Column family 'rev:cf2' not found"); -# Start from clean slate -set global rocksdb_compact_cf = 'cf1'; -set global rocksdb_compact_cf = 'rev:cf2'; -set global rocksdb_signal_drop_index_thread = 1; ---source include/restart_mysqld.inc CREATE TABLE t1 ( a int not null, @@ -29,6 +21,12 @@ let $table = t1; --source drop_table3_repopulate_table.inc +# Start from clean slate +set global rocksdb_compact_cf = 'cf1'; +set global rocksdb_compact_cf = 'rev:cf2'; +set global rocksdb_signal_drop_index_thread = 1; +--source include/restart_mysqld.inc + --disable_cursor_protocol select variable_value into @a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; --enable_cursor_protocol @@ -49,6 +47,7 @@ --source include/wait_condition.inc select case when variable_value-@a < 500000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; +#select variable_value-@a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; # Cleanup DROP TABLE IF EXISTS t1; diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result --- 
mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result 2025-05-19 16:14:26.000000000 +0000 @@ -11,7 +11,7 @@ SET @start_global_value = @@global.ROCKSDB_STATS_LEVEL; SELECT @start_global_value; @start_global_value -0 +1 '# Setting to valid values in global scope#' "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 0" SET @@global.ROCKSDB_STATS_LEVEL = 0; @@ -22,7 +22,7 @@ SET @@global.ROCKSDB_STATS_LEVEL = DEFAULT; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 4" SET @@global.ROCKSDB_STATS_LEVEL = 4; SELECT @@global.ROCKSDB_STATS_LEVEL; @@ -32,7 +32,7 @@ SET @@global.ROCKSDB_STATS_LEVEL = DEFAULT; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 2" SET @@global.ROCKSDB_STATS_LEVEL = 2; SELECT @@global.ROCKSDB_STATS_LEVEL; @@ -42,7 +42,7 @@ SET @@global.ROCKSDB_STATS_LEVEL = DEFAULT; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@session.ROCKSDB_STATS_LEVEL to 444. It should fail because it is not session." 
SET @@session.ROCKSDB_STATS_LEVEL = 444; ERROR HY000: Variable 'rocksdb_stats_level' is a GLOBAL variable and should be set with SET GLOBAL @@ -52,34 +52,34 @@ Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 'bbb'" SET @@global.ROCKSDB_STATS_LEVEL = 'bbb'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to '-1'" SET @@global.ROCKSDB_STATS_LEVEL = '-1'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to '101'" SET @@global.ROCKSDB_STATS_LEVEL = '101'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to '484436'" SET @@global.ROCKSDB_STATS_LEVEL = '484436'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 SET @@global.ROCKSDB_STATS_LEVEL = @start_global_value; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 DROP TABLE valid_values; DROP TABLE invalid_values; diff -Nru mariadb-10.11.11/storage/rocksdb/rdb_i_s.cc mariadb-10.11.13/storage/rocksdb/rdb_i_s.cc --- mariadb-10.11.11/storage/rocksdb/rdb_i_s.cc 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rdb_i_s.cc 2025-05-19 16:14:26.000000000 +0000 @@ -587,8 +587,7 @@ cf_option_types.push_back( {"PREFIX_EXTRACTOR", opts.prefix_extractor == nullptr ? 
"NULL" - : std::string(opts.prefix_extractor->Name())}); - + : std::string(opts.prefix_extractor->AsString())}); // get COMPACTION_STYLE option switch (opts.compaction_style) { case rocksdb::kCompactionStyleLevel: @@ -646,7 +645,7 @@ // get table related options std::vector table_options = - split_into_vector(opts.table_factory->GetPrintableTableOptions(), '\n'); + split_into_vector(opts.table_factory->GetPrintableOptions(), '\n'); for (auto option : table_options) { option.erase(std::remove(option.begin(), option.end(), ' '), diff -Nru mariadb-10.11.11/storage/rocksdb/rdb_source_revision.h mariadb-10.11.13/storage/rocksdb/rdb_source_revision.h --- mariadb-10.11.11/storage/rocksdb/rdb_source_revision.h 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rdb_source_revision.h 2025-05-19 16:14:28.000000000 +0000 @@ -1 +1 @@ -#define ROCKSDB_GIT_HASH "bba5e7bc21093d7cfa765e1280a7c4fdcd284288" +#define ROCKSDB_GIT_HASH "79f08d7ffa6d34d9ca3357777bcb335884a56cfb" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,54 @@ +#! /bin/bash + +# Work around issue with parallel make output causing random error, as in +# make[1]: write error: stdout +# Probably due to a kernel bug: +# https://bugs.launchpad.net/ubuntu/+source/linux-signed/+bug/1814393 +# Seems to affect image ubuntu-1604:201903-01 and ubuntu-1604:202004-01 + +cd "$(dirname $0)" + +if [ ! 
-x cat_ignore_eagain.out ]; then + cc -x c -o cat_ignore_eagain.out - << EOF +#include +#include +#include +int main() { + int n, m, p; + char buf[1024]; + for (;;) { + n = read(STDIN_FILENO, buf, 1024); + if (n > 0 && n <= 1024) { + for (m = 0; m < n;) { + p = write(STDOUT_FILENO, buf + m, n - m); + if (p < 0) { + if (errno == EAGAIN) { + // ignore but pause a bit + usleep(100); + } else { + perror("write failed"); + return 42; + } + } else { + m += p; + } + } + } else if (n < 0) { + if (errno == EAGAIN) { + // ignore but pause a bit + usleep(100); + } else { + // Some non-ignorable error + perror("read failed"); + return 43; + } + } else { + // EOF + return 0; + } + } +} +EOF +fi + +exec ./cat_ignore_eagain.out diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/config.yml mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/config.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/config.yml 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/config.yml 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,872 @@ +version: 2.1 + +orbs: + win: circleci/windows@2.4.0 + slack: circleci/slack@3.4.2 + +aliases: + - ¬ify-on-main-failure + fail_only: true + only_for_branches: main + +commands: + install-cmake-on-macos: + steps: + - run: + name: Install cmake on macos + command: | + HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake + + install-jdk8-on-macos: + steps: + - run: + name: Install JDK 8 on macos + command: | + brew install --cask adoptopenjdk/openjdk/adoptopenjdk8 + + increase-max-open-files-on-macos: + steps: + - run: + name: Increase max open files + command: | + sudo sysctl -w kern.maxfiles=1048576 + sudo sysctl -w kern.maxfilesperproc=1048576 + sudo launchctl limit maxfiles 1048576 + + pre-steps: + steps: + - checkout + - run: + name: Setup Environment Variables + command: | + echo "export GTEST_THROW_ON_FAILURE=0" >> $BASH_ENV + echo "export GTEST_OUTPUT=\"xml:/tmp/test-results/\"" >> $BASH_ENV + echo "export 
SKIP_FORMAT_BUCK_CHECKS=1" >> $BASH_ENV + echo "export GTEST_COLOR=1" >> $BASH_ENV + echo "export CTEST_OUTPUT_ON_FAILURE=1" >> $BASH_ENV + echo "export CTEST_TEST_TIMEOUT=300" >> $BASH_ENV + echo "export ZLIB_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zlib" >> $BASH_ENV + echo "export BZIP2_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/bzip2" >> $BASH_ENV + echo "export SNAPPY_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/snappy" >> $BASH_ENV + echo "export LZ4_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/lz4" >> $BASH_ENV + echo "export ZSTD_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zstd" >> $BASH_ENV + + pre-steps-macos: + steps: + - pre-steps + + post-steps: + steps: + - slack/status: *notify-on-main-failure + - store_test_results: # store test result if there's any + path: /tmp/test-results + - store_artifacts: # store LOG for debugging if there's any + path: LOG + - run: # on fail, compress Test Logs for diagnosing the issue + name: Compress Test Logs + command: tar -cvzf t.tar.gz t + when: on_fail + - store_artifacts: # on fail, store Test Logs for diagnosing the issue + path: t.tar.gz + destination: test_logs + when: on_fail + + install-clang-10: + steps: + - run: + name: Install Clang 10 + command: | + echo "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main" | sudo tee -a /etc/apt/sources.list + echo "deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main" | sudo tee -a /etc/apt/sources.list + echo "APT::Acquire::Retries \"10\";" | sudo tee -a /etc/apt/apt.conf.d/80-retries # llvm.org unreliable + sudo apt-get update -y && sudo apt-get install -y clang-10 + + install-clang-13: + steps: + - run: + name: Install Clang 13 + command: | + echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-13 main" | sudo tee -a /etc/apt/sources.list + echo "deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-13 main" | sudo tee -a 
/etc/apt/sources.list + echo "APT::Acquire::Retries \"10\";" | sudo tee -a /etc/apt/apt.conf.d/80-retries # llvm.org unreliable + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - + sudo apt-get update -y && sudo apt-get install -y clang-13 + + install-gflags: + steps: + - run: + name: Install gflags + command: | + sudo apt-get update -y && sudo apt-get install -y libgflags-dev + + install-benchmark: + steps: + - run: # currently doesn't support ubuntu-1604 which doesn't have libbenchmark package, user can still install by building it youself + name: Install benchmark + command: | + sudo apt-get update -y && sudo apt-get install -y libbenchmark-dev + + install-librados: + steps: + - run: + name: Install librados + command: | + sudo apt-get update -y && sudo apt-get install -y librados-dev + + upgrade-cmake: + steps: + - run: + name: Upgrade cmake + command: | + sudo apt remove --purge cmake + sudo snap install cmake --classic + + install-gflags-on-macos: + steps: + - run: + name: Install gflags on macos + command: | + HOMEBREW_NO_AUTO_UPDATE=1 brew install gflags + + install-gtest-parallel: + steps: + - run: + name: Install gtest-parallel + command: | + git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel + echo 'export PATH=$HOME/gtest-parallel:$PATH' >> $BASH_ENV + + install-compression-libs: + steps: + - run: + name: Install compression libs + command: | + sudo apt-get update -y && sudo apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev + +executors: + windows-2xlarge: + machine: + image: 'windows-server-2019-vs2019:stable' + resource_class: windows.2xlarge + shell: bash.exe + +jobs: + build-macos: + macos: + xcode: 12.5.1 + resource_class: large + environment: + ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc cause env_test hang, disable it for now + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - pre-steps-macos + - run: ulimit -S -n 
1048576 && OPT=-DCIRCLECI make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-cmake: + macos: + xcode: 12.5.1 + resource_class: large + steps: + - increase-max-open-files-on-macos + - install-cmake-on-macos + - install-gflags-on-macos + - pre-steps-macos + - run: ulimit -S -n 1048576 && (mkdir build && cd build && cmake -DWITH_GFLAGS=1 .. && make V=1 -j32 && ctest -j10) 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-mem-env-librados: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-librados + - run: MEM_ENV=1 ROCKSDB_USE_LIBRADOS=1 make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-encrypted-env: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: ENCRYPTED_ENV=1 make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-shared_lib-alt_namespace-status_checked: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-release: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: if ./db_stress --version; then false; else true; fi # ensure without gflags + - install-gflags + - run: make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: ./db_stress 
--version # ensure with gflags + - post-steps + + build-linux-release-rtti: + machine: + image: ubuntu-1604:201903-01 + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: make clean + - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j8 static_lib tools db_bench 2>&1 | .circleci/cat_ignore_eagain + - run: if ./db_stress --version; then false; else true; fi # ensure without gflags + - run: sudo apt-get update -y && sudo apt-get install -y libgflags-dev + - run: make clean + - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j8 static_lib tools db_bench 2>&1 | .circleci/cat_ignore_eagain + - run: ./db_stress --version # ensure with gflags + + build-linux-lite: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: LITE=1 make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-lite-release: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: LITE=1 make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: if ./db_stress --version; then false; else true; fi # ensure without gflags + - install-gflags + - run: LITE=1 make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: ./db_stress --version # ensure with gflags + - post-steps + + build-linux-clang-no_test_run: + machine: + image: ubuntu-1604:202104-01 + resource_class: xlarge + steps: + - checkout # check out the code in the project directory + - run: sudo apt-get update -y && sudo apt-get install -y clang libgflags-dev libtbb-dev + - run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-clang10-asan: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 
ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out + - post-steps + + build-linux-clang10-mini-tsan: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: COMPILE_WITH_TSAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out. + - post-steps + + build-linux-clang10-ubsan: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: COMPILE_WITH_UBSAN=1 OPT="-fsanitize-blacklist=.circleci/ubsan_suppression_list.txt" CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out + - post-steps + + build-linux-clang10-clang-analyze: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: sudo apt-get update -y && sudo apt-get install -y clang-tools-10 + - run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out. For unknown, reason passing "clang++-10" as CLANG_ANALYZER doesn't work, and we need a full path. + - post-steps + + build-linux-cmake: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - upgrade-cmake + - run: (mkdir build && cd build && cmake -DWITH_GFLAGS=1 .. 
&& make V=1 -j20 && ctest -j20) 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-cmake-ubuntu-20: + machine: + image: ubuntu-2004:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-benchmark + - run: (mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20 && make microbench) 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-unity-and-headers: + docker: # executor type + - image: gcc:latest + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: apt-get update -y && apt-get install -y libgflags-dev + - run: TEST_TMPDIR=/dev/shm && make V=1 -j8 unity_test 2>&1 | .circleci/cat_ignore_eagain + - run: make V=1 -j8 -k check-headers 2>&1 | .circleci/cat_ignore_eagain # could be moved to a different build + - post-steps + + build-linux-gcc-4_8-no_test_run: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-4.8 g++-4.8 libgflags-dev + - run: CC=gcc-4.8 CXX=g++-4.8 V=1 SKIP_LINK=1 make -j8 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-gcc-8-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: large + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-8 g++-8 libgflags-dev + - run: CC=gcc-8 CXX=g++-8 V=1 SKIP_LINK=1 make -j8 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-gcc-9-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: large + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-9 g++-9 libgflags-dev + - run: CC=gcc-9 CXX=g++-9 V=1 SKIP_LINK=1 make -j8 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + 
- post-steps + + build-linux-gcc-10-cxx20-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-10 g++-10 libgflags-dev + - run: CC=gcc-10 CXX=g++-10 V=1 SKIP_LINK=1 ROCKSDB_CXX_STANDARD=c++20 make -j16 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-gcc-11-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - run: sudo apt-get update -y && sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get install gcc-11 g++-11 libgflags-dev + - run: CC=gcc-11 CXX=g++-11 V=1 SKIP_LINK=1 make -j16 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-clang-13-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - install-clang-13 + - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j16 all 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + # This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing. 
+ build-linux-microbench: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - install-benchmark + - run: DEBUG_LEVEL=0 make microbench 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-windows: + executor: windows-2xlarge + parameters: + extra_cmake_opt: + default: "" + type: string + vs_year: + default: "2019" + type: string + cmake_generator: + default: "Visual Studio 16 2019" + type: string + environment: + THIRDPARTY_HOME: C:/Users/circleci/thirdparty + CMAKE_HOME: C:/Users/circleci/thirdparty/cmake-3.16.4-win64-x64 + CMAKE_BIN: C:/Users/circleci/thirdparty/cmake-3.16.4-win64-x64/bin/cmake.exe + SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.7 + SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.7;C:/Users/circleci/thirdparty/snappy-1.1.7/build + SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.7/build/Debug/snappy.lib + VS_YEAR: <> + CMAKE_GENERATOR: <> + steps: + - checkout + - run: + name: "Setup VS" + command: | + if [[ "${VS_YEAR}" == "2019" ]]; then + echo "VS2019 already present." + elif [[ "${VS_YEAR}" == "2017" ]]; then + echo "Installing VS2017..." + powershell .circleci/vs2017_install.ps1 + elif [[ "${VS_YEAR}" == "2015" ]]; then + echo "Installing VS2015..." + powershell .circleci/vs2015_install.ps1 + fi + - store_artifacts: + path: \Users\circleci\AppData\Local\Temp\vslogs.zip + - run: + name: "Install thirdparty dependencies" + command: | + mkdir ${THIRDPARTY_HOME} + cd ${THIRDPARTY_HOME} + echo "Installing CMake..." + curl --fail --silent --show-error --output cmake-3.16.4-win64-x64.zip --location https://github.com/Kitware/CMake/releases/download/v3.16.4/cmake-3.16.4-win64-x64.zip + unzip -q cmake-3.16.4-win64-x64.zip + echo "Building Snappy dependency..." 
+ curl --fail --silent --show-error --output snappy-1.1.7.zip --location https://github.com/google/snappy/archive/1.1.7.zip + unzip -q snappy-1.1.7.zip + cd snappy-1.1.7 + mkdir build + cd build + ${CMAKE_BIN} -G "${CMAKE_GENERATOR}" .. + msbuild.exe Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + - run: + name: "Build RocksDB" + command: | + mkdir build + cd build + ${CMAKE_BIN} -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DJNI=1 << parameters.extra_cmake_opt >> .. + cd .. + echo "Building with VS version: ${CMAKE_GENERATOR}" + msbuild.exe build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + - run: + name: "Test RocksDB" + shell: powershell.exe + command: | + build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 + + build-linux-java: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + environment: + JAVA_HOME: /usr/lib/jvm/java-1.8.0-openjdk-amd64 + steps: + - pre-steps + - install-gflags + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava Shared Library" + command: make V=1 J=8 -j8 rocksdbjava 2>&1 | .circleci/cat_ignore_eagain + - run: + name: "Test RocksDBJava" + command: make V=1 J=8 -j8 jtest 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-java-static: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + environment: + JAVA_HOME: /usr/lib/jvm/java-1.8.0-openjdk-amd64 + steps: + - pre-steps + - install-gflags + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java 
-version + which javac && javac -version + - run: + name: "Build RocksDBJava Static Library" + command: make V=1 J=8 -j8 rocksdbjavastatic 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-java: + macos: + xcode: 12.5.1 + resource_class: medium + environment: + JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc causes java 8 crash + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - install-jdk8-on-macos + - pre-steps-macos + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava Shared Library" + command: make V=1 J=8 -j8 rocksdbjava 2>&1 | .circleci/cat_ignore_eagain + - run: + name: "Test RocksDBJava" + command: make V=1 J=8 -j8 jtest 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-java-static: + macos: + xcode: 12.5.1 + resource_class: medium + environment: + JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - install-cmake-on-macos + - install-jdk8-on-macos + - pre-steps-macos + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava x86 and ARM Static Libraries" + command: make V=1 J=8 -j8 rocksdbjavastaticosx 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-java-static-universal: + macos: + xcode: 12.5.1 + resource_class: medium + environment: + JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - install-cmake-on-macos + - install-jdk8-on-macos + - 
pre-steps-macos + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava Universal Binary Static Library" + command: make V=1 J=8 -j8 rocksdbjavastaticosx_ub 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-examples: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - pre-steps + - install-gflags + - run: + name: "Build examples" + command: | + OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 | ../.circleci/cat_ignore_eagain + - post-steps + + build-cmake-mingw: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: sudo apt-get update -y && sudo apt-get install -y mingw-w64 + - run: sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix + - run: + name: "Build cmake-mingw" + command: | + sudo apt-get install snapd && sudo snap install cmake --beta --classic + export PATH=/snap/bin:$PATH + sudo apt-get install -y openjdk-8-jdk + export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 + export PATH=$JAVA_HOME/bin:$PATH + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. 
-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni + - post-steps + + build-linux-non-shm: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + parameters: + start_test: + default: "" + type: string + end_test: + default: "" + type: string + steps: + - pre-steps + - install-gflags + - install-gtest-parallel + - run: + name: "Build unit tests" + command: | + echo "env: $(env)" + echo "** done env" + ROCKSDBTESTS_START=<> ROCKSDBTESTS_END=<> ROCKSDBTESTS_SUBSET_TESTS_TO_FILE=/tmp/test_list make V=1 -j32 --output-sync=target build_subset_tests + - run: + name: "Run unit tests in parallel" + command: | + sed -i 's/[[:space:]]*$//; s/ / \.\//g; s/.*/.\/&/' /tmp/test_list + cat /tmp/test_list + export TEST_TMPDIR=/tmp/rocksdb_test_tmp + gtest-parallel $(&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-arm: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.large + steps: + - pre-steps + - install-gflags + - run: ROCKSDBTESTS_PLATFORM_DEPENDENT=only make V=1 J=4 -j4 all_but_some_tests check_some 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-arm-cmake-no_test_run: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.large + environment: + JAVA_HOME: /usr/lib/jvm/java-8-openjdk-arm64 + steps: + - pre-steps + - install-gflags + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build with cmake" + command: | + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=1 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 .. + make -j4 + - run: + name: "Build Java with cmake" + command: | + rm -rf build + mkdir build + cd build + cmake -DJNI=1 -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=1 .. 
+ make -j4 rocksdb rocksdbjni + - post-steps + + build-format-compatible: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-compression-libs + - run: + name: "test" + command: | + export TEST_TMPDIR=/dev/shm/rocksdb + rm -rf /dev/shm/rocksdb + mkdir /dev/shm/rocksdb + tools/check_format_compatible.sh + - post-steps + +workflows: + version: 2 + build-linux: + jobs: + - build-linux + build-linux-cmake: + jobs: + - build-linux-cmake + - build-linux-cmake-ubuntu-20 + build-linux-mem-env-librados: + jobs: + - build-linux-mem-env-librados + build-linux-encrypted-env: + jobs: + - build-linux-encrypted-env + build-linux-shared_lib-alt_namespace-status_checked: + jobs: + - build-linux-shared_lib-alt_namespace-status_checked + build-linux-lite: + jobs: + - build-linux-lite + build-linux-release: + jobs: + - build-linux-release + build-linux-release-rtti: + jobs: + - build-linux-release-rtti + build-linux-lite-release: + jobs: + - build-linux-lite-release + build-linux-clang10-asan: + jobs: + - build-linux-clang10-asan + build-linux-clang10-mini-tsan: + jobs: + - build-linux-clang10-mini-tsan + build-linux-clang10-ubsan: + jobs: + - build-linux-clang10-ubsan + build-linux-clang10-clang-analyze: + jobs: + - build-linux-clang10-clang-analyze + build-linux-unity-and-headers: + jobs: + - build-linux-unity-and-headers + build-windows-vs2019: + jobs: + - build-windows: + name: "build-windows-vs2019" + build-windows-vs2019-cxx20: + jobs: + - build-windows: + name: "build-windows-vs2019-cxx20" + extra_cmake_opt: -DCMAKE_CXX_STANDARD=20 + build-windows-vs2017: + jobs: + - build-windows: + name: "build-windows-vs2017" + vs_year: "2017" + cmake_generator: "Visual Studio 15 Win64" + build-java: + jobs: + - build-linux-java + - build-linux-java-static + - build-macos-java + - build-macos-java-static + - build-macos-java-static-universal + build-examples: + jobs: + - build-examples + build-linux-non-shm: + jobs: + - 
build-linux-non-shm: + start_test: "" + end_test: "db_options_test" # make sure unique in src.mk + - build-linux-non-shm: + start_test: "db_options_test" # make sure unique in src.mk + end_test: "filename_test" # make sure unique in src.mk + - build-linux-non-shm: + start_test: "filename_test" # make sure unique in src.mk + end_test: "statistics_test" # make sure unique in src.mk + - build-linux-non-shm: + start_test: "statistics_test" # make sure unique in src.mk + end_test: "" + build-linux-compilers-no_test_run: + jobs: + - build-linux-clang-no_test_run + - build-linux-clang-13-no_test_run + - build-linux-gcc-4_8-no_test_run + - build-linux-gcc-8-no_test_run + - build-linux-gcc-9-no_test_run + - build-linux-gcc-10-cxx20-no_test_run + - build-linux-gcc-11-no_test_run + - build-linux-arm-cmake-no_test_run + build-macos: + jobs: + - build-macos + build-macos-cmake: + jobs: + - build-macos-cmake + build-cmake-mingw: + jobs: + - build-cmake-mingw + build-linux-arm: + jobs: + - build-linux-arm + build-microbench: + jobs: + - build-linux-microbench + nightly: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - main + jobs: + - build-format-compatible + - build-linux-arm-test-full diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,6 @@ +# Supress UBSAN warnings related to stl_tree.h, e.g. 
+# UndefinedBehaviorSanitizer: undefined-behavior /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43 in +# /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43: +# runtime error: upcast of address 0x000001fa8820 with insufficient space for an object of type +# 'std::_Rb_tree_node, rocksdb::(anonymous namespace)::LockHoldingInfo> >' +src:*bits/stl_tree.h diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,24 @@ +$VS_DOWNLOAD_LINK = "https://go.microsoft.com/fwlink/?LinkId=691126" +$COLLECT_DOWNLOAD_LINK = "https://aka.ms/vscollect.exe" +curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe +if ($LASTEXITCODE -ne 0) { + echo "Download of the VS 2015 installer failed" + exit 1 +} +$VS_INSTALL_ARGS = @("/Quiet", "/NoRestart") +$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru +Remove-Item -Path vs_installer.exe -Force +$exitCode = $process.ExitCode +if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { + echo "VS 2015 installer exited with code $exitCode, which should be one of [0, 3010]." + curl.exe --retry 3 -kL $COLLECT_DOWNLOAD_LINK --output Collect.exe + if ($LASTEXITCODE -ne 0) { + echo "Download of the VS Collect tool failed." + exit 1 + } + Start-Process "${PWD}\Collect.exe" -NoNewWindow -Wait -PassThru + New-Item -Path "C:\w\build-results" -ItemType "directory" -Force + Copy-Item -Path "C:\Users\circleci\AppData\Local\Temp\vslogs.zip" -Destination "C:\w\build-results\" + exit 1 +} +echo "VS 2015 installed." 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,35 @@ +$VS_DOWNLOAD_LINK = "https://aka.ms/vs/15/release/vs_buildtools.exe" +$COLLECT_DOWNLOAD_LINK = "https://aka.ms/vscollect.exe" +$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", + "--add Microsoft.VisualStudio.Component.VC.Tools.14.13", + "--add Microsoft.Component.MSBuild", + "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", + "--add Microsoft.VisualStudio.Component.TextTemplating", + "--add Microsoft.VisualStudio.Component.VC.CoreIde", + "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", + "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", + "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81") + +curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe +if ($LASTEXITCODE -ne 0) { + echo "Download of the VS 2017 installer failed" + exit 1 +} + +$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru +Remove-Item -Path vs_installer.exe -Force +$exitCode = $process.ExitCode +if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { + echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." + curl.exe --retry 3 -kL $COLLECT_DOWNLOAD_LINK --output Collect.exe + if ($LASTEXITCODE -ne 0) { + echo "Download of the VS Collect tool failed." 
+ exit 1 + } + Start-Process "${PWD}\Collect.exe" -NoNewWindow -Wait -PassThru + New-Item -Path "C:\w\build-results" -ItemType "directory" -Force + Copy-Item -Path "C:\Users\circleci\AppData\Local\Temp\vslogs.zip" -Destination "C:\w\build-results\" + exit 1 +} +echo "VS 2017 installed." diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml mariadb-10.11.13/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +name: Check buck targets and code format +on: [push, pull_request] +jobs: + check: + name: Check TARGETS file and code format + runs-on: ubuntu-latest + steps: + - name: Checkout feature branch + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Fetch from upstream + run: | + git remote add upstream https://github.com/facebook/rocksdb.git && git fetch upstream + + - name: Where am I + run: | + echo git status && git status + echo "git remote -v" && git remote -v + echo git branch && git branch + + - name: Setup Python + uses: actions/setup-python@v1 + + - name: Install Dependencies + run: python -m pip install --upgrade pip + + - name: Install argparse + run: pip install argparse + + - name: Download clang-format-diff.py + uses: wei/wget@v1 + with: + args: https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py + + - name: Check format + run: VERBOSE_CHECK=1 make check-format + + - name: Compare buckify output + run: make check-buck-targets + + - name: Simple source code checks + run: make check-sources diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.gitignore mariadb-10.11.13/storage/rocksdb/rocksdb/.gitignore --- mariadb-10.11.11/storage/rocksdb/rocksdb/.gitignore 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/.gitignore 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,5 @@ make_config.mk +rocksdb.pc *.a *.arc @@ -7,6 +8,7 @@ *.gcda *.gcno *.o +*.o.tmp *.so *.so.* *_test @@ -34,6 +36,7 @@ sst_dump blob_dump block_cache_trace_analyzer +db_with_timestamp_basic_test tools/block_cache_analyzer/*.pyc column_aware_encoding_exp util/build_version.cc @@ -51,6 +54,7 @@ trace_analyzer trace_analyzer_test block_cache_trace_analyzer +io_tracer_parser .DS_Store .vs .vscode @@ -82,3 +86,12 @@ fbcode buckifier/*.pyc buckifier/__pycache__ + +compile_commands.json +clang-format-diff.py +.py3/ + +fuzz/proto/gen/ +fuzz/crash-* + +cmake-build-* diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.travis.yml mariadb-10.11.13/storage/rocksdb/rocksdb/.travis.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/.travis.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.travis.yml 2025-05-19 16:14:27.000000000 +0000 @@ -2,18 +2,19 @@ language: cpp os: - linux - - osx +arch: + - arm64 + - ppc64le + - s390x compiler: - clang - gcc -osx_image: xcode9.4 -jdk: - - openjdk7 cache: - ccache addons: apt: + update: true sources: - ubuntu-toolchain-r-test packages: @@ -24,15 +25,6 @@ - liblzma-dev # xv - libzstd-dev - zlib1g-dev - homebrew: - update: true - packages: - - ccache - - gflags - - lz4 - - snappy - - xz - - zstd env: - TEST_GROUP=platform_dependent # 16-18 minutes @@ -48,43 +40,209 @@ - JOB_NAME=examples # 5-7 minutes - JOB_NAME=cmake # 3-5 minutes - JOB_NAME=cmake-gcc8 # 3-5 minutes + - JOB_NAME=cmake-gcc9 # 3-5 minutes + - JOB_NAME=cmake-gcc9-c++20 # 3-5 minutes - JOB_NAME=cmake-mingw # 3 minutes + - JOB_NAME=make-gcc4.8 + - JOB_NAME=status_checked matrix: exclude: - - os: osx + - os : linux + arch: arm64 + env: JOB_NAME=cmake-mingw + - os : linux + arch: arm64 + env: JOB_NAME=make-gcc4.8 + - os: linux + arch: ppc64le + env: JOB_NAME=cmake-mingw + - os: linux + arch: ppc64le + env: JOB_NAME=make-gcc4.8 + - os: linux + arch: 
s390x + env: JOB_NAME=cmake-mingw + - os: linux + arch: s390x + env: JOB_NAME=make-gcc4.8 + - os: linux + compiler: clang + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: arm64 + env: TEST_GROUP=platform_dependent + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: TEST_GROUP=1 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: TEST_GROUP=1 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x env: TEST_GROUP=1 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 env: TEST_GROUP=2 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: TEST_GROUP=2 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: TEST_GROUP=2 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: TEST_GROUP=3 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le env: TEST_GROUP=3 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: TEST_GROUP=3 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: TEST_GROUP=4 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le env: TEST_GROUP=4 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: TEST_GROUP=4 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=cmake + - if: type = pull_request AND commit_message !~ /FULL_CI/ AND commit_message !~ /java/ + os : linux + arch: arm64 + env: JOB_NAME=java_test + - if: type = pull_request AND commit_message !~ /FULL_CI/ AND commit_message !~ /java/ + os: linux + arch: ppc64le + env: JOB_NAME=java_test + - if: type = pull_request AND 
commit_message !~ /FULL_CI/ AND commit_message !~ /java/ + os: linux + arch: s390x + env: JOB_NAME=java_test + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=lite_build + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=lite_build + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=lite_build + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=examples + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=examples + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=examples + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 env: JOB_NAME=cmake-gcc8 - - os : osx - env: JOB_NAME=cmake-mingw - - os : linux - compiler: clang - - os : osx - compiler: gcc + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=cmake-gcc8 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=cmake-gcc8 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=cmake-gcc9 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=cmake-gcc9 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=cmake-gcc9 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=cmake-gcc9-c++20 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=cmake-gcc9-c++20 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=cmake-gcc9-c++20 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : 
linux + arch: arm64 + env: JOB_NAME=status_checked + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=status_checked + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=status_checked install: - - if [ "${TRAVIS_OS_NAME}" == osx ]; then - PATH=$PATH:/usr/local/opt/ccache/libexec; - fi - if [ "${JOB_NAME}" == cmake-gcc8 ]; then - sudo apt-get install -y g++-8; + sudo apt-get install -y g++-8 || exit $?; CC=gcc-8 && CXX=g++-8; fi + - if [ "${JOB_NAME}" == cmake-gcc9 ] || [ "${JOB_NAME}" == cmake-gcc9-c++20 ]; then + sudo apt-get install -y g++-9 || exit $?; + CC=gcc-9 && CXX=g++-9; + fi - if [ "${JOB_NAME}" == cmake-mingw ]; then - sudo apt-get install -y mingw-w64 ; + sudo apt-get install -y mingw-w64 || exit $?; + fi + - if [ "${JOB_NAME}" == make-gcc4.8 ]; then + sudo apt-get install -y g++-4.8 || exit $?; + CC=gcc-4.8 && CXX=g++-4.8; fi - - if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then - mkdir cmake-dist && curl --silent --fail --show-error --location https://github.com/Kitware/CMake/releases/download/v3.14.5/cmake-3.14.5-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + - | + if [[ "${JOB_NAME}" == cmake* ]]; then + sudo apt-get remove -y cmake cmake-data + export CMAKE_DEB="cmake-3.14.5-Linux-$(uname -m).deb" + export CMAKE_DEB_URL="https://rocksdb-deps.s3-us-west-2.amazonaws.com/cmake/${CMAKE_DEB}" + curl --silent --fail --show-error --location --output "${CMAKE_DEB}" "${CMAKE_DEB_URL}" || exit $? + sudo dpkg -i "${CMAKE_DEB}" || exit $? + which cmake && cmake --version fi - - if [[ "${JOB_NAME}" == java_test ]]; then - java -version && echo "JAVA_HOME=${JAVA_HOME}"; + - | + if [[ "${JOB_NAME}" == java_test || "${JOB_NAME}" == cmake* ]]; then + # Ensure JDK 8 + sudo apt-get install -y openjdk-8-jdk || exit $? 
+ export PATH=/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture)/bin:$PATH + export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture) + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version fi before_script: @@ -93,41 +251,53 @@ - ulimit -n 8192 script: - - ${CXX} --version + - date; ${CXX} --version - if [ `command -v ccache` ]; then ccache -C; fi - case $TEST_GROUP in platform_dependent) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=only make -j4 all_but_some_tests check_some ;; 1) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=db_iter_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_END=backupable_db_test make -j4 check_some ;; 2) - OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" V=1 make -j4 tools && OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" V=1 ROCKSDBTESTS_START=db_iter_test ROCKSDBTESTS_END=options_file_test make -j4 check_some + OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 make -j4 tools && OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=backupable_db_test ROCKSDBTESTS_END=db_universal_compaction_test make -j4 check_some ;; 3) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=options_file_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=db_universal_compaction_test ROCKSDBTESTS_END=table_properties_collector_test make -j4 check_some ;; 4) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_prepared_transaction_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude 
ROCKSDBTESTS_START=table_properties_collector_test make -j4 check_some ;; esac - case $JOB_NAME in java_test) - OPT=-DTRAVIS V=1 make rocksdbjava jtest + OPT=-DTRAVIS LIB_MODE=shared V=1 make rocksdbjava jtest ;; lite_build) - OPT='-DTRAVIS -DROCKSDB_LITE' V=1 make -j4 static_lib tools + OPT='-DTRAVIS -DROCKSDB_LITE' LIB_MODE=shared V=1 make -j4 all ;; examples) - OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 + OPT=-DTRAVIS LIB_MODE=shared V=1 make -j4 static_lib && cd examples && make -j4 ;; cmake-mingw) sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix; mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni ;; cmake*) - mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_BUILD_TYPE=Release && make -j4 rocksdb rocksdbjni + case $JOB_NAME in + *-c++20) + OPT=-DCMAKE_CXX_STANDARD=20 + ;; + esac + + mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=0 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 .. && make -j4 && cd .. && rm -rf build && mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_BUILD_TYPE=Release $OPT && make -j4 rocksdb rocksdbjni + ;; + make-gcc4.8) + OPT=-DTRAVIS LIB_MODE=shared V=1 SKIP_LINK=1 make -j4 all && [ "Linking broken because libgflags compiled with newer ABI" ] + ;; + status_checked) + OPT=-DTRAVIS LIB_MODE=shared V=1 ASSERT_STATUS_CHECKED=1 make -j4 check_some ;; esac notifications: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -32,10 +32,11 @@ # 3. cmake .. # 4. 
make -j -cmake_minimum_required(VERSION 3.5.1) +cmake_minimum_required(VERSION 3.10) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(ReadVersion) +include(GoogleTest) get_rocksdb_version(rocksdb_VERSION) project(rocksdb VERSION ${rocksdb_VERSION} @@ -62,6 +63,7 @@ endif(CCACHE_FOUND) option(WITH_JEMALLOC "build with JeMalloc" OFF) +option(WITH_LIBURING "build with liburing" ON) option(WITH_SNAPPY "build with SNAPPY" OFF) option(WITH_LZ4 "build with lz4" OFF) option(WITH_ZLIB "build with zlib" OFF) @@ -70,6 +72,12 @@ if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() + +if ($ENV{CIRCLECI}) + message(STATUS "Build for CircieCI env, a few tests may be disabled") + add_definitions(-DCIRCLECI) +endif() + # third-party/folly is only validated to work on Linux and Windows for now. # So only turn it on there by default. if(CMAKE_SYSTEM_NAME MATCHES "Linux|Windows") @@ -83,15 +91,18 @@ option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) endif() +if( NOT DEFINED CMAKE_CXX_STANDARD ) + set(CMAKE_CXX_STANDARD 11) +endif() + include(CMakeDependentOption) -CMAKE_DEPENDENT_OPTION(WITH_GFLAGS "build with GFlags" ON - "NOT MSVC;NOT MINGW" OFF) if(MSVC) + option(WITH_GFLAGS "build with GFlags" OFF) option(WITH_XPRESS "build with windows built in compression" OFF) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) else() - if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") # FreeBSD has jemalloc as default malloc # but it does not have all the jemalloc files in include/... 
set(WITH_JEMALLOC ON) @@ -103,18 +114,40 @@ endif() endif() - # No config file for this + if(MINGW) + option(WITH_GFLAGS "build with GFlags" OFF) + else() + option(WITH_GFLAGS "build with GFlags" ON) + endif() + set(GFLAGS_LIB) if(WITH_GFLAGS) - find_package(gflags REQUIRED) + # Config with namespace available since gflags 2.2.2 + option(GFLAGS_USE_TARGET_NAMESPACE "Use gflags import target with namespace." ON) + find_package(gflags CONFIG) + if(gflags_FOUND) + if(TARGET ${GFLAGS_TARGET}) + # Config with GFLAGS_TARGET available since gflags 2.2.0 + set(GFLAGS_LIB ${GFLAGS_TARGET}) + else() + # Config with GFLAGS_LIBRARIES available since gflags 2.1.0 + set(GFLAGS_LIB ${gflags_LIBRARIES}) + endif() + else() + find_package(gflags REQUIRED) + set(GFLAGS_LIB gflags::gflags) + endif() + include_directories(${GFLAGS_INCLUDE_DIR}) + list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB}) add_definitions(-DGFLAGS=1) - include_directories(${gflags_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS gflags::gflags) endif() if(WITH_SNAPPY) - find_package(snappy REQUIRED) + find_package(Snappy CONFIG) + if(NOT Snappy_FOUND) + find_package(Snappy REQUIRED) + endif() add_definitions(-DSNAPPY) - list(APPEND THIRDPARTY_LIBS snappy::snappy) + list(APPEND THIRDPARTY_LIBS Snappy::snappy) endif() if(WITH_ZLIB) @@ -149,23 +182,25 @@ endif() endif() -string(TIMESTAMP TS "%Y/%m/%d %H:%M:%S" UTC) -set(GIT_DATE_TIME "${TS}" CACHE STRING "the time we first built rocksdb") +string(TIMESTAMP TS "%Y-%m-%d %H:%M:%S" UTC) +set(BUILD_DATE "${TS}" CACHE STRING "the time we first built rocksdb") find_package(Git) if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") - if(WIN32) - execute_process(COMMAND $ENV{COMSPEC} /C ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) - else() - execute_process(COMMAND ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE 
GIT_SHA COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD ) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" RESULT_VARIABLE GIT_MOD COMMAND "${GIT_EXECUTABLE}" diff-index HEAD --quiet) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DATE COMMAND "${GIT_EXECUTABLE}" log -1 --date=format:"%Y-%m-%d %T" --format="%ad") + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG RESULT_VARIABLE rv COMMAND "${GIT_EXECUTABLE}" symbolic-ref -q --short HEAD OUTPUT_STRIP_TRAILING_WHITESPACE) + if (rv AND NOT rv EQUAL 0) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match OUTPUT_STRIP_TRAILING_WHITESPACE) endif() else() set(GIT_SHA 0) + set(GIT_MOD 1) endif() - -string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") - +string(REGEX REPLACE "[^0-9a-fA-F]+" "" GIT_SHA "${GIT_SHA}") +string(REGEX REPLACE "[^0-9: /-]+" "" GIT_DATE "${GIT_DATE}") option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) @@ -178,20 +213,20 @@ set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) -add_library(build_version OBJECT ${BUILD_VERSION_CC}) -target_include_directories(build_version PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/util) + if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") + if(CMAKE_SYSTEM_PROCESSOR 
MATCHES "x86_64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wstrict-prototypes") + endif() if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format -fno-asynchronous-unwind-tables") add_definitions(-D_POSIX_C_SOURCE=1) endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") include(CheckCXXCompilerFlag) @@ -203,49 +238,91 @@ endif() include(CheckCCompilerFlag) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") + CHECK_C_COMPILER_FLAG("-mcpu=power9" HAS_POWER9) + if(HAS_POWER9) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power9 -mtune=power9") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power9 -mtune=power9") + else() + CHECK_C_COMPILER_FLAG("-mcpu=power8" HAS_POWER8) + if(HAS_POWER8) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power8 -mtune=power8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power8 -mtune=power8") + endif(HAS_POWER8) + endif(HAS_POWER9) CHECK_C_COMPILER_FLAG("-maltivec" HAS_ALTIVEC) if(HAS_ALTIVEC) message(STATUS " HAS_ALTIVEC yes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maltivec") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power8") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power8") endif(HAS_ALTIVEC) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64") CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC) if(HAS_ARMV8_CRC) message(STATUS " HAS_ARMV8_CRC yes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") endif(HAS_ARMV8_CRC) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +endif(CMAKE_SYSTEM_PROCESSOR MATCHES 
"arm64|aarch64|AARCH64") + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") + CHECK_C_COMPILER_FLAG("-march=native" HAS_S390X_MARCH_NATIVE) + if(HAS_S390X_MARCH_NATIVE) + message(STATUS " HAS_S390X_MARCH_NATIVE yes") + endif(HAS_S390X_MARCH_NATIVE) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) +option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF) +option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF) if(PORTABLE) # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h # is available, it is available by default. if(FORCE_SSE42 AND NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul") endif() + if(MSVC) + if(FORCE_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") + endif() + # MSVC automatically enables BMI / lzcnt with AVX2. + if(FORCE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + endif() + else() + if(FORCE_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + endif() + if(FORCE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt") + endif() + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") + endif() + endif() else() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") else() - if(NOT HAVE_POWER8 AND NOT HAS_ARMV8_CRC) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") + elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() endif() endif() include(CheckCXXSourceCompiles) +set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() -CHECK_CXX_SOURCE_COMPILES(" + +if (NOT PORTABLE OR FORCE_SSE42) + CHECK_CXX_SOURCE_COMPILES(" #include 
#include #include @@ -257,26 +334,66 @@ auto d = _mm_cvtsi128_si64(c); } " HAVE_SSE42) -unset(CMAKE_REQUIRED_FLAGS) -if(HAVE_SSE42) - add_definitions(-DHAVE_SSE42) - add_definitions(-DHAVE_PCLMUL) -elseif(FORCE_SSE42) - message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") + if(HAVE_SSE42) + add_definitions(-DHAVE_SSE42) + add_definitions(-DHAVE_PCLMUL) + elseif(FORCE_SSE42) + message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") + endif() +endif() + +# Check if -latomic is required or not +if (NOT MSVC) + set(CMAKE_REQUIRED_FLAGS "--std=c++11") + CHECK_CXX_SOURCE_COMPILES(" +#include +std::atomic x(0); +int main() { + uint64_t i = x.load(std::memory_order_relaxed); + bool b = x.is_lock_free(); + return 0; +} +" BUILTIN_ATOMIC) + if (NOT BUILTIN_ATOMIC) + #TODO: Check if -latomic exists + list(APPEND THIRDPARTY_LIBS atomic) + endif() endif() +if (WITH_LIBURING) + find_package(uring) + if (uring_FOUND) + add_definitions(-DROCKSDB_IOURING_PRESENT) + list(APPEND THIRDPARTY_LIBS uring::uring) + endif() +endif() + +# Reset the required flags +set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + CHECK_CXX_SOURCE_COMPILES(" #if defined(_MSC_VER) && !defined(__thread) #define __thread __declspec(thread) #endif int main() { static __thread int tls; + (void)tls; } " HAVE_THREAD_LOCAL) if(HAVE_THREAD_LOCAL) add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL) endif() +option(WITH_IOSTATS_CONTEXT "Enable IO stats context" ON) +if (NOT WITH_IOSTATS_CONTEXT) + add_definitions(-DNIOSTATS_CONTEXT) +endif() + +option(WITH_PERF_CONTEXT "Enable perf context" ON) +if (NOT WITH_PERF_CONTEXT) + add_definitions(-DNPERF_CONTEXT) +endif() + option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON) if(FAIL_ON_WARNINGS) if(MSVC) @@ -343,6 +460,12 @@ add_definitions(-DROCKSDB_NO_DYNAMIC_EXTENSION) endif() +option(ASSERT_STATUS_CHECKED "build with assert status checked" OFF) +if (ASSERT_STATUS_CHECKED) + message(STATUS "Build with assert 
status checked") + add_definitions(-DROCKSDB_ASSERT_STATUS_CHECKED) +endif() + if(DEFINED USE_RTTI) if(USE_RTTI) message(STATUS "Enabling RTTI") @@ -377,7 +500,15 @@ message(STATUS "Debug optimization is enabled") set(CMAKE_CXX_FLAGS_DEBUG "/Oxt") else() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1") + + # Minimal Build is deprecated after MSVC 2015 + if( MSVC_VERSION GREATER 1900 ) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-") + else() + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm") + endif() + endif() if(WITH_RUNTIME_DEBUG) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d") @@ -404,15 +535,12 @@ add_definitions(-fno-builtin-memcmp -DCYGWIN) elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") add_definitions(-DOS_MACOSX) - if(CMAKE_SYSTEM_PROCESSOR MATCHES arm) - add_definitions(-DIOS_CROSS_COMPILE -DROCKSDB_LITE) - # no debug info for IOS, that will make our library big - add_definitions(-DNDEBUG) - endif() elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") add_definitions(-DOS_LINUX) elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") add_definitions(-DOS_SOLARIS) +elseif(CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") + add_definitions(-DOS_GNU_KFREEBSD) elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") add_definitions(-DOS_FREEBSD) elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") @@ -471,7 +599,11 @@ endif() include(CheckCXXSymbolExists) -check_cxx_symbol_exists(malloc_usable_size malloc.h HAVE_MALLOC_USABLE_SIZE) +if(CMAKE_SYSTEM_NAME MATCHES "^FreeBSD") + check_cxx_symbol_exists(malloc_usable_size malloc_np.h HAVE_MALLOC_USABLE_SIZE) +else() + check_cxx_symbol_exists(malloc_usable_size malloc.h HAVE_MALLOC_USABLE_SIZE) +endif() if(HAVE_MALLOC_USABLE_SIZE) add_definitions(-DROCKSDB_MALLOC_USABLE_SIZE) endif() @@ -481,9 +613,18 @@ add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT) endif() +check_cxx_symbol_exists(getauxval auvx.h HAVE_AUXV_GETAUXVAL) +if(HAVE_AUXV_GETAUXVAL) 
+ add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT) +endif() + +check_cxx_symbol_exists(F_FULLFSYNC "fcntl.h" HAVE_FULLFSYNC) +if(HAVE_FULLFSYNC) + add_definitions(-DHAVE_FULLFSYNC) +endif() + include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) -include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.8.1/fused-src) if(WITH_FOLLY_DISTRIBUTED_MUTEX) include_directories(${PROJECT_SOURCE_DIR}/third-party/folly) endif() @@ -492,14 +633,29 @@ # Main library source code set(SOURCES + cache/cache.cc + cache/cache_entry_roles.cc + cache/cache_key.cc + cache/cache_reservation_manager.cc cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc db/arena_wrapped_db_iter.cc + db/blob/blob_fetcher.cc + db/blob/blob_file_addition.cc + db/blob/blob_file_builder.cc + db/blob/blob_file_cache.cc + db/blob/blob_file_garbage.cc + db/blob/blob_file_meta.cc + db/blob/blob_file_reader.cc + db/blob/blob_garbage_meter.cc + db/blob/blob_log_format.cc + db/blob/blob_log_sequential_reader.cc + db/blob/blob_log_writer.cc + db/blob/prefetch_buffer_collection.cc db/builder.cc db/c.cc db/column_family.cc - db/compacted_db_impl.cc db/compaction/compaction.cc db/compaction/compaction_iterator.cc db/compaction/compaction_picker.cc @@ -507,8 +663,10 @@ db/compaction/compaction_picker_fifo.cc db/compaction/compaction_picker_level.cc db/compaction/compaction_picker_universal.cc + db/compaction/sst_partitioner.cc db/convenience.cc db/db_filesnapshot.cc + db/db_impl/compacted_db_impl.cc db/db_impl/db_impl.cc db/db_impl/db_impl_write.cc db/db_impl/db_impl_compaction_flush.cc @@ -539,6 +697,8 @@ db/memtable_list.cc db/merge_helper.cc db/merge_operator.cc + db/output_validator.cc + db/periodic_work_scheduler.cc db/range_del_aggregator.cc db/range_tombstone_fragmenter.cc db/repair.cc @@ -549,22 +709,29 @@ db/trim_history_scheduler.cc db/version_builder.cc db/version_edit.cc + db/version_edit_handler.cc db/version_set.cc + db/wal_edit.cc 
db/wal_manager.cc db/write_batch.cc db/write_batch_base.cc db/write_controller.cc db/write_thread.cc + env/composite_env.cc env/env.cc env/env_chroot.cc env/env_encryption.cc env/env_hdfs.cc env/file_system.cc + env/file_system_tracer.cc + env/fs_remap.cc env/mock_env.cc + env/unique_id_gen.cc file/delete_scheduler.cc file/file_prefetch_buffer.cc file/file_util.cc file/filename.cc + file/line_file_reader.cc file/random_access_file_reader.cc file/read_write_util.cc file/readahead_raf.cc @@ -577,6 +744,8 @@ memory/arena.cc memory/concurrent_arena.cc memory/jemalloc_nodump_allocator.cc + memory/memkind_kmem_allocator.cc + memory/memory_allocator.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -597,19 +766,23 @@ monitoring/thread_status_util.cc monitoring/thread_status_util_debug.cc options/cf_options.cc + options/configurable.cc + options/customizable.cc options/db_options.cc options/options.cc options/options_helper.cc options/options_parser.cc - options/options_sanity_check.cc port/stack_trace.cc table/adaptive/adaptive_table_factory.cc + table/block_based/binary_search_index_reader.cc table/block_based/block.cc table/block_based/block_based_filter_block.cc table/block_based/block_based_table_builder.cc table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_iterator.cc table/block_based/block_based_table_reader.cc table/block_based/block_builder.cc + table/block_based/block_prefetcher.cc table/block_based/block_prefix_index.cc table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc @@ -617,9 +790,14 @@ table/block_based/filter_policy.cc table/block_based/flush_block_policy.cc table/block_based/full_filter_block.cc + table/block_based/hash_index_reader.cc table/block_based/index_builder.cc + table/block_based/index_reader_common.cc table/block_based/parsed_full_filter_block.cc table/block_based/partitioned_filter_block.cc + 
table/block_based/partitioned_index_iterator.cc + table/block_based/partitioned_index_reader.cc + table/block_based/reader_common.cc table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc table/cuckoo/cuckoo_table_builder.cc @@ -637,22 +815,30 @@ table/plain/plain_table_index.cc table/plain/plain_table_key_coding.cc table/plain/plain_table_reader.cc + table/sst_file_dumper.cc table/sst_file_reader.cc table/sst_file_writer.cc + table/table_factory.cc table/table_properties.cc table/two_level_iterator.cc + table/unique_id.cc test_util/sync_point.cc test_util/sync_point_impl.cc test_util/testutil.cc test_util/transaction_test_util.cc tools/block_cache_analyzer/block_cache_trace_analyzer.cc tools/dump/db_dump_tool.cc + tools/io_tracer_parser_tool.cc tools/ldb_cmd.cc tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - trace_replay/trace_replay.cc trace_replay/block_cache_tracer.cc + trace_replay/io_tracer.cc + trace_replay/trace_record_handler.cc + trace_replay/trace_record_result.cc + trace_replay/trace_record.cc + trace_replay/trace_replay.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc @@ -664,6 +850,8 @@ util/murmurhash.cc util/random.cc util/rate_limiter.cc + util/ribbon_config.cc + util/regex.cc util/slice.cc util/file_checksum_helper.cc util/status.cc @@ -678,19 +866,23 @@ utilities/blob_db/blob_db_impl_filesnapshot.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc - utilities/blob_db/blob_log_reader.cc - utilities/blob_db/blob_log_writer.cc - utilities/blob_db/blob_log_format.cc + utilities/cache_dump_load.cc + utilities/cache_dump_load_impl.cc utilities/cassandra/cassandra_compaction_filter.cc utilities/cassandra/format.cc utilities/cassandra/merge_operator.cc utilities/checkpoint/checkpoint_impl.cc + utilities/compaction_filters.cc utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc utilities/debug.cc utilities/env_mirror.cc utilities/env_timed.cc + 
utilities/fault_injection_env.cc + utilities/fault_injection_fs.cc + utilities/fault_injection_secondary_cache.cc utilities/leveldb_options/leveldb_options.cc utilities/memory/memory_util.cc + utilities/merge_operators.cc utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc @@ -710,6 +902,12 @@ utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc + utilities/trace/replayer_impl.cc + utilities/transactions/lock/lock_manager.cc + utilities/transactions/lock/point/point_lock_tracker.cc + utilities/transactions/lock/point/point_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction.cc utilities/transactions/pessimistic_transaction.cc @@ -717,16 +915,54 @@ utilities/transactions/snapshot_checker.cc utilities/transactions/transaction_base.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc utilities/transactions/write_prepared_txn_db.cc utilities/transactions/write_unprepared_txn.cc utilities/transactions/write_unprepared_txn_db.cc utilities/ttl/db_ttl_impl.cc + utilities/wal_filter.cc utilities/write_batch_with_index/write_batch_with_index.cc - utilities/write_batch_with_index/write_batch_with_index_internal.cc - $) + utilities/write_batch_with_index/write_batch_with_index_internal.cc) + +list(APPEND SOURCES + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc + 
utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc + utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc + utilities/transactions/lock/range/range_tree/lib/util/dbt.cc + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc) + +message(STATUS "ROCKSDB_PLUGINS: ${ROCKSDB_PLUGINS}") +if ( ROCKSDB_PLUGINS ) + string(REPLACE " " ";" PLUGINS ${ROCKSDB_PLUGINS}) + foreach (plugin ${PLUGINS}) + add_subdirectory("plugin/${plugin}") + foreach (src ${${plugin}_SOURCES}) + list(APPEND SOURCES plugin/${plugin}/${src}) + set_source_files_properties( + plugin/${plugin}/${src} + PROPERTIES COMPILE_FLAGS "${${plugin}_COMPILE_FLAGS}") + endforeach() + foreach (path ${${plugin}_INCLUDE_PATHS}) + include_directories(${path}) + endforeach() + foreach (lib ${${plugin}_LIBS}) + list(APPEND THIRDPARTY_LIBS ${lib}) + endforeach() + foreach (link_path ${${plugin}_LINK_PATHS}) + link_directories(AFTER ${link_path}) + endforeach() + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${${plugin}_CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${${plugin}_CMAKE_EXE_LINKER_FLAGS}") + endforeach() +endif() if(HAVE_SSE42 AND NOT MSVC) set_source_files_properties( @@ -734,11 +970,11 @@ PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") endif() -if(HAVE_POWER8) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") list(APPEND SOURCES util/crc32c_ppc.c util/crc32c_ppc_asm.S) -endif(HAVE_POWER8) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") if(HAS_ARMV8_CRC) list(APPEND SOURCES @@ -753,7 +989,6 @@ port/win/port_win.cc port/win/win_logger.cc 
port/win/win_thread.cc) - if(WITH_XPRESS) list(APPEND SOURCES port/win/xpress_win.cc) @@ -799,13 +1034,13 @@ set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) endif() -add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES}) -target_link_libraries(${ROCKSDB_STATIC_LIB} +add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES} ${BUILD_VERSION_CC}) +target_link_libraries(${ROCKSDB_STATIC_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(ROCKSDB_BUILD_SHARED) - add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) - target_link_libraries(${ROCKSDB_SHARED_LIB} + add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES} ${BUILD_VERSION_CC}) + target_link_libraries(${ROCKSDB_SHARED_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(WIN32) @@ -822,8 +1057,7 @@ LINKER_LANGUAGE CXX VERSION ${rocksdb_VERSION} SOVERSION ${rocksdb_VERSION_MAJOR} - CXX_STANDARD 11 - OUTPUT_NAME "rocksdb") + OUTPUT_NAME "rocksdb${ARTIFACT_SUFFIX}") endif() endif() @@ -834,6 +1068,16 @@ endif() option(WITH_JNI "build with JNI" OFF) +# Tests are excluded from Release builds +CMAKE_DEPENDENT_OPTION(WITH_TESTS "build with tests" ON + "CMAKE_BUILD_TYPE STREQUAL Debug" OFF) +option(WITH_BENCHMARK_TOOLS "build with benchmarks" ON) +option(WITH_CORE_TOOLS "build with ldb and sst_dump" ON) +option(WITH_TOOLS "build with tools" ON) + +if(WITH_TESTS OR WITH_BENCHMARK_TOOLS OR WITH_TOOLS OR WITH_JNI OR JNI) + include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.8.1/fused-src) +endif() if(WITH_JNI OR JNI) message(STATUS "JNI library is enabled") add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/java) @@ -871,6 +1115,8 @@ install(DIRECTORY include/rocksdb COMPONENT devel DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" COMPONENT devel DESTINATION ${package_config_destination}) + install( TARGETS ${ROCKSDB_STATIC_LIB} EXPORT RocksDBTargets @@ -907,29 +1153,49 @@ ) endif() -# Tests are excluded from Release builds -CMAKE_DEPENDENT_OPTION(WITH_TESTS "build with tests" ON 
- "CMAKE_BUILD_TYPE STREQUAL Debug" OFF) -if(WITH_TESTS) +option(WITH_ALL_TESTS "Build all test, rather than a small subset" ON) + +if(WITH_TESTS OR WITH_BENCHMARK_TOOLS) add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest) add_library(testharness STATIC + test_util/mock_time_env.cc test_util/testharness.cc) target_link_libraries(testharness gtest) +endif() +if(WITH_TESTS) set(TESTS + db/db_basic_test.cc + env/env_basic_test.cc + ) + if(WITH_ALL_TESTS) + list(APPEND TESTS + cache/cache_reservation_manager_test.cc cache/cache_test.cc cache/lru_cache_test.cc + db/blob/blob_counting_iterator_test.cc + db/blob/blob_file_addition_test.cc + db/blob/blob_file_builder_test.cc + db/blob/blob_file_cache_test.cc + db/blob/blob_file_garbage_test.cc + db/blob/blob_file_reader_test.cc + db/blob/blob_garbage_meter_test.cc + db/blob/db_blob_basic_test.cc + db/blob/db_blob_compaction_test.cc + db/blob/db_blob_corruption_test.cc + db/blob/db_blob_index_test.cc db/column_family_test.cc db/compact_files_test.cc + db/compaction/clipping_iterator_test.cc db/compaction/compaction_job_stats_test.cc db/compaction/compaction_job_test.cc db/compaction/compaction_iterator_test.cc db/compaction/compaction_picker_test.cc + db/compaction/compaction_service_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc - db/db_basic_test.cc - db/db_blob_index_test.cc + db/db_with_timestamp_basic_test.cc db/db_block_cache_test.cc db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc @@ -941,6 +1207,7 @@ db/db_iter_test.cc db/db_iter_stress_test.cc db/db_iterator_test.cc + db/db_kv_checksum_test.cc db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc @@ -948,19 +1215,21 @@ db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_impl/db_secondary_test.cc + db/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc db/db_tailing_iter_test.cc db/db_test.cc db/db_test2.cc + 
db/db_logical_block_size_cache_test.cc db/db_universal_compaction_test.cc db/db_wal_test.cc + db/db_with_timestamp_compaction_test.cc db/db_write_test.cc db/dbformat_test.cc db/deletefile_test.cc - db/error_handler_test.cc + db/error_handler_fs_test.cc db/obsolete_files_test.cc db/external_sst_file_basic_test.cc db/external_sst_file_test.cc @@ -976,6 +1245,7 @@ db/merge_test.cc db/options_file_test.cc db/perf_context_test.cc + db/periodic_work_scheduler_test.cc db/plain_table_db_test.cc db/prefix_test.cc db/range_del_aggregator_test.cc @@ -986,17 +1256,21 @@ db/version_edit_test.cc db/version_set_test.cc db/wal_manager_test.cc + db/wal_edit_test.cc db/write_batch_test.cc db/write_callback_test.cc db/write_controller_test.cc - env/env_basic_test.cc env/env_test.cc + env/io_posix_test.cc env/mock_env_test.cc file/delete_scheduler_test.cc + file/prefetch_test.cc + file/random_access_file_reader_test.cc logging/auto_roll_logger_test.cc logging/env_logger_test.cc logging/event_logger_test.cc memory/arena_test.cc + memory/memory_allocator_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc @@ -1004,9 +1278,12 @@ monitoring/iostats_context_test.cc monitoring/statistics_test.cc monitoring/stats_history_test.cc + options/configurable_test.cc + options/customizable_test.cc options/options_settable_test.cc options/options_test.cc table/block_based/block_based_filter_block_test.cc + table/block_based/block_based_table_reader_test.cc table/block_based/block_test.cc table/block_based/data_block_hash_index_test.cc table/block_based/full_filter_block_test.cc @@ -1017,7 +1294,12 @@ table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc + table/block_fetcher_test.cc + test_util/testutil_test.cc + trace_replay/block_cache_tracer_test.cc + trace_replay/io_tracer_test.cc tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc + tools/io_tracer_parser_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc 
tools/sst_dump_test.cc @@ -1035,11 +1317,14 @@ util/random_test.cc util/rate_limiter_test.cc util/repeatable_thread_test.cc + util/ribbon_test.cc util/slice_test.cc util/slice_transform_test.cc util/timer_queue_test.cc + util/timer_test.cc util/thread_list_test.cc util/thread_local_test.cc + util/work_queue_test.cc utilities/backupable/backupable_db_test.cc utilities/blob_db/blob_db_test.cc utilities/cassandra/cassandra_functional_test.cc @@ -1059,11 +1344,14 @@ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc utilities/transactions/transaction_test.cc + utilities/transactions/lock/point/point_lock_manager_test.cc utilities/transactions/write_prepared_transaction_test.cc utilities/transactions/write_unprepared_transaction_test.cc + utilities/transactions/lock/range/range_locking_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc - ) + ) + endif() if(WITH_LIBRADOS) list(APPEND TESTS utilities/env_librados_test.cc) endif() @@ -1076,7 +1364,6 @@ db/db_test_util.cc monitoring/thread_status_updater_debug.cc table/mock_table.cc - test_util/fault_injection_test_env.cc utilities/cassandra/test_utils.cc ) enable_testing() @@ -1091,21 +1378,25 @@ PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 - ) + ) foreach(sourcefile ${TESTS}) get_filename_component(exename ${sourcefile} NAME_WE) - add_executable(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} ${sourcefile}) - set_target_properties(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) + set_target_properties(${exename}${ARTIFACT_SUFFIX} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 OUTPUT_NAME ${exename}${ARTIFACT_SUFFIX} - ) - 
target_link_libraries(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${ROCKSDB_LIB}) + ) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) if(NOT "${exename}" MATCHES "db_sanity_test") - add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) - add_dependencies(check ${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX}) + gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120) + add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) + endif() + if("${exename}" MATCHES "env_librados_test") + # env_librados_test.cc uses librados directly + target_link_libraries(${exename}${ARTIFACT_SUFFIX} rados) endif() endforeach(sourcefile ${TESTS}) @@ -1122,57 +1413,71 @@ if(ROCKSDB_LIB_FOR_C) set(C_TESTS db/c_test.c) - # C executables must link to a shared object add_executable(c_test db/c_test.c) - target_link_libraries(c_test ${ROCKSDB_SHARED_LIB} testharness) + target_link_libraries(c_test ${ROCKSDB_LIB_FOR_C} testharness) add_test(NAME c_test COMMAND c_test${ARTIFACT_SUFFIX}) add_dependencies(check c_test) endif() endif() -option(WITH_BENCHMARK_TOOLS "build with benchmarks" ON) if(WITH_BENCHMARK_TOOLS) - add_executable(db_bench + add_executable(db_bench${ARTIFACT_SUFFIX} + tools/simulated_hybrid_file_system.cc tools/db_bench.cc tools/db_bench_tool.cc) - target_link_libraries(db_bench - ${ROCKSDB_LIB}) + target_link_libraries(db_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) - add_executable(cache_bench - cache/cache_bench.cc) - target_link_libraries(cache_bench - ${ROCKSDB_LIB}) + add_executable(cache_bench${ARTIFACT_SUFFIX} + cache/cache_bench.cc + cache/cache_bench_tool.cc) + target_link_libraries(cache_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(memtablerep_bench + add_executable(memtablerep_bench${ARTIFACT_SUFFIX} memtable/memtablerep_bench.cc) - target_link_libraries(memtablerep_bench - 
${ROCKSDB_LIB}) + target_link_libraries(memtablerep_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(range_del_aggregator_bench + add_executable(range_del_aggregator_bench${ARTIFACT_SUFFIX} db/range_del_aggregator_bench.cc) - target_link_libraries(range_del_aggregator_bench - ${ROCKSDB_LIB}) + target_link_libraries(range_del_aggregator_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(table_reader_bench + add_executable(table_reader_bench${ARTIFACT_SUFFIX} table/table_reader_bench.cc) - target_link_libraries(table_reader_bench - ${ROCKSDB_LIB} testharness) + target_link_libraries(table_reader_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} testharness ${GFLAGS_LIB}) - add_executable(filter_bench + add_executable(filter_bench${ARTIFACT_SUFFIX} util/filter_bench.cc) - target_link_libraries(filter_bench - ${ROCKSDB_LIB}) + target_link_libraries(filter_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(hash_table_bench + add_executable(hash_table_bench${ARTIFACT_SUFFIX} utilities/persistent_cache/hash_table_bench.cc) - target_link_libraries(hash_table_bench - ${ROCKSDB_LIB}) + target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) endif() -option(WITH_TOOLS "build with tools" ON) -if(WITH_TOOLS) +if(WITH_CORE_TOOLS OR WITH_TOOLS) add_subdirectory(tools) + add_custom_target(core_tools + DEPENDS ${core_tool_deps}) +endif() + +if(WITH_TOOLS) add_subdirectory(db_stress_tool) add_custom_target(tools DEPENDS ${tool_deps}) endif() + +option(WITH_EXAMPLES "build with examples" OFF) +if(WITH_EXAMPLES) + add_subdirectory(examples) +endif() + +option(WITH_BENCHMARK "build benchmark tests" OFF) +if(WITH_BENCHMARK) + add_subdirectory(${PROJECT_SOURCE_DIR}/microbench/) +endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md mariadb-10.11.13/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,4 @@ -# RocksDB default options change log +# RocksDB default options change log (NO LONGER MAINTAINED) ## Unreleased * delayed_write_rate takes the rate given by rate_limiter if not specified. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/HISTORY.md mariadb-10.11.13/storage/rocksdb/rocksdb/HISTORY.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/HISTORY.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/HISTORY.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,9 +1,707 @@ # Rocksdb Change Log -## Unreleased +## 6.29.5 (03/29/2022) ### Bug Fixes +* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. +* Fixed a race condition when mmaping a WritableFile on POSIX. +* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail. +* Fixed a heap use-after-free race with DropColumnFamily. +* Fixed a bug that `rocksdb.read.block.compaction.micros` cannot track compaction stats (#9722). + +## 6.29.4 (03/22/2022) +### Bug Fixes +* Fixed a bug caused by race among flush, incoming writes and taking snapshots. Queries to snapshots created with these race condition can return incorrect result, e.g. resurfacing deleted data. +* Fixed a bug that DisableManualCompaction may assert when disable an unscheduled manual compaction. +* Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() performed. 
+* Fixed a race condition when disable and re-enable manual compaction. +* Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread. +* Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496) +* Fixed a read-after-free bug in `DB::GetMergeOperands()`. +* Fixed NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, NUM_DATA_BLOCKS_READ_PER_LEVEL, and NUM_SST_READ_PER_LEVEL stats to be reported once per MultiGet batch per level. + +## 6.29.3 (02/17/2022) +### Bug Fixes +* Fix a data loss bug for 2PC write-committed transaction caused by concurrent transaction commit and memtable switch (#9571). + +## 6.29.2 (02/15/2022) +### Performance Improvements +* DisableManualCompaction() doesn't have to wait scheduled manual compaction to be executed in thread-pool to cancel the job. + +## 6.29.1 (01/31/2022) +### Bug Fixes +* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.) + +## 6.29.0 (01/21/2022) +Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info. +### Public API change +* Added values to `TraceFilterType`: `kTraceFilterIteratorSeek`, `kTraceFilterIteratorSeekForPrev`, and `kTraceFilterMultiGet`. They can be set in `TraceOptions` to filter out the operation types after which they are named. +* Added `TraceOptions::preserve_write_order`. When enabled it guarantees write records are traced in the same order they are logged to WAL and applied to the DB. By default it is disabled (false) to match the legacy behavior and prevent regression. +* Made the Env class extend the Customizable class. 
Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. +* `Options::OldDefaults` is marked deprecated, as it is no longer maintained. +* Add ObjectLibrary::AddFactory and ObjectLibrary::PatternEntry classes. This method and associated class are the preferred mechanism for registering factories with the ObjectLibrary going forward. The ObjectLibrary::Register method, which uses regular expressions and may be problematic, is deprecated and will be in a future release. +* Changed `BlockBasedTableOptions::block_size` from `size_t` to `uint64_t`. +* Added API warning against using `Iterator::Refresh()` together with `DB::DeleteRange()`, which are incompatible and have always risked causing the refreshed iterator to return incorrect results. + +### Behavior Changes +* `DB::DestroyColumnFamilyHandle()` will return Status::InvalidArgument() if called with `DB::DefaultColumnFamily()`. +* On 32-bit platforms, mmap reads are no longer quietly disabled, just discouraged. + +### New Features +* Added `Options::DisableExtraChecks()` that can be used to improve peak write performance by disabling checks that should not be necessary in the absence of software logic errors or CPU+memory hardware errors. (Default options are slowly moving toward some performance overheads for extra correctness checking.) + +### Performance Improvements +* Improved read performance when a prefix extractor is used (Seek, Get, MultiGet), even compared to version 6.25 baseline (see bug fix below), by optimizing the common case of prefix extractor compatible with table file and unchanging. + +### Bug Fixes +* Fix a bug that FlushMemTable may return ok even flush not succeed. +* Fixed a bug of Sync() and Fsync() not using `fcntl(F_FULLFSYNC)` on OS X and iOS. +* Fixed a significant performance regression in version 6.26 when a prefix extractor is used on the read path (Seek, Get, MultiGet). 
(Excessive time was spent in SliceTransform::AsString().) + +### New Features +* Added RocksJava support for MacOS universal binary (ARM+x86) + +## 6.28.0 (2021-12-17) +### New Features +* Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-commited transactions with user-defined timestamps. +* Introduce SimulatedHybridFileSystem which can help simulating HDD latency in db_bench. Tiered Storage latency simulation can be enabled using -simulate_hybrid_fs_file (note that it doesn't work if db_bench is interrupted in the middle). -simulate_hdd can also be used to simulate all files on HDD. + +### Bug Fixes +* Fixed a bug in rocksdb automatic implicit prefetching which got broken because of new feature adaptive_readahead and internal prefetching got disabled when iterator moves from one file to next. +* Fixed a bug in TableOptions.prepopulate_block_cache which causes segmentation fault when used with TableOptions.partition_filters = true and TableOptions.cache_index_and_filter_blocks = true. +* Fixed a bug affecting custom memtable factories which are not registered with the `ObjectRegistry`. The bug could result in failure to save the OPTIONS file. +* Fixed a bug causing two duplicate entries to be appended to a file opened in non-direct mode and tracked by `FaultInjectionTestFS`. +* Fixed a bug in TableOptions.prepopulate_block_cache to support block-based filters also. +* Block cache keys no longer use `FSRandomAccessFile::GetUniqueId()` (previously used when available), so a filesystem recycling unique ids can no longer lead to incorrect result or crash (#7405). For files generated by RocksDB >= 6.24, the cache keys are stable across DB::Open and DB directory move / copy / import / export / migration, etc. 
Although collisions are still theoretically possible, they are (a) impossible in many common cases, (b) not dependent on environmental factors, and (c) much less likely than a CPU miscalculation while executing RocksDB. +* Fixed a bug in C bindings causing iterator to return incorrect result (#9343). + +### Behavior Changes +* MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TransactionDB, introduced in PR#5022) Fix #8371. + +### Public API change +* Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes. +* Introduce a new EventListener callback that will be called upon the end of automatic error recovery. +* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low separately. +* Add GetFullHistoryTsLow API so users can query current full_history_low value of specified column family. + +### Performance Improvements +* Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory. +* Block cache accesses are faster by RocksDB using cache keys of fixed size (16 bytes). + +### Java API Changes +* Removed Java API `TableProperties.getPropertiesOffsets()` as it exposed internal details to external users. + +## 6.27.0 (2021-11-19) +### New Features +* Added new ChecksumType kXXH3 which is faster than kCRC32c on almost all x86\_64 hardware. +* Added a new online consistency check for BlobDB which validates that the number/total size of garbage blobs does not exceed the number/total size of all blobs in any given blob file. +* Provided support for tracking per-sst user-defined timestamp information in MANIFEST. +* Added new option "adaptive_readahead" in ReadOptions. 
For iterators, RocksDB does auto-readahead on noticing sequential reads and by enabling this option, readahead_size of current file (if reads are sequential) will be carried forward to next file instead of starting from the scratch at each level (except L0 level files). If reads are not sequential it will fall back to 8KB. This option is applicable only for RocksDB internal prefetch buffer and isn't supported with underlying file system prefetching. +* Added the read count and read bytes related stats to Statistics for tiered storage hot, warm, and cold file reads. +* Added an option to dynamically charge an updating estimated memory usage of block-based table building to block cache if block cache available. It currently only includes charging memory usage of constructing (new) Bloom Filter and Ribbon Filter to block cache. To enable this feature, set `BlockBasedTableOptions::reserve_table_builder_memory = true`. +* Add a new API OnIOError in listener.h that notifies listeners when an IO error occurs during FileSystem operation along with filename, status etc. +* Added compaction readahead support for blob files to the integrated BlobDB implementation, which can improve compaction performance when the database resides on higher-latency storage like HDDs or remote filesystems. Readahead can be configured using the column family option `blob_compaction_readahead_size`. + +### Bug Fixes +* Prevent a `CompactRange()` with `CompactRangeOptions::change_level == true` from possibly causing corruption to the LSM state (overlapping files within a level) when run in parallel with another manual compaction. Note that setting `force_consistency_checks == true` (the default) would cause the DB to enter read-only mode in this scenario and return `Status::Corruption`, rather than committing any corruption. +* Fixed a bug in CompactionIterator when write-prepared transaction is used. 
A released earliest write conflict snapshot may cause assertion failure in dbg mode and unexpected key in opt mode. +* Fix ticker WRITE_WITH_WAL("rocksdb.write.wal"), this bug is caused by a bad extra `RecordTick(stats_, WRITE_WITH_WAL)` (in 2 places), this fix removes the extra `RecordTick`s and fixes the corresponding test case. +* EventListener::OnTableFileCreated was previously called with OK status and file_size==0 in cases of no SST file contents written (because there was no content to add) and the empty file deleted before calling the listener. Now the status is Aborted. +* Fixed a bug in CompactionIterator when write-prepared transaction is used. Releasing earliest_snapshot during compaction may cause a SingleDelete to be output after a PUT of the same user key whose seq has been zeroed. +* Added input sanitization on negative bytes passed into `GenericRateLimiter::Request`. +* Fixed an assertion failure in CompactionIterator when write-prepared transaction is used. We prove that certain operations can lead to a Delete being followed by a SingleDelete (same user key). We can drop the SingleDelete. +* Fixed a bug of timestamp-based GC which can cause all versions of a key under full_history_ts_low to be dropped. This bug will be triggered when some of the ikeys' timestamps are lower than full_history_ts_low, while others are newer. +* In some cases outside of the DB read and compaction paths, SST block checksums are now checked where they were not before. +* Explicitly check for and disallow the `BlockBasedTableOptions` if insertion into one of {`block_cache`, `block_cache_compressed`, `persistent_cache`} can show up in another of these. (RocksDB expects to be able to use the same key for different physical data among tiers.) 
+* Users who configured a dedicated thread pool for bottommost compactions by explicitly adding threads to the `Env::Priority::BOTTOM` pool will no longer see RocksDB schedule automatic compactions exceeding the DB's compaction concurrency limit. For details on per-DB compaction concurrency limit, see API docs of `max_background_compactions` and `max_background_jobs`. +* Fixed a bug of background flush thread picking more memtables to flush and prematurely advancing column family's log_number. +* Fixed an assertion failure in ManifestTailer. +* Fixed a bug that could, with WAL enabled, cause backups, checkpoints, and `GetSortedWalFiles()` to fail randomly with an error like `IO error: 001234.log: No such file or directory` + +### Behavior Changes +* `NUM_FILES_IN_SINGLE_COMPACTION` was only counting the first input level files, now it's including all input files. +* `TransactionUtil::CheckKeyForConflicts` can also perform conflict-checking based on user-defined timestamps in addition to sequence numbers. +* Removed `GenericRateLimiter`'s minimum refill bytes per period previously enforced. + +### Public API change +* When options.ttl is used with leveled compaction with compaction priority kMinOverlappingRatio, files exceeding half of TTL value will be prioritized more, so that by the time TTL is reached, fewer extra compactions will be scheduled to clear them up. At the same time, when compacting files with data older than half of TTL, output files may be cut off based on those files' boundaries, in order for the early TTL compaction to work properly. +* Made FileSystem and RateLimiter extend the Customizable class and added a CreateFromString method. Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. +* Clarified in API comments that RocksDB is not exception safe for callbacks and custom extensions. 
An exception propagating into RocksDB can lead to undefined behavior, including data loss, unreported corruption, deadlocks, and more. +* Marked `WriteBufferManager` as `final` because it is not intended for extension. +* Removed unimportant implementation details from table_properties.h +* Add API `FSDirectory::FsyncWithDirOptions()`, which provides extra information like directory fsync reason in `DirFsyncOptions`. File system like btrfs is using that to skip directory fsync for creating a new file, or when renaming a file, fsync the target file instead of the directory, which improves the `DB::Open()` speed by ~20%. +* `DB::Open()` is not going be blocked by obsolete file purge if `DBOptions::avoid_unnecessary_blocking_io` is set to true. +* In builds where glibc provides `gettid()`, info log ("LOG" file) lines now print a system-wide thread ID from `gettid()` instead of the process-local `pthread_self()`. For all users, the thread ID format is changed from hexadecimal to decimal integer. +* In builds where glibc provides `pthread_setname_np()`, the background thread names no longer contain an ID suffix. For example, "rocksdb:bottom7" (and all other threads in the `Env::Priority::BOTTOM` pool) are now named "rocksdb:bottom". Previously large thread pools could breach the name size limit (e.g., naming "rocksdb:bottom10" would fail). +* Deprecating `ReadOptions::iter_start_seqnum` and `DBOptions::preserve_deletes`, please try using user defined timestamp feature instead. The options will be removed in a future release, currently it logs a warning message when using. + +### Performance Improvements +* Released some memory related to filter construction earlier in `BlockBasedTableBuilder` for `FullFilter` and `PartitionedFilter` case (#9070) + +### Behavior Changes +* `NUM_FILES_IN_SINGLE_COMPACTION` was only counting the first input level files, now it's including all input files. 
+ +## 6.26.0 (2021-10-20) +### Bug Fixes +* Fixes a bug in directed IO mode when calling MultiGet() for blobs in the same blob file. The bug is caused by not sorting the blob read requests by file offsets. +* Fix the incorrect disabling of SST rate limited deletion when the WAL and DB are in different directories. Only WAL rate limited deletion should be disabled if its in a different directory. +* Fix `DisableManualCompaction()` to cancel compactions even when they are waiting on automatic compactions to drain due to `CompactRangeOptions::exclusive_manual_compactions == true`. +* Fix contract of `Env::ReopenWritableFile()` and `FileSystem::ReopenWritableFile()` to specify any existing file must not be deleted or truncated. +* Fixed bug in calls to `IngestExternalFiles()` with files for multiple column families. The bug could have introduced a delay in ingested file keys becoming visible after `IngestExternalFiles()` returned. Furthermore, mutations to ingested file keys while they were invisible could have been dropped (not necessarily immediately). +* Fixed a possible race condition impacting users of `WriteBufferManager` who constructed it with `allow_stall == true`. The race condition led to undefined behavior (in our experience, typically a process crash). +* Fixed a bug where stalled writes would remain stalled forever after the user calls `WriteBufferManager::SetBufferSize()` with `new_size == 0` to dynamically disable memory limiting. +* Make `DB::close()` thread-safe. +* Fix a bug in atomic flush where one bg flush thread will wait forever for a preceding bg flush thread to commit its result to MANIFEST but encounters an error which is mapped to a soft error (DB not stopped). +* Fix a bug in `BackupEngine` where some internal callers of `GenericRateLimiter::Request()` do not honor `bytes <= GetSingleBurstBytes()`. 
+ +### New Features +* Print information about blob files when using "ldb list_live_files_metadata" +* Provided support for SingleDelete with user defined timestamp. +* Experimental new function DB::GetLiveFilesStorageInfo offers essentially a unified version of other functions like GetLiveFiles, GetLiveFilesChecksumInfo, and GetSortedWalFiles. Checkpoints and backups could show small behavioral changes and/or improved performance as they now use this new API. +* Add remote compaction read/write bytes statistics: `REMOTE_COMPACT_READ_BYTES`, `REMOTE_COMPACT_WRITE_BYTES`. +* Introduce an experimental feature to dump out the blocks from block cache and insert them to the secondary cache to reduce the cache warmup time (e.g., used while migrating DB instance). More information are in `class CacheDumper` and `CacheDumpedLoader` at `rocksdb/utilities/cache_dump_load.h` Note that, this feature is subject to the potential change in the future, it is still experimental. +* Introduced a new BlobDB configuration option `blob_garbage_collection_force_threshold`, which can be used to trigger compactions targeting the SST files which reference the oldest blob files when the ratio of garbage in those blob files meets or exceeds the specified threshold. This can reduce space amplification with skewed workloads where the affected SST files might not otherwise get picked up for compaction. +* Added EXPERIMENTAL support for table file (SST) unique identifiers that are stable and universally unique, available with new function `GetUniqueIdFromTableProperties`. Only SST files from RocksDB >= 6.24 support unique IDs. +* Added `GetMapProperty()` support for "rocksdb.dbstats" (`DB::Properties::kDBStats`). As a map property, it includes DB-level internal stats accumulated over the DB's lifetime, such as user write related stats and uptime. + +### Public API change +* Made SystemClock extend the Customizable class and added a CreateFromString method. 
Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. +* Made SliceTransform extend the Customizable class and added a CreateFromString method. Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. The Capped and Prefixed transform classes return a short name (no length); use GetId for the fully qualified name. +* Made FileChecksumGenFactory, SstPartitionerFactory, TablePropertiesCollectorFactory, and WalFilter extend the Customizable class and added a CreateFromString method. +* Some fields of SstFileMetaData are deprecated for compatibility with new base class FileStorageInfo. +* Add `file_temperature` to `IngestExternalFileArg` such that when ingesting SST files, we are able to indicate the temperature of the this batch of files. +* If `DB::Close()` failed with a non aborted status, calling `DB::Close()` again will return the original status instead of Status::OK. +* Add CacheTier to advanced_options.h to describe the cache tier we used. Add a `lowest_used_cache_tier` option to `DBOptions` (immutable) and pass it to BlockBasedTableReader. By default it is `CacheTier::kNonVolatileBlockTier`, which means, we always use both block cache (kVolatileTier) and secondary cache (kNonVolatileBlockTier). By set it to `CacheTier::kVolatileTier`, the DB will not use the secondary cache. +* Even when options.max_compaction_bytes is hit, compaction output files are only cut when it aligns with grandparent files' boundaries. options.max_compaction_bytes could be slightly violated with the change, but the violation is no more than one target SST file size, which is usually much smaller. + +### Performance Improvements +* Improved CPU efficiency of building block-based table (SST) files (#9039 and #9040). 
+ +### Java API Changes +* Add Java API bindings for new integrated BlobDB options +* `keyMayExist()` supports ByteBuffer. +* Fix multiget throwing Null Pointer Exception for num of keys > 70k (https://github.com/facebook/rocksdb/issues/8039). + +## 6.25.0 (2021-09-20) +### Bug Fixes +* Allow secondary instance to refresh iterator. Assign read seq after referencing SuperVersion. +* Fixed a bug of secondary instance's last_sequence going backward, and reads on the secondary fail to see recent updates from the primary. +* Fixed a bug that could lead to duplicate DB ID or DB session ID in POSIX environments without /proc/sys/kernel/random/uuid. +* Fix a race in DumpStats() with column family destruction due to not taking a Ref on each entry while iterating the ColumnFamilySet. +* Fix a race in item ref counting in LRUCache when promoting an item from the SecondaryCache. +* Fix a race in BackupEngine if RateLimiter is reconfigured during concurrent Restore operations. +* Fix a bug on POSIX in which failure to create a lock file (e.g. out of space) can prevent future LockFile attempts in the same process on the same file from succeeding. +* Fix a bug that backup_rate_limiter and restore_rate_limiter in BackupEngine could not limit read rates. +* Fix the implementation of `prepopulate_block_cache = kFlushOnly` to only apply to flushes rather than to all generated files. +* Fix WAL log data corruption when using DBOptions.manual_wal_flush(true) and WriteOptions.sync(true) together. The sync WAL should work with locked log_write_mutex_. 
+* Add checks for validity of the IO uring completion queue entries, and fail the BlockBasedTableReader MultiGet sub-batch if there's an invalid completion +* Add an interface RocksDbIOUringEnable() that, if defined by the user, will allow them to enable/disable the use of IO uring by RocksDB +* Fix the bug that when direct I/O is used and MultiRead() returns a short result, RandomAccessFileReader::MultiRead() still returns full size buffer, with returned short value together with some data in original buffer. This bug is unlikely to cause incorrect results, because (1) since FileSystem layer is expected to retry on short result, returning short results is only possible when asking more bytes in the end of the file, which RocksDB doesn't do when using MultiRead(); (2) checksum is unlikely to match. + +### New Features +* RemoteCompaction's interface now includes `db_name`, `db_id`, `session_id`, which could help the user uniquely identify compaction job between db instances and sessions. +* Added a ticker statistic, "rocksdb.verify_checksum.read.bytes", reporting how many bytes were read from file to serve `VerifyChecksum()` and `VerifyFileChecksums()` queries. +* Added ticker statistics, "rocksdb.backup.read.bytes" and "rocksdb.backup.write.bytes", reporting how many bytes were read and written during backup. +* Added properties for BlobDB: `rocksdb.num-blob-files`, `rocksdb.blob-stats`, `rocksdb.total-blob-file-size`, and `rocksdb.live-blob-file-size`. The existing property `rocksdb.estimate-live-data-size` was also extended to include live bytes residing in blob files. +* Added two new RateLimiter IOPriorities: `Env::IO_USER`,`Env::IO_MID`. `Env::IO_USER` will have superior priority over all other RateLimiter IOPriorities without being subject to fair scheduling constraint. +* `SstFileWriter` now supports `Put`s and `Delete`s with user-defined timestamps. Note that the ingestion logic itself is not timestamp-aware yet. 
+* Allow a single write batch to include keys from multiple column families whose timestamps' formats can differ. For example, some column families may disable timestamp, while others enable timestamp. +* Add compaction priority information in RemoteCompaction, which can be used to schedule high priority job first. +* Added new callback APIs `OnBlobFileCreationStarted`,`OnBlobFileCreated`and `OnBlobFileDeleted` in `EventListener` class of listener.h. It notifies listeners during creation/deletion of individual blob files in Integrated BlobDB. It also log blob file creation finished event and deletion event in LOG file. +* Batch blob read requests for `DB::MultiGet` using `MultiRead`. +* Add support for fallback to local compaction, the user can return `CompactionServiceJobStatus::kUseLocal` to instruct RocksDB to run the compaction locally instead of waiting for the remote compaction result. +* Add built-in rate limiter's implementation of `RateLimiter::GetTotalPendingRequest(int64_t* total_pending_requests, const Env::IOPriority pri)` for the total number of requests that are pending for bytes in the rate limiter. +* Charge memory usage during data buffering, from which training samples are gathered for dictionary compression, to block cache. Unbuffering data can now be triggered if the block cache becomes full and `strict_capacity_limit=true` for the block cache, in addition to existing conditions that can trigger unbuffering. + +### Public API change +* Remove obsolete implementation details FullKey and ParseFullKey from public API +* Change `SstFileMetaData::size` from `size_t` to `uint64_t`. +* Made Statistics extend the Customizable class and added a CreateFromString method. Implementations of Statistics need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. 
+* Extended `FlushJobInfo` and `CompactionJobInfo` in listener.h to provide information about the blob files generated by a flush/compaction and garbage collected during compaction in Integrated BlobDB. Added struct members `blob_file_addition_infos` and `blob_file_garbage_infos` that contain this information. +* Extended parameter `output_file_names` of `CompactFiles` API to also include paths of the blob files generated by the compaction in Integrated BlobDB. +* Most `BackupEngine` functions now return `IOStatus` instead of `Status`. Most existing code should be compatible with this change but some calls might need to be updated. +* Add a new field `level_at_creation` in `TablePropertiesCollectorFactory::Context` to capture the level at creating the SST file (i.e, table), of which the properties are being collected. + +### Miscellaneous +* Add a paranoid check where in case FileSystem layer doesn't fill the buffer but returns succeed, checksum is unlikely to match even if buffer contains a previous block. The byte modified is not useful anyway, so it isn't expected to change any behavior when FileSystem is satisfying its contract. + +## 6.24.0 (2021-08-20) +### Bug Fixes +* If the primary's CURRENT file is missing or inaccessible, the secondary instance should not hang repeatedly trying to switch to a new MANIFEST. It should instead return the error code encountered while accessing the file. +* Restoring backups with BackupEngine is now a logically atomic operation, so that if a restore operation is interrupted, DB::Open on it will fail. Using BackupEngineOptions::sync (default) ensures atomicity even in case of power loss or OS crash. +* Fixed a race related to the destruction of `ColumnFamilyData` objects. The earlier logic unlocked the DB mutex before destroying the thread-local `SuperVersion` pointers, which could result in a process crash if another thread managed to get a reference to the `ColumnFamilyData` object. 
+* Removed a call to `RenameFile()` on a non-existent info log file ("LOG") when opening a new DB. Such a call was guaranteed to fail though did not impact applications since we swallowed the error. Now we also stopped swallowing errors in renaming "LOG" file. +* Fixed an issue where `OnFlushCompleted` was not called for atomic flush. +* Fixed a bug affecting the batched `MultiGet` API when used with keys spanning multiple column families and `sorted_input == false`. +* Fixed a potential incorrect result in opt mode and assertion failures caused by releasing snapshot(s) during compaction. +* Fixed passing of BlobFileCompletionCallback to Compaction job and Atomic flush job which was default parameter (nullptr). BlobFileCompletionCallback is internal callback that manages addition of blob files to SSTFileManager. +* Fixed MultiGet not updating the block_read_count and block_read_byte PerfContext counters. + +### New Features +* Made the EventListener extend the Customizable class. +* EventListeners that have a non-empty Name() and that are registered with the ObjectRegistry can now be serialized to/from the OPTIONS file. +* Insert warm blocks (data blocks, uncompressed dict blocks, index and filter blocks) in Block cache during flush under option BlockBasedTableOptions.prepopulate_block_cache. Previously it was enabled for only data blocks. +* BlockBasedTableOptions.prepopulate_block_cache can be dynamically configured using DB::SetOptions. +* Add CompactionOptionsFIFO.age_for_warm, which allows RocksDB to move old files to warm tier in FIFO compactions. Note that file temperature is still an experimental feature. +* Add a comment to suggest btrfs user to disable file preallocation by setting `options.allow_fallocate=false`. +* Fast forward option in Trace replay changed to double type to allow replaying at a lower speed, by setting the value between 0 and 1. 
This option can be set via `ReplayOptions` in `Replayer::Replay()`, or via `--trace_replay_fast_forward` in db_bench. +* Add property `LiveSstFilesSizeAtTemperature` to retrieve sst file size at different temperature. +* Added a stat rocksdb.secondary.cache.hits. +* Added a PerfContext counter secondary_cache_hit_count. +* The integrated BlobDB implementation now supports the tickers `BLOB_DB_BLOB_FILE_BYTES_READ`, `BLOB_DB_GC_NUM_KEYS_RELOCATED`, and `BLOB_DB_GC_BYTES_RELOCATED`, as well as the histograms `BLOB_DB_COMPRESSION_MICROS` and `BLOB_DB_DECOMPRESSION_MICROS`. +* Added hybrid configuration of Ribbon filter and Bloom filter where some LSM levels use Ribbon for memory space efficiency and some use Bloom for speed. See NewRibbonFilterPolicy. This also changes the default behavior of NewRibbonFilterPolicy to use Bloom for flushes under Leveled and Universal compaction and Ribbon otherwise. The C API function `rocksdb_filterpolicy_create_ribbon` is unchanged but adds new `rocksdb_filterpolicy_create_ribbon_hybrid`. + +### Public API change +* Added APIs to decode and replay trace file via Replayer class. Added `DB::NewDefaultReplayer()` to create a default Replayer instance. Added `TraceReader::Reset()` to restart reading a trace file. Created trace_record.h, trace_record_result.h and utilities/replayer.h files to access the decoded Trace records, replay them, and query the actual operation results. +* Added Configurable::GetOptionsMap to the public API for use in creating new Customizable classes. +* Generalized bits_per_key parameters in C API from int to double for greater configurability. Although this is a compatible change for existing C source code, anything depending on C API signatures, such as foreign function interfaces, will need to be updated. + +### Performance Improvements +* Try to avoid updating DBOptions if `SetDBOptions()` does not change any option value. 
+ +### Behavior Changes +* `StringAppendOperator` additionally accepts a string as the delimiter. +* BackupEngineOptions::sync (default true) now applies to restoring backups in addition to creating backups. This could slow down restores, but ensures they are fully persisted before returning OK. (Consider increasing max_background_operations to improve performance.) + +## 6.23.0 (2021-07-16) +### Behavior Changes +* Obsolete keys in the bottommost level that were preserved for a snapshot will now be cleaned upon snapshot release in all cases. This form of compaction (snapshot release triggered compaction) previously had an artificial limitation that multiple tombstones needed to be present. +### Bug Fixes +* Blob file checksums are now printed in hexadecimal format when using the `manifest_dump` `ldb` command. +* `GetLiveFilesMetaData()` now populates the `temperature`, `oldest_ancester_time`, and `file_creation_time` fields of its `LiveFileMetaData` results when the information is available. Previously these fields always contained zero indicating unknown. +* Fix mismatches of OnCompaction{Begin,Completed} in case of DisableManualCompaction(). +* Fix continuous logging of an existing background error on every user write +* Fix a bug that `Get()` return Status::OK() and an empty value for non-existent key when `read_options.read_tier = kBlockCacheTier`. +* Fix a bug that stat in `get_context` didn't accumulate to statistics when query is failed. +* Fixed handling of DBOptions::wal_dir with LoadLatestOptions() or ldb --try_load_options on a copied or moved DB. Previously, when the WAL directory is same as DB directory (default), a copied or moved DB would reference the old path of the DB as the WAL directory, potentially corrupting both copies. Under this change, the wal_dir from DB::GetOptions() or LoadLatestOptions() may now be empty, indicating that the current DB directory is used for WALs. This is also a subtle API change. 
+ +### New Features +* ldb has a new feature, `list_live_files_metadata`, that shows the live SST files, as well as their LSM storage level and the column family they belong to. +* The new BlobDB implementation now tracks the amount of garbage in each blob file in the MANIFEST. +* Integrated BlobDB now supports Merge with base values (Put/Delete etc.). +* RemoteCompaction supports sub-compaction, the job_id in the user interface is changed from `int` to `uint64_t` to support sub-compaction id. +* Expose statistics option in RemoteCompaction worker. + +### Public API change +* Added APIs to the Customizable class to allow developers to create their own Customizable classes. Created the utilities/customizable_util.h file to contain helper methods for developing new Customizable classes. +* Change signature of SecondaryCache::Name(). Make SecondaryCache customizable and add SecondaryCache::CreateFromString method. + +## 6.22.0 (2021-06-18) +### Behavior Changes +* Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time. +* Added API comments clarifying safe usage of Disable/EnableManualCompaction and EventListener callbacks for compaction. +### Bug Fixes +* fs_posix.cc GetFreeSpace() always report disk space available to root even when running as non-root. Linux defaults often have disk mounts with 5 to 10 percent of total space reserved only for root. Out of space could result for non-root users. +* Subcompactions are now disabled when user-defined timestamps are used, since the subcompaction boundary picking logic is currently not timestamp-aware, which could lead to incorrect results when different subcompactions process keys that only differ by timestamp. +* Fix an issue that `DeleteFilesInRange()` may cause ongoing compaction reports corruption exception, or ASSERT for debug build. 
There's no actual data loss or corruption that we find. +* Fixed confusingly duplicated output in LOG for periodic stats ("DUMPING STATS"), including "Compaction Stats" and "File Read Latency Histogram By Level". +* Fixed performance bugs in background gathering of block cache entry statistics, that could consume a lot of CPU when there are many column families with a shared block cache. + +### New Features +* Marked the Ribbon filter and optimize_filters_for_memory features as production-ready, each enabling memory savings for Bloom-like filters. Use `NewRibbonFilterPolicy` in place of `NewBloomFilterPolicy` to use Ribbon filters instead of Bloom, or `ribbonfilter` in place of `bloomfilter` in configuration string. +* Allow `DBWithTTL` to use `DeleteRange` api just like other DBs. `DeleteRangeCF()` which executes `WriteBatchInternal::DeleteRange()` has been added to the handler in `DBWithTTLImpl::Write()` to implement it. +* Add BlockBasedTableOptions.prepopulate_block_cache. If enabled, it prepopulate warm/hot data blocks which are already in memory into block cache at the time of flush. On a flush, the data block that is in memory (in memtables) get flushed to the device. If using Direct IO, additional IO is incurred to read this data back into memory again, which is avoided by enabling this option and it also helps with Distributed FileSystem. More details in include/rocksdb/table.h. +* Added a `cancel` field to `CompactRangeOptions`, allowing individual in-process manual range compactions to be cancelled. + +### New Features +* Added BlobMetaData to the ColumnFamilyMetaData to return information about blob files + +### Public API change +* Added GetAllColumnFamilyMetaData API to retrieve the ColumnFamilyMetaData about all column families. + +## 6.21.0 (2021-05-21) +### Bug Fixes +* Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. 
The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened. +* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results. +* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`. +* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. +* Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed. +* Handle return code by io_uring_submit_and_wait() and io_uring_wait_cqe(). +* In the IngestExternalFile() API, only try to sync the ingested file if the file is linked and the FileSystem/Env supports reopening a writable file. +* Fixed a bug that `AdvancedColumnFamilyOptions.max_compaction_bytes` is under-calculated for manual compaction (`CompactRange()`). Manual compaction is split to multiple compactions if the compaction size exceeds the `max_compaction_bytes`. The bug creates much larger compaction whose size exceeds the user setting. On the other hand, larger manual compaction size can increase the subcompaction parallelism, you can tune that by setting `max_compaction_bytes`. + +### Behavior Changes +* Due to the fix of false-positive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. 
We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. + +### New Features +* Add new option allow_stall passed during instance creation of WriteBufferManager. When allow_stall is set, WriteBufferManager will stall all writers shared across multiple DBs and columns if memory usage goes beyond specified WriteBufferManager::buffer_size (soft limit). Stall will be cleared when memory is freed after flush and memory usage goes down below buffer_size. +* Allow `CompactionFilter`s to apply in more table file creation scenarios such as flush and recovery. For compatibility, `CompactionFilter`s by default apply during compaction. Users can customize this behavior by overriding `CompactionFilterFactory::ShouldFilterTableFileCreation()`. +* Added more fields to FilterBuildingContext with LSM details, for custom filter policies that vary behavior based on where they are in the LSM-tree. +* Added DB::Properties::kBlockCacheEntryStats for querying statistics on what percentage of block cache is used by various kinds of blocks, etc. using DB::GetProperty and DB::GetMapProperty. The same information is now dumped to info LOG periodically according to `stats_dump_period_sec`. +* Add an experimental Remote Compaction feature, which allows the user to run Compaction on a different host or process. 
The feature is still under development, currently only works on some basic use cases. The interface will be changed without backward/forward compatibility support. +* RocksDB would validate total entries read in flush, and compare with counter inserted into it. If flush_verify_memtable_count = true (default), flush will fail. Otherwise, only log to info logs. +* Add `TableProperties::num_filter_entries`, which can be used with `TableProperties::filter_size` to calculate the effective bits per filter entry (unique user key or prefix) for a table file. + +### Performance Improvements +* BlockPrefetcher is used by iterators to prefetch data if they anticipate more data to be used in future. It is enabled implicitly by rocksdb. Added change to take into account read pattern if reads are sequential. This would disable prefetching for random reads in MultiGet and iterators as readahead_size is increased exponentially doing large prefetches. + +### Public API change +* Removed a parameter from TableFactory::NewTableBuilder, which should not be called by user code because TableBuilder is not a public API. +* Removed unused structure `CompactionFilterContext`. +* The `skip_filters` parameter to SstFileWriter is now considered deprecated. Use `BlockBasedTableOptions::filter_policy` to control generation of filters. +* ClockCache is known to have bugs that could lead to crash or corruption, so should not be used until fixed. Use NewLRUCache instead. +* Added a new pure virtual function `ApplyToAllEntries` to `Cache`, to replace `ApplyToAllCacheEntries`. Custom `Cache` implementations must add an implementation. Because this function is for gathering statistics, an empty implementation could be acceptable for some applications. +* Added the ObjectRegistry to the ConfigOptions class. This registry instance will be used to find any customizable loadable objects during initialization. +* Expanded the ObjectRegistry functionality to allow nested ObjectRegistry instances. 
Added methods to register a set of functions with the registry/library as a group. +* Deprecated backupable_db.h and BackupableDBOptions in favor of new versions with appropriate names: backup_engine.h and BackupEngineOptions. Old API compatibility is preserved. + +### Default Option Change +* When options.arena_block_size <= 0 (default value 0), still use writer_buffer_size / 8 but cap to 1MB. Too large an allocation size might not be friendly to allocator and might cause performance issues in extreme cases. + +### Build +* By default, try to build with liburing. For make, if ROCKSDB_USE_IO_URING is not set, treat as enable, which means RocksDB will try to build with liburing. Users can disable it with ROCKSDB_USE_IO_URING=0. For cmake, add WITH_LIBURING to control it, with default on. + +## 6.20.0 (2021-04-16) +### Behavior Changes +* `ColumnFamilyOptions::sample_for_compression` now takes effect for creation of all block-based tables. Previously it only took effect for block-based tables created by flush. +* `CompactFiles()` can no longer compact files from lower level to up level, which has the risk to corrupt DB (details: #8063). The validation is also added to all compactions. +* Fixed some cases in which DB::OpenForReadOnly() could write to the filesystem. If you want a Logger with a read-only DB, you must now set DBOptions::info_log yourself, such as using CreateLoggerFromOptions(). +* get_iostats_context() will never return nullptr. If thread-local support is not available, and user does not opt-out iostats context, then compilation will fail. The same applies to perf context as well. +* Added support for WriteBatchWithIndex::NewIteratorWithBase when overwrite_key=false. Previously, this combination was not supported and would assert or return nullptr. +* Improve the behavior of WriteBatchWithIndex for Merge operations. Now more operations may be stored in order to return the correct merged result. 
+ +### Bug Fixes +* Use thread-safe `strerror_r()` to get error messages. +* Fixed a potential hang in shutdown for a DB whose `Env` has high-pri thread pool disabled (`Env::GetBackgroundThreads(Env::Priority::HIGH) == 0`) +* Made BackupEngine thread-safe and added documentation comments to clarify what is safe for multiple BackupEngine objects accessing the same backup directory. +* Fixed crash (divide by zero) when compression dictionary is applied to a file containing only range tombstones. +* Fixed a backward iteration bug with partitioned filter enabled: not including the prefix of the last key of the previous filter partition in current filter partition can cause wrong iteration result. +* Fixed a bug that allowed `DBOptions::max_open_files` to be set with a non-negative integer with `ColumnFamilyOptions::compaction_style = kCompactionStyleFIFO`. + +### Performance Improvements +* On ARM platform, use `yield` instead of `wfe` to relax cpu to gain better performance. + +### Public API change +* Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead. +* Update DB::StartIOTrace and remove Env object from the arguments as its redundant and DB already has Env object that is passed down to IOTracer::StartIOTrace +* Added `FlushReason::kWalFull`, which is reported when a memtable is flushed due to the WAL reaching its size limit; those flushes were previously reported as `FlushReason::kWriteBufferManager`. Also, changed the reason for flushes triggered by the write buffer manager to `FlushReason::kWriteBufferManager`; they were previously reported as `FlushReason::kWriteBufferFull`. 
+* Extend file_checksum_dump ldb command and DB::GetLiveFilesChecksumInfo API for IntegratedBlobDB and get checksum of blob files along with SST files. + +### New Features +* Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true. +* Added BackupEngine support for integrated BlobDB, with blob files shared between backups when table files are shared. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up. +* Added an optional output parameter to BackupEngine::CreateNewBackup(WithMetadata) to return the BackupID of the new backup. +* Added BackupEngine::GetBackupInfo / GetLatestBackupInfo for querying individual backups. +* Made the Ribbon filter a long-term supported feature in terms of the SST schema(compatible with version >= 6.15.0) though the API for enabling it is expected to change. + +## 6.19.0 (2021-03-21) +### Bug Fixes +* Fixed the truncation error found in APIs/tools when dumping block-based SST files in a human-readable format. After fix, the block-based table can be fully dumped as a readable file. +* When hitting a write slowdown condition, no write delay (previously 1 millisecond) is imposed until `delayed_write_rate` is actually exceeded, with an initial burst allowance of 1 millisecond worth of bytes. Also, beyond the initial burst allowance, `delayed_write_rate` is now more strictly enforced, especially with multiple column families. + +### Public API change +* Changed default `BackupableDBOptions::share_files_with_checksum` to `true` and deprecated `false` because of potential for data loss. Note that accepting this change in behavior can temporarily increase backup data usage because files are not shared between backups using the two different settings. 
Also removed obsolete option kFlagMatchInterimNaming. +* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read up to max_auto_readahead_size and now max_auto_readahead_size can be configured dynamically as well. Found that 256 KB readahead size provides the best performance, based on experiments, for auto readahead. Experiment data is in PR #3282. If the value is set to 0 then no automatic prefetching will be done by rocksdb. Also changing the value will only affect files opened after the change. +* Add support to extend DB::VerifyFileChecksums API to also verify blob files checksum. +* When using the new BlobDB, the amount of data written by flushes/compactions is now broken down into table files and blob files in the compaction statistics; namely, Write(GB) denotes the amount of data written to table files, while Wblob(GB) means the amount of data written to blob files. +* New default BlockBasedTableOptions::format_version=5 to enable new Bloom filter implementation by default, compatible with RocksDB versions >= 6.6.0. +* Add new SetBufferSize API to WriteBufferManager to allow dynamic management of memory allotted to all write buffers. This allows user code to adjust memory monitoring provided by WriteBufferManager as process memory needs change and datasets grow and shrink. +* Clarified the required semantics of Read() functions in FileSystem and Env APIs. Please ensure any custom implementations are compliant. +* For the new integrated BlobDB implementation, compaction statistics now include the amount of data read from blob files during compaction (due to garbage collection or compaction filters). Write amplification metrics have also been extended to account for data read from blob files. +* Add EqualWithoutTimestamp() to Comparator. 
+ +* Extend support to track blob files in SSTFileManager whenever a blob file is created/deleted. Blob files will be scheduled to delete via SSTFileManager and SSTFileManager will now take blob files into account while calculating size and space limits along with SST files. +* Add new Append and PositionedAppend API with checksum handoff to legacy Env. + +### New Features +* Support compaction filters for the new implementation of BlobDB. Add `FilterBlobByKey()` to `CompactionFilter`. Subclasses can override this method so that compaction filters can determine whether the actual blob value has to be read during compaction. Use a new `kUndetermined` in `CompactionFilter::Decision` to indicate that further action is necessary for compaction filter to make a decision. +* Add support to extend retrieval of checksums for blob files from the MANIFEST when checkpointing. During backup, rocksdb can detect corruption in blob files during file copies. +* Add new options for db_bench --benchmarks: flush, waitforcompaction, compact0, compact1. +* Add an option to BackupEngine::GetBackupInfo to include the name and size of each backed-up file. Especially in the presence of file sharing among backups, this offers detailed insight into backup space usage. +* Enable backward iteration on keys with user-defined timestamps. +* Add statistics and info log for error handler: counters for bg error, bg io error, bg retryable io error, auto resume count, auto resume total retry number, and auto resume success; Histogram for auto resume retry count in each recovery call. Note that each auto resume attempt will have one or multiple retries. + +### Behavior Changes +* During flush, only WAL sync retryable IO error is mapped to hard error, which will stall the writes. When WAL is used but only SST file write has retryable IO error, it will be mapped to soft error and write will not be affected. 
+ +## 6.18.0 (2021-02-19) +### Behavior Changes +* When retryable IO error occurs during compaction, it is mapped to soft error and set the BG error. However, auto resume is not called to clean the soft error since compaction will reschedule by itself. In this change, when retryable IO error occurs during compaction, BG error is not set. User will be informed of the error via EventHelper. +* Introduce a new trace file format for query tracing and replay and trace file version is bumped up to 0.2. A payload map is added as the first portion of the payload. We will not have backward compatible issues when adding new entries to trace records. Added the iterator_upper_bound and iterator_lower_bound in Seek and SeekForPrev tracing function. Added them as the new payload member for iterator tracing. + +### New Features +* Add support for key-value integrity protection in live updates from the user buffers provided to `WriteBatch` through the write to RocksDB's in-memory update buffer (memtable). This is intended to detect some cases of in-memory data corruption, due to either software or hardware errors. Users can enable protection by constructing their `WriteBatch` with `protection_bytes_per_key == 8`. +* Add support for updating `full_history_ts_low` option in manual compaction, which is for old timestamp data GC. +* Add a mechanism for using Makefile to build external plugin code into the RocksDB libraries/binaries. This intends to simplify compatibility and distribution for plugins (e.g., special-purpose `FileSystem`s) whose source code resides outside the RocksDB repo. See "plugin/README.md" for developer details, and "PLUGINS.md" for a listing of available plugins. +* Added memory pre-fetching for experimental Ribbon filter, which especially optimizes performance with batched MultiGet. +* A new, experimental version of BlobDB (key-value separation) is now available. The new implementation is integrated into the RocksDB core, i.e. 
it is accessible via the usual `rocksdb::DB` API, as opposed to the separate `rocksdb::blob_db::BlobDB` interface used by the earlier version, and can be configured on a per-column family basis using the configuration options `enable_blob_files`, `min_blob_size`, `blob_file_size`, `blob_compression_type`, `enable_blob_garbage_collection`, and `blob_garbage_collection_age_cutoff`. It extends RocksDB's consistency guarantees to blobs, and offers more features and better performance. Note that some features, most notably `Merge`, compaction filters, and backup/restore are not yet supported, and there is no support for migrating a database created by the old implementation. + +### Bug Fixes +* Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. +* `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. +* Fix `WRITE_PREPARED`, `WRITE_UNPREPARED` TransactionDB `MultiGet()` may return uncommitted data with snapshot. +* In DB::OpenForReadOnly, if any error happens while checking Manifest file path, it was overridden by Status::NotFound. It has been fixed and now actual error is returned. + +### Public API Change +* Added a "only_mutable_options" flag to the ConfigOptions. When this flag is "true", the Configurable functions and convenience methods (such as GetDBOptionsFromString) will only deal with options that are marked as mutable. 
When this flag is true, only options marked as mutable can be configured (a Status::InvalidArgument will be returned) and options not marked as mutable will not be returned or compared. The default is "false", meaning to compare all options. +* Add new Append and PositionedAppend APIs to FileSystem to bring the data verification information (data checksum information) from upper layer (e.g., WritableFileWriter) to the storage layer. In this way, the customized FileSystem is able to verify the correctness of data being written to the storage on time. Add checksum_handoff_file_types to DBOptions. User can use this option to control which file types (Currently supported file types: kWALFile, kTableFile, kDescriptorFile.) should use the new Append and PositionedAppend APIs to handoff the verification information. Currently, RocksDB only uses crc32c to calculate the checksum for write handoff. +* Add an option, `CompressionOptions::max_dict_buffer_bytes`, to limit the in-memory buffering for selecting samples for generating/training a dictionary. The limit is currently loosely adhered to. + + +## 6.17.0 (2021-01-15) +### Behavior Changes +* When verifying full file checksum with `DB::VerifyFileChecksums()`, we now fail with `Status::InvalidArgument` if the name of the checksum generator used for verification does not match the name of the checksum generator used for protecting the file when it was created. +* Since RocksDB does not continue write the same file if a file write fails for any reason, the file scope write IO error is treated the same as retryable IO error. More information about error handling of file scope IO error is included in `ErrorHandler::SetBGError`. + +### Bug Fixes +* Version older than 6.15 cannot decode VersionEdits `WalAddition` and `WalDeletion`, fixed this by changing the encoded format of them to be ignorable by older versions. +* Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. 
One effect of this race condition could be the process being terminated. + +### Public API Change +* Add a public API WriteBufferManager::dummy_entries_in_cache_usage() which reports the size of dummy entries stored in cache (passed to WriteBufferManager). Dummy entries are used to account for DataBlocks. +* Add a SystemClock class that contains the time-related methods from Env. The original methods in Env may be deprecated in a future release. This class will allow easier testing, development, and expansion of time-related features. +* Add a public API GetRocksBuildProperties and GetRocksBuildInfoAsString to get properties about the current build. These properties may include settings related to the GIT settings (branch, timestamp). This change also sets the "build date" based on the GIT properties, rather than the actual build time, thereby enabling more reproducible builds. + +## 6.16.0 (2020-12-18) +### Behavior Changes +* Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation. + +### Bug Fixes +* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`. +* Fix a bug where compressed blocks read by MultiGet are not inserted into the compressed block cache when use_direct_reads = true. +* Fixed the issue of full scanning on obsolete files when there are too many outstanding compactions with ConcurrentTaskLimiter enabled. +* Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. 
Without this fix, original code introduced in PR7659, when running on a big-endian machine, can mistakenly store read_amp_bytes_per_bit (a uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. +* Fixed prefix extractor with timestamp issues. +* Fixed a bug in atomic flush: in two-phase commit mode, the minimum WAL log number to keep is incorrect. +* Fixed a bug related to checkpoint in PR7789: if there are multiple column families, and the checkpoint is not opened as read only, then in rare cases, data loss may happen in the checkpoint. Since backup engine relies on checkpoint, it may also be affected. +* When ldb --try_load_options is used with the --column_family option, the ColumnFamilyOptions for the specified column family was not loaded from the OPTIONS file. Fix it so it's loaded from OPTIONS and then overridden with command line overrides. + +### New Features +* User defined timestamp feature supports `CompactRange` and `GetApproximateSizes`. +* Support getting aggregated table properties (kAggregatedTableProperties and kAggregatedTablePropertiesAtLevel) with DB::GetMapProperty, for easier access to the data in a structured format. +* Experimental option BlockBasedTableOptions::optimize_filters_for_memory now works with experimental Ribbon filter (as well as Bloom filter). + +### Public API Change +* Deprecated public but rarely-used FilterBitsBuilder::CalculateNumEntry, which is replaced with ApproximateNumEntries taking a size_t parameter and returning size_t. +* To improve portability the functions `Env::GetChildren` and `Env::GetChildrenFileAttributes` will no longer return entries for the special directories `.` or `..`. +* Added a new option `track_and_verify_wals_in_manifest`. 
If `true`, the log numbers and sizes of the synced WALs are tracked in MANIFEST, then during DB recovery, if a synced WAL is missing from disk, or the WAL's size does not match the recorded size in MANIFEST, an error will be reported and the recovery will be aborted. Note that this option does not work with secondary instance. +* `rocksdb_approximate_sizes` and `rocksdb_approximate_sizes_cf` in the C API now requires an error pointer (`char** errptr`) for receiving any error. +* All overloads of DB::GetApproximateSizes now return Status, so that any failure to obtain the sizes is indicated to the caller. + +## 6.15.0 (2020-11-13) +### Bug Fixes +* Fixed a bug in the following combination of features: indexes with user keys (`format_version >= 3`), indexes are partitioned (`index_type == kTwoLevelIndexSearch`), and some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`). The bug could cause keys to be truncated when read from the index leading to wrong read results or other unexpected behavior. +* Fixed a bug when indexes are partitioned (`index_type == kTwoLevelIndexSearch`), some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`), and partitions reads could be mixed between block cache and directly from the file (e.g., with `enable_index_compression == 1` and `mmap_read == 1`, partitions that were stored uncompressed due to poor compression ratio would be read directly from the file via mmap, while partitions that were stored compressed would be read from block cache). The bug could cause index partitions to be mistakenly considered empty during reads leading to wrong read results. +* Since 6.12, memtable lookup should report unrecognized value_type as corruption (#7121). +* Since 6.14, fix false positive flush/compaction `Status::Corruption` failure when `paranoid_file_checks == true` and range tombstones were written to the compaction output files. 
+* Since 6.14, fix a bug that could cause a stalled write to crash with mixed of slowdown and no_slowdown writes (`WriteOptions.no_slowdown=true`). +* Fixed a bug which causes hang in closing DB when refit level is set in opt build. It was because ContinueBackgroundWork() was called in assert statement which is a no op. It was introduced in 6.14. +* Fixed a bug which causes Get() to return incorrect result when a key's merge operand is applied twice. This can occur if the thread performing Get() runs concurrently with a background flush thread and another thread writing to the MANIFEST file (PR6069). +* Reverted a behavior change silently introduced in 6.14.2, in which the effects of the `ignore_unknown_options` flag (used in option parsing/loading functions) changed. +* Reverted a behavior change silently introduced in 6.14, in which options parsing/loading functions began returning `NotFound` instead of `InvalidArgument` for option names not available in the present version. +* Fixed MultiGet bugs it doesn't return valid data with user defined timestamp. +* Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before `TableBuilder::Finish()` in compaction job. For example, the `NeedCompact()` method of `CompactOnDeletionCollector` returned by built-in `CompactOnDeletionCollectorFactory` requires `BlockBasedTable::Finish()` to return the correct result. The bug can cause a compaction-generated file not to be marked for future compaction based on deletion ratio. +* Fixed a seek issue with prefix extractor and timestamp. +* Fixed a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer. +* Fixed a bug of a recovery corner case, details in PR7621. + +### Public API Change +* Deprecate `BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache` and `BlockBasedTableOptions::pin_top_level_index_and_filter`. 
These options still take effect until users migrate to the replacement APIs in `BlockBasedTableOptions::metadata_cache_options`. Migration guidance can be found in the API comments on the deprecated options. +* Add new API `DB::VerifyFileChecksums` to verify SST file checksum with corresponding entries in the MANIFEST if present. Current implementation requires scanning and recomputing file checksums. + +### Behavior Changes +* The dictionary compression settings specified in `ColumnFamilyOptions::compression_opts` now additionally affect files generated by flush and compaction to non-bottommost level. Previously those settings at most affected files generated by compaction to bottommost level, depending on whether `ColumnFamilyOptions::bottommost_compression_opts` overrode them. Users who relied on dictionary compression settings in `ColumnFamilyOptions::compression_opts` affecting only the bottommost level can keep the behavior by moving their dictionary settings to `ColumnFamilyOptions::bottommost_compression_opts` and setting its `enabled` flag. +* When the `enabled` flag is set in `ColumnFamilyOptions::bottommost_compression_opts`, those compression options now take effect regardless of the value in `ColumnFamilyOptions::bottommost_compression`. Previously, those compression options only took effect when `ColumnFamilyOptions::bottommost_compression != kDisableCompressionOption`. Now, they additionally take effect when `ColumnFamilyOptions::bottommost_compression == kDisableCompressionOption` (such a setting causes bottommost compression type to fall back to `ColumnFamilyOptions::compression_per_level` if configured, and otherwise fall back to `ColumnFamilyOptions::compression`). + +### New Features +* An EXPERIMENTAL new Bloom alternative that saves about 30% space compared to Bloom filters, with about 3-4x construction time and similar query times is available using NewExperimentalRibbonFilterPolicy. 
+ +## 6.14 (2020-10-09) +### Bug fixes +* Fixed a bug after a `CompactRange()` with `CompactRangeOptions::change_level` set fails due to a conflict in the level change step, which caused all subsequent calls to `CompactRange()` with `CompactRangeOptions::change_level` set to incorrectly fail with a `Status::NotSupported("another thread is refitting")` error. +* Fixed a bug that the bottom most level compaction could still be a trivial move even if `BottommostLevelCompaction.kForce` or `kForceOptimized` is set. + +### Public API Change +* The methods to create and manage EncryptedEnv have been changed. The EncryptionProvider is now passed to NewEncryptedEnv as a shared pointer, rather than a raw pointer. Comparably, the CTREncryptionProvider now takes a shared pointer, rather than a reference, to a BlockCipher. CreateFromString methods have been added to BlockCipher and EncryptionProvider to provide a single API by which different ciphers and providers can be created, respectively. +* The internal classes (CTREncryptionProvider, ROT13BlockCipher, CTRCipherStream) associated with the EncryptedEnv have been moved out of the public API. To create a CTREncryptionProvider, one can either use EncryptionProvider::NewCTRProvider, or EncryptionProvider::CreateFromString("CTR"). To create a new ROT13BlockCipher, one can either use BlockCipher::NewROT13Cipher or BlockCipher::CreateFromString("ROT13"). +* The EncryptionProvider::AddCipher method has been added to allow keys to be added to an EncryptionProvider. This API will allow future providers to support multiple cipher keys. +* Add a new option "allow_data_in_errors". When this new option is set by users, it allows users to opt-in to get error messages containing corrupted keys/values. Corrupt keys, values will be logged in the messages, logs, status etc. that will help users with the useful information regarding affected data. 
By default value of this option is set false to prevent users data to be exposed in the messages so currently, data will be redacted from logs, messages, status by default. +* AdvancedColumnFamilyOptions::force_consistency_checks is now true by default, for more proactive DB corruption detection at virtually no cost (estimated two extra CPU cycles per million on a major production workload). Corruptions reported by these checks now mention "force_consistency_checks" in case a false positive corruption report is suspected and the option needs to be disabled (unlikely). Since existing column families have a saved setting for force_consistency_checks, only new column families will pick up the new default. + +### General Improvements +* The settings of the DBOptions and ColumnFamilyOptions are now managed by Configurable objects (see New Features). The same convenience methods to configure these options still exist but the backend implementation has been unified under a common implementation. + +### New Features + +* Methods to configure serialize, and compare -- such as TableFactory -- are exposed directly through the Configurable base class (from which these objects inherit). This change will allow for better and more thorough configuration management and retrieval in the future. The options for a Configurable object can be set via the ConfigureFromMap, ConfigureFromString, or ConfigureOption method. The serialized version of the options of an object can be retrieved via the GetOptionString, ToString, or GetOption methods. The list of options supported by an object can be obtained via the GetOptionNames method. The "raw" object (such as the BlockBasedTableOption) for an option may be retrieved via the GetOptions method. Configurable options can be compared via the AreEquivalent method. The settings within a Configurable object may be validated via the ValidateOptions method. 
The object may be initialized (at which point only mutable options may be updated) via the PrepareOptions method. +* Introduce options.check_flush_compaction_key_order with default value to be true. With this option, during flush and compaction, key order will be checked when writing to each SST file. If the order is violated, the flush or compaction will fail. +* Added is_full_compaction to CompactionJobStats, so that the information is available through the EventListener interface. +* Add more stats for MultiGet in Histogram to get number of data blocks, index blocks, filter blocks and sst files read from file system per level. +* SST files have a new table property called db_host_id, which is set to the hostname by default. A new option in DBOptions, db_host_id, allows the property value to be overridden with a user specified string, or disable it completely by making the option string empty. +* Methods to create customizable extensions -- such as TableFactory -- are exposed directly through the Customizable base class (from which these objects inherit). This change will allow these Customizable classes to be loaded and configured in a standard way (via CreateFromString). More information on how to write and use Customizable classes is in the customizable.h header file. + +## 6.13 (2020-09-12) +### Bug fixes +* Fix a performance regression introduced in 6.4 that makes an upper bound check for every Next() even if keys are within a data block that is within the upper bound. +* Fix a possible corruption to the LSM state (overlapping files within a level) when a `CompactRange()` for refitting levels (`CompactRangeOptions::change_level == true`) and another manual compaction are executed in parallel. +* Sanitize `recycle_log_file_num` to zero when the user attempts to enable it in combination with `WALRecoveryMode::kTolerateCorruptedTailRecords`. Previously the two features were allowed together, which compromised the user's configured crash-recovery guarantees. 
+* Fix a bug where a level refitting in CompactRange() might race with an automatic compaction that puts the data to the target level of the refitting. The bug has been there for years. +* Fixed a bug in version 6.12 in which BackupEngine::CreateNewBackup could fail intermittently with non-OK status when backing up a read-write DB configured with a DBOptions::file_checksum_gen_factory. +* Fix useless no-op compactions scheduled upon snapshot release when options.disable-auto-compactions = true. +* Fix a bug when max_write_buffer_size_to_maintain is set, immutable flushed memtable destruction is delayed until the next super version is installed. A memtable is not added to delete list because of its reference held by super version and super version doesn't switch because of empty delete list. So memory usage keeps on increasing beyond write_buffer_size + max_write_buffer_size_to_maintain. +* Avoid converting MERGES to PUTS when allow_ingest_behind is true. +* Fix compression dictionary sampling together with `SstFileWriter`. Previously, the dictionary would be trained/finalized immediately with zero samples. Now, the whole `SstFileWriter` file is buffered in memory and then sampled. +* Fix a bug with `avoid_unnecessary_blocking_io=1` and creating backups (BackupEngine::CreateNewBackup) or checkpoints (Checkpoint::Create). With this setting and WAL enabled, these operations could randomly fail with non-OK status. +* Fix a bug in which bottommost compaction continues to advance the underlying InternalIterator to skip tombstones even after shutdown. + +### New Features +* A new field `std::string requested_checksum_func_name` is added to `FileChecksumGenContext`, which enables the checksum factory to create generators for a suite of different functions. +* Added a new subcommand, `ldb unsafe_remove_sst_file`, which removes a lost or corrupt SST file from a DB's metadata. This command involves data loss and must not be used on a live DB. 
+ +### Performance Improvements +* Reduce thread number for multiple DB instances by re-using one global thread for statistics dumping and persisting. +* Reduce write-amp in heavy write bursts in `kCompactionStyleLevel` compaction style with `level_compaction_dynamic_level_bytes` set. +* BackupEngine incremental backups no longer read DB table files that are already saved to a shared part of the backup directory, unless `share_files_with_checksum` is used with `kLegacyCrc32cAndFileSize` naming (discouraged). + * For `share_files_with_checksum`, we are confident there is no regression (vs. pre-6.12) in detecting DB or backup corruption at backup creation time, mostly because the old design did not leverage this extra checksum computation for detecting inconsistencies at backup creation time. + * For `share_table_files` without "checksum" (not recommended), there is a regression in detecting fundamentally unsafe use of the option, greatly mitigated by file size checking (under "Behavior Changes"). Almost no reason to use `share_files_with_checksum=false` should remain. + * `DB::VerifyChecksum` and `BackupEngine::VerifyBackup` with checksum checking are still able to catch corruptions that `CreateNewBackup` does not. + +### Public API Change +* Expose kTypeDeleteWithTimestamp in EntryType and update GetEntryType() accordingly. +* Added file_checksum and file_checksum_func_name to TableFileCreationInfo, which can pass the table file checksum information through the OnTableFileCreated callback during flush and compaction. +* A warning is added to `DB::DeleteFile()` API describing its known problems and deprecation plan. +* Add a new stats level, i.e. StatsLevel::kExceptTickers (PR7329) to exclude tickers even if application passes a non-null Statistics object. +* Added a new status code IOStatus::IOFenced() for the Env/FileSystem to indicate that writes from this instance are fenced off. 
Like any other background error, this error is returned to the user in Put/Merge/Delete/Flush calls and can be checked using Status::IsIOFenced(). + +### Behavior Changes +* File abstraction `FSRandomAccessFile.Prefetch()` default return status is changed from `OK` to `NotSupported`. If the user inherited file doesn't implement prefetch, RocksDB will create internal prefetch buffer to improve read performance. +* When retryable IO error happens during Flush (manifest write error is excluded) and WAL is disabled, originally it is mapped to kHardError. Now, it is mapped to soft error. So DB will not stall the writes unless the memtable is full. At the same time, when auto resume is triggered to recover the retryable IO error during Flush, SwitchMemtable is not called to avoid generating too many small immutable memtables. If WAL is enabled, no behavior changes. +* When considering whether a table file is already backed up in a shared part of backup directory, BackupEngine would already query the sizes of source (DB) and pre-existing destination (backup) files. BackupEngine now uses these file sizes to detect corruption, as at least one of (a) old backup, (b) backup in progress, or (c) current DB is corrupt if there's a size mismatch. + +### Others +* Error in prefetching partitioned index blocks will not be swallowed. It will fail the query and return the IOError to users. + +## 6.12 (2020-07-28) +### Public API Change +* Encryption file classes now exposed for inheritance in env_encryption.h +* File I/O listener is extended to cover more I/O operations. Now class `EventListener` in listener.h contains new callback functions: `OnFileFlushFinish()`, `OnFileSyncFinish()`, `OnFileRangeSyncFinish()`, `OnFileTruncateFinish()`, and ``OnFileCloseFinish()``. +* `FileOperationInfo` now reports `duration` measured by `std::chrono::steady_clock` and `start_ts` measured by `std::chrono::system_clock` instead of start and finish timestamps measured by `system_clock`. 
Note that `system_clock` is called before `steady_clock` in program order at operation starts. +* `DB::GetDbSessionId(std::string& session_id)` is added. `session_id` stores a unique identifier that gets reset every time the DB is opened. This DB session ID should be unique among all open DB instances on all hosts, and should be unique among re-openings of the same or other DBs. This identifier is recorded in the LOG file on the line starting with "DB Session ID:". +* `DB::OpenForReadOnly()` now returns `Status::NotFound` when the specified DB directory does not exist. Previously the error returned depended on the underlying `Env`. This change is available in all 6.11 releases as well. +* A parameter `verify_with_checksum` is added to `BackupEngine::VerifyBackup`, which is false by default. If it is true, `BackupEngine::VerifyBackup` verifies checksums and file sizes of backup files. Pass `false` for `verify_with_checksum` to maintain the previous behavior and performance of `BackupEngine::VerifyBackup`, by only verifying sizes of backup files. + +### Behavior Changes +* Best-efforts recovery ignores CURRENT file completely. If CURRENT file is missing during recovery, best-efforts recovery still proceeds with MANIFEST file(s). +* In best-efforts recovery, an error that is not Corruption or IOError::kNotFound or IOError::kPathNotFound will be overwritten silently. Fix this by checking all non-ok cases and return early. +* When `file_checksum_gen_factory` is set to `GetFileChecksumGenCrc32cFactory()`, BackupEngine will compare the crc32c checksums of table files computed when creating a backup to the expected checksums stored in the DB manifest, and will fail `CreateNewBackup()` on mismatch (corruption). If the `file_checksum_gen_factory` is not set or set to any other customized factory, there is no checksum verification to detect if SST files in a DB are corrupt when read, copied, and independently checksummed by BackupEngine. 
+* When a DB sets `stats_dump_period_sec > 0`, either as the initial value for DB open or as a dynamic option change, the first stats dump is staggered in the following X seconds, where X is an integer in `[0, stats_dump_period_sec)`. Subsequent stats dumps are still spaced `stats_dump_period_sec` seconds apart. +* When the paranoid_file_checks option is true, a hash of all keys and values is generated when the SST file is written, and then the values are read back in to validate the file. A corruption is signaled if the two hashes do not match. + +### Bug fixes +* Compressed block cache was automatically disabled with read-only DBs by mistake. Now it is fixed: compressed block cache will be in effect with read-only DB too. +* Fix a bug of wrong iterator result if another thread finishes an update and a DB flush between two statements. +* Disable file deletion after MANIFEST write/sync failure until db re-open or Resume() so that subsequent re-open will not see MANIFEST referencing deleted SSTs. +* Fix a bug when index_type == kTwoLevelIndexSearch in PartitionedIndexBuilder to update FlushPolicy to point to internal key partitioner when it changes from user-key mode to internal-key mode in index partition. +* Make compaction report InternalKey corruption while iterating over the input. +* Fix a bug which may cause MultiGet to be slow because it may read more data than requested, but this won't affect correctness. The bug was introduced in 6.10 release. +* Fail recovery and report once hitting a physical log record checksum mismatch, while reading MANIFEST. RocksDB should not continue processing the MANIFEST any further. +* Fixed a bug in size-amp-triggered and periodic-triggered universal compaction, where the compression settings for the first input level were used rather than the compression settings for the output (bottom) level. 
+ +### New Features +* DB identity (`db_id`) and DB session identity (`db_session_id`) are added to table properties and stored in SST files. SST files generated from SstFileWriter and Repairer have DB identity "SST Writer" and "DB Repairer", respectively. Their DB session IDs are generated in the same way as `DB::GetDbSessionId`. The session ID for SstFileWriter (resp., Repairer) resets every time `SstFileWriter::Open` (resp., `Repairer::Run`) is called. +* Added experimental option BlockBasedTableOptions::optimize_filters_for_memory for reducing allocated memory size of Bloom filters (~10% savings with Jemalloc) while preserving the same general accuracy. To have an effect, the option requires format_version=5 and malloc_usable_size. Enabling this option is forward and backward compatible with existing format_version=5. +* `BackupableDBOptions::share_files_with_checksum_naming` is added with new default behavior for naming backup files with `share_files_with_checksum`, to address performance and backup integrity issues. See API comments for details. +* Added auto resume function to automatically recover the DB from background Retryable IO Error. When retryable IOError happens during flush and WAL write, the error is mapped to Hard Error and DB will be in read mode. When retryable IO Error happens during compaction, the error will be mapped to Soft Error. DB is still in write/read mode. Autoresume function will create a thread for a DB to call DB->ResumeImpl() to try the recover for Retryable IO Error during flush and WAL write. Compaction will be rescheduled by itself if retryable IO Error happens. Auto resume may also cause other Retryable IO Error during the recovery, so the recovery will fail. Retry the auto resume may solve the issue, so we use max_bgerror_resume_count to decide how many resume cycles will be tried in total. If it is <=0, auto resume retryable IO Error is disabled. Default is INT_MAX, which will lead to an infinite auto resume. 
bgerror_resume_retry_interval decides the time interval between two auto resumes. +* Option `max_subcompactions` can be set dynamically using DB::SetDBOptions(). +* Added experimental ColumnFamilyOptions::sst_partitioner_factory to determine the partitioning of sst files. This helps compaction to split the files on interesting boundaries (key prefixes) to make propagation of sst files less write amplifying (covering the whole key space). + +### Performance Improvements +* Eliminate key copies for internal comparisons while accessing ingested block-based tables. +* Reduce key comparisons during random access in all block-based tables. +* BackupEngine avoids unnecessary repeated checksum computation for backing up a table file to the `shared_checksum` directory when using `share_files_with_checksum_naming = kUseDbSessionId` (new default), except on SST files generated before this version of RocksDB, which fall back on using `kLegacyCrc32cAndFileSize`. + +## 6.11 (2020-06-12) +### Bug Fixes +* Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true. +* Fix possible false NotFound status from batched MultiGet using index type kHashSearch. +* Fix corruption caused by enabling delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode, along with parallel compactions. The bug can result in two parallel compactions picking the same input files, resulting in the DB resurrecting older and deleted versions of some keys. +* Fix a use-after-free bug in best-efforts recovery. column_family_memtables_ needs to point to valid ColumnFamilySet. +* Let best-efforts recovery ignore corrupted files during table loading. +* Fix corrupt key read from ingested file when iterator direction switches from reverse to forward at a key that is a prefix of another key in the same file. It is only possible in files with a non-zero global seqno. 
+* Fix abnormally large estimate from GetApproximateSizes when a range starts near the end of one SST file and near the beginning of another. Now GetApproximateSizes consistently and fairly includes the size of SST metadata in addition to data blocks, attributing metadata proportionally among the data blocks based on their size. +* Fix potential file descriptor leakage in PosixEnv's IsDirectory() and NewRandomAccessFile(). +* Fix false negative from the VerifyChecksum() API when there is a checksum mismatch in an index partition block in a BlockBasedTable format table file (index_type is kTwoLevelIndexSearch). +* Fix sst_dump to return non-zero exit code if the specified file is not a recognized SST file or fails requested checks. +* Fix incorrect results from batched MultiGet for duplicate keys, when the duplicate key matches the largest key of an SST file and the value type for the key in the file is a merge value. + +### Public API Change +* Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request. +* BlobDB now explicitly disallows using the default column family's storage directories as blob directory. +* DeleteRange now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. +* ldb now uses options.force_consistency_checks = true by default and "--disable_consistency_checks" is added to disable it. +* DB::OpenForReadOnly no longer creates files or directories if the named DB does not exist, unless create_if_missing is set to true. +* The consistency checks that validate LSM state changes (table file additions/deletions during flushes and compactions) are now stricter, more efficient, and no longer optional, i.e. they are performed even if `force_consistency_checks` is `false`. 
+* Disable delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode and num_levels = 1 in order to avoid a corruption bug. +* `pin_l0_filter_and_index_blocks_in_cache` no longer applies to L0 files larger than `1.5 * write_buffer_size` to give more predictable memory usage. Such L0 files may exist due to intra-L0 compaction, external file ingestion, or user dynamically changing `write_buffer_size` (note, however, that files that are already pinned will continue being pinned, even after such a dynamic change). +* In point-in-time wal recovery mode, fail database recovery in case of IOError while reading the WAL to avoid data loss. +* A new method `Env::LowerThreadPoolCPUPriority(Priority, CpuPriority)` is added to `Env` to be able to lower to a specific priority such as `CpuPriority::kIdle`. + +### New Features +* sst_dump to add a new --readahead_size argument. Users can specify read size when scanning the data. Sst_dump also tries to prefetch tail part of the SST files so usually some number of I/Os are saved there too. +* Generate file checksum in SstFileWriter if Options.file_checksum_gen_factory is set. The checksum and checksum function name are stored in ExternalSstFileInfo after the sst file write is finished. +* Add a value_size_soft_limit in read options which limits the cumulative value size of keys read in batches in MultiGet. Once the cumulative value size of found keys exceeds read_options.value_size_soft_limit, all the remaining keys are returned with status Abort without further finding their values. By default the value_size_soft_limit is std::numeric_limits::max(). +* Enable SST file ingestion with file checksum information when calling IngestExternalFiles(const std::vector& args). Added files_checksums and files_checksum_func_names to IngestExternalFileArg such that user can ingest the sst files with their file checksum information. Added verify_file_checksum to IngestExternalFileOptions (default is True). 
To be backward compatible, if DB does not enable file checksum or user does not provide checksum information (vectors of files_checksums and files_checksum_func_names are both empty), verification of file checksum is always successful. If DB enables file checksum, DB will always generate the checksum for each ingested SST file during Prepare stage of ingestion and store the checksum in Manifest, unless verify_file_checksum is False and checksum information is provided by the application. In this case, we only verify the checksum function name and directly store the ingested checksum in Manifest. If verify_file_checksum is set to True, DB will verify the ingested checksum and function name with the generated ones. Any mismatch will fail the ingestion. Note that, if IngestExternalFileOptions::write_global_seqno is True, the seqno will be changed in the ingested file. Therefore, the checksum of the file will be changed. In this case, a new checksum will be generated after the seqno is updated and be stored in the Manifest. + +### Performance Improvements +* Eliminate redundant key comparisons during random access in block-based tables. + +## 6.10 (2020-05-02) +### Bug Fixes +* Fix wrong result being read from ingested file. May happen when a key in the file happens to be a prefix of another key also in the file. The issue can further cause more data corruption. The issue exists with rocksdb >= 5.0.0 since DB::IngestExternalFile() was introduced. +* Finish implementation of BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It's now ready for use. Significantly reduces read amplification in some setups, especially for iterator seeks. +* Fix a bug by updating CURRENT file so that it points to the correct MANIFEST file after best-efforts recovery. +* Fixed a bug where ColumnFamilyHandle objects were not cleaned up in case an error happened during BlobDB's open after the base DB had been opened. 
+* Fix a potential undefined behavior caused by trying to dereference nullable pointer (timestamp argument) in DB::MultiGet. +* Fix a bug caused by not including user timestamp in MultiGet LookupKey construction. This can lead to wrong query result since the trailing bytes of a user key, if not shorter than timestamp, will be mistaken for user timestamp. +* Fix a bug caused by using wrong compare function when sorting the input keys of MultiGet with timestamps. +* Upgraded version of bzip library (1.0.6 -> 1.0.8) used with RocksJava to address potential vulnerabilities if an attacker can manipulate compressed data saved and loaded by RocksDB (not normal). See issue #6703. + +### Public API Change +* Add a ConfigOptions argument to the APIs dealing with converting options to and from strings and files. The ConfigOptions is meant to replace some of the options (such as input_strings_escaped and ignore_unknown_options) and allow for more parameters to be passed in the future without changing the function signature. +* Add NewFileChecksumGenCrc32cFactory to the file checksum public API, such that the builtin Crc32c based file checksum generator factory can be used by applications. +* Add IsDirectory to Env and FS to indicate if a path is a directory. + +### New Features +* Added support for pipelined & parallel compression optimization for `BlockBasedTableBuilder`. This optimization makes block building, block compression and block appending a pipeline, and uses multiple threads to accelerate block compression. Users can set `CompressionOptions::parallel_threads` greater than 1 to enable compression parallelism. This feature is experimental for now. +* Provide an allocator for memkind to be used with block cache. This is to work with memory technologies (Intel DCPMM is one such technology currently available) that require different libraries for allocation and management (such as PMDK and memkind). 
The high capacities available make it possible to provision large caches (up to several TBs in size) beyond what is achievable with DRAM. +* Option `max_background_flushes` can be set dynamically using DB::SetDBOptions(). +* Added functionality in sst_dump tool to check the compressed file size for different compression levels and print the time spent on compressing files with each compression type. Added arguments `--compression_level_from` and `--compression_level_to` to report size of all compression levels and one compression_type must be specified with it so that it will report compressed sizes of one compression type with different levels. +* Added statistics for redundant insertions into block cache: rocksdb.block.cache.*add.redundant. (There is currently no coordination to ensure that only one thread loads a table block when many threads are trying to access that same table block.) + +### Bug Fixes +* Fix a bug when making options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts dynamically changeable: the modified values are not written to option files or returned back to users when being queried. +* Fix a bug where index key comparisons were unaccounted in `PerfContext::user_key_comparison_count` for lookups in files written with `format_version >= 3`. +* Fix many bloom.filter statistics not being updated in batch MultiGet. + +### Performance Improvements +* Improve performance of batch MultiGet with partitioned filters, by sharing block cache lookups to applicable filter blocks. +* Reduced memory copies when fetching and uncompressing compressed blocks from sst files. + +## 6.9.0 (2020-03-29) +### Behavior changes +* Since RocksDB 6.8, ttl-based FIFO compaction can drop a file whose oldest key becomes older than options.ttl while others have not. This fix reverts this and makes ttl-based FIFO compaction use the file's flush time as the criterion. 
This fix also requires that max_open_files = -1 and compaction_options_fifo.allow_compaction = false to function properly. + +### Public API Change +* Fix spelling so that API now has correctly spelled transaction state name `COMMITTED`, while the old misspelled `COMMITED` is still available as an alias. +* Updated default format_version in BlockBasedTableOptions from 2 to 4. SST files generated with the new default can be read by RocksDB versions 5.16 and newer, and use more efficient encoding of keys in index blocks. +* A new parameter `CreateBackupOptions` is added to both `BackupEngine::CreateNewBackup` and `BackupEngine::CreateNewBackupWithMetadata`, you can decrease CPU priority of `BackupEngine`'s background threads by setting `decrease_background_thread_cpu_priority` and `background_thread_cpu_priority` in `CreateBackupOptions`. +* Updated the public API of SST file checksum. Introduce the FileChecksumGenFactory to create the FileChecksumGenerator for each SST file, such that the FileChecksumGenerator is not shared and it can be more general for checksum implementations. Changed the FileChecksumGenerator interface from Value, Extend, and GetChecksum to Update, Finalize, and GetChecksum. Finalize should be only called once after all data is processed to generate the final checksum. Temporal data should be maintained by the FileChecksumGenerator object itself and finally it can return the checksum string. + +### Bug Fixes +* Fix a bug where range tombstone blocks in ingested files were cached incorrectly during ingestion. If range tombstones were read from those incorrectly cached blocks, the keys they covered would be exposed. * Fix a data race that might cause crash when calling DB::GetCreationTimeOfOldestFile() by a small chance. The bug was introduced in 6.6 Release. +* Fix a bug where a boolean value optimize_filters_for_hits was for max threads when calling load table handles after a flush or compaction. The value is correct to 1. 
The bug should not cause user visible problems. +* Fix a bug which might crash the service when write buffer manager fails to insert the dummy handle to the block cache. + +### Performance Improvements +* In CompactRange, for levels starting from 0, if the level does not have any file with any key falling in the specified range, the level is skipped. So instead of always compacting from level 0, the compaction starts from the first level with keys in the specified range until the last such level. +* Reduced memory copy when reading sst footer and blobdb in direct IO mode. +* When restarting a database with large numbers of sst files, large amount of CPU time is spent on getting logical block size of the sst files, which slows down the starting progress, this inefficiency is optimized away with an internal cache for the logical block sizes. + +### New Features +* Basic support for user timestamp in iterator. Seek/SeekToFirst/Next and lower/upper bounds are supported. Reverse iteration is not supported. Merge is not considered. +* When file lock failure when the lock is held by the current process, return acquiring time and thread ID in the error message. +* Added a new option, best_efforts_recovery (default: false), to allow database to open in a db dir with missing table files. During best efforts recovery, missing table files are ignored, and database recovers to the most recent state without missing table file. Cross-column-family consistency is not guaranteed even if WAL is enabled. +* options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts are now dynamically changeable. -## 6.8.0 (02/24/2020) +## 6.8.0 (2020-02-24) ### Java API Changes * Major breaking changes to Java comparators, toward standardizing on ByteBuffer for performant, locale-neutral operations on keys (#6252). * Added overloads of common API methods using direct ByteBuffers for keys and values (#2283). 
@@ -30,7 +728,7 @@ * `db_bench` now supports `value_size_distribution_type`, `value_size_min`, `value_size_max` options for generating random variable sized value. Added `blob_db_compression_type` option for BlobDB to enable blob compression. * Replace RocksDB namespace "rocksdb" with flag "ROCKSDB_NAMESPACE" which if is not defined, defined as "rocksdb" in header file rocksdb_namespace.h. -## 6.7.0 (01/21/2020) +## 6.7.0 (2020-01-21) ### Public API Change * Added a rocksdb::FileSystem class in include/rocksdb/file_system.h to encapsulate file creation/read/write operations, and an option DBOptions::file_system to allow a user to pass in an instance of rocksdb::FileSystem. If its a non-null value, this will take precendence over DBOptions::env for file operations. A new API rocksdb::FileSystem::Default() returns a platform default object. The DBOptions::env option and Env::Default() API will continue to be used for threading and other OS related functions, and where DBOptions::file_system is not specified, for file operations. For storage developers who are accustomed to rocksdb::Env, the interface in rocksdb::FileSystem is new and will probably undergo some changes as more storage systems are ported to it from rocksdb::Env. As of now, no env other than Posix has been ported to the new interface. * A new rocksdb::NewSstFileManager() API that allows the caller to pass in separate Env and FileSystem objects. @@ -55,11 +753,11 @@ * Introduce ReadOptions.auto_prefix_mode. When set to true, iterator will return the same result as total order seek, but may choose to use prefix seek internally based on seek key and iterator upper bound. * MultiGet() can use IO Uring to parallelize read from the same SST file. This featuer is by default disabled. It can be enabled with environment variable ROCKSDB_USE_IO_URING. 
-## 6.6.2 (01/13/2020) +## 6.6.2 (2020-01-13) ### Bug Fixes * Fixed a bug where non-L0 compaction input files were not considered to compute the `creation_time` of new compaction outputs. -## 6.6.1 (01/02/2020) +## 6.6.1 (2020-01-02) ### Bug Fixes * Fix a bug in WriteBatchWithIndex::MultiGetFromBatchAndDB, which is called by Transaction::MultiGet, that causes due to stale pointer access when the number of keys is > 32 * Fixed two performance issues related to memtable history trimming. First, a new SuperVersion is now created only if some memtables were actually trimmed. Second, trimming is only scheduled if there is at least one flushed memtable that is kept in memory for the purposes of transaction conflict checking. @@ -69,7 +767,7 @@ * Delete superversions in BackgroundCallPurge. * Fix use-after-free and double-deleting files in BackgroundCallPurge(). -## 6.6.0 (11/25/2019) +## 6.6.0 (2019-11-25) ### Bug Fixes * Fix data corruption caused by output of intra-L0 compaction on ingested file not being placed in correct order in L0. * Fix a data race between Version::GetColumnFamilyMetaData() and Compaction::MarkFilesBeingCompacted() for access to being_compacted (#6056). The current fix acquires the db mutex during Version::GetColumnFamilyMetaData(), which may cause regression. @@ -122,19 +820,19 @@ * For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement. * Level iterator to invlidate the iterator more often in prefix seek and the level is filtered out by prefix bloom. 
-## 6.5.2 (11/15/2019) +## 6.5.2 (2019-11-15) ### Bug Fixes * Fix a assertion failure in MultiGet() when BlockBasedTableOptions::no_block_cache is true and there is no compressed block cache * Fix a buffer overrun problem in BlockBasedTable::MultiGet() when compression is enabled and no compressed block cache is configured. * If a call to BackupEngine::PurgeOldBackups or BackupEngine::DeleteBackup suffered a crash, power failure, or I/O error, files could be left over from old backups that could only be purged with a call to GarbageCollect. Any call to PurgeOldBackups, DeleteBackup, or GarbageCollect should now suffice to purge such files. -## 6.5.1 (10/16/2019) +## 6.5.1 (2019-10-16) ### Bug Fixes * Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. * Fix a bug in BlockBasedTableIterator that might return incorrect results when reseek happens with a different iterator upper bound. * Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. -## 6.5.0 (9/13/2019) +## 6.5.0 (2019-09-13) ### Bug Fixes * Fixed a number of data races in BlobDB. * Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0.. @@ -155,7 +853,7 @@ ### Performance Improvements * Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance. -## 6.4.0 (7/30/2019) +## 6.4.0 (2019-07-30) ### Default Option Change * LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. 
The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explicitly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. @@ -191,7 +889,7 @@ * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes/filters as well. -## 6.3.2 (8/15/2019) +## 6.3.2 (2019-08-15) ### Public API Change * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. @@ -199,11 +897,11 @@ * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes as well. -## 6.3.1 (7/24/2019) +## 6.3.1 (2019-07-24) ### Bug Fixes * Fix auto rolling bug introduced in 6.3.0, which causes segfault if log file creation fails. -## 6.3.0 (6/18/2019) +## 6.3.0 (2019-06-18) ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. @@ -240,7 +938,7 @@ * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. 
* On DB open, delete WAL trash files left behind in wal_dir -## 6.2.0 (4/30/2019) +## 6.2.0 (2019-04-30) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. @@ -262,7 +960,7 @@ * Close a WAL file before another thread deletes it. * Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. -## 6.1.1 (4/9/2019) +## 6.1.1 (2019-04-09) ### New Features * When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry. @@ -272,7 +970,7 @@ * Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction. * Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries. -## 6.1.0 (3/27/2019) +## 6.1.0 (2019-03-27) ### New Features * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. * Added a feature to perform data-block sampling for compressibility, and report stats to user. @@ -290,7 +988,7 @@ * Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms. * Fix SstFileReader not able to open file ingested with write_glbal_seqno=true. -## 6.0.0 (2/19/2019) +## 6.0.0 (2019-02-19) ### New Features * Enabled checkpoint on readonly db (DBImplReadOnly). * Make DB ignore dropped column families while committing results of atomic flush. @@ -332,7 +1030,7 @@ ### Change Default Options * Change options.compaction_pri's default to kMinOverlappingRatio -## 5.18.0 (11/30/2018) +## 5.18.0 (2018-11-30) ### New Features * Introduced `JemallocNodumpAllocator` memory allocator. 
When being use, block cache will be excluded from core dump. * Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. Added per-level perf context for bloom filter and `Get` query. @@ -360,7 +1058,7 @@ * Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously. * The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files. -## 5.17.0 (10/05/2018) +## 5.17.0 (2018-10-05) ### Public API Change * `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero. * Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause stall or not. @@ -374,21 +1072,21 @@ * Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. * Sync CURRENT file contents during checkpoint. -## 5.16.3 (10/1/2018) +## 5.16.3 (2018-10-01) ### Bug Fixes * Fix crash caused when `CompactFiles` run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options. -## 5.16.2 (9/21/2018) +## 5.16.2 (2018-09-21) ### Bug Fixes * Fix bug in partition filters with format_version=4. 
-## 5.16.1 (9/17/2018) +## 5.16.1 (2018-09-17) ### Bug Fixes * Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file. * Fix RocksDB Java build and tests. * Remove sync point in Block destructor. -## 5.16.0 (8/21/2018) +## 5.16.0 (2018-08-21) ### Public API Change * The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons * GetAllKeyVersions() to take an extra argument of `max_num_ikeys`. @@ -402,7 +1100,7 @@ ### Bug Fixes * Fix a bug in misreporting the estimated partition index size in properties block. -## 5.15.0 (7/17/2018) +## 5.15.0 (2018-07-17) ### Public API Change * Remove managed iterator. ReadOptions.managed is not effective anymore. * For bottommost_compression, a compatible CompressionOptions is added via `bottommost_compression_opts`. To keep backward compatible, a new boolean `enabled` is added to CompressionOptions. For compression_opts, it will be always used no matter what value of `enabled` is. For bottommost_compression_opts, it will only be used when user set `enabled=true`, otherwise, compression_opts will be used for bottommost_compression as default. @@ -428,7 +1126,7 @@ * Fix a bug caused by not copying the block trailer with compressed SST file, direct IO, prefetcher and no compressed block cache. * Fix write can stuck indefinitely if enable_pipelined_write=true. The issue exists since pipelined write was introduced in 5.5.0. -## 5.14.0 (5/16/2018) +## 5.14.0 (2018-05-16) ### Public API Change * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. * The background thread naming convention changed (on supporting platforms) to "rocksdb:", e.g., "rocksdb:low0". @@ -461,7 +1159,7 @@ * Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. 
* Added SstFileManager to the Java API to allow managing SST files across DB instances. -## 5.13.0 (3/20/2018) +## 5.13.0 (2018-03-20) ### Public API Change * RocksDBOptionsParser::Parse()'s `ignore_unknown_options` argument will only be effective if the option file shows it is generated using a higher version of RocksDB than the current version. * Remove CompactionEventListener. @@ -477,7 +1175,7 @@ * Fix a leak in prepared_section_completed_ where the zeroed entries would not removed from the map. * Fix WAL corruption caused by race condition between user write thread and backup/checkpoint thread. -## 5.12.0 (2/14/2018) +## 5.12.0 (2018-02-14) ### Public API Change * Iterator::SeekForPrev is now a pure virtual method. This is to prevent user who implement the Iterator interface fail to implement SeekForPrev by mistake. * Add `include_end` option to make the range end exclusive when `include_end == false` in `DeleteFilesInRange()`. @@ -499,7 +1197,7 @@ * Fix advance reservation of arena block addresses. * Fix handling of empty string as checkpoint directory. -## 5.11.0 (01/08/2018) +## 5.11.0 (2018-01-08) ### Public API Change * Add `autoTune` and `getBytesPerSecond()` to RocksJava RateLimiter @@ -516,7 +1214,7 @@ * Fix a mislabel bug for bottom-pri compaction threads. * Fix DB::Flush() keep waiting after flush finish under certain condition. -## 5.10.0 (12/11/2017) +## 5.10.0 (2017-12-11) ### Public API Change * When running `make` with environment variable `USE_SSE` set and `PORTABLE` unset, will use all machine features available locally. Previously this combination only compiled SSE-related features. @@ -531,7 +1229,7 @@ * Fix performance issue in `IngestExternalFile()` affecting databases with large number of SST files. * Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker. 
-## 5.9.0 (11/1/2017) +## 5.9.0 (2017-11-01) ### Public API Change * `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened. * `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default. @@ -558,7 +1256,7 @@ * Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery. * Fix possible metadata corruption in databases using `DeleteRange()`. -## 5.8.0 (08/30/2017) +## 5.8.0 (2017-08-30) ### Public API Change * Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints. * `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr. @@ -578,7 +1276,7 @@ * Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled. * Fix potentially wrong file smallest key when range deletions separated by snapshot are written together. -## 5.7.0 (07/13/2017) +## 5.7.0 (2017-07-13) ### Public API Change * DB property "rocksdb.sstables" now prints keys in hex form. @@ -593,7 +1291,7 @@ ### Bug Fixes * Fix discarding empty compaction output files when `DeleteRange()` is used together with subcompactions. -## 5.6.0 (06/06/2017) +## 5.6.0 (2017-06-06) ### Public API Change * Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads. 
* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction. @@ -610,7 +1308,7 @@ ### Bug Fixes * Shouldn't ignore return value of fsync() in flush. -## 5.5.0 (05/17/2017) +## 5.5.0 (2017-05-17) ### New Features * FIFO compaction to support Intra L0 compaction too with CompactionOptionsFIFO.allow_compaction=true. * DB::ResetStats() to reset internal stats. @@ -627,7 +1325,7 @@ ### Bug Fixes * Fix the bug that Direct I/O uses direct reads for non-SST file -## 5.4.0 (04/11/2017) +## 5.4.0 (2017-04-11) ### Public API Change * random_access_max_buffer_size no longer has any effect * Removed Env::EnableReadAhead(), Env::ShouldForwardRawRequest() @@ -644,7 +1342,7 @@ * Introduce level-based L0->L0 compactions to reduce file count, so write delays are incurred less often. * (Experimental) Partitioning filters which creates an index on the partitions. The feature can be enabled by setting partition_filters when using kFullFilter. Currently the feature also requires two-level indexing to be enabled. Number of partitions is the same as the number of partitions for indexes, which is controlled by metadata_block_size. -## 5.3.0 (03/08/2017) +## 5.3.0 (2017-03-08) ### Public API Change * Remove disableDataSync option. * Remove timeout_hint_us option from WriteOptions. The option has been deprecated and has no effect since 3.13.0. @@ -654,7 +1352,7 @@ ### Bug Fixes * Fix the bug that iterator may skip keys -## 5.2.0 (02/08/2017) +## 5.2.0 (2017-02-08) ### Public API Change * NewLRUCache() will determine number of shard bits automatically based on capacity, if the user doesn't pass one. This also impacts the default block cache when the user doesn't explicit provide one. * Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files. 
@@ -672,7 +1370,7 @@ * Some fixes related to 2PC. * Fix bugs of data corruption in direct I/O -## 5.1.0 (01/13/2017) +## 5.1.0 (2017-01-13) * Support dynamically change `delete_obsolete_files_period_micros` option via SetDBOptions(). * Added EventListener::OnExternalFileIngested which will be called when IngestExternalFile() add a file successfully. * BackupEngine::Open and BackupEngineReadOnly::Open now always return error statuses matching those of the backup Env. @@ -681,7 +1379,7 @@ * Fix the bug that if 2PC is enabled, checkpoints may loss some recent transactions. * When file copying is needed when creating checkpoints or bulk loading files, fsync the file after the file copying. -## 5.0.0 (11/17/2016) +## 5.0.0 (2016-11-17) ### Public API Change * Options::max_bytes_for_level_multiplier is now a double along with all getters and setters. * Support dynamically change `delayed_write_rate` and `max_total_wal_size` options via SetDBOptions(). @@ -700,7 +1398,7 @@ * Add LuaCompactionFilter in utilities. This allows developers to write compaction filters in Lua. To use this feature, LUA_PATH needs to be set to the root directory of Lua. * No longer populate "LATEST_BACKUP" file in backup directory, which formerly contained the number of the latest backup. The latest backup can be determined by finding the highest numbered file in the "meta/" subdirectory. -## 4.13.0 (10/18/2016) +## 4.13.0 (2016-10-18) ### Public API Change * DB::GetOptions() reflect dynamic changed options (i.e. through DB::SetOptions()) and return copy of options instead of reference. * Added Statistics::getAndResetTickerCount(). @@ -709,7 +1407,7 @@ * Add DB::SetDBOptions() to dynamic change base_background_compactions and max_background_compactions. * Added Iterator::SeekForPrev(). This new API will seek to the last key that less than or equal to the target key. 
-## 4.12.0 (9/12/2016) +## 4.12.0 (2016-09-12) ### Public API Change * CancelAllBackgroundWork() flushes all memtables for databases containing writes that have bypassed the WAL (writes issued with WriteOptions::disableWAL=true) before shutting down background threads. * Merge options source_compaction_factor, max_grandparent_overlap_bytes and expanded_compaction_factor into max_compaction_bytes. @@ -721,7 +1419,7 @@ * Change ticker/histogram statistics implementations to accumulate data in thread-local storage, which improves CPU performance by reducing cache coherency costs. Callers of CreateDBStatistics do not need to change anything to use this feature. * Block cache mid-point insertion, where index and filter block are inserted into LRU block cache with higher priority. The feature can be enabled by setting BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority to true and high_pri_pool_ratio > 0 when creating NewLRUCache. -## 4.11.0 (8/1/2016) +## 4.11.0 (2016-08-01) ### Public API Change * options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. When it is set, RocksDB will try to allocate memory from huge page for memtable too, rather than just memtable bloom filter. @@ -729,7 +1427,7 @@ * A tool to migrate DB after options change. See include/rocksdb/utilities/option_change_migration.h. * Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destroying iterators. -## 4.10.0 (7/5/2016) +## 4.10.0 (2016-07-05) ### Public API Change * options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio and deprecate options.memtable_prefix_bloom_probes * enum type CompressionType and PerfLevel changes from char to unsigned char. Value of all PerfLevel shift by one. @@ -741,7 +1439,7 @@ * RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). 
For data written by 4.6 or earlier, RepairDB associates it with the default column family. * Add options.write_buffer_manager which allows users to control total memtable sizes across multiple DB instances. -## 4.9.0 (6/9/2016) +## 4.9.0 (2016-06-09) ### Public API changes * Add bottommost_compression option, This option can be used to set a specific compression algorithm for the bottommost level (Last level containing files in the DB). * Introduce CompactionJobInfo::compression, This field state the compression algorithm used to generate the output files of the compaction. @@ -751,7 +1449,7 @@ ### New Features * Introduce NewSimCache() in rocksdb/utilities/sim_cache.h. This function creates a block cache that is able to give simulation results (mainly hit rate) of simulating block behavior with a configurable cache size. -## 4.8.0 (5/2/2016) +## 4.8.0 (2016-05-02) ### Public API Change * Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes. * Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F @@ -761,12 +1459,12 @@ ### New Features * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size. -## 4.7.0 (4/8/2016) +## 4.7.0 (2016-04-08) ### Public API Change * rename options compaction_measure_io_stats to report_bg_io_stats and include flush too. * Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. 
These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options. -## 4.6.0 (3/10/2016) +## 4.6.0 (2016-03-10) ### Public API Changes * Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier. * Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly. @@ -777,7 +1475,7 @@ * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. * Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" -## 4.5.0 (2/5/2016) +## 4.5.0 (2016-02-05) ### Public API Changes * Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes. * Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll. @@ -788,7 +1486,7 @@ * Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true. * Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate. -## 4.4.0 (1/14/2016) +## 4.4.0 (2016-01-14) ### Public API Changes * Change names in CompactionPri and add a new one. 
* Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit. @@ -798,7 +1496,7 @@ * Increase default options.delayed_write_rate to 2MB/s. * Added a new parameter --path to ldb tool. --path accepts the name of either MANIFEST, SST or a WAL file. Either --db or --path can be used when calling ldb. -## 4.3.0 (12/8/2015) +## 4.3.0 (2015-12-08) ### New Features * CompactionFilter has new member function called IgnoreSnapshots which allows CompactionFilter to be called even if there are snapshots later than the key. * RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions. @@ -808,7 +1506,7 @@ ### Public API Changes * When options.db_write_buffer_size triggers, only the column family with the largest column family size will be flushed, not all the column families. -## 4.2.0 (11/9/2015) +## 4.2.0 (2015-11-09) ### New Features * Introduce CreateLoggerFromOptions(), this function create a Logger for provided DBOptions. * Add GetAggregatedIntProperty(), which returns the sum of the GetIntProperty of all the column families. @@ -821,7 +1519,7 @@ * Remove DefaultCompactionFilterFactory. -## 4.1.0 (10/8/2015) +## 4.1.0 (2015-10-08) ### New Features * Added single delete operation as a more efficient way to delete keys that have not been overwritten. * Added experimental AddFile() to DB interface that allow users to add files created by SstFileWriter into an empty Database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info. @@ -835,7 +1533,7 @@ * CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand. * We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13. 
-## 4.0.0 (9/9/2015) +## 4.0.0 (2015-09-09) ### New Features * Added support for transactions. See include/rocksdb/utilities/transaction.h for more info. * DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used. @@ -848,7 +1546,7 @@ * Added Equal() method to the Comparator interface that can optionally be overwritten in cases where equality comparisons can be done more efficiently than three-way comparisons. * Previous 'experimental' OptimisticTransaction class has been replaced by Transaction class. -## 3.13.0 (8/6/2015) +## 3.13.0 (2015-08-06) ### New Features * RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex * Add NewCompactOnDeletionCollectorFactory() in utilities/table_properties_collectors, which allows rocksdb to mark a SST file as need-compaction when it observes at least D deletion entries in any N consecutive entries in that SST file. Note that this feature depends on an experimental NeedCompact() API --- the result of this API will not persist after DB restart. @@ -863,7 +1561,7 @@ * Add statistics::getHistogramString() to print detailed distribution of a histogram metric. * Add DBOptions::skip_stats_update_on_db_open. When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction. -## 3.12.0 (7/2/2015) +## 3.12.0 (2015-07-02) ### New Features * Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info. * Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds) @@ -893,7 +1591,7 @@ * Add BackupEngineImpl.options_.max_background_operations to specify the maximum number of operations that may be performed in parallel. 
Add support for parallelized backup and restore. * Add DB::SyncWAL() that does a WAL sync without blocking writers. -## 3.11.0 (5/19/2015) +## 3.11.0 (2015-05-19) ### New Features * Added a new API Cache::SetCapacity(size_t capacity) to dynamically change the maximum configured capacity of the cache. If the new capacity is less than the existing cache usage, the implementation will try to lower the usage by evicting the necessary number of elements following a strict LRU policy. * Added an experimental API for handling flashcache devices (blacklists background threads from caching their reads) -- NewFlashcacheAwareEnv @@ -904,7 +1602,7 @@ * TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users. * DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to use this option to sync WAL in the background, please use wal_bytes_per_sync -## 3.10.0 (3/24/2015) +## 3.10.0 (2015-03-24) ### New Features * GetThreadStatus() is now able to report detailed thread status, including: - Thread Operation including flush and compaction. @@ -939,7 +1637,7 @@ * lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`. * Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly. -## 3.9.0 (12/8/2014) +## 3.9.0 (2014-12-08) ### New Features * Add rocksdb::GetThreadList(), which in the future will return the current status of all @@ -958,7 +1656,7 @@ ### Improvements * RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. -## 3.8.0 (11/14/2014) +## 3.8.0 (2014-11-14) ### Public API changes * BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on. 
@@ -972,14 +1670,14 @@ * CompactFiles and EventListener, although they are still in experimental state * Full ColumnFamily support in RocksJava. -## 3.7.0 (11/6/2014) +## 3.7.0 (2014-11-06) ### Public API changes * Introduce SetOptions() API to allow adjusting a subset of options dynamically online * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() * Remove WriteBatchWithIndex.Delete() overloads using SliceParts * When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it. -## 3.6.0 (10/7/2014) +## 3.6.0 (2014-10-07) ### Disk format changes * If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy @@ -992,7 +1690,7 @@ * Change target_file_size_base type to uint64_t from int. * Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. -## 3.5.0 (9/3/2014) +## 3.5.0 (2014-09-03) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: @@ -1003,7 +1701,7 @@ ### Public API changes * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. 
-## 3.4.0 (8/18/2014) +## 3.4.0 (2014-08-18) ### New Features * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. @@ -1019,7 +1717,7 @@ * Add DB::GetIntProperty(), which returns DB properties that are integer as uint64_t. * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. -## 3.3.0 (7/10/2014) +## 3.3.0 (2014-07-10) ### New Features * Added JSON API prototype. * HashLinklist reduces performance outlier caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory(). @@ -1030,7 +1728,7 @@ ### Public API changes * Removed NewTotalOrderPlainTableFactory because it is not used and implemented semantically incorrect. -## 3.2.0 (06/20/2014) +## 3.2.0 (2014-06-20) ### Public API changes * We removed seek compaction as a concept from RocksDB because: @@ -1048,7 +1746,7 @@ ### Performance Improvements * Tailing Iterator re-implemeted with ForwardIterator + Cascading Search Hint , see ~20% throughput improvement. -## 3.1.0 (05/21/2014) +## 3.1.0 (2014-05-21) ### Public API changes * Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories @@ -1057,7 +1755,7 @@ * Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open. * FIFO compaction style -## 3.0.0 (05/05/2014) +## 3.0.0 (2014-05-05) ### Public API changes * Added _LEVEL to all InfoLogLevel enums @@ -1069,7 +1767,7 @@ * Added an option to use different checksum functions in BlockBasedTableOptions * Added ApplyToAllCacheEntries() function to Cache -## 2.8.0 (04/04/2014) +## 2.8.0 (2014-04-04) * Removed arena.h from public header files. 
* By default, checksums are verified on every read from database @@ -1098,7 +1796,7 @@ * Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Geo-spatial support for locations and radial-search. -## 2.7.0 (01/28/2014) +## 2.7.0 (2014-01-28) ### Public API changes diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/INSTALL.md mariadb-10.11.13/storage/rocksdb/rocksdb/INSTALL.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/INSTALL.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/INSTALL.md 2025-05-19 16:14:27.000000000 +0000 @@ -43,6 +43,8 @@ command line flags processing. You can compile rocksdb library even if you don't have gflags installed. +* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html) + * If you wish to build the RocksJava static target, then cmake is required for building Snappy. 
## Supported platforms @@ -94,12 +96,21 @@ sudo yum install libasan * Install zstandard: + * With [EPEL](https://fedoraproject.org/wiki/EPEL): + + sudo yum install libzstd-devel + + * With CentOS 8: + + sudo dnf install libzstd-devel + + * From source: - wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz - mv v1.1.3.tar.gz zstd-1.1.3.tar.gz - tar zxvf zstd-1.1.3.tar.gz - cd zstd-1.1.3 - make && sudo make install + wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz + mv v1.1.3.tar.gz zstd-1.1.3.tar.gz + tar zxvf zstd-1.1.3.tar.gz + cd zstd-1.1.3 + make && sudo make install * **OS X**: * Install latest C++ compiler that supports C++ 11: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md mariadb-10.11.13/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,6 +1,6 @@ This is the list of all known third-party language bindings for RocksDB. If something is missing, please open a pull request to add it. 
-* Java - https://github.com/facebook/rocksdb/tree/master/java +* Java - https://github.com/facebook/rocksdb/tree/main/java * Python * http://python-rocksdb.readthedocs.io/en/latest/ * http://pyrocksdb.readthedocs.org/en/latest/ (unmaintained) @@ -10,7 +10,9 @@ * Ruby - http://rubygems.org/gems/rocksdb-ruby * Haskell - https://hackage.haskell.org/package/rocksdb-haskell * PHP - https://github.com/Photonios/rocksdb-php -* C# - https://github.com/warrenfalk/rocksdb-sharp +* C# + * https://github.com/warrenfalk/rocksdb-sharp + * https://github.com/curiosity-ai/rocksdb-sharp * Rust * https://github.com/pingcap/rust-rocksdb (used in production fork of https://github.com/spacejam/rust-rocksdb) * https://github.com/spacejam/rust-rocksdb diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/Makefile 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -8,6 +8,11 @@ BASH_EXISTS := $(shell which bash) SHELL := $(shell which bash) +# Default to python3. Some distros like CentOS 8 do not have `python`. +ifeq ($(origin PYTHON), undefined) + PYTHON := $(shell which python3 || which python || echo python3) +endif +export PYTHON CLEAN_FILES = # deliberately empty, so we can append below. CFLAGS += ${EXTRA_CFLAGS} @@ -43,60 +48,43 @@ # Set the default DEBUG_LEVEL to 1 DEBUG_LEVEL?=1 -ifeq ($(MAKECMDGOALS),dbg) - DEBUG_LEVEL=2 -endif - -ifeq ($(MAKECMDGOALS),clean) - DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),release) - DEBUG_LEVEL=0 -endif +# LIB_MODE says whether or not to use/build "shared" or "static" libraries. 
+# Mode "static" means to link against static libraries (.a) +# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc) +# +# Set the default LIB_MODE to static +LIB_MODE?=static -ifeq ($(MAKECMDGOALS),shared_lib) - DEBUG_LEVEL=0 -endif +# OBJ_DIR is where the object files reside. Default to the current directory +OBJ_DIR?=. -ifeq ($(MAKECMDGOALS),install-shared) - DEBUG_LEVEL=0 -endif +# Check the MAKECMDGOALS to set the DEBUG_LEVEL and LIB_MODE appropriately -ifeq ($(MAKECMDGOALS),static_lib) +ifneq ($(filter clean release install, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 endif - -ifeq ($(MAKECMDGOALS),install-static) +ifneq ($(filter dbg, $(MAKECMDGOALS)),) + DEBUG_LEVEL=2 +else ifneq ($(filter shared_lib install-shared, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),install) + LIB_MODE=shared +else ifneq ($(filter static_lib install-static, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),rocksdbjavastatic) - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 - endif -endif - -ifeq ($(MAKECMDGOALS),rocksdbjavastaticrelease) - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 + LIB_MODE=static +else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),) + OBJ_DIR=jl + LIB_MODE=shared + ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) + OBJ_DIR=jls + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif + ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) + DEBUG_LEVEL=0 + endif endif endif -ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker) - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 - endif -endif - -ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) - DEBUG_LEVEL=0 -endif - $(info $$DEBUG_LEVEL is ${DEBUG_LEVEL}) # Lite build flag. @@ -116,11 +104,14 @@ # Figure out optimize level. ifneq ($(DEBUG_LEVEL), 2) ifeq ($(LITE), 0) - OPT += -O2 + OPTIMIZE_LEVEL ?= -O2 else - OPT += -Os + OPTIMIZE_LEVEL ?= -Os endif endif +# `OPTIMIZE_LEVEL` is empty when the user does not set it and `DEBUG_LEVEL=2`. 
+# In that case, the compiler default (`-O0` for gcc and clang) will be used. +OPT += $(OPTIMIZE_LEVEL) # compile with -O2 if debug level is not 2 ifneq ($(DEBUG_LEVEL), 2) @@ -143,10 +134,10 @@ HAVE_POWER8=1 endif -ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc+crypto -xc /dev/null 2>&1)) -CXXFLAGS += -march=armv8-a+crc+crypto -CFLAGS += -march=armv8-a+crc+crypto -ARMCRC_SOURCE=1 +# if we're compiling for shared libraries, add the shared flags +ifeq ($(LIB_MODE),shared) +CXXFLAGS += $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL +CFLAGS += $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL endif # if we're compiling for release, compile without debug code (-DNDEBUG) @@ -165,13 +156,35 @@ CXXFLAGS += -fno-rtti endif +ifdef ASSERT_STATUS_CHECKED +# For ASC, turn off constructor elision, preventing the case where a constructor returned +# by a method may pass the ASC check if the status is checked in the inner method. Forcing +# the copy constructor to be invoked disables the optimization and will cause the calling method +# to check the status in order to prevent an error from being raised. +PLATFORM_CXXFLAGS += -fno-elide-constructors +ifeq ($(filter -DROCKSDB_ASSERT_STATUS_CHECKED,$(OPT)),) + OPT += -DROCKSDB_ASSERT_STATUS_CHECKED +endif +endif + $(warning Warning: Compiling in debug mode. Don't use the resulting binary in production) endif +# `USE_LTO=1` enables link-time optimizations. Among other things, this enables +# more devirtualization opportunities and inlining across translation units. +# This can save significant overhead introduced by RocksDB's pluggable +# interfaces/internal abstractions, like in the iterator hierarchy. It works +# better when combined with profile-guided optimizations (not currently +# supported natively in Makefile). 
+ifeq ($(USE_LTO), 1) + CXXFLAGS += -flto + LDFLAGS += -flto -fuse-linker-plugin +endif + #----------------------------------------------- include src.mk -AM_DEFAULT_VERBOSITY = 0 +AM_DEFAULT_VERBOSITY ?= 0 AM_V_GEN = $(am__v_GEN_$(V)) am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) @@ -186,12 +199,16 @@ am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) am__v_CC_0 = @echo " CC " $@; am__v_CC_1 = -CCLD = $(CC) -LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ + AM_V_CCLD = $(am__v_CCLD_$(V)) am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +ifneq ($(SKIP_LINK), 1) am__v_CCLD_0 = @echo " CCLD " $@; am__v_CCLD_1 = +else +am__v_CCLD_0 = @echo " !CCLD " $@; true skip +am__v_CCLD_1 = true skip +endif AM_V_AR = $(am__v_AR_$(V)) am__v_AR_ = $(am__v_AR_$(AM_DEFAULT_VERBOSITY)) am__v_AR_0 = @echo " AR " $@; @@ -199,15 +216,66 @@ ifdef ROCKSDB_USE_LIBRADOS LIB_SOURCES += utilities/env_librados.cc +TEST_MAIN_SOURCES += utilities/env_librados_test.cc LDFLAGS += -lrados endif -AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -# detect what platform we're building on -dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; export PORTABLE="$(PORTABLE)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) +AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(LDFLAGS) -o $@ + +# Detect what platform we're building on. +# Export some common variables that might have been passed as Make variables +# instead of environment variables. 
+dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ + export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ + export LDFLAGS="$(EXTRA_LDFLAGS)"; \ + export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \ + export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \ + export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \ + export PORTABLE="$(PORTABLE)"; \ + export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \ + export USE_CLANG="$(USE_CLANG)"; \ + "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) # this file is generated by the previous line to set build flags and sources include make_config.mk -CLEAN_FILES += make_config.mk + +ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk) +include $(ROCKSDB_PLUGIN_MKS) +ROCKSDB_PLUGIN_SOURCES = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach source, $($(plugin)_SOURCES), plugin/$(plugin)/$(source))) +ROCKSDB_PLUGIN_HEADERS = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach header, $($(plugin)_HEADERS), plugin/$(plugin)/$(header))) +ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_PKGCONFIG_REQUIRES)) +PLATFORM_LDFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_LDFLAGS)) +CXXFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_CXXFLAGS)) + +ifneq ($(strip $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)),) +LDFLAGS := $(LDFLAGS) $(shell pkg-config --libs $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)) +ifneq ($(.SHELLSTATUS),0) +$(error pkg-config failed) +endif +CXXFLAGS := $(CXXFLAGS) $(shell pkg-config --cflags $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)) +ifneq ($(.SHELLSTATUS),0) +$(error pkg-config failed) +endif +endif + +CXXFLAGS += $(ARCHFLAG) + +ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc+crypto -xc /dev/null 2>&1)) +ifneq ($(PLATFORM),OS_MACOSX) +CXXFLAGS += -march=armv8-a+crc+crypto +CFLAGS += -march=armv8-a+crc+crypto +ARMCRC_SOURCE=1 +endif +endif + +export JAVAC_ARGS +CLEAN_FILES += make_config.mk rocksdb.pc + +ifeq ($(V), 1) +$(info $(shell uname -a)) 
+$(info $(shell $(CC) --version)) +$(info $(shell $(CXX) --version)) +endif missing_make_config_paths := $(shell \ grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | \ @@ -216,7 +284,7 @@ done | sort | uniq) $(foreach path, $(missing_make_config_paths), \ - $(warning Warning: $(path) dont exist)) + $(warning Warning: $(path) does not exist)) ifeq ($(PLATFORM), OS_AIX) # no debug info @@ -244,12 +312,37 @@ LUA_PATH = endif +ifeq ($(LIB_MODE),shared) +# So that binaries are executable from build location, in addition to install location +EXEC_LDFLAGS += -Wl,-rpath -Wl,'$$ORIGIN' +endif + +ifeq ($(PLATFORM), OS_MACOSX) +ifeq ($(ARCHFLAG), -arch arm64) +ifneq ($(MACHINE), arm64) +# If we're building on a non-arm64 machine but targeting arm64 Mac, we need to disable +# linking with jemalloc (as it won't be arm64-compatible) and remove some other options +# set during platform detection +DISABLE_JEMALLOC=1 +PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS)) +PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS)) +endif +endif +endif + # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. ifdef COMPILE_WITH_ASAN DISABLE_JEMALLOC=1 EXEC_LDFLAGS += -fsanitize=address PLATFORM_CCFLAGS += -fsanitize=address PLATFORM_CXXFLAGS += -fsanitize=address +ifeq ($(LIB_MODE),shared) +ifdef USE_CLANG +# Fix false ODR violation; see https://github.com/google/sanitizers/issues/1017 + EXEC_LDFLAGS += -mllvm -asan-use-private-alias=1 + PLATFORM_CXXFLAGS += -mllvm -asan-use-private-alias=1 +endif +endif endif # TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc. 
@@ -289,6 +382,12 @@ PLATFORM_CCFLAGS += -DROCKSDB_VALGRIND_RUN PLATFORM_CXXFLAGS += -DROCKSDB_VALGRIND_RUN endif +ifdef ROCKSDB_FULL_VALGRIND_RUN + # Some tests are slow when run under valgrind and are only run when + # explicitly requested via the ROCKSDB_FULL_VALGRIND_RUN compiler flag. + PLATFORM_CCFLAGS += -DROCKSDB_VALGRIND_RUN -DROCKSDB_FULL_VALGRIND_RUN + PLATFORM_CXXFLAGS += -DROCKSDB_VALGRIND_RUN -DROCKSDB_FULL_VALGRIND_RUN +endif ifndef DISABLE_JEMALLOC ifdef JEMALLOC @@ -308,9 +407,14 @@ USE_FOLLY_DISTRIBUTED_MUTEX=0 endif -export GTEST_THROW_ON_FAILURE=1 -export GTEST_HAS_EXCEPTIONS=1 -GTEST_DIR = ./third-party/gtest-1.8.1/fused-src +ifndef GTEST_THROW_ON_FAILURE + export GTEST_THROW_ON_FAILURE=1 +endif +ifndef GTEST_HAS_EXCEPTIONS + export GTEST_HAS_EXCEPTIONS=1 +endif + +GTEST_DIR = third-party/gtest-1.8.1/fused-src # AIX: pre-defined system headers are surrounded by an extern "C" block ifeq ($(PLATFORM), OS_AIX) PLATFORM_CCFLAGS += -I$(GTEST_DIR) @@ -336,6 +440,14 @@ PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) PLATFORM_CXXFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) endif +ifdef TEST_UINT128_COMPAT + PLATFORM_CCFLAGS += -DTEST_UINT128_COMPAT=1 + PLATFORM_CXXFLAGS += -DTEST_UINT128_COMPAT=1 +endif +ifdef ROCKSDB_MODIFY_NPHASH + PLATFORM_CCFLAGS += -DROCKSDB_MODIFY_NPHASH=1 + PLATFORM_CXXFLAGS += -DROCKSDB_MODIFY_NPHASH=1 +endif # This (the first rule) must depend on "all". default: all @@ -343,6 +455,15 @@ WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \ -Wunused-parameter +ifeq (,$(filter amd64, $(MACHINE))) + C_WARNING_FLAGS = -Wstrict-prototypes +endif + +ifdef USE_CLANG + # Used by some teams in Facebook + WARNING_FLAGS += -Wshift-sign-overflow +endif + ifeq ($(PLATFORM), OS_OPENBSD) WARNING_FLAGS += -Wno-unused-lambda-capture endif @@ -382,69 +503,113 @@ CXXFLAGS += -DNO_THREEWAY_CRC32C endif -CFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CCFLAGS) $(OPT) +CFLAGS += $(C_WARNING_FLAGS) $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers LDFLAGS += $(PLATFORM_LDFLAGS) -# If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but -# the file needs to already exist or else the build will fail -ifndef NO_UPDATE_BUILD_VERSION -date := $(shell date +%F) -ifdef FORCE_GIT_SHA - git_sha := $(FORCE_GIT_SHA) -else - git_sha := $(shell git rev-parse HEAD 2>/dev/null) -endif -gen_build_version = sed -e s/@@GIT_SHA@@/$(git_sha)/ -e s/@@GIT_DATE_TIME@@/$(date)/ util/build_version.cc.in - -# Record the version of the source that we are compiling. -# We keep a record of the git revision in this file. It is then built -# as a regular source file as part of the compilation process. -# One can run "strings executable_filename | grep _build_" to find -# the version of the source that we used to build the executable file. 
-FORCE: -util/build_version.cc: FORCE - $(AM_V_GEN)rm -f $@-t - $(AM_V_at)$(gen_build_version) > $@-t - $(AM_V_at)if test -f $@; then \ - cmp -s $@-t $@ && rm -f $@-t || mv -f $@-t $@; \ - else mv -f $@-t $@; fi -endif - -LIBOBJECTS = $(LIB_SOURCES:.cc=.o) +LIB_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES)) +LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ROCKSDB_PLUGIN_SOURCES)) ifeq ($(HAVE_POWER8),1) -LIB_CC_OBJECTS = $(LIB_SOURCES:.cc=.o) -LIBOBJECTS += $(LIB_SOURCES_C:.c=.o) -LIBOBJECTS += $(LIB_SOURCES_ASM:.S=.o) -else -LIB_CC_OBJECTS = $(LIB_SOURCES:.cc=.o) +LIB_OBJECTS += $(patsubst %.c, $(OBJ_DIR)/%.o, $(LIB_SOURCES_C)) +LIB_OBJECTS += $(patsubst %.S, $(OBJ_DIR)/%.o, $(LIB_SOURCES_ASM)) endif -LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) -MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - FOLLYOBJECTS = $(FOLLY_SOURCES:.cpp=.o) + LIB_OBJECTS += $(patsubst %.cpp, $(OBJ_DIR)/%.o, $(FOLLY_SOURCES)) +endif + +# range_tree is not compatible with non GNU libc on ppc64 +# see https://jira.percona.com/browse/PS-7559 +ifneq ($(PPC_LIBC_IS_GNU),0) + LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES)) endif -GTEST = $(GTEST_DIR)/gtest/gtest-all.o -TESTUTIL = ./test_util/testutil.o -TESTHARNESS = ./test_util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) +GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o +TESTUTIL = $(OBJ_DIR)/test_util/testutil.o +TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST) VALGRIND_ERROR = 2 VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full +# Not yet supported: --show-leak-kinds=definite,possible,reachable --errors-for-leak-kinds=definite,possible,reachable -BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) +TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST) +BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, 
$(BENCH_LIB_SOURCES)) +CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES)) +TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES)) +ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES)) +STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES)) + +# Exclude build_version.cc -- a generated source file -- from all sources. Not needed for dependencies +ALL_SOURCES = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc +ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) +ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES) +ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) -ANALYZETOOLOBJECTS = $(ANALYZER_LIB_SOURCES:.cc=.o) +TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES))) +TESTS += $(patsubst %.c, %, $(notdir $(TEST_MAIN_SOURCES_C))) -STRESSTOOLOBJECTS = $(STRESS_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + TESTS += folly_synchronization_distributed_mutex_test + ALL_SOURCES += third-party/folly/folly/synchronization/test/DistributedMutexTest.cc +endif + +# `make check-headers` to very that each header file includes its own +# dependencies +ifneq ($(filter check-headers, $(MAKECMDGOALS)),) +# TODO: add/support JNI headers + DEV_HEADER_DIRS := $(sort include/ hdfs/ $(dir $(ALL_SOURCES))) +# Some headers like in port/ are platform-specific + DEV_HEADERS := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | egrep -v 'port/|plugin/|lua/|range_tree/|tools/rdb/db_wrapper.h|include/rocksdb/utilities/env_librados.h') +else + DEV_HEADERS := +endif +HEADER_OK_FILES = $(patsubst %.h, %.h.ok, $(DEV_HEADERS)) + +AM_V_CCH = $(am__v_CCH_$(V)) +am__v_CCH_ = $(am__v_CCH_$(AM_DEFAULT_VERBOSITY)) +am__v_CCH_0 = @echo " CC.h " $<; +am__v_CCH_1 = + +%.h.ok: %.h # .h.ok not actually 
created, so re-checked on each invocation +# -DROCKSDB_NAMESPACE=42 ensures the namespace header is included + $(AM_V_CCH) echo '#include "$<"' | $(CXX) $(CXXFLAGS) -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null + +check-headers: $(HEADER_OK_FILES) + +# options_settable_test doesn't pass with UBSAN as we use hack in the test +ifdef COMPILE_WITH_UBSAN + TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g') +endif +ifdef ASSERT_STATUS_CHECKED + # TODO: finish fixing all tests to pass this check + TESTS_FAILING_ASC = \ + c_test \ + env_test \ + range_locking_test \ + testutil_test \ + + # Since we have very few ASC exclusions left, excluding them from + # the build is the most convenient way to exclude them from testing + TESTS := $(filter-out $(TESTS_FAILING_ASC),$(TESTS)) +endif + +ROCKSDBTESTS_SUBSET ?= $(TESTS) + +# env_test - suspicious use of test::TmpDir +# deletefile_test - serial because it generates giant temporary files in +# its various tests. Parallel can fill up your /dev/shm +NON_PARALLEL_TEST = \ + env_test \ + deletefile_test \ -EXPOBJECTS = $(LIBOBJECTS) $(TESTUTIL) +PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS)) -TESTS = \ +# Not necessarily well thought out or up-to-date, but matches old list +TESTS_PLATFORM_DEPENDENT := \ db_basic_test \ + db_blob_basic_test \ db_encryption_test \ db_test2 \ external_sst_file_basic_test \ @@ -459,220 +624,115 @@ env_basic_test \ env_test \ env_logger_test \ + io_posix_test \ hash_test \ random_test \ + ribbon_test \ thread_local_test \ + work_queue_test \ rate_limiter_test \ perf_context_test \ iostats_context_test \ db_wal_test \ - db_block_cache_test \ - db_test \ - db_blob_index_test \ - db_iter_test \ - db_iter_stress_test \ - db_log_iter_test \ - db_bloom_filter_test \ - db_compaction_filter_test \ - db_compaction_test \ - db_dynamic_level_test \ - db_flush_test \ - db_inplace_update_test \ - db_iterator_test \ - db_memtable_test \ - db_merge_operator_test \ - 
db_merge_operand_test \ - db_options_test \ - db_range_del_test \ - db_secondary_test \ - db_sst_test \ - db_tailing_iter_test \ - db_io_failure_test \ - db_properties_test \ - db_table_properties_test \ - db_statistics_test \ - db_write_test \ - error_handler_test \ - autovector_test \ - blob_db_test \ - cleanable_test \ - column_family_test \ - table_properties_collector_test \ - arena_test \ - block_test \ - data_block_hash_index_test \ - cache_test \ - corruption_test \ - slice_test \ - slice_transform_test \ - dbformat_test \ - fault_injection_test \ - filelock_test \ - filename_test \ - file_reader_writer_test \ - block_based_filter_block_test \ - full_filter_block_test \ - partitioned_filter_block_test \ - hash_table_test \ - histogram_test \ - log_test \ - manual_compaction_test \ - mock_env_test \ - memtable_list_test \ - merge_helper_test \ - memory_test \ - merge_test \ - merger_test \ - util_merge_operators_test \ - options_file_test \ - reduce_levels_test \ - plain_table_db_test \ - comparator_db_test \ - external_sst_file_test \ - import_column_family_test \ - prefix_test \ - skiplist_test \ - write_buffer_manager_test \ - stringappend_test \ - cassandra_format_test \ - cassandra_functional_test \ - cassandra_row_merge_test \ - cassandra_serialize_test \ - ttl_test \ - backupable_db_test \ - cache_simulator_test \ - sim_cache_test \ - version_edit_test \ - version_set_test \ - compaction_picker_test \ - version_builder_test \ - file_indexer_test \ - write_batch_test \ - write_batch_with_index_test \ - write_controller_test\ - deletefile_test \ - obsolete_files_test \ - table_test \ - delete_scheduler_test \ - options_test \ - options_settable_test \ - options_util_test \ - event_logger_test \ - timer_queue_test \ - cuckoo_table_builder_test \ - cuckoo_table_reader_test \ - cuckoo_table_db_test \ - flush_job_test \ - wal_manager_test \ - listener_test \ - compaction_iterator_test \ - compaction_job_test \ - thread_list_test \ - sst_dump_test \ - 
compact_files_test \ - optimistic_transaction_test \ - write_callback_test \ - heap_test \ - compact_on_deletion_collector_test \ - compaction_job_stats_test \ - option_change_migration_test \ - transaction_test \ - ldb_cmd_test \ - persistent_cache_test \ - statistics_test \ - stats_history_test \ - lru_cache_test \ - object_registry_test \ - repair_test \ - env_timed_test \ - write_prepared_transaction_test \ - write_unprepared_transaction_test \ - db_universal_compaction_test \ - trace_analyzer_test \ - repeatable_thread_test \ - range_tombstone_fragmenter_test \ - range_del_aggregator_test \ - sst_file_reader_test \ - db_secondary_test \ - block_cache_tracer_test \ - block_cache_trace_analyzer_test \ - defer_test \ - -ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - TESTS += folly_synchronization_distributed_mutex_test -endif -PARALLEL_TEST = \ - backupable_db_test \ - db_bloom_filter_test \ - db_compaction_filter_test \ - db_compaction_test \ - db_merge_operator_test \ - db_sst_test \ - db_test \ - db_universal_compaction_test \ - db_wal_test \ - external_sst_file_test \ - import_column_family_test \ - fault_injection_test \ - file_reader_writer_test \ - inlineskiplist_test \ - manual_compaction_test \ - persistent_cache_test \ - table_test \ - transaction_test \ - write_prepared_transaction_test \ - write_unprepared_transaction_test \ +# Sort ROCKSDBTESTS_SUBSET for filtering, except db_test is special (expensive) +# so is placed first (out-of-order) +ROCKSDBTESTS_SUBSET := $(filter db_test, $(ROCKSDBTESTS_SUBSET)) $(sort $(filter-out db_test, $(ROCKSDBTESTS_SUBSET))) -# options_settable_test doesn't pass with UBSAN as we use hack in the test -ifdef COMPILE_WITH_UBSAN - TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g') -endif -SUBSET := $(TESTS) ifdef ROCKSDBTESTS_START - SUBSET := $(shell echo $(SUBSET) | sed 's/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/') + ROCKSDBTESTS_SUBSET := $(shell echo $(ROCKSDBTESTS_SUBSET) | sed 
's/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/') endif ifdef ROCKSDBTESTS_END - SUBSET := $(shell echo $(SUBSET) | sed 's/$(ROCKSDBTESTS_END).*//') + ROCKSDBTESTS_SUBSET := $(shell echo $(ROCKSDBTESTS_SUBSET) | sed 's/$(ROCKSDBTESTS_END).*//') endif -TOOLS = \ - sst_dump \ - db_sanity_test \ - db_stress \ - write_stress \ - ldb \ - db_repl_stress \ - rocksdb_dump \ - rocksdb_undump \ - blob_dump \ - trace_analyzer \ - block_cache_trace_analyzer \ +ifeq ($(ROCKSDBTESTS_PLATFORM_DEPENDENT), only) + ROCKSDBTESTS_SUBSET := $(filter $(TESTS_PLATFORM_DEPENDENT), $(ROCKSDBTESTS_SUBSET)) +else ifeq ($(ROCKSDBTESTS_PLATFORM_DEPENDENT), exclude) + ROCKSDBTESTS_SUBSET := $(filter-out $(TESTS_PLATFORM_DEPENDENT), $(ROCKSDBTESTS_SUBSET)) +endif + +# bench_tool_analyer main is in bench_tool_analyzer_tool, or this would be simpler... +TOOLS = $(patsubst %.cc, %, $(notdir $(patsubst %_tool.cc, %.cc, $(TOOLS_MAIN_SOURCES)))) TEST_LIBS = \ librocksdb_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. 
-BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench filter_bench persistent_cache_bench range_del_aggregator_bench +BENCHMARKS = $(patsubst %.cc, %, $(notdir $(BENCH_MAIN_SOURCES))) + +MICROBENCHS = $(patsubst %.cc, %, $(notdir $(MICROBENCH_SOURCES))) # if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) + LIBNAME=librocksdb # we should only run rocksdb in production with DEBUG_LEVEL 0 -ifeq ($(DEBUG_LEVEL),0) - LIBNAME=librocksdb -else - LIBNAME=librocksdb_debug +ifneq ($(DEBUG_LEVEL),0) + LIBDEBUG=_debug endif endif -LIBRARY = ${LIBNAME}.a -TOOLS_LIBRARY = ${LIBNAME}_tools.a -STRESS_LIBRARY = ${LIBNAME}_stress.a +STATIC_LIBRARY = ${LIBNAME}$(LIBDEBUG).a +STATIC_TEST_LIBRARY = ${LIBNAME}_test$(LIBDEBUG).a +STATIC_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).a +STATIC_STRESS_LIBRARY = ${LIBNAME}_stress$(LIBDEBUG).a + +ALL_STATIC_LIBS = $(STATIC_LIBRARY) $(STATIC_TEST_LIBRARY) $(STATIC_TOOLS_LIBRARY) $(STATIC_STRESS_LIBRARY) + +SHARED_TEST_LIBRARY = ${LIBNAME}_test$(LIBDEBUG).$(PLATFORM_SHARED_EXT) +SHARED_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).$(PLATFORM_SHARED_EXT) +SHARED_STRESS_LIBRARY = ${LIBNAME}_stress$(LIBDEBUG).$(PLATFORM_SHARED_EXT) + +ALL_SHARED_LIBS = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) $(SHARED_TEST_LIBRARY) $(SHARED_TOOLS_LIBRARY) $(SHARED_STRESS_LIBRARY) + +ifeq ($(LIB_MODE),shared) +LIBRARY=$(SHARED1) +TEST_LIBRARY=$(SHARED_TEST_LIBRARY) +TOOLS_LIBRARY=$(SHARED_TOOLS_LIBRARY) +STRESS_LIBRARY=$(SHARED_STRESS_LIBRARY) +CLOUD_LIBRARY=$(SHARED_CLOUD_LIBRARY) +else +LIBRARY=$(STATIC_LIBRARY) +TEST_LIBRARY=$(STATIC_TEST_LIBRARY) +TOOLS_LIBRARY=$(STATIC_TOOLS_LIBRARY) +endif +STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY) ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +# If 
NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but +# the file needs to already exist or else the build will fail +ifndef NO_UPDATE_BUILD_VERSION + +# By default, use the current date-time as the date. If there are no changes, +# we will use the last commit date instead. +build_date := $(shell date "+%Y-%m-%d %T") + +ifdef FORCE_GIT_SHA + git_sha := $(FORCE_GIT_SHA) + git_mod := 1 + git_date := $(build_date) +else + git_sha := $(shell git rev-parse HEAD 2>/dev/null) + git_tag := $(shell git symbolic-ref -q --short HEAD 2> /dev/null || git describe --tags --exact-match 2>/dev/null) + git_mod := $(shell git diff-index HEAD --quiet 2>/dev/null; echo $$?) + git_date := $(shell git log -1 --date=format:"%Y-%m-%d %T" --format="%ad" 2>/dev/null) +endif +gen_build_version = sed -e s/@GIT_SHA@/$(git_sha)/ -e s:@GIT_TAG@:"$(git_tag)": -e s/@GIT_MOD@/"$(git_mod)"/ -e s/@BUILD_DATE@/"$(build_date)"/ -e s/@GIT_DATE@/"$(git_date)"/ util/build_version.cc.in + +# Record the version of the source that we are compiling. +# We keep a record of the git revision in this file. It is then built +# as a regular source file as part of the compilation process. +# One can run "strings executable_filename | grep _build_" to find +# the version of the source that we used to build the executable file. 
+util/build_version.cc: $(filter-out $(OBJ_DIR)/util/build_version.o, $(LIB_OBJECTS)) util/build_version.cc.in + $(AM_V_GEN)rm -f $@-t + $(AM_V_at)$(gen_build_version) > $@ +endif +CLEAN_FILES += util/build_version.cc + default: all #----------------------------------------------- @@ -681,7 +741,7 @@ ifneq ($(PLATFORM_SHARED_EXT),) ifneq ($(PLATFORM_SHARED_VERSIONED),true) -SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) +SHARED1 = ${LIBNAME}$(LIBDEBUG).$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1) SHARED3 = $(SHARED1) SHARED4 = $(SHARED1) @@ -692,7 +752,7 @@ SHARED_PATCH = $(ROCKSDB_PATCH) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) ifeq ($(PLATFORM), OS_MACOSX) -SHARED_OSX = $(LIBNAME).$(SHARED_MAJOR) +SHARED_OSX = $(LIBNAME)$(LIBDEBUG).$(SHARED_MAJOR) SHARED2 = $(SHARED_OSX).$(PLATFORM_SHARED_EXT) SHARED3 = $(SHARED_OSX).$(SHARED_MINOR).$(PLATFORM_SHARED_EXT) SHARED4 = $(SHARED_OSX).$(SHARED_MINOR).$(SHARED_PATCH).$(PLATFORM_SHARED_EXT) @@ -700,61 +760,35 @@ SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH) -endif +endif # MACOSX SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) -$(SHARED1): $(SHARED4) +$(SHARED1): $(SHARED4) $(SHARED2) ln -fs $(SHARED4) $(SHARED1) -$(SHARED2): $(SHARED4) +$(SHARED2): $(SHARED4) $(SHARED3) ln -fs $(SHARED4) $(SHARED2) $(SHARED3): $(SHARED4) ln -fs $(SHARED4) $(SHARED3) -endif -ifeq ($(HAVE_POWER8),1) -SHARED_C_OBJECTS = $(LIB_SOURCES_C:.c=.o) -SHARED_ASM_OBJECTS = $(LIB_SOURCES_ASM:.S=.o) -SHARED_C_LIBOBJECTS = $(patsubst %.o,shared-objects/%.o,$(SHARED_C_OBJECTS)) -SHARED_ASM_LIBOBJECTS = $(patsubst %.o,shared-objects/%.o,$(SHARED_ASM_OBJECTS)) -shared_libobjects = $(patsubst %,shared-objects/%,$(LIB_CC_OBJECTS)) -else -shared_libobjects = $(patsubst %,shared-objects/%,$(LIBOBJECTS)) -endif - -CLEAN_FILES += shared-objects -shared_all_libobjects = $(shared_libobjects) - -ifeq ($(HAVE_POWER8),1) -shared-ppc-objects = 
$(SHARED_C_LIBOBJECTS) $(SHARED_ASM_LIBOBJECTS) - -shared-objects/util/crc32c_ppc.o: util/crc32c_ppc.c - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ - -shared-objects/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -endif -$(shared_libobjects): shared-objects/%.o: %.cc - $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ - -ifeq ($(HAVE_POWER8),1) -shared_all_libobjects = $(shared_libobjects) $(shared-ppc-objects) -endif -$(SHARED4): $(shared_all_libobjects) - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(shared_all_libobjects) $(LDFLAGS) -o $@ +endif # PLATFORM_SHARED_VERSIONED +$(SHARED4): $(LIB_OBJECTS) + $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(LIB_OBJECTS) $(LDFLAGS) -o $@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ release tags tags0 valgrind_check whitebox_crash_test format static_lib shared_lib all \ - dbg rocksdbjavastatic rocksdbjava install install-static install-shared uninstall \ - analyze tools tools_lib \ + dbg rocksdbjavastatic rocksdbjava gen-pc install install-static install-shared uninstall \ + analyze tools tools_lib check-headers \ blackbox_crash_test_with_atomic_flush whitebox_crash_test_with_atomic_flush \ - blackbox_crash_test_with_txn whitebox_crash_test_with_txn + blackbox_crash_test_with_txn whitebox_crash_test_with_txn \ + blackbox_crash_test_with_best_efforts_recovery \ + blackbox_crash_test_with_ts whitebox_crash_test_with_ts all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS) -all_but_some_tests: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(SUBSET) +all_but_some_tests: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(ROCKSDBTESTS_SUBSET) -static_lib: $(LIBRARY) +static_lib: $(STATIC_LIBRARY) shared_lib: $(SHARED) @@ -766,19 +800,22 @@ test_libs: $(TEST_LIBS) +benchmarks: $(BENCHMARKS) + +microbench: $(MICROBENCHS) + for t in 
$(MICROBENCHS); do echo "===== Running benchmark $$t (`date`)"; ./$$t || exit 1; done; + dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS) -# creates static library and programs -release: - $(MAKE) clean - DEBUG_LEVEL=0 $(MAKE) static_lib tools db_bench +# creates library and programs +release: clean + LIB_MODE=$(LIB_MODE) DEBUG_LEVEL=0 $(MAKE) $(LIBRARY) tools db_bench -coverage: - $(MAKE) clean +coverage: clean COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) J=1 all check cd coverage && ./coverage_test.sh # Delete intermediate files - $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm -f {} \; ifneq (,$(filter check parallel_check,$(MAKECMDGOALS)),) # Use /dev/shm if it has the sticky bit set (otherwise, /tmp), @@ -824,14 +861,11 @@ $(parallel_tests): $(PARALLEL_TEST) $(AM_V_at)TEST_BINARY=$(patsubst parallel_%,%,$@); \ TEST_NAMES=` \ - ./$$TEST_BINARY --gtest_list_tests \ - | perl -n \ - -e 's/ *\#.*//;' \ - -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};' \ - -e 'print qq! 
$$p$$2!'`; \ + (./$$TEST_BINARY --gtest_list_tests || echo " $${TEST_BINARY}__list_tests_failure") \ + | awk '/^[^ ]/ { prefix = $$1 } /^[ ]/ { print prefix $$1 }'`; \ + echo " Generating parallel test scripts for $$TEST_BINARY"; \ for TEST_NAME in $$TEST_NAMES; do \ TEST_SCRIPT=t/run-$$TEST_BINARY-$${TEST_NAME//\//-}; \ - echo " GEN " $$TEST_SCRIPT; \ printf '%s\n' \ '#!/bin/sh' \ "d=\$(TMPD)$$TEST_SCRIPT" \ @@ -843,7 +877,7 @@ gen_parallel_tests: $(AM_V_at)mkdir -p t - $(AM_V_at)rm -f t/run-* + $(AM_V_at)$(FIND) t -type f -name 'run-*' -exec rm -f {} \; $(MAKE) $(parallel_tests) # Reorder input lines (which are one per test) so that the @@ -863,7 +897,7 @@ # 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest # slow_test_regexp = \ - ^.*SnapshotConcurrentAccessTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ + ^.*SnapshotConcurrentAccessTest.*$$|^.*SeqAdvanceConcurrentTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ prioritize_long_running_tests = \ perl -pe 's,($(slow_test_regexp)),100 $$1,' \ | sort -k1,1gr \ @@ -878,6 +912,19 @@ # Use this regexp to select the subset of tests whose names match. tests-regexp = . +EXCLUDE_TESTS_REGEX ?= "^$$" + +ifeq ($(PRINT_PARALLEL_OUTPUTS), 1) + parallel_redir = +else ifeq ($(QUIET_PARALLEL_TESTS), 1) + parallel_redir = >& t/$(test_log_prefix)log-{/} +else +# Default: print failure output only, as it happens +# Note: gnu_parallel --eta is now always used, but has been modified to provide +# only infrequent updates when not connected to a terminal. (CircleCI will +# kill a job if no output for 10min.) + parallel_redir = >& t/$(test_log_prefix)log-{/} || bash -c "cat t/$(test_log_prefix)log-{/}; exit $$?" 
+endif .PHONY: check_0 check_0: @@ -885,34 +932,38 @@ printf '%s\n' '' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ - test -t 1 && eta=--eta || eta=; \ { \ printf './%s\n' $(filter-out $(PARALLEL_TEST),$(TESTS)); \ find t -name 'run-*' -print; \ } \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ - | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG $$eta --gnu '{} >& t/log-{/}' + | grep -E -v '$(EXCLUDE_TESTS_REGEX)' \ + | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu '{} $(parallel_redir)' ; \ + parallel_retcode=$$? ; \ + awk '{ if ($$7 != 0 || $$8 != 0) { if ($$7 == "Exitval") { h = $$0; } else { if (!f) print h; print; f = 1 } } } END { if(f) exit 1; }' < LOG ; \ + awk_retcode=$$?; \ + if [ $$parallel_retcode -ne 0 ] || [ $$awk_retcode -ne 0 ] ; then exit 1 ; fi -valgrind-blacklist-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest +valgrind-exclude-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest .PHONY: valgrind_check_0 +valgrind_check_0: test_log_prefix := valgrind_ valgrind_check_0: $(AM_V_GEN)export TEST_TMPDIR=$(TMPD); 
\ printf '%s\n' '' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ - test -t 1 && eta=--eta || eta=; \ { \ printf './%s\n' $(filter-out $(PARALLEL_TEST) %skiplist_test options_settable_test, $(TESTS)); \ find t -name 'run-*' -print; \ } \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ - | grep -E -v '$(valgrind-blacklist-regexp)' \ - | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG $$eta --gnu \ - '(if [[ "{}" == "./"* ]] ; then $(DRIVER) {}; else {}; fi) ' \ - '>& t/valgrind_log-{/}' + | grep -E -v '$(valgrind-exclude-regexp)' \ + | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu \ + '(if [[ "{}" == "./"* ]] ; then $(DRIVER) {}; else {}; fi) \ + $(parallel_redir)' \ CLEAN_FILES += t LOG $(TMPD) @@ -926,6 +977,9 @@ watch-log: $(WATCH) --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)' +dump-log: + bash -c '$(quoted_perl_command)' < LOG + # If J != 1 and GNU parallel is installed, run the tests in parallel, # via the check_0 rule above. Otherwise, run them sequentially. 
check: all @@ -937,102 +991,160 @@ $(MAKE) T="$$t" TMPD=$(TMPD) check_0; \ else \ for t in $(TESTS); do \ - echo "===== Running $$t"; ./$$t || exit 1; done; \ + echo "===== Running $$t (`date`)"; ./$$t || exit 1; done; \ fi rm -rf $(TMPD) ifneq ($(PLATFORM), OS_AIX) - python tools/check_all_python.py + $(PYTHON) tools/check_all_python.py ifeq ($(filter -DROCKSDB_LITE,$(OPT)),) - python tools/ldb_test.py +ifndef ASSERT_STATUS_CHECKED # not yet working with these tests + $(PYTHON) tools/ldb_test.py sh tools/rocksdb_dump_test.sh endif endif +endif +ifndef SKIP_FORMAT_BUCK_CHECKS + $(MAKE) check-format + $(MAKE) check-buck-targets + $(MAKE) check-sources +endif # TODO add ldb_tests -check_some: $(SUBSET) - for t in $(SUBSET); do echo "===== Running $$t"; ./$$t || exit 1; done +check_some: $(ROCKSDBTESTS_SUBSET) + for t in $(ROCKSDBTESTS_SUBSET); do echo "===== Running $$t (`date`)"; ./$$t || exit 1; done .PHONY: ldb_tests ldb_tests: ldb - python tools/ldb_test.py - -crash_test: whitebox_crash_test blackbox_crash_test + $(PYTHON) tools/ldb_test.py -crash_test_with_atomic_flush: whitebox_crash_test_with_atomic_flush blackbox_crash_test_with_atomic_flush - -crash_test_with_txn: whitebox_crash_test_with_txn blackbox_crash_test_with_txn +crash_test: +# Do not parallelize + $(MAKE) whitebox_crash_test + $(MAKE) blackbox_crash_test + +crash_test_with_atomic_flush: +# Do not parallelize + $(MAKE) whitebox_crash_test_with_atomic_flush + $(MAKE) blackbox_crash_test_with_atomic_flush + +crash_test_with_txn: +# Do not parallelize + $(MAKE) whitebox_crash_test_with_txn + $(MAKE) blackbox_crash_test_with_txn + +crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery + +crash_test_with_ts: +# Do not parallelize + $(MAKE) whitebox_crash_test_with_ts + $(MAKE) blackbox_crash_test_with_ts blackbox_crash_test: db_stress - python -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) - python -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) 
+ $(PYTHON) -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) + $(PYTHON) -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS) + $(PYTHON) -u tools/db_crashtest.py --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_txn: db_stress - python -u tools/db_crashtest.py --txn blackbox $(CRASH_TEST_EXT_ARGS) + $(PYTHON) -u tools/db_crashtest.py --txn blackbox $(CRASH_TEST_EXT_ARGS) + +blackbox_crash_test_with_best_efforts_recovery: db_stress + $(PYTHON) -u tools/db_crashtest.py --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS) + +blackbox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts blackbox $(CRASH_TEST_EXT_ARGS) ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 endif whitebox_crash_test: db_stress - python -u tools/db_crashtest.py --simple whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py --simple whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) - python -u tools/db_crashtest.py whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) whitebox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --cf_consistency whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py --cf_consistency whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) whitebox_crash_test_with_txn: db_stress - python -u tools/db_crashtest.py --txn whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py --txn whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) -asan_check: - $(MAKE) clean +whitebox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + +asan_check: clean 
COMPILE_WITH_ASAN=1 $(MAKE) check -j32 $(MAKE) clean -asan_crash_test: - $(MAKE) clean +asan_crash_test: clean COMPILE_WITH_ASAN=1 $(MAKE) crash_test $(MAKE) clean -asan_crash_test_with_atomic_flush: +whitebox_asan_crash_test: clean + COMPILE_WITH_ASAN=1 $(MAKE) whitebox_crash_test $(MAKE) clean - COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_atomic_flush + +blackbox_asan_crash_test: clean + COMPILE_WITH_ASAN=1 $(MAKE) blackbox_crash_test $(MAKE) clean -asan_crash_test_with_txn: +asan_crash_test_with_atomic_flush: clean + COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_atomic_flush $(MAKE) clean + +asan_crash_test_with_txn: clean COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_txn $(MAKE) clean -ubsan_check: +asan_crash_test_with_best_efforts_recovery: clean + COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_best_efforts_recovery $(MAKE) clean + +ubsan_check: clean COMPILE_WITH_UBSAN=1 $(MAKE) check -j32 $(MAKE) clean -ubsan_crash_test: - $(MAKE) clean +ubsan_crash_test: clean COMPILE_WITH_UBSAN=1 $(MAKE) crash_test $(MAKE) clean -ubsan_crash_test_with_atomic_flush: +whitebox_ubsan_crash_test: clean + COMPILE_WITH_UBSAN=1 $(MAKE) whitebox_crash_test $(MAKE) clean - COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_atomic_flush + +blackbox_ubsan_crash_test: clean + COMPILE_WITH_UBSAN=1 $(MAKE) blackbox_crash_test $(MAKE) clean -ubsan_crash_test_with_txn: +ubsan_crash_test_with_atomic_flush: clean + COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_atomic_flush $(MAKE) clean + +ubsan_crash_test_with_txn: clean COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_txn $(MAKE) clean +ubsan_crash_test_with_best_efforts_recovery: clean + COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_best_efforts_recovery + $(MAKE) clean + +full_valgrind_test: + ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check + +full_valgrind_test_some: + ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some + valgrind_test: ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check 
+valgrind_test_some: + ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some + valgrind_check: $(TESTS) $(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests $(AM_V_GEN)if test "$(J)" != 1 \ @@ -1051,12 +1163,20 @@ done; \ fi +valgrind_check_some: $(ROCKSDBTESTS_SUBSET) + for t in $(ROCKSDBTESTS_SUBSET); do \ + $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ + ret_code=$$?; \ + if [ $$ret_code -ne 0 ]; then \ + exit $$ret_code; \ + fi; \ + done ifneq ($(PAR_TEST),) parloop: ret_bad=0; \ for t in $(PAR_TEST); do \ - echo "===== Running $$t in parallel $(NUM_PAR)";\ + echo "===== Running $$t in parallel $(NUM_PAR) (`date`)";\ if [ $(db_test) -eq 1 ]; then \ seq $(J) | v="$$t" build_tools/gnu_parallel --gnu --plain 's=$(TMPD)/rdb-{}; export TEST_TMPDIR=$$s;' \ 'timeout 2m ./db_test --gtest_filter=$$v >> $$s/log-{} 2>1'; \ @@ -1108,22 +1228,22 @@ $(MAKE) dbg CLEAN_FILES += unity.cc -unity.cc: Makefile +unity.cc: Makefile util/build_version.cc.in rm -f $@ $@-t + $(AM_V_at)$(gen_build_version) > util/build_version.cc for source_file in $(LIB_SOURCES); do \ echo "#include \"$$source_file\"" >> $@-t; \ done chmod a=r $@-t mv $@-t $@ -unity.a: unity.o +unity.a: $(OBJ_DIR)/unity.o $(AM_V_AR)rm -f $@ - $(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o + $(AM_V_at)$(AR) $(ARFLAGS) $@ $(OBJ_DIR)/unity.o -TOOLLIBOBJECTS = $(TOOL_LIB_SOURCES:.cc=.o) # try compiling db_test with unity -unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) $(TOOLLIBOBJECTS) unity.a +unity_test: $(OBJ_DIR)/db/db_basic_test.o $(OBJ_DIR)/db/db_test_util.o $(TEST_OBJECTS) $(TOOL_OBJECTS) unity.a $(AM_LINK) ./unity_test @@ -1135,12 +1255,15 @@ clean-not-downloaded: clean-ext-libraries-bin clean-rocks clean-not-downloaded-rocksjava clean-rocks: - rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(LIBRARY) $(SHARED) + echo shared=$(ALL_SHARED_LIBS) + echo static=$(ALL_STATIC_LIBS) + rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(ALL_STATIC_LIBS) $(ALL_SHARED_LIBS) 
$(MICROBENCHS) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report $(FIND) . -name "*.[oda]" -exec rm -f {} \; - $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm -f {} \; clean-rocksjava: + rm -rf jl jls cd java && $(MAKE) clean clean-not-downloaded-rocksjava: @@ -1167,603 +1290,769 @@ format: build_tools/format-diff.sh +check-format: + build_tools/format-diff.sh -c + +check-buck-targets: + buckifier/check_buck_targets.sh + +check-sources: + build_tools/check-sources.sh + package: bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- -$(LIBRARY): $(LIBOBJECTS) - $(AM_V_AR)rm -f $@ - $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) +$(STATIC_LIBRARY): $(LIB_OBJECTS) + $(AM_V_AR)rm -f $@ $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) + $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIB_OBJECTS) -$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) - $(AM_V_AR)rm -f $@ +$(STATIC_TEST_LIBRARY): $(TEST_OBJECTS) + $(AM_V_AR)rm -f $@ $(SHARED_TEST_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -$(STRESS_LIBRARY): $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) $(STRESS_LIB_SOURCES:.cc=.o) - $(AM_V_AR)rm -f $@ +$(STATIC_TOOLS_LIBRARY): $(TOOL_OBJECTS) + $(AM_V_AR)rm -f $@ $(SHARED_TOOLS_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -librocksdb_env_basic_test.a: env/env_basic_test.o $(LIBOBJECTS) $(TESTHARNESS) +$(STATIC_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) + $(AM_V_AR)rm -f $@ $(SHARED_STRESS_LIBRARY) + $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ + +$(SHARED_TEST_LIBRARY): $(TEST_OBJECTS) $(SHARED1) + $(AM_V_AR)rm -f $@ $(STATIC_TEST_LIBRARY) + $(AM_SHARE) + +$(SHARED_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(SHARED1) + 
$(AM_V_AR)rm -f $@ $(STATIC_TOOLS_LIBRARY) + $(AM_SHARE) + +$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(SHARED_TOOLS_LIBRARY) $(SHARED1) + $(AM_V_AR)rm -f $@ $(STATIC_STRESS_LIBRARY) + $(AM_SHARE) + +librocksdb_env_basic_test.a: $(OBJ_DIR)/env/env_basic_test.o $(LIB_OBJECTS) $(TESTHARNESS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) +db_bench: $(OBJ_DIR)/tools/db_bench.o $(BENCH_OBJECTS) $(TESTUTIL) $(LIBRARY) $(AM_LINK) -trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) +trace_analyzer: $(OBJ_DIR)/tools/trace_analyzer.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) +block_cache_trace_analyzer: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) -folly_synchronization_distributed_mutex_test: $(LIBOBJECTS) $(TESTHARNESS) $(FOLLYOBJECTS) third-party/folly/folly/synchronization/test/DistributedMutexTest.o +folly_synchronization_distributed_mutex_test: $(OBJ_DIR)/third-party/folly/folly/synchronization/test/DistributedMutexTest.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) endif -cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) +cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY) + $(AM_LINK) + +persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY) + $(AM_LINK) + +memtablerep_bench: $(OBJ_DIR)/memtable/memtablerep_bench.o $(LIBRARY) + $(AM_LINK) + +filter_bench: $(OBJ_DIR)/util/filter_bench.o $(LIBRARY) $(AM_LINK) -persistent_cache_bench: utilities/persistent_cache/persistent_cache_bench.o $(LIBOBJECTS) $(TESTUTIL) +db_stress: $(OBJ_DIR)/db_stress_tool/db_stress.o $(STRESS_LIBRARY) $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) 
-memtablerep_bench: memtable/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) +write_stress: $(OBJ_DIR)/tools/write_stress.o $(LIBRARY) $(AM_LINK) -filter_bench: util/filter_bench.o $(LIBOBJECTS) $(TESTUTIL) +db_sanity_test: $(OBJ_DIR)/tools/db_sanity_test.o $(LIBRARY) $(AM_LINK) -db_stress: db_stress_tool/db_stress.o $(STRESSTOOLOBJECTS) +db_repl_stress: $(OBJ_DIR)/tools/db_repl_stress.o $(LIBRARY) $(AM_LINK) -write_stress: tools/write_stress.o $(LIBOBJECTS) $(TESTUTIL) +arena_test: $(OBJ_DIR)/memory/arena_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) +memory_allocator_test: memory/memory_allocator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) +autovector_test: $(OBJ_DIR)/util/autovector_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -arena_test: memory/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) +column_family_test: $(OBJ_DIR)/db/column_family_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) +table_properties_collector_test: $(OBJ_DIR)/db/table_properties_collector_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -column_family_test: db/column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +bloom_test: $(OBJ_DIR)/util/bloom_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) +dynamic_bloom_test: $(OBJ_DIR)/util/dynamic_bloom_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) +c_test: $(OBJ_DIR)/db/c_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) +cache_test: $(OBJ_DIR)/cache/cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) +coding_test: $(OBJ_DIR)/util/coding_test.o $(TEST_LIBRARY) $(LIBRARY) 
$(AM_LINK) -cache_test: cache/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) +hash_test: $(OBJ_DIR)/util/hash_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) +random_test: $(OBJ_DIR)/util/random_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -hash_test: util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS) +ribbon_test: $(OBJ_DIR)/util/ribbon_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -random_test: util/random_test.o $(LIBOBJECTS) $(TESTHARNESS) +option_change_migration_test: $(OBJ_DIR)/utilities/option_change_migration/option_change_migration_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -option_change_migration_test: utilities/option_change_migration/option_change_migration_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +stringappend_test: $(OBJ_DIR)/utilities/merge_operators/string_append/stringappend_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_format_test: utilities/cassandra/cassandra_format_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_functional_test: utilities/cassandra/cassandra_functional_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_row_merge_test: utilities/cassandra/cassandra_row_merge_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_serialize_test: 
$(OBJ_DIR)/utilities/cassandra/cassandra_serialize_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_serialize_test: utilities/cassandra/cassandra_serialize_test.o $(LIBOBJECTS) $(TESTHARNESS) +hash_table_test: $(OBJ_DIR)/utilities/persistent_cache/hash_table_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -hash_table_test: utilities/persistent_cache/hash_table_test.o $(LIBOBJECTS) $(TESTHARNESS) +histogram_test: $(OBJ_DIR)/monitoring/histogram_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -histogram_test: monitoring/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) +thread_local_test: $(OBJ_DIR)/util/thread_local_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) +work_queue_test: $(OBJ_DIR)/util/work_queue_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -corruption_test: db/corruption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +corruption_test: $(OBJ_DIR)/db/corruption_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) +crc32c_test: $(OBJ_DIR)/util/crc32c_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -slice_test: util/slice_test.o $(LIBOBJECTS) $(TESTHARNESS) +slice_test: $(OBJ_DIR)/util/slice_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) +slice_transform_test: $(OBJ_DIR)/util/slice_transform_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_basic_test: db/db_basic_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_basic_test: $(OBJ_DIR)/db/db_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_encryption_test: db/db_encryption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_blob_basic_test: $(OBJ_DIR)/db/blob/db_blob_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_blob_compaction_test: $(OBJ_DIR)/db/blob/db_blob_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) 
$(AM_LINK) -db_test2: db/db_test2.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_blob_index_test: db/db_blob_index_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_with_timestamp_compaction_test: db/db_with_timestamp_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_block_cache_test: db/db_block_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_encryption_test: $(OBJ_DIR)/db/db_encryption_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_bloom_filter_test: db/db_bloom_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_log_iter_test: db/db_log_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_compaction_filter_test: db/db_compaction_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_logical_block_size_cache_test: $(OBJ_DIR)/db/db_logical_block_size_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_compaction_test: db/db_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_blob_index_test: $(OBJ_DIR)/db/blob/db_blob_index_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_dynamic_level_test: db/db_dynamic_level_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_block_cache_test: $(OBJ_DIR)/db/db_block_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_flush_test: db/db_flush_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_bloom_filter_test: $(OBJ_DIR)/db/db_bloom_filter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_inplace_update_test: db/db_inplace_update_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_log_iter_test: $(OBJ_DIR)/db/db_log_iter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_iterator_test: db/db_iterator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) 
+db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_memtable_test: db/db_memtable_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_merge_operator_test: db/db_merge_operator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_merge_operand_test: db/db_merge_operand_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_flush_test: $(OBJ_DIR)/db/db_flush_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_inplace_update_test: $(OBJ_DIR)/db/db_inplace_update_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_range_del_test: db/db_range_del_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_iterator_test: $(OBJ_DIR)/db/db_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_sst_test: db/db_sst_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_kv_checksum_test: $(OBJ_DIR)/db/db_kv_checksum_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_statistics_test: db/db_statistics_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_memtable_test: $(OBJ_DIR)/db/db_memtable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_write_test: db/db_write_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_merge_operator_test: $(OBJ_DIR)/db/db_merge_operator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -error_handler_test: db/error_handler_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_merge_operand_test: $(OBJ_DIR)/db/db_merge_operand_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -external_sst_file_basic_test: db/external_sst_file_basic_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_options_test: $(OBJ_DIR)/db/db_options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -external_sst_file_test: 
db/external_sst_file_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_range_del_test: $(OBJ_DIR)/db/db_range_del_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -import_column_family_test: db/import_column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_sst_test: $(OBJ_DIR)/db/db_sst_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_statistics_test: $(OBJ_DIR)/db/db_statistics_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_iter_test: db/db_iter_test.o $(LIBOBJECTS) $(TESTHARNESS) +db_write_test: $(OBJ_DIR)/db/db_write_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_iter_stress_test: db/db_iter_stress_test.o $(LIBOBJECTS) $(TESTHARNESS) +error_handler_fs_test: $(OBJ_DIR)/db/error_handler_fs_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +external_sst_file_basic_test: $(OBJ_DIR)/db/external_sst_file_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_wal_test: db/db_wal_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +external_sst_file_test: $(OBJ_DIR)/db/external_sst_file_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_io_failure_test: db/db_io_failure_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +import_column_family_test: $(OBJ_DIR)/db/import_column_family_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_properties_test: db/db_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_tailing_iter_test: $(OBJ_DIR)/db/db_tailing_iter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_table_properties_test: db/db_table_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_iter_test: $(OBJ_DIR)/db/db_iter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) +db_iter_stress_test: $(OBJ_DIR)/db/db_iter_stress_test.o $(TEST_LIBRARY) 
$(LIBRARY) + $(AM_LINK) + +db_universal_compaction_test: $(OBJ_DIR)/db/db_universal_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_wal_test: $(OBJ_DIR)/db/db_wal_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_io_failure_test: $(OBJ_DIR)/db/db_io_failure_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_properties_test: $(OBJ_DIR)/db/db_properties_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_table_properties_test: $(OBJ_DIR)/db/db_table_properties_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +log_write_bench: $(OBJ_DIR)/util/log_write_bench.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) $(PROFILING_FLAGS) -plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +plain_table_db_test: $(OBJ_DIR)/db/plain_table_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +comparator_db_test: $(OBJ_DIR)/db/comparator_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) +table_reader_bench: $(OBJ_DIR)/table/table_reader_bench.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) $(PROFILING_FLAGS) -perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) +perf_context_test: $(OBJ_DIR)/db/perf_context_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) -prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) +prefix_test: $(OBJ_DIR)/db/prefix_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) -backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +backupable_db_test: $(OBJ_DIR)/utilities/backupable/backupable_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS) +checkpoint_test: $(OBJ_DIR)/utilities/checkpoint/checkpoint_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) 
-cache_simulator_test: utilities/simulator_cache/cache_simulator_test.o $(LIBOBJECTS) $(TESTHARNESS) +cache_simulator_test: $(OBJ_DIR)/utilities/simulator_cache/cache_simulator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +sim_cache_test: $(OBJ_DIR)/utilities/simulator_cache/sim_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_mirror_test: utilities/env_mirror_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_mirror_test: $(OBJ_DIR)/utilities/env_mirror_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_timed_test: utilities/env_timed_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_timed_test: $(OBJ_DIR)/utilities/env_timed_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) ifdef ROCKSDB_USE_LIBRADOS -env_librados_test: utilities/env_librados_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +env_librados_test: $(OBJ_DIR)/utilities/env_librados_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) endif -object_registry_test: utilities/object_registry_test.o $(LIBOBJECTS) $(TESTHARNESS) +object_registry_test: $(OBJ_DIR)/utilities/object_registry_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +ttl_test: $(OBJ_DIR)/utilities/ttl/ttl_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +write_batch_with_index_test: $(OBJ_DIR)/utilities/write_batch_with_index/write_batch_with_index_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +flush_job_test: $(OBJ_DIR)/db/flush_job_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_iterator_test: $(OBJ_DIR)/db/compaction/compaction_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_job_test: $(OBJ_DIR)/db/compaction/compaction_job_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_job_stats_test: $(OBJ_DIR)/db/compaction/compaction_job_stats_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_service_test: 
$(OBJ_DIR)/db/compaction/compaction_service_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compact_on_deletion_collector_test: $(OBJ_DIR)/utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +wal_manager_test: $(OBJ_DIR)/db/wal_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) +wal_edit_test: $(OBJ_DIR)/db/wal_edit_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +dbformat_test: $(OBJ_DIR)/db/dbformat_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_basic_test: $(OBJ_DIR)/env/env_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_iterator_test: db/compaction/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_test: $(OBJ_DIR)/env/env_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_job_test: db/compaction/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +io_posix_test: $(OBJ_DIR)/env/io_posix_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_job_stats_test: db/compaction/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) +fault_injection_test: $(OBJ_DIR)/db/fault_injection_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compact_on_deletion_collector_test: utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) +rate_limiter_test: $(OBJ_DIR)/util/rate_limiter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) +delete_scheduler_test: $(OBJ_DIR)/file/delete_scheduler_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) +filename_test: $(OBJ_DIR)/db/filename_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_basic_test: env/env_basic_test.o 
$(LIBOBJECTS) $(TESTHARNESS) +random_access_file_reader_test: $(OBJ_DIR)/file/random_access_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_test: env/env_test.o $(LIBOBJECTS) $(TESTHARNESS) +file_reader_writer_test: $(OBJ_DIR)/util/file_reader_writer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_filter_block_test: $(OBJ_DIR)/table/block_based/block_based_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -rate_limiter_test: util/rate_limiter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_table_reader_test: table/block_based/block_based_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -delete_scheduler_test: file/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) +full_filter_block_test: $(OBJ_DIR)/table/block_based/full_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) +partitioned_filter_block_test: $(OBJ_DIR)/table/block_based/partitioned_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS) +log_test: $(OBJ_DIR)/db/log_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -block_based_filter_block_test: table/block_based/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +cleanable_test: $(OBJ_DIR)/table/cleanable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -full_filter_block_test: table/block_based/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +table_test: $(OBJ_DIR)/table/table_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -partitioned_filter_block_test: table/block_based/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_fetcher_test: table/block_fetcher_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_test: $(OBJ_DIR)/table/block_based/block_test.o $(TEST_LIBRARY) $(LIBRARY) 
$(AM_LINK) -cleanable_test: table/cleanable_test.o $(LIBOBJECTS) $(TESTHARNESS) +data_block_hash_index_test: $(OBJ_DIR)/table/block_based/data_block_hash_index_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) +inlineskiplist_test: $(OBJ_DIR)/memtable/inlineskiplist_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -block_test: table/block_based/block_test.o $(LIBOBJECTS) $(TESTHARNESS) +skiplist_test: $(OBJ_DIR)/memtable/skiplist_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -data_block_hash_index_test: table/block_based/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_buffer_manager_test: $(OBJ_DIR)/memtable/write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) +version_edit_test: $(OBJ_DIR)/db/version_edit_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -skiplist_test: memtable/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) +version_set_test: $(OBJ_DIR)/db/version_set_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_buffer_manager_test: memtable/write_buffer_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_picker_test: $(OBJ_DIR)/db/compaction/compaction_picker_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) +version_builder_test: $(OBJ_DIR)/db/version_builder_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) +file_indexer_test: $(OBJ_DIR)/db/file_indexer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_picker_test: db/compaction/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) +reduce_levels_test: $(OBJ_DIR)/tools/reduce_levels_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_batch_test: $(OBJ_DIR)/db/write_batch_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) 
-file_indexer_test: db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_controller_test: $(OBJ_DIR)/db/write_controller_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) +merge_helper_test: $(OBJ_DIR)/db/merge_helper_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) +memory_test: $(OBJ_DIR)/utilities/memory/memory_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) +merge_test: $(OBJ_DIR)/db/merge_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -merge_helper_test: db/merge_helper_test.o $(LIBOBJECTS) $(TESTHARNESS) +merger_test: $(OBJ_DIR)/table/merger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -memory_test: utilities/memory/memory_test.o $(LIBOBJECTS) $(TESTHARNESS) +util_merge_operators_test: $(OBJ_DIR)/utilities/util_merge_operators_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_file_test: $(OBJ_DIR)/db/options_file_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) +deletefile_test: $(OBJ_DIR)/db/deletefile_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -util_merge_operators_test: utilities/util_merge_operators_test.o $(LIBOBJECTS) $(TESTHARNESS) +obsolete_files_test: $(OBJ_DIR)/db/obsolete_files_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_file_test: db/options_file_test.o $(LIBOBJECTS) $(TESTHARNESS) +rocksdb_dump: $(OBJ_DIR)/tools/dump/rocksdb_dump.o $(LIBRARY) $(AM_LINK) -deletefile_test: db/deletefile_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +rocksdb_undump: $(OBJ_DIR)/tools/dump/rocksdb_undump.o $(LIBRARY) $(AM_LINK) -obsolete_files_test: db/obsolete_files_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_builder_test: $(OBJ_DIR)/table/cuckoo/cuckoo_table_builder_test.o $(TEST_LIBRARY) 
$(LIBRARY) $(AM_LINK) -rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) +cuckoo_table_reader_test: $(OBJ_DIR)/table/cuckoo/cuckoo_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -rocksdb_undump: tools/dump/rocksdb_undump.o $(LIBOBJECTS) +cuckoo_table_db_test: $(OBJ_DIR)/db/cuckoo_table_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cuckoo_table_builder_test: table/cuckoo/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +listener_test: $(OBJ_DIR)/db/listener_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cuckoo_table_reader_test: table/cuckoo/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +thread_list_test: $(OBJ_DIR)/util/thread_list_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +compact_files_test: $(OBJ_DIR)/db/compact_files_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -listener_test: db/listener_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +configurable_test: options/configurable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) +customizable_test: options/customizable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compact_files_test: db/compact_files_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_test: $(OBJ_DIR)/options/options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_test: options/options_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_settable_test: $(OBJ_DIR)/options/options_settable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_settable_test: options/options_settable_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_util_test: $(OBJ_DIR)/utilities/options/options_util_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_util_test: utilities/options/options_util_test.o $(LIBOBJECTS) $(TESTHARNESS) +db_bench_tool_test: $(OBJ_DIR)/tools/db_bench_tool_test.o $(BENCH_OBJECTS) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_bench_tool_test: tools/db_bench_tool_test.o 
$(BENCHTOOLOBJECTS) $(TESTHARNESS) +trace_analyzer_test: $(OBJ_DIR)/tools/trace_analyzer_test.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) +event_logger_test: $(OBJ_DIR)/logging/event_logger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -event_logger_test: logging/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +timer_queue_test: $(OBJ_DIR)/util/timer_queue_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) +sst_dump_test: $(OBJ_DIR)/tools/sst_dump_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sst_dump_test: tools/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) +optimistic_transaction_test: $(OBJ_DIR)/utilities/transactions/optimistic_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -optimistic_transaction_test: utilities/transactions/optimistic_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +mock_env_test : $(OBJ_DIR)/env/mock_env_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -mock_env_test : env/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) +manual_compaction_test: $(OBJ_DIR)/db/manual_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +filelock_test: $(OBJ_DIR)/util/filelock_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: $(OBJ_DIR)/logging/auto_roll_logger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_logger_test: $(OBJ_DIR)/logging/env_logger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_logger_test: logging/env_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +memtable_list_test: $(OBJ_DIR)/db/memtable_list_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -memtable_list_test: 
db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_callback_test: $(OBJ_DIR)/db/write_callback_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_callback_test: db/write_callback_test.o $(LIBOBJECTS) $(TESTHARNESS) +heap_test: $(OBJ_DIR)/util/heap_test.o $(GTEST) $(AM_LINK) -heap_test: util/heap_test.o $(GTEST) +point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +transaction_test: $(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_prepared_transaction_test: utilities/transactions/write_prepared_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_prepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_unprepared_transaction_test: utilities/transactions/write_unprepared_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unprepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sst_dump: tools/sst_dump.o $(LIBOBJECTS) +sst_dump: $(OBJ_DIR)/tools/sst_dump.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -blob_dump: tools/blob_dump.o $(LIBOBJECTS) +blob_dump: $(OBJ_DIR)/tools/blob_dump.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -repair_test: db/repair_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +repair_test: $(OBJ_DIR)/db/repair_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -ldb_cmd_test: tools/ldb_cmd_test.o $(LIBOBJECTS) $(TESTHARNESS) +ldb_cmd_test: $(OBJ_DIR)/tools/ldb_cmd_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -ldb: tools/ldb.o $(LIBOBJECTS) +ldb: $(OBJ_DIR)/tools/ldb.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -iostats_context_test: monitoring/iostats_context_test.o $(LIBOBJECTS) $(TESTHARNESS) +iostats_context_test: $(OBJ_DIR)/monitoring/iostats_context_test.o 
$(TEST_LIBRARY) $(LIBRARY) $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -persistent_cache_test: utilities/persistent_cache/persistent_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +persistent_cache_test: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +statistics_test: $(OBJ_DIR)/monitoring/statistics_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +stats_history_test: $(OBJ_DIR)/monitoring/stats_history_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +lru_cache_test: $(OBJ_DIR)/cache/lru_cache_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_del_aggregator_test: $(OBJ_DIR)/db/range_del_aggregator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_del_aggregator_bench: $(OBJ_DIR)/db/range_del_aggregator_bench.o $(LIBRARY) + $(AM_LINK) + +blob_db_test: $(OBJ_DIR)/utilities/blob_db/blob_db_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_locking_test: utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +sst_file_reader_test: $(OBJ_DIR)/table/sst_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_secondary_test: $(OBJ_DIR)/db/db_secondary_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +block_cache_tracer_test: $(OBJ_DIR)/trace_replay/block_cache_tracer_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +block_cache_trace_analyzer_test: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_test.o $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -statistics_test: monitoring/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) +defer_test: $(OBJ_DIR)/util/defer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -stats_history_test: 
monitoring/stats_history_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +blob_counting_iterator_test: $(OBJ_DIR)/db/blob/blob_counting_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -lru_cache_test: cache/lru_cache_test.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_addition_test: $(OBJ_DIR)/db/blob/blob_file_addition_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_builder_test: $(OBJ_DIR)/db/blob/blob_file_builder_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -range_del_aggregator_bench: db/range_del_aggregator_bench.o $(LIBOBJECTS) $(TESTUTIL) +blob_file_cache_test: $(OBJ_DIR)/db/blob/blob_file_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_garbage_test: $(OBJ_DIR)/db/blob/blob_file_garbage_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -repeatable_thread_test: util/repeatable_thread_test.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_reader_test: $(OBJ_DIR)/db/blob/blob_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +blob_garbage_meter_test: $(OBJ_DIR)/db/blob/blob_garbage_meter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +timer_test: $(OBJ_DIR)/util/timer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +periodic_work_scheduler_test: $(OBJ_DIR)/db/periodic_work_scheduler_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) +testutil_test: $(OBJ_DIR)/test_util/testutil_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) 
-block_cache_trace_analyzer_test: tools/block_cache_analyzer/block_cache_trace_analyzer_test.o tools/block_cache_analyzer/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) +io_tracer_test: $(OBJ_DIR)/trace_replay/io_tracer_test.o $(OBJ_DIR)/trace_replay/io_tracer.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -defer_test: util/defer_test.o $(LIBOBJECTS) $(TESTHARNESS) +prefetch_test: $(OBJ_DIR)/file/prefetch_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +io_tracer_parser_test: $(OBJ_DIR)/tools/io_tracer_parser_test.o $(OBJ_DIR)/tools/io_tracer_parser_tool.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +io_tracer_parser: $(OBJ_DIR)/tools/io_tracer_parser.o $(TOOLS_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_blob_corruption_test: $(OBJ_DIR)/db/blob/db_blob_corruption_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_write_buffer_manager_test: $(OBJ_DIR)/db/db_write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +clipping_iterator_test: $(OBJ_DIR)/db/compaction/clipping_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +ribbon_bench: $(OBJ_DIR)/microbench/ribbon_bench.o $(LIBRARY) + $(AM_LINK) + +db_basic_bench: $(OBJ_DIR)/microbench/db_basic_bench.o $(LIBRARY) + $(AM_LINK) + +cache_reservation_manager_test: $(OBJ_DIR)/cache/cache_reservation_manager_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) #------------------------------------------------- # make install related stuff -INSTALL_PATH ?= /usr/local +PREFIX ?= /usr/local +LIBDIR ?= $(PREFIX)/lib +INSTALL_LIBDIR = $(DESTDIR)$(LIBDIR) uninstall: - rm -rf $(INSTALL_PATH)/include/rocksdb \ - $(INSTALL_PATH)/lib/$(LIBRARY) \ - $(INSTALL_PATH)/lib/$(SHARED4) \ - $(INSTALL_PATH)/lib/$(SHARED3) \ - $(INSTALL_PATH)/lib/$(SHARED2) \ - $(INSTALL_PATH)/lib/$(SHARED1) - -install-headers: - install -d $(INSTALL_PATH)/lib + rm -rf $(DESTDIR)$(PREFIX)/include/rocksdb \ + $(INSTALL_LIBDIR)/$(LIBRARY) \ + $(INSTALL_LIBDIR)/$(SHARED4) \ + $(INSTALL_LIBDIR)/$(SHARED3) \ + $(INSTALL_LIBDIR)/$(SHARED2) \ + 
$(INSTALL_LIBDIR)/$(SHARED1) \ + $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc + +install-headers: gen-pc + install -d $(INSTALL_LIBDIR) + install -d $(INSTALL_LIBDIR)/pkgconfig for header_dir in `$(FIND) "include/rocksdb" -type d`; do \ - install -d $(INSTALL_PATH)/$$header_dir; \ + install -d $(DESTDIR)/$(PREFIX)/$$header_dir; \ done for header in `$(FIND) "include/rocksdb" -type f -name *.h`; do \ - install -C -m 644 $$header $(INSTALL_PATH)/$$header; \ + install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/$$header; \ + done + for header in $(ROCKSDB_PLUGIN_HEADERS); do \ + install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \ + install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \ done + install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc install-static: install-headers $(LIBRARY) - install -C -m 755 $(LIBRARY) $(INSTALL_PATH)/lib + install -d $(INSTALL_LIBDIR) + install -C -m 755 $(LIBRARY) $(INSTALL_LIBDIR) install-shared: install-headers $(SHARED4) - install -C -m 755 $(SHARED4) $(INSTALL_PATH)/lib && \ - ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED3) && \ - ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED2) && \ - ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED1) + install -d $(INSTALL_LIBDIR) + install -C -m 755 $(SHARED4) $(INSTALL_LIBDIR) + ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED3) + ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED2) + ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED1) # install static by default + install shared if it exists install: install-static [ -e $(SHARED4) ] && $(MAKE) install-shared || : +# Generate the pkg-config file +gen-pc: + -echo 'prefix=$(PREFIX)' > rocksdb.pc + -echo 'exec_prefix=$${prefix}' >> rocksdb.pc + -echo 'includedir=$${prefix}/include' >> rocksdb.pc + -echo 'libdir=$(LIBDIR)' >> rocksdb.pc + -echo '' >> rocksdb.pc + -echo 'Name: rocksdb' >> rocksdb.pc + -echo 'Description: An embeddable persistent key-value store for fast storage' >> rocksdb.pc + -echo Version: $(shell 
./build_tools/version.sh full) >> rocksdb.pc + -echo 'Libs: -L$${libdir} $(EXEC_LDFLAGS) -lrocksdb' >> rocksdb.pc + -echo 'Libs.private: $(PLATFORM_LDFLAGS)' >> rocksdb.pc + -echo 'Cflags: -I$${includedir} $(PLATFORM_CXXFLAGS)' >> rocksdb.pc + -echo 'Requires: $(subst ",,$(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))' >> rocksdb.pc + #------------------------------------------------- # --------------------------------------------------------------------------- # Jni stuff # --------------------------------------------------------------------------- - JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter amd64 ppc64 ppc64le arm64 aarch64 sparc64, $(MACHINE))) + ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64, $(MACHINE))) ARCH := 64 else ARCH := 32 @@ -1783,37 +2072,48 @@ JNI_LIBC_POSTFIX = -$(JNI_LIBC) endif -ifneq (,$(filter ppc% arm64 aarch64 sparc64, $(MACHINE))) +ifeq (,$(ROCKSDBJNILIB)) +ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE))) ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so else ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so endif -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar -ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar -ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar +endif +ROCKSDB_JAVA_VERSION ?= $(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH) +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar +ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar +ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar +ROCKSDB_SOURCES_JAR = 
rocksdbjni-$(ROCKSDB_JAVA_VERSION)-sources.jar SHA256_CMD = sha256sum ZLIB_VER ?= 1.2.11 ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1 -ZLIB_DOWNLOAD_BASE ?= http://zlib.net -BZIP2_VER ?= 1.0.6 -BZIP2_SHA256 ?= a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd -BZIP2_DOWNLOAD_BASE ?= https://downloads.sourceforge.net/project/bzip2 -SNAPPY_VER ?= 1.1.7 -SNAPPY_SHA256 ?= 3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4 +ZLIB_DOWNLOAD_BASE ?= https://zlib.net/fossils +BZIP2_VER ?= 1.0.8 +BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 +BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2 +SNAPPY_VER ?= 1.1.8 +SNAPPY_SHA256 ?= 16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.9.2 -LZ4_SHA256 ?= 658ba6191fa44c92280d4aa2c271b0f4fbc0e34d249578dd05e50e76d0e5efcc +LZ4_VER ?= 1.9.3 +LZ4_SHA256 ?= 030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.4.4 -ZSTD_SHA256 ?= a364f5162c7d1a455cc915e8e3cf5f4bd8b75d09bc0f53965b0c9ca1383c52c8 +ZSTD_VER ?= 1.4.9 +ZSTD_SHA256 ?= acf714d98e3db7b876e5b540cbf6dee298f60eb3c0723104f6d3f065cd60d6a8 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 ifeq ($(PLATFORM), OS_MACOSX) +ifeq (,$(findstring librocksdbjni-osx,$(ROCKSDBJNILIB))) +ifeq ($(MACHINE),arm64) + ROCKSDBJNILIB = librocksdbjni-osx-arm64.jnilib +else ifeq ($(MACHINE),x86_64) + ROCKSDBJNILIB = librocksdbjni-osx-x86_64.jnilib +else ROCKSDBJNILIB = librocksdbjni-osx.jnilib - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar +endif +endif + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-osx.jar SHA256_CMD = openssl sha256 -r ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") JAVA_INCLUDE = -I$(JAVA_HOME)/include -I 
$(JAVA_HOME)/include/darwin @@ -1821,10 +2121,11 @@ JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif endif + ifeq ($(PLATFORM), OS_FREEBSD) JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/freebsd ROCKSDBJNILIB = librocksdbjni-freebsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-freebsd$(ARCH).jar + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-freebsd$(ARCH).jar endif ifeq ($(PLATFORM), OS_SOLARIS) ROCKSDBJNILIB = librocksdbjni-solaris$(ARCH).so @@ -1839,142 +2140,186 @@ SNAPPY_MAKE_TARGET = libsnappy.la endif ifeq ($(PLATFORM), OS_OPENBSD) - JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd ROCKSDBJNILIB = librocksdbjni-openbsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-openbsd$(ARCH).jar + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar endif -libz.a: - -rm -rf zlib-$(ZLIB_VER) -ifeq (,$(wildcard ./zlib-$(ZLIB_VER).tar.gz)) +zlib-$(ZLIB_VER).tar.gz: curl --fail --output zlib-$(ZLIB_VER).tar.gz --location ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz -endif ZLIB_SHA256_ACTUAL=`$(SHA256_CMD) zlib-$(ZLIB_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZLIB_SHA256)" != "$$ZLIB_SHA256_ACTUAL" ]; then \ echo zlib-$(ZLIB_VER).tar.gz checksum mismatch, expected=\"$(ZLIB_SHA256)\" actual=\"$$ZLIB_SHA256_ACTUAL\"; \ exit 1; \ fi + +libz.a: zlib-$(ZLIB_VER).tar.gz + -rm -rf zlib-$(ZLIB_VER) tar xvzf zlib-$(ZLIB_VER).tar.gz - cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${EXTRA_CFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' ./configure --static && $(MAKE) + if [ -n"$(ARCHFLAG)" ]; then \ + cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static --archs="$(ARCHFLAG)" && $(MAKE); \ + else \ + cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} 
${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static && $(MAKE); \ + fi cp zlib-$(ZLIB_VER)/libz.a . -libbz2.a: - -rm -rf bzip2-$(BZIP2_VER) -ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz)) +bzip2-$(BZIP2_VER).tar.gz: curl --fail --output bzip2-$(BZIP2_VER).tar.gz --location ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz -endif BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \ echo bzip2-$(BZIP2_VER).tar.gz checksum mismatch, expected=\"$(BZIP2_SHA256)\" actual=\"$$BZIP2_SHA256_ACTUAL\"; \ exit 1; \ fi + +libbz2.a: bzip2-$(BZIP2_VER).tar.gz + -rm -rf bzip2-$(BZIP2_VER) tar xvzf bzip2-$(BZIP2_VER).tar.gz - cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${EXTRA_CFLAGS}' AR='ar ${EXTRA_ARFLAGS}' + cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' AR='ar ${EXTRA_ARFLAGS}' libbz2.a cp bzip2-$(BZIP2_VER)/libbz2.a . -libsnappy.a: - -rm -rf snappy-$(SNAPPY_VER) -ifeq (,$(wildcard ./snappy-$(SNAPPY_VER).tar.gz)) +snappy-$(SNAPPY_VER).tar.gz: curl --fail --output snappy-$(SNAPPY_VER).tar.gz --location ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER).tar.gz -endif SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \ echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \ exit 1; \ fi + +libsnappy.a: snappy-$(SNAPPY_VER).tar.gz + -rm -rf snappy-$(SNAPPY_VER) tar xvzf snappy-$(SNAPPY_VER).tar.gz mkdir snappy-$(SNAPPY_VER)/build - cd snappy-$(SNAPPY_VER)/build && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. 
&& $(MAKE) ${SNAPPY_MAKE_TARGET} + cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET} cp snappy-$(SNAPPY_VER)/build/libsnappy.a . -liblz4.a: - -rm -rf lz4-$(LZ4_VER) -ifeq (,$(wildcard ./lz4-$(LZ4_VER).tar.gz)) +lz4-$(LZ4_VER).tar.gz: curl --fail --output lz4-$(LZ4_VER).tar.gz --location ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz -endif LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \ echo lz4-$(LZ4_VER).tar.gz checksum mismatch, expected=\"$(LZ4_SHA256)\" actual=\"$$LZ4_SHA256_ACTUAL\"; \ exit 1; \ fi + +liblz4.a: lz4-$(LZ4_VER).tar.gz + -rm -rf lz4-$(LZ4_VER) tar xvzf lz4-$(LZ4_VER).tar.gz - cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' all + cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' all cp lz4-$(LZ4_VER)/lib/liblz4.a . -libzstd.a: - -rm -rf zstd-$(ZSTD_VER) -ifeq (,$(wildcard ./zstd-$(ZSTD_VER).tar.gz)) +zstd-$(ZSTD_VER).tar.gz: curl --fail --output zstd-$(ZSTD_VER).tar.gz --location ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz -endif ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \ echo zstd-$(ZSTD_VER).tar.gz checksum mismatch, expected=\"$(ZSTD_SHA256)\" actual=\"$$ZSTD_SHA256_ACTUAL\"; \ exit 1; \ fi + +libzstd.a: zstd-$(ZSTD_VER).tar.gz + -rm -rf zstd-$(ZSTD_VER) tar xvzf zstd-$(ZSTD_VER).tar.gz - cd zstd-$(ZSTD_VER)/lib && DESTDIR=. PREFIX= $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' install + cd zstd-$(ZSTD_VER)/lib && DESTDIR=. 
PREFIX= $(MAKE) CFLAGS='-fPIC -O2 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' libzstd.a cp zstd-$(ZSTD_VER)/lib/libzstd.a . -# A version of each $(LIBOBJECTS) compiled with -fPIC and a fixed set of static compression libraries -java_static_libobjects = $(patsubst %,jls/%,$(LIB_CC_OBJECTS)) -CLEAN_FILES += jls -java_static_all_libobjects = $(java_static_libobjects) - +# A version of each $(LIB_OBJECTS) compiled with -fPIC and a fixed set of static compression libraries ifneq ($(ROCKSDB_JAVA_NO_COMPRESSION), 1) JAVA_COMPRESSIONS = libz.a libbz2.a libsnappy.a liblz4.a libzstd.a endif JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 -DZSTD -JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib/include +JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./snappy-$(SNAPPY_VER)/build -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib -I./zstd-$(ZSTD_VER)/lib/dictBuilder -ifeq ($(HAVE_POWER8),1) -JAVA_STATIC_C_LIBOBJECTS = $(patsubst %.c.o,jls/%.c.o,$(LIB_SOURCES_C:.c=.o)) -JAVA_STATIC_ASM_LIBOBJECTS = $(patsubst %.S.o,jls/%.S.o,$(LIB_SOURCES_ASM:.S=.o)) - -java_static_ppc_libobjects = $(JAVA_STATIC_C_LIBOBJECTS) $(JAVA_STATIC_ASM_LIBOBJECTS) - -jls/util/crc32c_ppc.o: util/crc32c_ppc.c - $(AM_V_CC)$(CC) $(CFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -c $< -o $@ - -jls/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S - $(AM_V_CC)$(CC) $(CFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -c $< -o $@ - -java_static_all_libobjects += $(java_static_ppc_libobjects) +ifneq ($(findstring rocksdbjavastatic, $(filter-out rocksdbjavastatic_deps, $(MAKECMDGOALS))),) +CXXFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +endif +rocksdbjavastatic: +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(MAKE) 
rocksdbjavastatic_deps + $(MAKE) rocksdbjavastatic_libobjects + $(MAKE) rocksdbjavastatic_javalib + $(MAKE) rocksdbjava_jar + +rocksdbjavastaticosx: rocksdbjavastaticosx_archs + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + +rocksdbjavastaticosx_ub: rocksdbjavastaticosx_archs + cd java/target; lipo -create -output librocksdbjni-osx.jnilib librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + +rocksdbjavastaticosx_archs: + $(MAKE) rocksdbjavastaticosx_arch_x86_64 + $(MAKE) rocksdbjavastaticosx_arch_arm64 + +rocksdbjavastaticosx_arch_%: +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(MAKE) clean-ext-libraries-bin + $(MAKE) clean-rocks + ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_deps + ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_libobjects + ARCHFLAG="-arch $*" ROCKSDBJNILIB="librocksdbjni-osx-$*.jnilib" $(MAKE) rocksdbjavastatic_javalib + +ifeq ($(JAR_CMD),) +ifneq ($(JAVA_HOME),) +JAR_CMD := $(JAVA_HOME)/bin/jar +else +JAR_CMD := jar endif - -$(java_static_libobjects): jls/%.o: %.cc $(JAVA_COMPRESSIONS) - $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -fPIC -c $< -o $@ $(COVERAGEFLAGS) - -rocksdbjavastatic: $(java_static_all_libobjects) - cd java;$(MAKE) javalib; - rm -f ./java/target/$(ROCKSDBJNILIB) +endif 
+rocksdbjavastatic_javalib: + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib + rm -f java/target/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \ -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \ - $(java_static_all_libobjects) $(COVERAGEFLAGS) \ + $(LIB_OBJECTS) $(COVERAGEFLAGS) \ $(JAVA_COMPRESSIONS) $(JAVA_STATIC_LDFLAGS) cd java/target;if [ "$(DEBUG_LEVEL)" == "0" ]; then \ strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \ fi - cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class - cd java/target/apidocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * - cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org -rocksdbjavastaticrelease: rocksdbjavastatic +rocksdbjava_jar: + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + +rocksdbjava_javadocs_jar: + cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) * + openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1 + +rocksdbjava_sources_jar: + cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org + openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1 + +rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS) + +rocksdbjavastatic_libobjects: $(LIB_OBJECTS) + +rocksdbjavastaticrelease: rocksdbjavastaticosx rocksdbjava_javadocs_jar rocksdbjava_sources_jar cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt 
linux64 && vagrant up linux64-musl && vagrant halt linux64-musl - cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class - -rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl - cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1 + +rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl rocksdbjava_javadocs_jar rocksdbjava_sources_jar + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1 rocksdbjavastaticdockerx86: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target 
--env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target @@ -1988,87 +2333,83 @@ mkdir -p java/target docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh +rocksdbjavastaticdockers390x: + mkdir -p java/target + docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + rocksdbjavastaticdockerx86musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-musl-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro 
--volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticdockerx86_64musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticdockerppc64lemusl: mkdir -p java/target - docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticdockerarm64v8musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach 
stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh + +rocksdbjavastaticdockers390xmusl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral rocksdbjavastaticpublishdocker: rocksdbjavastaticreleasedocker rocksdbjavastaticpublishcentral -rocksdbjavastaticpublishcentral: - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ 
-DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64-musl.jar -Dclassifier=linux64-musl - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32-musl.jar -Dclassifier=linux32-musl - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-win64.jar -Dclassifier=win64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar - -# A version of each $(LIBOBJECTS) compiled with -fPIC -ifeq ($(HAVE_POWER8),1) -JAVA_CC_OBJECTS = $(SHARED_CC_OBJECTS) -JAVA_C_OBJECTS = $(SHARED_C_OBJECTS) 
-JAVA_ASM_OBJECTS = $(SHARED_ASM_OBJECTS) - -JAVA_C_LIBOBJECTS = $(patsubst %.c.o,jl/%.c.o,$(JAVA_C_OBJECTS)) -JAVA_ASM_LIBOBJECTS = $(patsubst %.S.o,jl/%.S.o,$(JAVA_ASM_OBJECTS)) -endif - -java_libobjects = $(patsubst %,jl/%,$(LIB_CC_OBJECTS)) -CLEAN_FILES += jl -java_all_libobjects = $(java_libobjects) +ROCKSDB_JAVA_RELEASE_CLASSIFIERS = javadoc sources linux64 linux32 linux64-musl linux32-musl osx win64 -ifeq ($(HAVE_POWER8),1) -java_ppc_libobjects = $(JAVA_C_LIBOBJECTS) $(JAVA_ASM_LIBOBJECTS) +rocksdbjavastaticpublishcentral: rocksdbjavageneratepom + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -Dclassifier=$(classifier);) + +rocksdbjavageneratepom: + cd java;cat pom.xml.template | sed 's/\$${ROCKSDB_JAVA_VERSION}/$(ROCKSDB_JAVA_VERSION)/' > pom.xml + +rocksdbjavastaticnexusbundlejar: rocksdbjavageneratepom + openssl sha1 -r java/pom.xml | awk '{ print $$1 }' > java/target/pom.xml.sha1 + openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar | awk '{ print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1 + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar | awk '{ print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1;) + gpg --yes --output java/target/pom.xml.asc -ab java/pom.xml + gpg --yes -ab java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), gpg --yes -ab 
java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar;) + $(JAR_CMD) cvf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java pom.xml -C java/target pom.xml.sha1 -C java/target pom.xml.asc -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.asc + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), $(JAR_CMD) uf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.asc;) -jl/crc32c_ppc.o: util/crc32c_ppc.c - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -jl/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -java_all_libobjects += $(java_ppc_libobjects) -endif +# A version of each $(LIBOBJECTS) compiled with -fPIC -$(java_libobjects): jl/%.o: %.cc +jl/%.o: %.cc $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) - - -rocksdbjava: $(java_all_libobjects) - $(AM_V_GEN)cd java;$(MAKE) javalib; +rocksdbjava: $(LIB_OBJECTS) +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(AM_V_GEN)cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib; $(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB) - $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(java_all_libobjects) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) - $(AM_V_at)cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md - $(AM_V_at)cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) - $(AM_V_at)cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) + $(AM_V_at)cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + $(AM_V_at)cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + $(AM_V_at)cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + $(AM_V_at)openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 jclean: cd java;$(MAKE) clean; jtest_compile: rocksdbjava - cd java;$(MAKE) java_test + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) java_test jtest_run: cd java;$(MAKE) run_test jtest: rocksdbjava - cd java;$(MAKE) sample;$(MAKE) test; - python tools/check_all_python.py # TODO peterd: find a better place for this check in CI targets + cd java;$(MAKE) sample; SHA256_CMD='$(SHA256_CMD)' $(MAKE) test; + $(PYTHON) tools/check_all_python.py # TODO peterd: find a better place for this check in CI targets jdb_bench: cd java;$(MAKE) db_bench; @@ -2107,30 +2448,32 @@ else ifeq ($(HAVE_POWER8),1) -util/crc32c_ppc.o: util/crc32c_ppc.c +$(OBJ_DIR)/util/crc32c_ppc.o: util/crc32c_ppc.c $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S +$(OBJ_DIR)/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ endif -.cc.o: - $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) +$(OBJ_DIR)/%.o: %.cc + $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) -.cpp.o: - $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) +$(OBJ_DIR)/%.o: %.cpp + $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) -.c.o: +$(OBJ_DIR)/%.o: %.c $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ endif + # --------------------------------------------------------------------------- # Source files dependencies detection # --------------------------------------------------------------------------- - -all_sources = 
$(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) -DEPFILES = $(all_sources:.cc=.cc.d) - +# If skip dependencies is ON, skip including the dep files +ifneq ($(SKIP_DEPENDS), 1) +DEPFILES = $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES)) +DEPFILES+ = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C)) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - DEPFILES += $(FOLLY_SOURCES:.cpp=.cpp.d) + DEPFILES +=$(patsubst %.cpp, $(OBJ_DIR)/%.cpp.d, $(FOLLY_SOURCES)) +endif endif # Add proper dependency support so changing a .h file forces a .cc file to @@ -2138,23 +2481,25 @@ # The .d file indicates .cc file's dependencies on .h files. We generate such # dependency by g++'s -MM option, whose output is a make dependency rule. -%.cc.d: %.cc - @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ - -MM -MT'$@' -MT'$(<:.cc=.o)' "$<" -o '$@' - -%.cpp.d: %.cpp - @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ - -MM -MT'$@' -MT'$(<:.cpp=.o)' "$<" -o '$@' +$(OBJ_DIR)/%.cc.d: %.cc + @mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ + -MM -MT'$@' -MT'$(<:.cc=.o)' -MT'$(<:%.cc=$(OBJ_DIR)/%.o)' \ + "$<" -o '$@' + +$(OBJ_DIR)/%.cpp.d: %.cpp + @mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ + -MM -MT'$@' -MT'$(<:.cpp=.o)' -MT'$(<:%.cpp=$(OBJ_DIR)/%.o)' \ + "$<" -o '$@' ifeq ($(HAVE_POWER8),1) -DEPFILES_C = $(LIB_SOURCES_C:.c=.c.d) -DEPFILES_ASM = $(LIB_SOURCES_ASM:.S=.S.d) +DEPFILES_C = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C)) +DEPFILES_ASM = $(patsubst %.S, $(OBJ_DIR)/%.S.d, $(LIB_SOURCES_ASM)) -%.c.d: %.c +$(OBJ_DIR)/%.c.d: %.c @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ -MM -MT'$@' -MT'$(<:.c=.o)' "$<" -o '$@' -%.S.d: %.S +$(OBJ_DIR)/%.S.d: %.S @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ -MM -MT'$@' -MT'$(<:.S=.o)' "$<" -o '$@' @@ -2166,20 +2511,12 @@ depend: $(DEPFILES) endif -# if the make goal is either "clean" 
or "format", we shouldn't -# try to import the *.d files. -# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly -# working solution. -ifneq ($(MAKECMDGOALS),clean) -ifneq ($(MAKECMDGOALS),format) -ifneq ($(MAKECMDGOALS),jclean) -ifneq ($(MAKECMDGOALS),jtest) -ifneq ($(MAKECMDGOALS),package) -ifneq ($(MAKECMDGOALS),analyze) +build_subset_tests: $(ROCKSDBTESTS_SUBSET) + $(AM_V_GEN)if [ -n "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}" ]; then echo "$(ROCKSDBTESTS_SUBSET)" > "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}"; else echo "$(ROCKSDBTESTS_SUBSET)"; fi + +# Remove the rules for which dependencies should not be generated and see if any are left. +#If so, include the dependencies; if not, do not include the dependency files +ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test, $(MAKECMDGOALS)) +ifneq ("$(ROCKS_DEP_RULES)", "") -include $(DEPFILES) endif -endif -endif -endif -endif -endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/PLUGINS.md mariadb-10.11.13/storage/rocksdb/rocksdb/PLUGINS.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/PLUGINS.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/PLUGINS.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,4 @@ +This is the list of all known third-party plugins for RocksDB. If something is missing, please open a pull request to add it. 
+ +* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference +* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/README.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,8 +1,9 @@ ## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage -[![Linux/Mac Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) -[![Windows Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/master?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/master) -[![PPC64le Build Status](http://140.211.168.68:8080/buildStatus/icon?job=Rocksdb)](http://140.211.168.68:8080/job/Rocksdb) +[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb) +[![TravisCI Status](https://api.travis-ci.com/facebook/rocksdb.svg?branch=main)](https://travis-ci.com/github/facebook/rocksdb) +[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main) +[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb) RocksDB is developed and maintained by Facebook Database Engineering Team. It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com) @@ -16,7 +17,7 @@ making it especially suitable for storing multiple terabytes of data in a single database. 
-Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples +Start with example usage here: https://github.com/facebook/rocksdb/tree/main/examples See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation. @@ -24,7 +25,7 @@ rely on the details of any other header files in this package. Those internal APIs may be changed without warning. -Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ +Questions and discussions are welcome on the [RocksDB Developers Public](https://www.facebook.com/groups/rocksdb.dev/) Facebook group and [email list](https://groups.google.com/g/rocksdb) on Google Groups. ## License diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/TARGETS mariadb-10.11.13/storage/rocksdb/rocksdb/TARGETS --- mariadb-10.11.11/storage/rocksdb/rocksdb/TARGETS 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/TARGETS 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,5 @@ -# This file @generated by `python buckifier/buckify_rocksdb.py` +# This file @generated by: +#$ python3 buckifier/buckify_rocksdb.py # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. 
@@ -9,7 +10,7 @@ REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -24,19 +25,25 @@ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), - ("tbb", None), - ("googletest", None, "gtest"), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", - ["third-party//numa:numa", "third-party//liburing:uring"], + [ + "third-party//numa:numa", + "third-party//liburing:uring", + "third-party//tbb:tbb", + ], + ), + ( + "macos", + ["third-party//tbb:tbb"], ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -50,17 +57,33 @@ "-DHAVE_SSE42", "-DLIBURING", "-DNUMA", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", ], ), ( "macos", - ["-DOS_MACOSX"], + [ + "-DOS_MACOSX", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", + ], + ), + ( + "windows", + [ + "-DOS_WIN", + "-DWIN32", + "-D_MBCS", + "-DWIN64", + "-DNOMINMAX", + ], ), ] ROCKSDB_PREPROCESSOR_FLAGS = [ - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", "-DROCKSDB_SUPPORT_THREAD_LOCAL", # Flags to enable libs we include @@ -71,14 +94,15 @@ "-DZSTD", "-DZSTD_STATIC_LINKING_ONLY", "-DGFLAGS=gflags", - "-DTBB", # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { @@ -93,33 +117,53 @@ # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. 
-ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) +ROCKSDB_LIB_DEPS = [ + ":rocksdb_lib", + ":rocksdb_test_lib", +] if not is_opt_mode else [":rocksdb_lib"] + cpp_library( name = "rocksdb_lib", srcs = [ + "cache/cache.cc", + "cache/cache_entry_roles.cc", + "cache/cache_key.cc", + "cache/cache_reservation_manager.cc", "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", + "db/blob/blob_file_addition.cc", + "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", + "db/blob/blob_file_garbage.cc", + "db/blob/blob_file_meta.cc", + "db/blob/blob_file_reader.cc", + "db/blob/blob_garbage_meter.cc", + "db/blob/blob_log_format.cc", + "db/blob/blob_log_sequential_reader.cc", + "db/blob/blob_log_writer.cc", + "db/blob/prefetch_buffer_collection.cc", "db/builder.cc", "db/c.cc", "db/column_family.cc", - "db/compacted_db_impl.cc", "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", "db/compaction/compaction_job.cc", @@ -127,8 +171,10 @@ "db/compaction/compaction_picker_fifo.cc", "db/compaction/compaction_picker_level.cc", "db/compaction/compaction_picker_universal.cc", + "db/compaction/sst_partitioner.cc", "db/convenience.cc", "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", "db/db_impl/db_impl.cc", "db/db_impl/db_impl_compaction_flush.cc", "db/db_impl/db_impl_debug.cc", @@ 
-159,6 +205,8 @@ "db/memtable_list.cc", "db/merge_helper.cc", "db/merge_operator.cc", + "db/output_validator.cc", + "db/periodic_work_scheduler.cc", "db/range_del_aggregator.cc", "db/range_tombstone_fragmenter.cc", "db/repair.cc", @@ -169,25 +217,32 @@ "db/trim_history_scheduler.cc", "db/version_builder.cc", "db/version_edit.cc", + "db/version_edit_handler.cc", "db/version_set.cc", + "db/wal_edit.cc", "db/wal_manager.cc", "db/write_batch.cc", "db/write_batch_base.cc", "db/write_controller.cc", "db/write_thread.cc", + "env/composite_env.cc", "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", "env/env_hdfs.cc", "env/env_posix.cc", "env/file_system.cc", + "env/file_system_tracer.cc", "env/fs_posix.cc", + "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", + "env/unique_id_gen.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", "file/filename.cc", + "file/line_file_reader.cc", "file/random_access_file_reader.cc", "file/read_write_util.cc", "file/readahead_raf.cc", @@ -200,6 +255,8 @@ "memory/arena.cc", "memory/concurrent_arena.cc", "memory/jemalloc_nodump_allocator.cc", + "memory/memkind_kmem_allocator.cc", + "memory/memory_allocator.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -221,20 +278,30 @@ "monitoring/thread_status_util.cc", "monitoring/thread_status_util_debug.cc", "options/cf_options.cc", + "options/configurable.cc", + "options/customizable.cc", "options/db_options.cc", "options/options.cc", "options/options_helper.cc", "options/options_parser.cc", - "options/options_sanity_check.cc", "port/port_posix.cc", "port/stack_trace.cc", + "port/win/env_default.cc", + "port/win/env_win.cc", + "port/win/io_win.cc", + "port/win/port_win.cc", + "port/win/win_logger.cc", + "port/win/win_thread.cc", "table/adaptive/adaptive_table_factory.cc", + "table/block_based/binary_search_index_reader.cc", "table/block_based/block.cc", 
"table/block_based/block_based_filter_block.cc", "table/block_based/block_based_table_builder.cc", "table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_iterator.cc", "table/block_based/block_based_table_reader.cc", "table/block_based/block_builder.cc", + "table/block_based/block_prefetcher.cc", "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", "table/block_based/data_block_hash_index.cc", @@ -242,9 +309,14 @@ "table/block_based/filter_policy.cc", "table/block_based/flush_block_policy.cc", "table/block_based/full_filter_block.cc", + "table/block_based/hash_index_reader.cc", "table/block_based/index_builder.cc", + "table/block_based/index_reader_common.cc", "table/block_based/parsed_full_filter_block.cc", "table/block_based/partitioned_filter_block.cc", + "table/block_based/partitioned_index_iterator.cc", + "table/block_based/partitioned_index_reader.cc", + "table/block_based/reader_common.cc", "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", "table/cuckoo/cuckoo_table_builder.cc", @@ -262,18 +334,26 @@ "table/plain/plain_table_index.cc", "table/plain/plain_table_key_coding.cc", "table/plain/plain_table_reader.cc", + "table/sst_file_dumper.cc", "table/sst_file_reader.cc", "table/sst_file_writer.cc", + "table/table_factory.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "table/unique_id.cc", "test_util/sync_point.cc", "test_util/sync_point_impl.cc", "test_util/transaction_test_util.cc", "tools/dump/db_dump_tool.cc", + "tools/io_tracer_parser_tool.cc", "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", "trace_replay/block_cache_tracer.cc", + "trace_replay/io_tracer.cc", + "trace_replay/trace_record.cc", + "trace_replay/trace_record_handler.cc", + "trace_replay/trace_record_result.cc", "trace_replay/trace_replay.cc", "util/build_version.cc", "util/coding.cc", @@ -282,12 +362,15 @@ "util/compression_context_cache.cc", 
"util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", + "util/crc32c_arm64.cc", "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", + "util/regex.cc", + "util/ribbon_config.cc", "util/slice.cc", "util/status.cc", "util/string_util.cc", @@ -301,20 +384,24 @@ "utilities/blob_db/blob_db_impl_filesnapshot.cc", "utilities/blob_db/blob_dump_tool.cc", "utilities/blob_db/blob_file.cc", - "utilities/blob_db/blob_log_format.cc", - "utilities/blob_db/blob_log_reader.cc", - "utilities/blob_db/blob_log_writer.cc", + "utilities/cache_dump_load.cc", + "utilities/cache_dump_load_impl.cc", "utilities/cassandra/cassandra_compaction_filter.cc", "utilities/cassandra/format.cc", "utilities/cassandra/merge_operator.cc", "utilities/checkpoint/checkpoint_impl.cc", + "utilities/compaction_filters.cc", "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", "utilities/convenience/info_log_finder.cc", "utilities/debug.cc", "utilities/env_mirror.cc", "utilities/env_timed.cc", + "utilities/fault_injection_env.cc", + "utilities/fault_injection_fs.cc", + "utilities/fault_injection_secondary_cache.cc", "utilities/leveldb_options/leveldb_options.cc", "utilities/memory/memory_util.cc", + "utilities/merge_operators.cc", "utilities/merge_operators/bytesxor.cc", "utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", @@ -334,6 +421,24 @@ "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", + "utilities/trace/replayer_impl.cc", + "utilities/transactions/lock/lock_manager.cc", + "utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + 
"utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -341,24 +446,356 @@ "utilities/transactions/snapshot_checker.cc", "utilities/transactions/transaction_base.cc", "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_lock_mgr.cc", "utilities/transactions/transaction_util.cc", "utilities/transactions/write_prepared_txn.cc", "utilities/transactions/write_prepared_txn_db.cc", "utilities/transactions/write_unprepared_txn.cc", "utilities/transactions/write_unprepared_txn_db.cc", "utilities/ttl/db_ttl_impl.cc", + "utilities/wal_filter.cc", "utilities/write_batch_with_index/write_batch_with_index.cc", "utilities/write_batch_with_index/write_batch_with_index_internal.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = 
ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, +) + +cpp_library( + name = "rocksdb_whole_archive_lib", + srcs = [ + "cache/cache.cc", + "cache/cache_entry_roles.cc", + "cache/cache_key.cc", + "cache/cache_reservation_manager.cc", + "cache/clock_cache.cc", + "cache/lru_cache.cc", + "cache/sharded_cache.cc", + "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", + "db/blob/blob_file_addition.cc", + "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", + "db/blob/blob_file_garbage.cc", + "db/blob/blob_file_meta.cc", + "db/blob/blob_file_reader.cc", + "db/blob/blob_garbage_meter.cc", + "db/blob/blob_log_format.cc", + "db/blob/blob_log_sequential_reader.cc", + "db/blob/blob_log_writer.cc", + "db/blob/prefetch_buffer_collection.cc", + "db/builder.cc", + "db/c.cc", + "db/column_family.cc", + "db/compaction/compaction.cc", + "db/compaction/compaction_iterator.cc", + "db/compaction/compaction_job.cc", + "db/compaction/compaction_picker.cc", + "db/compaction/compaction_picker_fifo.cc", + "db/compaction/compaction_picker_level.cc", + "db/compaction/compaction_picker_universal.cc", + "db/compaction/sst_partitioner.cc", + "db/convenience.cc", + "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", + "db/db_impl/db_impl.cc", + "db/db_impl/db_impl_compaction_flush.cc", + "db/db_impl/db_impl_debug.cc", + "db/db_impl/db_impl_experimental.cc", + "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_open.cc", + "db/db_impl/db_impl_readonly.cc", + "db/db_impl/db_impl_secondary.cc", + "db/db_impl/db_impl_write.cc", + "db/db_info_dumper.cc", + "db/db_iter.cc", + "db/dbformat.cc", + "db/error_handler.cc", + "db/event_helpers.cc", + "db/experimental.cc", + "db/external_sst_file_ingestion_job.cc", + "db/file_indexer.cc", + "db/flush_job.cc", + "db/flush_scheduler.cc", + "db/forward_iterator.cc", + 
"db/import_column_family_job.cc", + "db/internal_stats.cc", + "db/log_reader.cc", + "db/log_writer.cc", + "db/logs_with_prep_tracker.cc", + "db/malloc_stats.cc", + "db/memtable.cc", + "db/memtable_list.cc", + "db/merge_helper.cc", + "db/merge_operator.cc", + "db/output_validator.cc", + "db/periodic_work_scheduler.cc", + "db/range_del_aggregator.cc", + "db/range_tombstone_fragmenter.cc", + "db/repair.cc", + "db/snapshot_impl.cc", + "db/table_cache.cc", + "db/table_properties_collector.cc", + "db/transaction_log_impl.cc", + "db/trim_history_scheduler.cc", + "db/version_builder.cc", + "db/version_edit.cc", + "db/version_edit_handler.cc", + "db/version_set.cc", + "db/wal_edit.cc", + "db/wal_manager.cc", + "db/write_batch.cc", + "db/write_batch_base.cc", + "db/write_controller.cc", + "db/write_thread.cc", + "env/composite_env.cc", + "env/env.cc", + "env/env_chroot.cc", + "env/env_encryption.cc", + "env/env_hdfs.cc", + "env/env_posix.cc", + "env/file_system.cc", + "env/file_system_tracer.cc", + "env/fs_posix.cc", + "env/fs_remap.cc", + "env/io_posix.cc", + "env/mock_env.cc", + "env/unique_id_gen.cc", + "file/delete_scheduler.cc", + "file/file_prefetch_buffer.cc", + "file/file_util.cc", + "file/filename.cc", + "file/line_file_reader.cc", + "file/random_access_file_reader.cc", + "file/read_write_util.cc", + "file/readahead_raf.cc", + "file/sequence_file_reader.cc", + "file/sst_file_manager_impl.cc", + "file/writable_file_writer.cc", + "logging/auto_roll_logger.cc", + "logging/event_logger.cc", + "logging/log_buffer.cc", + "memory/arena.cc", + "memory/concurrent_arena.cc", + "memory/jemalloc_nodump_allocator.cc", + "memory/memkind_kmem_allocator.cc", + "memory/memory_allocator.cc", + "memtable/alloc_tracker.cc", + "memtable/hash_linklist_rep.cc", + "memtable/hash_skiplist_rep.cc", + "memtable/skiplistrep.cc", + "memtable/vectorrep.cc", + "memtable/write_buffer_manager.cc", + "monitoring/histogram.cc", + "monitoring/histogram_windowing.cc", + 
"monitoring/in_memory_stats_history.cc", + "monitoring/instrumented_mutex.cc", + "monitoring/iostats_context.cc", + "monitoring/perf_context.cc", + "monitoring/perf_level.cc", + "monitoring/persistent_stats_history.cc", + "monitoring/statistics.cc", + "monitoring/thread_status_impl.cc", + "monitoring/thread_status_updater.cc", + "monitoring/thread_status_updater_debug.cc", + "monitoring/thread_status_util.cc", + "monitoring/thread_status_util_debug.cc", + "options/cf_options.cc", + "options/configurable.cc", + "options/customizable.cc", + "options/db_options.cc", + "options/options.cc", + "options/options_helper.cc", + "options/options_parser.cc", + "port/port_posix.cc", + "port/stack_trace.cc", + "port/win/env_default.cc", + "port/win/env_win.cc", + "port/win/io_win.cc", + "port/win/port_win.cc", + "port/win/win_logger.cc", + "port/win/win_thread.cc", + "table/adaptive/adaptive_table_factory.cc", + "table/block_based/binary_search_index_reader.cc", + "table/block_based/block.cc", + "table/block_based/block_based_filter_block.cc", + "table/block_based/block_based_table_builder.cc", + "table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_iterator.cc", + "table/block_based/block_based_table_reader.cc", + "table/block_based/block_builder.cc", + "table/block_based/block_prefetcher.cc", + "table/block_based/block_prefix_index.cc", + "table/block_based/data_block_footer.cc", + "table/block_based/data_block_hash_index.cc", + "table/block_based/filter_block_reader_common.cc", + "table/block_based/filter_policy.cc", + "table/block_based/flush_block_policy.cc", + "table/block_based/full_filter_block.cc", + "table/block_based/hash_index_reader.cc", + "table/block_based/index_builder.cc", + "table/block_based/index_reader_common.cc", + "table/block_based/parsed_full_filter_block.cc", + "table/block_based/partitioned_filter_block.cc", + "table/block_based/partitioned_index_iterator.cc", + "table/block_based/partitioned_index_reader.cc", + 
"table/block_based/reader_common.cc", + "table/block_based/uncompression_dict_reader.cc", + "table/block_fetcher.cc", + "table/cuckoo/cuckoo_table_builder.cc", + "table/cuckoo/cuckoo_table_factory.cc", + "table/cuckoo/cuckoo_table_reader.cc", + "table/format.cc", + "table/get_context.cc", + "table/iterator.cc", + "table/merging_iterator.cc", + "table/meta_blocks.cc", + "table/persistent_cache_helper.cc", + "table/plain/plain_table_bloom.cc", + "table/plain/plain_table_builder.cc", + "table/plain/plain_table_factory.cc", + "table/plain/plain_table_index.cc", + "table/plain/plain_table_key_coding.cc", + "table/plain/plain_table_reader.cc", + "table/sst_file_dumper.cc", + "table/sst_file_reader.cc", + "table/sst_file_writer.cc", + "table/table_factory.cc", + "table/table_properties.cc", + "table/two_level_iterator.cc", + "table/unique_id.cc", + "test_util/sync_point.cc", + "test_util/sync_point_impl.cc", + "test_util/transaction_test_util.cc", + "tools/dump/db_dump_tool.cc", + "tools/io_tracer_parser_tool.cc", + "tools/ldb_cmd.cc", + "tools/ldb_tool.cc", + "tools/sst_dump_tool.cc", + "trace_replay/block_cache_tracer.cc", + "trace_replay/io_tracer.cc", + "trace_replay/trace_record.cc", + "trace_replay/trace_record_handler.cc", + "trace_replay/trace_record_result.cc", + "trace_replay/trace_replay.cc", + "util/build_version.cc", + "util/coding.cc", + "util/compaction_job_stats_impl.cc", + "util/comparator.cc", + "util/compression_context_cache.cc", + "util/concurrent_task_limiter_impl.cc", + "util/crc32c.cc", + "util/crc32c_arm64.cc", + "util/dynamic_bloom.cc", + "util/file_checksum_helper.cc", + "util/hash.cc", + "util/murmurhash.cc", + "util/random.cc", + "util/rate_limiter.cc", + "util/regex.cc", + "util/ribbon_config.cc", + "util/slice.cc", + "util/status.cc", + "util/string_util.cc", + "util/thread_local.cc", + "util/threadpool_imp.cc", + "util/xxhash.cc", + "utilities/backupable/backupable_db.cc", + "utilities/blob_db/blob_compaction_filter.cc", + 
"utilities/blob_db/blob_db.cc", + "utilities/blob_db/blob_db_impl.cc", + "utilities/blob_db/blob_db_impl_filesnapshot.cc", + "utilities/blob_db/blob_dump_tool.cc", + "utilities/blob_db/blob_file.cc", + "utilities/cache_dump_load.cc", + "utilities/cache_dump_load_impl.cc", + "utilities/cassandra/cassandra_compaction_filter.cc", + "utilities/cassandra/format.cc", + "utilities/cassandra/merge_operator.cc", + "utilities/checkpoint/checkpoint_impl.cc", + "utilities/compaction_filters.cc", + "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", + "utilities/convenience/info_log_finder.cc", + "utilities/debug.cc", + "utilities/env_mirror.cc", + "utilities/env_timed.cc", + "utilities/fault_injection_env.cc", + "utilities/fault_injection_fs.cc", + "utilities/fault_injection_secondary_cache.cc", + "utilities/leveldb_options/leveldb_options.cc", + "utilities/memory/memory_util.cc", + "utilities/merge_operators.cc", + "utilities/merge_operators/bytesxor.cc", + "utilities/merge_operators/max.cc", + "utilities/merge_operators/put.cc", + "utilities/merge_operators/sortlist.cc", + "utilities/merge_operators/string_append/stringappend.cc", + "utilities/merge_operators/string_append/stringappend2.cc", + "utilities/merge_operators/uint64add.cc", + "utilities/object_registry.cc", + "utilities/option_change_migration/option_change_migration.cc", + "utilities/options/options_util.cc", + "utilities/persistent_cache/block_cache_tier.cc", + "utilities/persistent_cache/block_cache_tier_file.cc", + "utilities/persistent_cache/block_cache_tier_metadata.cc", + "utilities/persistent_cache/persistent_cache_tier.cc", + "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/simulator_cache/cache_simulator.cc", + "utilities/simulator_cache/sim_cache.cc", + "utilities/table_properties_collectors/compact_on_deletion_collector.cc", + "utilities/trace/file_trace_reader_writer.cc", + "utilities/trace/replayer_impl.cc", + "utilities/transactions/lock/lock_manager.cc", + 
"utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", + "utilities/transactions/optimistic_transaction.cc", + "utilities/transactions/optimistic_transaction_db_impl.cc", + "utilities/transactions/pessimistic_transaction.cc", + "utilities/transactions/pessimistic_transaction_db.cc", + "utilities/transactions/snapshot_checker.cc", + "utilities/transactions/transaction_base.cc", + "utilities/transactions/transaction_db_mutex_impl.cc", + "utilities/transactions/transaction_util.cc", + "utilities/transactions/write_prepared_txn.cc", + "utilities/transactions/write_prepared_txn_db.cc", + "utilities/transactions/write_unprepared_txn.cc", + "utilities/transactions/write_unprepared_txn_db.cc", + "utilities/ttl/db_ttl_impl.cc", + "utilities/wal_filter.cc", + "utilities/write_batch_with_index/write_batch_with_index.cc", + 
"utilities/write_batch_with_index/write_batch_with_index_internal.cc", + ], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = True, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + exported_deps = [], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) cpp_library( @@ -366,7 +803,7 @@ srcs = [ "db/db_test_util.cc", "table/mock_table.cc", - "test_util/fault_injection_test_env.cc", + "test_util/mock_time_env.cc", "test_util/testharness.cc", "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", @@ -376,11 +813,15 @@ auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [":rocksdb_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS + [ + ("googletest", None, "gtest"), + ], ) cpp_library( @@ -389,16 +830,34 @@ "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", + "tools/simulated_hybrid_file_system.cc", "tools/trace_analyzer_tool.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + exported_deps = [":rocksdb_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, +) + 
+cpp_library( + name = "rocksdb_cache_bench_tools_lib", + srcs = ["cache/cache_bench_tool.cc"], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [":rocksdb_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) cpp_library( @@ -409,9 +868,13 @@ "db_stress_tool/db_stress_common.cc", "db_stress_tool/db_stress_driver.cc", "db_stress_tool/db_stress_gflags.cc", + "db_stress_tool/db_stress_listener.cc", "db_stress_tool/db_stress_shared_state.cc", + "db_stress_tool/db_stress_stat.cc", "db_stress_tool/db_stress_test_base.cc", "db_stress_tool/db_stress_tool.cc", + "db_stress_tool/expected_state.cc", + "db_stress_tool/multi_ops_txns_stress.cc", "db_stress_tool/no_batched_ops_stress.cc", "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", @@ -420,24 +883,47 @@ auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = ROCKSDB_LIB_DEPS, + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"], +) 
if not is_opt_mode else None + +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None + cpp_library( name = "env_basic_test_lib", srcs = ["env/env_basic_test.cc"], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_test_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [":rocksdb_test_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) # [test_name, test_src, test_type, extra_deps, extra_compiler_flags] @@ -445,21 +931,21 @@ [ "arena_test", "memory/arena_test.cc", - "serial", + "parallel", [], [], ], [ "auto_roll_logger_test", "logging/auto_roll_logger_test.cc", - "serial", + "parallel", [], [], ], [ "autovector_test", "util/autovector_test.cc", - "serial", + "parallel", [], [], ], @@ -471,233 +957,345 @@ [], ], [ + "blob_counting_iterator_test", + "db/blob/blob_counting_iterator_test.cc", + "parallel", + [], + [], + ], + [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "blob_file_addition_test", + "db/blob/blob_file_addition_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_builder_test", + "db/blob/blob_file_builder_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_cache_test", + "db/blob/blob_file_cache_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_garbage_test", + "db/blob/blob_file_garbage_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_reader_test", + "db/blob/blob_file_reader_test.cc", + "parallel", + [], + [], + ], + [ + "blob_garbage_meter_test", + "db/blob/blob_garbage_meter_test.cc", + 
"parallel", [], [], ], [ "block_based_filter_block_test", "table/block_based/block_based_filter_block_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "block_based_table_reader_test", + "table/block_based/block_based_table_reader_test.cc", + "parallel", [], [], ], [ "block_cache_trace_analyzer_test", "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", - "serial", + "parallel", [], [], ], [ "block_cache_tracer_test", "trace_replay/block_cache_tracer_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "block_fetcher_test", + "table/block_fetcher_test.cc", + "parallel", [], [], ], [ "block_test", "table/block_based/block_test.cc", - "serial", + "parallel", [], [], ], [ "bloom_test", "util/bloom_test.cc", - "serial", + "parallel", [], [], ], [ - "c_test", - "db/c_test.c", - "serial", + "cache_reservation_manager_test", + "cache/cache_reservation_manager_test.cc", + "parallel", [], [], ], [ "cache_simulator_test", "utilities/simulator_cache/cache_simulator_test.cc", - "serial", + "parallel", [], [], ], [ "cache_test", "cache/cache_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", - "serial", + "parallel", [], [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", - "serial", + "parallel", [], [], ], [ "cleanable_test", "table/cleanable_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "clipping_iterator_test", + "db/compaction/clipping_iterator_test.cc", + "parallel", [], [], ], [ "coding_test", "util/coding_test.cc", - "serial", + "parallel", [], [], ], [ "column_family_test", 
"db/column_family_test.cc", - "serial", + "parallel", [], [], ], [ "compact_files_test", "db/compact_files_test.cc", - "serial", + "parallel", [], [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_iterator_test", "db/compaction/compaction_iterator_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_stats_test", "db/compaction/compaction_job_stats_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_test", "db/compaction/compaction_job_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_picker_test", "db/compaction/compaction_picker_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "compaction_service_test", + "db/compaction/compaction_service_test.cc", + "parallel", [], [], ], [ "comparator_db_test", "db/comparator_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "configurable_test", + "options/configurable_test.cc", + "parallel", [], [], ], [ "corruption_test", "db/corruption_test.cc", - "serial", + "parallel", [], [], ], [ "crc32c_test", "util/crc32c_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_builder_test", "table/cuckoo/cuckoo_table_builder_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_reader_test", "table/cuckoo/cuckoo_table_reader_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "customizable_test", + "options/customizable_test.cc", + "parallel", [], [], ], [ "data_block_hash_index_test", "table/block_based/data_block_hash_index_test.cc", - "serial", + "parallel", [], [], ], [ "db_basic_test", "db/db_basic_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_blob_basic_test", + "db/blob/db_blob_basic_test.cc", + "parallel", + [], + [], + ], + [ + "db_blob_compaction_test", + "db/blob/db_blob_compaction_test.cc", + "parallel", + [], + 
[], + ], + [ + "db_blob_corruption_test", + "db/blob/db_blob_corruption_test.cc", + "parallel", [], [], ], [ "db_blob_index_test", - "db/db_blob_index_test.cc", - "serial", + "db/blob/db_blob_index_test.cc", + "parallel", [], [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -725,77 +1323,91 @@ [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", - "serial", + "parallel", [], [], ], [ "db_encryption_test", "db/db_encryption_test.cc", - "serial", + "parallel", [], [], ], [ "db_flush_test", "db/db_flush_test.cc", - "serial", + "parallel", [], [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", - "serial", + "parallel", [], [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_test", "db/db_iter_test.cc", - "serial", + "parallel", [], [], ], [ "db_iterator_test", "db/db_iterator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_kv_checksum_test", + "db/db_kv_checksum_test.cc", + "parallel", [], [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_logical_block_size_cache_test", + "db/db_logical_block_size_cache_test.cc", + "parallel", [], [], ], [ "db_memtable_test", "db/db_memtable_test.cc", - "serial", + "parallel", [], [], ], [ "db_merge_operand_test", "db/db_merge_operand_test.cc", - "serial", + "parallel", [], [], ], @@ -809,28 +1421,28 @@ [ "db_options_test", "db/db_options_test.cc", - "serial", + "parallel", [], [], ], [ "db_properties_test", "db/db_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_range_del_test", "db/db_range_del_test.cc", - "serial", + "parallel", [], [], ], [ "db_secondary_test", - "db/db_impl/db_secondary_test.cc", - "serial", + "db/db_secondary_test.cc", + "parallel", [], [], ], @@ -844,21 +1456,21 @@ [ "db_statistics_test", 
"db/db_statistics_test.cc", - "serial", + "parallel", [], [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", - "serial", + "parallel", [], [], ], @@ -872,7 +1484,7 @@ [ "db_test2", "db/db_test2.cc", - "serial", + "parallel", [], [], ], @@ -891,30 +1503,51 @@ [], ], [ + "db_with_timestamp_basic_test", + "db/db_with_timestamp_basic_test.cc", + "parallel", + [], + [], + ], + [ + "db_with_timestamp_compaction_test", + "db/db_with_timestamp_compaction_test.cc", + "parallel", + [], + [], + ], + [ + "db_write_buffer_manager_test", + "db/db_write_buffer_manager_test.cc", + "parallel", + [], + [], + ], + [ "db_write_test", "db/db_write_test.cc", - "serial", + "parallel", [], [], ], [ "dbformat_test", "db/dbformat_test.cc", - "serial", + "parallel", [], [], ], [ "defer_test", "util/defer_test.cc", - "serial", + "parallel", [], [], ], [ "delete_scheduler_test", "file/delete_scheduler_test.cc", - "serial", + "parallel", [], [], ], @@ -928,21 +1561,21 @@ [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", - "serial", + "parallel", [], [], ], [ "env_basic_test", "env/env_basic_test.cc", - "serial", + "parallel", [], [], ], [ "env_logger_test", "logging/env_logger_test.cc", - "serial", + "parallel", [], [], ], @@ -956,28 +1589,28 @@ [ "env_timed_test", "utilities/env_timed_test.cc", - "serial", + "parallel", [], [], ], [ - "error_handler_test", - "db/error_handler_test.cc", - "serial", + "error_handler_fs_test", + "db/error_handler_fs_test.cc", + "parallel", [], [], ], [ "event_logger_test", "logging/event_logger_test.cc", - "serial", + "parallel", [], [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", - "serial", + "parallel", [], [], ], @@ -998,7 +1631,7 @@ [ "file_indexer_test", "db/file_indexer_test.cc", - "serial", + "parallel", [], [], ], @@ -1012,56 +1645,56 @@ [ "filelock_test", "util/filelock_test.cc", - "serial", + "parallel", 
[], [], ], [ "filename_test", "db/filename_test.cc", - "serial", + "parallel", [], [], ], [ "flush_job_test", "db/flush_job_test.cc", - "serial", + "parallel", [], [], ], [ "full_filter_block_test", "table/block_based/full_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", - "serial", + "parallel", [], [], ], [ "hash_test", "util/hash_test.cc", - "serial", + "parallel", [], [], ], [ "heap_test", "util/heap_test.cc", - "serial", + "parallel", [], [], ], [ "histogram_test", "monitoring/histogram_test.cc", - "serial", + "parallel", [], [], ], @@ -1080,37 +1713,58 @@ [], ], [ + "io_posix_test", + "env/io_posix_test.cc", + "parallel", + [], + [], + ], + [ + "io_tracer_parser_test", + "tools/io_tracer_parser_test.cc", + "parallel", + [], + [], + ], + [ + "io_tracer_test", + "trace_replay/io_tracer_test.cc", + "parallel", + [], + [], + ], + [ "iostats_context_test", "monitoring/iostats_context_test.cc", - "serial", + "parallel", [], [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", - "serial", + "parallel", [], [], ], [ "listener_test", "db/listener_test.cc", - "serial", + "parallel", [], [], ], [ "log_test", "db/log_test.cc", - "serial", + "parallel", [], [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -1122,114 +1776,128 @@ [], ], [ + "memory_allocator_test", + "memory/memory_allocator_test.cc", + "parallel", + [], + [], + ], + [ "memory_test", "utilities/memory/memory_test.cc", - "serial", + "parallel", [], [], ], [ "memtable_list_test", "db/memtable_list_test.cc", - "serial", + "parallel", [], [], ], [ "merge_helper_test", "db/merge_helper_test.cc", - "serial", + "parallel", [], [], ], [ "merge_test", "db/merge_test.cc", - "serial", + "parallel", [], [], ], [ "merger_test", "table/merger_test.cc", - "serial", + "parallel", [], [], ], [ "mock_env_test", "env/mock_env_test.cc", - "serial", + "parallel", [], [], ], [ "object_registry_test", 
"utilities/object_registry_test.cc", - "serial", + "parallel", [], [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", - "serial", + "parallel", [], [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", - "serial", + "parallel", [], [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", - "serial", + "parallel", [], [], ], [ "options_file_test", "db/options_file_test.cc", - "serial", + "parallel", [], [], ], [ "options_settable_test", "options/options_settable_test.cc", - "serial", + "parallel", [], [], ], [ "options_test", "options/options_test.cc", - "serial", + "parallel", [], [], ], [ "options_util_test", "utilities/options/options_util_test.cc", - "serial", + "parallel", [], [], ], [ "partitioned_filter_block_test", "table/block_based/partitioned_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "perf_context_test", "db/perf_context_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "periodic_work_scheduler_test", + "db/periodic_work_scheduler_test.cc", + "parallel", [], [], ], @@ -1243,133 +1911,168 @@ [ "plain_table_db_test", "db/plain_table_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "point_lock_manager_test", + "utilities/transactions/lock/point/point_lock_manager_test.cc", + "parallel", + [], + [], + ], + [ + "prefetch_test", + "file/prefetch_test.cc", + "parallel", [], [], ], [ "prefix_test", "db/prefix_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "random_access_file_reader_test", + "file/random_access_file_reader_test.cc", + "parallel", [], [], ], [ "random_test", "util/random_test.cc", - "serial", + "parallel", [], [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "range_locking_test", + "utilities/transactions/lock/range/range_locking_test.cc", + "parallel", [], [], ], [ "range_tombstone_fragmenter_test", 
"db/range_tombstone_fragmenter_test.cc", - "serial", + "parallel", [], [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", - "serial", + "parallel", [], [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", - "serial", + "parallel", [], [], ], [ "repair_test", "db/repair_test.cc", - "serial", + "parallel", [], [], ], [ "repeatable_thread_test", "util/repeatable_thread_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "ribbon_test", + "util/ribbon_test.cc", + "parallel", [], [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", - "serial", + "parallel", [], [], ], [ "skiplist_test", "memtable/skiplist_test.cc", - "serial", + "parallel", [], [], ], [ "slice_test", "util/slice_test.cc", - "serial", + "parallel", [], [], ], [ "slice_transform_test", "util/slice_transform_test.cc", - "serial", + "parallel", [], [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", - "serial", + "parallel", [], [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "statistics_test", "monitoring/statistics_test.cc", - "serial", + "parallel", [], [], ], [ "stats_history_test", "monitoring/stats_history_test.cc", - "serial", + "parallel", [], [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", - "serial", + "parallel", [], [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", - "serial", + "parallel", [], [], ], @@ -1381,30 +2084,44 @@ [], ], [ + "testutil_test", + "test_util/testutil_test.cc", + "parallel", + [], + [], + ], + [ "thread_list_test", "util/thread_list_test.cc", - "serial", + "parallel", [], [], ], [ "thread_local_test", "util/thread_local_test.cc", - "serial", + "parallel", [], [], ], [ "timer_queue_test", "util/timer_queue_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "timer_test", + "util/timer_test.cc", + "parallel", [], [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", - "serial", + 
"parallel", [], [], ], @@ -1418,77 +2135,84 @@ [ "ttl_test", "utilities/ttl/ttl_test.cc", - "serial", + "parallel", [], [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", - "serial", + "parallel", [], [], ], [ "version_builder_test", "db/version_builder_test.cc", - "serial", + "parallel", [], [], ], [ "version_edit_test", "db/version_edit_test.cc", - "serial", + "parallel", [], [], ], [ "version_set_test", "db/version_set_test.cc", - "serial", + "parallel", [], [], ], [ "wal_manager_test", "db/wal_manager_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "work_queue_test", + "util/work_queue_test.cc", + "parallel", [], [], ], [ "write_batch_test", "db/write_batch_test.cc", - "serial", + "parallel", [], [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", - "serial", + "parallel", [], [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", - "serial", + "parallel", [], [], ], [ "write_callback_test", "db/write_callback_test.cc", - "serial", + "parallel", [], [], ], [ "write_controller_test", "db/write_controller_test.cc", - "serial", + "parallel", [], [], ], @@ -1512,18 +2236,18 @@ # Do not build the tests in opt mode, since SyncPoint and other test code # will not be included. 
[ - test_binary( - extra_compiler_flags = extra_compiler_flags, - extra_deps = extra_deps, - parallelism = parallelism, - rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, - rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, - rocksdb_os_deps = ROCKSDB_OS_DEPS, - rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - test_cc = test_cc, - test_name = test_name, + cpp_unittest( + name = test_name, + srcs = [test_cc], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"] + extra_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS + [ + ("googletest", None, "gtest"), + ], ) for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/USERS.md mariadb-10.11.13/storage/rocksdb/rocksdb/USERS.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/USERS.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/USERS.md 2025-05-19 16:14:27.000000000 +0000 @@ -26,6 +26,9 @@ ## Yahoo Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights +## Baidu +[Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata. + ## CockroachDB CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. 
Check out their github: https://github.com/cockroachdb/cockroach @@ -44,7 +47,7 @@ Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters. Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf -## Santanader UK/Cloudera Profession Services +## Santander UK/Cloudera Profession Services Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/ ## Airbnb @@ -67,7 +70,7 @@ [VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed. ## quasardb -[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. +[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. quasardb uses a heavily tuned RocksDB as its persistence layer. ## Netflix @@ -86,7 +89,7 @@ [Uber](http://eng.uber.com/cherami/) uses RocksDB as a durable and scalable task queue. ## 360 Pika -[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been widely used in many company +[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been used in many companies. ## LzLabs LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data. 
@@ -96,13 +99,28 @@ ## IOTA Foundation [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. - + ## Avrio Project [Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances and data and other blockchain-releated data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions. - + ## Crux [Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability. ## Nebula Graph - [Nebula Graph](https://github.com/vesoft-inc/nebula) is a distributed, scalable, lightning-fast, open source graph database capable of hosting super large scale graphs with dozens of billions of vertices (nodes) and trillions of edges, with milliseconds of latency. + +## YugabyteDB +[YugabyteDB](https://www.yugabyte.com/) is an open source, high performance, distributed SQL database that uses RocksDB as its storage layer. For more information, please see https://github.com/yugabyte/yugabyte-db/. + +## ArangoDB +[ArangoDB](https://www.arangodb.com/) is a native multi-model database with flexible data models for documents, graphs, and key-values, for building high performance applications using a convenient SQL-like query language or JavaScript extensions. It uses RocksDB as its storage engine. + +## Milvus +[Milvus](https://milvus.io/) is an open source vector database for unstructured data. It uses RocksDB not only as one of the supported kv storage engines, but also as a message queue. 
+ +## Kafka +[Kafka](https://kafka.apache.org/) is an open-source distributed event streaming platform, it uses RocksDB to store state in Kafka Streams: https://www.confluent.io/blog/how-to-tune-rocksdb-kafka-streams-state-stores-performance/. + +## Others +More databases using RocksDB can be found at [dbdb.io](https://dbdb.io/browse?embeds=rocksdb). + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/WINDOWS_PORT.md mariadb-10.11.13/storage/rocksdb/rocksdb/WINDOWS_PORT.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/WINDOWS_PORT.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/WINDOWS_PORT.md 2025-05-19 16:14:27.000000000 +0000 @@ -24,7 +24,7 @@ * make all unit test pass both in debug and release builds. * Note: latest introduction of SyncPoint seems to disable running db_test in Release. * make performance on par with published benchmarks accounting for HW differences -* we would like to keep the port code inline with the master branch with no forking +* we would like to keep the port code inline with the main branch with no forking ## Build system We have chosen CMake as a widely accepted build system to build the Windows port. It is very fast and convenient. @@ -66,7 +66,7 @@ Even though Windows provides its own efficient thread-pool implementation we chose to replicate posix logic using `std::thread` primitives. This allows anyone to quickly detect any changes within the posix source code and replicate them within windows env. This has proven to work very well. At the same time for anyone who wishes to replace the built-in thread-pool can do so using RocksDB stackable environments. For disk access we implemented all of the functionality present within the posix_env which includes memory mapped files, random access, rate-limiter support etc. -The `use_os_buffer` flag on Posix platforms currently denotes disabling read-ahead log via `fadvise` mechanism. Windows does not have `fadvise` system call. 
What is more, it implements disk cache in a way that differs from Linux greatly. It’s not an uncommon practice on Windows to perform un-buffered disk access to gain control of the memory consumption. We think that in our use case this may also be a good configuration option at the expense of disk throughput. To compensate one may increase the configured in-memory cache size instead. Thus we have chosen `use_os_buffer=false` to disable OS disk buffering for `WinWritableFile` and `WinRandomAccessFile`. The OS imposes restrictions on the alignment of the disk offsets, buffers used and the amount of data that is read/written when accessing files in un-buffered mode. When the option is true, the classes behave in a standard way. This allows to perform writes and reads in cases when un-buffered access does not make sense such as WAL and MANIFEST. +The `use_os_buffer` flag on Posix platforms currently denotes disabling read-ahead log via `fadvise` mechanism. Windows does not have `fadvise` system call. What is more, it implements disk cache in a way that differs from Linux greatly. It's not an uncommon practice on Windows to perform un-buffered disk access to gain control of the memory consumption. We think that in our use case this may also be a good configuration option at the expense of disk throughput. To compensate one may increase the configured in-memory cache size instead. Thus we have chosen `use_os_buffer=false` to disable OS disk buffering for `WinWritableFile` and `WinRandomAccessFile`. The OS imposes restrictions on the alignment of the disk offsets, buffers used and the amount of data that is read/written when accessing files in un-buffered mode. When the option is true, the classes behave in a standard way. This allows to perform writes and reads in cases when un-buffered access does not make sense such as WAL and MANIFEST. 
We have replaced `pread/pwrite` with `WriteFile/ReadFile` with `OVERLAPPED` structure so we can atomically seek to the position of the disk operation but still perform the operation synchronously. Thus we able to emulate that functionality of `pread/pwrite` reasonably well. The only difference is that the file pointer is not returned to its original position but that hardly matters given the random nature of access. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/appveyor.yml mariadb-10.11.13/storage/rocksdb/rocksdb/appveyor.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/appveyor.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/appveyor.yml 2025-05-19 16:14:27.000000000 +0000 @@ -1,6 +1,6 @@ version: 1.0.{build} -image: Visual Studio 2017 +image: Visual Studio 2019 environment: JAVA_HOME: C:\Program Files\Java\jdk1.8.0 @@ -21,9 +21,6 @@ - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 CMAKE_GENERATOR: Visual Studio 14 Win64 DEV_ENV: C:\Program Files (x86)\Microsoft Visual Studio 14.0\Common7\IDE\devenv.com - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - CMAKE_GENERATOR: Visual Studio 15 Win64 - DEV_ENV: C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com install: - md %THIRDPARTY_HOME% @@ -34,7 +31,8 @@ - cd snappy-1.1.7 - mkdir build - cd build - - cmake -G "%CMAKE_GENERATOR%" .. + - if DEFINED CMAKE_PLATEFORM_NAME (set "PLATEFORM_OPT=-A %CMAKE_PLATEFORM_NAME%") + - cmake .. -G "%CMAKE_GENERATOR%" %PLATEFORM_OPT% - msbuild Snappy.sln /p:Configuration=Debug /p:Platform=x64 - msbuild Snappy.sln /p:Configuration=Release /p:Platform=x64 - echo "Building LZ4 dependency..." @@ -57,7 +55,8 @@ before_build: - md %APPVEYOR_BUILD_FOLDER%\build - cd %APPVEYOR_BUILD_FOLDER%\build - - cmake -G "%CMAKE_GENERATOR%" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 .. 
+ - if DEFINED CMAKE_PLATEFORM_NAME (set "PLATEFORM_OPT=-A %CMAKE_PLATEFORM_NAME%") + - cmake .. -G "%CMAKE_GENERATOR%" %PLATEFORM_OPT% %CMAKE_OPT% -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 -DWITH_ALL_TESTS=0 - cd .. build: @@ -68,7 +67,7 @@ test: test_script: - - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test,db_merge_operand_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,env_basic_test -Concurrency 8 on_failure: - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py 2025-05-19 16:14:27.000000000 +0000 @@ -20,14 +20,14 @@ # User can pass extra dependencies as a JSON object via command line, and this # script can include these dependencies in the generate TARGETS file. # Usage: -# $python buckifier/buckify_rocksdb.py +# $python3 buckifier/buckify_rocksdb.py # (This generates a TARGET file without user-specified dependency for unit # tests.) 
-# $python buckifier/buckify_rocksdb.py \ -# '{"fake": { \ -# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ -# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ -# } \ +# $python3 buckifier/buckify_rocksdb.py \ +# '{"fake": { +# "extra_deps": [":test_dep", "//fakes/module:mock1"], +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"] +# } # }' # (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB # unit tests, and will use the extra_compiler_flags to compile the unit test @@ -48,8 +48,8 @@ if '=' in line: current_src = line.split('=')[0].strip() src_files[current_src] = [] - elif '.cc' in line: - src_path = line.split('.cc')[0].strip() + '.cc' + elif '.c' in line: + src_path = line.split('\\')[0].strip() src_files[current_src].append(src_path) return src_files @@ -69,45 +69,28 @@ return cc_files -# Get tests from Makefile -def get_tests(repo_path): +# Get non_parallel tests from Makefile +def get_non_parallel_tests(repo_path): Makefile = repo_path + "/Makefile" - # Dictionary TEST_NAME => IS_PARALLEL - tests = {} + s = set({}) - found_tests = False + found_non_parallel_tests = False for line in open(Makefile): line = line.strip() - if line.startswith("TESTS ="): - found_tests = True - elif found_tests: + if line.startswith("NON_PARALLEL_TEST ="): + found_non_parallel_tests = True + elif found_non_parallel_tests: if line.endswith("\\"): # remove the trailing \ line = line[:-1] line = line.strip() - tests[line] = False + s.add(line) else: - # we consumed all the tests + # we consumed all the non_parallel tests break - found_parallel_tests = False - for line in open(Makefile): - line = line.strip() - if line.startswith("PARALLEL_TEST ="): - found_parallel_tests = True - elif found_parallel_tests: - if line.endswith("\\"): - # remove the trailing \ - line = line[:-1] - line = line.strip() - tests[line] = True - else: - # we consumed all the parallel tests - break - - return tests - + return s # Parse extra dependencies passed by user 
from command line def get_dependencies(): @@ -140,18 +123,38 @@ src_mk = parse_src_mk(repo_path) # get all .cc files cc_files = get_cc_files(repo_path) - # get tests from Makefile - tests = get_tests(repo_path) + # get non_parallel tests from Makefile + non_parallel_tests = get_non_parallel_tests(repo_path) - if src_mk is None or cc_files is None or tests is None: + if src_mk is None or cc_files is None or non_parallel_tests is None: return False - TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path) + extra_argv = "" + if len(sys.argv) >= 2: + # Heuristically quote and canonicalize whitespace for inclusion + # in how the file was generated. + extra_argv = " '{0}'".format(" ".join(sys.argv[1].split())) + + TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv) + # rocksdb_lib TARGETS.add_library( "rocksdb_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"]) + # rocksdb_whole_archive_lib + TARGETS.add_library( + "rocksdb_whole_archive_lib", + src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + + src_mk["TOOL_LIB_SOURCES"], + deps=None, + headers=None, + extra_external_deps="", + link_whole=True) # rocksdb_test_lib TARGETS.add_library( "rocksdb_test_lib", @@ -159,7 +162,10 @@ src_mk.get("TEST_LIB_SOURCES", []) + src_mk.get("EXP_LIB_SOURCES", []) + src_mk.get("ANALYZER_LIB_SOURCES", []), - [":rocksdb_lib"]) + [":rocksdb_lib"], + extra_external_deps=""" + [ + ("googletest", None, "gtest"), + ]""") # rocksdb_tools_lib TARGETS.add_library( "rocksdb_tools_lib", @@ -167,41 +173,56 @@ src_mk.get("ANALYZER_LIB_SOURCES", []) + ["test_util/testutil.cc"], [":rocksdb_lib"]) - # rocksdb_stress_lib + # rocksdb_cache_bench_tools_lib TARGETS.add_library( + "rocksdb_cache_bench_tools_lib", + src_mk.get("CACHE_BENCH_LIB_SOURCES", []), + [":rocksdb_lib"]) + # 
rocksdb_stress_lib + TARGETS.add_rocksdb_library( "rocksdb_stress_lib", src_mk.get("ANALYZER_LIB_SOURCES", []) + src_mk.get('STRESS_LIB_SOURCES', []) - + ["test_util/testutil.cc"], - [":rocksdb_lib"]) + + ["test_util/testutil.cc"]) + + print("Extra dependencies:\n{0}".format(json.dumps(deps_map))) + + # Dictionary test executable name -> relative source file path + test_source_map = {} + print(src_mk) + + # c_test.c is added through TARGETS.add_c_test(). If there + # are more than one .c test file, we need to extend + # TARGETS.add_c_test() to include other C tests too. + for test_src in src_mk.get("TEST_MAIN_SOURCES_C", []): + if test_src != 'db/c_test.c': + print("Don't know how to deal with " + test_src) + return False + TARGETS.add_c_test() + + for test_src in src_mk.get("TEST_MAIN_SOURCES", []): + test = test_src.split('.c')[0].strip().split('/')[-1].strip() + test_source_map[test] = test_src + print("" + test + " " + test_src) - print("Extra dependencies:\n{0}".format(str(deps_map))) - # test for every test we found in the Makefile for target_alias, deps in deps_map.items(): - for test in sorted(tests): - match_src = [src for src in cc_files if ("/%s.c" % test) in src] - if len(match_src) == 0: - print(ColorString.warning("Cannot find .cc file for %s" % test)) - continue - elif len(match_src) > 1: - print(ColorString.warning("Found more than one .cc for %s" % test)) - print(match_src) + for test, test_src in sorted(test_source_map.items()): + if len(test) == 0: + print(ColorString.warning("Failed to get test name for %s" % test_src)) continue - assert(len(match_src) == 1) - is_parallel = tests[test] test_target_name = \ test if not target_alias else test + "_" + target_alias TARGETS.register_test( test_target_name, - match_src[0], - is_parallel, - deps['extra_deps'], - deps['extra_compiler_flags']) + test_src, + test not in non_parallel_tests, + json.dumps(deps['extra_deps']), + json.dumps(deps['extra_compiler_flags'])) if test in _EXPORTED_TEST_LIBS: 
test_library = "%s_lib" % test_target_name - TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) + TARGETS.add_library(test_library, [test_src], [":rocksdb_test_lib"]) TARGETS.flush_tests() print(ColorString.info("Generated TARGETS Summary:")) @@ -220,6 +241,7 @@ return rocksdb_path + def exit_with_error(msg): print(ColorString.error(msg)) sys.exit(1) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# If clang_format_diff.py command is not specfied, we assume we are able to +# access directly without any path. + +TGT_DIFF=`git diff TARGETS | head -n 1` + +if [ ! -z "$TGT_DIFF" ] +then + echo "TARGETS file has uncommitted changes. Skip this check." + exit 0 +fi + +echo Backup original TARGETS file. + +cp TARGETS TARGETS.bkp + +${PYTHON:-python3} buckifier/buckify_rocksdb.py + +TGT_DIFF=`git diff TARGETS | head -n 1` + +if [ -z "$TGT_DIFF" ] +then + mv TARGETS.bkp TARGETS + exit 0 +else + echo "Please run '${PYTHON:-python3} buckifier/buckify_rocksdb.py' to update TARGETS file." + echo "Do not manually update TARGETS file." 
+ ${PYTHON:-python3} --version + mv TARGETS.bkp TARGETS + exit 1 +fi diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_builder.py mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_builder.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_builder.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_builder.py 2025-05-19 16:14:27.000000000 +0000 @@ -25,10 +25,12 @@ class TARGETSBuilder(object): - def __init__(self, path): + def __init__(self, path, extra_argv): self.path = path - self.targets_file = open(path, 'w') - self.targets_file.write(targets_cfg.rocksdb_target_header) + self.targets_file = open(path, 'wb') + header = targets_cfg.rocksdb_target_header_template.format( + extra_argv=extra_argv) + self.targets_file.write(header.encode("utf-8")) self.total_lib = 0 self.total_bin = 0 self.total_test = 0 @@ -37,26 +39,68 @@ def __del__(self): self.targets_file.close() - def add_library(self, name, srcs, deps=None, headers=None): + def add_library(self, name, srcs, deps=None, headers=None, + extra_external_deps="", link_whole=False): headers_attr_prefix = "" if headers is None: headers_attr_prefix = "auto_" headers = "AutoHeaders.RECURSIVE_GLOB" + else: + headers = "[" + pretty_list(headers) + "]" self.targets_file.write(targets_cfg.library_template.format( name=name, srcs=pretty_list(srcs), headers_attr_prefix=headers_attr_prefix, headers=headers, - deps=pretty_list(deps))) + deps=pretty_list(deps), + extra_external_deps=extra_external_deps, + link_whole=link_whole).encode("utf-8")) + self.total_lib = self.total_lib + 1 + + def add_rocksdb_library(self, name, srcs, headers=None): + headers_attr_prefix = "" + if headers is None: + headers_attr_prefix = "auto_" + headers = "AutoHeaders.RECURSIVE_GLOB" + else: + headers = "[" + pretty_list(headers) + "]" + self.targets_file.write(targets_cfg.rocksdb_library_template.format( + name=name, + srcs=pretty_list(srcs), + 
headers_attr_prefix=headers_attr_prefix, + headers=headers).encode("utf-8")) self.total_lib = self.total_lib + 1 def add_binary(self, name, srcs, deps=None): - self.targets_file.write(targets_cfg.binary_template % ( - name, - pretty_list(srcs), - pretty_list(deps))) + self.targets_file.write(targets_cfg.binary_template.format( + name=name, + srcs=pretty_list(srcs), + deps=pretty_list(deps)).encode("utf-8")) self.total_bin = self.total_bin + 1 + def add_c_test(self): + self.targets_file.write(b""" +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"], +) if not is_opt_mode else None + +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None +""") + def register_test(self, test_name, src, @@ -76,5 +120,5 @@ self.total_test = self.total_test + 1 def flush_tests(self): - self.targets_file.write(targets_cfg.unittests_template % self.tests_cfg) + self.targets_file.write(targets_cfg.unittests_template.format(tests=self.tests_cfg).encode("utf-8")) self.tests_cfg = "" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_cfg.py mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_cfg.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_cfg.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_cfg.py 2025-05-19 16:14:27.000000000 +0000 @@ -4,7 +4,9 @@ from __future__ import print_function from __future__ import unicode_literals -rocksdb_target_header = """# This file \100generated by `python buckifier/buckify_rocksdb.py` +rocksdb_target_header_template = \ + 
"""# This file \100generated by: +#$ python3 buckifier/buckify_rocksdb.py{extra_argv} # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. @@ -15,7 +17,7 @@ REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -30,19 +32,25 @@ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), - ("tbb", None), - ("googletest", None, "gtest"), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", - ["third-party//numa:numa", "third-party//liburing:uring"], + [ + "third-party//numa:numa", + "third-party//liburing:uring", + "third-party//tbb:tbb", + ], + ), + ( + "macos", + ["third-party//tbb:tbb"], ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -56,17 +64,33 @@ "-DHAVE_SSE42", "-DLIBURING", "-DNUMA", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", ], ), ( "macos", - ["-DOS_MACOSX"], + [ + "-DOS_MACOSX", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", + ], + ), + ( + "windows", + [ + "-DOS_WIN", + "-DWIN32", + "-D_MBCS", + "-DWIN64", + "-DNOMINMAX", + ], ), ] ROCKSDB_PREPROCESSOR_FLAGS = [ - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", "-DROCKSDB_SUPPORT_THREAD_LOCAL", # Flags to enable libs we include @@ -77,21 +101,22 @@ "-DZSTD", "-DZSTD_STATIC_LINKING_ONLY", "-DGFLAGS=gflags", - "-DTBB", # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] -ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { +ROCKSDB_ARCH_PREPROCESSOR_FLAGS = {{ "x86_64": [ "-DHAVE_PCLMUL", ], -} +}} build_mode = read_config("fbcode", "build_mode") 
@@ -99,21 +124,26 @@ # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. -ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) + +ROCKSDB_LIB_DEPS = [ + ":rocksdb_lib", + ":rocksdb_test_lib", +] if not is_opt_mode else [":rocksdb_lib"] """ @@ -124,22 +154,41 @@ {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = {link_whole}, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [{deps}], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [{deps}], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS{extra_external_deps}, +) +""" + +rocksdb_library_template = """ +cpp_library( + name = "{name}", + srcs = [{srcs}], + {headers_attr_prefix}headers = {headers}, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + exported_deps = ROCKSDB_LIB_DEPS, + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) """ binary_template = """ cpp_binary( - name = "%s", - srcs = 
[%s], + name = "{name}", + srcs = [{srcs}], arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [%s], + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, ) """ @@ -156,24 +205,24 @@ unittests_template = """ # [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ -%s] +{tests}] # Generate a test rule for each entry in ROCKS_TESTS # Do not build the tests in opt mode, since SyncPoint and other test code # will not be included. [ - test_binary( - extra_compiler_flags = extra_compiler_flags, - extra_deps = extra_deps, - parallelism = parallelism, - rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, - rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, - rocksdb_os_deps = ROCKSDB_OS_DEPS, - rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - test_cc = test_cc, - test_name = test_name, + cpp_unittest( + name = test_name, + srcs = [test_cc], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"] + extra_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS + [ + ("googletest", None, "gtest"), + ], ) for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/build_detect_platform mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/build_detect_platform --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/build_detect_platform 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/build_detect_platform 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ # PLATFORM_LDFLAGS Linker flags # JAVA_LDFLAGS Linker flags for RocksDBJava # JAVA_STATIC_LDFLAGS Linker flags for RocksDBJava static build +# JAVAC_ARGS Arguments for javac # PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_LDFLAGS Flags for building shared library # PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library @@ -27,6 +28,7 @@ # -DZSTD if the ZSTD library is present # -DNUMA if the NUMA library is present # -DTBB if the TBB library is present +# -DMEMKIND if the memkind library is present # # Using gflags in rocksdb: # Our project depends on gflags, which requires users to take some extra steps @@ -43,8 +45,13 @@ exit 1 fi -# we depend on C++11 -PLATFORM_CXXFLAGS="-std=c++11" +# we depend on C++11, but should be compatible with newer standards +if [ "$ROCKSDB_CXX_STANDARD" ]; then + PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD" +else + PLATFORM_CXXFLAGS="-std=c++11" +fi + # we currently depend on POSIX platform COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX" @@ -58,8 +65,12 @@ source "$PWD/build_tools/fbcode_config4.8.1.sh" elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_5xx" ]; then source "$PWD/build_tools/fbcode_config.sh" - else + elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007" ]; then source "$PWD/build_tools/fbcode_config_platform007.sh" + elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then + source "$PWD/build_tools/fbcode_config_platform009.sh" + else + source "$PWD/build_tools/fbcode_config_platform009.sh" fi fi @@ -87,6 +98,16 @@ fi fi +if test -z "$AR"; then + if [ -x "$(command -v gcc-ar)" ]; then + AR=gcc-ar + elif [ -x "$(command -v llvm-ar)" ]; then + AR=llvm-ar + else + AR=ar + fi +fi + # Detect OS if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` @@ -149,10 +170,13 @@ else PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS 
-lpthread -lrt" - if test $ROCKSDB_USE_IO_URING; then + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl" + if test -z "$ROCKSDB_USE_IO_URING"; then + ROCKSDB_USE_IO_URING=1 + fi + if test "$ROCKSDB_USE_IO_URING" -ne 0; then # check for liburing - $CXX $CFLAGS -x c++ - -luring -o /dev/null 2>/dev/null </dev/null < int main() { struct io_uring ring; @@ -165,9 +189,6 @@ COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_IOURING_PRESENT" fi fi - if test -z "$USE_FOLLY_DISTRIBUTED_MUTEX"; then - USE_FOLLY_DISTRIBUTED_MUTEX=1 - fi # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) @@ -190,6 +211,17 @@ PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" # PORT_FILES=port/freebsd/freebsd_specific.cc ;; + GNU/kFreeBSD) + PLATFORM=OS_GNU_KFREEBSD + COMMON_FLAGS="$COMMON_FLAGS -DOS_GNU_KFREEBSD" + if [ -z "$USE_CLANG" ]; then + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" + else + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" + fi + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/gnu_kfreebsd/gnu_kfreebsd_specific.cc + ;; NetBSD) PLATFORM=OS_NETBSD COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" @@ -239,15 +271,20 @@ PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" JAVA_LDFLAGS="$PLATFORM_LDFLAGS" JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS" +JAVAC_ARGS="-source 7" if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Cross-compiling; do not try any compilation tests. # Also don't need any compilation tests if compiling on fbcode + if [ "$FBCODE_BUILD" = "true" ]; then + # Enable backtrace on fbcode since the necessary libraries are present + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" + fi true else if ! test $ROCKSDB_DISABLE_FALLOCATE; then # Test whether fallocate is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -263,7 +300,7 @@ if ! 
test $ROCKSDB_DISABLE_SNAPPY; then # Test whether Snappy library is installed # http://code.google.com/p/snappy/ - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -278,30 +315,38 @@ # Test whether gflags library is installed # http://gflags.github.io/gflags/ # check if the namespace is gflags - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + if $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF #include + using namespace GFLAGS_NAMESPACE; int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" - else - # check if namespace is google - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + # check if namespace is gflags + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF + #include + using namespace gflags; + int main() {} +EOF + then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + # check if namespace is google + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF #include using namespace google; int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" - fi + then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=google" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" fi fi if ! test $ROCKSDB_DISABLE_ZLIB; then # Test whether zlib library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -314,7 +359,7 @@ if ! test $ROCKSDB_DISABLE_BZIP; then # Test whether bzip library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -327,7 +372,7 @@ if ! 
test $ROCKSDB_DISABLE_LZ4; then # Test whether lz4 library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() {} @@ -341,7 +386,7 @@ if ! test $ROCKSDB_DISABLE_ZSTD; then # Test whether zstd library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -354,7 +399,7 @@ if ! test $ROCKSDB_DISABLE_NUMA; then # Test whether numa is available - $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null </dev/null < #include int main() {} @@ -368,7 +413,7 @@ if ! test $ROCKSDB_DISABLE_TBB; then # Test whether tbb is available - $CXX $CFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null </dev/null < int main() {} EOF @@ -381,7 +426,7 @@ if ! test $ROCKSDB_DISABLE_JEMALLOC; then # Test whether jemalloc is available - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -ljemalloc \ 2>/dev/null; then # This will enable some preprocessor identifiers in the Makefile JEMALLOC=1 @@ -402,7 +447,7 @@ fi if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then # jemalloc is not available. Let's try tcmalloc - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o \ -ltcmalloc 2>/dev/null; then PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc" JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc" @@ -411,7 +456,7 @@ if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then # Test whether malloc_usable_size is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { size_t res = malloc_usable_size(0); @@ -424,9 +469,25 @@ fi fi + if ! test $ROCKSDB_DISABLE_MEMKIND; then + # Test whether memkind library is installed + $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -lmemkind -x c++ - -o test.o 2>/dev/null < + int main() { + memkind_malloc(MEMKIND_DAX_KMEM, 1024); + return 0; + } +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DMEMKIND" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lmemkind" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lmemkind" + fi + fi + if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; @@ -441,7 +502,7 @@ if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -453,7 +514,7 @@ COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" else # Test whether execinfo library is installed - $CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -470,7 +531,7 @@ if ! test $ROCKSDB_DISABLE_PG; then # Test if -pg is supported - $CXX $CFLAGS -pg -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null </dev/null < int main() { int fd = open("/dev/null", 0); @@ -496,7 +557,7 @@ if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then # Test whether sched_getcpu is supported - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int cpuid = sched_getcpu(); @@ -508,9 +569,23 @@ fi fi + if ! test $ROCKSDB_DISABLE_AUXV_GETAUXVAL; then + # Test whether getauxval is supported + $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null < + int main() { + uint64_t auxv = getauxval(AT_HWCAP); + (void)auxv; + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_AUXV_GETAUXVAL_PRESENT" + fi + fi + if ! test $ROCKSDB_DISABLE_ALIGNED_NEW; then # Test whether c++17 aligned-new is supported - $CXX $PLATFORM_CXXFLAGS -faligned-new -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbenchmark" + fi + fi fi # TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning. 
-# -Wshorten-64-to-32 breaks compilation on FreeBSD i386 -if ! [ "$TARGET_OS" = FreeBSD -a "$TARGET_ARCHITECTURE" = i386 ]; then +# -Wshorten-64-to-32 breaks compilation on FreeBSD aarch64 and i386 +if ! { [ "$TARGET_OS" = FreeBSD ] && [ "$TARGET_ARCHITECTURE" = arm64 -o "$TARGET_ARCHITECTURE" = i386 ]; }; then # Test whether -Wshorten-64-to-32 is available - $CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null </dev/null </dev/null; then + COMMON_FLAGS="$COMMON_FLAGS -march=native " + else + COMMON_FLAGS="$COMMON_FLAGS -march=z196 " + fi + COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then @@ -575,6 +666,40 @@ if test "$USE_SSE"; then TRY_SSE_ETC="1" fi + + if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then + COMMON_FLAGS="$COMMON_FLAGS -march=z196 " + fi + + if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then + # For portability compile for macOS 10.12 (2016) or newer + COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.12" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.12" + # -mmacosx-version-min must come first here. + PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.12 $PLATFORM_SHARED_LDFLAGS" + PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.12" + JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.12" + JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" + JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" + JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" + fi +fi + +if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then + # check for GNU libc on ppc64 + $CXX -x c++ - -o /dev/null 2>/dev/null < + #include + #include + + int main(int argc, char *argv[]) { + printf("GNU libc version: %s\n", gnu_get_libc_version()); + return 0; + } +EOF + if [ "$?" 
!= 0 ]; then + PPC_LIBC_IS_GNU=0 + fi fi if test "$TRY_SSE_ETC"; then @@ -584,14 +709,21 @@ # It doesn't even really check that your current CPU is compatible. # # SSE4.2 available since nehalem, ca. 2008-2010 + # Includes POPCNT for BitsSetToOne, BitParity TRY_SSE42="-msse4.2" # PCLMUL available since westmere, ca. 2010-2011 TRY_PCLMUL="-mpclmul" # AVX2 available since haswell, ca. 2013-2015 TRY_AVX2="-mavx2" + # BMI available since haswell, ca. 2013-2015 + # Primarily for TZCNT for CountTrailingZeroBits + TRY_BMI="-mbmi" + # LZCNT available since haswell, ca. 2013-2015 + # For FloorLog2 + TRY_LZCNT="-mlzcnt" fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -605,7 +737,7 @@ echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -622,7 +754,7 @@ echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -637,7 +769,35 @@ echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < + #include + int main(int argc, char *argv[]) { + (void)argv; + return (int)_tzcnt_u64((uint64_t)argc); + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI" +elif test "$USE_SSE"; then + echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null < + #include + int main(int argc, char *argv[]) { + (void)argv; + return (int)_lzcnt_u64((uint64_t)argc); + } +EOF +if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT" +elif test "$USE_SSE"; then + echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null < int main() { uint64_t a = 0xffffFFFFffffFFFF; @@ -654,7 +814,7 @@ # succeed because the cross-compiler flags are added by the Makefile, not this # script. if [ "$PLATFORM" != IOS ]; then - $CXX $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null </dev/null + $CXX $COMMON_FLAGS $PLATFORM_SHARED_LDFLAGS test_dl.o -o test.o 2>/dev/null if [ "$?" = 0 ]; then EXEC_LDFLAGS+="-ldl" rm -f test_dl.o @@ -681,6 +842,20 @@ fi fi +# check for F_FULLFSYNC +$CXX $PLATFORM_CXXFALGS -x c++ - -o test.o 2>/dev/null < + int main() { + fcntl(0, F_FULLFSYNC); + return 0; + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DHAVE_FULLFSYNC" +fi + +rm -f test.o test_dl.o + PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" @@ -692,10 +867,16 @@ echo "CC=$CC" >> "$OUTPUT" echo "CXX=$CXX" >> "$OUTPUT" +echo "AR=$AR" >> "$OUTPUT" echo "PLATFORM=$PLATFORM" >> "$OUTPUT" echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT" +echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" >> "$OUTPUT" echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT" echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" >> "$OUTPUT" +echo "JAVAC_ARGS=$JAVAC_ARGS" >> "$OUTPUT" echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT" echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT" echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT" @@ -728,3 +909,6 @@ if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" fi 
+if test -n "$PPC_LIBC_IS_GNU"; then + echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT" +fi diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/check-sources.sh mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/check-sources.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/check-sources.sh 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/check-sources.sh 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# Check for some simple mistakes that should prevent commit or push + +BAD="" + +git grep 'namespace rocksdb' -- '*.[ch]*' +if [ "$?" != "1" ]; then + echo "^^^^^ Do not hardcode namespace rocksdb. Use ROCKSDB_NAMESPACE" + BAD=1 +fi + +git grep -i 'nocommit' -- ':!build_tools/check-sources.sh' +if [ "$?" != "1" ]; then + echo "^^^^^ Code was not intended to be committed" + BAD=1 +fi + +git grep ' /dev/null -then - echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!" - echo "You can download clang-format-diff.py by running: " - echo " curl --location http://goo.gl/iUW1u2 -o ${CLANG_FORMAT_DIFF}" - echo "You can download clang-format by running:" - echo " brew install clang-format" - echo " Or" - echo " apt install clang-format" - echo " This might work too:" - echo " yum install git-clang-format" - echo "Then, move both files (i.e. ${CLANG_FORMAT_DIFF} and clang-format) to some directory within PATH=${PATH}" - echo "and make sure ${CLANG_FORMAT_DIFF} is executable." - exit 128 -fi - -# Check argparse, a library that clang-format-diff.py requires. -python 2>/dev/null << EOF -import argparse -EOF - -if [ "$?" != 0 ] -then - echo "To run clang-format-diff.py, we'll need the library "argparse" to be" - echo "installed. You can try either of the follow ways to install it:" - echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" - echo " 2. 
easy_install argparse (if you have easy_install)" - echo " 3. pip install argparse (if you have pip)" - exit 129 +print_usage () { + echo "Usage:" + echo "format-diff.sh [OPTIONS]" + echo "-c: check only." + echo "-h: print this message." +} + +while getopts ':ch' OPTION; do + case "$OPTION" in + c) + CHECK_ONLY=1 + ;; + h) + print_usage + exit 1 + ;; + ?) + print_usage + exit 1 + ;; + esac +done + +REPO_ROOT="$(git rev-parse --show-toplevel)" + +if [ "$CLANG_FORMAT_DIFF" ]; then + echo "Note: CLANG_FORMAT_DIFF='$CLANG_FORMAT_DIFF'" + # Dry run to confirm dependencies like argparse + if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then + true #Good + else + exit 128 + fi +else + # First try directly executing the possibilities + if clang-format-diff --help &> /dev/null < /dev/null; then + CLANG_FORMAT_DIFF=clang-format-diff + elif clang-format-diff.py --help &> /dev/null < /dev/null; then + CLANG_FORMAT_DIFF=clang-format-diff.py + elif $REPO_ROOT/clang-format-diff.py --help &> /dev/null < /dev/null; then + CLANG_FORMAT_DIFF=$REPO_ROOT/clang-format-diff.py + else + # This probably means we need to directly invoke the interpreter. + # But first find clang-format-diff.py + if [ -f "$REPO_ROOT/clang-format-diff.py" ]; then + CFD_PATH="$REPO_ROOT/clang-format-diff.py" + elif which clang-format-diff.py &> /dev/null; then + CFD_PATH="$(which clang-format-diff.py)" + else + echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!" + echo "You can download clang-format-diff.py by running: " + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." 
+ echo "You can download clang-format by running:" + echo " brew install clang-format" + echo " Or" + echo " apt install clang-format" + echo " This might work too:" + echo " yum install git-clang-format" + echo "Then make sure clang-format is available and executable from \$PATH:" + echo " clang-format --version" + exit 128 + fi + # Check argparse pre-req on interpreter, or it will fail + if echo import argparse | ${PYTHON:-python3}; then + true # Good + else + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 + fi + # Unfortunately, some machines have a Python2 clang-format-diff.py + # installed but only a Python3 interpreter installed. Unfortunately, + # automatic 2to3 migration is insufficient, so suggest downloading latest. + if grep -q "print '" "$CFD_PATH" && \ + ${PYTHON:-python3} --version | grep -q 'ython 3'; then + echo "You have clang-format-diff.py for Python 2 but are using a Python 3" + echo "interpreter (${PYTHON:-python3})." + echo "You can download clang-format-diff.py for Python 3 by running: " + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." 
+ exit 130 + fi + CLANG_FORMAT_DIFF="${PYTHON:-python3} $CFD_PATH" + # This had better work after all those checks + if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then + true #Good + else + exit 128 + fi + fi fi # TODO(kailiu) following work is not complete since we still need to figure @@ -62,31 +122,41 @@ # If there's no uncommitted changes, we assume user are doing post-commit # format check, in which case we'll try to check the modified lines vs. the -# facebook/rocksdb.git master branch. Otherwise, we'll check format of the +# facebook/rocksdb.git main branch. Otherwise, we'll check format of the # uncommitted code only. if [ -z "$uncommitted_code" ] then # Attempt to get name of facebook/rocksdb.git remote. - [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)" + [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(LC_ALL=POSIX LANG=POSIX git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)" # Fall back on 'origin' if that fails [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE=origin - # Use master branch from that remote - [ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/master" + # Use main branch from that remote + [ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/$(LC_ALL=POSIX LANG=POSIX git remote show $FORMAT_REMOTE | sed -n '/HEAD branch/s/.*: //p')" # Get the common ancestor with that remote branch. Everything after that # common ancestor would be considered the contents of a pull request, so # should be relevant for formatting fixes. FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)" # Get the differences diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of changes not yet in $FORMAT_UPSTREAM..." else # Check the format of uncommitted lines, diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of uncommitted changes..." 
fi if [ -z "$diffs" ] then echo "Nothing needs to be reformatted!" exit 0 +elif [ $CHECK_ONLY ] +then + echo "Your change has unformatted code. Please run make format!" + if [ $VERBOSE_CHECK ]; then + clang-format --version + echo "$diffs" + fi + exit 1 fi # Highlight the insertion/deletion from the clang-format-diff.py's output @@ -121,7 +191,7 @@ then git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1 else - git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 + git diff -U0 HEAD | $CLANG_FORMAT_DIFF -i -p 1 fi echo "Files reformatted!" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/gnu_parallel mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/gnu_parallel --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/gnu_parallel 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/gnu_parallel 2025-05-19 16:14:27.000000000 +0000 @@ -1561,6 +1561,7 @@ ::die_bug("Can't dup STDERR: $!"); open $Global::original_stdin, "<&", "STDIN" or ::die_bug("Can't dup STDIN: $!"); + $Global::is_terminal = (-t $Global::original_stderr); } sub enough_file_handles { @@ -1840,12 +1841,17 @@ } } +$opt::min_progress_interval = 0; + sub init_progress { # Uses: # $opt::bar # Returns: # list of computers for progress output $|=1; + if (not $Global::is_terminal) { + $opt::min_progress_interval = 30; + } if($opt::bar) { return("",""); } @@ -1870,6 +1876,9 @@ } my $last_header=""; my $sleep = 0.2; + my $last_left = 1000000000; + my $last_progress_time = 0; + my $ps_reported = 0; do { while($Global::total_running > 0) { debug($Global::total_running, "==", scalar @@ -1880,14 +1889,38 @@ close $job->fh(0,"w"); } } - if($opt::progress) { + # When not connected to terminal, assume CI (e.g. CircleCI). 
In + # that case we want occasional progress output to prevent abort + # due to timeout with no output, but we also need to stop sending + # progress output if there has been no actual progress, so that + # the job can time out appropriately (CirecleCI: 10m) in case of + # a hung test. But without special output, it is extremely + # annoying to diagnose which test is hung, so we add that using + # `ps` below. + if($opt::progress and + ($Global::is_terminal or (time() - $last_progress_time) >= 30)) { my %progress = progress(); if($last_header ne $progress{'header'}) { print $Global::original_stderr "\n", $progress{'header'}, "\n"; $last_header = $progress{'header'}; } - print $Global::original_stderr "\r",$progress{'status'}; - flush $Global::original_stderr; + if ($Global::is_terminal) { + print $Global::original_stderr "\r",$progress{'status'}; + } + if ($last_left > $Global::left) { + if (not $Global::is_terminal) { + print $Global::original_stderr $progress{'status'},"\n"; + } + $last_progress_time = time(); + $ps_reported = 0; + } elsif (not $ps_reported and (time() - $last_progress_time) >= 60) { + # No progress in at least 60 seconds: run ps + print $Global::original_stderr "\n"; + system("ps", "-wf"); + $ps_reported = 1; + } + $last_left = $Global::left; + flush $Global::original_stderr; } if($Global::total_running < $Global::max_jobs_running and not $Global::JobQueue->empty()) { @@ -1921,7 +1954,7 @@ not $Global::start_no_new_jobs and not $Global::JobQueue->empty()); if($opt::progress) { my %progress = progress(); - print $Global::original_stderr "\r", $progress{'status'}, "\n"; + print $Global::original_stderr $opt::progress_sep, $progress{'status'}, "\n"; flush $Global::original_stderr; } } @@ -1954,10 +1987,11 @@ my $eta = ""; my ($status,$header)=("",""); if($opt::eta) { - my($total, $completed, $left, $pctcomplete, $avgtime, $this_eta) = - compute_eta(); - $eta = sprintf("ETA: %ds Left: %d AVG: %.2fs ", - $this_eta, $left, $avgtime); + my($total, 
$completed, $left, $pctcomplete, $avgtime, $this_eta) = + compute_eta(); + $eta = sprintf("ETA: %ds Left: %d AVG: %.2fs ", + $this_eta, $left, $avgtime); + $Global::left = $left; } my $termcols = terminal_columns(); my @workers = sort keys %Global::host; @@ -5801,7 +5835,7 @@ . "-" . $self->seq(); } else { $workdir = $opt::workdir; - # Rsync treats /./ special. We dont want that + # Rsync treats /./ special. We don't want that $workdir =~ s:/\./:/:g; # Remove /./ $workdir =~ s:/+$::; # Remove ending / if any $workdir =~ s:^\./::g; # Remove starting ./ if any diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/make_package.sh mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/make_package.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/make_package.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/make_package.sh 2025-05-19 16:14:27.000000000 +0000 @@ -103,31 +103,26 @@ gem_install fpm make static_lib - make install INSTALL_PATH=package - - cd package - - LIB_DIR=lib - if [[ -z "$ARCH" ]]; then - ARCH=$(getconf LONG_BIT) - fi - if [[ ("$FPM_OUTPUT" = "rpm") && ($ARCH -eq 64) ]]; then - mv lib lib64 - LIB_DIR=lib64 + LIBDIR=/usr/lib + if [[ $FPM_OUTPUT = "rpm" ]]; then + LIBDIR=$(rpm --eval '%_libdir') fi + rm -rf package + make install DESTDIR=package PREFIX=/usr LIBDIR=$LIBDIR + fpm \ -s dir \ -t $FPM_OUTPUT \ + -C package \ -n rocksdb \ -v $1 \ - --prefix /usr \ --url http://rocksdb.org/ \ -m rocksdb@fb.com \ --license BSD \ --vendor Facebook \ --description "RocksDB is an embeddable persistent key-value store for fast storage." 
\ - include $LIB_DIR + usr } # shellcheck disable=SC2068 diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh 2025-05-19 16:14:27.000000000 +0000 @@ -20,26 +20,11 @@ function cleanup { rm -rf $DATA_DIR - rm -f $STAT_FILE.fillseq - rm -f $STAT_FILE.readrandom - rm -f $STAT_FILE.overwrite - rm -f $STAT_FILE.memtablefillreadrandom + rm -f $STAT_FILE.* } trap cleanup EXIT -if [ -z $GIT_BRANCH ]; then - git_br=`git rev-parse --abbrev-ref HEAD` -else - git_br=$(basename $GIT_BRANCH) -fi - -if [ $git_br == "master" ]; then - git_br="" -else - git_br="."$git_br -fi - make release # measure fillseq + fill up the DB for overwrite benchmark @@ -286,12 +271,10 @@ --sync=0 \ --verify_checksum=1 \ --delete_obsolete_files_period_micros=314572800 \ - --max_grandparent_overlap_factor=10 \ --use_plain_table=1 \ --open_files=-1 \ --mmap_read=1 \ --mmap_write=0 \ - --memtablerep=prefix_hash \ --bloom_bits=10 \ --bloom_locality=1 \ --perf_level=0" @@ -378,7 +361,7 @@ echo >&2 "ERROR: Key $key doesn't have a value." 
return fi - curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \ + curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \ --connect-timeout 60 } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator 2025-05-19 16:14:27.000000000 +0000 @@ -3,7 +3,7 @@ # to determine next steps to run # Usage: -# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= rocks_ci.py +# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= WORKINGDIR= rocksdb-lego-determinator # # Input Value # ------------------------------------------------------------------------- @@ -11,7 +11,7 @@ # ONCALL Email address to raise a task on failure # TRIGGER Trigger conditions for email. Valid values are fail, warn, all # SUBSCRIBER Email addresss to add as subscriber for task -# +# WORKINGDIR Working directory # # Report configuration @@ -24,22 +24,22 @@ REPORT_EMAIL=" { - 'type':'email', - 'triggers': [ '$TRIGGER' ], - 'emails':['$EMAIL'] - }," + \"type\":\"email\", + \"triggers\": [ \"$TRIGGER\" ], + \"emails\":[\"$EMAIL\"] + }" fi CREATE_TASK= if [ ! -z $ONCALL ]; then CREATE_TASK=" { - 'type':'task', - 'triggers':[ 'fail' ], - 'priority':0, - 'subscribers':[ '$SUBSCRIBER' ], - 'tags':[ 'rocksdb', 'ci' ], - }," + \"type\":\"task\", + \"triggers\":[ \"fail\" ], + \"priority\":0, + \"subscribers\":[ \"$SUBSCRIBER\" ], + \"tags\":[ \"rocksdb\", \"ci\" ] + }" fi # For now, create the tasks using only the dedicated task creation tool. @@ -47,47 +47,54 @@ REPORT= if [[ ! -z $REPORT_EMAIL || ! 
-z $CREATE_TASK ]]; then - REPORT="'report': [ - $REPORT_EMAIL + REPORT=",\"report\": [ + $REPORT_EMAIL, $CREATE_TASK ]" fi +# Working directory for the following command, default to current directory +WORKING_DIR=. +if [ ! -z $WORKINGDIR ]; then + WORKING_DIR=$WORKINGDIR +fi + # # Helper variables # CLEANUP_ENV=" { - 'name':'Cleanup environment', - 'shell':'rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean', - 'user':'root' + \"name\":\"Cleanup environment\", + \"shell\":\"cd $WORKING_DIR; rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean\", + \"user\":\"root\" }" UPLOAD_DB_DIR=" { - 'name':'Upload database directory', - 'shell':'tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/', - 'user':'root', - 'cleanup':true, - 'provide_artifacts': [ - { - 'name':'rocksdb_db_dir', - 'paths': ['rocksdb_db.tar.gz'], - 'bundle': false, - }, - ], + \"name\":\"Upload database directory\", + \"shell\":\"tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/\", + \"user\":\"root\", + \"cleanup\":true, + \"provide_artifacts\": [ + { + \"name\":\"rocksdb_db_dir\", + \"paths\": [\"rocksdb_db.tar.gz\"], + \"bundle\": false + } + ] }" -# We will eventually set the RATIO to 1, but we want do this -# in steps. RATIO=$(nproc) will make it work as J=1 +# set default RATIO to 1, which sets J=$(nproc) and j=$(nproc) if [ -z $RATIO ]; then - RATIO=$(nproc) + RATIO=1 fi +# Should probably be called PARALLEL_TEST if [ -z $PARALLEL_J ]; then PARALLEL_J="J=$(expr $(nproc) / ${RATIO})" fi +# Should probably be called PARALLEL_MAKE if [ -z $PARALLEL_j ]; then PARALLEL_j="-j$(expr $(nproc) / ${RATIO})" fi @@ -100,18 +107,18 @@ GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1" ASAN="COMPILE_WITH_ASAN=1" CLANG="USE_CLANG=1" -# in gcc-5 there are known problems with TSAN like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090. -# using platform007 gives us gcc-8 or higher which has that bug fixed. 
-TSAN="ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1" +TSAN="COMPILE_WITH_TSAN=1" UBSAN="COMPILE_WITH_UBSAN=1" -TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"' +ASAN_CRASH="ASAN_OPTIONS=disable_coredump=0" +TSAN_CRASH="CRASH_TEST_EXT_ARGS=\\\"--compression_type=zstd --log2_keys_per_lock=22\\\"" NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd" DISABLE_JEMALLOC="DISABLE_JEMALLOC=1" HTTP_PROXY="https_proxy=http://fwdproxy.29.prn1:8080 http_proxy=http://fwdproxy.29.prn1:8080 ftp_proxy=http://fwdproxy.29.prn1:8080" SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-8u60-64/; export PATH=\$JAVA_HOME/bin:\$PATH" -PARSER="'parser':'python build_tools/error_filter.py $1'" +PARSER="\"parser\":\"/usr/bin/env python3 build_tools/error_filter.py $1\"" CONTRUN_NAME="ROCKSDB_CONTRUN_NAME" +SKIP_FORMAT_CHECKS="SKIP_FORMAT_BUCK_CHECKS=1" # This code is getting called under various scenarios. What we care about is to # understand when it's called from nightly contruns because in that case we'll @@ -129,15 +136,15 @@ # DISABLE_COMMANDS="[ { - 'name':'Disable test', - 'oncall':'$ONCALL', - 'steps': [ - { - 'name':'Job disabled. Please contact test owner', - 'shell':'exit 1', - 'user':'root' - }, - ], + \"name\":\"Disable test\", + \"oncall\":\"$ONCALL\", + \"steps\": [ + { + \"name\":\"Job disabled. 
Please contact test owner\", + \"shell\":\"exit 1\", + \"user\":\"root\" + } + ] } ]" @@ -146,18 +153,18 @@ # UNIT_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and test RocksDB debug version\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -167,20 +174,20 @@ # UNIT_TEST_NON_SHM_COMMANDS="[ { - 'name':'Rocksdb Unit Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Unit Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug version', - 'timeout': 86400, - 'shell':'$NON_SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and test RocksDB debug version\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $NON_SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -190,18 +197,18 @@ # RELEASE_BUILD_COMMANDS="[ { - 'name':'Rocksdb Release Build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Release Build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB release', - 'shell':'make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB release\", + \"shell\":\"cd 
$WORKING_DIR; make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -211,18 +218,18 @@ # UNIT_TEST_COMMANDS_481="[ { - 'name':'Rocksdb Unit Test on GCC 4.8.1', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test on GCC 4.8.1\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $GCC_481 $DEBUG make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and test RocksDB debug version\", + \"shell\":\"cd $WORKING_DIR; $SHM $GCC_481 $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -232,18 +239,18 @@ # RELEASE_BUILD_COMMANDS_481="[ { - 'name':'Rocksdb Release on GCC 4.8.1', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Release on GCC 4.8.1\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB release on GCC 4.8.1', - 'shell':'$GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB release on GCC 4.8.1\", + \"shell\":\"cd $WORKING_DIR; $GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -253,18 +260,18 @@ # CLANG_UNIT_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug', - 'shell':'$CLANG $SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL', - 'user':'root', + 
\"name\":\"Build and test RocksDB debug\", + \"shell\":\"cd $WORKING_DIR; $CLANG $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -274,18 +281,18 @@ # CLANG_RELEASE_BUILD_COMMANDS="[ { - 'name':'Rocksdb CLANG Release Build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb CLANG Release Build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB release', - 'shell':'$CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB release\", + \"shell\":\"cd $WORKING_DIR; $CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -295,18 +302,18 @@ # CLANG_ANALYZE_COMMANDS="[ { - 'name':'Rocksdb analyze', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb analyze\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'RocksDB build and analyze', - 'shell':'$CLANG $SHM $DEBUG make $PARALLEL_j analyze || $CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"RocksDB build and analyze\", + \"shell\":\"cd $WORKING_DIR; $CLANG $SHM $DEBUG make $PARALLEL_j analyze || $CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -316,18 +323,18 @@ # CODE_COV_COMMANDS="[ { - 'name':'Rocksdb Unit Test Code Coverage', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test Code Coverage\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build, test and collect code coverage info', - 'shell':'$SHM $DEBUG make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL', - 
'user':'root', + \"name\":\"Build, test and collect code coverage info\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -337,18 +344,18 @@ # UNITY_COMMANDS="[ { - 'name':'Rocksdb Unity', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unity\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build, test unity test', - 'shell':'$SHM $DEBUG V=1 make J=1 unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build, test unity test\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG V=1 make $PARALLELISM unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -358,65 +365,108 @@ # LITE_BUILD_COMMANDS="[ { - 'name':'Rocksdb Lite build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Lite build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB debug version', - 'shell':'make J=1 LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB debug version\", + \"shell\":\"cd $WORKING_DIR; $SKIP_FORMAT_CHECKS make $PARALLELISM LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + } + ] + $REPORT + } +]" + +# +# RocksDB stress/crash test +# +STRESS_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - ], + { + 
\"name\":\"Build and run RocksDB debug crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" # -# Report RocksDB lite binary size to scuba -REPORT_LITE_BINARY_SIZE_COMMANDS="[ +# RocksDB blackbox stress/crash test +# +BLACKBOX_STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Lite Binary Size', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Blackbox Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Report RocksDB Lite binary size to scuba', - 'shell':'tools/report_lite_binary_size.sh', - 'user':'root', + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER }, - ], + { + \"name\":\"Build and run RocksDB debug blackbox crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM blackbox_crash_test || $CONTRUN_NAME=blackbox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } ]" # -# RocksDB stress/crash test +# RocksDB whitebox stress/crash test # -STRESS_CRASH_TEST_COMMANDS="[ +WHITEBOX_STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Stress and Crash Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Whitebox Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', - 'user':'root', + 
\"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, { - 'name':'Build and run RocksDB debug crash tests', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug whitebox crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM whitebox_crash_test || $CONTRUN_NAME=whitebox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -426,27 +476,27 @@ # STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Stress and Crash Test with atomic flush', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Stress and Crash Test with atomic flush\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, { - 'name':'Build and run RocksDB debug crash tests with atomic flush', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug crash tests with atomic flush\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM 
crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -456,27 +506,57 @@ # STRESS_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb Stress and Crash Test with txn', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Stress and Crash Test with txn\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, { - 'name':'Build and run RocksDB debug crash tests with txn', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug crash tests with txn\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB stress/crash test with timestamp +# +STRESS_CRASH_TEST_WITH_TS_COMMANDS="[ + { + \"name\":\"Rocksdb Stress and Crash Test with ts\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", + 
$PARSER + }, + { + \"name\":\"Build and run RocksDB debug crash tests with ts\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM crash_test_with_ts || $CONTRUN_NAME=crash_test_with_ts $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -486,19 +566,19 @@ # because we want to add some randomness to fsync commands WRITE_STRESS_COMMANDS="[ { - 'name':'Rocksdb Write Stress Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Write Stress Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB write stress tests', - 'shell':'make write_stress && python tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB write stress tests\", + \"shell\":\"cd $WORKING_DIR; make write_stress && /usr/bin/env python3 tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER } ], - 'artifacts': [{'name': 'database', 'paths': ['/tmp/rocksdb_write_stress']}], + \"artifacts\": [{\"name\": \"database\", \"paths\": [\"/tmp/rocksdb_write_stress\"]}] $REPORT } ]" @@ -509,18 +589,18 @@ # ASAN_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Test RocksDB debug under ASAN', -'shell':'set -o pipefail && ($SHM $ASAN $DEBUG make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL) |& /usr/facebook/ops/scripts/asan_symbolize.py -d', - 'user':'root', + \"name\":\"Test RocksDB debug under ASAN\", +\"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM 
$ASAN $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER } - ], + ] $REPORT } ]" @@ -530,21 +610,69 @@ # ASAN_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb crash test under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug asan_crash_test', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug asan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB blackbox crash testing under address sanitizer +# +ASAN_BLACKBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb blackbox crash test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug blackbox asan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM blackbox_asan_crash_test || $CONTRUN_NAME=blackbox_asan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB whitebox crash testing under address sanitizer +# +ASAN_WHITEBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb whitebox crash test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + 
\"name\":\"Build and run RocksDB debug whitebox asan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM whitebox_asan_crash_test || $CONTRUN_NAME=whitebox_asan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -554,21 +682,21 @@ # ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test with atomic flush under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with atomic flush under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug asan_crash_test_with_atomic_flush', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug asan_crash_test_with_atomic_flush\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -578,21 +706,21 @@ # ASAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb crash test with txn under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with txn under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug asan_crash_test_with_txn', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL', - 
'user':'root', + \"name\":\"Build and run RocksDB debug asan_crash_test_with_txn\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -602,42 +730,90 @@ # UBSAN_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Test RocksDB debug under UBSAN', - 'shell':'set -o pipefail && $SHM $UBSAN $CLANG $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Test RocksDB debug under UBSAN\", + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $UBSAN $CLANG $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER } - ], + ] $REPORT } ]" # -# RocksDB crash testing under udnefined behavior sanitizer +# RocksDB crash testing under undefined behavior sanitizer # UBSAN_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb crash test under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug ubsan_crash_test', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug ubsan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM 
ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB crash testing under undefined behavior sanitizer +# +UBSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb blackbox crash test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug blackbox ubsan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM blackbox_ubsan_crash_test || $CONTRUN_NAME=blackbox_ubsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB crash testing under undefined behavior sanitizer +# +UBSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb whitebox crash test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug whitebox ubsan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM whitebox_ubsan_crash_test || $CONTRUN_NAME=whitebox_ubsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -647,21 +823,21 @@ # UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test with atomic flush under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with atomic flush under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug ubsan_crash_test_with_atomic_flush', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 
ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug ubsan_crash_test_with_atomic_flush\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -671,21 +847,21 @@ # UBSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb crash test with txn under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with txn under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug ubsan_crash_test_with_txn', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug ubsan_crash_test_with_txn\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -695,20 +871,20 @@ # VALGRIND_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under valgrind', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Unit Test under valgrind\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit tests', - 'timeout': 86400, - 'shell':'$SHM $DEBUG make $PARALLELISM valgrind_test || 
$CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG make $PARALLELISM valgrind_test || $CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -718,20 +894,20 @@ # TSAN_UNIT_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Unit Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit test', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -741,21 +917,69 @@ # TSAN_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Crash Test under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Crash Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Compile and run', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + 
$UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB blackbox crash test under TSAN +# +TSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Blackbox Crash Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM blackbox_crash_test || $CONTRUN_NAME=tsan_blackbox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB whitebox crash test under TSAN +# +TSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Whitebox Crash Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM whitebox_crash_test || $CONTRUN_NAME=tsan_whitebox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -765,21 +989,21 @@ # TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Crash Test with atomic flush under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Crash Test with atomic flush under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Compile and run', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN 
$TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -789,21 +1013,21 @@ # TSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb Crash Test with txn under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Crash Test with txn under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Compile and run', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -818,23 +1042,25 @@ rm -rf /dev/shm/rocksdb mkdir /dev/shm/rocksdb + export https_proxy="fwdproxy:8080" + tools/check_format_compatible.sh } FORMAT_COMPATIBLE_COMMANDS="[ { - 'name':'Rocksdb Format Compatible tests', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Format Compatible tests\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit test\", + \"shell\":\"cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible 
$TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -852,23 +1078,24 @@ mv .tmp.fbcode_config.sh build_tools/fbcode_config.sh cat Makefile | grep -v tools/ldb_test.py > .tmp.Makefile mv .tmp.Makefile Makefile - make $DEBUG J=1 check + export $SKIP_FORMAT_CHECKS + make $DEBUG $PARALLELISM check } NO_COMPRESSION_COMMANDS="[ { - 'name':'Rocksdb No Compression tests', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb No Compression tests\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit test\", + \"shell\":\"cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -878,7 +1105,7 @@ # run_regression() { - time -v bash -vx ./build_tools/regression_build_test.sh $(mktemp -d $WORKSPACE/leveldb.XXXX) $(mktemp leveldb_test_stats.XXXX) + time bash -vx ./build_tools/regression_build_test.sh $(mktemp -d $WORKING_DIR/rocksdb.XXXX) $(mktemp rocksdb_test_stats.XXXX) # ======= report size to ODS ======== @@ -895,6 +1122,7 @@ strip librocksdb.a send_size_to_ods static_lib_stripped $(stat --printf="%s" librocksdb.a) + make clean make -j$(nproc) shared_lib send_size_to_ods shared_lib $(stat --printf="%s" `readlink -f librocksdb.so`) strip `readlink -f librocksdb.so` @@ -907,6 +1135,7 @@ strip librocksdb.a send_size_to_ods static_lib_lite_stripped $(stat --printf="%s" librocksdb.a) + make clean make LITE=1 -j$(nproc) shared_lib send_size_to_ods shared_lib_lite $(stat --printf="%s" `readlink -f librocksdb.so`) strip `readlink -f librocksdb.so` @@ -915,17 +1144,18 @@ REGRESSION_COMMANDS="[ { - 'name':'Rocksdb regression commands', - 
'oncall':'$ONCALL', - 'steps': [ + \"name\":\"Rocksdb regression commands\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Make and run script', - 'shell':'build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Make and run script\", + \"shell\":\"cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -935,18 +1165,52 @@ # JAVA_BUILD_TEST_COMMANDS="[ { - 'name':'Rocksdb Java Build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Java Build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB for Java', - 'shell':'$SETUP_JAVA_ENV; $SHM make rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB for Java\", + \"shell\":\"cd $WORKING_DIR; $SETUP_JAVA_ENV; $SHM make $PARALLELISM rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + } + ] + $REPORT + } +]" + +# +# RocksDB fbcode stress/crash test +# +FBCODE_STRESS_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Fbcode Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + { + \"name\":\"Copy RocksDB code to fbcode repo\", + \"shell\":\"cd internal_repo_rocksdb/repo && git init -b main && git add * && git commit -a -m \\\"Make internal_repo_rocksdb/repo a git repo\\\" && cd ../.. 
&& echo Y | python3 rocks/release_script/release_to_fbcode.py -u internal_repo_rocksdb/repo main || $CONTRUN_NAME=db_stress_fbcode $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - ], + { + \"name\":\"Build RocksDB fbcode stress tests\", + \"shell\":\"cd $WORKING_DIR; buck build @mode/dbg rocks/tools:rocks_db_stress || $CONTRUN_NAME=db_stress_fbcode $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + { + \"name\":\"Run RocksDB whitebox crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; mkdir /dev/shm/rocksdb_fbcode_crash_test && TEST_TMPDIR=\$(mktemp -d --tmpdir=/dev/shm/rocksdb_fbcode_crash_test) python3 rocksdb/src/tools/db_crashtest.py --stress_cmd=buck-out/dbg/gen/rocks/tools/rocks_db_stress -secondary_cache_uri=\\\"$SECONDARY_CACHE_URI\\\" --env_uri=$ENV_URI $EXTRA_DB_STRESS_ARGS -logtostderr=false $TEST_TYPE || $CONTRUN_NAME=db_stress_fbcode $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + } + ] $REPORT } ]" @@ -986,18 +1250,24 @@ lite) echo $LITE_BUILD_COMMANDS ;; - report_lite_binary_size) - echo $REPORT_LITE_BINARY_SIZE_COMMANDS - ;; stress_crash) echo $STRESS_CRASH_TEST_COMMANDS ;; + blackbox_stress_crash) + echo $BLACKBOX_STRESS_CRASH_TEST_COMMANDS + ;; + whitebox_stress_crash) + echo $WHITEBOX_STRESS_CRASH_TEST_COMMANDS + ;; stress_crash_with_atomic_flush) echo $STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; stress_crash_with_txn) echo $STRESS_CRASH_TEST_WITH_TXN_COMMANDS ;; + stress_crash_with_ts) + echo $STRESS_CRASH_TEST_WITH_TS_COMMANDS + ;; write_stress) echo $WRITE_STRESS_COMMANDS ;; @@ -1007,6 +1277,12 @@ asan_crash) echo $ASAN_CRASH_TEST_COMMANDS ;; + blackbox_asan_crash) + echo $ASAN_BLACKBOX_CRASH_TEST_COMMANDS + ;; + whitebox_asan_crash) + echo $ASAN_WHITEBOX_CRASH_TEST_COMMANDS + ;; asan_crash_with_atomic_flush) echo $ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; @@ -1019,6 +1295,12 @@ ubsan_crash) echo $UBSAN_CRASH_TEST_COMMANDS ;; + blackbox_ubsan_crash) + echo $UBSAN_BLACKBOX_CRASH_TEST_COMMANDS + 
;; + whitebox_ubsan_crash) + echo $UBSAN_WHITEBOX_CRASH_TEST_COMMANDS + ;; ubsan_crash_with_atomic_flush) echo $UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; @@ -1034,6 +1316,12 @@ tsan_crash) echo $TSAN_CRASH_TEST_COMMANDS ;; + blackbox_tsan_crash) + echo $TSAN_BLACKBOX_CRASH_TEST_COMMANDS + ;; + whitebox_tsan_crash) + echo $TSAN_WHITEBOX_CRASH_TEST_COMMANDS + ;; tsan_crash_with_atomic_flush) echo $TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; @@ -1056,11 +1344,18 @@ echo $REGRESSION_COMMANDS ;; run_regression) + set -e run_regression + set +e ;; java_build) echo $JAVA_BUILD_TEST_COMMANDS ;; + fbcode_stress_crash) + set -f + echo $FBCODE_STRESS_CRASH_TEST_COMMANDS + set +f + ;; *) echo "Invalid determinator command" exit 1 diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ if($WorkFolder -eq "") { - # If TEST_TMPDIR is set use it + # If TEST_TMPDIR is set use it [string]$var = $Env:TEST_TMPDIR if($var -eq "") { $WorkFolder = -Join($RootFolder, "\db_tests\") @@ -93,7 +93,7 @@ if($ExcludeCases -ne "") { Write-Host "ExcludeCases: $ExcludeCases" $l = $ExcludeCases -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeCasesSet.Add($t) | Out-Null } } @@ -102,7 +102,7 @@ if($ExcludeExes -ne "") { Write-Host "ExcludeExe: $ExcludeExes" $l = $ExcludeExes -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeExesSet.Add($t) | Out-Null } } @@ -118,6 +118,10 @@ # MultiThreaded/MultiThreadedDBTest. # MultiThreaded/0 # GetParam() = 0 # MultiThreaded/1 # GetParam() = 1 +# RibbonTypeParamTest/0. 
# TypeParam = struct DefaultTypesAndSettings +# CompactnessAndBacktrackAndFpRate +# Extremes +# FindOccupancyForSuccessRate # # into this: # @@ -125,6 +129,9 @@ # DBTest.WriteEmptyBatch # MultiThreaded/MultiThreadedDBTest.MultiThreaded/0 # MultiThreaded/MultiThreadedDBTest.MultiThreaded/1 +# RibbonTypeParamTest/0.CompactnessAndBacktrackAndFpRate +# RibbonTypeParamTest/0.Extremes +# RibbonTypeParamTest/0.FindOccupancyForSuccessRate # # Output into the parameter in a form TestName -> Log File Name function ExtractTestCases([string]$GTestExe, $HashTable) { @@ -138,6 +145,8 @@ ForEach( $l in $Tests) { + # remove trailing comment if any + $l = $l -replace '\s+\#.*','' # Leading whitespace is fine $l = $l -replace '^\s+','' # Trailing dot is a test group but no whitespace @@ -146,8 +155,7 @@ } else { # Otherwise it is a test name, remove leading space $test = $l - # remove trailing comment if any and create a log name - $test = $test -replace '\s+\#.*','' + # create a log name $test = "$Group$test" if($ExcludeCasesSet.Contains($test)) { @@ -253,7 +261,7 @@ $DiscoveredExe = @() dir -Path $search_path | ForEach-Object { - $DiscoveredExe += ($_.Name) + $DiscoveredExe += ($_.Name) } # Remove exclusions @@ -293,7 +301,7 @@ $ListOfExe = @() dir -Path $search_path | ForEach-Object { - $ListOfExe += ($_.Name) + $ListOfExe += ($_.Name) } # Exclude those in RunOnly from running as suites @@ -348,7 +356,7 @@ # Wait for all to finish and get the results while(($JobToLog.Count -gt 0) -or - ($TestCmds.Count -gt 0) -or + ($TestCmds.Count -gt 0) -or ($Suites.Count -gt 0)) { # Make sure we have maximum concurrent jobs running if anything @@ -468,8 +476,8 @@ $EndDate = (Get-Date) -New-TimeSpan -Start $StartDate -End $EndDate | - ForEach-Object { +New-TimeSpan -Start $StartDate -End $EndDate | + ForEach-Object { "Elapsed time: {0:g}" -f $_ } @@ -484,4 +492,4 @@ exit 0 - + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh 
mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh 2025-05-19 16:14:27.000000000 +0000 @@ -1,9 +1,9 @@ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -set -e +set -ex -ROCKSDB_VERSION="5.10.3" -ZSTD_VERSION="1.1.3" +ROCKSDB_VERSION="6.7.3" +ZSTD_VERSION="1.4.4" echo "This script configures CentOS with everything needed to build and run RocksDB" @@ -40,5 +40,6 @@ chown -R vagrant:vagrant /usr/local/rocksdb/ sudo -u vagrant make static_lib cd examples/ -sudo -u vagrant make all -sudo -u vagrant ./c_simple_example +sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ make all +sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ ./c_simple_example + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/cache.h" + +#include "cache/lru_cache.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map + lru_cache_options_type_info = { + {"capacity", + {offsetof(struct LRUCacheOptions, capacity), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"num_shard_bits", + {offsetof(struct LRUCacheOptions, num_shard_bits), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strict_capacity_limit", + {offsetof(struct LRUCacheOptions, strict_capacity_limit), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"high_pri_pool_ratio", + {offsetof(struct LRUCacheOptions, high_pri_pool_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; +#endif // ROCKSDB_LITE + +Status SecondaryCache::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + return LoadSharedObject(config_options, value, nullptr, + result); +} + +Status Cache::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + Status status; + std::shared_ptr cache; + if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { +#ifndef ROCKSDB_LITE + LRUCacheOptions cache_opts; + status = OptionTypeInfo::ParseStruct(config_options, "", + &lru_cache_options_type_info, "", + value, &cache_opts); + if (status.ok()) { + cache = NewLRUCache(cache_opts); + } +#else + (void)config_options; + status = Status::NotSupported("Cannot load cache in LITE mode ", value); +#endif //! 
ROCKSDB_LITE + } + if (status.ok()) { + result->swap(cache); + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -1,8 +1,11 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). - +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef GFLAGS #include int main() { @@ -10,272 +13,8 @@ return 1; } #else - -#include -#include -#include - -#include "port/port.h" -#include "rocksdb/cache.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "util/gflags_compat.h" -#include "util/mutexlock.h" -#include "util/random.h" - -using GFLAGS_NAMESPACE::ParseCommandLineFlags; - -static const uint32_t KB = 1024; - -DEFINE_int32(threads, 16, "Number of concurrent threads to run."); -DEFINE_int64(cache_size, 8 * KB * KB, - "Number of bytes to use as a cache of uncompressed data."); -DEFINE_int32(num_shard_bits, 4, "shard_bits."); - -DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache"); -DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); - -DEFINE_bool(populate_cache, false, "Populate cache before operations"); -DEFINE_int32(insert_percent, 40, - "Ratio of insert to total workload (expressed as a percentage)"); -DEFINE_int32(lookup_percent, 50, - "Ratio of lookup to total workload (expressed as a percentage)"); -DEFINE_int32(erase_percent, 10, - "Ratio of erase to total workload (expressed as a percentage)"); - -DEFINE_bool(use_clock_cache, false, ""); - -namespace ROCKSDB_NAMESPACE { - -class CacheBench; -namespace { -void deleter(const Slice& /*key*/, void* value) { - delete reinterpret_cast(value); -} - -// State shared by all concurrent executions of the same benchmark. 
-class SharedState { - public: - explicit SharedState(CacheBench* cache_bench) - : cv_(&mu_), - num_threads_(FLAGS_threads), - num_initialized_(0), - start_(false), - num_done_(0), - cache_bench_(cache_bench) { - } - - ~SharedState() {} - - port::Mutex* GetMutex() { - return &mu_; - } - - port::CondVar* GetCondVar() { - return &cv_; - } - - CacheBench* GetCacheBench() const { - return cache_bench_; - } - - void IncInitialized() { - num_initialized_++; - } - - void IncDone() { - num_done_++; - } - - bool AllInitialized() const { - return num_initialized_ >= num_threads_; - } - - bool AllDone() const { - return num_done_ >= num_threads_; - } - - void SetStart() { - start_ = true; - } - - bool Started() const { - return start_; - } - - private: - port::Mutex mu_; - port::CondVar cv_; - - const uint64_t num_threads_; - uint64_t num_initialized_; - bool start_; - uint64_t num_done_; - - CacheBench* cache_bench_; -}; - -// Per-thread state for concurrent executions of the same benchmark. -struct ThreadState { - uint32_t tid; - Random rnd; - SharedState* shared; - - ThreadState(uint32_t index, SharedState* _shared) - : tid(index), rnd(1000 + index), shared(_shared) {} -}; -} // namespace - -class CacheBench { - public: - CacheBench() : num_threads_(FLAGS_threads) { - if (FLAGS_use_clock_cache) { - cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); - if (!cache_) { - fprintf(stderr, "Clock cache not supported.\n"); - exit(1); - } - } else { - cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits); - } - } - - ~CacheBench() {} - - void PopulateCache() { - Random rnd(1); - for (int64_t i = 0; i < FLAGS_cache_size; i++) { - uint64_t rand_key = rnd.Next() % FLAGS_max_key; - // Cast uint64* to be char*, data would be copied to cache - Slice key(reinterpret_cast(&rand_key), 8); - // do insert - cache_->Insert(key, new char[10], 1, &deleter); - } - } - - bool Run() { - ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default(); - - PrintEnv(); - 
SharedState shared(this); - std::vector threads(num_threads_); - for (uint32_t i = 0; i < num_threads_; i++) { - threads[i] = new ThreadState(i, &shared); - env->StartThread(ThreadBody, threads[i]); - } - { - MutexLock l(shared.GetMutex()); - while (!shared.AllInitialized()) { - shared.GetCondVar()->Wait(); - } - // Record start time - uint64_t start_time = env->NowMicros(); - - // Start all threads - shared.SetStart(); - shared.GetCondVar()->SignalAll(); - - // Wait threads to complete - while (!shared.AllDone()) { - shared.GetCondVar()->Wait(); - } - - // Record end time - uint64_t end_time = env->NowMicros(); - double elapsed = static_cast(end_time - start_time) * 1e-6; - uint32_t qps = static_cast( - static_cast(FLAGS_threads * FLAGS_ops_per_thread) / elapsed); - fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps); - } - return true; - } - - private: - std::shared_ptr cache_; - uint32_t num_threads_; - - static void ThreadBody(void* v) { - ThreadState* thread = reinterpret_cast(v); - SharedState* shared = thread->shared; - - { - MutexLock l(shared->GetMutex()); - shared->IncInitialized(); - if (shared->AllInitialized()) { - shared->GetCondVar()->SignalAll(); - } - while (!shared->Started()) { - shared->GetCondVar()->Wait(); - } - } - thread->shared->GetCacheBench()->OperateCache(thread); - - { - MutexLock l(shared->GetMutex()); - shared->IncDone(); - if (shared->AllDone()) { - shared->GetCondVar()->SignalAll(); - } - } - } - - void OperateCache(ThreadState* thread) { - for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key; - // Cast uint64* to be char*, data would be copied to cache - Slice key(reinterpret_cast(&rand_key), 8); - int32_t prob_op = thread->rnd.Uniform(100); - if (prob_op >= 0 && prob_op < FLAGS_insert_percent) { - // do insert - cache_->Insert(key, new char[10], 1, &deleter); - } else if (prob_op -= FLAGS_insert_percent && - prob_op < FLAGS_lookup_percent) { - // do lookup - 
auto handle = cache_->Lookup(key); - if (handle) { - cache_->Release(handle); - } - } else if (prob_op -= FLAGS_lookup_percent && - prob_op < FLAGS_erase_percent) { - // do erase - cache_->Erase(key); - } - } - } - - void PrintEnv() const { - printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); - printf("Number of threads : %d\n", FLAGS_threads); - printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); - printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); - printf("Num shard bits : %d\n", FLAGS_num_shard_bits); - printf("Max key : %" PRIu64 "\n", FLAGS_max_key); - printf("Populate cache : %d\n", FLAGS_populate_cache); - printf("Insert percentage : %d%%\n", FLAGS_insert_percent); - printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); - printf("Erase percentage : %d%%\n", FLAGS_erase_percent); - printf("----------------------------\n"); - } -}; -} // namespace ROCKSDB_NAMESPACE - +#include "rocksdb/cache_bench_tool.h" int main(int argc, char** argv) { - ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_threads <= 0) { - fprintf(stderr, "threads number <= 0\n"); - exit(1); - } - - ROCKSDB_NAMESPACE::CacheBench bench; - if (FLAGS_populate_cache) { - bench.PopulateCache(); - } - if (bench.Run()) { - return 0; - } else { - return 1; - } + return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv); } - #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,794 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table_properties.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/cachable_entry.h" +#include "util/coding.h" +#include "util/gflags_compat.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +static constexpr uint32_t KiB = uint32_t{1} << 10; +static constexpr uint32_t MiB = KiB << 10; +static constexpr uint64_t GiB = MiB << 10; + +DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); +DEFINE_uint64(cache_size, 1 * GiB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_uint32(num_shard_bits, 6, "shard_bits."); + +DEFINE_double(resident_ratio, 0.25, + "Ratio of keys fitting in cache to keyspace."); +DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread."); +DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); + +DEFINE_uint32(skew, 5, "Degree of skew in key selection"); +DEFINE_bool(populate_cache, true, "Populate cache before operations"); + +DEFINE_uint32(lookup_insert_percent, 87, + "Ratio of lookup (+ insert on not found) to total workload " + "(expressed as a percentage)"); +DEFINE_uint32(insert_percent, 2, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_uint32(lookup_percent, 10, + "Ratio of lookup to total workload 
(expressed as a percentage)"); +DEFINE_uint32(erase_percent, 1, + "Ratio of erase to total workload (expressed as a percentage)"); +DEFINE_bool(gather_stats, false, + "Whether to periodically simulate gathering block cache stats, " + "using one more thread."); +DEFINE_uint32( + gather_stats_sleep_ms, 1000, + "How many milliseconds to sleep between each gathering of stats."); + +DEFINE_uint32(gather_stats_entries_per_lock, 256, + "For Cache::ApplyToAllEntries"); +DEFINE_bool(skewed, false, "If true, skew the key access distribution"); +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a custom secondary cache object"); +static class std::shared_ptr secondary_cache; +#endif // ROCKSDB_LITE + +DEFINE_bool(use_clock_cache, false, ""); + +// ## BEGIN stress_cache_key sub-tool options ## +DEFINE_bool(stress_cache_key, false, + "If true, run cache key stress test instead"); +DEFINE_uint32(sck_files_per_day, 2500000, + "(-stress_cache_key) Simulated files generated per day"); +DEFINE_uint32(sck_duration, 90, + "(-stress_cache_key) Number of days to simulate in each run"); +DEFINE_uint32( + sck_min_collision, 15, + "(-stress_cache_key) Keep running until this many collisions seen"); +DEFINE_uint32( + sck_file_size_mb, 32, + "(-stress_cache_key) Simulated file size in MiB, for accounting purposes"); +DEFINE_uint32(sck_reopen_nfiles, 100, + "(-stress_cache_key) Re-opens DB average every n files"); +DEFINE_uint32( + sck_restarts_per_day, 24, + "(-stress_cache_key) Simulated process restarts per day (across DBs)"); +DEFINE_uint32(sck_db_count, 100, + "(-stress_cache_key) Parallel DBs in operation"); +DEFINE_uint32(sck_table_bits, 20, + "(-stress_cache_key) Log2 number of tracked files"); +DEFINE_uint32(sck_keep_bits, 50, + "(-stress_cache_key) Number of cache key bits to keep"); +DEFINE_bool(sck_randomize, false, + "(-stress_cache_key) Randomize (hash) cache key"); +DEFINE_bool(sck_footer_unique_id, false, + "(-stress_cache_key) Simulate 
using proposed footer unique id"); +// ## END stress_cache_key sub-tool options ## + +namespace ROCKSDB_NAMESPACE { + +class CacheBench; +namespace { +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) {} + + ~SharedState() {} + + port::Mutex* GetMutex() { return &mu_; } + + port::CondVar* GetCondVar() { return &cv_; } + + CacheBench* GetCacheBench() const { return cache_bench_; } + + void IncInitialized() { num_initialized_++; } + + void IncDone() { num_done_++; } + + bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; } + + bool AllDone() const { return num_done_ >= FLAGS_threads; } + + void SetStart() { start_ = true; } + + bool Started() const { return start_; } + + private: + port::Mutex mu_; + port::CondVar cv_; + + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. 
+struct ThreadState { + uint32_t tid; + Random64 rnd; + SharedState* shared; + HistogramImpl latency_ns_hist; + uint64_t duration_us = 0; + + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rnd(1000 + index), shared(_shared) {} +}; + +struct KeyGen { + char key_data[27]; + + Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) { + uint64_t key = 0; + if (!FLAGS_skewed) { + uint64_t raw = rnd.Next(); + // Skew according to setting + for (uint32_t i = 0; i < FLAGS_skew; ++i) { + raw = std::min(raw, rnd.Next()); + } + key = FastRange64(raw, max_key); + } else { + key = rnd.Skewed(max_log); + if (key > max_key) { + key -= max_key; + } + } + // Variable size and alignment + size_t off = key % 8; + key_data[0] = char{42}; + EncodeFixed64(key_data + 1, key); + key_data[9] = char{11}; + EncodeFixed64(key_data + 10, key); + key_data[18] = char{4}; + EncodeFixed64(key_data + 19, key); + return Slice(&key_data[off], sizeof(key_data) - off); + } +}; + +char* createValue(Random64& rnd) { + char* rv = new char[FLAGS_value_bytes]; + // Fill with some filler data, and take some CPU time + for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { + EncodeFixed64(rv + i, rnd.Next()); + } + return rv; +} + +// Callbacks for secondary cache +size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; } + +Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) { + memcpy(out, obj, size); + return Status::OK(); +} + +// Different deleters to simulate using deleter to gather +// stats on the code origin and kind of cache entries. 
+void deleter1(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter2(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter3(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} + +Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1); +Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2); +Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3); +} // namespace + +class CacheBench { + static constexpr uint64_t kHundredthUint64 = + std::numeric_limits::max() / 100U; + + public: + CacheBench() + : max_key_(static_cast(FLAGS_cache_size / FLAGS_resident_ratio / + FLAGS_value_bytes)), + lookup_insert_threshold_(kHundredthUint64 * + FLAGS_lookup_insert_percent), + insert_threshold_(lookup_insert_threshold_ + + kHundredthUint64 * FLAGS_insert_percent), + lookup_threshold_(insert_threshold_ + + kHundredthUint64 * FLAGS_lookup_percent), + erase_threshold_(lookup_threshold_ + + kHundredthUint64 * FLAGS_erase_percent), + skewed_(FLAGS_skewed) { + if (erase_threshold_ != 100U * kHundredthUint64) { + fprintf(stderr, "Percentages must add to 100.\n"); + exit(1); + } + + max_log_ = 0; + if (skewed_) { + uint64_t max_key = max_key_; + while (max_key >>= 1) max_log_++; + if (max_key > (static_cast(1) << max_log_)) max_log_++; + } + + if (FLAGS_use_clock_cache) { + cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); + if (!cache_) { + fprintf(stderr, "Clock cache not supported.\n"); + exit(1); + } + } else { + LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false, 0.5); +#ifndef ROCKSDB_LITE + if (!FLAGS_secondary_cache_uri.empty()) { + Status s = SecondaryCache::CreateFromString( + ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf( + stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + opts.secondary_cache = 
secondary_cache; + } +#endif // ROCKSDB_LITE + + cache_ = NewLRUCache(opts); + } + } + + ~CacheBench() {} + + void PopulateCache() { + Random64 rnd(1); + KeyGen keygen; + for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { + cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), createValue(rnd), + &helper1, FLAGS_value_bytes); + } + } + + bool Run() { + const auto clock = SystemClock::Default().get(); + + PrintEnv(); + SharedState shared(this); + std::vector > threads(FLAGS_threads); + for (uint32_t i = 0; i < FLAGS_threads; i++) { + threads[i].reset(new ThreadState(i, &shared)); + std::thread(ThreadBody, threads[i].get()).detach(); + } + + HistogramImpl stats_hist; + std::string stats_report; + std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report); + + uint64_t start_time; + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + start_time = clock->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + } + + // Stats gathering is considered background work. This time measurement + // is for foreground work, and not really ideal for that. See below. + uint64_t end_time = clock->NowMicros(); + stats_thread.join(); + + // Wall clock time - includes idle time if threads + // finish at different times (not ideal). 
+ double elapsed_secs = static_cast(end_time - start_time) * 1e-6; + uint32_t ops_per_sec = static_cast( + 1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs); + printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs, + ops_per_sec); + + // Total time in each thread (more accurate throughput measure) + elapsed_secs = 0; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + elapsed_secs += threads[i]->duration_us * 1e-6; + } + ops_per_sec = static_cast(1.0 * FLAGS_threads * + FLAGS_ops_per_thread / elapsed_secs); + printf("Thread ops/sec = %u\n", ops_per_sec); + + printf("\nOperation latency (ns):\n"); + HistogramImpl combined; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + combined.Merge(threads[i]->latency_ns_hist); + } + printf("%s", combined.ToString().c_str()); + + if (FLAGS_gather_stats) { + printf("\nGather stats latency (us):\n"); + printf("%s", stats_hist.ToString().c_str()); + } + + printf("\n%s", stats_report.c_str()); + + return true; + } + + private: + std::shared_ptr cache_; + const uint64_t max_key_; + // Cumulative thresholds in the space of a random uint64_t + const uint64_t lookup_insert_threshold_; + const uint64_t insert_threshold_; + const uint64_t lookup_threshold_; + const uint64_t erase_threshold_; + const bool skewed_; + int max_log_; + + // A benchmark version of gathering stats on an active block cache by + // iterating over it. The primary purpose is to measure the impact of + // gathering stats with ApplyToAllEntries on throughput- and + // latency-sensitive Cache users. Performance of stats gathering is + // also reported. The last set of gathered stats is also reported, for + // manual sanity checking for logical errors or other unexpected + // behavior of cache_bench or the underlying Cache. 
+ static void StatsBody(SharedState* shared, HistogramImpl* stats_hist, + std::string* stats_report) { + if (!FLAGS_gather_stats) { + return; + } + const auto clock = SystemClock::Default().get(); + uint64_t total_key_size = 0; + uint64_t total_charge = 0; + uint64_t total_entry_count = 0; + std::set deleters; + StopWatchNano timer(clock); + + for (;;) { + uint64_t time; + time = clock->NowMicros(); + uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000; + + { + MutexLock l(shared->GetMutex()); + for (;;) { + if (shared->AllDone()) { + std::ostringstream ostr; + ostr << "Most recent cache entry stats:\n" + << "Number of entries: " << total_entry_count << "\n" + << "Total charge: " << BytesToHumanString(total_charge) << "\n" + << "Average key size: " + << (1.0 * total_key_size / total_entry_count) << "\n" + << "Average charge: " + << BytesToHumanString(static_cast( + 1.0 * total_charge / total_entry_count)) + << "\n" + << "Unique deleters: " << deleters.size() << "\n"; + *stats_report = ostr.str(); + return; + } + if (clock->NowMicros() >= deadline) { + break; + } + uint64_t diff = deadline - std::min(clock->NowMicros(), deadline); + shared->GetCondVar()->TimedWait(diff + 1); + } + } + + // Now gather stats, outside of mutex + total_key_size = 0; + total_charge = 0; + total_entry_count = 0; + deleters.clear(); + auto fn = [&](const Slice& key, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + total_key_size += key.size(); + total_charge += charge; + ++total_entry_count; + // Something slightly more expensive as in (future) stats by category + deleters.insert(deleter); + }; + timer.Start(); + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; + shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); + stats_hist->Add(timer.ElapsedNanos() / 1000); + } + } + + static void ThreadBody(ThreadState* thread) { + SharedState* shared = thread->shared; + + { + MutexLock 
l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + // To use looked-up values + uint64_t result = 0; + // To hold handles for a non-trivial amount of time + Cache::Handle* handle = nullptr; + KeyGen gen; + const auto clock = SystemClock::Default().get(); + uint64_t start_time = clock->NowMicros(); + StopWatchNano timer(clock); + + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + timer.Start(); + Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); + uint64_t random_op = thread->rnd.Next(); + Cache::CreateCallback create_cb = + [](void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + *out_obj = reinterpret_cast(new char[size]); + memcpy(*out_obj, buf, size); + *charge = size; + return Status::OK(); + }; + + if (random_op < lookup_insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } else { + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper2, + FLAGS_value_bytes, &handle); + } + } else if (random_op < insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper3, + FLAGS_value_bytes, &handle); + } else if (random_op < lookup_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, 
Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } + } else if (random_op < erase_threshold_) { + // do erase + cache_->Erase(key); + } else { + // Should be extremely unlikely (noop) + assert(random_op >= kHundredthUint64 * 100U); + } + thread->latency_ns_hist.Add(timer.ElapsedNanos()); + } + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // Ensure computations on `result` are not optimized away. + if (result == 1) { + printf("You are extremely unlucky(2). Try again.\n"); + exit(1); + } + thread->duration_us = clock->NowMicros() - start_time; + } + + void PrintEnv() const { + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Number of threads : %u\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %s\n", + BytesToHumanString(FLAGS_cache_size).c_str()); + printf("Num shard bits : %u\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", max_key_); + printf("Resident ratio : %g\n", FLAGS_resident_ratio); + printf("Skew degree : %u\n", FLAGS_skew); + printf("Populate cache : %d\n", int{FLAGS_populate_cache}); + printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); + printf("Insert percentage : %u%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %u%%\n", FLAGS_erase_percent); + std::ostringstream stats; + if (FLAGS_gather_stats) { + stats << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, " + << FLAGS_gather_stats_entries_per_lock << "/lock)"; + } else { + stats << "disabled"; + } + printf("Gather stats : %s\n", stats.str().c_str()); + printf("----------------------------\n"); + } +}; + +// TODO: better description (see PR #9126 for some info) +class StressCacheKey { + public: + void Run() { + if (FLAGS_sck_footer_unique_id) { + FLAGS_sck_db_count = 1; + } + + 
uint64_t mb_per_day = + uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb; + printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n", + FLAGS_sck_file_size_mb / 1024.0 / 1024.0 * + std::pow(2.0, FLAGS_sck_table_bits), + mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0); + multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) / + (FLAGS_sck_file_size_mb * 1024.0 * 1024.0); + printf( + "Multiply by %g to correct for simulation losses (but still assume " + "whole file cached)\n", + multiplier_); + restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day; + double without_ejection = + std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day; + printf( + "Without ejection, expect random collision after %g days (%g " + "corrected)\n", + without_ejection, without_ejection * multiplier_); + double with_full_table = + std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) / + FLAGS_sck_files_per_day; + printf( + "With ejection and full table, expect random collision after %g " + "days (%g corrected)\n", + with_full_table, with_full_table * multiplier_); + collisions_ = 0; + + for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) { + RunOnce(); + if (collisions_ == 0) { + printf( + "No collisions after %d x %u days " + " \n", + i, FLAGS_sck_duration); + } else { + double est = 1.0 * i * FLAGS_sck_duration / collisions_; + printf("%" PRIu64 + " collisions after %d x %u days, est %g days between (%g " + "corrected) \n", + collisions_, i, FLAGS_sck_duration, est, est * multiplier_); + } + } + } + + void RunOnce() { + const size_t db_count = FLAGS_sck_db_count; + dbs_.reset(new TableProperties[db_count]{}); + const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1; + table_.reset(new uint64_t[table_mask + 1]{}); + if (FLAGS_sck_keep_bits > 64) { + FLAGS_sck_keep_bits = 64; + } + uint32_t shift_away = 64 - FLAGS_sck_keep_bits; + uint32_t shift_away_b = shift_away / 3; + uint32_t shift_away_a = shift_away - 
shift_away_b; + + process_count_ = 0; + session_count_ = 0; + ResetProcess(); + + Random64 r{std::random_device{}()}; + + uint64_t max_file_count = + uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_duration; + uint64_t file_count = 0; + uint32_t report_count = 0; + uint32_t collisions_this_run = 0; + // Round robin through DBs + for (size_t db_i = 0;; ++db_i) { + if (db_i >= db_count) { + db_i = 0; + } + if (file_count >= max_file_count) { + break; + } + if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) { + ResetSession(db_i); + } else if (r.OneIn(restart_nfiles_)) { + ResetProcess(); + } + OffsetableCacheKey ock; + dbs_[db_i].orig_file_number += 1; + // skip some file numbers, unless 1 DB so that that can simulate + // better (DB-independent) unique IDs + if (db_count > 1) { + dbs_[db_i].orig_file_number += (r.Next() & 3); + } + BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], "", 42, 42, &ock); + CacheKey ck = ock.WithOffset(0); + uint64_t stripped; + if (FLAGS_sck_randomize) { + stripped = GetSliceHash64(ck.AsSlice()) >> shift_away; + } else if (FLAGS_sck_footer_unique_id) { + uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a; + uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b; + stripped = (uint64_t{a} << 32) + b; + } else { + uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a; + uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b; + stripped = (uint64_t{a} << 32) + b; + } + if (stripped == 0) { + // Unlikely, but we need to exclude tracking this value + printf("Hit Zero! 
\n"); + continue; + } + file_count++; + uint64_t h = NPHash64(reinterpret_cast(&stripped), 8); + // Skew lifetimes + size_t pos = + std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask); + if (table_[pos] == stripped) { + collisions_this_run++; + // To predict probability of no collisions, we have to get rid of + // correlated collisions, which this takes care of: + ResetProcess(); + } else { + // Replace + table_[pos] = stripped; + } + + if (++report_count == FLAGS_sck_files_per_day) { + report_count = 0; + // Estimate fill % + size_t incr = table_mask / 1000; + size_t sampled_count = 0; + for (size_t i = 0; i <= table_mask; i += incr) { + if (table_[i] != 0) { + sampled_count++; + } + } + // Report + printf( + "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 + " sess, %u coll, occ %g%%, ejected %g%% \r", + file_count / FLAGS_sck_files_per_day, process_count_, + session_count_, collisions_this_run, 100.0 * sampled_count / 1000.0, + 100.0 * (1.0 - sampled_count / 1000.0 * table_mask / file_count)); + fflush(stdout); + } + } + collisions_ += collisions_this_run; + } + + void ResetSession(size_t i) { + dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr); + session_count_++; + } + + void ResetProcess() { + process_count_++; + DBImpl::TEST_ResetDbSessionIdGen(); + for (size_t i = 0; i < FLAGS_sck_db_count; ++i) { + ResetSession(i); + } + if (FLAGS_sck_footer_unique_id) { + dbs_[0].orig_file_number = 0; + } + } + + private: + // Use db_session_id and orig_file_number from TableProperties + std::unique_ptr dbs_; + std::unique_ptr table_; + uint64_t process_count_ = 0; + uint64_t session_count_ = 0; + uint64_t collisions_ = 0; + uint32_t restart_nfiles_ = 0; + double multiplier_ = 0.0; +}; + +int cache_bench_tool(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_stress_cache_key) { + // Alternate tool + StressCacheKey().Run(); + return 0; + } + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + 
exit(1); + } + + ROCKSDB_NAMESPACE::CacheBench bench; + if (FLAGS_populate_cache) { + bench.PopulateCache(); + printf("Population complete\n"); + printf("----------------------------\n"); + } + if (bench.Run()) { + return 0; + } else { + return 1; + } +} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,70 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/cache_entry_roles.h" + +#include + +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { + +std::array kCacheEntryRoleToCamelString{{ + "DataBlock", + "FilterBlock", + "FilterMetaBlock", + "DeprecatedFilterBlock", + "IndexBlock", + "OtherBlock", + "WriteBuffer", + "CompressionDictionaryBuildingBuffer", + "FilterConstruction", + "Misc", +}}; + +std::array kCacheEntryRoleToHyphenString{{ + "data-block", + "filter-block", + "filter-meta-block", + "deprecated-filter-block", + "index-block", + "other-block", + "write-buffer", + "compression-dictionary-building-buffer", + "filter-construction", + "misc", +}}; + +namespace { + +struct Registry { + std::mutex mutex; + std::unordered_map role_map; + void Register(Cache::DeleterFn fn, CacheEntryRole role) { + std::lock_guard lock(mutex); + role_map[fn] = role; + } + std::unordered_map Copy() { + std::lock_guard lock(mutex); + return role_map; + } +}; + +Registry& GetRegistry() { + STATIC_AVOID_DESTRUCTION(Registry, registry); + return 
registry; +} + +} // namespace + +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) { + GetRegistry().Register(fn, role); +} + +std::unordered_map CopyCacheDeleterRoleMap() { + return GetRegistry().Copy(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,134 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/cache.h" + +namespace ROCKSDB_NAMESPACE { + +// Classifications of block cache entries, for reporting statistics +// Adding new enum to this class requires corresponding updates to +// kCacheEntryRoleToCamelString and kCacheEntryRoleToHyphenString +enum class CacheEntryRole { + // Block-based table data block + kDataBlock, + // Block-based table filter block (full or partitioned) + kFilterBlock, + // Block-based table metadata block for partitioned filter + kFilterMetaBlock, + // Block-based table deprecated filter block (old "block-based" filter) + kDeprecatedFilterBlock, + // Block-based table index block + kIndexBlock, + // Other kinds of block-based table block + kOtherBlock, + // WriteBufferManager reservations to account for memtable usage + kWriteBuffer, + // BlockBasedTableBuilder reservations to account for + // compression dictionary building buffer's memory usage + kCompressionDictionaryBuildingBuffer, + // Filter reservations to account for + // (new) bloom and ribbon 
filter construction's memory usage + kFilterConstruction, + // Default bucket, for miscellaneous cache entries. Do not use for + // entries that could potentially add up to large usage. + kMisc, +}; +constexpr uint32_t kNumCacheEntryRoles = + static_cast(CacheEntryRole::kMisc) + 1; + +extern std::array + kCacheEntryRoleToCamelString; +extern std::array + kCacheEntryRoleToHyphenString; + +// To associate cache entries with their role, we use a hack on the +// existing Cache interface. Because the deleter of an entry can authenticate +// the code origin of an entry, we can elaborate the choice of deleter to +// also encode role information, without inferring false role information +// from entries not choosing to encode a role. +// +// The rest of this file is for handling mappings between deleters and +// roles. + +// To infer a role from a deleter, the deleter must be registered. This +// can be done "manually" with this function. This function is thread-safe, +// and the registration mappings go into private but static storage. (Note +// that DeleterFn is a function pointer, not std::function. Registrations +// should not be too many.) +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role); + +// Gets a copy of the registered deleter -> role mappings. This is the only +// function for reading the mappings made with RegisterCacheDeleterRole. +// Why only this interface for reading? +// * This function has to be thread safe, which could incur substantial +// overhead. We should not pay this overhead for every deleter look-up. +// * This is suitable for preparing for batch operations, like with +// CacheEntryStatsCollector. +// * The number of mappings should be sufficiently small (dozens). +std::unordered_map CopyCacheDeleterRoleMap(); + +// ************************************************************** // +// An automatic registration infrastructure. 
This enables code +// to simply ask for a deleter associated with a particular type +// and role, and registration is automatic. In a sense, this is +// a small dependency injection infrastructure, because linking +// in new deleter instantiations is essentially sufficient for +// making stats collection (using CopyCacheDeleterRoleMap) aware +// of them. + +namespace cache_entry_roles_detail { + +template +struct RegisteredDeleter { + RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); } + + // These have global linkage to help ensure compiler optimizations do not + // break uniqueness for each + static void Delete(const Slice& /* key */, void* value) { + // Supports T == Something[], unlike delete operator + std::default_delete()( + static_cast::type*>(value)); + } +}; + +template +struct RegisteredNoopDeleter { + RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); } + + static void Delete(const Slice& /* key */, void* /* value */) { + // Here was `assert(value == nullptr);` but we can also put pointers + // to static data in Cache, for testing at least. + } +}; + +} // namespace cache_entry_roles_detail + +// Get an automatically registered deleter for value type T and role R. +// Based on C++ semantics, registration is invoked exactly once in a +// thread-safe way on first call to this function, for each . +template +Cache::DeleterFn GetCacheEntryDeleterForRole() { + static cache_entry_roles_detail::RegisteredDeleter reg; + return reg.Delete; +} + +// Get an automatically registered no-op deleter (value should be nullptr) +// and associated with role R. This is used for Cache "reservation" entries +// such as for WriteBufferManager. 
+template +Cache::DeleterFn GetNoopDeleterForRole() { + static cache_entry_roles_detail::RegisteredNoopDeleter reg; + return reg.Delete; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_stats.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,183 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "cache/cache_helpers.h" +#include "cache/cache_key.h" +#include "port/lang.h" +#include "rocksdb/cache.h" +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding_lean.h" + +namespace ROCKSDB_NAMESPACE { + +// A generic helper object for gathering stats about cache entries by +// iterating over them with ApplyToAllEntries. This class essentially +// solves the problem of slowing down a Cache with too many stats +// collectors that could be sharing stat results, such as from multiple +// column families or multiple DBs sharing a Cache. We employ a few +// mitigations: +// * Only one collector for a particular kind of Stats is alive +// for each Cache. This is guaranteed using the Cache itself to hold +// the collector. +// * A mutex ensures only one thread is gathering stats for this +// collector. +// * The most recent gathered stats are saved and simply copied to +// satisfy requests within a time window (default: 3 minutes) of +// completion of the most recent stat gathering. 
+// +// Template parameter Stats must be copyable and trivially constructable, +// as well as... +// concept Stats { +// // Notification before applying callback to all entries +// void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); +// // Get the callback to apply to all entries. `callback` +// // type must be compatible with Cache::ApplyToAllEntries +// callback GetEntryCallback(); +// // Notification after applying callback to all entries +// void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); +// // Notification that a collection was skipped because of +// // sufficiently recent saved results. +// void SkippedCollection(); +// } +template +class CacheEntryStatsCollector { + public: + // Gather and save stats if saved stats are too old. (Use GetStats() to + // read saved stats.) + // + // Maximum allowed age for a "hit" on saved results is determined by the + // two interval parameters. Both set to 0 forces a re-scan. For example + // with min_interval_seconds=300 and min_interval_factor=100, if the last + // scan took 10s, we would only rescan ("miss") if the age in seconds of + // the saved results is > max(300, 100*10). + // Justification: scans can vary wildly in duration, e.g. from 0.02 sec + // to as much as 20 seconds, so we want to be able to cap the absolute + // and relative frequency of scans. 
+ void CollectStats(int min_interval_seconds, int min_interval_factor) { + // Waits for any pending reader or writer (collector) + std::lock_guard lock(working_mutex_); + + uint64_t max_age_micros = + static_cast(std::max(min_interval_seconds, 0)) * 1000000U; + + if (last_end_time_micros_ > last_start_time_micros_ && + min_interval_factor > 0) { + max_age_micros = std::max( + max_age_micros, min_interval_factor * (last_end_time_micros_ - + last_start_time_micros_)); + } + + uint64_t start_time_micros = clock_->NowMicros(); + if ((start_time_micros - last_end_time_micros_) > max_age_micros) { + last_start_time_micros_ = start_time_micros; + working_stats_.BeginCollection(cache_, clock_, start_time_micros); + + cache_->ApplyToAllEntries(working_stats_.GetEntryCallback(), {}); + TEST_SYNC_POINT_CALLBACK( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr); + + uint64_t end_time_micros = clock_->NowMicros(); + last_end_time_micros_ = end_time_micros; + working_stats_.EndCollection(cache_, clock_, end_time_micros); + } else { + working_stats_.SkippedCollection(); + } + + // Save so that we don't need to wait for an outstanding collection in + // order to make of copy of the last saved stats + std::lock_guard lock2(saved_mutex_); + saved_stats_ = working_stats_; + } + + // Gets saved stats, regardless of age + void GetStats(Stats *stats) { + std::lock_guard lock(saved_mutex_); + *stats = saved_stats_; + } + + Cache *GetCache() const { return cache_; } + + // Gets or creates a shared instance of CacheEntryStatsCollector in the + // cache itself, and saves into `ptr`. This shared_ptr will hold the + // entry in cache until all refs are destroyed. + static Status GetShared(Cache *cache, SystemClock *clock, + std::shared_ptr *ptr) { + const Slice &cache_key = GetCacheKey(); + + Cache::Handle *h = cache->Lookup(cache_key); + if (h == nullptr) { + // Not yet in cache, but Cache doesn't provide a built-in way to + // avoid racing insert. 
So we double-check under a shared mutex, + // inspired by TableCache. + STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex); + std::lock_guard lock(static_mutex); + + h = cache->Lookup(cache_key); + if (h == nullptr) { + auto new_ptr = new CacheEntryStatsCollector(cache, clock); + // TODO: non-zero charge causes some tests that count block cache + // usage to go flaky. Fix the problem somehow so we can use an + // accurate charge. + size_t charge = 0; + Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h, + Cache::Priority::HIGH); + if (!s.ok()) { + assert(h == nullptr); + delete new_ptr; + return s; + } + } + } + // If we reach here, shared entry is in cache with handle `h`. + assert(cache->GetDeleter(h) == Deleter); + + // Build an aliasing shared_ptr that keeps `ptr` in cache while there + // are references. + *ptr = MakeSharedCacheHandleGuard(cache, h); + return Status::OK(); + } + + private: + explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock) + : saved_stats_(), + working_stats_(), + last_start_time_micros_(0), + last_end_time_micros_(/*pessimistic*/ 10000000), + cache_(cache), + clock_(clock) {} + + static void Deleter(const Slice &, void *value) { + delete static_cast(value); + } + + static const Slice &GetCacheKey() { + // For each template instantiation + static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime(); + static Slice ckey_slice = ckey.AsSlice(); + return ckey_slice; + } + + std::mutex saved_mutex_; + Stats saved_stats_; + + std::mutex working_mutex_; + Stats working_stats_; + uint64_t last_start_time_micros_; + uint64_t last_end_time_micros_; + + Cache *const cache_; + SystemClock *const clock_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_helpers.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_helpers.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_helpers.h 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_helpers.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,125 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the cached value given a cache handle. +template +T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) { + assert(cache); + assert(handle); + + return static_cast(cache->Value(handle)); +} + +// Simple generic deleter for Cache (to be used with Cache::Insert). +template +void DeleteCacheEntry(const Slice& /* key */, void* value) { + delete static_cast(value); +} + +// Turns a T* into a Slice so it can be used as a key with Cache. +template +Slice GetSlice(const T* t) { + return Slice(reinterpret_cast(t), sizeof(T)); +} + +// Generic resource management object for cache handles that releases the handle +// when destroyed. Has unique ownership of the handle, so copying it is not +// allowed, while moving it transfers ownership. 
+template +class CacheHandleGuard { + public: + CacheHandleGuard() = default; + + CacheHandleGuard(Cache* cache, Cache::Handle* handle) + : cache_(cache), + handle_(handle), + value_(GetFromCacheHandle(cache, handle)) { + assert(cache_ && handle_ && value_); + } + + CacheHandleGuard(const CacheHandleGuard&) = delete; + CacheHandleGuard& operator=(const CacheHandleGuard&) = delete; + + CacheHandleGuard(CacheHandleGuard&& rhs) noexcept + : cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) { + assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_)); + + rhs.ResetFields(); + } + + CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept { + if (this == &rhs) { + return *this; + } + + ReleaseHandle(); + + cache_ = rhs.cache_; + handle_ = rhs.handle_; + value_ = rhs.value_; + + assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_)); + + rhs.ResetFields(); + + return *this; + } + + ~CacheHandleGuard() { ReleaseHandle(); } + + bool IsEmpty() const { return !handle_; } + + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return handle_; } + T* GetValue() const { return value_; } + + void Reset() { + ReleaseHandle(); + ResetFields(); + } + + private: + void ReleaseHandle() { + if (IsEmpty()) { + return; + } + + assert(cache_); + cache_->Release(handle_); + } + + void ResetFields() { + cache_ = nullptr; + handle_ = nullptr; + value_ = nullptr; + } + + private: + Cache* cache_ = nullptr; + Cache::Handle* handle_ = nullptr; + T* value_ = nullptr; +}; + +// Build an aliasing shared_ptr that keeps `handle` in cache while there +// are references, but the pointer is to the value for that cache entry, +// which must be of type T. This is copyable, unlike CacheHandleGuard, but +// does not provide access to caching details. 
+template +std::shared_ptr MakeSharedCacheHandleGuard(Cache* cache, + Cache::Handle* handle) { + auto wrapper = std::make_shared>(cache, handle); + return std::shared_ptr(wrapper, static_cast(cache->Value(handle))); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,271 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/cache_key.h" + +#include +#include + +#include "rocksdb/cache.h" +#include "table/unique_id_impl.h" +#include "util/hash.h" +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { + +// Value space plan for CacheKey: +// +// session_etc64_ | offset_etc64_ | Only generated by +// ---------------+---------------+------------------------------------------ +// 0 | 0 | Reserved for "empty" CacheKey() +// 0 | > 0, < 1<<63 | CreateUniqueForCacheLifetime +// 0 | >= 1<<63 | CreateUniqueForProcessLifetime +// > 0 | any | OffsetableCacheKey.WithOffset + +CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) { + // +1 so that we can reserve all zeros for "unset" cache key + uint64_t id = cache->NewId() + 1; + // Ensure we don't collide with CreateUniqueForProcessLifetime + assert((id >> 63) == 0U); + return CacheKey(0, id); +} + +CacheKey CacheKey::CreateUniqueForProcessLifetime() { + // To avoid colliding with CreateUniqueForCacheLifetime, assuming + // Cache::NewId counts up from zero, here we count down from UINT64_MAX. 
+ // If this ever becomes a point of contention, we could use CoreLocalArray. + static std::atomic counter{UINT64_MAX}; + uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed); + // Ensure we don't collide with CreateUniqueForCacheLifetime + assert((id >> 63) == 1U); + return CacheKey(0, id); +} + +// Value plan for CacheKeys from OffsetableCacheKey, assuming that +// db_session_ids are generated from a base_session_id and +// session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId +// in DBImpl::GenerateDbSessionId): +// +// Conceptual inputs: +// db_id (unstructured, from GenerateRawUniqueId or equiv) +// * could be shared between cloned DBs but rare +// * could be constant, if session id suffices +// base_session_id (unstructured, from GenerateRawUniqueId) +// session_id_counter (structured) +// * usually much smaller than 2**24 +// file_number (structured) +// * usually smaller than 2**24 +// offset_in_file (structured, might skip lots of values) +// * usually smaller than 2**32 +// max_offset determines placement of file_number to prevent +// overlapping with offset +// +// Outputs come from bitwise-xor of the constituent pieces, low bits on left: +// +// |------------------------- session_etc64 -------------------------| +// | +++++++++++++++ base_session_id (lower 64 bits) +++++++++++++++ | +// |-----------------------------------------------------------------| +// | session_id_counter ...| | +// |-----------------------------------------------------------------| +// | | ... file_number | +// | | overflow & meta | +// |-----------------------------------------------------------------| +// +// +// |------------------------- offset_etc64 --------------------------| +// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ | +// | * base_session_id (upper ~39 bits) | +// | * db_id (~122 bits entropy) | +// |-----------------------------------------------------------------| +// | offset_in_file ............... 
| | +// |-----------------------------------------------------------------| +// | | file_number, 0-3 | +// | | lower bytes | +// |-----------------------------------------------------------------| +// +// Based on max_offset, a maximal number of bytes 0..3 is chosen for +// including from lower bits of file_number in offset_etc64. The choice +// is encoded in two bits of metadata going into session_etc64, though +// the common case of 3 bytes is encoded as 0 so that session_etc64 +// is unmodified by file_number concerns in the common case. +// +// There is nothing preventing "file number overflow & meta" from meeting +// and overlapping with session_id_counter, but reaching such a case requires +// an intractable combination of large file offsets (thus at least some large +// files), large file numbers (thus large number of files generated), and +// large number of session IDs generated in a single process. A trillion each +// (2**40) of session ids, offsets, and file numbers comes to 120 bits. +// With two bits of metadata and byte granularity, this is on the verge of +// overlap, but even in the overlap case, it doesn't seem likely that +// a file from billions of files or session ids ago will still be live +// or cached. +// +// In fact, if our SST files are all < 4TB (see +// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated +// in a single process are guaranteed to have unique cache keys, unless/until +// number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in +// a single process and 64 trillion files generated. Even at that point, to +// see a collision we would need a miraculous re-synchronization of session +// id and file number, along with a live file or stale cache entry from +// trillions of files ago. +// +// How https://github.com/pdillinger/unique_id applies here: +// Every bit of output always includes "unstructured" uniqueness bits and +// often combines with "structured" uniqueness bits. 
The "unstructured" bits +// change infrequently: only when we cannot guarantee our state tracking for +// "structured" uniqueness hasn't been cloned. Using a static +// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an +// "all new" session id when a new process uses RocksDB. (Between processes, +// we don't know if a DB or other persistent storage has been cloned.) Within +// a process, only the session_lower of the db_session_id changes +// incrementally ("structured" uniqueness). +// +// This basically means that our offsets, counters and file numbers allow us +// to do somewhat "better than random" (birthday paradox) while in the +// degenerate case of completely new session for each tiny file, we still +// have strong uniqueness properties from the birthday paradox, with ~103 +// bit session IDs or up to 128 bits entropy with different DB IDs sharing a +// cache. +// +// More collision probability analysis: +// Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD) +// with average process/session lifetime of (pessimistically) 4 minutes. +// In 180 days (generous allowable data lifespan), we generate 31 million GB +// of data, or 2^55 bytes, and 2^16 "all new" session IDs. +// +// First, suppose this is in a single DB (lifetime 180 days): +// 128 bits cache key size +// - 55 <- ideal size for byte offsets + file numbers +// - 2 <- bits for offsets and file numbers not exactly powers of two +// - 2 <- bits for file number encoding metadata +// + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey +// ---- +// 71 <- bits remaining for distinguishing session IDs +// The probability of a collision in 71 bits of session ID data is less than +// 1 in 2**(71 - (2 * 16)), or roughly 1 in a trillion. 
And this assumes all +// data from the last 180 days is in cache for potential collision, and that +// cache keys under each session id exhaustively cover the remaining 57 bits +// while in reality they'll only cover a small fraction of it. +// +// Although data could be transferred between hosts, each host has its own +// cache and we are already assuming a high rate of "all new" session ids. +// So this doesn't really change the collision calculation. Across a fleet +// of 1 million, each with <1 in a trillion collision possibility, +// fleetwide collision probability is <1 in a million. +// +// Now suppose we have many DBs per host, say 2**10, with same host-wide write +// rate and process/session lifetime. File numbers will be ~10 bits smaller +// and we will have 2**10 times as many session IDs because of simultaneous +// lifetimes. So now collision chance is less than 1 in 2**(81 - (2 * 26)), +// or roughly 1 in a billion. +// +// Suppose instead we generated random or hashed cache keys for each +// (compressed) block. For 1KB compressed block size, that is 2^45 cache keys +// in 180 days. Collision probability is more easily estimated at roughly +// 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all +// data from the last 180 days is in cache, but NOT the other assumption +// for the 1 in a trillion estimate above). +// +// Conclusion: Burning through session IDs, particularly "all new" IDs that +// only arise when a new process is started, is the only way to have a +// plausible chance of cache key collision. When processes live for hours +// or days, the chance of a cache key collision seems more plausibly due +// to bad hardware than to bad luck in random session ID data. 
+// +OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, + uint64_t max_offset) { +#ifndef NDEBUG + max_offset_ = max_offset; +#endif + // Closely related to GetSstInternalUniqueId, but only need 128 bits and + // need to include an offset within the file. + // See also https://github.com/pdillinger/unique_id for background. + uint64_t session_upper = 0; // Assignment to appease clang-analyze + uint64_t session_lower = 0; // Assignment to appease clang-analyze + { + Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); + if (!s.ok()) { + // A reasonable fallback in case malformed + Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper, + &session_lower); + } + } + + // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) + // for more global uniqueness entropy. + // (It is possible that many DBs descended from one common DB id are copied + // around and proliferate, in which case session id is critical, but it is + // more common for different DBs to have different DB ids.) + uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper); + + // This establishes the db+session id part of the cache key. + // + // Exactly preserve (in common cases; see modifiers below) session lower to + // ensure that session ids generated during the same process lifetime are + // guaranteed unique. + // + // We put this first for CommonPrefixSlice(), so that a small-ish set of + // cache key prefixes to cover entries relevant to any DB. + session_etc64_ = session_lower; + // This provides extra entopy in case of different DB id or process + // generating a session id, but is also partly/variably obscured by + // file_number and offset (see below). + offset_etc64_ = db_hash; + + // Into offset_etc64_ we are (eventually) going to pack & xor in an offset and + // a file_number, but we might need the file_number to overflow into + // session_etc64_. 
(There must only be one session_etc64_ value per + // file, and preferably shared among many files.) + // + // Figure out how many bytes of file_number we are going to be able to + // pack in with max_offset, though our encoding will only support packing + // in up to 3 bytes of file_number. (16M file numbers is enough for a new + // file number every second for half a year.) + int file_number_bytes_in_offset_etc = + (63 - FloorLog2(max_offset | 0x100000000U)) / 8; + int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8; + + // Assert two bits of metadata + assert(file_number_bytes_in_offset_etc >= 0 && + file_number_bytes_in_offset_etc <= 3); + // Assert we couldn't have used a larger allowed number of bytes (shift + // would chop off bytes). + assert(file_number_bytes_in_offset_etc == 3 || + (max_offset << (file_number_bits_in_offset_etc + 8) >> + (file_number_bits_in_offset_etc + 8)) != max_offset); + + uint64_t mask = (uint64_t{1} << (file_number_bits_in_offset_etc)) - 1; + // Pack into high bits of etc so that offset can go in low bits of etc + // TODO: could be EndianSwapValue? + uint64_t offset_etc_modifier = ReverseBits(file_number & mask); + assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U); + + // Overflow and 3 - byte count (likely both zero) go into session_id part + uint64_t session_etc_modifier = + (file_number >> file_number_bits_in_offset_etc << 2) | + static_cast(3 - file_number_bytes_in_offset_etc); + // Packed into high bits to minimize interference with session id counter. 
+ session_etc_modifier = ReverseBits(session_etc_modifier); + + // Assert session_id part is only modified in extreme cases + assert(session_etc_modifier == 0 || file_number > /*3 bytes*/ 0xffffffU || + max_offset > /*5 bytes*/ 0xffffffffffU); + + // Xor in the modifiers + session_etc64_ ^= session_etc_modifier; + offset_etc64_ ^= offset_etc_modifier; + + // Although DBImpl guarantees (in recent versions) that session_lower is not + // zero, that's not entirely sufficient to guarantee that session_etc64_ is + // not zero (so that the 0 case can be used by CacheKey::CreateUnique*) + if (session_etc64_ == 0U) { + session_etc64_ = session_upper | 1U; + } + assert(session_etc64_ != 0); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,132 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; + +// A standard holder for fixed-size block cache keys (and for related caches). 
+// They are created through one of these, each using its own range of values: +// * CacheKey::CreateUniqueForCacheLifetime +// * CacheKey::CreateUniqueForProcessLifetime +// * Default ctor ("empty" cache key) +// * OffsetableCacheKey->WithOffset +// +// The first two use atomic counters to guarantee uniqueness over the given +// lifetime and the last uses a form of universally unique identifier for +// uniqueness with very high probabilty (and guaranteed for files generated +// during a single process lifetime). +// +// CacheKeys are currently used by calling AsSlice() to pass as a key to +// Cache. For performance, the keys are endianness-dependent (though otherwise +// portable). (Persistable cache entries are not intended to cross platforms.) +class CacheKey { + public: + // For convenience, constructs an "empty" cache key that is never returned + // by other means. + inline CacheKey() : session_etc64_(), offset_etc64_() {} + + inline bool IsEmpty() const { + return (session_etc64_ == 0) & (offset_etc64_ == 0); + } + + // Use this cache key as a Slice (byte order is endianness-dependent) + inline Slice AsSlice() const { + static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key"); + assert(!IsEmpty()); + return Slice(reinterpret_cast(this), sizeof(*this)); + } + + // Create a CacheKey that is unique among others associated with this Cache + // instance. Depends on Cache::NewId. This is useful for block cache + // "reservations". + static CacheKey CreateUniqueForCacheLifetime(Cache *cache); + + // Create a CacheKey that is unique among others for the lifetime of this + // process. This is useful for saving in a static data member so that + // different DB instances can agree on a cache key for shared entities, + // such as for CacheEntryStatsCollector. 
+ static CacheKey CreateUniqueForProcessLifetime(); + + protected: + friend class OffsetableCacheKey; + CacheKey(uint64_t session_etc64, uint64_t offset_etc64) + : session_etc64_(session_etc64), offset_etc64_(offset_etc64) {} + uint64_t session_etc64_; + uint64_t offset_etc64_; +}; + +// A file-specific generator of cache keys, sometimes referred to as the +// "base" cache key for a file because all the cache keys for various offsets +// within the file are computed using simple arithmetic. The basis for the +// general approach is dicussed here: https://github.com/pdillinger/unique_id +// Heavily related to GetUniqueIdFromTableProperties. +// +// If the db_id, db_session_id, and file_number come from the file's table +// properties, then the keys will be stable across DB::Open/Close, backup/ +// restore, import/export, etc. +// +// This class "is a" CacheKey only privately so that it is not misused as +// a ready-to-use CacheKey. +class OffsetableCacheKey : private CacheKey { + public: + // For convenience, constructs an "empty" cache key that should not be used. + inline OffsetableCacheKey() : CacheKey() {} + + // Constructs an OffsetableCacheKey with the given information about a file. + // max_offset is based on file size (see WithOffset) and is required here to + // choose an appropriate (sub-)encoding. This constructor never generates an + // "empty" base key. + OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id, + uint64_t file_number, uint64_t max_offset); + + inline bool IsEmpty() const { + bool result = session_etc64_ == 0; + assert(!(offset_etc64_ > 0 && result)); + return result; + } + + // Construct a CacheKey for an offset within a file, which must be + // <= max_offset provided in constructor. An offset is not necessarily a + // byte offset if a smaller unique identifier of keyable offsets is used. + // + // This class was designed to make this hot code extremely fast. 
+ inline CacheKey WithOffset(uint64_t offset) const { + assert(!IsEmpty()); + assert(offset <= max_offset_); + return CacheKey(session_etc64_, offset_etc64_ ^ offset); + } + + // The "common prefix" is a shared prefix for all the returned CacheKeys, + // that also happens to usually be the same among many files in the same DB, + // so is efficient and highly accurate (not perfectly) for DB-specific cache + // dump selection (but not file-specific). + static constexpr size_t kCommonPrefixSize = 8; + inline Slice CommonPrefixSlice() const { + static_assert(sizeof(session_etc64_) == kCommonPrefixSize, + "8 byte common prefix expected"); + assert(!IsEmpty()); + assert(&this->session_etc64_ == static_cast(this)); + + return Slice(reinterpret_cast(this), kCommonPrefixSize); + } + + // For any max_offset <= this value, the same encoding scheme is guaranteed. + static constexpr uint64_t kMaxOffsetStandardEncoding = 0xffffffffffU; + + private: +#ifndef NDEBUG + uint64_t max_offset_ = 0; +#endif +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,188 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "cache/cache_reservation_manager.h" + +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +CacheReservationManager::CacheReservationManager(std::shared_ptr cache, + bool delayed_decrease) + : delayed_decrease_(delayed_decrease), + cache_allocated_size_(0), + memory_used_(0) { + assert(cache != nullptr); + cache_ = cache; +} + +CacheReservationManager::~CacheReservationManager() { + for (auto* handle : dummy_handles_) { + cache_->Release(handle, true); + } +} + +template +Status CacheReservationManager::UpdateCacheReservation( + std::size_t new_mem_used) { + memory_used_ = new_mem_used; + std::size_t cur_cache_allocated_size = + cache_allocated_size_.load(std::memory_order_relaxed); + if (new_mem_used == cur_cache_allocated_size) { + return Status::OK(); + } else if (new_mem_used > cur_cache_allocated_size) { + Status s = IncreaseCacheReservation(new_mem_used); + return s; + } else { + // In delayed decrease mode, we don't decrease cache reservation + // untill the memory usage is less than 3/4 of what we reserve + // in the cache. + // We do this because + // (1) Dummy entry insertion is expensive in block cache + // (2) Delayed releasing previously inserted dummy entries can save such + // expensive dummy entry insertion on memory increase in the near future, + // which is likely to happen when the memory usage is greater than or equal + // to 3/4 of what we reserve + if (delayed_decrease_ && new_mem_used >= cur_cache_allocated_size / 4 * 3) { + return Status::OK(); + } else { + Status s = DecreaseCacheReservation(new_mem_used); + return s; + } + } +} + +// Explicitly instantiate templates for "CacheEntryRole" values we use. +// This makes it possible to keep the template definitions in the .cc file. 
+template Status CacheReservationManager::UpdateCacheReservation< + CacheEntryRole::kWriteBuffer>(std::size_t new_mem_used); +template Status CacheReservationManager::UpdateCacheReservation< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>( + std::size_t new_mem_used); +// For cache reservation manager unit tests +template Status CacheReservationManager::UpdateCacheReservation< + CacheEntryRole::kMisc>(std::size_t new_mem_used); + +template +Status CacheReservationManager::MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr>* handle) { + assert(handle != nullptr); + Status s = + UpdateCacheReservation(GetTotalMemoryUsed() + incremental_memory_used); + (*handle).reset(new CacheReservationHandle(incremental_memory_used, + shared_from_this())); + return s; +} + +template Status +CacheReservationManager::MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr>* handle); +template Status CacheReservationManager::MakeCacheReservation< + CacheEntryRole::kFilterConstruction>( + std::size_t incremental_memory_used, + std::unique_ptr< + CacheReservationHandle>* handle); + +template +Status CacheReservationManager::IncreaseCacheReservation( + std::size_t new_mem_used) { + Status return_status = Status::OK(); + while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) { + Cache::Handle* handle = nullptr; + return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry, + GetNoopDeleterForRole(), &handle); + + if (return_status != Status::OK()) { + return return_status; + } + + dummy_handles_.push_back(handle); + cache_allocated_size_ += kSizeDummyEntry; + } + return return_status; +} + +Status CacheReservationManager::DecreaseCacheReservation( + std::size_t new_mem_used) { + Status return_status = Status::OK(); + + // Decrease to the smallest multiple of kSizeDummyEntry that is greater than + // or equal to new_mem_used We do addition instead of new_mem_used <= + // 
cache_allocated_size_.load(std::memory_order_relaxed) - kSizeDummyEntry to + // avoid underflow of size_t when cache_allocated_size_ = 0 + while (new_mem_used + kSizeDummyEntry <= + cache_allocated_size_.load(std::memory_order_relaxed)) { + assert(!dummy_handles_.empty()); + auto* handle = dummy_handles_.back(); + cache_->Release(handle, true); + dummy_handles_.pop_back(); + cache_allocated_size_ -= kSizeDummyEntry; + } + return return_status; +} + +std::size_t CacheReservationManager::GetTotalReservedCacheSize() { + return cache_allocated_size_.load(std::memory_order_relaxed); +} + +std::size_t CacheReservationManager::GetTotalMemoryUsed() { + return memory_used_; +} + +Slice CacheReservationManager::GetNextCacheKey() { + // Calling this function will have the side-effect of changing the + // underlying cache_key_ that is shared among other keys generated from this + // fucntion. Therefore please make sure the previous keys are saved/copied + // before calling this function. + cache_key_ = CacheKey::CreateUniqueForCacheLifetime(cache_.get()); + return cache_key_.AsSlice(); +} + +template +Cache::DeleterFn CacheReservationManager::TEST_GetNoopDeleterForRole() { + return GetNoopDeleterForRole(); +} + +template Cache::DeleterFn CacheReservationManager::TEST_GetNoopDeleterForRole< + CacheEntryRole::kFilterConstruction>(); + +template +CacheReservationHandle::CacheReservationHandle( + std::size_t incremental_memory_used, + std::shared_ptr cache_res_mgr) + : incremental_memory_used_(incremental_memory_used) { + assert(cache_res_mgr != nullptr); + cache_res_mgr_ = cache_res_mgr; +} + +template +CacheReservationHandle::~CacheReservationHandle() { + assert(cache_res_mgr_ != nullptr); + assert(cache_res_mgr_->GetTotalMemoryUsed() >= incremental_memory_used_); + + Status s = cache_res_mgr_->UpdateCacheReservation( + cache_res_mgr_->GetTotalMemoryUsed() - incremental_memory_used_); + s.PermitUncheckedError(); +} + +// Explicitly instantiate templates for "CacheEntryRole" 
values we use. +// This makes it possible to keep the template definitions in the .cc file. +template class CacheReservationHandle; +template class CacheReservationHandle; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +template +class CacheReservationHandle; + +// CacheReservationManager is for reserving cache space for the memory used +// through inserting/releasing dummy entries in the cache. +// +// This class is NOT thread-safe, except that GetTotalReservedCacheSize() +// can be called without external synchronization. 
+class CacheReservationManager + : public std::enable_shared_from_this { + public: + // Construct a CacheReservationManager + // @param cache The cache where dummy entries are inserted and released for + // reserving cache space + // @param delayed_decrease If set true, then dummy entries won't be released + // immediately when memory usage decreases. + // Instead, it will be released when the memory usage + // decreases to 3/4 of what we have reserved so far. + // This is for saving some future dummy entry + // insertion when memory usage increases are likely to + // happen in the near future. + explicit CacheReservationManager(std::shared_ptr cache, + bool delayed_decrease = false); + + // no copy constructor, copy assignment, move constructor, move assignment + CacheReservationManager(const CacheReservationManager &) = delete; + CacheReservationManager &operator=(const CacheReservationManager &) = delete; + CacheReservationManager(CacheReservationManager &&) = delete; + CacheReservationManager &operator=(CacheReservationManager &&) = delete; + + ~CacheReservationManager(); + + template + + // One of the two ways of reserving/releasing cache, + // see CacheReservationManager::MakeCacheReservation() for the other. + // Use ONLY one of them to prevent unexpected behavior. + // + // Insert and release dummy entries in the cache to + // match the size of total dummy entries with the least multiple of + // kSizeDummyEntry greater than or equal to new_mem_used + // + // Insert dummy entries if new_memory_used > cache_allocated_size_; + // + // Release dummy entries if new_memory_used < cache_allocated_size_ + // (and new_memory_used < cache_allocated_size_ * 3/4 + // when delayed_decrease is set true); + // + // Keey dummy entries the same if (1) new_memory_used == cache_allocated_size_ + // or (2) new_memory_used is in the interval of + // [cache_allocated_size_ * 3/4, cache_allocated_size) when delayed_decrease + // is set true. 
+ // + // @param new_memory_used The number of bytes used by new memory + // The most recent new_memoy_used passed in will be returned + // in GetTotalMemoryUsed() even when the call return non-ok status. + // + // Since the class is NOT thread-safe, external synchronization on the + // order of calling UpdateCacheReservation() is needed if you want + // GetTotalMemoryUsed() indeed returns the latest memory used. + // + // @return On inserting dummy entries, it returns Status::OK() if all dummy + // entry insertions succeed. + // Otherwise, it returns the first non-ok status; + // On releasing dummy entries, it always returns Status::OK(). + // On keeping dummy entries the same, it always returns Status::OK(). + Status UpdateCacheReservation(std::size_t new_memory_used); + + // One of the two ways of reserving/releasing cache, + // see CacheReservationManager::UpdateCacheReservation() for the other. + // Use ONLY one of them to prevent unexpected behavior. + // + // Insert dummy entries in the cache for the incremental memory usage + // to match the size of total dummy entries with the least multiple of + // kSizeDummyEntry greater than or equal to the total memory used. + // + // A CacheReservationHandle is returned as an output parameter. + // The reserved dummy entries are automatically released on the destruction of + // this handle, which achieves better RAII per cache reservation. + // + // WARNING: Deallocate all the handles of the CacheReservationManager object + // before deallocating the object to prevent unexpected behavior. + // + // @param incremental_memory_used The number of bytes increased in memory + // usage. + // + // Calling GetTotalMemoryUsed() afterward will return the total memory + // increased by this number, even when calling MakeCacheReservation() + // returns non-ok status. 
+ // + // Since the class is NOT thread-safe, external synchronization in + // calling MakeCacheReservation() is needed if you want + // GetTotalMemoryUsed() indeed returns the latest memory used. + // + // @param handle An pointer to std::unique_ptr> that + // manages the lifetime of the handle and its cache reservation. + // + // @return It returns Status::OK() if all dummy + // entry insertions succeed. + // Otherwise, it returns the first non-ok status; + // + // REQUIRES: handle != nullptr + // REQUIRES: The CacheReservationManager object is NOT managed by + // std::unique_ptr as CacheReservationHandle needs to + // shares ownership to the CacheReservationManager object. + template + Status MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr> *handle); + + // Return the size of the cache (which is a multiple of kSizeDummyEntry) + // successfully reserved by calling UpdateCacheReservation(). + // + // When UpdateCacheReservation() returns non-ok status, + // calling GetTotalReservedCacheSize() after that might return a slightly + // smaller number than the actual reserved cache size due to + // the returned number will always be a multiple of kSizeDummyEntry + // and cache full might happen in the middle of inserting a dummy entry. 
+ std::size_t GetTotalReservedCacheSize(); + + // Return the latest total memory used indicated by the most recent call of + // UpdateCacheReservation(std::size_t new_memory_used); + std::size_t GetTotalMemoryUsed(); + + static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; } + + // For testing only - it is to help ensure the NoopDeleterForRole + // accessed from CacheReservationManager and the one accessed from the test + // are from the same translation units + template + static Cache::DeleterFn TEST_GetNoopDeleterForRole(); + + private: + static constexpr std::size_t kSizeDummyEntry = 256 * 1024; + + Slice GetNextCacheKey(); + template + Status IncreaseCacheReservation(std::size_t new_mem_used); + Status DecreaseCacheReservation(std::size_t new_mem_used); + + std::shared_ptr cache_; + bool delayed_decrease_; + std::atomic cache_allocated_size_; + std::size_t memory_used_; + std::vector dummy_handles_; + CacheKey cache_key_; +}; + +// CacheReservationHandle is for managing the lifetime of a cache reservation +// This class is NOT thread-safe +template +class CacheReservationHandle { + public: + // REQUIRES: cache_res_mgr != nullptr + explicit CacheReservationHandle( + std::size_t incremental_memory_used, + std::shared_ptr cache_res_mgr); + + ~CacheReservationHandle(); + + private: + std::size_t incremental_memory_used_; + std::shared_ptr cache_res_mgr_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,506 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "cache/cache_reservation_manager.h" + +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "table/block_based/block_based_table_reader.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +class CacheReservationManagerTest : public ::testing::Test { + protected: + static constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + static constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + static constexpr int kNumShardBits = 0; // 2^0 shard + static constexpr std::size_t kMetaDataChargeOverhead = 10000; + + std::shared_ptr cache = NewLRUCache(kCacheCapacity, kNumShardBits); + std::unique_ptr test_cache_rev_mng; + + CacheReservationManagerTest() { + test_cache_rev_mng.reset(new CacheReservationManager(cache)); + } +}; + +TEST_F(CacheReservationManagerTest, GenerateCacheKey) { + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Next unique Cache key + CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + // Back it up to the one used by CRM (using CacheKey implementation details) + using PairU64 = std::pair; + auto& ckey_pair = *reinterpret_cast(&ckey); + ckey_pair.second--; + + // Specific key (subject to 
implementation details) + EXPECT_EQ(ckey_pair, PairU64(0, 2)); + + Cache::Handle* handle = cache->Lookup(ckey.AsSlice()); + EXPECT_NE(handle, nullptr) + << "Failed to generate the cache key for the dummy entry correctly"; + // Clean up the returned handle from Lookup() to prevent memory leak + cache->Release(handle); +} + +TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) { + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + std::size_t initial_pinned_usage = cache->GetPinnedUsage(); + ASSERT_GE(initial_pinned_usage, 1 * kSizeDummyEntry); + ASSERT_LT(initial_pinned_usage, + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to keep cache reservation the same when new_mem_used equals " + "to current cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly when new_mem_used equals to current " + "cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly when new_mem_used " + "equals to current cache reservation"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to keep underlying dummy entries the same when new_mem_used " + "equals to current cache reservation"; +} + +TEST_F(CacheReservationManagerTest, + IncreaseCacheReservationByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation correctly"; + 
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry) + << "Failed to increase underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to increase underlying dummy entries in cache correctly"; +} + +TEST_F(CacheReservationManagerTest, + IncreaseCacheReservationNotByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry + kSizeDummyEntry / 2; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 3 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 3 * kSizeDummyEntry) + << "Failed to increase underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 3 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to increase underlying dummy entries in cache correctly"; +} + +TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest, + IncreaseCacheReservationOnFullCache) { + ; + constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + constexpr std::size_t kSmallCacheCapacity = 4 * kSizeDummyEntry; + constexpr std::size_t kBigCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kSmallCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr 
cache = NewLRUCache(lo); + std::unique_ptr test_cache_rev_mng( + new CacheReservationManager(cache)); + + std::size_t new_mem_used = kSmallCacheCapacity + 1; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::Incomplete()) + << "Failed to return status to indicate failure of dummy entry insertion " + "during cache reservation on full cache"; + EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly before cache resevation failure happens " + "due to full cache"; + EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(), + kSmallCacheCapacity) + << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy " + "entry insertions) when encountering cache resevation failure due to " + "full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + + new_mem_used = kSmallCacheCapacity / 2; // 2 dummy entries + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation after encountering cache " + "reservation failure due to full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly after " + "encountering cache reservation due to full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry) + << "Failed to 
release underlying dummy entries correctly on cache " + "reservation decrease after encountering cache resevation failure due " + "to full cache"; + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to release underlying dummy entries correctly on cache " + "reservation decrease after encountering cache resevation failure due " + "to full cache"; + + // Create cache full again for subsequent tests + new_mem_used = kSmallCacheCapacity + 1; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::Incomplete()) + << "Failed to return status to indicate failure of dummy entry insertion " + "during cache reservation on full cache"; + EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly before cache resevation failure happens " + "due to full cache"; + EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(), + kSmallCacheCapacity) + << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy " + "entry insertions) when encountering cache resevation failure due to " + "full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + + // Increase cache capacity so the previously failed insertion can fully + // succeed + cache->SetCapacity(kBigCacheCapacity); + new_mem_used = kSmallCacheCapacity + 1; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation after increasing cache capacity " + "and 
mitigating cache full error"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 5 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly after " + "increasing cache capacity and mitigating cache full error"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 5 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly after increasing " + "cache capacity and mitigating cache full error"; + EXPECT_LT(cache->GetPinnedUsage(), + 5 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to insert underlying dummy entries correctly after increasing " + "cache capacity and mitigating cache full error"; +} + +TEST_F(CacheReservationManagerTest, + DecreaseCacheReservationByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = 1 * kSizeDummyEntry; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + 
kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache correctly"; +} + +TEST_F(CacheReservationManagerTest, + DecreaseCacheReservationNotByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = kSizeDummyEntry / 2; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache correctly"; +} + +TEST(CacheReservationManagerWithDelayedDecreaseTest, + DecreaseCacheReservationWithDelayedDecrease) { + constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; + std::shared_ptr cache = NewLRUCache(lo); + std::unique_ptr test_cache_rev_mng( + new CacheReservationManager(cache, true /* delayed_decrease */)); + + std::size_t new_mem_used = 8 * 
kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + std::size_t initial_pinned_usage = cache->GetPinnedUsage(); + ASSERT_GE(initial_pinned_usage, 8 * kSizeDummyEntry); + ASSERT_LT(initial_pinned_usage, + 8 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = 6 * kSizeDummyEntry; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry) + << "Failed to bookkeep correctly when delaying cache reservation " + "decrease"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to delay decreasing underlying dummy entries in cache"; + + new_mem_used = 7 * kSizeDummyEntry; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry) + << "Failed to bookkeep correctly when delaying cache reservation " + "decrease"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to delay decreasing underlying dummy entries in cache"; + + new_mem_used = 6 * kSizeDummyEntry - 1; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly when new_mem_used < " + "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode"; + 
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 6 * kSizeDummyEntry) + << "Failed to bookkeep correctly when new_mem_used < " + "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 6 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache when " + "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed " + "decrease mode"; + EXPECT_LT(cache->GetPinnedUsage(), + 6 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache when " + "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed " + "decrease mode"; +} + +TEST(CacheReservationManagerDestructorTest, + ReleaseRemainingDummyEntriesOnDestruction) { + constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; + std::shared_ptr cache = NewLRUCache(lo); + { + std::unique_ptr test_cache_rev_mng( + new CacheReservationManager(cache)); + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + } + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry) + << "Failed to release remaining underlying dummy entries in cache in " + "CacheReservationManager's destructor"; +} + +TEST(CacheReservationHandleTest, HandleTest) { + constexpr std::size_t kOneGigabyte = 1024 * 1024 * 1024; + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + 
LRUCacheOptions lo; + lo.capacity = kOneGigabyte; + lo.num_shard_bits = 0; + std::shared_ptr cache = NewLRUCache(lo); + + std::shared_ptr test_cache_rev_mng( + std::make_shared(cache)); + + std::size_t mem_used = 0; + const std::size_t incremental_mem_used_handle_1 = 1 * kSizeDummyEntry; + const std::size_t incremental_mem_used_handle_2 = 2 * kSizeDummyEntry; + std::unique_ptr> handle_1, + handle_2; + + // To test consecutive CacheReservationManager::MakeCacheReservation works + // correctly in terms of returning the handle as well as updating cache + // reservation and the latest total memory used + Status s = test_cache_rev_mng->MakeCacheReservation( + incremental_mem_used_handle_1, &handle_1); + mem_used = mem_used + incremental_mem_used_handle_1; + ASSERT_EQ(s, Status::OK()); + EXPECT_TRUE(handle_1 != nullptr); + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + s = test_cache_rev_mng->MakeCacheReservation( + incremental_mem_used_handle_2, &handle_2); + mem_used = mem_used + incremental_mem_used_handle_2; + ASSERT_EQ(s, Status::OK()); + EXPECT_TRUE(handle_2 != nullptr); + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + // To test CacheReservationHandle::~CacheReservationHandle() works correctly + // in releasing the cache reserved for the handle + handle_1.reset(); + EXPECT_TRUE(handle_1 == nullptr); + mem_used = mem_used - incremental_mem_used_handle_1; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + 
EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + // To test the actual CacheReservationManager object won't be deallocated + // as long as there remain handles pointing to it. + // We strongly recommend deallocating CacheReservationManager object only + // after all its handles are deallocated to keep things easy to reasonate + test_cache_rev_mng.reset(); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + handle_2.reset(); + // The CacheReservationManager object is now deallocated since all the handles + // and its original pointer is gone + mem_used = mem_used - incremental_mem_used_handle_2; + EXPECT_EQ(mem_used, 0); + EXPECT_EQ(cache->GetPinnedUsage(), mem_used); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -117,8 +117,8 @@ void Insert(std::shared_ptr cache, int key, int value, int charge = 1) { - cache->Insert(EncodeKey(key), EncodeValue(value), charge, - &CacheTest::Deleter); + EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); } void Erase(std::shared_ptr cache, int key) { @@ -167,9 +167,10 @@ for (int i = 1; i < 100; ++i) { std::string key(i, 'a'); auto kv_size = key.size() + 5; - cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter); - precise_cache->Insert(key, reinterpret_cast(value), kv_size, - dumbDeleter); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + kv_size, 
dumbDeleter)); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); ASSERT_LT(usage, precise_cache->GetUsage()); @@ -183,10 +184,10 @@ // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { auto key = ToString(i); - cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); - precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + key.size() + 5, dumbDeleter)); } // the usage should be close to the capacity @@ -215,11 +216,12 @@ auto kv_size = key.size() + 5; Cache::Handle* handle; Cache::Handle* handle_in_precise_cache; - cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter, - &handle); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter, &handle)); assert(handle); - precise_cache->Insert(key, reinterpret_cast(value), kv_size, - dumbDeleter, &handle_in_precise_cache); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + kv_size, dumbDeleter, + &handle_in_precise_cache)); assert(handle_in_precise_cache); pinned_usage += kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); @@ -254,10 +256,10 @@ // check that overloading the cache does not change the pinned usage for (uint64_t i = 1; i < 2 * kCapacity; ++i) { auto key = ToString(i); - cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); - precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + key.size() + 5, dumbDeleter)); } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); @@ -607,6 +609,9 @@ for (size_t i = 5; i < 10; i++) { cache->Release(handles[i]); } + 
+ // Make sure this doesn't crash or upset ASAN/valgrind + cache->DisownData(); } TEST_P(LRUCacheTest, SetStrictCapacityLimit) { @@ -710,25 +715,98 @@ } namespace { -std::vector> callback_state; -void callback(void* entry, size_t charge) { - callback_state.push_back({DecodeValue(entry), static_cast(charge)}); +std::vector> legacy_callback_state; +void legacy_callback(void* value, size_t charge) { + legacy_callback_state.push_back( + {DecodeValue(value), static_cast(charge)}); } }; -TEST_P(CacheTest, ApplyToAllCacheEntiresTest) { +TEST_P(CacheTest, ApplyToAllCacheEntriesTest) { std::vector> inserted; - callback_state.clear(); + legacy_callback_state.clear(); for (int i = 0; i < 10; ++i) { Insert(i, i * 2, i + 1); inserted.push_back({i * 2, i + 1}); } - cache_->ApplyToAllCacheEntries(callback, true); + cache_->ApplyToAllCacheEntries(legacy_callback, true); + + std::sort(inserted.begin(), inserted.end()); + std::sort(legacy_callback_state.begin(), legacy_callback_state.end()); + ASSERT_EQ(inserted.size(), legacy_callback_state.size()); + for (size_t i = 0; i < inserted.size(); ++i) { + EXPECT_EQ(inserted[i], legacy_callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesTest) { + std::vector callback_state; + const auto callback = [&](const Slice& key, void* value, size_t charge, + Cache::DeleterFn deleter) { + callback_state.push_back(ToString(DecodeKey(key)) + "," + + ToString(DecodeValue(value)) + "," + + ToString(charge)); + assert(deleter == &CacheTest::Deleter); + }; + + std::vector inserted; + callback_state.clear(); + + for (int i = 0; i < 10; ++i) { + Insert(i, i * 2, i + 1); + inserted.push_back(ToString(i) + "," + ToString(i * 2) + "," + + ToString(i + 1)); + } + cache_->ApplyToAllEntries(callback, /*opts*/ {}); std::sort(inserted.begin(), inserted.end()); std::sort(callback_state.begin(), callback_state.end()); - ASSERT_TRUE(inserted == callback_state); + ASSERT_EQ(inserted.size(), callback_state.size()); + for (size_t i = 0; i < inserted.size(); 
++i) { + EXPECT_EQ(inserted[i], callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { + // This is a mini-stress test of ApplyToAllEntries, to ensure + // items in the cache that are neither added nor removed + // during ApplyToAllEntries are counted exactly once. + + // Insert some entries that we expect to be seen exactly once + // during iteration. + constexpr int kSpecialCharge = 2; + constexpr int kNotSpecialCharge = 1; + constexpr int kSpecialCount = 100; + for (int i = 0; i < kSpecialCount; ++i) { + Insert(i, i * 2, kSpecialCharge); + } + + // For callback + int special_count = 0; + const auto callback = [&](const Slice&, void*, size_t charge, + Cache::DeleterFn) { + if (charge == static_cast(kSpecialCharge)) { + ++special_count; + } + }; + + // Start counting + std::thread apply_thread([&]() { + // Use small average_entries_per_lock to make the problem difficult + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = 2; + cache_->ApplyToAllEntries(callback, opts); + }); + + // In parallel, add more entries, enough to cause resize but not enough + // to cause ejections + for (int i = kSpecialCount * 1; i < kSpecialCount * 6; ++i) { + Insert(i, i * 2, kNotSpecialCharge); + } + + apply_thread.join(); + ASSERT_EQ(special_count, kSpecialCount); } TEST_P(CacheTest, DefaultShardBits) { @@ -747,11 +825,12 @@ ASSERT_EQ(6, sc->GetNumShardBits()); } -TEST_P(CacheTest, GetCharge) { +TEST_P(CacheTest, GetChargeAndDeleter) { Insert(1, 2); Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); ASSERT_EQ(1, cache_->GetCharge(h1)); + ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1)); cache_->Release(h1); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/clock_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/clock_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/clock_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/cache/clock_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,11 +33,11 @@ #ifndef ROCKSDB_USE_RTTI #define TBB_USE_EXCEPTIONS 0 #endif -#include "tbb/concurrent_hash_map.h" - #include "cache/sharded_cache.h" +#include "port/lang.h" #include "port/malloc.h" #include "port/port.h" +#include "tbb/concurrent_hash_map.h" #include "util/autovector.h" #include "util/mutexlock.h" @@ -176,13 +176,16 @@ // Cache entry meta data. struct CacheHandle { Slice key; - uint32_t hash; void* value; size_t charge; - void (*deleter)(const Slice&, void* value); + Cache::DeleterFn deleter; + uint32_t hash; + + // Addition to "charge" to get "total charge" under metadata policy. + uint32_t meta_charge; // Flags and counters associated with the cache handle: - // lowest bit: n-cache bit + // lowest bit: in-cache bit // second lowest bit: usage bit // the rest bits: reference count // The handle is unused when flags equals to 0. The thread decreases the count @@ -205,9 +208,8 @@ return *this; } - inline static size_t CalcTotalCharge( - Slice key, size_t charge, - CacheMetadataChargePolicy metadata_charge_policy) { + inline static uint32_t CalcMetadataCharge( + Slice key, CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; if (metadata_charge_policy == kFullChargeCacheMetadata) { meta_charge += sizeof(CacheHandle); @@ -218,32 +220,30 @@ meta_charge += key.size(); #endif } - return charge + meta_charge; + assert(meta_charge <= UINT32_MAX); + return static_cast(meta_charge); } - inline size_t CalcTotalCharge( - CacheMetadataChargePolicy metadata_charge_policy) { - return CalcTotalCharge(key, charge, metadata_charge_policy); - } + inline size_t GetTotalCharge() { return charge + meta_charge; } }; // Key of hash map. We store hash value with the key for convenience. 
-struct CacheKey { +struct ClockCacheKey { Slice key; uint32_t hash_value; - CacheKey() = default; + ClockCacheKey() = default; - CacheKey(const Slice& k, uint32_t h) { + ClockCacheKey(const Slice& k, uint32_t h) { key = k; hash_value = h; } - static bool equal(const CacheKey& a, const CacheKey& b) { + static bool equal(const ClockCacheKey& a, const ClockCacheKey& b) { return a.hash_value == b.hash_value && a.key == b.key; } - static size_t hash(const CacheKey& a) { + static size_t hash(const ClockCacheKey& a) { return static_cast(a.hash_value); } }; @@ -260,7 +260,8 @@ class ClockCacheShard final : public CacheShard { public: // Hash map type. - typedef tbb::concurrent_hash_map HashTable; + using HashTable = + tbb::concurrent_hash_map; ClockCacheShard(); ~ClockCacheShard() override; @@ -271,7 +272,26 @@ Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Cache::Handle** handle, Cache::Priority priority) override; + Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, Cache::Priority priority) override { + return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + } Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) override { + return Lookup(key, hash); + } + bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + bool IsReady(Cache::Handle* /*handle*/) override { return true; } + void Wait(Cache::Handle* /*handle*/) override {} + // If the entry in in cache, increase reference count and return true. // Return false otherwise. 
// @@ -284,8 +304,10 @@ size_t GetUsage() const override; size_t GetPinnedUsage() const override; void EraseUnRefEntries() override; - void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; private: static const uint32_t kInCacheBit = 1; @@ -341,7 +363,8 @@ CacheHandle* Insert(const Slice& key, uint32_t hash, void* value, size_t change, void (*deleter)(const Slice& key, void* value), - bool hold_reference, CleanupContext* context); + bool hold_reference, CleanupContext* context, + bool* overwritten); // Guards list_, head_, and recycle_. In addition, updating table_ also has // to hold the mutex, to avoid the cache being in inconsistent state. @@ -403,22 +426,46 @@ return pinned_usage_.load(std::memory_order_relaxed); } -void ClockCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - if (thread_safe) { - mutex_.Lock(); +void ClockCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + assert(average_entries_per_lock > 0); + MutexLock lock(&mutex_); + + // Figure out the range to iterate, update `state` + size_t list_size = list_.size(); + size_t start_idx = *state; + size_t end_idx = start_idx + average_entries_per_lock; + if (start_idx > list_size) { + // Shouldn't reach here, but recoverable + assert(false); + // Mark finished with all + *state = UINT32_MAX; + return; + } + if (end_idx >= list_size || end_idx >= UINT32_MAX) { + // This also includes the hypothetical case of >4 billion + // cache handles. + end_idx = list_size; + // Mark finished with all + *state = UINT32_MAX; + } else { + *state = static_cast(end_idx); } - for (auto& handle : list_) { - // Use relaxed semantics instead of acquire semantics since we are either - // holding mutex, or don't have thread safe requirement. 
+ + // Do the iteration + auto cur = list_.begin() + start_idx; + auto end = list_.begin() + end_idx; + for (; cur != end; ++cur) { + const CacheHandle& handle = *cur; + // Use relaxed semantics instead of acquire semantics since we are + // holding mutex uint32_t flags = handle.flags.load(std::memory_order_relaxed); if (InCache(flags)) { - callback(handle.value, handle.charge); + callback(handle.key, handle.value, handle.charge, handle.deleter); } } - if (thread_safe) { - mutex_.Unlock(); - } } void ClockCacheShard::RecycleHandle(CacheHandle* handle, @@ -427,10 +474,8 @@ assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); context->to_delete_key.push_back(handle->key.data()); context->to_delete_value.emplace_back(*handle); - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); - handle->key.clear(); - handle->value = nullptr; - handle->deleter = nullptr; + size_t total_charge = handle->GetTotalCharge(); + // clearing `handle` fields would go here but not strictly required recycle_.push_back(handle); usage_.fetch_sub(total_charge, std::memory_order_relaxed); } @@ -458,7 +503,7 @@ std::memory_order_relaxed)) { if (CountRefs(flags) == 0) { // No reference count before the operation. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + size_t total_charge = handle->GetTotalCharge(); pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } return true; @@ -472,6 +517,11 @@ if (set_usage) { handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed); } + // If the handle reaches state refs=0 and InCache=true after this + // atomic operation then we cannot access `handle` afterward, because + // it could be evicted before we access the `handle`. + size_t total_charge = handle->GetTotalCharge(); + // Use acquire-release semantics as previous operations on the cache entry // has to be order before reference count is decreased, and potential cleanup // of the entry has to be order after. 
@@ -479,7 +529,6 @@ assert(CountRefs(flags) > 0); if (CountRefs(flags) == 1) { // this is the last reference. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); // Cleanup if it is the last reference. if (!InCache(flags)) { @@ -511,7 +560,7 @@ if (handle->flags.compare_exchange_strong(flags, 0, std::memory_order_acquire, std::memory_order_relaxed)) { bool erased __attribute__((__unused__)) = - table_.erase(CacheKey(handle->key, handle->hash)); + table_.erase(ClockCacheKey(handle->key, handle->hash)); assert(erased); RecycleHandle(handle, context); return true; @@ -564,9 +613,11 @@ CacheHandle* ClockCacheShard::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), bool hold_reference, - CleanupContext* context) { - size_t total_charge = - CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_); + CleanupContext* context, bool* overwritten) { + assert(overwritten != nullptr && *overwritten == false); + uint32_t meta_charge = + CacheHandle::CalcMetadataCharge(key, metadata_charge_policy_); + size_t total_charge = charge + meta_charge; MutexLock l(&mutex_); bool success = EvictFromCache(total_charge, context); bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); @@ -592,16 +643,27 @@ handle->hash = hash; handle->value = value; handle->charge = charge; + handle->meta_charge = meta_charge; handle->deleter = deleter; uint32_t flags = hold_reference ? 
kInCacheBit + kOneRef : kInCacheBit; + + // TODO investigate+fix suspected race condition: + // [thread 1] Lookup starts, up to Ref() + // [thread 2] Erase/evict the entry just looked up + // [thread 1] Ref() the handle, even though it's in the recycle bin + // [thread 2] Insert with recycling that handle + // Here we obliterate the other thread's Ref + // Possible fix: never blindly overwrite the flags, but only make + // relative updates (fetch_add, etc). handle->flags.store(flags, std::memory_order_relaxed); HashTable::accessor accessor; - if (table_.find(accessor, CacheKey(key, hash))) { + if (table_.find(accessor, ClockCacheKey(key, hash))) { + *overwritten = true; CacheHandle* existing_handle = accessor->second; table_.erase(accessor); UnsetInCache(existing_handle, context); } - table_.insert(HashTable::value_type(CacheKey(key, hash), handle)); + table_.insert(HashTable::value_type(ClockCacheKey(key, hash), handle)); if (hold_reference) { pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } @@ -619,8 +681,9 @@ char* key_data = new char[key.size()]; memcpy(key_data, key.data(), key.size()); Slice key_copy(key_data, key.size()); + bool overwritten = false; CacheHandle* handle = Insert(key_copy, hash, value, charge, deleter, - out_handle != nullptr, &context); + out_handle != nullptr, &context, &overwritten); Status s; if (out_handle != nullptr) { if (handle == nullptr) { @@ -629,13 +692,17 @@ *out_handle = reinterpret_cast(handle); } } + if (overwritten) { + assert(s.ok()); + s = Status::OkOverwritten(); + } Cleanup(context); return s; } Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { HashTable::const_accessor accessor; - if (!table_.find(accessor, CacheKey(key, hash))) { + if (!table_.find(accessor, ClockCacheKey(key, hash))) { return nullptr; } CacheHandle* handle = accessor->second; @@ -680,7 +747,7 @@ MutexLock l(&mutex_); HashTable::accessor accessor; bool erased = false; - if (table_.find(accessor, CacheKey(key, 
hash))) { + if (table_.find(accessor, ClockCacheKey(key, hash))) { CacheHandle* handle = accessor->second; table_.erase(accessor); erased = UnsetInCache(handle, context); @@ -718,11 +785,11 @@ const char* Name() const override { return "ClockCache"; } - CacheShard* GetShard(int shard) override { + CacheShard* GetShard(uint32_t shard) override { return reinterpret_cast(&shards_[shard]); } - const CacheShard* GetShard(int shard) const override { + const CacheShard* GetShard(uint32_t shard) const override { return reinterpret_cast(&shards_[shard]); } @@ -738,7 +805,18 @@ return reinterpret_cast(handle)->hash; } - void DisownData() override { shards_ = nullptr; } + DeleterFn GetDeleter(Handle* handle) const override { + return reinterpret_cast(handle)->deleter; + } + + void DisownData() override { + // Leak data only if that won't generate an ASAN/valgrind warning + if (!kMustFreeHeapAllocations) { + shards_ = nullptr; + } + } + + void WaitAll(std::vector& /*handles*/) override {} private: ClockCacheShard* shards_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,26 +9,31 @@ #include "cache/lru_cache.h" -#include -#include -#include -#include - +#include +#include +#include + +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/lang.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { - Resize(); -} +LRUHandleTable::LRUHandleTable(int max_upper_hash_bits) + : length_bits_(/* historical starting size*/ 4), + list_(new LRUHandle* [size_t{1} << length_bits_] {}), + elems_(0), + max_length_bits_(max_upper_hash_bits) {} LRUHandleTable::~LRUHandleTable() { - 
ApplyToAllCacheEntries([](LRUHandle* h) { - if (!h->HasRefs()) { - h->Free(); - } - }); - delete[] list_; + ApplyToEntriesRange( + [](LRUHandle* h) { + if (!h->HasRefs()) { + h->Free(); + } + }, + 0, uint32_t{1} << length_bits_); } LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { @@ -42,7 +47,7 @@ *ptr = h; if (old == nullptr) { ++elems_; - if (elems_ > length_) { + if ((elems_ >> length_bits_) > 0) { // elems_ >= length // Since each cache entry is fairly large, we aim for a small // average linked list length (<= 1). Resize(); @@ -62,7 +67,7 @@ } LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { - LRUHandle** ptr = &list_[hash & (length_ - 1)]; + LRUHandle** ptr = &list_[hash >> (32 - length_bits_)]; while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) { ptr = &(*ptr)->next_hash; } @@ -70,19 +75,29 @@ } void LRUHandleTable::Resize() { - uint32_t new_length = 16; - while (new_length < elems_ * 1.5) { - new_length *= 2; + if (length_bits_ >= max_length_bits_) { + // Due to reaching limit of hash information, if we made the table + // bigger, we would allocate more addresses but only the same + // number would be used. 
+ return; + } + if (length_bits_ >= 31) { + // Avoid undefined behavior shifting uint32_t by 32 + return; } - LRUHandle** new_list = new LRUHandle*[new_length]; - memset(new_list, 0, sizeof(new_list[0]) * new_length); + + uint32_t old_length = uint32_t{1} << length_bits_; + int new_length_bits = length_bits_ + 1; + std::unique_ptr new_list { + new LRUHandle* [size_t{1} << new_length_bits] {} + }; uint32_t count = 0; - for (uint32_t i = 0; i < length_; i++) { + for (uint32_t i = 0; i < old_length; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { LRUHandle* next = h->next_hash; uint32_t hash = h->hash; - LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + LRUHandle** ptr = &new_list[hash >> (32 - new_length_bits)]; h->next_hash = *ptr; *ptr = h; h = next; @@ -90,23 +105,25 @@ } } assert(elems_ == count); - delete[] list_; - list_ = new_list; - length_ = new_length; + list_ = std::move(new_list); + length_bits_ = new_length_bits; } -LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, - double high_pri_pool_ratio, - bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) +LRUCacheShard::LRUCacheShard( + size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, + bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache) : capacity_(0), high_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), high_pri_pool_ratio_(high_pri_pool_ratio), high_pri_pool_capacity_(0), + table_(max_upper_hash_bits), usage_(0), lru_usage_(0), - mutex_(use_adaptive_mutex) { + mutex_(use_adaptive_mutex), + secondary_cache_(secondary_cache) { set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list lru_.next = &lru_; @@ -138,19 +155,40 @@ } } -void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - const auto applyCallback = [&]() { - 
table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - }; +void LRUCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. + MutexLock l(&mutex_); + uint32_t length_bits = table_.GetLengthBits(); + uint32_t length = uint32_t{1} << length_bits; - if (thread_safe) { - MutexLock l(&mutex_); - applyCallback(); + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow) + assert(average_entries_per_lock < length || *state == 0); + + uint32_t index_begin = *state >> (32 - length_bits); + uint32_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end + index_end = length; + *state = UINT32_MAX; } else { - applyCallback(); + *state = index_end << (32 - length_bits); } + + table_.ApplyToEntriesRange( + [callback](LRUHandle* h) { + DeleterFn deleter = h->IsSecondaryCacheCompatible() + ? 
h->info_.helper->del_cb + : h->info_.deleter; + callback(h->key(), h->value, h->charge, deleter); + }, + index_begin, index_end); } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { @@ -257,8 +295,14 @@ EvictFromLRU(0, &last_reference_list); } + // Try to insert the evicted entries into tiered cache // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } entry->Free(); } } @@ -268,17 +312,181 @@ strict_capacity_limit_ = strict_capacity_limit; } -Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { - MutexLock l(&mutex_); - LRUHandle* e = table_.Lookup(key, hash); - if (e != nullptr) { - assert(e->InCache()); - if (!e->HasRefs()) { - // The entry is in LRU since it's in hash and has no external references - LRU_Remove(e); +Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle, + bool free_handle_on_fail) { + Status s = Status::OK(); + autovector last_reference_list; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + + { + MutexLock l(&mutex_); + + // Free the space following strict LRU policy until enough space + // is freed or the lru list is empty + EvictFromLRU(total_charge, &last_reference_list); + + if ((usage_ + total_charge) > capacity_ && + (strict_capacity_limit_ || handle == nullptr)) { + e->SetInCache(false); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry inserted + // into cache and get evicted immediately. + last_reference_list.push_back(e); + } else { + if (free_handle_on_fail) { + delete[] reinterpret_cast(e); + *handle = nullptr; + } + s = Status::Incomplete("Insert failed due to LRU cache being full."); + } + } else { + // Insert into the cache. 
Note that the cache might get larger than its + // capacity if not enough space was freed up. + LRUHandle* old = table_.Insert(e); + usage_ += total_charge; + if (old != nullptr) { + s = Status::OkOverwritten(); + assert(old->InCache()); + old->SetInCache(false); + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 + LRU_Remove(old); + size_t old_total_charge = + old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; + last_reference_list.push_back(old); + } + } + if (handle == nullptr) { + LRU_Insert(e); + } else { + // If caller already holds a ref, no need to take one here + if (!e->HasRefs()) { + e->Ref(); + } + *handle = reinterpret_cast(e); + } + } + } + + // Try to insert the evicted entries into the secondary cache + // Free the entries here outside of mutex for performance reasons + for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } + entry->Free(); + } + + return s; +} + +void LRUCacheShard::Promote(LRUHandle* e) { + SecondaryCacheResultHandle* secondary_handle = e->sec_handle; + + assert(secondary_handle->IsReady()); + e->SetIncomplete(false); + e->SetInCache(true); + e->SetPromoted(true); + e->value = secondary_handle->Value(); + e->charge = secondary_handle->Size(); + delete secondary_handle; + + // This call could fail if the cache is over capacity and + // strict_capacity_limit_ is true. In such a case, we don't want + // InsertItem() to free the handle, since the item is already in memory + // and the caller will most likely just read from disk if we erase it here. 
+ if (e->value) { + Cache::Handle* handle = reinterpret_cast(e); + Status s = InsertItem(e, &handle, /*free_handle_on_fail=*/false); + if (!s.ok()) { + // Item is in memory, but not accounted against the cache capacity. + // When the handle is released, the item should get deleted + assert(!e->InCache()); + } + } else { + // Since the secondary cache lookup failed, mark the item as not in cache + // Don't charge the cache as its only metadata that'll shortly be released + MutexLock l(&mutex_); + e->charge = 0; + e->SetInCache(false); + } +} + +Cache::Handle* LRUCacheShard::Lookup( + const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, Cache::Priority priority, + bool wait, Statistics* stats) { + LRUHandle* e = nullptr; + { + MutexLock l(&mutex_); + e = table_.Lookup(key, hash); + if (e != nullptr) { + assert(e->InCache()); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references + LRU_Remove(e); + } + e->Ref(); + e->SetHit(); + } + } + + // If handle table lookup failed, then allocate a handle outside the + // mutex if we're going to lookup in the secondary cache + // Only support synchronous for now + // TODO: Support asynchronous lookup in secondary cache + if (!e && secondary_cache_ && helper && helper->saveto_cb) { + // For objects from the secondary cache, we expect the caller to provide + // a way to create/delete the primary cache object. The only case where + // a deleter would not be required is for dummy entries inserted for + // accounting purposes, which we won't demote to the secondary cache + // anyway. 
+ assert(create_cb && helper->del_cb); + std::unique_ptr secondary_handle = + secondary_cache_->Lookup(key, create_cb, wait); + if (secondary_handle != nullptr) { + e = reinterpret_cast( + new char[sizeof(LRUHandle) - 1 + key.size()]); + + e->flags = 0; + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + e->key_length = key.size(); + e->hash = hash; + e->refs = 0; + e->next = e->prev = nullptr; + e->SetPriority(priority); + memcpy(e->key_data, key.data(), key.size()); + e->value = nullptr; + e->sec_handle = secondary_handle.release(); + e->Ref(); + + if (wait) { + Promote(e); + if (!e->value) { + // The secondary cache returned a handle, but the lookup failed + e->Unref(); + e->Free(); + e = nullptr; + } else { + PERF_COUNTER_ADD(secondary_cache_hit_count, 1); + RecordTick(stats, SECONDARY_CACHE_HITS); + } + } else { + // If wait is false, we always return a handle and let the caller + // release the handle after checking for success or failure + e->SetIncomplete(true); + // This may be slightly inaccurate, if the lookup eventually fails. + // But the probability is very low. + PERF_COUNTER_ADD(secondary_cache_hit_count, 1); + RecordTick(stats, SECONDARY_CACHE_HITS); + } } - e->Ref(); - e->SetHit(); } return reinterpret_cast(e); } @@ -322,7 +530,12 @@ last_reference = false; } } - if (last_reference) { + // If it was the last reference, and the entry is either not secondary + // cache compatible (i.e a dummy entry for accounting), or is secondary + // cache compatible and has a non-null value, then decrement the cache + // usage. If value is null in the latter case, taht means the lookup + // failed and we didn't charge the cache. 
+ if (last_reference && (!e->IsSecondaryCacheCompatible() || e->value)) { size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); assert(usage_ >= total_charge); usage_ -= total_charge; @@ -339,80 +552,35 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), + const Cache::CacheItemHelper* helper, Cache::Handle** handle, Cache::Priority priority) { // Allocate the memory here outside of the mutex // If the cache is full, we'll have to release it // It shouldn't happen very often though. LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s = Status::OK(); - autovector last_reference_list; e->value = value; - e->deleter = deleter; + e->flags = 0; + if (helper) { + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + } else { +#ifdef __SANITIZE_THREAD__ + e->is_secondary_cache_compatible_for_tsan = false; +#endif // __SANITIZE_THREAD__ + e->info_.deleter = deleter; + } e->charge = charge; e->key_length = key.size(); - e->flags = 0; e->hash = hash; e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); - size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); - - { - MutexLock l(&mutex_); - - // Free the space following strict LRU policy until enough space - // is freed or the lru list is empty - EvictFromLRU(total_charge, &last_reference_list); - - if ((usage_ + total_charge) > capacity_ && - (strict_capacity_limit_ || handle == nullptr)) { - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry inserted - // into cache and get evicted immediately. - e->SetInCache(false); - last_reference_list.push_back(e); - } else { - delete[] reinterpret_cast(e); - *handle = nullptr; - s = Status::Incomplete("Insert failed due to LRU cache being full."); - } - } else { - // Insert into the cache. 
Note that the cache might get larger than its - // capacity if not enough space was freed up. - LRUHandle* old = table_.Insert(e); - usage_ += total_charge; - if (old != nullptr) { - assert(old->InCache()); - old->SetInCache(false); - if (!old->HasRefs()) { - // old is on LRU because it's in cache and its reference count is 0 - LRU_Remove(old); - size_t old_total_charge = - old->CalcTotalCharge(metadata_charge_policy_); - assert(usage_ >= old_total_charge); - usage_ -= old_total_charge; - last_reference_list.push_back(old); - } - } - if (handle == nullptr) { - LRU_Insert(e); - } else { - e->Ref(); - *handle = reinterpret_cast(e); - } - } - } - - // Free the entries here outside of mutex for performance reasons - for (auto entry : last_reference_list) { - entry->Free(); - } - return s; + return InsertItem(e, handle, /* free_handle_on_fail */ true); } void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { @@ -442,6 +610,18 @@ } } +bool LRUCacheShard::IsReady(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + MutexLock l(&mutex_); + bool ready = true; + if (e->IsPending()) { + assert(secondary_cache_); + assert(e->sec_handle); + ready = e->sec_handle->IsReady(); + } + return ready; +} + size_t LRUCacheShard::GetUsage() const { MutexLock l(&mutex_); return usage_; @@ -468,7 +648,8 @@ bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, std::move(allocator)) { num_shards_ = 1 << num_shard_bits; @@ -476,10 +657,12 @@ port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_)); size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; for (int i = 0; i < num_shards_; i++) { - new (&shards_[i]) - LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, - 
use_adaptive_mutex, metadata_charge_policy); + new (&shards_[i]) LRUCacheShard( + per_shard, strict_capacity_limit, high_pri_pool_ratio, + use_adaptive_mutex, metadata_charge_policy, + /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache); } + secondary_cache_ = secondary_cache; } LRUCache::~LRUCache() { @@ -492,11 +675,11 @@ } } -CacheShard* LRUCache::GetShard(int shard) { +CacheShard* LRUCache::GetShard(uint32_t shard) { return reinterpret_cast(&shards_[shard]); } -const CacheShard* LRUCache::GetShard(int shard) const { +const CacheShard* LRUCache::GetShard(uint32_t shard) const { return reinterpret_cast(&shards_[shard]); } @@ -508,23 +691,25 @@ return reinterpret_cast(handle)->charge; } +Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { + auto h = reinterpret_cast(handle); + if (h->IsSecondaryCacheCompatible()) { + return h->info_.helper->del_cb; + } else { + return h->info_.deleter; + } +} + uint32_t LRUCache::GetHash(Handle* handle) const { return reinterpret_cast(handle)->hash; } void LRUCache::DisownData() { -// Do not drop data if compile with ASAN to suppress leak warning. 
-#if defined(__clang__) -#if !defined(__has_feature) || !__has_feature(address_sanitizer) - shards_ = nullptr; - num_shards_ = 0; -#endif -#else // __clang__ -#ifndef __SANITIZE_ADDRESS__ - shards_ = nullptr; - num_shards_ = 0; -#endif // !__SANITIZE_ADDRESS__ -#endif // __clang__ + // Leak data only if that won't generate an ASAN/valgrind warning + if (!kMustFreeHeapAllocations) { + shards_ = nullptr; + num_shards_ = 0; + } } size_t LRUCache::TEST_GetLRUSize() { @@ -543,19 +728,42 @@ return result; } -std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { - return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, - cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, - cache_opts.metadata_charge_policy); +void LRUCache::WaitAll(std::vector& handles) { + if (secondary_cache_) { + std::vector sec_handles; + sec_handles.reserve(handles.size()); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + sec_handles.emplace_back(lru_handle->sec_handle); + } + secondary_cache_->WaitAll(sec_handles); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + uint32_t hash = GetHash(handle); + LRUCacheShard* shard = static_cast(GetShard(Shard(hash))); + shard->Promote(lru_handle); + } + } } std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) { + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -568,7 +776,25 @@ } return 
std::make_shared( capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy); + std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy, + secondary_cache); +} + +std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { + return NewLRUCache( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); } +std::shared_ptr NewLRUCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, + std::shared_ptr memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, + metadata_charge_policy, nullptr); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). @@ -8,12 +8,14 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once +#include #include #include "cache/sharded_cache.h" - +#include "port/lang.h" #include "port/malloc.h" #include "port/port.h" +#include "rocksdb/secondary_cache.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -49,8 +51,18 @@ struct LRUHandle { void* value; - void (*deleter)(const Slice&, void* value); - LRUHandle* next_hash; + union Info { + Info() {} + ~Info() {} + Cache::DeleterFn deleter; + const ShardedCache::CacheItemHelper* helper; + } info_; + // An entry is not added to the LRUHandleTable until the secondary cache + // lookup is complete, so its safe to have this union. + union { + LRUHandle* next_hash; + SecondaryCacheResultHandle* sec_handle; + }; LRUHandle* next; LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? @@ -67,12 +79,26 @@ IS_HIGH_PRI = (1 << 1), // Whether this entry is in high-pri pool. IN_HIGH_PRI_POOL = (1 << 2), - // Wwhether this entry has had any lookups (hits). + // Whether this entry has had any lookups (hits). HAS_HIT = (1 << 3), + // Can this be inserted into the secondary cache + IS_SECONDARY_CACHE_COMPATIBLE = (1 << 4), + // Is the handle still being read from a lower tier + IS_PENDING = (1 << 5), + // Has the item been promoted from a lower tier + IS_PROMOTED = (1 << 6), }; uint8_t flags; +#ifdef __SANITIZE_THREAD__ + // TSAN can report a false data race on flags, where one thread is writing + // to one of the mutable bits and another thread is reading this immutable + // bit. So precisely suppress that TSAN warning, we separate out this bit + // during TSAN runs. + bool is_secondary_cache_compatible_for_tsan; +#endif // __SANITIZE_THREAD__ + // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!) 
char key_data[1]; @@ -95,6 +121,15 @@ bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } bool HasHit() const { return flags & HAS_HIT; } + bool IsSecondaryCacheCompatible() const { +#ifdef __SANITIZE_THREAD__ + return is_secondary_cache_compatible_for_tsan; +#else + return flags & IS_SECONDARY_CACHE_COMPATIBLE; +#endif // __SANITIZE_THREAD__ + } + bool IsPending() const { return flags & IS_PENDING; } + bool IsPromoted() const { return flags & IS_PROMOTED; } void SetInCache(bool in_cache) { if (in_cache) { @@ -122,15 +157,58 @@ void SetHit() { flags |= HAS_HIT; } + void SetSecondaryCacheCompatible(bool compat) { + if (compat) { + flags |= IS_SECONDARY_CACHE_COMPATIBLE; + } else { + flags &= ~IS_SECONDARY_CACHE_COMPATIBLE; + } +#ifdef __SANITIZE_THREAD__ + is_secondary_cache_compatible_for_tsan = compat; +#endif // __SANITIZE_THREAD__ + } + + void SetIncomplete(bool incomp) { + if (incomp) { + flags |= IS_PENDING; + } else { + flags &= ~IS_PENDING; + } + } + + void SetPromoted(bool promoted) { + if (promoted) { + flags |= IS_PROMOTED; + } else { + flags &= ~IS_PROMOTED; + } + } + void Free() { assert(refs == 0); - if (deleter) { - (*deleter)(key(), value); +#ifdef __SANITIZE_THREAD__ + // Here we can safely assert they are the same without a data race reported + assert(((flags & IS_SECONDARY_CACHE_COMPATIBLE) != 0) == + is_secondary_cache_compatible_for_tsan); +#endif // __SANITIZE_THREAD__ + if (!IsSecondaryCacheCompatible() && info_.deleter) { + (*info_.deleter)(key(), value); + } else if (IsSecondaryCacheCompatible()) { + if (IsPending()) { + assert(sec_handle != nullptr); + SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; + tmp_sec_handle->Wait(); + value = tmp_sec_handle->Value(); + delete tmp_sec_handle; + } + if (value) { + (*info_.helper->del_cb)(key(), value); + } } delete[] reinterpret_cast(this); } - // Caclculate the memory usage by metadata + // Calculate the memory usage by 
metadata inline size_t CalcTotalCharge( CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; @@ -153,7 +231,10 @@ // 4.4.3's builtin hashtable. class LRUHandleTable { public: - LRUHandleTable(); + // If the table uses more hash bits than `max_upper_hash_bits`, + // it will eat into the bits used for sharding, which are constant + // for a given LRUHandleTable. + explicit LRUHandleTable(int max_upper_hash_bits); ~LRUHandleTable(); LRUHandle* Lookup(const Slice& key, uint32_t hash); @@ -161,8 +242,8 @@ LRUHandle* Remove(const Slice& key, uint32_t hash); template - void ApplyToAllCacheEntries(T func) { - for (uint32_t i = 0; i < length_; i++) { + void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { + for (uint32_t i = index_begin; i < index_end; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { auto n = h->next_hash; @@ -173,6 +254,8 @@ } } + int GetLengthBits() const { return length_bits_; } + private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a @@ -181,11 +264,19 @@ void Resize(); + // Number of hash bits (upper because lower bits used for sharding) + // used for table index. Length == 1 << length_bits_ + int length_bits_; + // The table consists of an array of buckets where each bucket is // a linked list of cache entries that hash into the bucket. - LRUHandle** list_; - uint32_t length_; + std::unique_ptr list_; + + // Number of elements currently in the table uint32_t elems_; + + // Set from max_upper_hash_bits (see constructor) + const int max_length_bits_; }; // A single shard of sharded cache. 
@@ -193,7 +284,9 @@ public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy); + CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache); virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache @@ -209,11 +302,35 @@ // Like Cache methods, but with an extra "hash" parameter. virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, - Cache::Priority priority) override; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Priority priority) override { + return Insert(key, hash, value, charge, deleter, nullptr, handle, priority); + } + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, + Cache::Priority priority) override { + assert(helper); + return Insert(key, hash, value, charge, nullptr, helper, handle, priority); + } + // If helper_cb is null, the values of the following arguments don't + // matter + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, + ShardedCache::Priority priority, bool wait, + Statistics* stats) override; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override { + return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true, + nullptr); + } + virtual bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + virtual bool IsReady(Cache::Handle* /*handle*/) override; + virtual void Wait(Cache::Handle* /*handle*/) override {} virtual bool 
Ref(Cache::Handle* handle) override; virtual bool Release(Cache::Handle* handle, bool force_erase = false) override; @@ -226,8 +343,10 @@ virtual size_t GetUsage() const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; virtual void EraseUnRefEntries() override; @@ -239,10 +358,27 @@ // not threadsafe size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: + friend class LRUCache; + // Insert an item into the hash table and, if handle is null, insert into + // the LRU list. Older items are evicted as necessary. If the cache is full + // and free_handle_on_fail is true, the item is deleted and handle is set to. + Status InsertItem(LRUHandle* item, Cache::Handle** handle, + bool free_handle_on_fail); + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + DeleterFn deleter, const Cache::CacheItemHelper* helper, + Cache::Handle** handle, Cache::Priority priority); + // Promote an item looked up from the secondary cache to the LRU cache. The + // item is only inserted into the hash table and not the LRU list, and only + // if the cache is not at full capacity, as is the case during Insert. The + // caller should hold a reference on the LRUHandle. When the caller releases + // the last reference, the item is added to the LRU list. + // The item is promoted to the high pri or low pri pool as specified by the + // caller in Lookup. + void Promote(LRUHandle* e); void LRU_Remove(LRUHandle* e); void LRU_Insert(LRUHandle* e); @@ -303,6 +439,8 @@ // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. 
mutable port::Mutex mutex_; + + std::shared_ptr secondary_cache_; }; class LRUCache @@ -316,24 +454,28 @@ std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex, CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata); + kDontChargeCacheMetadata, + const std::shared_ptr& secondary_cache = nullptr); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } - virtual CacheShard* GetShard(int shard) override; - virtual const CacheShard* GetShard(int shard) const override; + virtual CacheShard* GetShard(uint32_t shard) override; + virtual const CacheShard* GetShard(uint32_t shard) const override; virtual void* Value(Handle* handle) override; virtual size_t GetCharge(Handle* handle) const override; virtual uint32_t GetHash(Handle* handle) const override; + virtual DeleterFn GetDeleter(Handle* handle) const override; virtual void DisownData() override; + virtual void WaitAll(std::vector& handles) override; // Retrieves number of elements in LRU, for unit test purpose only size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: LRUCacheShard* shards_ = nullptr; int num_shards_ = 0; + std::shared_ptr secondary_cache_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,21 @@ #include #include + +#include "cache/cache_key.h" +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" +#include 
"rocksdb/utilities/cache_dump_load.h" #include "test_util/testharness.h" +#include "util/coding.h" +#include "util/random.h" +#include "utilities/cache_dump_load_impl.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -30,15 +43,17 @@ DeleteCache(); cache_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); - new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/, - high_pri_pool_ratio, use_adaptive_mutex, - kDontChargeCacheMetadata); + new (cache_) LRUCacheShard( + capacity, false /*strict_capcity_limit*/, high_pri_pool_ratio, + use_adaptive_mutex, kDontChargeCacheMetadata, + 24 /*max_upper_hash_bits*/, nullptr /*secondary_cache*/); } void Insert(const std::string& key, Cache::Priority priority = Cache::Priority::LOW) { - cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/, - nullptr /*deleter*/, nullptr /*handle*/, priority); + EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/, + nullptr /*deleter*/, nullptr /*handle*/, + priority)); } void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) { @@ -190,6 +205,1641 @@ ValidateLRUList({"e", "f", "g", "Z", "d"}, 2); } +class TestSecondaryCache : public SecondaryCache { + public: + // Specifies what action to take on a lookup for a particular key + enum ResultType { + SUCCESS, + // Fail lookup immediately + FAIL, + // Defer the result. 
It will returned after Wait/WaitAll is called + DEFER, + // Defer the result and eventually return failure + DEFER_AND_FAIL + }; + + using ResultMap = std::unordered_map; + + explicit TestSecondaryCache(size_t capacity) + : num_inserts_(0), num_lookups_(0), inject_failure_(false) { + cache_ = NewLRUCache(capacity, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + } + ~TestSecondaryCache() override { cache_.reset(); } + + const char* Name() const override { return "TestSecondaryCache"; } + + void InjectFailure() { inject_failure_ = true; } + + void ResetInjectFailure() { inject_failure_ = false; } + + void SetDbSessionId(const std::string& db_session_id) { + // NOTE: we assume the file is smaller than kMaxFileSizeStandardEncoding + // for this to work, but that's safe in a test. + auto base = OffsetableCacheKey("unknown", db_session_id, 1, 1); + ckey_prefix_ = base.CommonPrefixSlice().ToString(); + } + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) override { + if (inject_failure_) { + return Status::Corruption("Insertion Data Corrupted"); + } + EXPECT_TRUE(IsDbSessionLowerAsKeyPrefix(key)); + size_t size; + char* buf; + Status s; + + num_inserts_++; + size = (*helper->size_cb)(value); + buf = new char[size + sizeof(uint64_t)]; + EncodeFixed64(buf, size); + s = (*helper->saveto_cb)(value, 0, size, buf + sizeof(uint64_t)); + if (!s.ok()) { + delete[] buf; + return s; + } + return cache_->Insert(key, buf, size, + [](const Slice& /*key*/, void* val) -> void { + delete[] static_cast(val); + }); + } + + std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, + bool /*wait*/) override { + std::string key_str = key.ToString(); + TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); + + std::unique_ptr secondary_handle; + ResultType type = ResultType::SUCCESS; + auto iter = result_map_.find(key.ToString()); + if (iter != result_map_.end()) { + type = iter->second; + 
} + if (type == ResultType::FAIL) { + return secondary_handle; + } + + Cache::Handle* handle = cache_->Lookup(key); + num_lookups_++; + if (handle) { + void* value = nullptr; + size_t charge = 0; + Status s; + if (type != ResultType::DEFER_AND_FAIL) { + char* ptr = (char*)cache_->Value(handle); + size_t size = DecodeFixed64(ptr); + ptr += sizeof(uint64_t); + s = create_cb(ptr, size, &value, &charge); + } + if (s.ok()) { + secondary_handle.reset(new TestSecondaryCacheResultHandle( + cache_.get(), handle, value, charge, type)); + } else { + cache_->Release(handle); + } + } + return secondary_handle; + } + + void Erase(const Slice& /*key*/) override {} + + void WaitAll(std::vector handles) override { + for (SecondaryCacheResultHandle* handle : handles) { + TestSecondaryCacheResultHandle* sec_handle = + static_cast(handle); + sec_handle->SetReady(); + } + } + + std::string GetPrintableOptions() const override { return ""; } + + void SetResultMap(ResultMap&& map) { result_map_ = std::move(map); } + + uint32_t num_inserts() { return num_inserts_; } + + uint32_t num_lookups() { return num_lookups_; } + + bool IsDbSessionLowerAsKeyPrefix(const Slice& key) { + return key.starts_with(ckey_prefix_); + } + + private: + class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle { + public: + TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle, + void* value, size_t size, ResultType type) + : cache_(cache), + handle_(handle), + value_(value), + size_(size), + is_ready_(true) { + if (type != ResultType::SUCCESS) { + is_ready_ = false; + } + } + + ~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); } + + bool IsReady() override { return is_ready_; } + + void Wait() override {} + + void* Value() override { + assert(is_ready_); + return value_; + } + + size_t Size() override { return Value() ? 
size_ : 0; } + + void SetReady() { is_ready_ = true; } + + private: + Cache* cache_; + Cache::Handle* handle_; + void* value_; + size_t size_; + bool is_ready_; + }; + + std::shared_ptr cache_; + uint32_t num_inserts_; + uint32_t num_lookups_; + bool inject_failure_; + std::string ckey_prefix_; + ResultMap result_map_; +}; + +class DBSecondaryCacheTest : public DBTestBase { + public: + DBSecondaryCacheTest() + : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +class LRUSecondaryCacheTest : public LRUCacheTest { + public: + LRUSecondaryCacheTest() : fail_create_(false) {} + ~LRUSecondaryCacheTest() {} + + protected: + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() {} + + char* Buf() { return buf_.get(); } + size_t Size() { return size_; } + std::string ToString() { return std::string(Buf(), Size()); } + + private: + std::unique_ptr buf_; + size_t size_; + }; + + static size_t SizeCallback(void* obj) { + return reinterpret_cast(obj)->Size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + TestItem* item = reinterpret_cast(from_obj); + char* buf = item->Buf(); + EXPECT_EQ(length, item->Size()); + EXPECT_EQ(from_offset, 0); + memcpy(out, buf, length); + return Status::OK(); + } + + static void DeletionCallback(const Slice& /*key*/, void* obj) { + delete reinterpret_cast(obj); + } + + static Cache::CacheItemHelper helper_; + + static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/, + size_t /*size*/, void* /*out*/) { + return Status::NotSupported(); + } + + static Cache::CacheItemHelper helper_fail_; + + Cache::CreateCallback test_item_creator = + [&](void* buf, size_t size, void** 
out_obj, size_t* charge) -> Status { + if (fail_create_) { + return Status::NotSupported(); + } + *out_obj = reinterpret_cast(new TestItem((char*)buf, size)); + *charge = size; + return Status::OK(); + }; + + void SetFailCreate(bool fail) { fail_create_ = fail; } + + private: + bool fail_create_; +}; + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_( + LRUSecondaryCacheTest::SizeCallback, LRUSecondaryCacheTest::SaveToCallback, + LRUSecondaryCacheTest::DeletionCallback); + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_fail_( + LRUSecondaryCacheTest::SizeCallback, + LRUSecondaryCacheTest::SaveToCallbackFail, + LRUSecondaryCacheTest::DeletionCallback); + +TEST_F(LRUSecondaryCacheTest, BasicTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr stats = CreateDBStatistics(); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + get_perf_context()->Reset(); + Cache::Handle* handle; + handle = + cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, test_item_creator, + Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should promote k1 and demote k2 + handle = + cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, test_item_creator, + Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 2u); 
+ ASSERT_EQ(secondary_cache->num_lookups(), 1u); + ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_HITS), + secondary_cache->num_lookups()); + PerfContext perf_ctx = *get_perf_context(); + ASSERT_EQ(perf_ctx.secondary_cache_hit_count, secondary_cache->num_lookups()); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, BasicFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_NOK(cache->Insert("k1", item1, nullptr, str1.length())); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", nullptr, test_item_creator, Cache::Priority::LOW, + true); + ASSERT_EQ(handle, nullptr); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false); + ASSERT_EQ(handle, nullptr); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, SaveFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_fail_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_fail_, + 
str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 demotion would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, CreateFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + SetFailCreate(true); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 creation would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, 
nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, FullCapacityTest) { + LRUCacheOptions opts(1024, 0, /*_strict_capacity_limit=*/true, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + // k1 promotion should fail due to the block cache being at capacity, + // but the lookup should still succeed + Cache::Handle* handle2; + handle2 = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle2, nullptr); + // Since k1 didn't get inserted, k2 should still be in cache + cache->Release(handle); + cache->Release(handle2); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + 
ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. 
Note that, block_1 is never successfully + // inserted to the block cache. Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + + Destroy(options); +} + +// In this test, the block cache size is set to 6100, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// insert and cache block_1 in the block cache (this is the different place +// from TestSecondaryCacheCorrectness1) +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { + LRUCacheOptions opts(6100, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST 
file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Thefore, block_1 is evicted from block + // cache and successfully inserted to the secondary cache. Here are 2 + // lookups in the secondary cache for block_1 and block_2. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. After Flush, only block_2 is cached in block cache + // and block_1 is in the secondary cache. So when read block_1, it is + // read out from secondary cache and inserted to block cache. At the same + // time, block_2 is inserted to secondary cache. Now, secondary cache has + // both block_1 and block_2. After compaction, block_1 is in the cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is not in the block cache. So + // it will lookup the secondary cache for block_2 and cache it in the + // block_cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is already in the block cache. + // No need to lookup secondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is not in block cache + // there is one econdary cache lookup. Then, block_1 is cached in the + // block cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + Destroy(options); +} + +// The block cache size is set to 1024*1024, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// cache all the blocks in the block cache and there is not secondary cache +// insertion. 2 lookup is needed for the blocks. 
+TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { + LRUCacheOptions opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. Now, block cache is large enough, it cache + // both block_1 and block_2. When first time read block_1 and block_2 + // there are cache misses. So 2 secondary cache lookups are needed for + // the 2 blocks + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will iterate the whole SST file. Since all the data blocks + // are in the block cache. No need to lookup the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1000, v.size()); + // Since the block cache is large enough, all the blocks are cached. we + // do not need to lookup the seondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { + LRUCacheOptions opts(8 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + Compact("a", "z"); + + Random r_index(47); + std::string v; + for (int i = 0; i < 1000; i++) { + uint32_t key_i = r_index.Next() % N; + v = Get(Key(key_i)); + } + + // We have over 200 data blocks there will be multiple insertion + // and lookups. + ASSERT_GE(secondary_cache->num_inserts(), 1u); + ASSERT_GE(secondary_cache->num_lookups(), 1u); + + Destroy(options); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. 
+TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Note that, block_1 is never successfully + // inserted to the block cache. Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + // Fail the insertion, in LRU cache, the secondary insertion returned status + // is not checked, therefore, the DB will not be influenced. + secondary_cache->InjectFailure(); + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + secondary_cache->ResetInjectFailure(); + + Destroy(options); +} + +TEST_F(LRUSecondaryCacheTest, BasicWaitAllTest) { + LRUCacheOptions opts(1024, 2, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(32 * 1024); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + const int num_keys = 32; + + Random rnd(301); + std::vector values; + for (int i = 0; i < num_keys; ++i) { + std::string str = rnd.RandomString(1020); + values.emplace_back(str); + TestItem* item = new TestItem(str.data(), str.length()); + ASSERT_OK(cache->Insert("k" + std::to_string(i), item, + &LRUSecondaryCacheTest::helper_, str.length())); + } + // Force all entries to be evicted to the secondary cache + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 32u); + cache->SetCapacity(32 * 1024); + + secondary_cache->SetResultMap( + {{"k3", TestSecondaryCache::ResultType::DEFER}, + {"k4", TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {"k5", TestSecondaryCache::ResultType::FAIL}}); + std::vector results; + for (int i = 0; i < 6; ++i) { + results.emplace_back( + cache->Lookup("k" + std::to_string(i), &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false)); + } + cache->WaitAll(results); + for (int i = 0; i < 6; ++i) { + if (i == 4) { + ASSERT_EQ(cache->Value(results[i]), nullptr); + } else if (i == 5) { + ASSERT_EQ(results[i], nullptr); + continue; + } else { + TestItem* item = static_cast(cache->Value(results[i])); + ASSERT_EQ(item->ToString(), values[i]); + } + cache->Release(results[i]); + } + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, we have one KV pair per data block. 
We indirectly determine +// the cache key associated with each data block (and thus each KV) by using +// a sync point callback in TestSecondaryCache::Lookup. We then control the +// lookup result by setting the ResultMap. +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { + LRUCacheOptions opts(1 << 20, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 8; + std::vector keys; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(4000); + keys.emplace_back(p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB does the paranoid check for the new + // SST file. This will try to lookup all data blocks in the secondary + // cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 8u); + + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 8u); + cache->SetCapacity(1 << 20); + + std::vector cache_keys; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TestSecondaryCache::Lookup", [&cache_keys](void* key) -> void { + cache_keys.emplace_back(*(static_cast(key))); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < N; ++i) { + std::string v = Get(Key(i)); + ASSERT_EQ(4000, v.size()); + ASSERT_EQ(v, keys[i]); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(secondary_cache->num_lookups(), 16u); + cache->SetCapacity(0); + cache->SetCapacity(1 << 20); + + ASSERT_EQ(Get(Key(2)), keys[2]); + ASSERT_EQ(Get(Key(7)), keys[7]); + secondary_cache->SetResultMap( + {{cache_keys[3], TestSecondaryCache::ResultType::DEFER}, + {cache_keys[4], TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {cache_keys[5], TestSecondaryCache::ResultType::FAIL}}); + + std::vector mget_keys( + {Key(0), Key(1), Key(2), Key(3), Key(4), Key(5), Key(6), Key(7)}); + std::vector values(mget_keys.size()); + std::vector s(keys.size()); + std::vector key_slices; + for (const std::string& key : mget_keys) { + key_slices.emplace_back(key); + } + uint32_t num_lookups = secondary_cache->num_lookups(); + dbfull()->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), + key_slices.size(), key_slices.data(), values.data(), + s.data(), false); + ASSERT_EQ(secondary_cache->num_lookups(), num_lookups + 5); + for (int i = 0; i < N; ++i) { + ASSERT_OK(s[i]); + ASSERT_EQ(values[i].ToString(), keys[i]); + values[i].Reset(); + } + Destroy(options); +} + +class LRUCacheWithStat : public LRUCache { + public: + LRUCacheWithStat( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + double _high_pri_pool_ratio, + std::shared_ptr _memory_allocator = nullptr, + bool _use_adaptive_mutex = 
kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDontChargeCacheMetadata, + const std::shared_ptr& _secondary_cache = nullptr) + : LRUCache(_capacity, _num_shard_bits, _strict_capacity_limit, + _high_pri_pool_ratio, _memory_allocator, _use_adaptive_mutex, + _metadata_charge_policy, _secondary_cache) { + insert_count_ = 0; + lookup_count_ = 0; + } + ~LRUCacheWithStat() {} + + Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, + Handle** handle, Priority priority) override { + insert_count_++; + return LRUCache::Insert(key, value, charge, deleter, handle, priority); + } + Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, + size_t chargge, Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + insert_count_++; + return LRUCache::Insert(key, value, helper, chargge, handle, priority); + } + Handle* Lookup(const Slice& key, Statistics* stats) override { + lookup_count_++; + return LRUCache::Lookup(key, stats); + } + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, bool wait, + Statistics* stats = nullptr) override { + lookup_count_++; + return LRUCache::Lookup(key, helper, create_cb, priority, wait, stats); + } + + uint32_t GetInsertCount() { return insert_count_; } + uint32_t GetLookupcount() { return lookup_count_; } + void ResetCount() { + insert_count_ = 0; + lookup_count_ = 0; + } + + private: + uint32_t insert_count_; + uint32_t lookup_count_; +}; + +#ifndef ROCKSDB_LITE + +TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { + LRUCacheOptions cache_opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + 
cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache(tmp_cache); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + DestroyAndReopen(options); + fault_fs_->SetFailGetUniqueId(true); + + Random rnd(301); + const int N = 256; + std::vector value; + char buf[1000]; + memset(buf, 'a', 1000); + value.resize(N); + for (int i = 0; i < N; i++) { + // std::string p_v = rnd.RandomString(1000); + std::string p_v(buf, 1000); + value[i] = p_v; + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + Compact("a", "z"); + + // do th eread for all the key value pairs, so all the blocks should be in + // cache + uint32_t start_insert = tmp_cache->GetInsertCount(); + uint32_t start_lookup = tmp_cache->GetLookupcount(); + std::string v; + for (int i = 0; i < N; i++) { + v = Get(Key(i)); + ASSERT_EQ(v, value[i]); + } + uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + ASSERT_EQ(63, + static_cast(dump_insert)); // the insert in the block cache + ASSERT_EQ(256, + static_cast(dump_lookup)); // the lookup in the block cache + // We have enough blocks in the block cache + + CacheDumpOptions cd_options; + cd_options.clock = fault_env_->GetSystemClock().get(); + std::string dump_path = db_->GetName() + "/cache_dump"; + std::unique_ptr dump_writer; + Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path, + &dump_writer); + ASSERT_OK(s); + std::unique_ptr cache_dumper; + s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer), + &cache_dumper); + ASSERT_OK(s); + std::vector db_list; + db_list.push_back(db_); + s = cache_dumper->SetDumpFilter(db_list); + ASSERT_OK(s); + s = 
cache_dumper->DumpCacheEntriesToWriter(); + ASSERT_OK(s); + cache_dumper.reset(); + + // we have a new cache it is empty, then, before we do the Get, we do the + // dumpload + std::shared_ptr secondary_cache = + std::make_shared(2048 * 1024); + cache_opts.secondary_cache = secondary_cache; + tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache_new(tmp_cache); + table_options.block_cache = cache_new; + table_options.block_size = 4 * 1024; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + + // start to load the data to new block cache + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + std::unique_ptr dump_reader; + s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path, + &dump_reader); + ASSERT_OK(s); + std::unique_ptr cache_loader; + s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache, + std::move(dump_reader), &cache_loader); + ASSERT_OK(s); + s = cache_loader->RestoreCacheEntriesToSecondaryCache(); + ASSERT_OK(s); + uint32_t load_insert = secondary_cache->num_inserts() - start_insert; + uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup; + // check the number we inserted + ASSERT_EQ(64, static_cast(load_insert)); + ASSERT_EQ(0, static_cast(load_lookup)); + ASSERT_OK(s); + + Reopen(options); + + // After load, we do the Get again + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + uint32_t cache_insert = tmp_cache->GetInsertCount(); + uint32_t cache_lookup = tmp_cache->GetLookupcount(); + for (int i = 0; i < N; i++) { + v = Get(Key(i)); + ASSERT_EQ(v, value[i]); + } + uint32_t 
final_insert = secondary_cache->num_inserts() - start_insert; + uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup; + // no insert to secondary cache + ASSERT_EQ(0, static_cast(final_insert)); + // lookup the secondary to get all blocks + ASSERT_EQ(64, static_cast(final_lookup)); + uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + // Check the new block cache insert and lookup, should be no insert since all + // blocks are from the secondary cache. + ASSERT_EQ(0, static_cast(block_insert)); + ASSERT_EQ(256, static_cast(block_lookup)); + + fault_fs_->SetFailGetUniqueId(false); + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { + LRUCacheOptions cache_opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache(tmp_cache); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + std::string dbname1 = test::PerThreadDBPath("db_1"); + ASSERT_OK(DestroyDB(dbname1, options)); + DB* db1 = nullptr; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + std::string dbname2 = test::PerThreadDBPath("db_2"); + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + ASSERT_OK(DB::Open(options, dbname2, &db2)); + fault_fs_->SetFailGetUniqueId(true); + + // write the KVs to db1 + Random rnd(301); + const int N = 256; + std::vector value1; + 
WriteOptions wo; + char buf[1000]; + memset(buf, 'a', 1000); + value1.resize(N); + for (int i = 0; i < N; i++) { + std::string p_v(buf, 1000); + value1[i] = p_v; + ASSERT_OK(db1->Put(wo, Key(i), p_v)); + } + ASSERT_OK(db1->Flush(FlushOptions())); + Slice bg("a"); + Slice ed("b"); + ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed)); + + // Write the KVs to DB2 + std::vector value2; + memset(buf, 'b', 1000); + value2.resize(N); + for (int i = 0; i < N; i++) { + std::string p_v(buf, 1000); + value2[i] = p_v; + ASSERT_OK(db2->Put(wo, Key(i), p_v)); + } + ASSERT_OK(db2->Flush(FlushOptions())); + ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed)); + + // do th eread for all the key value pairs, so all the blocks should be in + // cache + uint32_t start_insert = tmp_cache->GetInsertCount(); + uint32_t start_lookup = tmp_cache->GetLookupcount(); + ReadOptions ro; + std::string v; + for (int i = 0; i < N; i++) { + ASSERT_OK(db1->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value1[i]); + } + for (int i = 0; i < N; i++) { + ASSERT_OK(db2->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value2[i]); + } + uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + ASSERT_EQ(128, + static_cast(dump_insert)); // the insert in the block cache + ASSERT_EQ(512, + static_cast(dump_lookup)); // the lookup in the block cache + // We have enough blocks in the block cache + + CacheDumpOptions cd_options; + cd_options.clock = fault_env_->GetSystemClock().get(); + std::string dump_path = db1->GetName() + "/cache_dump"; + std::unique_ptr dump_writer; + Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path, + &dump_writer); + ASSERT_OK(s); + std::unique_ptr cache_dumper; + s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer), + &cache_dumper); + ASSERT_OK(s); + std::vector db_list; + db_list.push_back(db1); + s = cache_dumper->SetDumpFilter(db_list); + ASSERT_OK(s); + s = 
cache_dumper->DumpCacheEntriesToWriter(); + ASSERT_OK(s); + cache_dumper.reset(); + + // we have a new cache it is empty, then, before we do the Get, we do the + // dumpload + std::shared_ptr secondary_cache = + std::make_shared(2048 * 1024); + cache_opts.secondary_cache = secondary_cache; + tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache_new(tmp_cache); + table_options.block_cache = cache_new; + table_options.block_size = 4 * 1024; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + + // Start the cache loading process + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + std::unique_ptr dump_reader; + s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path, + &dump_reader); + ASSERT_OK(s); + std::unique_ptr cache_loader; + s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache, + std::move(dump_reader), &cache_loader); + ASSERT_OK(s); + s = cache_loader->RestoreCacheEntriesToSecondaryCache(); + ASSERT_OK(s); + uint32_t load_insert = secondary_cache->num_inserts() - start_insert; + uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup; + // check the number we inserted + ASSERT_EQ(64, static_cast(load_insert)); + ASSERT_EQ(0, static_cast(load_lookup)); + ASSERT_OK(s); + + ASSERT_OK(db1->Close()); + delete db1; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + + // After load, we do the Get again. To validate the cache, we do not allow any + // I/O, so we set the file system to false. 
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + fault_fs_->SetFilesystemActive(false, error_msg); + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + uint32_t cache_insert = tmp_cache->GetInsertCount(); + uint32_t cache_lookup = tmp_cache->GetLookupcount(); + for (int i = 0; i < N; i++) { + ASSERT_OK(db1->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value1[i]); + } + uint32_t final_insert = secondary_cache->num_inserts() - start_insert; + uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup; + // no insert to secondary cache + ASSERT_EQ(0, static_cast(final_insert)); + // lookup the secondary to get all blocks + ASSERT_EQ(64, static_cast(final_lookup)); + uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + // Check the new block cache insert and lookup, should be no insert since all + // blocks are from the secondary cache. + ASSERT_EQ(0, static_cast(block_insert)); + ASSERT_EQ(256, static_cast(block_lookup)); + fault_fs_->SetFailGetUniqueId(false); + fault_fs_->SetFilesystemActive(true); + delete db1; + delete db2; + ASSERT_OK(DestroyDB(dbname1, options)); + ASSERT_OK(DestroyDB(dbname2, options)); +} + +// Test the option not to use the secondary cache in a certain DB. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + options.lowest_used_cache_tier = CacheTier::kVolatileTier; + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i + 70), p_v)); + } + + ASSERT_OK(Flush()); + + // Flush will trigger the paranoid check and read blocks. But only block cache + // will be read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Compact("a", "z"); + + // Compaction will also insert and evict blocks, no operations to the block + // cache. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + + // Check the data in first block. Cache miss, direclty read from SST file. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // Check the second block. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // block cache hit + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(70)); + ASSERT_EQ(1007, v.size()); + + // Check the first block in the second SST file. Cache miss and trigger SST + // file read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(75)); + ASSERT_EQ(1007, v.size()); + + // Check the second block in the second SST file. Cache miss and trigger SST + // file read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Destroy(options); +} + +// We disable the secondary cache in DBOptions at first. Close and reopen the DB +// with new options, which set the lowest_used_cache_tier to +// kNonVolatileBlockTier. So secondary cache will be used. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + options.lowest_used_cache_tier = CacheTier::kVolatileTier; + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i + 70), p_v)); + } + + ASSERT_OK(Flush()); + + // Flush will trigger the paranoid check and read blocks. But only block cache + // will be read. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Compact("a", "z"); + + // Compaction will also insert and evict blocks, no operations to the block + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + + // Check the data in first block. Cache miss, direclty read from SST file. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // Check the second block. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // block cache hit + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + // Change the option to enable secondary cache after we Reopen the DB + options.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; + Reopen(options); + + v = Get(Key(70)); + ASSERT_EQ(1007, v.size()); + + // Enable the secondary cache, trigger lookup of the first block in second SST + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + v = Get(Key(75)); + ASSERT_EQ(1007, v.size()); + + // trigger lookup of the second block in second SST + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + Destroy(options); +} + +// Two DB test. We create 2 DBs sharing the same block cache and secondary +// cache. We diable the secondary cache option for DB2. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + options.paranoid_file_checks = true; + std::string dbname1 = test::PerThreadDBPath("db_t_1"); + ASSERT_OK(DestroyDB(dbname1, options)); + DB* db1 = nullptr; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + std::string dbname2 = test::PerThreadDBPath("db_t_2"); + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + Options options2 = options; + options2.lowest_used_cache_tier = CacheTier::kVolatileTier; + ASSERT_OK(DB::Open(options2, dbname2, &db2)); + fault_fs_->SetFailGetUniqueId(true); + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. 
+ std::string session_id; + ASSERT_OK(db1->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + + WriteOptions wo; + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(db1->Put(wo, Key(i), p_v)); + } + + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + ASSERT_OK(db1->Flush(FlushOptions())); + + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(db2->Put(wo, Key(i), p_v)); + } + + // No change in the secondary cache, since it is disabled in DB2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + ASSERT_OK(db2->Flush(FlushOptions())); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Slice bg("a"); + Slice ed("b"); + ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed)); + ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed)); + + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + ReadOptions ro; + std::string v; + ASSERT_OK(db1->Get(ro, Key(0), &v)); + ASSERT_EQ(1007, v.size()); + + // DB 1 has lookup block 1 and it is miss in block cache, trigger secondary + // cache lookup + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + ASSERT_OK(db1->Get(ro, Key(5), &v)); + ASSERT_EQ(1007, v.size()); + + // DB 1 lookup the second block and it is miss in block cache, trigger + // secondary cache lookup + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + ASSERT_OK(db2->Get(ro, Key(0), &v)); + ASSERT_EQ(1007, v.size()); + + // For db2, it is not enabled with secondary cache, so no search in the + // secondary cache + 
ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + ASSERT_OK(db2->Get(ro, Key(5), &v)); + ASSERT_EQ(1007, v.size()); + + // For db2, it is not enabled with secondary cache, so no search in the + // secondary cache + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + fault_fs_->SetFailGetUniqueId(false); + fault_fs_->SetFilesystemActive(true); + delete db1; + delete db2; + ASSERT_OK(DestroyDB(dbname1, options)); + ASSERT_OK(DestroyDB(dbname2, options)); +} + +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,53 +9,96 @@ #include "cache/sharded_cache.h" -#include +#include +#include +#include +#include "util/hash.h" +#include "util/math.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +namespace { + +inline uint32_t HashSlice(const Slice& s) { + return Lower32of64(GetSliceNPHash64(s)); +} + +} // namespace + ShardedCache::ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr allocator) : Cache(std::move(allocator)), - num_shard_bits_(num_shard_bits), + shard_mask_((uint32_t{1} << num_shard_bits) - 1), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), last_id_(1) {} void ShardedCache::SetCapacity(size_t capacity) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetCapacity(per_shard); } 
capacity_ = capacity; } void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit); } strict_capacity_limit_ = strict_capacity_limit; } Status ShardedCache::Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) { + DeleterFn deleter, Handle** handle, + Priority priority) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash)) ->Insert(key, hash, value, charge, deleter, handle, priority); } +Status ShardedCache::Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle, Priority priority) { + uint32_t hash = HashSlice(key); + if (!helper) { + return Status::InvalidArgument(); + } + return GetShard(Shard(hash)) + ->Insert(key, hash, value, helper, charge, handle, priority); +} + Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash))->Lookup(key, hash); } +Cache::Handle* ShardedCache::Lookup(const Slice& key, + const CacheItemHelper* helper, + const CreateCallback& create_cb, + Priority priority, bool wait, + Statistics* stats) { + uint32_t hash = HashSlice(key); + return GetShard(Shard(hash)) + ->Lookup(key, hash, helper, create_cb, priority, wait, stats); +} + +bool ShardedCache::IsReady(Handle* handle) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->IsReady(handle); +} + +void ShardedCache::Wait(Handle* handle) { + uint32_t hash = GetHash(handle); + GetShard(Shard(hash))->Wait(handle); +} + bool ShardedCache::Ref(Handle* handle) { uint32_t hash = GetHash(handle); return GetShard(Shard(hash))->Ref(handle); @@ -66,6 +109,11 @@ return 
GetShard(Shard(hash))->Release(handle, force_erase); } +bool ShardedCache::Release(Handle* handle, bool useful, bool force_erase) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->Release(handle, useful, force_erase); +} + void ShardedCache::Erase(const Slice& key) { uint32_t hash = HashSlice(key); GetShard(Shard(hash))->Erase(key, hash); @@ -87,9 +135,9 @@ size_t ShardedCache::GetUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetUsage(); } return usage; @@ -101,25 +149,42 @@ size_t ShardedCache::GetPinnedUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetPinnedUsage(); } return usage; } -void ShardedCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { - GetShard(s)->ApplyToAllCacheEntries(callback, thread_safe); - } +void ShardedCache::ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) { + uint32_t num_shards = GetNumShards(); + // Iterate over part of each shard, rotating between shards, to + // minimize impact on latency of concurrent operations. 
+ std::unique_ptr states(new uint32_t[num_shards]{}); + + uint32_t aepl_in_32 = static_cast( + std::min(size_t{UINT32_MAX}, opts.average_entries_per_lock)); + aepl_in_32 = std::min(aepl_in_32, uint32_t{1}); + + bool remaining_work; + do { + remaining_work = false; + for (uint32_t s = 0; s < num_shards; s++) { + if (states[s] != UINT32_MAX) { + GetShard(s)->ApplyToSomeEntries(callback, aepl_in_32, &states[s]); + remaining_work |= states[s] != UINT32_MAX; + } + } + } while (remaining_work); } void ShardedCache::EraseUnRefEntries() { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->EraseUnRefEntries(); } } @@ -134,7 +199,8 @@ snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n", capacity_); ret.append(buffer); - snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", num_shard_bits_); + snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", + GetNumShardBits()); ret.append(buffer); snprintf(buffer, kBufferSize, " strict_capacity_limit : %d\n", strict_capacity_limit_); @@ -159,4 +225,8 @@ return num_shard_bits; } +int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); } + +uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; } + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,7 +14,6 @@ #include "port/port.h" #include "rocksdb/cache.h" -#include "util/hash.h" namespace ROCKSDB_NAMESPACE { @@ -24,20 +23,38 @@ CacheShard() = default; virtual ~CacheShard() = default; + using DeleterFn = Cache::DeleterFn; virtual Status Insert(const Slice& key, uint32_t hash, 
void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, DeleterFn deleter, + Cache::Handle** handle, Cache::Priority priority) = 0; + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, Cache::Handle** handle, Cache::Priority priority) = 0; virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, + Statistics* stats) = 0; + virtual bool Release(Cache::Handle* handle, bool useful, + bool force_erase) = 0; + virtual bool IsReady(Cache::Handle* handle) = 0; + virtual void Wait(Cache::Handle* handle) = 0; virtual bool Ref(Cache::Handle* handle) = 0; - virtual bool Release(Cache::Handle* handle, bool force_erase = false) = 0; + virtual bool Release(Cache::Handle* handle, bool force_erase) = 0; virtual void Erase(const Slice& key, uint32_t hash) = 0; virtual void SetCapacity(size_t capacity) = 0; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; virtual size_t GetUsage() const = 0; virtual size_t GetPinnedUsage() const = 0; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + // Handles iterating over roughly `average_entries_per_lock` entries, using + // `state` to somehow record where it last ended up. Caller initially uses + // *state == 0 and implementation sets *state = UINT32_MAX to indicate + // completion. 
+ virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } void set_metadata_charge_policy( @@ -57,22 +74,29 @@ ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr memory_allocator = nullptr); virtual ~ShardedCache() = default; - virtual const char* Name() const override = 0; - virtual CacheShard* GetShard(int shard) = 0; - virtual const CacheShard* GetShard(int shard) const = 0; - virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const override = 0; + virtual CacheShard* GetShard(uint32_t shard) = 0; + virtual const CacheShard* GetShard(uint32_t shard) const = 0; virtual uint32_t GetHash(Handle* handle) const = 0; - virtual void DisownData() override = 0; virtual void SetCapacity(size_t capacity) override; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) override; + DeleterFn deleter, Handle** handle, + Priority priority) override; + virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t chargge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) override; virtual Handle* Lookup(const Slice& key, Statistics* stats) override; + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, + bool wait, Statistics* stats = nullptr) override; + virtual bool Release(Handle* handle, bool useful, + bool force_erase = false) override; + virtual bool IsReady(Handle* handle) override; + virtual void Wait(Handle* handle) override; virtual bool Ref(Handle* handle) override; virtual bool Release(Handle* handle, bool force_erase = 
false) override; virtual void Erase(const Slice& key) override; @@ -82,24 +106,21 @@ virtual size_t GetUsage() const override; virtual size_t GetUsage(Handle* handle) const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override; virtual void EraseUnRefEntries() override; virtual std::string GetPrintableOptions() const override; - int GetNumShardBits() const { return num_shard_bits_; } - - private: - static inline uint32_t HashSlice(const Slice& s) { - return static_cast(GetSliceNPHash64(s)); - } + int GetNumShardBits() const; + uint32_t GetNumShards() const; - uint32_t Shard(uint32_t hash) { - // Note, hash >> 32 yields hash in gcc, not the zero we expect! - return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0; - } + protected: + inline uint32_t Shard(uint32_t hash) { return hash & shard_mask_; } - int num_shard_bits_; + private: + const uint32_t shard_mask_; mutable port::Mutex capacity_mutex_; size_t capacity_; bool strict_capacity_limit_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in 2025-05-19 16:14:27.000000000 +0000 @@ -1,3 +1,54 @@ @PACKAGE_INIT@ + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") + +include(CMakeFindDependencyMacro) + +set(GFLAGS_USE_TARGET_NAMESPACE @GFLAGS_USE_TARGET_NAMESPACE@) + +if(@WITH_JEMALLOC@) + find_dependency(JeMalloc) +endif() + +if(@WITH_GFLAGS@) + find_dependency(gflags CONFIG) + if(NOT gflags_FOUND) + find_dependency(gflags) + endif() +endif() + +if(@WITH_SNAPPY@) + find_dependency(Snappy 
CONFIG) + if(NOT Snappy_FOUND) + find_dependency(Snappy) + endif() +endif() + +if(@WITH_ZLIB@) + find_dependency(ZLIB) +endif() + +if(@WITH_BZ2@) + find_dependency(BZip2) +endif() + +if(@WITH_LZ4@) + find_dependency(lz4) +endif() + +if(@WITH_ZSTD@) + find_dependency(zstd) +endif() + +if(@WITH_NUMA@) + find_dependency(NUMA) +endif() + +if(@WITH_TBB@) + find_dependency(TBB) +endif() + +find_dependency(Threads) + include("${CMAKE_CURRENT_LIST_DIR}/RocksDBTargets.cmake") check_required_components(RocksDB) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,7 @@ +macro(get_cxx_std_flags FLAGS_VARIABLE) + if( CMAKE_CXX_STANDARD_REQUIRED ) + set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION}) + else() + set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION}) + endif() +endmacro() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,29 @@ +# - Find Snappy +# Find the snappy compression library and includes +# +# Snappy_INCLUDE_DIRS - where to find snappy.h, etc. +# Snappy_LIBRARIES - List of libraries when using snappy. +# Snappy_FOUND - True if snappy found. 
+ +find_path(Snappy_INCLUDE_DIRS + NAMES snappy.h + HINTS ${snappy_ROOT_DIR}/include) + +find_library(Snappy_LIBRARIES + NAMES snappy + HINTS ${snappy_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Snappy DEFAULT_MSG Snappy_LIBRARIES Snappy_INCLUDE_DIRS) + +mark_as_advanced( + Snappy_LIBRARIES + Snappy_INCLUDE_DIRS) + +if(Snappy_FOUND AND NOT (TARGET Snappy::snappy)) + add_library (Snappy::snappy UNKNOWN IMPORTED) + set_target_properties(Snappy::snappy + PROPERTIES + IMPORTED_LOCATION ${Snappy_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${Snappy_INCLUDE_DIRS}) +endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -1,8 +1,8 @@ # - Find gflags library # Find the gflags includes and library # -# gflags_INCLUDE_DIR - where to find gflags.h. -# gflags_LIBRARIES - List of libraries when using gflags. +# GFLAGS_INCLUDE_DIR - where to find gflags.h. +# GFLAGS_LIBRARIES - List of libraries when using gflags. # gflags_FOUND - True if gflags found. find_path(GFLAGS_INCLUDE_DIR diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake 1970-01-01 00:00:00.000000000 +0000 @@ -1,29 +0,0 @@ -# - Find Snappy -# Find the snappy compression library and includes -# -# snappy_INCLUDE_DIRS - where to find snappy.h, etc. -# snappy_LIBRARIES - List of libraries when using snappy. -# snappy_FOUND - True if snappy found. 
- -find_path(snappy_INCLUDE_DIRS - NAMES snappy.h - HINTS ${snappy_ROOT_DIR}/include) - -find_library(snappy_LIBRARIES - NAMES snappy - HINTS ${snappy_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(snappy DEFAULT_MSG snappy_LIBRARIES snappy_INCLUDE_DIRS) - -mark_as_advanced( - snappy_LIBRARIES - snappy_INCLUDE_DIRS) - -if(snappy_FOUND AND NOT (TARGET snappy::snappy)) - add_library (snappy::snappy UNKNOWN IMPORTED) - set_target_properties(snappy::snappy - PROPERTIES - IMPORTED_LOCATION ${snappy_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${snappy_INCLUDE_DIRS}) -endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,26 @@ +# - Find liburing +# +# uring_INCLUDE_DIR - Where to find liburing.h +# uring_LIBRARIES - List of libraries when using uring. +# uring_FOUND - True if uring found. 
+ +find_path(uring_INCLUDE_DIR + NAMES liburing.h) +find_library(uring_LIBRARIES + NAMES liburing.a liburing) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(uring + DEFAULT_MSG uring_LIBRARIES uring_INCLUDE_DIR) + +mark_as_advanced( + uring_INCLUDE_DIR + uring_LIBRARIES) + +if(uring_FOUND AND NOT TARGET uring::uring) + add_library(uring::uring UNKNOWN IMPORTED) + set_target_properties(uring::uring PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${uring_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${uring_LIBRARIES}") +endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/coverage_test.sh mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/coverage_test.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/coverage_test.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/coverage_test.sh 2025-05-19 16:14:27.000000000 +0000 @@ -12,21 +12,24 @@ ROOT=".." # Fetch right version of gcov if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then - source $ROOT/build_tools/fbcode_config.sh + source $ROOT/build_tools/fbcode_config_platform007.sh GCOV=$GCC_BASE/bin/gcov else GCOV=$(which gcov) fi +echo -e "Using $GCOV" COVERAGE_DIR="$PWD/COVERAGE_REPORT" mkdir -p $COVERAGE_DIR # Find all gcno files to generate the coverage report +PYTHON=${1:-`which python3`} +echo -e "Using $PYTHON" GCNO_FILES=`find $ROOT -name "*.gcno"` $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | # Parse the raw gcov report to more human readable form. - python $ROOT/coverage/parse_gcov_output.py | + $PYTHON $ROOT/coverage/parse_gcov_output.py | # Write the output to both stdout and report file. 
tee $COVERAGE_DIR/coverage_report_all.txt && echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n" @@ -41,7 +44,7 @@ echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | - python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES | + $PYTHON $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES | tee -a $RECENT_REPORT && echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py 2025-05-19 16:14:27.000000000 +0000 @@ -1,10 +1,12 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +from __future__ import print_function + +import optparse import re import sys -from optparse import OptionParser - # the gcov report follows certain pattern. Each file will have two lines # of report, from which we can extract the file name, total lines and coverage # percentage. @@ -48,7 +50,7 @@ def get_option_parser(): usage = "Parse the gcov output and generate more human-readable code " +\ "coverage report." 
- parser = OptionParser(usage) + parser = optparse.OptionParser(usage) parser.add_option( "--interested-files", "-i", @@ -73,8 +75,8 @@ header_template = \ "%" + str(max_file_name_length) + "s\t%s\t%s" separator = "-" * (max_file_name_length + 10 + 20) - print header_template % ("Filename", "Coverage", "Lines") # noqa: E999 T25377293 Grandfathered in - print separator + print(header_template % ("Filename", "Coverage", "Lines")) # noqa: E999 T25377293 Grandfathered in + print(separator) # -- Print body # template for printing coverage report for each file. @@ -82,12 +84,12 @@ for fname, coverage_info in per_file_coverage.items(): coverage, lines = coverage_info - print record_template % (fname, coverage, lines) + print(record_template % (fname, coverage, lines)) # -- Print footer if total_coverage: - print separator - print record_template % ("Total", total_coverage[0], total_coverage[1]) + print(separator) + print(record_template % ("Total", total_coverage[0], total_coverage[1])) def report_coverage(): parser = get_option_parser() @@ -111,7 +113,7 @@ total_coverage = None if not len(per_file_coverage): - print >> sys.stderr, "Cannot find coverage info for the given files." 
+ print("Cannot find coverage info for the given files.", file=sys.stderr) return display_file_coverage(per_file_coverage, total_coverage) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,21 +30,20 @@ return db_iter_->GetProperty(prop_name, prop); } -void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob, - bool allow_refresh) { +void ArenaWrappedDBIter::Init( + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, - true, max_sequential_skip_in_iteration, - read_callback, db_impl, cfd, allow_blob); + db_iter_ = + new (mem) DBIter(env, read_options, ioptions, mutable_cf_options, + ioptions.user_comparator, /* iter */ nullptr, version, + sequence, true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, expose_blob_index); sv_number_ = version_number; + read_options_ = read_options; allow_refresh_ = allow_refresh; } @@ -56,48 +55,74 @@ // TODO(yiwu): For 
last_seq_same_as_publish_seq_==false, this is not the // correct behavior. Will be corrected automatically when we take a snapshot // here for the case of WritePreparedTxnDB. - SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); - if (sv_number_ != cur_sv_number) { - Env* env = db_iter_->env(); - db_iter_->~DBIter(); - arena_.~Arena(); - new (&arena_) Arena(); - - SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_); - if (read_callback_) { - read_callback_->Refresh(latest_seq); + TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1"); + TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2"); + while (true) { + if (sv_number_ != cur_sv_number) { + Env* env = db_iter_->env(); + db_iter_->~DBIter(); + arena_.~Arena(); + new (&arena_) Arena(); + + SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_); + SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + if (read_callback_) { + read_callback_->Refresh(latest_seq); + } + Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, + sv->current, latest_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_, + allow_refresh_); + + InternalIterator* internal_iter = db_impl_->NewInternalIterator( + read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), + latest_seq, /* allow_unprepared_value */ true); + SetIterUnderDBIter(internal_iter); + break; + } else { + SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + // Refresh range-tombstones in MemTable + if (!read_options_.ignore_range_deletions) { + SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_); + ReadRangeDelAggregator* range_del_agg = + db_iter_->GetRangeDelAggregator(); + std::unique_ptr range_del_iter; + range_del_iter.reset( + sv->mem->NewRangeTombstoneIterator(read_options_, latest_seq)); + range_del_agg->AddTombstones(std::move(range_del_iter)); + 
cfd_->ReturnThreadLocalSuperVersion(sv); + } + // Refresh latest sequence number + db_iter_->set_sequence(latest_seq); + db_iter_->set_valid(false); + // Check again if the latest super version number is changed + uint64_t latest_sv_number = cfd_->GetSuperVersionNumber(); + if (latest_sv_number != cur_sv_number) { + // If the super version number is changed after refreshing, + // fallback to Re-Init the InternalIterator + cur_sv_number = latest_sv_number; + continue; + } + break; } - Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, - latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, - allow_refresh_); - - InternalIterator* internal_iter = db_impl_->NewInternalIterator( - read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), - latest_seq); - SetIterUnderDBIter(internal_iter); - } else { - db_iter_->set_sequence(latest_seq); - db_iter_->set_valid(false); } return Status::OK(); } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh) { + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, + iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence, 
max_sequential_skip_in_iterations, version_number, read_callback, - db_impl, cfd, allow_blob, allow_refresh); + db_impl, cfd, expose_blob_index, allow_refresh); if (db_impl != nullptr && cfd != nullptr && allow_refresh) { - iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, - allow_blob); + iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index); } return iter; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include "db/db_impl/db_impl.h" #include "db/db_iter.h" -#include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" #include "options/cf_options.h" @@ -23,6 +22,7 @@ namespace ROCKSDB_NAMESPACE { class Arena; +class Version; // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed to be allocated. This class is used as an entry point of @@ -33,7 +33,13 @@ // the same as the inner DBIter. class ArenaWrappedDBIter : public Iterator { public: - virtual ~ArenaWrappedDBIter() { db_iter_->~DBIter(); } + ~ArenaWrappedDBIter() override { + if (db_iter_ != nullptr) { + db_iter_->~DBIter(); + } else { + assert(false); + } + } // Get the arena to be used to allocate memory for DBIter to be wrapped, // as well as child iterators in it. @@ -41,6 +47,7 @@ virtual ReadRangeDelAggregator* GetRangeDelAggregator() { return db_iter_->GetRangeDelAggregator(); } + const ReadOptions& GetReadOptions() { return read_options_; } // Set the internal iterator wrapped inside the DB Iterator. Usually it is // a merging iterator. 
@@ -51,6 +58,8 @@ bool Valid() const override { return db_iter_->Valid(); } void SeekToFirst() override { db_iter_->SeekToFirst(); } void SeekToLast() override { db_iter_->SeekToLast(); } + // 'target' does not contain timestamp, even if user timestamp feature is + // enabled. void Seek(const Slice& target) override { db_iter_->Seek(target); } void SeekForPrev(const Slice& target) override { db_iter_->SeekForPrev(target); @@ -60,6 +69,7 @@ Slice key() const override { return db_iter_->key(); } Slice value() const override { return db_iter_->value(); } Status status() const override { return db_iter_->status(); } + Slice timestamp() const override { return db_iter_->timestamp(); } bool IsBlob() const { return db_iter_->IsBlob(); } Status GetProperty(std::string prop_name, std::string* prop) override; @@ -67,34 +77,32 @@ Status Refresh() override; void Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh); + bool expose_blob_index, bool allow_refresh); // Store some parameters so we can refresh the iterator at a later point // with these same params - void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, - ColumnFamilyData* cfd, ReadCallback* read_callback, - bool allow_blob) { - read_options_ = read_options; + void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd, + ReadCallback* read_callback, bool expose_blob_index) { db_impl_ = db_impl; cfd_ = cfd; read_callback_ = read_callback; - allow_blob_ = allow_blob; + expose_blob_index_ = expose_blob_index; } private: - DBIter* db_iter_; + DBIter* db_iter_ = nullptr; Arena arena_; uint64_t sv_number_; 
ColumnFamilyData* cfd_ = nullptr; DBImpl* db_impl_ = nullptr; ReadOptions read_options_; ReadCallback* read_callback_; - bool allow_blob_ = false; + bool expose_blob_index_ = false; bool allow_refresh_ = true; }; @@ -102,11 +110,10 @@ // `db_impl` and `cfd` are used for reneweal. If left null, renewal will not // be supported. extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false, - bool allow_refresh = true); + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false, bool allow_refresh = true); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_constants.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_constants.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_constants.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_constants.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint64_t kInvalidBlobFileNumber = 0; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "db/blob/blob_garbage_meter.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that passes each key-value encountered to +// BlobGarbageMeter as inflow in order to measure the total number and size of +// blobs in the compaction input on a per-blob file basis. 
+class BlobCountingIterator : public InternalIterator { + public: + BlobCountingIterator(InternalIterator* iter, + BlobGarbageMeter* blob_garbage_meter) + : iter_(iter), blob_garbage_meter_(blob_garbage_meter) { + assert(iter_); + assert(blob_garbage_meter_); + + UpdateAndCountBlobIfNeeded(); + } + + bool Valid() const override { return iter_->Valid() && status_.ok(); } + + void SeekToFirst() override { + iter_->SeekToFirst(); + UpdateAndCountBlobIfNeeded(); + } + + void SeekToLast() override { + iter_->SeekToLast(); + UpdateAndCountBlobIfNeeded(); + } + + void Seek(const Slice& target) override { + iter_->Seek(target); + UpdateAndCountBlobIfNeeded(); + } + + void SeekForPrev(const Slice& target) override { + iter_->SeekForPrev(target); + UpdateAndCountBlobIfNeeded(); + } + + void Next() override { + assert(Valid()); + + iter_->Next(); + UpdateAndCountBlobIfNeeded(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + + const bool res = iter_->NextAndGetResult(result); + UpdateAndCountBlobIfNeeded(); + return res; + } + + void Prev() override { + assert(Valid()); + + iter_->Prev(); + UpdateAndCountBlobIfNeeded(); + } + + Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + Slice user_key() const override { + assert(Valid()); + return iter_->user_key(); + } + + Slice value() const override { + assert(Valid()); + return iter_->value(); + } + + Status status() const override { return status_; } + + bool PrepareValue() override { + assert(Valid()); + return iter_->PrepareValue(); + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + return iter_->UpperBoundCheckResult(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(Valid()); + return 
iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(Valid()); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateAndCountBlobIfNeeded() { + assert(!iter_->Valid() || iter_->status().ok()); + + if (!iter_->Valid()) { + status_ = iter_->status(); + return; + } + + TEST_SYNC_POINT( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow"); + + status_ = blob_garbage_meter_->ProcessInFlow(key(), value()); + } + + InternalIterator* iter_; + BlobGarbageMeter* blob_garbage_meter_; + Status status_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,326 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_counting_iterator.h" + +#include +#include + +#include "db/blob/blob_garbage_meter.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter, + uint64_t blob_file_number, uint64_t count, uint64_t bytes) { + const auto& flows = blob_garbage_meter.flows(); + + const auto it = flows.find(blob_file_number); + if (it == flows.end()) { + ASSERT_EQ(count, 0); + ASSERT_EQ(bytes, 0); + return; + } + + const auto& in = it->second.GetInFlow(); + + ASSERT_EQ(in.GetCount(), count); + ASSERT_EQ(in.GetBytes(), bytes); +} + +TEST(BlobCountingIteratorTest, CountBlobs) { + // Note: the input consists of three key-values: two are blob references to + // different blob files, while the third one is a plain value. + constexpr char user_key0[] = "key0"; + constexpr char user_key1[] = "key1"; + constexpr char user_key2[] = "key2"; + + const std::vector keys{ + test::KeyStr(user_key0, 1, kTypeBlobIndex), + test::KeyStr(user_key1, 2, kTypeBlobIndex), + test::KeyStr(user_key2, 3, kTypeValue)}; + + constexpr uint64_t first_blob_file_number = 4; + constexpr uint64_t first_offset = 1000; + constexpr uint64_t first_size = 2000; + + std::string first_blob_index; + BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset, + first_size, kNoCompression); + + constexpr uint64_t second_blob_file_number = 6; + constexpr uint64_t second_offset = 2000; + constexpr uint64_t second_size = 4000; + + std::string second_blob_index; + BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number, + second_offset, second_size, kNoCompression); + + const std::vector values{first_blob_index, second_blob_index, + "raw_value"}; + + assert(keys.size() == values.size()); + + VectorIterator input(keys, values); + BlobGarbageMeter 
blob_garbage_meter; + + BlobCountingIterator blob_counter(&input, &blob_garbage_meter); + + constexpr uint64_t first_expected_bytes = + first_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1); + constexpr uint64_t second_expected_bytes = + second_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1); + + // Call SeekToFirst and iterate forward + blob_counter.SeekToFirst(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0); + + blob_counter.Next(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + blob_counter.Next(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + blob_counter.Next(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + // Do it again using NextAndGetResult + blob_counter.SeekToFirst(); + ASSERT_TRUE(blob_counter.Valid()); + 
ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + { + IterateResult result; + ASSERT_TRUE(blob_counter.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + { + IterateResult result; + ASSERT_TRUE(blob_counter.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + { + IterateResult result; + ASSERT_FALSE(blob_counter.NextAndGetResult(&result)); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + // Call SeekToLast and iterate backward + blob_counter.SeekToLast(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), 
values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 3, + 3 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 3, + 3 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + // Call Seek for all keys (plus one that's greater than all of them) + blob_counter.Seek(keys[0]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Seek(keys[1]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); 
+ ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.Seek(keys[2]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.Seek("zzz"); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + // Call SeekForPrev for all keys (plus one that's less than all of them) + blob_counter.SeekForPrev("aaa"); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[0]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[1]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + 
CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 5, + 5 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[2]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 5, + 5 * second_expected_bytes); +} + +TEST(BlobCountingIteratorTest, CorruptBlobIndex) { + const std::vector keys{ + test::KeyStr("user_key", 1, kTypeBlobIndex)}; + const std::vector values{"i_am_not_a_blob_index"}; + + assert(keys.size() == values.size()); + + VectorIterator input(keys, values); + BlobGarbageMeter blob_garbage_meter; + + BlobCountingIterator blob_counter(&input, &blob_garbage_meter); + + blob_counter.SeekToFirst(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_NOK(blob_counter.status()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_fetcher.h" + +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index_slice, + prefetch_buffer, blob_value, bytes_read); +} + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer, + blob_value, bytes_read); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Version; +class Slice; +class FilePrefetchBuffer; +class PinnableSlice; +class BlobIndex; + +// A thin wrapper around the blob retrieval functionality of Version. 
+class BlobFetcher { + public: + BlobFetcher(const Version* version, const ReadOptions& read_options) + : version_(version), read_options_(read_options) {} + + Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + private: + const Version* version_; + ReadOptions read_options_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_addition.h" + +#include +#include + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. 
+enum BlobFileAddition::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileAddition::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, total_blob_count_); + PutVarint64(output, total_blob_bytes_); + PutLengthPrefixedSlice(output, checksum_method_); + PutLengthPrefixedSlice(output, checksum_value_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. + + TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileAddition::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileAddition"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &total_blob_count_)) { + return Status::Corruption(class_name, "Error decoding total blob count"); + } + + if (!GetVarint64(input, &total_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding total blob bytes"); + } + + Slice checksum_method; + if (!GetLengthPrefixedSlice(input, &checksum_method)) { + return Status::Corruption(class_name, "Error decoding checksum method"); + } + checksum_method_ = checksum_method.ToString(); + + Slice checksum_value; + if (!GetLengthPrefixedSlice(input, &checksum_value)) { + return Status::Corruption(class_name, "Error decoding checksum value"); + } + checksum_value_ = checksum_value.ToString(); + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return 
Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileAddition::DebugString() const { + std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileAddition::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() && + lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() && + lhs.GetChecksumMethod() == rhs.GetChecksumMethod() && + lhs.GetChecksumValue() == rhs.GetChecksumValue(); +} + +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition) { + os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber() + << " total_blob_count: " << blob_file_addition.GetTotalBlobCount() + << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes() + << " checksum_method: " << blob_file_addition.GetChecksumMethod() + << " checksum_value: " + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition) { + jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber() + << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount() + << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes() + << 
"ChecksumMethod" << blob_file_addition.GetChecksumMethod() + << "ChecksumValue" + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileAddition { + public: + BlobFileAddition() = default; + + BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + void EncodeTo(std::string* output) const; + Status 
DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t total_blob_count_ = 0; + uint64_t total_blob_bytes_ = 0; + std::string checksum_method_; + std::string checksum_value_; +}; + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs); +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,210 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_addition.h" + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileAdditionTest : public testing::Test { + public: + static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) { + std::string encoded; + blob_file_addition.EncodeTo(&encoded); + + BlobFileAddition decoded; + Slice input(encoded); + ASSERT_OK(decoded.DecodeFrom(&input)); + + ASSERT_EQ(blob_file_addition, decoded); + } +}; + +TEST_F(BlobFileAdditionTest, Empty) { + BlobFileAddition blob_file_addition; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0); + ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty()); + ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty()); + + TestEncodeDecode(blob_file_addition); +} + +TEST_F(BlobFileAdditionTest, NonEmpty) { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t total_blob_count = 2; + constexpr uint64_t total_blob_bytes = 123456; + const std::string checksum_method("SHA1"); + const std::string checksum_value( + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"); + + BlobFileAddition blob_file_addition(blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, + checksum_value); + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method); + ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value); + + TestEncodeDecode(blob_file_addition); +} + +TEST_F(BlobFileAdditionTest, DecodeErrors) { + std::string str; + Slice slice(str); + + BlobFileAddition 
blob_file_addition; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "blob file number")); + } + + constexpr uint64_t blob_file_number = 123; + PutVarint64(&str, blob_file_number); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "total blob count")); + } + + constexpr uint64_t total_blob_count = 4567; + PutVarint64(&str, total_blob_count); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes")); + } + + constexpr uint64_t total_blob_bytes = 12345678; + PutVarint64(&str, total_blob_bytes); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "checksum method")); + } + + constexpr char checksum_method[] = "SHA1"; + PutLengthPrefixedSlice(&str, checksum_method); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "checksum value")); + } + + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + PutLengthPrefixedSlice(&str, checksum_value); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field tag")); + } + + constexpr uint32_t custom_tag = 2; + PutVarint32(&str, custom_tag); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field value")); + } +} + +TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + 
"BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_compatible_tag = 2; + PutVarint32(output, forward_compatible_tag); + + PutLengthPrefixedSlice(output, "deadbeef"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 678; + constexpr uint64_t total_blob_count = 9999; + constexpr uint64_t total_blob_bytes = 100000000; + const std::string checksum_method("CRC32"); + const std::string checksum_value("\x3d\x87\xff\x57"); + + BlobFileAddition blob_file_addition(blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, + checksum_value); + + TestEncodeDecode(blob_file_addition); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1; + PutVarint32(output, forward_incompatible_tag); + + PutLengthPrefixedSlice(output, "foobar"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 456; + constexpr uint64_t total_blob_count = 100; + constexpr uint64_t total_blob_bytes = 2000000; + const std::string checksum_method("CRC32B"); + const std::string checksum_value("\x6d\xbd\xf2\x3a"); + + BlobFileAddition blob_file_addition(blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, + checksum_value); + + std::string encoded; + blob_file_addition.EncodeTo(&encoded); + + BlobFileAddition decoded_blob_file_addition; + Slice input(encoded); + const Status s = decoded_blob_file_addition.DecodeFrom(&input); + + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible")); + + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,375 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_builder.h" + +#include + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_completion_callback.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/event_helpers.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileBuilder::BlobFileBuilder( + VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + int job_id, uint32_t column_family_id, + const std::string& column_family_name, Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + 
BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, + immutable_options, mutable_cf_options, file_options, + job_id, column_family_id, column_family_name, io_priority, + write_hint, io_tracer, blob_callback, creation_reason, + blob_file_paths, blob_file_additions) {} + +BlobFileBuilder::BlobFileBuilder( + std::function file_number_generator, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + int job_id, uint32_t column_family_id, + const std::string& column_family_name, Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : file_number_generator_(std::move(file_number_generator)), + fs_(fs), + immutable_options_(immutable_options), + min_blob_size_(mutable_cf_options->min_blob_size), + blob_file_size_(mutable_cf_options->blob_file_size), + blob_compression_type_(mutable_cf_options->blob_compression_type), + file_options_(file_options), + job_id_(job_id), + column_family_id_(column_family_id), + column_family_name_(column_family_name), + io_priority_(io_priority), + write_hint_(write_hint), + io_tracer_(io_tracer), + blob_callback_(blob_callback), + creation_reason_(creation_reason), + blob_file_paths_(blob_file_paths), + blob_file_additions_(blob_file_additions), + blob_count_(0), + blob_bytes_(0) { + assert(file_number_generator_); + assert(fs_); + assert(immutable_options_); + assert(file_options_); + assert(blob_file_paths_); + assert(blob_file_paths_->empty()); + assert(blob_file_additions_); + assert(blob_file_additions_->empty()); +} + +BlobFileBuilder::~BlobFileBuilder() = default; + +Status BlobFileBuilder::Add(const Slice& key, const 
Slice& value, + std::string* blob_index) { + assert(blob_index); + assert(blob_index->empty()); + + if (value.size() < min_blob_size_) { + return Status::OK(); + } + + { + const Status s = OpenBlobFileIfNeeded(); + if (!s.ok()) { + return s; + } + } + + Slice blob = value; + std::string compressed_blob; + + { + const Status s = CompressBlobIfNeeded(&blob, &compressed_blob); + if (!s.ok()) { + return s; + } + } + + uint64_t blob_file_number = 0; + uint64_t blob_offset = 0; + + { + const Status s = + WriteBlobToFile(key, blob, &blob_file_number, &blob_offset); + if (!s.ok()) { + return s; + } + } + + { + const Status s = CloseBlobFileIfNeeded(); + if (!s.ok()) { + return s; + } + } + + BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(), + blob_compression_type_); + + return Status::OK(); +} + +Status BlobFileBuilder::Finish() { + if (!IsBlobFileOpen()) { + return Status::OK(); + } + + return CloseBlobFile(); +} + +bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; } + +Status BlobFileBuilder::OpenBlobFileIfNeeded() { + if (IsBlobFileOpen()) { + return Status::OK(); + } + + assert(!blob_count_); + assert(!blob_bytes_); + + assert(file_number_generator_); + const uint64_t blob_file_number = file_number_generator_(); + + assert(immutable_options_); + assert(!immutable_options_->cf_paths.empty()); + std::string blob_file_path = + BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number); + + if (blob_callback_) { + blob_callback_->OnBlobFileCreationStarted( + blob_file_path, column_family_name_, job_id_, creation_reason_); + } + + std::unique_ptr file; + + { + assert(file_options_); + Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s); + + if (!s.ok()) { + return s; + } + } + + // Note: files get added to blob_file_paths_ right after the open, so they + // can be cleaned up upon failure. 
Contrast this with blob_file_additions_, + // which only contains successfully written files. + assert(blob_file_paths_); + blob_file_paths_->emplace_back(std::move(blob_file_path)); + + assert(file); + file->SetIOPriority(io_priority_); + file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; + Statistics* const statistics = immutable_options_->stats; + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_paths_->back(), *file_options_, + immutable_options_->clock, io_tracer_, statistics, + immutable_options_->listeners, + immutable_options_->file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kBlobFile), false)); + + constexpr bool do_flush = false; + + std::unique_ptr blob_log_writer(new BlobLogWriter( + std::move(file_writer), immutable_options_->clock, statistics, + blob_file_number, immutable_options_->use_fsync, do_flush)); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl, + expiration_range); + + { + Status s = blob_log_writer->WriteHeader(header); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s); + + if (!s.ok()) { + return s; + } + } + + writer_ = std::move(blob_log_writer); + + assert(IsBlobFileOpen()); + + return Status::OK(); +} + +Status BlobFileBuilder::CompressBlobIfNeeded( + Slice* blob, std::string* compressed_blob) const { + assert(blob); + assert(compressed_blob); + assert(compressed_blob->empty()); + assert(immutable_options_); + + if (blob_compression_type_ == kNoCompression) { + return Status::OK(); + } + + CompressionOptions opts; + CompressionContext context(blob_compression_type_); + constexpr uint64_t sample_for_compression = 0; + + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + blob_compression_type_, sample_for_compression); + + constexpr uint32_t 
compression_format_version = 2; + + bool success = false; + + { + StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats, + BLOB_DB_COMPRESSION_MICROS); + success = + CompressData(*blob, info, compression_format_version, compressed_blob); + } + + if (!success) { + return Status::Corruption("Error compressing blob"); + } + + *blob = Slice(*compressed_blob); + + return Status::OK(); +} + +Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob, + uint64_t* blob_file_number, + uint64_t* blob_offset) { + assert(IsBlobFileOpen()); + assert(blob_file_number); + assert(blob_offset); + + uint64_t key_offset = 0; + + Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s); + + if (!s.ok()) { + return s; + } + + *blob_file_number = writer_->get_log_number(); + + ++blob_count_; + blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size(); + + return Status::OK(); +} + +Status BlobFileBuilder::CloseBlobFile() { + assert(IsBlobFileOpen()); + + BlobLogFooter footer; + footer.blob_count = blob_count_; + + std::string checksum_method; + std::string checksum_value; + + Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s); + + if (!s.ok()) { + return s; + } + + const uint64_t blob_file_number = writer_->get_log_number(); + + if (blob_callback_) { + s = blob_callback_->OnBlobFileCompleted( + blob_file_paths_->back(), column_family_name_, job_id_, + blob_file_number, creation_reason_, s, checksum_value, checksum_method, + blob_count_, blob_bytes_); + } + + assert(blob_file_additions_); + blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_, + std::move(checksum_method), + std::move(checksum_value)); + + assert(immutable_options_); + ROCKS_LOG_INFO(immutable_options_->logger, + "[%s] [JOB %d] Generated blob file #%" 
PRIu64 ": %" PRIu64 + " total blobs, %" PRIu64 " total bytes", + column_family_name_.c_str(), job_id_, blob_file_number, + blob_count_, blob_bytes_); + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; + + return s; +} + +Status BlobFileBuilder::CloseBlobFileIfNeeded() { + assert(IsBlobFileOpen()); + + const WritableFileWriter* const file_writer = writer_->file(); + assert(file_writer); + + if (file_writer->GetFileSize() < blob_file_size_) { + return Status::OK(); + } + + return CloseBlobFile(); +} + +void BlobFileBuilder::Abandon(const Status& s) { + if (!IsBlobFileOpen()) { + return; + } + if (blob_callback_) { + // BlobFileBuilder::Abandon() is called because of error while writing to + // Blob files. So we can ignore the below error. + blob_callback_ + ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_, + job_id_, writer_->get_log_number(), + creation_reason_, s, "", "", blob_count_, + blob_bytes_) + .PermitUncheckedError(); + } + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/compression_type.h" +#include "rocksdb/env.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class VersionSet; +class FileSystem; +class SystemClock; +struct ImmutableOptions; +struct MutableCFOptions; +struct FileOptions; +class BlobFileAddition; +class Status; +class Slice; +class BlobLogWriter; +class IOTracer; +class BlobFileCompletionCallback; + +class BlobFileBuilder { + public: + BlobFileBuilder(VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, + const FileOptions* file_options, int job_id, + uint32_t column_family_id, + const std::string& column_family_name, + Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions); + + BlobFileBuilder(std::function file_number_generator, + FileSystem* fs, const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, + const FileOptions* file_options, int job_id, + uint32_t column_family_id, + const std::string& column_family_name, + Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions); + + BlobFileBuilder(const BlobFileBuilder&) = delete; + BlobFileBuilder& operator=(const BlobFileBuilder&) = delete; + + ~BlobFileBuilder(); + + Status Add(const Slice& key, const Slice& value, std::string* blob_index); + Status Finish(); + void Abandon(const Status& s); + + private: + bool IsBlobFileOpen() const; + Status OpenBlobFileIfNeeded(); + Status CompressBlobIfNeeded(Slice* blob, 
std::string* compressed_blob) const; + Status WriteBlobToFile(const Slice& key, const Slice& blob, + uint64_t* blob_file_number, uint64_t* blob_offset); + Status CloseBlobFile(); + Status CloseBlobFileIfNeeded(); + + std::function file_number_generator_; + FileSystem* fs_; + const ImmutableOptions* immutable_options_; + uint64_t min_blob_size_; + uint64_t blob_file_size_; + CompressionType blob_compression_type_; + const FileOptions* file_options_; + int job_id_; + uint32_t column_family_id_; + std::string column_family_name_; + Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; + std::shared_ptr io_tracer_; + BlobFileCompletionCallback* blob_callback_; + BlobFileCreationReason creation_reason_; + std::vector* blob_file_paths_; + std::vector* blob_file_additions_; + std::unique_ptr writer_; + uint64_t blob_count_; + uint64_t blob_bytes_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,672 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_builder.h" + +#include +#include +#include +#include +#include + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_sequential_reader.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/options.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/compression.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class TestFileNumberGenerator { + public: + uint64_t operator()() { return ++next_file_number_; } + + private: + uint64_t next_file_number_ = 1; +}; + +class BlobFileBuilderTest : public testing::Test { + protected: + BlobFileBuilderTest() { + mock_env_.reset(MockEnv::Create(Env::Default())); + fs_ = mock_env_->GetFileSystem().get(); + clock_ = mock_env_->GetSystemClock().get(); + } + + void VerifyBlobFile(uint64_t blob_file_number, + const std::string& blob_file_path, + uint32_t column_family_id, + CompressionType blob_compression_type, + const std::vector>& + expected_key_value_pairs, + const std::vector& blob_indexes) { + assert(expected_key_value_pairs.size() == blob_indexes.size()); + + std::unique_ptr file; + constexpr IODebugContext* dbg = nullptr; + ASSERT_OK( + fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg)); + + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file), blob_file_path, clock_)); + + constexpr Statistics* statistics = nullptr; + BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_, + statistics); + + BlobLogHeader header; + ASSERT_OK(blob_log_reader.ReadHeader(&header)); + ASSERT_EQ(header.version, kVersion1); + ASSERT_EQ(header.column_family_id, column_family_id); + ASSERT_EQ(header.compression, blob_compression_type); + 
ASSERT_FALSE(header.has_ttl); + ASSERT_EQ(header.expiration_range, ExpirationRange()); + + for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) { + BlobLogRecord record; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_reader.ReadRecord( + &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset)); + + // Check the contents of the blob file + const auto& expected_key_value = expected_key_value_pairs[i]; + const auto& key = expected_key_value.first; + const auto& value = expected_key_value.second; + + ASSERT_EQ(record.key_size, key.size()); + ASSERT_EQ(record.value_size, value.size()); + ASSERT_EQ(record.expiration, 0); + ASSERT_EQ(record.key, key); + ASSERT_EQ(record.value, value); + + // Make sure the blob reference returned by the builder points to the + // right place + BlobIndex blob_index; + ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i])); + ASSERT_FALSE(blob_index.IsInlined()); + ASSERT_FALSE(blob_index.HasTTL()); + ASSERT_EQ(blob_index.file_number(), blob_file_number); + ASSERT_EQ(blob_index.offset(), blob_offset); + ASSERT_EQ(blob_index.size(), value.size()); + } + + BlobLogFooter footer; + ASSERT_OK(blob_log_reader.ReadFooter(&footer)); + ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size()); + ASSERT_EQ(footer.expiration_range, ExpirationRange()); + } + + std::unique_ptr mock_env_; + FileSystem* fs_; + SystemClock* clock_; + FileOptions file_options_; +}; + +TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { + // Build a single blob file + constexpr size_t number_of_blobs = 10; + constexpr size_t key_size = 1; + constexpr size_t value_size = 4; + constexpr size_t value_offset = 1234; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderTest_BuildAndCheckOneFile"), + 0); + options.enable_blob_files = true; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + 
constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + std::vector> expected_key_value_pairs( + number_of_blobs); + std::vector blob_indexes(number_of_blobs); + + for (size_t i = 0; i < number_of_blobs; ++i) { + auto& expected_key_value = expected_key_value_pairs[i]; + + auto& key = expected_key_value.first; + key = std::to_string(i); + assert(key.size() == key_size); + + auto& value = expected_key_value.second; + value = std::to_string(i + value_offset); + assert(value.size() == value_size); + + auto& blob_index = blob_indexes[i]; + + ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + } + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + + const std::string& blob_file_path = blob_file_paths[0]; + + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_EQ(blob_file_additions.size(), 1); + + const auto& blob_file_addition = blob_file_additions[0]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs); + ASSERT_EQ( + blob_file_addition.GetTotalBlobBytes(), + number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size)); + + // Verify the contents of the new blob file as well as the blob references + 
VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kNoCompression, expected_key_value_pairs, blob_indexes); +} + +TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { + // Build multiple blob files: file size limit is set to the size of a single + // value, so each blob ends up in a file of its own + constexpr size_t number_of_blobs = 10; + constexpr size_t key_size = 1; + constexpr size_t value_size = 10; + constexpr size_t value_offset = 1234567890; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderTest_BuildAndCheckMultipleFiles"), + 0); + options.enable_blob_files = true; + options.blob_file_size = value_size; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + std::vector> expected_key_value_pairs( + number_of_blobs); + std::vector blob_indexes(number_of_blobs); + + for (size_t i = 0; i < number_of_blobs; ++i) { + auto& expected_key_value = expected_key_value_pairs[i]; + + auto& key = expected_key_value.first; + key = std::to_string(i); + assert(key.size() == key_size); + + auto& value = expected_key_value.second; + value = std::to_string(i + value_offset); + assert(value.size() == value_size); + + auto& blob_index = blob_indexes[i]; + + 
ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + } + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + ASSERT_EQ(blob_file_paths.size(), number_of_blobs); + ASSERT_EQ(blob_file_additions.size(), number_of_blobs); + + for (size_t i = 0; i < number_of_blobs; ++i) { + const uint64_t blob_file_number = i + 2; + + ASSERT_EQ(blob_file_paths[i], + BlobFileName(immutable_options.cf_paths.front().path, + blob_file_number)); + + const auto& blob_file_addition = blob_file_additions[i]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), + BlobLogRecord::kHeaderSize + key_size + value_size); + } + + // Verify the contents of the new blob files as well as the blob references + for (size_t i = 0; i < number_of_blobs; ++i) { + std::vector> expected_key_value_pair{ + expected_key_value_pairs[i]}; + std::vector blob_index{blob_indexes[i]}; + + VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression, + expected_key_value_pair, blob_index); + } +} + +TEST_F(BlobFileBuilderTest, InlinedValues) { + // All values are below the min_blob_size threshold; no blob files get written + constexpr size_t number_of_blobs = 10; + constexpr size_t key_size = 1; + constexpr size_t value_size = 10; + constexpr size_t value_offset = 1234567890; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderTest_InlinedValues"), + 0); + options.enable_blob_files = true; + options.min_blob_size = 1024; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint 
= Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + for (size_t i = 0; i < number_of_blobs; ++i) { + const std::string key = std::to_string(i); + assert(key.size() == key_size); + + const std::string value = std::to_string(i + value_offset); + assert(value.size() == value_size); + + std::string blob_index; + ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_TRUE(blob_index.empty()); + } + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + ASSERT_TRUE(blob_file_paths.empty()); + ASSERT_TRUE(blob_file_additions.empty()); +} + +TEST_F(BlobFileBuilderTest, Compression) { + // Build a blob file with a compressed blob + if (!Snappy_Supported()) { + return; + } + + constexpr size_t key_size = 1; + constexpr size_t value_size = 100; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"), + 0); + options.enable_blob_files = true; + options.blob_compression_type = kSnappyCompression; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, 
nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + const std::string key("1"); + const std::string uncompressed_value(value_size, 'x'); + + std::string blob_index; + + ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + + const std::string& blob_file_path = blob_file_paths[0]; + + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_EQ(blob_file_additions.size(), 1); + + const auto& blob_file_addition = blob_file_additions[0]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); + + CompressionOptions opts; + CompressionContext context(kSnappyCompression); + constexpr uint64_t sample_for_compression = 0; + + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + kSnappyCompression, sample_for_compression); + + std::string compressed_value; + ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(), + uncompressed_value.size(), &compressed_value)); + + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), + BlobLogRecord::kHeaderSize + key_size + compressed_value.size()); + + // Verify the contents of the new blob file as well as the blob reference + std::vector> expected_key_value_pairs{ + {key, compressed_value}}; + std::vector blob_indexes{blob_index}; + + VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kSnappyCompression, expected_key_value_pairs, blob_indexes); +} + +TEST_F(BlobFileBuilderTest, CompressionError) { + // Simulate an error during compression + if (!Snappy_Supported()) { + return; + } + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + 
"BlobFileBuilderTest_CompressionError"), + 0); + options.enable_blob_files = true; + options.blob_compression_type = kSnappyCompression; + options.env = mock_env_.get(); + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue", + [](void* arg) { + bool* ret = static_cast(arg); + *ret = false; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr char key[] = "1"; + constexpr char value[] = "deadbeef"; + + std::string blob_index; + + ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + ASSERT_EQ( + blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_TRUE(blob_file_additions.empty()); +} + +TEST_F(BlobFileBuilderTest, Checksum) { + // Build a blob file with checksum + + class DummyFileChecksumGenerator : public FileChecksumGenerator { + public: + void Update(const char* /* data */, size_t /* n */) override {} + + void Finalize() override {} + + std::string GetChecksum() const override { return std::string("dummy"); } + + const char* Name() const 
override { return "DummyFileChecksum"; } + }; + + class DummyFileChecksumGenFactory : public FileChecksumGenFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& /* context */) override { + return std::unique_ptr( + new DummyFileChecksumGenerator); + } + + const char* Name() const override { return "DummyFileChecksumGenFactory"; } + }; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"), + 0); + options.enable_blob_files = true; + options.file_checksum_gen_factory = + std::make_shared(); + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + const std::string key("1"); + const std::string value("deadbeef"); + + std::string blob_index; + + ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + + const std::string& blob_file_path = blob_file_paths[0]; + + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_EQ(blob_file_additions.size(), 1); + + const auto& blob_file_addition = 
blob_file_additions[0]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), + BlobLogRecord::kHeaderSize + key.size() + value.size()); + ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum"); + ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy"); + + // Verify the contents of the new blob file as well as the blob reference + std::vector> expected_key_value_pairs{ + {key, value}}; + std::vector blob_indexes{blob_index}; + + VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kNoCompression, expected_key_value_pairs, blob_indexes); +} + +class BlobFileBuilderIOErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + fs_ = mock_env_->GetFileSystem().get(); + } + + std::unique_ptr mock_env_; + FileSystem* fs_; + FileOptions file_options_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P( + BlobFileBuilderTest, BlobFileBuilderIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(BlobFileBuilderIOErrorTest, IOError) { + // Simulate an I/O error during the specified step of Add() + // Note: blob_file_size will be set to value_size in order for the first blob + // to trigger close + constexpr size_t value_size = 8; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderIOErrorTest_IOError"), + 0); + options.enable_blob_files = true; + options.blob_file_size = value_size; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions 
mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr char key[] = "1"; + constexpr char value[] = "deadbeef"; + + std::string blob_index; + + ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") { + ASSERT_TRUE(blob_file_paths.empty()); + } else { + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + ASSERT_EQ(blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, + blob_file_number)); + } + + ASSERT_TRUE(blob_file_additions.empty()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileCache::BlobFileCache(Cache* cache, + const ImmutableOptions* immutable_options, + const FileOptions* file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer) + : cache_(cache), + mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + immutable_options_(immutable_options), + file_options_(file_options), + column_family_id_(column_family_id), + blob_file_read_hist_(blob_file_read_hist), + io_tracer_(io_tracer) { + assert(cache_); + assert(immutable_options_); + assert(file_options_); +} + +Status BlobFileCache::GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader) { + assert(blob_file_reader); + assert(blob_file_reader->IsEmpty()); + + const Slice key = GetSlice(&blob_file_number); + + assert(cache_); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); + + // Check again while holding mutex + MutexLock lock(mutex_.get(key)); + + handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + assert(immutable_options_); + Statistics* 
const statistics = immutable_options_->stats; + + RecordTick(statistics, NO_FILE_OPENS); + + std::unique_ptr reader; + + { + assert(file_options_); + const Status s = BlobFileReader::Create( + *immutable_options_, *file_options_, column_family_id_, + blob_file_read_hist_, blob_file_number, io_tracer_, &reader); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + { + constexpr size_t charge = 1; + + const Status s = cache_->Insert(key, reader.get(), charge, + &DeleteCacheEntry, &handle); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + reader.release(); + + *blob_file_reader = CacheHandleGuard(cache_, handle); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include "cache/cache_helpers.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +class Status; +class BlobFileReader; +class Slice; +class IOTracer; + +class BlobFileCache { + public: + BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options, + const FileOptions* file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer); + + BlobFileCache(const BlobFileCache&) = delete; + BlobFileCache& operator=(const BlobFileCache&) = delete; + + Status GetBlobFileReader(uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader); + + private: + Cache* cache_; + // Note: mutex_ below is used to guard against multiple threads racing to open + // the same file. + Striped mutex_; + const ImmutableOptions* immutable_options_; + const FileOptions* file_options_; + uint32_t column_family_id_; + HistogramImpl* blob_file_read_hist_; + std::shared_ptr io_tracer_; + + static constexpr size_t kNumberOfMutexStripes = 1 << 7; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,268 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with a single blob in it. +void WriteBlobFile(uint32_t column_family_id, + const ImmutableOptions& immutable_options, + uint64_t blob_file_number) { + assert(!immutable_options.cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + std::string compressed_blob; + + uint64_t key_offset = 0; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + + BlobLogFooter footer; + footer.blob_count = 1; + footer.expiration_range = expiration_range; + + 
std::string checksum_method; + std::string checksum_value; + + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +class BlobFileCacheTest : public testing::Test { + protected: + BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + + std::unique_ptr mock_env_; +}; + +TEST_F(BlobFileCacheTest, GetBlobFileReader) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // First try: reader should be opened and put in cache + CacheHandleGuard first; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + // Second try: reader should be served from cache + CacheHandleGuard second; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { + Options 
options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_Race"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + CacheHandleGuard first; + CacheHandleGuard second; + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { + // Disabling sync points to prevent infinite recursion + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + 
test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_IOError"), + 0); + options.enable_blob_files = true; + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + ImmutableOptions immutable_options(options); + FileOptions file_options; + constexpr uint32_t column_family_id = 1; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Note: there is no blob file with the below number + constexpr uint64_t blob_file_number = 123; + + CacheHandleGuard reader; + + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_CacheFull"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 0; + constexpr int num_shard_bits = -1; // determined automatically + constexpr bool strict_capacity_limit = true; + std::shared_ptr backing_cache = + NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // 
Insert into cache should fail since it has zero capacity and + // strict_capacity_limit is set + CacheHandleGuard reader; + + ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) + .IsIncomplete()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileCompletionCallback { + public: + BlobFileCompletionCallback( + SstFileManager* sst_file_manager, InstrumentedMutex* mutex, + ErrorHandler* error_handler, EventLogger* event_logger, + const std::vector>& listeners, + const std::string& dbname) + : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) { +#ifndef ROCKSDB_LITE + sst_file_manager_ = sst_file_manager; + mutex_ = mutex; + error_handler_ = error_handler; +#else + (void)sst_file_manager; + (void)mutex; + (void)error_handler; +#endif // ROCKSDB_LITE + } + + void OnBlobFileCreationStarted(const std::string& file_name, + const std::string& column_family_name, + int job_id, + BlobFileCreationReason creation_reason) { +#ifndef ROCKSDB_LITE + // Notify the listeners. + EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_, + column_family_name, file_name, + job_id, creation_reason); +#else + (void)file_name; + (void)column_family_name; + (void)job_id; + (void)creation_reason; +#endif + } + + Status OnBlobFileCompleted(const std::string& file_name, + const std::string& column_family_name, int job_id, + uint64_t file_number, + BlobFileCreationReason creation_reason, + const Status& report_status, + const std::string& checksum_value, + const std::string& checksum_method, + uint64_t blob_count, uint64_t blob_bytes) { + Status s; + +#ifndef ROCKSDB_LITE + auto sfm = static_cast(sst_file_manager_); + if (sfm) { + // Report new blob files to SstFileManagerImpl + s = sfm->OnAddFile(file_name); + if (sfm->IsMaxAllowedSpaceReached()) { + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached"); + InstrumentedMutexLock l(mutex_); + error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); + } + } +#endif // 
!ROCKSDB_LITE + + // Notify the listeners. + EventHelpers::LogAndNotifyBlobFileCreationFinished( + event_logger_, listeners_, dbname_, column_family_name, file_name, + job_id, file_number, creation_reason, + (!report_status.ok() ? report_status : s), + (checksum_value.empty() ? kUnknownFileChecksum : checksum_value), + (checksum_method.empty() ? kUnknownFileChecksumFuncName + : checksum_method), + blob_count, blob_bytes); + return s; + } + + private: +#ifndef ROCKSDB_LITE + SstFileManager* sst_file_manager_; + InstrumentedMutex* mutex_; + ErrorHandler* error_handler_; +#endif // ROCKSDB_LITE + EventLogger* event_logger_; + std::vector> listeners_; + std::string dbname_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_garbage.h" + +#include +#include + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. 
+enum BlobFileGarbage::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileGarbage::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, garbage_blob_count_); + PutVarint64(output, garbage_blob_bytes_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. + + TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileGarbage::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileGarbage"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &garbage_blob_count_)) { + return Status::Corruption(class_name, "Error decoding garbage blob count"); + } + + if (!GetVarint64(input, &garbage_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding garbage blob bytes"); + } + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileGarbage::DebugString() const { + 
std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileGarbage::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() && + lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes(); +} + +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage) { + os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber() + << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount() + << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes(); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage) { + jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber() + << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount() + << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes(); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileGarbage { + public: + BlobFileGarbage() = default; + + BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : blob_file_number_(blob_file_number), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) {} + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + void EncodeTo(std::string* output) const; + Status DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; +}; + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,173 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_garbage.h" + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileGarbageTest : public testing::Test { + public: + static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) { + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded; + Slice input(encoded); + ASSERT_OK(decoded.DecodeFrom(&input)); + + ASSERT_EQ(blob_file_garbage, decoded); + } +}; + +TEST_F(BlobFileGarbageTest, Empty) { + BlobFileGarbage blob_file_garbage; + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, NonEmpty) { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t garbage_blob_count = 1; + constexpr uint64_t garbage_blob_bytes = 9876; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, DecodeErrors) { + std::string str; + Slice slice(str); + + BlobFileGarbage blob_file_garbage; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "blob file number")); + } + + constexpr uint64_t blob_file_number = 123; + PutVarint64(&str, blob_file_number); + slice = str; + + { + 
const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count")); + } + + constexpr uint64_t garbage_blob_count = 4567; + PutVarint64(&str, garbage_blob_count); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes")); + } + + constexpr uint64_t garbage_blob_bytes = 12345678; + PutVarint64(&str, garbage_blob_bytes); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field tag")); + } + + constexpr uint32_t custom_tag = 2; + PutVarint32(&str, custom_tag); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field value")); + } +} + +TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_compatible_tag = 2; + PutVarint32(output, forward_compatible_tag); + + PutLengthPrefixedSlice(output, "deadbeef"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 678; + constexpr uint64_t garbage_blob_count = 9999; + constexpr uint64_t garbage_blob_bytes = 100000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t 
forward_incompatible_tag = (1 << 6) + 1; + PutVarint32(output, forward_incompatible_tag); + + PutLengthPrefixedSlice(output, "foobar"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 456; + constexpr uint64_t garbage_blob_count = 100; + constexpr uint64_t garbage_blob_bytes = 2000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded_blob_file_addition; + Slice input(encoded); + const Status s = decoded_blob_file_addition.DecodeFrom(&input); + + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_meta.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +uint64_t SharedBlobFileMetaData::GetBlobFileSize() const { + return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize; +} + +std::string SharedBlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta) { + os << "blob_file_number: " << shared_meta.GetBlobFileNumber() + << " total_blob_count: " << shared_meta.GetTotalBlobCount() + << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes() + << " checksum_method: " << shared_meta.GetChecksumMethod() + << " checksum_value: " + << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +std::string BlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) { + const auto& shared_meta = meta.GetSharedMeta(); + assert(shared_meta); + os << (*shared_meta); + + os << " linked_ssts: {"; + for (uint64_t file_number : meta.GetLinkedSsts()) { + os << ' ' << file_number; + } + os << " }"; + + os << " garbage_blob_count: " << meta.GetGarbageBlobCount() + << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes(); + + return os; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// SharedBlobFileMetaData represents the immutable part of blob files' metadata, +// like the blob file number, total number and size of blobs, or checksum +// method and value. There is supposed to be one object of this class per blob +// file (shared across all versions that include the blob file in question); +// hence, the type is neither copyable nor movable. A blob file can be marked +// obsolete when the corresponding SharedBlobFileMetaData object is destroyed. + +class SharedBlobFileMetaData { + public: + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) { + return std::shared_ptr(new SharedBlobFileMetaData( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value))); + } + + template + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value, Deleter deleter) { + return std::shared_ptr( + new SharedBlobFileMetaData(blob_file_number, total_blob_count, + total_blob_bytes, std::move(checksum_method), + std::move(checksum_value)), + deleter); + } + + SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete; + SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete; + + SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete; + SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete; + + uint64_t GetBlobFileSize() const; + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t 
GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + std::string DebugString() const; + + private: + SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t blob_file_number_; + uint64_t total_blob_count_; + uint64_t total_blob_bytes_; + std::string checksum_method_; + std::string checksum_value_; +}; + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta); + +// BlobFileMetaData contains the part of the metadata for blob files that can +// vary across versions, like the amount of garbage in the blob file. In +// addition, BlobFileMetaData objects point to and share the ownership of the +// SharedBlobFileMetaData object for the corresponding blob file. Similarly to +// SharedBlobFileMetaData, BlobFileMetaData are not copyable or movable. They +// are meant to be jointly owned by the versions in which the blob file has the +// same (immutable *and* mutable) state. 
+ +class BlobFileMetaData { + public: + using LinkedSsts = std::unordered_set; + + static std::shared_ptr Create( + std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) { + return std::shared_ptr( + new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes)); + } + + BlobFileMetaData(const BlobFileMetaData&) = delete; + BlobFileMetaData& operator=(const BlobFileMetaData&) = delete; + + BlobFileMetaData(BlobFileMetaData&&) = delete; + BlobFileMetaData& operator=(BlobFileMetaData&&) = delete; + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileSize() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileSize(); + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + uint64_t GetTotalBlobCount() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobCount(); + } + uint64_t GetTotalBlobBytes() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobBytes(); + } + const std::string& GetChecksumMethod() const { + assert(shared_meta_); + return shared_meta_->GetChecksumMethod(); + } + const std::string& GetChecksumValue() const { + assert(shared_meta_); + return shared_meta_->GetChecksumValue(); + } + + const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + std::string DebugString() const; + + private: + BlobFileMetaData(std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : shared_meta_(std::move(shared_meta)), + linked_ssts_(std::move(linked_ssts)), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) { + assert(shared_meta_); + assert(garbage_blob_count_ <= 
shared_meta_->GetTotalBlobCount()); + assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes()); + } + + std::shared_ptr shared_meta_; + LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_; + uint64_t garbage_blob_bytes_; +}; + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,582 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_reader.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/file_prefetch_buffer.h" +#include "file/filename.h" +#include "monitoring/statistics.h" +#include "options/cf_options.h" +#include "rocksdb/file_system.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFileReader::Create( + const ImmutableOptions& immutable_options, const FileOptions& file_options, + uint32_t column_family_id, HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, const std::shared_ptr& io_tracer, + std::unique_ptr* blob_file_reader) { + assert(blob_file_reader); + assert(!*blob_file_reader); + + uint64_t file_size = 0; + std::unique_ptr file_reader; + + { + const Status s = + OpenFile(immutable_options, file_options, blob_file_read_hist, + blob_file_number, io_tracer, 
&file_size, &file_reader); + if (!s.ok()) { + return s; + } + } + + assert(file_reader); + + Statistics* const statistics = immutable_options.stats; + + CompressionType compression_type = kNoCompression; + + { + const Status s = ReadHeader(file_reader.get(), column_family_id, statistics, + &compression_type); + if (!s.ok()) { + return s; + } + } + + { + const Status s = ReadFooter(file_reader.get(), file_size, statistics); + if (!s.ok()) { + return s; + } + } + + blob_file_reader->reset( + new BlobFileReader(std::move(file_reader), file_size, compression_type, + immutable_options.clock, statistics)); + + return Status::OK(); +} + +Status BlobFileReader::OpenFile( + const ImmutableOptions& immutable_options, const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, uint64_t* file_size, + std::unique_ptr* file_reader) { + assert(file_size); + assert(file_reader); + + const auto& cf_paths = immutable_options.cf_paths; + assert(!cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(cf_paths.front().path, blob_file_number); + + FileSystem* const fs = immutable_options.fs.get(); + assert(fs); + + constexpr IODebugContext* dbg = nullptr; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize"); + + const Status s = + fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg); + if (!s.ok()) { + return s; + } + } + + if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) { + return Status::Corruption("Malformed blob file"); + } + + std::unique_ptr file; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile"); + + const Status s = + fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg); + if (!s.ok()) { + return s; + } + } + + assert(file); + + if (immutable_options.advise_random_on_open) { + file->Hint(FSRandomAccessFile::kRandom); + } + + file_reader->reset(new RandomAccessFileReader( + std::move(file), blob_file_path, 
immutable_options.clock, io_tracer, + immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS, + blob_file_read_hist, immutable_options.rate_limiter.get(), + immutable_options.listeners)); + + return Status::OK(); +} + +Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + uint32_t column_family_id, + Statistics* statistics, + CompressionType* compression_type) { + assert(file_reader); + assert(compression_type); + + Slice header_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile"); + + constexpr uint64_t read_offset = 0; + constexpr size_t read_size = BlobLogHeader::kSize; + + const Status s = + ReadFromFile(file_reader, read_offset, read_size, statistics, + &header_slice, &buf, &aligned_buf); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult", + &header_slice); + } + + BlobLogHeader header; + + { + const Status s = header.DecodeFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (header.has_ttl || header.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + if (header.column_family_id != column_family_id) { + return Status::Corruption("Column family ID mismatch"); + } + + *compression_type = header.compression; + + return Status::OK(); +} + +Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + uint64_t file_size, Statistics* statistics) { + assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); + assert(file_reader); + + Slice footer_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile"); + + const uint64_t read_offset = file_size - BlobLogFooter::kSize; + constexpr size_t read_size = BlobLogFooter::kSize; + + const Status s = + ReadFromFile(file_reader, read_offset, read_size, statistics, + &footer_slice, &buf, 
&aligned_buf); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult", + &footer_slice); + } + + BlobLogFooter footer; + + { + const Status s = footer.DecodeFrom(footer_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (footer.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + return Status::OK(); +} + +Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, + Buffer* buf, AlignedBuf* aligned_buf) { + assert(slice); + assert(buf); + assert(aligned_buf); + + assert(file_reader); + + RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size); + + Status s; + + if (file_reader->use_direct_io()) { + constexpr char* scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch, + aligned_buf); + } else { + buf->reset(new char[read_size]); + constexpr AlignedBuf* aligned_scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, + buf->get(), aligned_scratch); + } + + if (!s.ok()) { + return s; + } + + if (slice->size() != read_size) { + return Status::Corruption("Failed to read data from blob file"); + } + + return Status::OK(); +} + +BlobFileReader::BlobFileReader( + std::unique_ptr&& file_reader, uint64_t file_size, + CompressionType compression_type, SystemClock* clock, + Statistics* statistics) + : file_reader_(std::move(file_reader)), + file_size_(file_size), + compression_type_(compression_type), + clock_(clock), + statistics_(statistics) { + assert(file_reader_); +} + +BlobFileReader::~BlobFileReader() = default; + +Status BlobFileReader::GetBlob(const ReadOptions& read_options, + const Slice& user_key, uint64_t offset, + uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, + 
PinnableSlice* value, + uint64_t* bytes_read) const { + assert(value); + + const uint64_t key_size = user_key.size(); + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + return Status::Corruption("Invalid blob offset"); + } + + if (compression_type != compression_type_) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + // Note: if verify_checksum is set, we read the entire blob record to be able + // to perform the verification; otherwise, we just read the blob itself. Since + // the offset in BlobIndex actually points to the blob value, we need to make + // an adjustment in the former case. + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(offset >= adjustment); + + const uint64_t record_offset = offset - adjustment; + const uint64_t record_size = value_size + adjustment; + + Slice record_slice; + Buffer buf; + AlignedBuf aligned_buf; + + bool prefetched = false; + + if (prefetch_buffer) { + Status s; + constexpr bool for_compaction = true; + + prefetched = prefetch_buffer->TryReadFromCache( + IOOptions(), file_reader_.get(), record_offset, + static_cast(record_size), &record_slice, &s, for_compaction); + if (!s.ok()) { + return s; + } + } + + if (!prefetched) { + TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile"); + + const Status s = ReadFromFile(file_reader_.get(), record_offset, + static_cast(record_size), statistics_, + &record_slice, &buf, &aligned_buf); + if (!s.ok()) { + return s; + } + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult", + &record_slice); + + if (read_options.verify_checksums) { + const Status s = VerifyBlob(record_slice, user_key, value_size); + if (!s.ok()) { + return s; + } + } + + const Slice value_slice(record_slice.data() + adjustment, value_size); + + { + const Status s = UncompressBlobIfNeeded(value_slice, compression_type, + clock_, statistics_, value); + 
if (!s.ok()) { + return s; + } + } + + if (bytes_read) { + *bytes_read = record_size; + } + + return Status::OK(); +} + +void BlobFileReader::MultiGetBlob( + const ReadOptions& read_options, + const autovector>& user_keys, + const autovector& offsets, + const autovector& value_sizes, autovector& statuses, + autovector& values, uint64_t* bytes_read) const { + const size_t num_blobs = user_keys.size(); + assert(num_blobs > 0); + assert(num_blobs == offsets.size()); + assert(num_blobs == value_sizes.size()); + assert(num_blobs == statuses.size()); + assert(num_blobs == values.size()); + +#ifndef NDEBUG + for (size_t i = 0; i < offsets.size() - 1; ++i) { + assert(offsets[i] <= offsets[i + 1]); + } +#endif // !NDEBUG + + std::vector read_reqs(num_blobs); + autovector adjustments; + uint64_t total_len = 0; + for (size_t i = 0; i < num_blobs; ++i) { + const size_t key_size = user_keys[i].get().size(); + assert(IsValidBlobOffset(offsets[i], key_size, value_sizes[i], file_size_)); + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(offsets[i] >= adjustment); + adjustments.push_back(adjustment); + read_reqs[i].offset = offsets[i] - adjustment; + read_reqs[i].len = value_sizes[i] + adjustment; + total_len += read_reqs[i].len; + } + + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len); + + Buffer buf; + AlignedBuf aligned_buf; + + Status s; + bool direct_io = file_reader_->use_direct_io(); + if (direct_io) { + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = nullptr; + } + } else { + buf.reset(new char[total_len]); + std::ptrdiff_t pos = 0; + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = buf.get() + pos; + pos += read_reqs[i].len; + } + } + TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile"); + s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(), + direct_io ? 
&aligned_buf : nullptr); + if (!s.ok()) { + for (auto& req : read_reqs) { + req.status.PermitUncheckedError(); + } + for (size_t i = 0; i < num_blobs; ++i) { + assert(statuses[i]); + *statuses[i] = s; + } + return; + } + + assert(s.ok()); + for (size_t i = 0; i < num_blobs; ++i) { + auto& req = read_reqs[i]; + assert(statuses[i]); + if (req.status.ok() && req.result.size() != req.len) { + req.status = IOStatus::Corruption("Failed to read data from blob file"); + } + *statuses[i] = req.status; + } + + if (read_options.verify_checksums) { + for (size_t i = 0; i < num_blobs; ++i) { + assert(statuses[i]); + if (!statuses[i]->ok()) { + continue; + } + const Slice& record_slice = read_reqs[i].result; + s = VerifyBlob(record_slice, user_keys[i], value_sizes[i]); + if (!s.ok()) { + assert(statuses[i]); + *statuses[i] = s; + } + } + } + + for (size_t i = 0; i < num_blobs; ++i) { + assert(statuses[i]); + if (!statuses[i]->ok()) { + continue; + } + const Slice& record_slice = read_reqs[i].result; + const Slice value_slice(record_slice.data() + adjustments[i], + value_sizes[i]); + s = UncompressBlobIfNeeded(value_slice, compression_type_, clock_, + statistics_, values[i]); + if (!s.ok()) { + *statuses[i] = s; + } + } + + if (bytes_read) { + uint64_t total_bytes = 0; + for (const auto& req : read_reqs) { + total_bytes += req.result.size(); + } + *bytes_read = total_bytes; + } +} + +Status BlobFileReader::VerifyBlob(const Slice& record_slice, + const Slice& user_key, uint64_t value_size) { + BlobLogRecord record; + + const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize); + + { + const Status s = record.DecodeHeaderFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + if (record.key_size != user_key.size()) { + return Status::Corruption("Key size mismatch when reading blob"); + } + + if (record.value_size != value_size) { + return Status::Corruption("Value size mismatch when reading blob"); + } + + record.key = + Slice(record_slice.data() + 
BlobLogRecord::kHeaderSize, record.key_size); + if (record.key != user_key) { + return Status::Corruption("Key mismatch when reading blob"); + } + + record.value = Slice(record.key.data() + record.key_size, value_size); + + { + TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC", + &record); + + const Status s = record.CheckBlobCRC(); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +Status BlobFileReader::UncompressBlobIfNeeded(const Slice& value_slice, + CompressionType compression_type, + SystemClock* clock, + Statistics* statistics, + PinnableSlice* value) { + assert(value); + + if (compression_type == kNoCompression) { + SaveValue(value_slice, value); + + return Status::OK(); + } + + UncompressionContext context(compression_type); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression_type); + + size_t uncompressed_size = 0; + constexpr uint32_t compression_format_version = 2; + constexpr MemoryAllocator* allocator = nullptr; + + CacheAllocationPtr output; + + { + StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS); + output = UncompressData(info, value_slice.data(), value_slice.size(), + &uncompressed_size, compression_format_version, + allocator); + } + + TEST_SYNC_POINT_CALLBACK( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output); + + if (!output) { + return Status::Corruption("Unable to uncompress blob"); + } + + SaveValue(Slice(output.get(), uncompressed_size), value); + + return Status::OK(); +} + +void BlobFileReader::SaveValue(const Slice& src, PinnableSlice* dst) { + assert(dst); + + if (dst->IsPinned()) { + dst->Reset(); + } + + dst->PinSelf(src); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "file/random_access_file_reader.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class Status; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +struct ReadOptions; +class Slice; +class FilePrefetchBuffer; +class PinnableSlice; +class Statistics; + +class BlobFileReader { + public: + static Status Create(const ImmutableOptions& immutable_options, + const FileOptions& file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + std::unique_ptr* reader); + + BlobFileReader(const BlobFileReader&) = delete; + BlobFileReader& operator=(const BlobFileReader&) = delete; + + ~BlobFileReader(); + + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + uint64_t offset, uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + // offsets must be sorted in ascending order by caller. 
+ void MultiGetBlob( + const ReadOptions& read_options, + const autovector>& user_keys, + const autovector& offsets, + const autovector& value_sizes, autovector& statuses, + autovector& values, uint64_t* bytes_read) const; + + CompressionType GetCompressionType() const { return compression_type_; } + + uint64_t GetFileSize() const { return file_size_; } + + private: + BlobFileReader(std::unique_ptr&& file_reader, + uint64_t file_size, CompressionType compression_type, + SystemClock* clock, Statistics* statistics); + + static Status OpenFile(const ImmutableOptions& immutable_options, + const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + uint64_t* file_size, + std::unique_ptr* file_reader); + + static Status ReadHeader(const RandomAccessFileReader* file_reader, + uint32_t column_family_id, Statistics* statistics, + CompressionType* compression_type); + + static Status ReadFooter(const RandomAccessFileReader* file_reader, + uint64_t file_size, Statistics* statistics); + + using Buffer = std::unique_ptr; + + static Status ReadFromFile(const RandomAccessFileReader* file_reader, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, Buffer* buf, + AlignedBuf* aligned_buf); + + static Status VerifyBlob(const Slice& record_slice, const Slice& user_key, + uint64_t value_size); + + static Status UncompressBlobIfNeeded(const Slice& value_slice, + CompressionType compression_type, + SystemClock* clock, + Statistics* statistics, + PinnableSlice* value); + + static void SaveValue(const Slice& src, PinnableSlice* dst); + + std::unique_ptr file_reader_; + uint64_t file_size_; + CompressionType compression_type_; + SystemClock* clock_; + Statistics* statistics_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,974 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_reader.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/compression.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with `num` blobs in it. 
+void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const std::vector& keys, + const std::vector& blobs, CompressionType compression, + std::vector& blob_offsets, + std::vector& blob_sizes) { + assert(!immutable_options.cf_paths.empty()); + size_t num = keys.size(); + assert(num == blobs.size()); + assert(num == blob_offsets.size()); + assert(num == blob_sizes.size()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + BlobLogHeader header(column_family_id, compression, has_ttl, + expiration_range_header); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + std::vector compressed_blobs(num); + std::vector blobs_to_write(num); + if (kNoCompression == compression) { + for (size_t i = 0; i < num; ++i) { + blobs_to_write[i] = blobs[i]; + blob_sizes[i] = blobs[i].size(); + } + } else { + CompressionOptions opts; + CompressionContext context(compression); + constexpr uint64_t sample_for_compression = 0; + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + compression, sample_for_compression); + + constexpr uint32_t compression_format_version = 2; + + for (size_t i = 0; i < num; ++i) { + ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, + &compressed_blobs[i])); + blobs_to_write[i] 
= compressed_blobs[i]; + blob_sizes[i] = compressed_blobs[i].size(); + } + } + + for (size_t i = 0; i < num; ++i) { + uint64_t key_offset = 0; + ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + &blob_offsets[i])); + } + + BlobLogFooter footer; + footer.blob_count = num; + footer.expiration_range = expiration_range_footer; + + std::string checksum_method; + std::string checksum_value; + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +// Creates a test blob file with a single blob in it. Note: this method +// makes it possible to test various corner cases by allowing the caller +// to specify the contents of various blob file header/footer fields. +void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const Slice& key, + const Slice& blob, CompressionType compression, + uint64_t* blob_offset, uint64_t* blob_size) { + std::vector keys{key}; + std::vector blobs{blob}; + std::vector blob_offsets{0}; + std::vector blob_sizes{0}; + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, keys, blobs, compression, blob_offsets, + blob_sizes); + if (blob_offset) { + *blob_offset = blob_offsets[0]; + } + if (blob_size) { + *blob_size = blob_sizes[0]; + } +} + +} // anonymous namespace + +class BlobFileReaderTest : public testing::Test { + protected: + BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + std::unique_ptr mock_env_; +}; + +TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_CreateReaderAndGetBlob"), + 0); + options.enable_blob_files = true; + + ImmutableOptions 
immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 3; + const std::vector key_strs = {"key1", "key2", "key3"}; + const std::vector blob_strs = {"blob1", "blob2", "blob3"}; + + const std::vector keys = {key_strs[0], key_strs[1], key_strs[2]}; + const std::vector blobs = {blob_strs[0], blob_strs[1], blob_strs[2]}; + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read)); + ASSERT_EQ(value, blobs[0]); + ASSERT_EQ(bytes_read, blob_sizes[0]); + + // MultiGetBlob + bytes_read = 0; + size_t total_size = 0; + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + autovector offsets{blob_offsets[0], blob_offsets[1], + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1], blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + 
reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_OK(statuses_buf[i]); + ASSERT_EQ(value_buf[i], blobs[i]); + total_size += blob_sizes[i]; + } + ASSERT_EQ(bytes_read, total_size); + } + + read_options.verify_checksums = true; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1], kNoCompression, prefetch_buffer, + &value, &bytes_read)); + ASSERT_EQ(value, blobs[1]); + + const uint64_t key_size = keys[1].size(); + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_sizes[1]); + } + + // Invalid offset (too close to start of file) + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0] - 1, + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + // Invalid offset (too close to end of file) + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[2], blob_offsets[2] + 1, + blob_sizes[2], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect compression type + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kZSTD, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect key size + { + constexpr char shorter_key[] = "k"; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, shorter_key, + blob_offsets[0] - + (keys[0].size() - sizeof(shorter_key) + 1), + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + 
autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1); + key_refs[1] = std::cref(shorter_key_slice); + + autovector offsets{ + blob_offsets[0], + blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()), + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1], blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + if (i == 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect key + { + constexpr char incorrect_key[] = "foo1"; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, incorrect_key, blob_offsets[0], + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1); + key_refs[2] = std::cref(wrong_key_slice); + + autovector offsets{blob_offsets[0], blob_offsets[1], + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1], blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + if (i == num_blobs - 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + 
ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect value size + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1] + 1, kNoCompression, + prefetch_buffer, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + autovector offsets{blob_offsets[0], blob_offsets[1], + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1] + 1, blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + if (i != 1) { + ASSERT_OK(statuses_buf[i]); + } else { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } + } + } +} + +TEST_F(BlobFileReaderTest, Malformed) { + // Write a blob file consisting of nothing but a header, and make sure we + // detect the error when we open it for reading + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr uint64_t blob_file_number = 1; + + { + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), + 
immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), + immutable_options.clock, statistics, + blob_file_number, use_fsync, do_flush); + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + } + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, TTL) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = true; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInHeader"), + 0); + options.enable_blob_files = 
true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + const ExpirationRange expiration_range_header( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr ExpirationRange expiration_range_footer; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInFooter"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range_header; + const ExpirationRange expiration_range_footer( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, 
FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_IncorrectColumnFamily"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + constexpr uint32_t incorrect_column_family_id = 2; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + incorrect_column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, BlobCRCError) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + 
&blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { + BlobLogRecord* const record = static_cast(arg); + assert(record); + + record->blob_crc = 0xfaceb00c; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileReaderTest, Compression) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, 
&reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, &value, + &bytes_read)); + ASSERT_EQ(value, blob); + ASSERT_EQ(bytes_read, blob_size); + } + + read_options.verify_checksums = true; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, &value, + &bytes_read)); + ASSERT_EQ(value, blob); + + constexpr uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); + } +} + +TEST_F(BlobFileReaderTest, UncompressionError) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_UncompressionError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + 
SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { + CacheAllocationPtr* const output = + static_cast(arg); + assert(output); + + output->reset(); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class BlobFileReaderIOErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileReaderIOErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get())); + } + + std::unique_ptr mock_env_; + std::unique_ptr fault_injection_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:GetFileSize", + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::ReadHeader:ReadFromFile", + "BlobFileReader::ReadFooter:ReadFromFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +TEST_P(BlobFileReaderIOErrorTest, IOError) { + // Simulates an I/O error during the specified step + + Options options; + options.env = fault_injection_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(fault_injection_env_.get(), + "BlobFileReaderIOErrorTest_IOError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + 
constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + + const bool fail_during_create = + (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); + + if (fail_during_create) { + ASSERT_TRUE(s.IsIOError()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, &value, + &bytes_read) + .IsIOError()); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class BlobFileReaderDecodingErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + } + + std::unique_ptr mock_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::ReadHeader:TamperWithResult", + "BlobFileReader::ReadFooter:TamperWithResult", + "BlobFileReader::GetBlob:TamperWithResult"})); + +TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { + Options options; + options.env = mock_env_.get(); 
+ options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderDecodingErrorTest_DecodingError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) { + Slice* const slice = static_cast(arg); + assert(slice); + assert(!slice->empty()); + + slice->remove_prefix(1); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + + const bool fail_during_create = + sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; + + if (fail_during_create) { + ASSERT_TRUE(s.IsCorruption()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_garbage_meter.h" + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + flows_[blob_file_number].AddInFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + // Note: in order to measure the amount of additional garbage, we only need to + // track the outflow for preexisting files, i.e. those that also had inflow. + // (Newly written files would only have outflow.) 
+ auto it = flows_.find(blob_file_number); + if (it == flows_.end()) { + return Status::OK(); + } + + it->second.AddOutFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes) { + assert(blob_file_number); + assert(*blob_file_number == kInvalidBlobFileNumber); + assert(bytes); + assert(*bytes == 0); + + ParsedInternalKey ikey; + + { + constexpr bool log_err_key = false; + const Status s = ParseInternalKey(key, &ikey, log_err_key); + if (!s.ok()) { + return s; + } + } + + if (ikey.type != kTypeBlobIndex) { + return Status::OK(); + } + + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return s; + } + } + + if (blob_index.IsInlined() || blob_index.HasTTL()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + *blob_file_number = blob_index.file_number(); + *bytes = + blob_index.size() + + BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size()); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// A class that can be used to compute the amount of additional garbage +// generated by a compaction. It parses the keys and blob references in the +// input and output of a compaction, and aggregates the "inflow" and "outflow" +// on a per-blob file basis. The amount of additional garbage for any given blob +// file can then be computed by subtracting the outflow from the inflow. +class BlobGarbageMeter { + public: + // A class to store the number and total size of blobs on a per-blob file + // basis. + class BlobStats { + public: + void Add(uint64_t bytes) { + ++count_; + bytes_ += bytes; + } + void Add(uint64_t count, uint64_t bytes) { + count_ += count; + bytes_ += bytes; + } + + uint64_t GetCount() const { return count_; } + uint64_t GetBytes() const { return bytes_; } + + private: + uint64_t count_ = 0; + uint64_t bytes_ = 0; + }; + + // A class to keep track of the "inflow" and the "outflow" and to compute the + // amount of additional garbage for a given blob file. 
+ class BlobInOutFlow { + public: + void AddInFlow(uint64_t bytes) { + in_flow_.Add(bytes); + assert(IsValid()); + } + void AddOutFlow(uint64_t bytes) { + out_flow_.Add(bytes); + assert(IsValid()); + } + + const BlobStats& GetInFlow() const { return in_flow_; } + const BlobStats& GetOutFlow() const { return out_flow_; } + + bool IsValid() const { + return in_flow_.GetCount() >= out_flow_.GetCount() && + in_flow_.GetBytes() >= out_flow_.GetBytes(); + } + bool HasGarbage() const { + assert(IsValid()); + return in_flow_.GetCount() > out_flow_.GetCount(); + } + uint64_t GetGarbageCount() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetCount() - out_flow_.GetCount(); + } + uint64_t GetGarbageBytes() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetBytes() - out_flow_.GetBytes(); + } + + private: + BlobStats in_flow_; + BlobStats out_flow_; + }; + + Status ProcessInFlow(const Slice& key, const Slice& value); + Status ProcessOutFlow(const Slice& key, const Slice& value); + + const std::unordered_map& flows() const { + return flows_; + } + + private: + static Status Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes); + + std::unordered_map flows_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_garbage_meter.h" + +#include +#include + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(BlobGarbageMeterTest, MeasureGarbage) { + BlobGarbageMeter blob_garbage_meter; + + struct BlobDescriptor { + std::string user_key; + uint64_t blob_file_number; + uint64_t offset; + uint64_t size; + CompressionType compression_type; + bool has_in_flow; + bool has_out_flow; + + uint64_t GetExpectedBytes() const { + return size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(user_key.size()); + } + }; + + // Note: blob file 4 has the same inflow and outflow and hence no additional + // garbage. Blob file 5 has less outflow than inflow and thus it does have + // additional garbage. Blob file 6 is a newly written file (i.e. no inflow, + // only outflow) and is thus not tracked by the meter. + std::vector blobs{ + {"key", 4, 1234, 555, kLZ4Compression, true, true}, + {"other_key", 4, 6789, 101010, kLZ4Compression, true, true}, + {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true}, + {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true}, + {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false}, + {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false}, + {"new_key", 6, 7777, 9999, kNoCompression, false, true}}; + + for (const auto& blob : blobs) { + constexpr SequenceNumber seq = 123; + const InternalKey key(blob.user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + std::string value; + BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size, + blob.compression_type); + const Slice value_slice(value); + + if (blob.has_in_flow) { + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + } + if (blob.has_out_flow) { + ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + } + } + + const auto& flows = blob_garbage_meter.flows(); + 
ASSERT_EQ(flows.size(), 2); + + { + const auto it = flows.find(4); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + constexpr uint64_t expected_count = 2; + const uint64_t expected_bytes = + blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes(); + + const auto& in = flow.GetInFlow(); + ASSERT_EQ(in.GetCount(), expected_count); + ASSERT_EQ(in.GetBytes(), expected_bytes); + + const auto& out = flow.GetOutFlow(); + ASSERT_EQ(out.GetCount(), expected_count); + ASSERT_EQ(out.GetBytes(), expected_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_FALSE(flow.HasGarbage()); + } + + { + const auto it = flows.find(5); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + const auto& in = flow.GetInFlow(); + + constexpr uint64_t expected_in_count = 4; + const uint64_t expected_in_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() + + blobs[4].GetExpectedBytes() + blobs[5].GetExpectedBytes(); + + ASSERT_EQ(in.GetCount(), expected_in_count); + ASSERT_EQ(in.GetBytes(), expected_in_bytes); + + const auto& out = flow.GetOutFlow(); + + constexpr uint64_t expected_out_count = 2; + const uint64_t expected_out_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes(); + + ASSERT_EQ(out.GetCount(), expected_out_count); + ASSERT_EQ(out.GetBytes(), expected_out_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_TRUE(flow.HasGarbage()); + ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count); + ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes); + } +} + +TEST(BlobGarbageMeterTest, PlainValue) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeValue); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + 
ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + ASSERT_TRUE(blob_garbage_meter.flows().empty()); +} + +TEST(BlobGarbageMeterTest, CorruptInternalKey) { + constexpr char corrupt_key[] = "i_am_corrupt"; + const Slice key_slice(corrupt_key); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +TEST(BlobGarbageMeterTest, CorruptBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "i_am_not_a_blob_index"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr uint64_t expiration = 1234567890; + constexpr char inlined_value[] = "inlined"; + + std::string value; + BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value); + + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_index.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_index.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_index.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/compression_type.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// BlobIndex is a pointer to the blob and metadata of the blob. The index is +// stored in base DB as ValueType::kTypeBlobIndex. +// There are three types of blob index: +// +// kInlinedTTL: +// +------+------------+---------------+ +// | type | expiration | value | +// +------+------------+---------------+ +// | char | varint64 | variable size | +// +------+------------+---------------+ +// +// kBlob: +// +------+-------------+----------+----------+-------------+ +// | type | file number | offset | size | compression | +// +------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | char | +// +------+-------------+----------+----------+-------------+ +// +// kBlobTTL: +// +------+------------+-------------+----------+----------+-------------+ +// | type | expiration | file number | offset | size | compression | +// +------+------------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | varint64 | char | +// +------+------------+-------------+----------+----------+-------------+ +// +// There isn't a kInlined (without TTL) type since we can store it as a plain +// value (i.e. ValueType::kTypeValue). 
+class BlobIndex { + public: + enum class Type : unsigned char { + kInlinedTTL = 0, + kBlob = 1, + kBlobTTL = 2, + kUnknown = 3, + }; + + BlobIndex() : type_(Type::kUnknown) {} + + BlobIndex(const BlobIndex&) = default; + BlobIndex& operator=(const BlobIndex&) = default; + + bool IsInlined() const { return type_ == Type::kInlinedTTL; } + + bool HasTTL() const { + return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; + } + + uint64_t expiration() const { + assert(HasTTL()); + return expiration_; + } + + const Slice& value() const { + assert(IsInlined()); + return value_; + } + + uint64_t file_number() const { + assert(!IsInlined()); + return file_number_; + } + + uint64_t offset() const { + assert(!IsInlined()); + return offset_; + } + + uint64_t size() const { + assert(!IsInlined()); + return size_; + } + + CompressionType compression() const { + assert(!IsInlined()); + return compression_; + } + + Status DecodeFrom(Slice slice) { + static const std::string kErrorMessage = "Error while decoding blob index"; + assert(slice.size() > 0); + type_ = static_cast(*slice.data()); + if (type_ >= Type::kUnknown) { + return Status::Corruption( + kErrorMessage, + "Unknown blob index type: " + ToString(static_cast(type_))); + } + slice = Slice(slice.data() + 1, slice.size() - 1); + if (HasTTL()) { + if (!GetVarint64(&slice, &expiration_)) { + return Status::Corruption(kErrorMessage, "Corrupted expiration"); + } + } + if (IsInlined()) { + value_ = slice; + } else { + if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && + GetVarint64(&slice, &size_) && slice.size() == 1) { + compression_ = static_cast(*slice.data()); + } else { + return Status::Corruption(kErrorMessage, "Corrupted blob offset"); + } + } + return Status::OK(); + } + + std::string DebugString(bool output_hex) const { + std::ostringstream oss; + + if (IsInlined()) { + oss << "[inlined blob] value:" << value_.ToString(output_hex); + } else { + oss << "[blob ref] file:" << file_number_ << 
" offset:" << offset_ + << " size:" << size_ + << " compression: " << CompressionTypeToString(compression_); + } + + if (HasTTL()) { + oss << " exp:" << expiration_; + } + + return oss.str(); + } + + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, + const Slice& value) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(1 + kMaxVarint64Length + value.size()); + dst->push_back(static_cast(Type::kInlinedTTL)); + PutVarint64(dst, expiration); + dst->append(value.data(), value.size()); + } + + static void EncodeBlob(std::string* dst, uint64_t file_number, + uint64_t offset, uint64_t size, + CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 3 + 2); + dst->push_back(static_cast(Type::kBlob)); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + static void EncodeBlobTTL(std::string* dst, uint64_t expiration, + uint64_t file_number, uint64_t offset, + uint64_t size, CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 4 + 2); + dst->push_back(static_cast(Type::kBlobTTL)); + PutVarint64(dst, expiration); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + private: + Type type_ = Type::kUnknown; + uint64_t expiration_ = 0; + Slice value_; + uint64_t file_number_ = 0; + uint64_t offset_ = 0; + uint64_t size_ = 0; + CompressionType compression_ = kNoCompression; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 
+1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "db/blob/blob_log_format.h" + +#include "util/coding.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { + +void BlobLogHeader::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogHeader::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed32(dst, version); + PutFixed32(dst, column_family_id); + unsigned char flags = (has_ttl ? 1 : 0); + dst->push_back(flags); + dst->push_back(compression); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); +} + +Status BlobLogHeader::DecodeFrom(Slice src) { + static const std::string kErrorMessage = + "Error while decoding blob log header"; + if (src.size() != BlobLogHeader::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file header size"); + } + uint32_t magic_number; + unsigned char flags; + if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) || + !GetFixed32(&src, &column_family_id)) { + return Status::Corruption( + kErrorMessage, + "Error decoding magic number, version and column family id"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (version != kVersion1) { + return Status::Corruption(kErrorMessage, "Unknown header version"); + } + flags = src.data()[0]; + compression = static_cast(src.data()[1]); + has_ttl = (flags & 1) == 1; + src.remove_prefix(2); + if (!GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second)) { + return Status::Corruption(kErrorMessage, "Error decoding expiration range"); + } + return Status::OK(); +} + +void BlobLogFooter::EncodeTo(std::string* dst) { + assert(dst != nullptr); + 
dst->clear(); + dst->reserve(BlobLogFooter::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed64(dst, blob_count); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); + crc = crc32c::Value(dst->c_str(), dst->size()); + crc = crc32c::Mask(crc); + PutFixed32(dst, crc); +} + +Status BlobLogFooter::DecodeFrom(Slice src) { + static const std::string kErrorMessage = + "Error while decoding blob log footer"; + if (src.size() != BlobLogFooter::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file footer size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t)); + src_crc = crc32c::Mask(src_crc); + uint32_t magic_number = 0; + if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) || + !GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (src_crc != crc) { + return Status::Corruption(kErrorMessage, "CRC mismatch"); + } + return Status::OK(); +} + +void BlobLogRecord::EncodeHeaderTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size()); + PutFixed64(dst, key.size()); + PutFixed64(dst, value.size()); + PutFixed64(dst, expiration); + header_crc = crc32c::Value(dst->c_str(), dst->size()); + header_crc = crc32c::Mask(header_crc); + PutFixed32(dst, header_crc); + blob_crc = crc32c::Value(key.data(), key.size()); + blob_crc = crc32c::Extend(blob_crc, value.data(), value.size()); + blob_crc = crc32c::Mask(blob_crc); + PutFixed32(dst, blob_crc); +} + +Status BlobLogRecord::DecodeHeaderFrom(Slice src) { + static const std::string kErrorMessage = "Error while decoding blob record"; + if (src.size() != 
BlobLogRecord::kHeaderSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob record header size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8); + src_crc = crc32c::Mask(src_crc); + if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) || + !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) || + !GetFixed32(&src, &blob_crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (src_crc != header_crc) { + return Status::Corruption(kErrorMessage, "Header CRC mismatch"); + } + return Status::OK(); +} + +Status BlobLogRecord::CheckBlobCRC() const { + uint32_t expected_crc = 0; + expected_crc = crc32c::Value(key.data(), key.size()); + expected_crc = crc32c::Extend(expected_crc, value.data(), value.size()); + expected_crc = crc32c::Mask(expected_crc); + if (expected_crc != blob_crc) { + return Status::Corruption("Blob CRC mismatch"); + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,149 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Log format information shared by reader and writer. 
+ +#pragma once + +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37 +constexpr uint32_t kVersion1 = 1; + +using ExpirationRange = std::pair; + +// Format of blob log file header (30 bytes): +// +// +--------------+---------+---------+-------+-------------+-------------------+ +// | magic number | version | cf id | flags | compression | expiration range | +// +--------------+---------+---------+-------+-------------+-------------------+ +// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 | +// +--------------+---------+---------+-------+-------------+-------------------+ +// +// List of flags: +// has_ttl: Whether the file contain TTL data. +// +// Expiration range in the header is a rough range based on +// blob_db_options.ttl_range_secs. +struct BlobLogHeader { + static constexpr size_t kSize = 30; + + BlobLogHeader() = default; + BlobLogHeader(uint32_t _column_family_id, CompressionType _compression, + bool _has_ttl, const ExpirationRange& _expiration_range) + : column_family_id(_column_family_id), + compression(_compression), + has_ttl(_has_ttl), + expiration_range(_expiration_range) {} + + uint32_t version = kVersion1; + uint32_t column_family_id = 0; + CompressionType compression = kNoCompression; + bool has_ttl = false; + ExpirationRange expiration_range; + + void EncodeTo(std::string* dst); + + Status DecodeFrom(Slice slice); +}; + +// Format of blob log file footer (32 bytes): +// +// +--------------+------------+-------------------+------------+ +// | magic number | blob count | expiration range | footer CRC | +// +--------------+------------+-------------------+------------+ +// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed32 | +// +--------------+------------+-------------------+------------+ +// +// The footer will be presented only when the blob file is 
properly closed. +// +// Unlike the same field in file header, expiration range in the footer is the +// range of smallest and largest expiration of the data in this file. +struct BlobLogFooter { + static constexpr size_t kSize = 32; + + uint64_t blob_count = 0; + ExpirationRange expiration_range = std::make_pair(0, 0); + uint32_t crc = 0; + + void EncodeTo(std::string* dst); + + Status DecodeFrom(Slice slice); +}; + +// Blob record format (32 bytes header + key + value): +// +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | key length | value length | expiration | header CRC | blob CRC | key | value | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// +// If file has has_ttl = false, expiration field is always 0, and the blob +// doesn't has expiration. +// +// Also note that if compression is used, value is compressed value and value +// length is compressed value length. +// +// Header CRC is the checksum of (key_len + val_len + expiration), while +// blob CRC is the checksum of (key + value). +// +// We could use variable length encoding (Varint64) to save more space, but it +// make reader more complicated. +struct BlobLogRecord { + // header include fields up to blob CRC + static constexpr size_t kHeaderSize = 32; + + // Note that the offset field of BlobIndex actually points to the blob value + // as opposed to the start of the blob record. The following method can + // be used to calculate the adjustment needed to read the blob record header. 
+ static constexpr uint64_t CalculateAdjustmentForRecordHeader( + uint64_t key_size) { + return key_size + kHeaderSize; + } + + uint64_t key_size = 0; + uint64_t value_size = 0; + uint64_t expiration = 0; + uint32_t header_crc = 0; + uint32_t blob_crc = 0; + Slice key; + Slice value; + std::unique_ptr key_buf; + std::unique_ptr value_buf; + + uint64_t record_size() const { return kHeaderSize + key_size + value_size; } + + void EncodeHeaderTo(std::string* dst); + + Status DecodeHeaderFrom(Slice src); + + Status CheckBlobCRC() const; +}; + +// Checks whether a blob offset is potentially valid or not. +inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size, + uint64_t value_size, uint64_t file_size) { + if (value_offset < + BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) { + return false; + } + + if (value_offset + value_size + BlobLogFooter::kSize > file_size) { + return false; + } + + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "db/blob/blob_log_sequential_reader.h" + +#include "file/random_access_file_reader.h" +#include "monitoring/statistics.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogSequentialReader::BlobLogSequentialReader( + std::unique_ptr&& file_reader, SystemClock* clock, + Statistics* statistics) + : file_(std::move(file_reader)), + clock_(clock), + statistics_(statistics), + next_byte_(0) {} + +BlobLogSequentialReader::~BlobLogSequentialReader() = default; + +Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, + char* buf) { + assert(slice); + assert(file_); + + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + Status s = file_->Read(IOOptions(), next_byte_, static_cast(size), + slice, buf, nullptr); + next_byte_ += size; + if (!s.ok()) { + return s; + } + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size()); + if (slice->size() != size) { + return Status::Corruption("EOF reached while reading record"); + } + return s; +} + +Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) { + assert(header); + assert(next_byte_ == 0); + + static_assert(BlobLogHeader::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogHeader::kSize"); + + Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogHeader::kSize) { + return Status::Corruption("EOF reached before file header"); + } + + return header->DecodeFrom(buffer_); +} + +Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record, + ReadLevel level, + uint64_t* blob_offset) { + assert(record); + static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogRecord::kHeaderSize"); + + Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + if (buffer_.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption("EOF reached before record 
header"); + } + + s = record->DecodeHeaderFrom(buffer_); + if (!s.ok()) { + return s; + } + + uint64_t kb_size = record->key_size + record->value_size; + if (blob_offset != nullptr) { + *blob_offset = next_byte_ + record->key_size; + } + + switch (level) { + case kReadHeader: + next_byte_ += kb_size; + break; + + case kReadHeaderKey: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); + next_byte_ += record->value_size; + break; + + case kReadHeaderKeyBlob: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); + if (s.ok()) { + record->value_buf.reset(new char[record->value_size]); + s = ReadSlice(record->value_size, &record->value, + record->value_buf.get()); + } + if (s.ok()) { + s = record->CheckBlobCRC(); + } + break; + } + return s; +} + +Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) { + assert(footer); + static_assert(BlobLogFooter::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogFooter::kSize"); + + Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogFooter::kSize) { + return Status::Corruption("EOF reached before file footer"); + } + + return footer->DecodeFrom(buffer_); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c)) + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFileReader; +class Env; +class Statistics; +class Status; +class SystemClock; + +/** + * BlobLogSequentialReader is a general purpose log stream reader + * implementation. The actual job of reading from the device is implemented by + * the RandomAccessFileReader interface. + * + * Please see BlobLogWriter for details on the file and record layout. + */ + +class BlobLogSequentialReader { + public: + enum ReadLevel { + kReadHeader, + kReadHeaderKey, + kReadHeaderKeyBlob, + }; + + // Create a reader that will return log records from "*file_reader". + BlobLogSequentialReader(std::unique_ptr&& file_reader, + SystemClock* clock, Statistics* statistics); + + // No copying allowed + BlobLogSequentialReader(const BlobLogSequentialReader&) = delete; + BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete; + + ~BlobLogSequentialReader(); + + Status ReadHeader(BlobLogHeader* header); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. The contents filled in + // *record will only be valid until the next mutating operation on this + // reader. + // If blob_offset is non-null, return offset of the blob through it. 
+ Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader, + uint64_t* blob_offset = nullptr); + + Status ReadFooter(BlobLogFooter* footer); + + void ResetNextByte() { next_byte_ = 0; } + + uint64_t GetNextByte() const { return next_byte_; } + + private: + Status ReadSlice(uint64_t size, Slice* slice, char* buf); + + const std::unique_ptr file_; + SystemClock* clock_; + + Statistics* statistics_; + + Slice buffer_; + char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize, + BlobLogRecord::kHeaderSize)]; + + // which byte to read next + uint64_t next_byte_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#undef MAX_HEADER_SIZE \ No newline at end of file diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,172 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_log_writer.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/writable_file_writer.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, + SystemClock* clock, Statistics* statistics, + uint64_t log_number, bool use_fs, bool do_flush, + uint64_t boffset) + : dest_(std::move(dest)), + clock_(clock), + statistics_(statistics), + log_number_(log_number), + block_offset_(boffset), + use_fsync_(use_fs), + do_flush_(do_flush), + last_elem_type_(kEtNone) {} + +BlobLogWriter::~BlobLogWriter() = default; + +Status BlobLogWriter::Sync() { + TEST_SYNC_POINT("BlobLogWriter::Sync"); + + StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); + Status s = dest_->Sync(use_fsync_); + RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + return s; +} + +Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { + assert(block_offset_ == 0); + assert(last_elem_type_ == kEtNone); + std::string str; + header.EncodeTo(&str); + + Status s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + if (do_flush_) { + s = dest_->Flush(); + } + } + last_elem_type_ = kEtFileHdr; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogHeader::kSize); + return s; +} + +Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, + std::string* checksum_method, + std::string* checksum_value) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string str; + footer.EncodeTo(&str); + + Status s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + + s = Sync(); + + if (s.ok()) { + s = dest_->Close(); + + if (s.ok()) { + assert(!!checksum_method == !!checksum_value); + + if (checksum_method) { + 
assert(checksum_method->empty()); + + std::string method = dest_->GetFileChecksumFuncName(); + if (method != kUnknownFileChecksumFuncName) { + *checksum_method = std::move(method); + } + } + if (checksum_value) { + assert(checksum_value->empty()); + + std::string value = dest_->GetFileChecksum(); + if (value != kUnknownFileChecksum) { + *checksum_value = std::move(value); + } + } + } + } + + dest_.reset(); + } + + last_elem_type_ = kEtFileFooter; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogFooter::kSize); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, expiration); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t* key_offset, uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, 0); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration) { + BlobLogRecord record; + record.key = key; + record.value = val; + record.expiration = expiration; + record.EncodeHeaderTo(buf); +} + +Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, + const Slice& key, const Slice& val, + uint64_t* key_offset, + uint64_t* blob_offset) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); + Status s = dest_->Append(Slice(headerbuf)); + if (s.ok()) { + s = dest_->Append(key); + } + if (s.ok()) { + s = dest_->Append(val); + } + if (do_flush_ && s.ok()) { + s = 
dest_->Flush(); + } + + *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; + *blob_offset = *key_offset + key.size(); + block_offset_ = *blob_offset + val.size(); + last_elem_type_ = kEtRecord; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogRecord::kHeaderSize + key.size() + val.size()); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriter; +class SystemClock; +/** + * BlobLogWriter is the blob log stream writer. It provides an append-only + * abstraction for writing blob data. + * + * + * Look at blob_db_format.h to see the details of the record formats. + */ + +class BlobLogWriter { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this BlobLogWriter is in use. 
+ BlobLogWriter(std::unique_ptr&& dest, SystemClock* clock, + Statistics* statistics, uint64_t log_number, bool use_fsync, + bool do_flush, uint64_t boffset = 0); + // No copying allowed + BlobLogWriter(const BlobLogWriter&) = delete; + BlobLogWriter& operator=(const BlobLogWriter&) = delete; + + ~BlobLogWriter(); + + static void ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration); + + Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, + uint64_t* blob_offset); + + Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration, + uint64_t* key_offset, uint64_t* blob_offset); + + Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, + const Slice& val, uint64_t* key_offset, + uint64_t* blob_offset); + + Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method, + std::string* checksum_value); + + Status WriteHeader(BlobLogHeader& header); + + WritableFileWriter* file() { return dest_.get(); } + + const WritableFileWriter* file() const { return dest_.get(); } + + uint64_t get_log_number() const { return log_number_; } + + Status Sync(); + + private: + std::unique_ptr dest_; + SystemClock* clock_; + Statistics* statistics_; + uint64_t log_number_; + uint64_t block_offset_; // Current offset in block + bool use_fsync_; + bool do_flush_; + + public: + enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter }; + ElemType last_elem_type_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1026 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobBasicTest : public DBTestBase { + protected: + DBBlobBasicTest() + : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {} +}; + +TEST_F(DBBlobBasicTest, GetBlob) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get(key), blob_value); + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches; however, the blob itself can only be + // read from the blob file, so the read should return Incomplete. + ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + + PinnableSlice result; + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); +} + +TEST_F(DBBlobBasicTest, MultiGetBlobs) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. 
+ constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + std::array keys{{first_key, second_key, third_key}}; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. 
+ read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) { + Options options = GetDefaultOptions(); + + // First, create an external SST file ["b"]. + const std::string file_path = dbname_ + "/test.sst"; + { + SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions()); + Status s = sst_file_writer.Open(file_path); + ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put("b", "b_value")); + ASSERT_OK(sst_file_writer.Finish()); + } + + options.enable_blob_files = true; + options.min_blob_size = 1000; + options.use_direct_reads = true; + options.allow_ingest_behind = true; + + // Open DB with fixed-prefix sst-partitioner so that compaction will cut + // new table file when encountering a new key whose 1-byte prefix changes. + constexpr size_t key_len = 1; + options.sst_partitioner_factory = + NewSstPartitionerFixedPrefixFactory(key_len); + + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + ROCKSDB_GTEST_SKIP("This test requires direct IO support"); + return; + } + ASSERT_OK(s); + + constexpr size_t num_keys = 3; + constexpr size_t blob_size = 3000; + + constexpr char first_key[] = "a"; + const std::string first_blob(blob_size, 'a'); + ASSERT_OK(Put(first_key, first_blob)); + + constexpr char second_key[] = "b"; + const std::string second_blob(2 * blob_size, 'b'); + ASSERT_OK(Put(second_key, second_blob)); + + constexpr char third_key[] = "d"; + const std::string third_blob(blob_size, 'd'); + ASSERT_OK(Put(third_key, third_blob)); + + // first_blob, second_blob and third_blob in the same blob file. 
+ // SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| + // | | | ^ ^ ^ + // | | | | | | + // | | +---------|-------|--------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + constexpr char fourth_key[] = "c"; + const std::string fourth_blob(blob_size, 'c'); + ASSERT_OK(Put(fourth_key, fourth_blob)); + // fourth_blob in another blob file. + // SST Blob file SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| ["c"] |'cccc'| + // | | | ^ ^ ^ | ^ + // | | | | | | | | + // | | +---------|-------|--------+ +-------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + // Due to the above sst partitioner, we get 4 L1 files. The blob files are + // unchanged. + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + + { + // Ingest the external SST file into bottommost level. + std::vector ext_files{file_path}; + IngestExternalFileOptions opts; + opts.ingest_behind = true; + ASSERT_OK( + db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts)); + } + + // Now the database becomes as follows. + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + // + // L6 ["b"] + + { + // Compact ["b"] to bottommost level. 
+ Slice begin = Slice(second_key); + Slice end = Slice(second_key); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, &begin, &end)); + } + + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["c"] | | ["d"] | + // | | | | | + // | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------|-----------------+ + // | + // L6 ["b"] + ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1)); + ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6)); + + bool called = false; + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) { + auto* aligned_reqs = static_cast*>(arg); + assert(aligned_reqs); + ASSERT_EQ(1, aligned_reqs->size()); + called = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::array keys{{first_key, third_key, second_key}}; + + { + std::array values; + std::array statuses; + + // The MultiGet(), when constructing the KeyContexts, will process the keys + // in such order: a, d, b. The reason is that ["a"] and ["d"] are in L1, + // while ["b"] resides in L6. + // Consequently, the original FSReadRequest list prepared by + // Version::MultiGetblob() will be for "a", "d" and "b". It is unsorted as + // follows: + // + // ["a", offset=30, len=3033], + // ["d", offset=9096, len=3033], + // ["b", offset=3063, len=6033] + // + // If we do not sort them before calling MultiRead() in DirectIO, then the + // underlying IO merging logic will yield two requests. + // + // [offset=0, len=4096] (for "a") + // [offset=0, len=12288] (result of merging the request for "d" and "b") + // + // We need to sort them in Version::MultiGetBlob() so that the underlying + // IO merging logic in DirectIO mode works as expected. 
The correct + // behavior will be one aligned request: + // + // [offset=0, len=12288] + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(called); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_blob); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], third_blob); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], second_blob); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t kNumBlobFiles = 3; + constexpr size_t kNumBlobsPerFile = 3; + constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles; + + std::vector key_strs; + std::vector value_strs; + for (size_t i = 0; i < kNumBlobFiles; ++i) { + for (size_t j = 0; j < kNumBlobsPerFile; ++j) { + std::string key = "key" + std::to_string(i) + "_" + std::to_string(j); + std::string value = + "value_as_blob" + std::to_string(i) + "_" + std::to_string(j); + ASSERT_OK(Put(key, value)); + key_strs.push_back(key); + value_strs.push_back(value); + } + ASSERT_OK(Flush()); + } + assert(key_strs.size() == kNumKeys); + std::array keys; + for (size_t i = 0; i < keys.size(); ++i) { + keys[i] = key_strs[i]; + } + std::array values; + std::array statuses; + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a corrupt blob index. 
+ const std::string blob_index("foobar"); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + + DestroyAndReopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array key_strs; + std::array value_strs; + std::array keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + key_strs[i] = "foo" + std::to_string(i); + value_strs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_strs[i], value_strs[i])); + keys[i] = key_strs[i]; + } + + constexpr char key[] = "key"; + { + // Fake a corrupt blob index. + const std::string blob_index("foobar"); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + keys[kNumOfKeys] = Slice(static_cast(key), sizeof(key) - 1); + } + + ASSERT_OK(Flush()); + + std::array values; + std::array statuses; + db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/false); + for (size_t i = 0; i < kNumOfKeys + 1; ++i) { + if (i != kNumOfKeys) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("blob_value" + std::to_string(i), values[i]); + } else { + ASSERT_TRUE(statuses[i].IsCorruption()); + } + } +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array key_bufs; + std::array value_bufs; + std::array keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + 
key_bufs[i] = "foo" + std::to_string(i); + value_bufs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_bufs[i], value_bufs[i])); + keys[i] = key_bufs[i]; + } + ASSERT_OK(Flush()); + + std::array values; + std::array statuses; + ReadOptions read_opts; + read_opts.value_size_soft_limit = 1; + db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/true); + for (const auto& s : statuses) { + ASSERT_TRUE(s.IsAborted()); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a blob index referencing a non-existent blob file. 
+ std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, GenerateIOTracing) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + std::string trace_file = dbname_ + "/io_trace_file"; + + Reopen(options); + { + // Create IO trace file + std::unique_ptr trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer)); + ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer))); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + ASSERT_EQ(Get(key), blob_value); + + ASSERT_OK(db_->EndIOTrace()); + ASSERT_OK(env_->FileExists(trace_file)); + } + { + // Parse trace file to check file operations related to blob files are + // recorded. + std::unique_ptr trace_reader; + ASSERT_OK( + NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader)); + IOTraceReader reader(std::move(trace_reader)); + + IOTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, static_cast(header.rocksdb_major_version)); + ASSERT_EQ(kMinorVersion, static_cast(header.rocksdb_minor_version)); + + // Read records. 
+ int blob_files_op_count = 0; + Status status; + while (true) { + IOTraceRecord record; + status = reader.ReadIOOp(&record); + if (!status.ok()) { + break; + } + if (record.file_name.find("blob") != std::string::npos) { + blob_files_op_count++; + } + } + // Assuming blob files will have Append, Close and then Read operations. + ASSERT_GT(blob_files_op_count, 2); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + Reopen(options); + + ASSERT_OK(dbfull()->DisableFileDeletions()); + constexpr int kNumTableFiles = 2; + for (int i = 0; i < kNumTableFiles; ++i) { + for (char ch = 'a'; ch != 'c'; ++ch) { + std::string key(1, ch); + ASSERT_OK(Put(key, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + Close(); + + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + std::string blob_file_path; + uint64_t max_blob_file_num = kInvalidBlobFileNumber; + for (const auto& fname : files) { + uint64_t file_num = 0; + FileType type; + if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) && + type == kBlobFile) { + if (file_num > max_blob_file_num) { + max_blob_file_num = file_num; + blob_file_path = dbname_ + "/" + fname; + } + } + } + ASSERT_OK(env_->DeleteFile(blob_file_path)); + + options.best_efforts_recovery = true; + Reopen(options); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "a", &value)); + ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value); +} + +TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v2")); + ASSERT_OK(Flush()); + 
ASSERT_OK(Merge("Key1", "v3")); + ASSERT_OK(Flush()); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value)); + ASSERT_EQ(Get("Key1"), "v1,v2,v3"); +} + +TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { + constexpr size_t num_keys = 3; + + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key0", "v0_0")); + ASSERT_OK(Put("Key1", "v1_0")); + ASSERT_OK(Put("Key2", "v2_0")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_1")); + ASSERT_OK(Merge("Key1", "v1_1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_2")); + ASSERT_OK(Flush()); + + std::array keys{{"Key0", "Key1", "Key2"}}; + std::array values; + std::array statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], "v1_0,v1_1"); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], "v2_0"); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, Properties) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key1[] = "key1"; + constexpr size_t key1_size = sizeof(key1) - 1; + + constexpr char key2[] = "key2"; + constexpr size_t key2_size = sizeof(key2) - 1; + + constexpr char key3[] = "key3"; + constexpr size_t key3_size = sizeof(key3) - 1; + + constexpr char blob[] = "0000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + ASSERT_OK(Put(key1, blob)); + ASSERT_OK(Put(key2, blob)); + ASSERT_OK(Flush()); + + constexpr size_t first_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + blob_size + + 
BlobLogFooter::kSize; + + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr size_t second_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size + + BlobLogFooter::kSize; + + constexpr size_t total_expected_size = + first_blob_file_expected_size + second_blob_file_expected_size; + + // Number of blob files + uint64_t num_blob_files = 0; + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files)); + ASSERT_EQ(num_blob_files, 2); + + // Total size of live blob files + uint64_t live_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize, + &live_blob_file_size)); + ASSERT_EQ(live_blob_file_size, total_expected_size); + + // Total size of all blob files across all versions + // Note: this should be the same as above since we only have one + // version at this point. + uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, total_expected_size); + + // Delete key2 to create some garbage + ASSERT_OK(Delete(key2)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + constexpr size_t expected_garbage_size = + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + blob_size; + + // Blob file stats + std::string blob_stats; + ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats)); + + std::ostringstream oss; + oss << "Number of blob files: 2\nTotal size of blob files: " + << total_expected_size + << "\nTotal size of garbage in blob files: " << expected_garbage_size + << '\n'; + + ASSERT_EQ(blob_stats, oss.str()); +} + +TEST_F(DBBlobBasicTest, PropertiesMultiVersion) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + 
Reopen(options); + + constexpr char key1[] = "key1"; + constexpr char key2[] = "key2"; + constexpr char key3[] = "key3"; + + constexpr size_t key_size = sizeof(key1) - 1; + static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2"); + static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3"); + + constexpr char blob[] = "0000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + ASSERT_OK(Put(key1, blob)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(key2, blob)); + ASSERT_OK(Flush()); + + // Create an iterator to keep the current version alive + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); + + // Note: the Delete and subsequent compaction results in the first blob file + // not making it to the final version. (It is still part of the previous + // version kept alive by the iterator though.) On the other hand, the Put + // results in a third blob file. + ASSERT_OK(Delete(key1)); + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + // Total size of all blob files across all versions: between the two versions, + // we should have three blob files of the same size with one blob each. + // The version kept alive by the iterator contains the first and the second + // blob file, while the final version contains the second and the third blob + // file. (The second blob file is thus shared by the two versions but should + // be counted only once.) 
+ uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, + 3 * (BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size + BlobLogFooter::kSize)); +} +#endif // !ROCKSDB_LITE + +class DBBlobBasicIOErrorTest : public DBBlobBasicTest, + public testing::WithParamInterface { + protected: + DBBlobBasicIOErrorTest() : sync_point_(GetParam()) { + fault_injection_env_.reset(new FaultInjectionTestEnv(env_)); + } + ~DBBlobBasicIOErrorTest() { Close(); } + + std::unique_ptr fault_injection_env_; + std::string sync_point_; +}; + +class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest { + public: + DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {} +}; + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::MultiGetBlob:ReadFromFile"})); + +TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) { + Options options; + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + std::array keys{{first_key, second_key}}; + std::array values; + std::array statuses; + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(statuses[0].IsIOError()); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char key1[] = "key1"; + constexpr char value1[] = "blob1"; + + ASSERT_OK(Put(key1, value1)); + ASSERT_OK(Flush()); + + constexpr char key2[] = "key2"; + constexpr char value2[] = "blob2"; + + ASSERT_OK(Put(key2, value2)); + ASSERT_OK(Flush()); + + std::array keys{{key1, key2}}; + std::array values; + std::array statuses; + + bool first_blob_file = true; + SyncPoint::GetInstance()->SetCallBack( + sync_point_, [&first_blob_file, this](void* /* arg */) { + if (first_blob_file) { + 
first_blob_file = false; + return; + } + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(statuses[0]); + ASSERT_EQ(value1, values[0]); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +namespace { + +class ReadBlobCompactionFilter : public CompactionFilter { + public: + ReadBlobCompactionFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.read.blob"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const override { + if (value_type != CompactionFilter::ValueType::kValue) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + return CompactionFilter::Decision::kChangeValue; + } +}; + +} // anonymous namespace + +TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + std::unique_ptr compaction_filter_guard( + new ReadBlobCompactionFilter); + options.compaction_filter = compaction_filter_guard.get(); + + DestroyAndReopen(options); + constexpr char key[] = "foo"; + constexpr char blob_value[] = "foo_blob_value"; + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + 
ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,718 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCompactionTest : public DBTestBase { + public: + explicit DBBlobCompactionTest() + : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {} + +#ifndef ROCKSDB_LITE + const std::vector& GetCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } +#endif // ROCKSDB_LITE +}; + +namespace { + +class FilterByKeyLength : public CompactionFilter { + public: + explicit FilterByKeyLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.key.length"; + } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class BadBlobCompactionFilter : public CompactionFilter { + public: + explicit BadBlobCompactionFilter(std::string prefix, + CompactionFilter::Decision filter_by_key, + CompactionFilter::Decision filter_v2) + : prefix_(std::move(prefix)), + filter_blob_by_key_(filter_by_key), + filter_v2_(filter_v2) {} + const char* Name() const override { return "rocksdb.compaction.filter.bad"; } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() >= prefix_.size() && + 
0 == strncmp(prefix_.data(), key.data(), prefix_.size())) { + return CompactionFilter::Decision::kUndetermined; + } + return filter_blob_by_key_; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return filter_v2_; + } + + private: + const std::string prefix_; + const CompactionFilter::Decision filter_blob_by_key_; + const CompactionFilter::Decision filter_v2_; +}; + +class ValueBlindWriteFilter : public CompactionFilter { + public: + explicit ValueBlindWriteFilter(std::string new_val) + : new_value_(std::move(new_val)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.blind.write"; + } + CompactionFilter::Decision FilterBlobByKey( + int level, const Slice& key, std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string new_value_; +}; + +CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey( + int /*level*/, const Slice& /*key*/, std::string* new_value, + std::string* /*skip_until*/) const { + assert(new_value); + new_value->assign(new_value_); + return CompactionFilter::Decision::kChangeValue; +} + +class ValueMutationFilter : public CompactionFilter { + public: + explicit ValueMutationFilter(std::string padding) + : padding_(std::move(padding)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.value.mutation"; + } + CompactionFilter::Decision FilterV2(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string padding_; +}; + +CompactionFilter::Decision ValueMutationFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + 
assert(CompactionFilter::ValueType::kBlobIndex != value_type); + if (CompactionFilter::ValueType::kValue != value_type) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + new_value->append(padding_); + return CompactionFilter::Decision::kChangeValue; +} + +class AlwaysKeepFilter : public CompactionFilter { + public: + explicit AlwaysKeepFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.always.keep"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return CompactionFilter::Decision::kKeep; + } +}; + +class SkipUntilFilter : public CompactionFilter { + public: + explicit SkipUntilFilter(std::string skip_until) + : skip_until_(std::move(skip_until)) {} + + const char* Name() const override { + return "rocksdb.compaction.filter.skip.until"; + } + + CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */, + ValueType /* value_type */, + const Slice& /* existing_value */, + std::string* /* new_value */, + std::string* skip_until) const override { + assert(skip_until); + *skip_until = skip_until_; + + return CompactionFilter::Decision::kRemoveAndSkipUntil; + } + + private: + std::string skip_until_; +}; + +} // anonymous namespace + +class DBBlobBadCompactionFilterTest + : public DBBlobCompactionTest, + public testing::WithParamInterface< + std::tuple> { + public: + explicit DBBlobBadCompactionFilterTest() + : compaction_filter_guard_(new BadBlobCompactionFilter( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()))) {} + + protected: + std::unique_ptr compaction_filter_guard_; +}; + +INSTANTIATE_TEST_CASE_P( + BadCompactionFilter, DBBlobBadCompactionFilterTest, + testing::Combine( + testing::Values("a"), + 
testing::Values(CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError), + testing::Values(CompactionFilter::Decision::kUndetermined, + CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError))); + +TEST_F(DBBlobCompactionTest, FilterByKeyLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr size_t kKeyLength = 2; + std::unique_ptr compaction_filter_guard( + new FilterByKeyLength(kKeyLength)); + options.compaction_filter = compaction_filter_guard.get(); + + constexpr char short_key[] = "a"; + constexpr char long_key[] = "abc"; + constexpr char blob_value[] = "value"; + + DestroyAndReopen(options); + ASSERT_OK(Put(short_key, blob_value)); + ASSERT_OK(Put(long_key, blob_value)); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound()); + value.clear(); + ASSERT_OK(db_->Get(ReadOptions(), long_key, &value)); + ASSERT_EQ("value", value); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove solely based on key; + // this involves neither reading nor writing blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, BlindWriteFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr char new_blob_value[] = "new_blob_value"; + std::unique_ptr compaction_filter_guard( + new ValueBlindWriteFilter(new_blob_value)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const 
std::vector keys = {"a", "b", "c"}; + const std::vector values = {"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& key : keys) { + ASSERT_EQ(new_blob_value, Get(key)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter unconditionally changes value in FilterBlobByKey; + // this involves writing but not reading blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, SkipUntilFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + std::unique_ptr compaction_filter_guard( + new SkipUntilFilter("z")); + options.compaction_filter = compaction_filter_guard.get(); + + Reopen(options); + + const std::vector keys{"a", "b", "c"}; + const std::vector values{"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + + ASSERT_OK(Flush()); + + int process_in_flow_called = 0; + + SyncPoint::GetInstance()->SetCallBack( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow", + [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr, + /* end */ nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + for (const auto& key : keys) { + ASSERT_EQ(Get(key), "NOT_FOUND"); + } + + // Make sure SkipUntil was performed using iteration rather than Seek + ASSERT_EQ(process_in_flow_called, 
keys.size()); + + Close(); +} + +TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.compaction_filter = compaction_filter_guard_.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); + + DestroyAndReopen(options); + std::string key(std::get<0>(GetParam())); + ASSERT_OK(Put(key, "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + // Fake an inlined TTL blob index. 
+ std::string blob_index; + constexpr uint64_t expiration = 1234567890; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + constexpr char padding[] = "_delta"; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter(padding)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector> kvs = { + {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}}; + for (const auto& kv : kvs) { + ASSERT_OK(Put(kv.first, kv.second)); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& kv : kvs) { + ASSERT_EQ(kv.second + std::string(padding), Get(kv.first)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter changes the value using the previous value in FilterV2; + // this involves reading and writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + // Mock a corrupted blob index + constexpr char 
key[] = "key"; + std::string blob_idx("blob_idx"); + WriteBatch write_batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&write_batch, 0, key, blob_idx)); + ASSERT_OK(db_->Write(WriteOptions(), &write_batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new AlwaysKeepFilter()); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + std::vector blob_files = GetBlobFileNumbers(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(blob_files, GetBlobFileNumbers()); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides to keep the existing value in FilterV2; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, TrackGarbage) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + Reopen(options); + + // First table+blob file pair: 4 blobs with different keys + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + 
ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + // Second table+blob file pair: overwrite 2 existing keys + constexpr char new_first_value[] = "new_first_value"; + constexpr char new_second_value[] = "new_second_value"; + + ASSERT_OK(Put(first_key, new_first_value)); + ASSERT_OK(Put(second_key, new_second_value)); + ASSERT_OK(Flush()); + + // Compact them together. The first blob file should have 2 garbage blobs + // corresponding to the 2 overwritten keys. + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 2); + + { + auto it = blob_files.begin(); + const auto& meta = it->second; + assert(meta); + + constexpr uint64_t first_expected_bytes = + sizeof(first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t second_expected_bytes = + sizeof(second_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + constexpr uint64_t third_expected_bytes = + sizeof(third_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) - + 1); + constexpr uint64_t fourth_expected_bytes = + sizeof(fourth_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) - + 1); + + ASSERT_EQ(meta->GetTotalBlobCount(), 4); + 
ASSERT_EQ(meta->GetTotalBlobBytes(), + first_expected_bytes + second_expected_bytes + + third_expected_bytes + fourth_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 2); + ASSERT_EQ(meta->GetGarbageBlobBytes(), + first_expected_bytes + second_expected_bytes); + } + + { + auto it = blob_files.rbegin(); + const auto& meta = it->second; + assert(meta); + + constexpr uint64_t new_first_expected_bytes = + sizeof(new_first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t new_second_expected_bytes = + sizeof(new_second_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + + ASSERT_EQ(meta->GetTotalBlobCount(), 2); + ASSERT_EQ(meta->GetTotalBlobBytes(), + new_first_expected_bytes + new_second_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 0); + ASSERT_EQ(meta->GetGarbageBlobBytes(), 0); + } +} + +TEST_F(DBBlobCompactionTest, MergeBlobWithBase) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + ASSERT_OK(Put("Key1", "v1_1")); + ASSERT_OK(Put("Key2", "v2_1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_2")); + ASSERT_OK(Merge("Key2", "v2_2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_3")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3"); + ASSERT_EQ(Get("Key2"), "v2_1,v2_2"); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.blob_compaction_readahead_size = 1 << 10; + 
options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "pie")); + ASSERT_OK(Put("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "pie"); + ASSERT_EQ(Get("foo"), "baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) { + Options options = GetDefaultOptions(); + + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("pie")); + + options.compaction_filter = compaction_filter_guard.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.blob_compaction_readahead_size = 1 << 10; + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "limepie"); + ASSERT_EQ(Get("foo"), "barpie"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + 
+TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.blob_compaction_readahead_size = 1 << 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("key", "pie")); + ASSERT_OK(Merge("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "lime,pie"); + ASSERT_EQ(Get("foo"), "bar,baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCorruptionTest : public DBTestBase { + protected: + DBBlobCorruptionTest() + : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {} + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + uint64_t picked_number = kInvalidBlobFileNumber; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == filetype && + number > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(options); + + ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1"))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2"))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + Close(); + + Corrupt(kBlobFile, 0, 2); + + ASSERT_OK(TryReopen(options)); + + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + const Status* s = static_cast(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + 
SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,572 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/column_family.h" +#include "db/db_iter.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// kTypeBlobIndex is a value type used by BlobDB only. 
The base rocksdb +// should accept the value type on write, and report not supported value +// for reads, unless caller request for it explicitly. The base rocksdb +// doesn't understand format of actual blob index (the value). +class DBBlobIndexTest : public DBTestBase { + public: + enum Tier { + kMemtable = 0, + kImmutableMemtables = 1, + kL0SstFile = 2, + kLnSstFile = 3, + }; + const std::vector kAllTiers = {Tier::kMemtable, + Tier::kImmutableMemtables, + Tier::kL0SstFile, Tier::kLnSstFile}; + + DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {} + + ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } + + ColumnFamilyData* cfd() { + return static_cast_with_check(cfh())->cfd(); + } + + Status PutBlobIndex(WriteBatch* batch, const Slice& key, + const Slice& blob_index) { + return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key, + blob_index); + } + + Status Write(WriteBatch* batch) { + return dbfull()->Write(WriteOptions(), batch); + } + + std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr, + const Snapshot* snapshot = nullptr) { + ReadOptions read_options; + read_options.snapshot = snapshot; + PinnableSlice value; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); + if (s.IsNotFound()) { + return "NOT_FOUND"; + } + if (s.IsCorruption()) { + return "CORRUPTION"; + } + if (s.IsNotSupported()) { + return "NOT_SUPPORTED"; + } + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } + + std::string GetBlobIndex(const Slice& key, + const Snapshot* snapshot = nullptr) { + bool is_blob_index = false; + std::string value = GetImpl(key, &is_blob_index, snapshot); + if (!is_blob_index) { + return "NOT_BLOB"; + } + return value; + } + + ArenaWrappedDBIter* GetBlobIterator() { + return 
dbfull()->NewIteratorImpl( + ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), + nullptr /*read_callback*/, true /*expose_blob_index*/); + } + + Options GetTestOptions() { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.num_levels = 2; + options.disable_auto_compactions = true; + // Disable auto flushes. + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + return options; + } + + void MoveDataTo(Tier tier) { + switch (tier) { + case Tier::kMemtable: + break; + case Tier::kImmutableMemtables: + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + break; + case Tier::kL0SstFile: + ASSERT_OK(Flush()); + break; + case Tier::kLnSstFile: + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "dummy")); + ASSERT_OK(Put("z", "dummy")); + ASSERT_OK(Flush()); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + break; + } + } +}; + +// Should be able to write kTypeBlobIndex to memtables and SST files. +TEST_F(DBBlobIndexTest, Write) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + WriteBatch batch; + ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index)); + ASSERT_OK(Write(&batch)); + } + MoveDataTo(tier); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + ASSERT_EQ("blob" + index, GetBlobIndex("key" + index)); + } + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should be able to return blob index if is_blob_index is +// provided, otherwise it should return Status::NotSupported (when reading from +// memtable) or Status::Corruption (when reading from SST). 
Reading from SST +// returns Corruption because we can't differentiate between the application +// accidentally opening the base DB of a stacked BlobDB and actual corruption +// when using the integrated BlobDB. +TEST_F(DBBlobIndexTest, Get) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + ASSERT_OK(batch.Put("key", "value")); + ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index")); + ASSERT_OK(Write(&batch)); + MoveDataTo(tier); + // Verify normal value + bool is_blob_index = false; + PinnableSlice value; + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("value", GetImpl("key")); + ASSERT_EQ("value", GetImpl("key", &is_blob_index)); + ASSERT_FALSE(is_blob_index); + // Verify blob index + if (tier <= kImmutableMemtables) { + ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + } else { + ASSERT_TRUE(Get("blob_key", &value).IsCorruption()); + ASSERT_EQ("CORRUPTION", GetImpl("blob_key")); + } + ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); + ASSERT_TRUE(is_blob_index); + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should NOT return Status::NotSupported/Status::Corruption +// if blob index is updated with a normal value. See the test case above for +// more details. +TEST_F(DBBlobIndexTest, Updated) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + for (int i = 0; i < 10; i++) { + ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index")); + } + ASSERT_OK(Write(&batch)); + // Avoid blob values from being purged. 
+ const Snapshot* snapshot = dbfull()->GetSnapshot(); + ASSERT_OK(Put("key1", "new_value")); + ASSERT_OK(Merge("key2", "a")); + ASSERT_OK(Merge("key2", "b")); + ASSERT_OK(Merge("key2", "c")); + ASSERT_OK(Delete("key3")); + ASSERT_OK(SingleDelete("key4")); + ASSERT_OK(Delete("key5")); + ASSERT_OK(Merge("key5", "a")); + ASSERT_OK(Merge("key5", "b")); + ASSERT_OK(Merge("key5", "c")); + ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9")); + MoveDataTo(tier); + for (int i = 0; i < 10; i++) { + ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); + } + ASSERT_EQ("new_value", Get("key1")); + if (tier <= kImmutableMemtables) { + ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + } else { + ASSERT_EQ("CORRUPTION", GetImpl("key2")); + } + ASSERT_EQ("NOT_FOUND", Get("key3")); + ASSERT_EQ("NOT_FOUND", Get("key4")); + ASSERT_EQ("a,b,c", GetImpl("key5")); + for (int i = 6; i < 9; i++) { + ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i))); + } + ASSERT_EQ("blob_index", GetBlobIndex("key9")); + dbfull()->ReleaseSnapshot(snapshot); + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. When a blob iterator is used, it should set the +// expose_blob_index flag for the underlying DBIter, and retrieve/return the +// corresponding blob value. If a regular DBIter is created (i.e. +// expose_blob_index is not set), it should return Status::Corruption. 
+TEST_F(DBBlobIndexTest, Iterate) { + const std::vector> data = { + /*00*/ {kTypeValue}, + /*01*/ {kTypeBlobIndex}, + /*02*/ {kTypeValue}, + /*03*/ {kTypeBlobIndex, kTypeValue}, + /*04*/ {kTypeValue}, + /*05*/ {kTypeValue, kTypeBlobIndex}, + /*06*/ {kTypeValue}, + /*07*/ {kTypeDeletion, kTypeBlobIndex}, + /*08*/ {kTypeValue}, + /*09*/ {kTypeSingleDeletion, kTypeBlobIndex}, + /*10*/ {kTypeValue}, + /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex}, + /*12*/ {kTypeValue}, + /*13*/ + {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex}, + /*14*/ {kTypeValue}, + /*15*/ {kTypeBlobIndex}, + /*16*/ {kTypeValue}, + }; + + auto get_key = [](int index) { + char buf[20]; + snprintf(buf, sizeof(buf), "%02d", index); + return "key" + std::string(buf); + }; + + auto get_value = [&](int index, int version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status::Code expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status().code()); + if (expected_status == Status::kOk) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto create_normal_iterator = [&]() -> Iterator* { + return dbfull()->NewIterator(ReadOptions()); + }; + + auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); }; + + auto check_is_blob = [&](bool is_blob) { + return [is_blob](Iterator* iterator) { + ASSERT_EQ(is_blob, + reinterpret_cast(iterator)->IsBlob()); + }; + }; + + auto verify = [&](int index, Status::Code expected_status, + const Slice& forward_value, const Slice& backward_value, + std::function create_iterator, + std::function extra_check = nullptr) { + // Seek + auto* iterator = create_iterator(); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, forward_value); + if 
(extra_check) { + extra_check(iterator); + } + delete iterator; + + // Next + iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // SeekForPrev + iterator = create_iterator(); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // Prev + iterator = create_iterator(); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + }; + + for (auto tier : {Tier::kMemtable} /*kAllTiers*/) { + // Avoid values from being purged. 
+ std::vector snapshots; + DestroyAndReopen(GetTestOptions()); + + // fill data + for (int i = 0; i < static_cast(data.size()); i++) { + for (int j = static_cast(data[i].size()) - 1; j >= 0; j--) { + std::string key = get_key(i); + std::string value = get_value(i, j); + WriteBatch batch; + switch (data[i][j]) { + case kTypeValue: + ASSERT_OK(Put(key, value)); + break; + case kTypeDeletion: + ASSERT_OK(Delete(key)); + break; + case kTypeSingleDeletion: + ASSERT_OK(SingleDelete(key)); + break; + case kTypeMerge: + ASSERT_OK(Merge(key, value)); + break; + case kTypeBlobIndex: + ASSERT_OK(PutBlobIndex(&batch, key, value)); + ASSERT_OK(Write(&batch)); + break; + default: + FAIL(); + }; + } + snapshots.push_back(dbfull()->GetSnapshot()); + } + ASSERT_OK( + dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16))); + snapshots.push_back(dbfull()->GetSnapshot()); + MoveDataTo(tier); + + // Normal iterator + verify(1, Status::kCorruption, "", "", create_normal_iterator); + verify(3, Status::kCorruption, "", "", create_normal_iterator); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_normal_iterator); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_normal_iterator); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_normal_iterator); + verify(11, Status::kCorruption, "", "", create_normal_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_normal_iterator); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_normal_iterator); + + // Iterator with blob support + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, 
check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); + +#ifndef ROCKSDB_LITE + // Iterator with blob support and using seek. + ASSERT_OK(dbfull()->SetOptions( + cfh(), {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); +#endif // !ROCKSDB_LITE + + for (auto* snapshot : 
snapshots) { + dbfull()->ReleaseSnapshot(snapshot); + } + } +} + +TEST_F(DBBlobIndexTest, IntegratedBlobIterate) { + const std::vector> data = { + /*00*/ {"Put"}, + /*01*/ {"Put", "Merge", "Merge", "Merge"}, + /*02*/ {"Put"}}; + + auto get_key = [](size_t index) { return ("key" + std::to_string(index)); }; + + auto get_value = [&](size_t index, size_t version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status()); + if (expected_status.ok()) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto verify = [&](size_t index, Status expected_status, + const Slice& expected_value) { + // Seek + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Next + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, expected_value); + } + // SeekForPrev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Prev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + 
check_iterator(iterator, expected_status, expected_value); + } + }; + + Options options = GetTestOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + DestroyAndReopen(options); + + // fill data + for (size_t i = 0; i < data.size(); i++) { + for (size_t j = 0; j < data[i].size(); j++) { + std::string key = get_key(i); + std::string value = get_value(i, j); + if (data[i][j] == "Put") { + ASSERT_OK(Put(key, value)); + ASSERT_OK(Flush()); + } else if (data[i][j] == "Merge") { + ASSERT_OK(Merge(key, value)); + ASSERT_OK(Flush()); + } + } + } + + std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," + + get_value(1, 2) + "," + get_value(1, 3); + Status expected_status; + verify(1, expected_status, expected_value); + +#ifndef ROCKSDB_LITE + // Test DBIter::FindValueForCurrentKeyUsingSeek flow. + ASSERT_OK(dbfull()->SetOptions(cfh(), + {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, expected_status, expected_value); +#endif // !ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/prefetch_buffer_collection.h" + +namespace ROCKSDB_NAMESPACE { + +FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer( + uint64_t file_number) { + auto& prefetch_buffer = prefetch_buffers_[file_number]; + if (!prefetch_buffer) { + prefetch_buffer.reset( + new FilePrefetchBuffer(readahead_size_, readahead_size_)); + } + + return prefetch_buffer.get(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "file/file_prefetch_buffer.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// A class that owns a collection of FilePrefetchBuffers using the file number +// as key. Used for implementing compaction readahead for blob files. Designed +// to be accessed by a single thread only: every (sub)compaction needs its own +// buffers since they are guaranteed to read different blobs from different +// positions even when reading the same file. 
+class PrefetchBufferCollection { + public: + explicit PrefetchBufferCollection(uint64_t readahead_size) + : readahead_size_(readahead_size) { + assert(readahead_size_ > 0); + } + + FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number); + + private: + uint64_t readahead_size_; + std::unordered_map> + prefetch_buffers_; // maps file number to prefetch buffer +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob_index.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob_index.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob_index.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,179 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -#pragma once -#ifndef ROCKSDB_LITE - -#include -#include - -#include "rocksdb/options.h" -#include "util/coding.h" -#include "util/string_util.h" - -namespace ROCKSDB_NAMESPACE { - -// BlobIndex is a pointer to the blob and metadata of the blob. The index is -// stored in base DB as ValueType::kTypeBlobIndex. 
-// There are three types of blob index: -// -// kInlinedTTL: -// +------+------------+---------------+ -// | type | expiration | value | -// +------+------------+---------------+ -// | char | varint64 | variable size | -// +------+------------+---------------+ -// -// kBlob: -// +------+-------------+----------+----------+-------------+ -// | type | file number | offset | size | compression | -// +------+-------------+----------+----------+-------------+ -// | char | varint64 | varint64 | varint64 | char | -// +------+-------------+----------+----------+-------------+ -// -// kBlobTTL: -// +------+------------+-------------+----------+----------+-------------+ -// | type | expiration | file number | offset | size | compression | -// +------+------------+-------------+----------+----------+-------------+ -// | char | varint64 | varint64 | varint64 | varint64 | char | -// +------+------------+-------------+----------+----------+-------------+ -// -// There isn't a kInlined (without TTL) type since we can store it as a plain -// value (i.e. ValueType::kTypeValue). 
-class BlobIndex { - public: - enum class Type : unsigned char { - kInlinedTTL = 0, - kBlob = 1, - kBlobTTL = 2, - kUnknown = 3, - }; - - BlobIndex() : type_(Type::kUnknown) {} - - bool IsInlined() const { return type_ == Type::kInlinedTTL; } - - bool HasTTL() const { - return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; - } - - uint64_t expiration() const { - assert(HasTTL()); - return expiration_; - } - - const Slice& value() const { - assert(IsInlined()); - return value_; - } - - uint64_t file_number() const { - assert(!IsInlined()); - return file_number_; - } - - uint64_t offset() const { - assert(!IsInlined()); - return offset_; - } - - uint64_t size() const { - assert(!IsInlined()); - return size_; - } - - Status DecodeFrom(Slice slice) { - static const std::string kErrorMessage = "Error while decoding blob index"; - assert(slice.size() > 0); - type_ = static_cast(*slice.data()); - if (type_ >= Type::kUnknown) { - return Status::Corruption( - kErrorMessage, - "Unknown blob index type: " + ToString(static_cast(type_))); - } - slice = Slice(slice.data() + 1, slice.size() - 1); - if (HasTTL()) { - if (!GetVarint64(&slice, &expiration_)) { - return Status::Corruption(kErrorMessage, "Corrupted expiration"); - } - } - if (IsInlined()) { - value_ = slice; - } else { - if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && - GetVarint64(&slice, &size_) && slice.size() == 1) { - compression_ = static_cast(*slice.data()); - } else { - return Status::Corruption(kErrorMessage, "Corrupted blob offset"); - } - } - return Status::OK(); - } - - std::string DebugString(bool output_hex) const { - std::ostringstream oss; - - if (IsInlined()) { - oss << "[inlined blob] value:" << value_.ToString(output_hex); - } else { - oss << "[blob ref] file:" << file_number_ << " offset:" << offset_ - << " size:" << size_; - } - - if (HasTTL()) { - oss << " exp:" << expiration_; - } - - return oss.str(); - } - - static void EncodeInlinedTTL(std::string* dst, 
uint64_t expiration, - const Slice& value) { - assert(dst != nullptr); - dst->clear(); - dst->reserve(1 + kMaxVarint64Length + value.size()); - dst->push_back(static_cast(Type::kInlinedTTL)); - PutVarint64(dst, expiration); - dst->append(value.data(), value.size()); - } - - static void EncodeBlob(std::string* dst, uint64_t file_number, - uint64_t offset, uint64_t size, - CompressionType compression) { - assert(dst != nullptr); - dst->clear(); - dst->reserve(kMaxVarint64Length * 3 + 2); - dst->push_back(static_cast(Type::kBlob)); - PutVarint64(dst, file_number); - PutVarint64(dst, offset); - PutVarint64(dst, size); - dst->push_back(static_cast(compression)); - } - - static void EncodeBlobTTL(std::string* dst, uint64_t expiration, - uint64_t file_number, uint64_t offset, - uint64_t size, CompressionType compression) { - assert(dst != nullptr); - dst->clear(); - dst->reserve(kMaxVarint64Length * 4 + 2); - dst->push_back(static_cast(Type::kBlobTTL)); - PutVarint64(dst, expiration); - PutVarint64(dst, file_number); - PutVarint64(dst, offset); - PutVarint64(dst, size); - dst->push_back(static_cast(compression)); - } - - private: - Type type_ = Type::kUnknown; - uint64_t expiration_ = 0; - Slice value_; - uint64_t file_number_ = 0; - uint64_t offset_ = 0; - uint64_t size_ = 0; - CompressionType compression_ = kNoCompression; -}; - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,19 +13,22 @@ #include #include +#include "db/blob/blob_file_builder.h" #include "db/compaction/compaction_iterator.h" -#include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" #include "db/merge_helper.h" +#include "db/output_validator.h" #include 
"db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/file_util.h" #include "file/filename.h" #include "file/read_write_util.h" #include "file/writable_file_writer.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/options_helper.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -41,125 +44,172 @@ class TableFactory; -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - int level, const bool skip_filters, const uint64_t creation_time, - const uint64_t oldest_key_time, const uint64_t target_file_size, - const uint64_t file_creation_time) { - assert((column_family_id == +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); - return ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, - skip_filters, column_family_name, level, - creation_time, oldest_key_time, target_file_size, - file_creation_time), - column_family_id, file); + tboptions.column_family_name.empty()); + return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file); } Status BuildTable( - const std::string& dbname, Env* env, FileSystem* fs, - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - 
TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, - FileMetaData* meta, const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, + FileMetaData* meta, std::vector* blob_file_additions, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - bool paranoid_file_checks, InternalStats* internal_stats, - TableFileCreationReason reason, EventLogger* event_logger, int job_id, - const Env::IOPriority io_priority, TableProperties* table_properties, - int level, const uint64_t creation_time, const uint64_t oldest_key_time, - Env::WriteLifeTimeHint write_hint, const uint64_t file_creation_time) { - assert((column_family_id == + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, + const std::shared_ptr& io_tracer, + BlobFileCreationReason blob_creation_reason, EventLogger* event_logger, + int job_id, const Env::IOPriority io_priority, + TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, + const std::string* full_history_ts_low, + BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries, + uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); + tboptions.column_family_name.empty()); + auto& mutable_cf_options = tboptions.moptions; + auto& ioptions = tboptions.ioptions; // Reports the IOStats for flush for 
every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; + OutputValidator output_validator( + tboptions.internal_comparator, + /*enable_order_check=*/ + mutable_cf_options.check_flush_compaction_key_order, + /*enable_hash=*/paranoid_file_checks); Status s; meta->fd.file_size = 0; iter->SeekToFirst(); std::unique_ptr range_del_agg( - new CompactionRangeDelAggregator(&internal_comparator, snapshots)); + new CompactionRangeDelAggregator(&tboptions.internal_comparator, + snapshots)); + uint64_t num_unfragmented_tombstones = 0; + uint64_t total_tombstone_payload_bytes = 0; for (auto& range_del_iter : range_del_iters) { + num_unfragmented_tombstones += + range_del_iter->num_unfragmented_tombstones(); + total_tombstone_payload_bytes += + range_del_iter->total_tombstone_payload_bytes(); range_del_agg->AddTombstones(std::move(range_del_iter)); } std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); + std::vector blob_file_paths; + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; #ifndef ROCKSDB_LITE - EventHelpers::NotifyTableFileCreationStarted( - ioptions.listeners, dbname, column_family_name, fname, job_id, reason); + EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, + tboptions.column_family_name, + fname, job_id, tboptions.reason); #endif // !ROCKSDB_LITE - TableProperties tp; + Env* env = db_options.env; + assert(env); + FileSystem* fs = db_options.fs.get(); + assert(fs); + TableProperties tp; if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + tboptions.reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = tboptions.column_family_id; + context.reason = 
tboptions.reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s.PermitUncheckedError(); + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + TableBuilder* builder; std::unique_ptr file_writer; - // Currently we only enable dictionary compression during compaction to the - // bottommost level. - CompressionOptions compression_opts_for_flush(compression_opts); - compression_opts_for_flush.max_dict_bytes = 0; - compression_opts_for_flush.zstd_max_train_bytes = 0; { std::unique_ptr file; #ifndef NDEBUG bool use_direct_writes = file_options.use_direct_writes; TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); #endif // !NDEBUG - s = NewWritableFile(fs, fname, &file, file_options); + IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); + assert(s.ok()); + s = io_s; + if (io_status->ok()) { + *io_status = io_s; + } if (!s.ok()) { EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s); + event_logger, ioptions.listeners, dbname, + tboptions.column_family_name, fname, job_id, meta->fd, + kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum, + file_checksum_func_name); return s; } + FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; file->SetIOPriority(io_priority); file->SetWriteLifeTimeHint(write_hint); - file_writer.reset(new WritableFileWriter( - std::move(file), fname, file_options, env, ioptions.statistics, - ioptions.listeners, ioptions.sst_file_checksum_func)); + std::move(file), fname, file_options, ioptions.clock, io_tracer, + ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + builder = 
NewTableBuilder(tboptions, file_writer.get()); + } - builder = NewTableBuilder( - ioptions, mutable_cf_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, - column_family_name, file_writer.get(), compression, - sample_for_compression, compression_opts_for_flush, level, - false /* skip_filters */, creation_time, oldest_key_time, - 0 /*target_file_size*/, file_creation_time); - } - - MergeHelper merge(env, internal_comparator.user_comparator(), - ioptions.merge_operator, nullptr, ioptions.info_log, - true /* internal key corruption is not ok */, - snapshots.empty() ? 0 : snapshots.back(), - snapshot_checker); + MergeHelper merge( + env, tboptions.internal_comparator.user_comparator(), + ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger, + true /* internal key corruption is not ok */, + snapshots.empty() ? 0 : snapshots.back(), snapshot_checker); + + std::unique_ptr blob_file_builder( + (mutable_cf_options.enable_blob_files && blob_file_additions) + ? 
new BlobFileBuilder( + versions, fs, &ioptions, &mutable_cf_options, &file_options, + job_id, tboptions.column_family_id, + tboptions.column_family_name, io_priority, write_hint, + io_tracer, blob_callback, blob_creation_reason, + &blob_file_paths, blob_file_additions) + : nullptr); CompactionIterator c_iter( - iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber, - &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, - ShouldReportDetailedTime(env, ioptions.statistics), - true /* internal key corruption is not ok */, range_del_agg.get()); + iter, tboptions.internal_comparator.user_comparator(), &merge, + kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, + snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + blob_file_builder.get(), ioptions.allow_data_in_errors, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, + /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, db_options.info_log, + full_history_ts_low); + c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); const ParsedInternalKey& ikey = c_iter.ikey(); + // Generate a rolling 64-bit hash of the key and values + // Note : + // Here "key" integrates 'sequence_number'+'kType'+'user key'. 
+ s = output_validator.Add(key, value); + if (!s.ok()) { + break; + } builder->Add(key, value); meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); @@ -170,26 +220,39 @@ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); } } + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } - auto range_del_it = range_del_agg->NewIterator(); - for (range_del_it->SeekToFirst(); range_del_it->Valid(); - range_del_it->Next()) { - auto tombstone = range_del_it->Tombstone(); - auto kv = tombstone.Serialize(); - builder->Add(kv.first.Encode(), kv.second); - meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), - tombstone.seq_, internal_comparator); + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + auto kv = tombstone.Serialize(); + builder->Add(kv.first.Encode(), kv.second); + meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), + tombstone.seq_, + tboptions.internal_comparator); + } } - // Finish and check for builder errors - tp = builder->GetTableProperties(); - bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0; - s = c_iter.status(); + TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); + const bool empty = builder->IsEmpty(); + if (num_input_entries != nullptr) { + *num_input_entries = + c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + } if (!s.ok() || empty) { builder->Abandon(); } else { s = builder->Finish(); } + if (io_status->ok()) { + *io_status = builder->io_status(); + } if (s.ok() && !empty) { uint64_t file_size = builder->FileSize(); @@ -197,24 +260,64 @@ meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); tp = builder->GetTableProperties(); // refresh now that builder is finished + if (memtable_payload_bytes != nullptr && 
+ memtable_garbage_bytes != nullptr) { + const CompactionIterationStats& ci_stats = c_iter.iter_stats(); + uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes + + ci_stats.total_input_raw_value_bytes + + total_tombstone_payload_bytes; + uint64_t total_payload_bytes_written = + (tp.raw_key_size + tp.raw_value_size); + // Prevent underflow, which may still happen at this point + // since we only support inserts, deletes, and deleteRanges. + if (total_payload_bytes_written <= total_payload_bytes) { + *memtable_payload_bytes = total_payload_bytes; + *memtable_garbage_bytes = + total_payload_bytes - total_payload_bytes_written; + } else { + *memtable_payload_bytes = 0; + *memtable_garbage_bytes = 0; + } + } if (table_properties) { *table_properties = tp; } - // Add the checksum information to file metadata. - meta->file_checksum = builder->GetFileChecksum(); - meta->file_checksum_func_name = builder->GetFileChecksumFuncName(); } delete builder; // Finish and check for file errors + TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); if (s.ok() && !empty) { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); - s = file_writer->Sync(ioptions.use_fsync); + StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); + *io_status = file_writer->Sync(ioptions.use_fsync); } - if (s.ok() && !empty) { - s = file_writer->Close(); + TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); + if (s.ok() && io_status->ok() && !empty) { + *io_status = file_writer->Close(); + } + if (s.ok() && io_status->ok() && !empty) { + // Add the checksum information to file metadata. 
+ meta->file_checksum = file_writer->GetFileChecksum(); + meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName(); + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; + } + + if (s.ok()) { + s = *io_status; + } + + if (blob_file_builder) { + if (s.ok()) { + s = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(s); + } + blob_file_builder.reset(); } + // TODO Also check the IO status when create the Iterator. + if (s.ok() && !empty) { // Verify that the table is usable // We set for_compaction to false and don't OptimizeForCompactionTableRead @@ -222,20 +325,32 @@ // No matter whether use_direct_io_for_flush_and_compaction is true, // we will regrad this verification as user reads since the goal is // to cache it here for further user reads + ReadOptions read_options; std::unique_ptr it(table_cache->NewIterator( - ReadOptions(), file_options, internal_comparator, *meta, - nullptr /* range_del_agg */, - mutable_cf_options.prefix_extractor.get(), nullptr, + read_options, file_options, tboptions.internal_comparator, *meta, + nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, + nullptr, (internal_stats == nullptr) ? 
nullptr : internal_stats->GetFileReadHist(0), TableReaderCaller::kFlush, /*arena=*/nullptr, - /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key*/ nullptr)); + /*skip_filter=*/false, tboptions.level_at_creation, + MaxFileSizeForL0MetaPin(mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr, + /*allow_unprepared_value*/ false)); s = it->status(); if (s.ok() && paranoid_file_checks) { + OutputValidator file_validator(tboptions.internal_comparator, + /*enable_order_check=*/true, + /*enable_hash=*/true); for (it->SeekToFirst(); it->Valid(); it->Next()) { + // Generate a rolling 64-bit hash of the key and values + file_validator.Add(it->key(), it->value()).PermitUncheckedError(); } s = it->status(); + if (s.ok() && !output_validator.CompareValidator(file_validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } } } } @@ -246,16 +361,38 @@ } if (!s.ok() || meta->fd.GetFileSize() == 0) { - fs->DeleteFile(fname, IOOptions(), nullptr); + TEST_SYNC_POINT("BuildTable:BeforeDeleteFile"); + + constexpr IODebugContext* dbg = nullptr; + + Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); + ignored.PermitUncheckedError(); + + assert(blob_file_additions || blob_file_paths.empty()); + + if (blob_file_additions) { + for (const std::string& blob_file_path : blob_file_paths) { + ignored = DeleteDBFile(&db_options, blob_file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + ignored.PermitUncheckedError(); + TEST_SYNC_POINT("BuildTable::AfterDeleteFile"); + } + } } + Status status_for_listener = s; if (meta->fd.GetFileSize() == 0) { fname = "(nil)"; + if (s.ok()) { + status_for_listener = Status::Aborted("Empty SST file not kept"); + } } // Output to event logger and fire events. 
EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s); + event_logger, ioptions.listeners, dbname, tboptions.column_family_name, + fname, job_id, meta->fd, meta->oldest_blob_file_number, tp, + tboptions.reason, status_for_listener, file_checksum, + file_checksum_func_name); return s; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,34 +24,20 @@ namespace ROCKSDB_NAMESPACE { -struct Options; struct FileMetaData; -class Env; -struct EnvOptions; -class Iterator; +class VersionSet; +class BlobFileAddition; class SnapshotChecker; class TableCache; -class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; +class BlobFileCompletionCallback; -// @param column_family_name Name of the column family that is also identified -// by column_family_id, or empty string if unknown. It must outlive the -// TableBuilder returned by this function. -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& options, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, int level, - const bool skip_filters = false, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0); +// Convenience function for NewTableBuilder on the embedded table_factory. 
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -62,27 +48,27 @@ // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. extern Status BuildTable( - const std::string& dbname, Env* env, FileSystem* fs, - const ImmutableCFOptions& options, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, - FileMetaData* meta, const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, + FileMetaData* meta, std::vector* blob_file_additions, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, bool paranoid_file_checks, - InternalStats* internal_stats, TableFileCreationReason reason, + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, + const std::shared_ptr& io_tracer, + BlobFileCreationReason blob_creation_reason, EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, - TableProperties* table_properties = nullptr, int level = -1, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + TableProperties* table_properties = nullptr, Env::WriteLifeTimeHint write_hint = 
Env::WLTH_NOT_SET, - const uint64_t file_creation_time = 0); + const std::string* full_history_ts_low = nullptr, + BlobFileCompletionCallback* blob_callback = nullptr, + uint64_t* num_input_entries = nullptr, + uint64_t* memtable_payload_bytes = nullptr, + uint64_t* memtable_garbage_bytes = nullptr); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/c.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/c.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/c.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/c.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,7 +11,11 @@ #include "rocksdb/c.h" -#include +#include +#include +#include +#include + #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -24,6 +28,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/perf_context.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" @@ -35,17 +40,13 @@ #include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/table_properties_collectors.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_batch.h" -#include "rocksdb/perf_context.h" #include "utilities/merge_operators.h" -#include -#include -#include - using ROCKSDB_NAMESPACE::BackupableDBOptions; using ROCKSDB_NAMESPACE::BackupEngine; using ROCKSDB_NAMESPACE::BackupID; @@ -60,7 +61,6 @@ using ROCKSDB_NAMESPACE::ColumnFamilyHandle; using ROCKSDB_NAMESPACE::ColumnFamilyOptions; using ROCKSDB_NAMESPACE::CompactionFilter; -using ROCKSDB_NAMESPACE::CompactionFilterContext; using ROCKSDB_NAMESPACE::CompactionFilterFactory; using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; using 
ROCKSDB_NAMESPACE::CompactRangeOptions; @@ -80,12 +80,15 @@ using ROCKSDB_NAMESPACE::Iterator; using ROCKSDB_NAMESPACE::LiveFileMetaData; using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::LRUCacheOptions; +using ROCKSDB_NAMESPACE::MemoryAllocator; using ROCKSDB_NAMESPACE::MemoryUtil; using ROCKSDB_NAMESPACE::MergeOperator; -using ROCKSDB_NAMESPACE::MergeOperators; using ROCKSDB_NAMESPACE::NewBloomFilterPolicy; +using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; using ROCKSDB_NAMESPACE::NewGenericRateLimiter; using ROCKSDB_NAMESPACE::NewLRUCache; +using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy; using ROCKSDB_NAMESPACE::OptimisticTransactionDB; using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; using ROCKSDB_NAMESPACE::Options; @@ -104,6 +107,7 @@ using ROCKSDB_NAMESPACE::Snapshot; using ROCKSDB_NAMESPACE::SstFileWriter; using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory; using ROCKSDB_NAMESPACE::Transaction; using ROCKSDB_NAMESPACE::TransactionDB; using ROCKSDB_NAMESPACE::TransactionDBOptions; @@ -115,10 +119,8 @@ using ROCKSDB_NAMESPACE::WriteBatchWithIndex; using ROCKSDB_NAMESPACE::WriteOptions; -using std::shared_ptr; using std::vector; using std::unordered_set; -using std::map; extern "C" { @@ -154,6 +156,12 @@ struct rocksdb_logger_t { std::shared_ptr rep; }; +struct rocksdb_lru_cache_options_t { + LRUCacheOptions rep; +}; +struct rocksdb_memory_allocator_t { + std::shared_ptr rep; +}; struct rocksdb_cache_t { std::shared_ptr rep; }; @@ -181,6 +189,9 @@ struct rocksdb_transaction_t { Transaction* rep; }; +struct rocksdb_backupable_db_options_t { + BackupableDBOptions rep; +}; struct rocksdb_checkpoint_t { Checkpoint* rep; }; @@ -504,13 +515,13 @@ return result; } -rocksdb_t* rocksdb_open_for_read_only( - const rocksdb_options_t* options, - const char* name, - unsigned char error_if_log_file_exist, - char** errptr) { +rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options, + const 
char* name, + unsigned char error_if_wal_file_exists, + char** errptr) { DB* db; - if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), &db, error_if_log_file_exist))) { + if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), + &db, error_if_wal_file_exists))) { return nullptr; } rocksdb_t* result = new rocksdb_t; @@ -549,6 +560,18 @@ return result; } +rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts( + const rocksdb_backupable_db_options_t* options, rocksdb_env_t* env, + char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr) { @@ -595,6 +618,15 @@ restore_options->rep)); } +void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr) { + SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir), + std::string(wal_dir), + restore_options->rep)); +} + const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( rocksdb_backup_engine_t* be) { rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t; @@ -636,6 +668,128 @@ delete be; } +rocksdb_backupable_db_options_t* rocksdb_backupable_db_options_create( + const char* backup_dir) { + return new rocksdb_backupable_db_options_t{ + BackupableDBOptions(std::string(backup_dir))}; +} + +void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir) { + options->rep.backup_dir = std::string(backup_dir); +} + +void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env) { + 
options->rep.backup_env = (env ? env->rep : nullptr); +} + +void rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.share_table_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.share_table_files; +} + +void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.sync = val; +} + +unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options) { + return options->rep.sync; +} + +void rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.destroy_old_data = val; +} + +unsigned char rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options) { + return options->rep.destroy_old_data; +} + +void rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.backup_log_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_log_files; +} + +void rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.backup_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_rate_limit; +} + +void rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.restore_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.restore_rate_limit; +} + +void 
rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_background_operations = val; +} + +int rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_background_operations; +} + +void rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size) { + options->rep.callback_trigger_interval_size = size; +} + +uint64_t rocksdb_backupable_db_options_get_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options) { + return options->rep.callback_trigger_interval_size; +} + +void rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_valid_backups_to_open = val; +} + +int rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_valid_backups_to_open; +} + +void rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.share_files_with_checksum_naming = + static_cast(val); +} + +int rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options) { + return static_cast(options->rep.share_files_with_checksum_naming); +} + +void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t* options) { + delete options; +} + rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr) { Checkpoint* checkpoint; @@ -698,12 +852,47 @@ return result; } +rocksdb_t* rocksdb_open_column_families_with_ttl( + const rocksdb_options_t* db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, 
const int* ttls, + char** errptr) { + std::vector ttls_vec; + std::vector column_families; + for (int i = 0; i < num_column_families; i++) { + ttls_vec.push_back(ttls[i]); + + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + ROCKSDB_NAMESPACE::DBWithTTL* db; + std::vector handles; + if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open( + DBOptions(db_options->rep), std::string(name), + column_families, &handles, &db, ttls_vec))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_t* rocksdb_open_for_read_only_column_families( const rocksdb_options_t* db_options, const char* name, int num_column_families, const char* const* column_family_names, const rocksdb_options_t* const* column_family_options, rocksdb_column_family_handle_t** column_family_handles, - unsigned char error_if_log_file_exist, char** errptr) { + unsigned char error_if_wal_file_exists, char** errptr) { std::vector column_families; for (int i = 0; i < num_column_families; i++) { column_families.push_back(ColumnFamilyDescriptor( @@ -713,8 +902,10 @@ DB* db; std::vector handles; - if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep), - std::string(name), column_families, &handles, &db, error_if_log_file_exist))) { + if (SaveError(errptr, + DB::OpenForReadOnly(DBOptions(db_options->rep), + std::string(name), column_families, + &handles, &db, error_if_wal_file_exists))) { return nullptr; } @@ -796,6 +987,18 @@ return handle; } +rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, int ttl, char** errptr) { + 
ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl = + static_cast(db->rep); + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep), ttl)); + return handle; +} + void rocksdb_drop_column_family( rocksdb_t* db, rocksdb_column_family_handle_t* handle, @@ -996,6 +1199,55 @@ } } +unsigned char rocksdb_key_may_exist(rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t key_len, + char** value, size_t* val_len, + const char* timestamp, size_t timestamp_len, + unsigned char* value_found) { + std::string tmp; + std::string time; + if (timestamp) { + time.assign(timestamp, timestamp_len); + } + bool found = false; + const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len), + &tmp, timestamp ? &time : nullptr, + value_found ? &found : nullptr); + if (value_found) { + *value_found = found; + if (found) { + *val_len = tmp.size(); + *value = CopyString(tmp); + } + } + return result; +} + +unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found) { + std::string tmp; + std::string time; + if (timestamp) { + time.assign(timestamp, timestamp_len); + } + bool found = false; + const bool result = db->rep->KeyMayExist( + options->rep, column_family->rep, Slice(key, key_len), &tmp, + timestamp ? &time : nullptr, value_found ? 
&found : nullptr); + if (value_found) { + *value_found = found; + if (found) { + *val_len = tmp.size(); + *value = CopyString(tmp); + } + } + return result; +} + rocksdb_iterator_t* rocksdb_create_iterator( rocksdb_t* db, const rocksdb_readoptions_t* options) { @@ -1148,34 +1400,39 @@ } } -void rocksdb_approximate_sizes( - rocksdb_t* db, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { +void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges, + const char* const* range_start_key, + const size_t* range_start_key_len, + const char* const* range_limit_key, + const size_t* range_limit_key_len, + uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } void rocksdb_approximate_sizes_cf( - rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - 
db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(column_family->rep, ranges, + num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } @@ -1256,6 +1513,10 @@ SaveError(errptr, db->rep->Flush(options->rep, column_family->rep)); } +void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) { + SaveError(errptr, db->rep->FlushWAL(sync)); +} + void rocksdb_disable_file_deletions( rocksdb_t* db, char** errptr) { @@ -1466,6 +1727,11 @@ b->rep.Delete(Slice(key, klen)); } +void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key, + size_t klen) { + b->rep.SingleDelete(Slice(key, klen)); +} + void rocksdb_writebatch_delete_cf( rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, @@ -1473,6 +1739,12 @@ b->rep.Delete(column_family->rep, Slice(key, klen)); } +void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen) { + b->rep.SingleDelete(column_family->rep, Slice(key, klen)); +} + void rocksdb_writebatch_deletev( rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, @@ -1723,6 +1995,11 @@ b->rep->Delete(Slice(key, klen)); } +void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b, + const char* key, size_t klen) { + b->rep->SingleDelete(Slice(key, klen)); +} + void rocksdb_writebatch_wi_delete_cf( rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, @@ -1730,6 +2007,12 @@ b->rep->Delete(column_family->rep, Slice(key, klen)); } +void rocksdb_writebatch_wi_singledelete_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen) { + b->rep->SingleDelete(column_family->rep, Slice(key, klen)); +} + void rocksdb_writebatch_wi_deletev( rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, @@ -2154,6 +2437,10 @@ delete options; } 
+rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) { + return new rocksdb_options_t(*options); +} + void rocksdb_options_increase_parallelism( rocksdb_options_t* opt, int total_threads) { opt->rep.IncreaseParallelism(total_threads); @@ -2179,6 +2466,10 @@ opt->rep.allow_ingest_behind = v; } +unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) { + return opt->rep.allow_ingest_behind; +} + void rocksdb_options_set_compaction_filter( rocksdb_options_t* opt, rocksdb_compactionfilter_t* filter) { @@ -2196,6 +2487,10 @@ opt->rep.compaction_readahead_size = s; } +size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) { + return opt->rep.compaction_readahead_size; +} + void rocksdb_options_set_comparator( rocksdb_options_t* opt, rocksdb_comparator_t* cmp) { @@ -2208,27 +2503,43 @@ opt->rep.merge_operator = std::shared_ptr(merge_operator); } - void rocksdb_options_set_create_if_missing( rocksdb_options_t* opt, unsigned char v) { opt->rep.create_if_missing = v; } +unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) { + return opt->rep.create_if_missing; +} + void rocksdb_options_set_create_missing_column_families( rocksdb_options_t* opt, unsigned char v) { opt->rep.create_missing_column_families = v; } +unsigned char rocksdb_options_get_create_missing_column_families( + rocksdb_options_t* opt) { + return opt->rep.create_missing_column_families; +} + void rocksdb_options_set_error_if_exists( rocksdb_options_t* opt, unsigned char v) { opt->rep.error_if_exists = v; } +unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) { + return opt->rep.error_if_exists; +} + void rocksdb_options_set_paranoid_checks( rocksdb_options_t* opt, unsigned char v) { opt->rep.paranoid_checks = v; } +unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) { + return opt->rep.paranoid_checks; +} + void rocksdb_options_set_db_paths(rocksdb_options_t* opt, const 
rocksdb_dbpath_t** dbpath_values, size_t num_paths) { @@ -2254,57 +2565,107 @@ opt->rep.info_log_level = static_cast(v); } +int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) { + return static_cast(opt->rep.info_log_level); +} + void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.db_write_buffer_size = s; } +size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) { + return opt->rep.db_write_buffer_size; +} + void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.write_buffer_size = s; } +size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) { + return opt->rep.write_buffer_size; +} + void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { opt->rep.max_open_files = n; } +int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) { + return opt->rep.max_open_files; +} + void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, int n) { opt->rep.max_file_opening_threads = n; } +int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) { + return opt->rep.max_file_opening_threads; +} + void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) { opt->rep.max_total_wal_size = n; } +uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) { + return opt->rep.max_total_wal_size; +} + void rocksdb_options_set_target_file_size_base( rocksdb_options_t* opt, uint64_t n) { opt->rep.target_file_size_base = n; } +uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) { + return opt->rep.target_file_size_base; +} + void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t* opt, int n) { opt->rep.target_file_size_multiplier = n; } +int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) { + return opt->rep.target_file_size_multiplier; +} + void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t* opt, uint64_t n) { 
opt->rep.max_bytes_for_level_base = n; } +uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) { + return opt->rep.max_bytes_for_level_base; +} + void rocksdb_options_set_level_compaction_dynamic_level_bytes( rocksdb_options_t* opt, unsigned char v) { opt->rep.level_compaction_dynamic_level_bytes = v; } +unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes( + rocksdb_options_t* opt) { + return opt->rep.level_compaction_dynamic_level_bytes; +} + void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt, double n) { opt->rep.max_bytes_for_level_multiplier = n; } +double rocksdb_options_get_max_bytes_for_level_multiplier( + rocksdb_options_t* opt) { + return opt->rep.max_bytes_for_level_multiplier; +} + void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt, uint64_t n) { opt->rep.max_compaction_bytes = n; } +uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) { + return opt->rep.max_compaction_bytes; +} + void rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t* opt, int* level_values, size_t num_levels) { opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels); @@ -2322,30 +2683,129 @@ opt->rep.skip_stats_update_on_db_open = val; } +unsigned char rocksdb_options_get_skip_stats_update_on_db_open( + rocksdb_options_t* opt) { + return opt->rep.skip_stats_update_on_db_open; +} + void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( rocksdb_options_t* opt, unsigned char val) { opt->rep.skip_checking_sst_file_sizes_on_db_open = val; } +unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt) { + return opt->rep.skip_checking_sst_file_sizes_on_db_open; +} + +/* Blob Options Settings */ +void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_files = val; +} +extern ROCKSDB_LIBRARY_API unsigned char 
rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt) { + return opt->rep.enable_blob_files; +} + +void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.min_blob_size = val; +} + +uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) { + return opt->rep.min_blob_size; +} + +void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.blob_file_size = val; +} + +uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) { + return opt->rep.blob_file_size; +} + +void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt, + int val) { + opt->rep.blob_compression_type = static_cast(val); +} + +int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) { + return opt->rep.blob_compression_type; +} + +void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_garbage_collection = val; +} + +unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) { + return opt->rep.enable_blob_garbage_collection; +} + +void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_age_cutoff = val; +} + +double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_age_cutoff; +} + +void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_force_threshold = val; +} + +double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_force_threshold; +} + +void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val) { + opt->rep.blob_compaction_readahead_size = val; +} + +uint64_t rocksdb_options_get_blob_compaction_readahead_size( + rocksdb_options_t* opt) { + return opt->rep.blob_compaction_readahead_size; +} + void 
rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } +int rocksdb_options_get_num_levels(rocksdb_options_t* opt) { + return opt->rep.num_levels; +} + void rocksdb_options_set_level0_file_num_compaction_trigger( rocksdb_options_t* opt, int n) { opt->rep.level0_file_num_compaction_trigger = n; } +int rocksdb_options_get_level0_file_num_compaction_trigger( + rocksdb_options_t* opt) { + return opt->rep.level0_file_num_compaction_trigger; +} + void rocksdb_options_set_level0_slowdown_writes_trigger( rocksdb_options_t* opt, int n) { opt->rep.level0_slowdown_writes_trigger = n; } +int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) { + return opt->rep.level0_slowdown_writes_trigger; +} + void rocksdb_options_set_level0_stop_writes_trigger( rocksdb_options_t* opt, int n) { opt->rep.level0_stop_writes_trigger = n; } +int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) { + return opt->rep.level0_stop_writes_trigger; +} + void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* /*opt*/, int /*n*/) {} @@ -2353,12 +2813,28 @@ opt->rep.wal_recovery_mode = static_cast(mode); } +int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) { + return static_cast(opt->rep.wal_recovery_mode); +} + void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) { opt->rep.compression = static_cast(t); } +int rocksdb_options_get_compression(rocksdb_options_t* opt) { + return opt->rep.compression; +} + +void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) { + opt->rep.bottommost_compression = static_cast(t); +} + +int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) { + return opt->rep.bottommost_compression; +} + void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, - int* level_values, + const int* level_values, size_t num_levels) { opt->rep.compression_per_level.resize(num_levels); for (size_t i = 0; i < num_levels; 
++i) { @@ -2371,7 +2847,7 @@ int w_bits, int level, int strategy, int max_dict_bytes, - bool enabled) { + unsigned char enabled) { opt->rep.bottommost_compression_opts.window_bits = w_bits; opt->rep.bottommost_compression_opts.level = level; opt->rep.bottommost_compression_opts.strategy = strategy; @@ -2379,6 +2855,21 @@ opt->rep.bottommost_compression_opts.enabled = enabled; } +void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) { + opt->rep.bottommost_compression_opts.zstd_max_train_bytes = + zstd_max_train_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + +void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes, + unsigned char enabled) { + opt->rep.bottommost_compression_opts.max_dict_buffer_bytes = + max_dict_buffer_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, int level, int strategy, int max_dict_bytes) { @@ -2388,6 +2879,36 @@ opt->rep.compression_opts.max_dict_bytes = max_dict_bytes; } +void rocksdb_options_set_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt, int zstd_max_train_bytes) { + opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes; +} + +int rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.zstd_max_train_bytes; +} + +void rocksdb_options_set_compression_options_parallel_threads( + rocksdb_options_t* opt, int value) { + opt->rep.compression_opts.parallel_threads = value; +} + +int rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.parallel_threads; +} + +void rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t 
max_dict_buffer_bytes) { + opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes; +} + +uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.max_dict_buffer_bytes; +} + void rocksdb_options_set_prefix_extractor( rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { opt->rep.prefix_extractor.reset(prefix_extractor); @@ -2398,6 +2919,10 @@ opt->rep.use_fsync = use_fsync; } +int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) { + return opt->rep.use_fsync; +} + void rocksdb_options_set_db_log_dir( rocksdb_options_t* opt, const char* db_log_dir) { opt->rep.db_log_dir = db_log_dir; @@ -2412,16 +2937,28 @@ opt->rep.WAL_ttl_seconds = ttl; } +uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) { + return opt->rep.WAL_ttl_seconds; +} + void rocksdb_options_set_WAL_size_limit_MB( rocksdb_options_t* opt, uint64_t limit) { opt->rep.WAL_size_limit_MB = limit; } +uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) { + return opt->rep.WAL_size_limit_MB; +} + void rocksdb_options_set_manifest_preallocation_size( rocksdb_options_t* opt, size_t v) { opt->rep.manifest_preallocation_size = v; } +size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) { + return opt->rep.manifest_preallocation_size; +} + // noop void rocksdb_options_set_purge_redundant_kvs_while_flush( rocksdb_options_t* /*opt*/, unsigned char /*v*/) {} @@ -2431,41 +2968,91 @@ opt->rep.use_direct_reads = v; } +unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) { + return opt->rep.use_direct_reads; +} + void rocksdb_options_set_use_direct_io_for_flush_and_compaction( rocksdb_options_t* opt, unsigned char v) { opt->rep.use_direct_io_for_flush_and_compaction = v; } +unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction( + rocksdb_options_t* opt) { + return 
opt->rep.use_direct_io_for_flush_and_compaction; +} + void rocksdb_options_set_allow_mmap_reads( rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_mmap_reads = v; } +unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) { + return opt->rep.allow_mmap_reads; +} + void rocksdb_options_set_allow_mmap_writes( rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_mmap_writes = v; } +unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) { + return opt->rep.allow_mmap_writes; +} + void rocksdb_options_set_is_fd_close_on_exec( rocksdb_options_t* opt, unsigned char v) { opt->rep.is_fd_close_on_exec = v; } +unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) { + return opt->rep.is_fd_close_on_exec; +} + void rocksdb_options_set_skip_log_error_on_recovery( rocksdb_options_t* opt, unsigned char v) { opt->rep.skip_log_error_on_recovery = v; } +unsigned char rocksdb_options_get_skip_log_error_on_recovery( + rocksdb_options_t* opt) { + return opt->rep.skip_log_error_on_recovery; +} + void rocksdb_options_set_stats_dump_period_sec( rocksdb_options_t* opt, unsigned int v) { opt->rep.stats_dump_period_sec = v; } +unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) { + return opt->rep.stats_dump_period_sec; +} + +void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt, + unsigned int v) { + opt->rep.stats_persist_period_sec = v; +} + +unsigned int rocksdb_options_get_stats_persist_period_sec( + rocksdb_options_t* opt) { + return opt->rep.stats_persist_period_sec; +} + void rocksdb_options_set_advise_random_on_open( rocksdb_options_t* opt, unsigned char v) { opt->rep.advise_random_on_open = v; } +unsigned char rocksdb_options_get_advise_random_on_open( + rocksdb_options_t* opt) { + return opt->rep.advise_random_on_open; +} + +void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt, + double v) { + opt->rep.experimental_mempurge_threshold 
= v; +} + void rocksdb_options_set_access_hint_on_compaction_start( rocksdb_options_t* opt, int v) { switch(v) { @@ -2485,142 +3072,276 @@ opt->rep.access_hint_on_compaction_start = ROCKSDB_NAMESPACE::Options::WILLNEED; break; + default: + assert(0); } } +int rocksdb_options_get_access_hint_on_compaction_start( + rocksdb_options_t* opt) { + return opt->rep.access_hint_on_compaction_start; +} + void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t* opt, unsigned char v) { opt->rep.use_adaptive_mutex = v; } +unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) { + return opt->rep.use_adaptive_mutex; +} + void rocksdb_options_set_wal_bytes_per_sync( rocksdb_options_t* opt, uint64_t v) { opt->rep.wal_bytes_per_sync = v; } +uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) { + return opt->rep.wal_bytes_per_sync; +} + void rocksdb_options_set_bytes_per_sync( rocksdb_options_t* opt, uint64_t v) { opt->rep.bytes_per_sync = v; } +uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) { + return opt->rep.bytes_per_sync; +} + void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt, uint64_t v) { opt->rep.writable_file_max_buffer_size = static_cast(v); } +uint64_t rocksdb_options_get_writable_file_max_buffer_size( + rocksdb_options_t* opt) { + return opt->rep.writable_file_max_buffer_size; +} + void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_concurrent_memtable_write = v; } +unsigned char rocksdb_options_get_allow_concurrent_memtable_write( + rocksdb_options_t* opt) { + return opt->rep.allow_concurrent_memtable_write; +} + void rocksdb_options_set_enable_write_thread_adaptive_yield( rocksdb_options_t* opt, unsigned char v) { opt->rep.enable_write_thread_adaptive_yield = v; } +unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield( + rocksdb_options_t* opt) { + return 
opt->rep.enable_write_thread_adaptive_yield; +} + void rocksdb_options_set_max_sequential_skip_in_iterations( rocksdb_options_t* opt, uint64_t v) { opt->rep.max_sequential_skip_in_iterations = v; } +uint64_t rocksdb_options_get_max_sequential_skip_in_iterations( + rocksdb_options_t* opt) { + return opt->rep.max_sequential_skip_in_iterations; +} + void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) { opt->rep.max_write_buffer_number = n; } +int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_number; +} + void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) { opt->rep.min_write_buffer_number_to_merge = n; } +int rocksdb_options_get_min_write_buffer_number_to_merge( + rocksdb_options_t* opt) { + return opt->rep.min_write_buffer_number_to_merge; +} + void rocksdb_options_set_max_write_buffer_number_to_maintain( rocksdb_options_t* opt, int n) { opt->rep.max_write_buffer_number_to_maintain = n; } +int rocksdb_options_get_max_write_buffer_number_to_maintain( + rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_number_to_maintain; +} + void rocksdb_options_set_max_write_buffer_size_to_maintain( rocksdb_options_t* opt, int64_t n) { opt->rep.max_write_buffer_size_to_maintain = n; } +int64_t rocksdb_options_get_max_write_buffer_size_to_maintain( + rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_size_to_maintain; +} + void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.enable_pipelined_write = v; } +unsigned char rocksdb_options_get_enable_pipelined_write( + rocksdb_options_t* opt) { + return opt->rep.enable_pipelined_write; +} + void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.unordered_write = v; } +unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) { + return opt->rep.unordered_write; +} + void 
rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, uint32_t n) { opt->rep.max_subcompactions = n; } +uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) { + return opt->rep.max_subcompactions; +} + void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) { opt->rep.max_background_jobs = n; } +int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) { + return opt->rep.max_background_jobs; +} + void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { opt->rep.max_background_compactions = n; } +int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) { + return opt->rep.max_background_compactions; +} + void rocksdb_options_set_base_background_compactions(rocksdb_options_t* opt, int n) { opt->rep.base_background_compactions = n; } +int rocksdb_options_get_base_background_compactions(rocksdb_options_t* opt) { + return opt->rep.base_background_compactions; +} + void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) { opt->rep.max_background_flushes = n; } +int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) { + return opt->rep.max_background_flushes; +} + void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) { opt->rep.max_log_file_size = v; } +size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) { + return opt->rep.max_log_file_size; +} + void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) { opt->rep.log_file_time_to_roll = v; } +size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) { + return opt->rep.log_file_time_to_roll; +} + void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) { opt->rep.keep_log_file_num = v; } +size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) { + return opt->rep.keep_log_file_num; +} + void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt, size_t v) { 
opt->rep.recycle_log_file_num = v; } +size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) { + return opt->rep.recycle_log_file_num; +} + void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) { opt->rep.soft_rate_limit = v; } +double rocksdb_options_get_soft_rate_limit(rocksdb_options_t* opt) { + return opt->rep.soft_rate_limit; +} + void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) { opt->rep.hard_rate_limit = v; } +double rocksdb_options_get_hard_rate_limit(rocksdb_options_t* opt) { + return opt->rep.hard_rate_limit; +} + void rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { opt->rep.soft_pending_compaction_bytes_limit = v; } +size_t rocksdb_options_get_soft_pending_compaction_bytes_limit( + rocksdb_options_t* opt) { + return opt->rep.soft_pending_compaction_bytes_limit; +} + void rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { opt->rep.hard_pending_compaction_bytes_limit = v; } +size_t rocksdb_options_get_hard_pending_compaction_bytes_limit( + rocksdb_options_t* opt) { + return opt->rep.hard_pending_compaction_bytes_limit; +} + void rocksdb_options_set_rate_limit_delay_max_milliseconds( rocksdb_options_t* opt, unsigned int v) { opt->rep.rate_limit_delay_max_milliseconds = v; } +unsigned int rocksdb_options_get_rate_limit_delay_max_milliseconds( + rocksdb_options_t* opt) { + return opt->rep.rate_limit_delay_max_milliseconds; +} + void rocksdb_options_set_max_manifest_file_size( rocksdb_options_t* opt, size_t v) { opt->rep.max_manifest_file_size = v; } +size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) { + return opt->rep.max_manifest_file_size; +} + void rocksdb_options_set_table_cache_numshardbits( rocksdb_options_t* opt, int v) { opt->rep.table_cache_numshardbits = v; } +int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) { + return opt->rep.table_cache_numshardbits; 
+} + void rocksdb_options_set_table_cache_remove_scan_count_limit( rocksdb_options_t* /*opt*/, int /*v*/) { // this option is deprecated @@ -2631,19 +3352,38 @@ opt->rep.arena_block_size = v; } +size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) { + return opt->rep.arena_block_size; +} + void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) { opt->rep.disable_auto_compactions = disable; } +unsigned char rocksdb_options_get_disable_auto_compactions( + rocksdb_options_t* opt) { + return opt->rep.disable_auto_compactions; +} + void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, int v) { opt->rep.optimize_filters_for_hits = v; } +unsigned char rocksdb_options_get_optimize_filters_for_hits( + rocksdb_options_t* opt) { + return opt->rep.optimize_filters_for_hits; +} + void rocksdb_options_set_delete_obsolete_files_period_micros( rocksdb_options_t* opt, uint64_t v) { opt->rep.delete_obsolete_files_period_micros = v; } +uint64_t rocksdb_options_get_delete_obsolete_files_period_micros( + rocksdb_options_t* opt) { + return opt->rep.delete_obsolete_files_period_micros; +} + void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { opt->rep.PrepareForBulkLoad(); } @@ -2657,11 +3397,20 @@ opt->rep.memtable_prefix_bloom_size_ratio = v; } +double rocksdb_options_get_memtable_prefix_bloom_size_ratio( + rocksdb_options_t* opt) { + return opt->rep.memtable_prefix_bloom_size_ratio; +} + void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt, size_t v) { opt->rep.memtable_huge_page_size = v; } +size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) { + return opt->rep.memtable_huge_page_size; +} + void rocksdb_options_set_hash_skip_list_rep( rocksdb_options_t *opt, size_t bucket_count, int32_t skiplist_height, int32_t skiplist_branching_factor) { @@ -2696,31 +3445,56 @@ opt->rep.max_successive_merges = v; } +size_t 
rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) { + return opt->rep.max_successive_merges; +} + void rocksdb_options_set_bloom_locality( rocksdb_options_t* opt, uint32_t v) { opt->rep.bloom_locality = v; } +uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) { + return opt->rep.bloom_locality; +} + void rocksdb_options_set_inplace_update_support( rocksdb_options_t* opt, unsigned char v) { opt->rep.inplace_update_support = v; } +unsigned char rocksdb_options_get_inplace_update_support( + rocksdb_options_t* opt) { + return opt->rep.inplace_update_support; +} + void rocksdb_options_set_inplace_update_num_locks( rocksdb_options_t* opt, size_t v) { opt->rep.inplace_update_num_locks = v; } +size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) { + return opt->rep.inplace_update_num_locks; +} + void rocksdb_options_set_report_bg_io_stats( rocksdb_options_t* opt, int v) { opt->rep.report_bg_io_stats = v; } +unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) { + return opt->rep.report_bg_io_stats; +} + void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { opt->rep.compaction_style = static_cast(style); } +int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) { + return opt->rep.compaction_style; +} + void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) { opt->rep.compaction_options_universal = *(uco->rep); } @@ -2750,6 +3524,19 @@ opt->rep.atomic_flush = atomic_flush; } +unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) { + return opt->rep.atomic_flush; +} + +void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt, + unsigned char manual_wal_flush) { + opt->rep.manual_wal_flush = manual_wal_flush; +} + +unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) { + return opt->rep.manual_wal_flush; +} + rocksdb_ratelimiter_t* 
rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, @@ -2771,6 +3558,14 @@ } } +void rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) { + std::shared_ptr + compact_on_del = + NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + void rocksdb_set_perf_level(int v) { PerfLevel level = static_cast(v); SetPerfLevel(level); @@ -3064,7 +3859,8 @@ delete filter; } -rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(int bits_per_key, bool original_format) { +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format( + double bits_per_key, bool original_format) { // Make a rocksdb_filterpolicy_t, but override all of its methods so // they delegate to a NewBloomFilterPolicy() instead of user // supplied C functions. @@ -3099,14 +3895,63 @@ return wrapper; } -rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(int bits_per_key) { +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full( + double bits_per_key) { return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false); } -rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) { return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true); } +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewRibbonFilterPolicy() instead of user + // supplied C functions. 
+ struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + void CreateFilter(const Slice* keys, int n, + std::string* dst) const override { + return rep_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + return rep_->KeyMayMatch(key, filter); + } + ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( + const ROCKSDB_NAMESPACE::FilterBuildingContext& context) + const override { + return rep_->GetBuilderWithContext(context); + } + ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader( + const Slice& contents) const override { + return rep_->GetFilterBitsReader(contents); + } + static void DoNothing(void*) {} + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = + NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level); + wrapper->state_ = nullptr; + wrapper->delete_filter_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon( + double bloom_equivalent_bits_per_key) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, bloom_before_level); +} + rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( void* state, void (*destructor)(void*), char* (*full_merge)(void*, const char* key, size_t key_length, @@ -3149,11 +3994,20 @@ opt->rep.verify_checksums = v; } +unsigned char rocksdb_readoptions_get_verify_checksums( + rocksdb_readoptions_t* opt) { + return opt->rep.verify_checksums; +} + void rocksdb_readoptions_set_fill_cache( rocksdb_readoptions_t* opt, unsigned char v) { 
opt->rep.fill_cache = v; } +unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) { + return opt->rep.fill_cache; +} + void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t* opt, const rocksdb_snapshot_t* snap) { @@ -3190,11 +4044,19 @@ opt->rep.read_tier = static_cast(v); } +int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) { + return static_cast(opt->rep.read_tier); +} + void rocksdb_readoptions_set_tailing( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.tailing = v; } +unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) { + return opt->rep.tailing; +} + void rocksdb_readoptions_set_managed( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.managed = v; @@ -3205,37 +4067,89 @@ opt->rep.readahead_size = v; } +size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) { + return opt->rep.readahead_size; +} + void rocksdb_readoptions_set_prefix_same_as_start( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.prefix_same_as_start = v; } +unsigned char rocksdb_readoptions_get_prefix_same_as_start( + rocksdb_readoptions_t* opt) { + return opt->rep.prefix_same_as_start; +} + void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.pin_data = v; } +unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) { + return opt->rep.pin_data; +} + void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.total_order_seek = v; } +unsigned char rocksdb_readoptions_get_total_order_seek( + rocksdb_readoptions_t* opt) { + return opt->rep.total_order_seek; +} + void rocksdb_readoptions_set_max_skippable_internal_keys( rocksdb_readoptions_t* opt, uint64_t v) { opt->rep.max_skippable_internal_keys = v; } +uint64_t rocksdb_readoptions_get_max_skippable_internal_keys( + rocksdb_readoptions_t* opt) { + return opt->rep.max_skippable_internal_keys; +} + void 
rocksdb_readoptions_set_background_purge_on_iterator_cleanup( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.background_purge_on_iterator_cleanup = v; } +unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t* opt) { + return opt->rep.background_purge_on_iterator_cleanup; +} + void rocksdb_readoptions_set_ignore_range_deletions( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.ignore_range_deletions = v; } +unsigned char rocksdb_readoptions_get_ignore_range_deletions( + rocksdb_readoptions_t* opt) { + return opt->rep.ignore_range_deletions; +} + +void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.deadline = std::chrono::microseconds(microseconds); +} + +uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) { + return opt->rep.deadline.count(); +} + +void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.io_timeout = std::chrono::microseconds(microseconds); +} + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) { + return opt->rep.io_timeout.count(); +} + rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } @@ -3249,33 +4163,61 @@ opt->rep.sync = v; } +unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) { + return opt->rep.sync; +} + void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) { opt->rep.disableWAL = disable; } +unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt) { + return opt->rep.disableWAL; +} + void rocksdb_writeoptions_set_ignore_missing_column_families( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.ignore_missing_column_families = v; } +unsigned char rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t* opt) { + return opt->rep.ignore_missing_column_families; +} + 
void rocksdb_writeoptions_set_no_slowdown( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.no_slowdown = v; } +unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t* opt) { + return opt->rep.no_slowdown; +} + void rocksdb_writeoptions_set_low_pri( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.low_pri = v; } +unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) { + return opt->rep.low_pri; +} + void rocksdb_writeoptions_set_memtable_insert_hint_per_batch( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.memtable_insert_hint_per_batch = v; } +unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t* opt) { + return opt->rep.memtable_insert_hint_per_batch; +} + rocksdb_compactoptions_t* rocksdb_compactoptions_create() { return new rocksdb_compactoptions_t; } @@ -3289,21 +4231,40 @@ opt->rep.bottommost_level_compaction = static_cast(v); } +unsigned char rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t* opt) { + return static_cast(opt->rep.bottommost_level_compaction); +} + void rocksdb_compactoptions_set_exclusive_manual_compaction( rocksdb_compactoptions_t* opt, unsigned char v) { opt->rep.exclusive_manual_compaction = v; } +unsigned char rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t* opt) { + return opt->rep.exclusive_manual_compaction; +} + void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt, unsigned char v) { opt->rep.change_level = v; } +unsigned char rocksdb_compactoptions_get_change_level( + rocksdb_compactoptions_t* opt) { + return opt->rep.change_level; +} + void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt, int n) { opt->rep.target_level = n; } +int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) { + return opt->rep.target_level; +} + rocksdb_flushoptions_t* rocksdb_flushoptions_create() { return new 
rocksdb_flushoptions_t; } @@ -3317,20 +4278,70 @@ opt->rep.wait = v; } +unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) { + return opt->rep.wait; +} + +rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create( + char** errptr) { + rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t; + ROCKSDB_NAMESPACE::JemallocAllocatorOptions options; + SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator( + options, &allocator->rep)); + return allocator; +} + +void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) { + delete allocator; +} + +rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() { + return new rocksdb_lru_cache_options_t; +} + +void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) { + delete opt; +} + +void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt, + size_t capacity) { + opt->rep.capacity = capacity; +} + +void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) { + opt->rep.memory_allocator = allocator->rep; +} + rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(capacity); return c; } +rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t* opt) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(opt->rep); + return c; +} + void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } +void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { + cache->rep->DisownData(); +} + void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { cache->rep->SetCapacity(capacity); } +size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) { + return cache->rep->GetCapacity(); +} + size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) { return cache->rep->GetUsage(); } @@ -3368,10 +4379,36 @@ 
env->rep->SetBackgroundThreads(n); } +int rocksdb_env_get_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(); +} + +void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::BOTTOM); +} + +int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::BOTTOM); +} + void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::HIGH); } +int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::HIGH); +} + +void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::LOW); +} + +int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::LOW); +} + void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } @@ -3558,10 +4595,11 @@ delete st; } -struct Wrapper : public rocksdb_slicetransform_t { +struct SliceTransformWrapper : public rocksdb_slicetransform_t { const SliceTransform* rep_; - ~Wrapper() override { delete rep_; } + ~SliceTransformWrapper() override { delete rep_; } const char* Name() const override { return rep_->Name(); } + std::string GetId() const override { return rep_->GetId(); } Slice Transform(const Slice& src) const override { return rep_->Transform(src); } @@ -3573,18 +4611,18 @@ }; rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) { - Wrapper* wrapper = new Wrapper; + SliceTransformWrapper* wrapper = new SliceTransformWrapper; wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen); wrapper->state_ = nullptr; - wrapper->destructor_ = &Wrapper::DoNothing; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; return wrapper; } rocksdb_slicetransform_t* 
rocksdb_slicetransform_create_noop() { - Wrapper* wrapper = new Wrapper; + SliceTransformWrapper* wrapper = new SliceTransformWrapper; wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform(); wrapper->state_ = nullptr; - wrapper->destructor_ = &Wrapper::DoNothing; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; return wrapper; } @@ -3599,32 +4637,62 @@ uco->rep->size_ratio = ratio; } +int rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->size_ratio; +} + void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->min_merge_width = w; } +int rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->min_merge_width; +} + void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->max_merge_width = w; } +int rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_merge_width; +} + void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->max_size_amplification_percent = p; } +int rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_size_amplification_percent; +} + void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->compression_size_percent = p; } +int rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->compression_size_percent; +} + void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t* uco, int style) { uco->rep->stop_style = 
static_cast(style); } +int rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t* uco) { + return static_cast(uco->rep->stop_style); +} + void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t* uco) { delete uco->rep; @@ -3642,6 +4710,11 @@ fifo_opts->rep.max_table_files_size = size; } +uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.max_table_files_size; +} + void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts) { delete fifo_opts; @@ -3665,6 +4738,11 @@ return static_cast(lf->rep.size()); } +const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].column_family_name.c_str(); +} + const char* rocksdb_livefiles_name( const rocksdb_livefiles_t* lf, int index) { @@ -3831,6 +4909,27 @@ opt->rep.set_snapshot = v; } +char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. + return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +int rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val) { + if (db->rep->GetIntProperty(Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family( rocksdb_transactiondb_t* txn_db, const rocksdb_options_t* column_family_options, @@ -3901,6 +5000,27 @@ delete snapshot; } +char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db, + const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. 
+ return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db, + const char* propname, + uint64_t* out_val) { + if (db->rep->GetIntProperty(Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + rocksdb_transaction_t* rocksdb_transaction_begin( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* write_options, @@ -3940,7 +5060,10 @@ const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot( rocksdb_transaction_t* txn) { - rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + // This will be freed later on using free, so use malloc here to avoid a + // mismatch + rocksdb_snapshot_t* result = + (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t)); result->rep = txn->rep->GetSnapshot(); return result; } @@ -4300,12 +5423,31 @@ return old_txn; } +// Write batch into OptimisticTransactionDB +void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep)); +} + void rocksdb_optimistictransactiondb_close( rocksdb_optimistictransactiondb_t* otxn_db) { delete otxn_db->rep; delete otxn_db; } +rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) { + Checkpoint* checkpoint; + if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) { + return nullptr; + } + rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t; + result->rep = checkpoint; + return result; +} + void rocksdb_free(void* ptr) { free(ptr); } rocksdb_pinnableslice_t* rocksdb_get_pinned( @@ -4441,11 +5583,25 @@ return memory_usage->cache_total; } +void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.dump_malloc_stats = val; +} + +void 
rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.memtable_whole_key_filtering = val; +} + // deletes container with memory usage estimates void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) { delete usage; } +void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) { + CancelAllBackgroundWork(db->rep, wait); +} + } // end extern "C" #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/c_test.c mariadb-10.11.13/storage/rocksdb/rocksdb/db/c_test.c --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/c_test.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/c_test.c 2025-05-19 16:14:27.000000000 +0000 @@ -7,12 +7,13 @@ #ifndef ROCKSDB_LITE // Lite does not support C API -#include "rocksdb/c.h" - +#include #include #include #include #include + +#include "rocksdb/c.h" #ifndef OS_WIN #include #endif @@ -58,7 +59,11 @@ static const char* GetTempDir(void) { const char* ret = getenv("TEST_TMPDIR"); if (ret == NULL || ret[0] == '\0') - ret = "/tmp"; +#ifdef OS_WIN + ret = getenv("TEMP"); +#else + ret = "/tmp"; +#endif return ret; } #ifdef _MSC_VER @@ -85,10 +90,8 @@ // ok return; } else { - fprintf(stderr, "%s: expected '%s', got '%s'\n", - phase, - (expected ? expected : "(null)"), - (v ? v : "(null")); + fprintf(stderr, "%s: expected '%s', got '%s'\n", phase, + (expected ? expected : "(null)"), (v ? 
v : "(null)")); abort(); } } @@ -513,6 +516,9 @@ coptions = rocksdb_compactoptions_create(); rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1); + rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000, + 10001); + StartPhase("destroy"); rocksdb_destroy_db(options, dbname, &err); Free(&err); @@ -984,7 +990,9 @@ &err); CheckNoError(err); } - rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes, + &err); + CheckNoError(err); CheckCondition(sizes[0] > 0); CheckCondition(sizes[1] > 0); } @@ -1010,7 +1018,36 @@ CheckGet(db, roptions, "foo", NULL); rocksdb_release_snapshot(db, snap); } - + StartPhase("snapshot_with_memtable_inplace_update"); + { + rocksdb_close(db); + const rocksdb_snapshot_t* snap = NULL; + const char* s_key = "foo_snap"; + const char* value1 = "hello_s1"; + const char* value2 = "hello_s2"; + rocksdb_options_set_allow_concurrent_memtable_write(options, 0); + rocksdb_options_set_inplace_update_support(options, 1); + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, s_key, 8, value1, 8, &err); + snap = rocksdb_create_snapshot(db); + assert(snap != NULL); + rocksdb_put(db, woptions, s_key, 8, value2, 8, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", NULL); + // snapshot syntax is invalid, because of inplace update supported is set + CheckGet(db, roptions, s_key, value2); + // restore the data and options + rocksdb_delete(db, woptions, s_key, 8, &err); + CheckGet(db, roptions, s_key, NULL); + rocksdb_release_snapshot(db, snap); + rocksdb_readoptions_set_snapshot(roptions, NULL); + rocksdb_options_set_inplace_update_support(options, 0); + rocksdb_options_set_allow_concurrent_memtable_write(options, 1); + rocksdb_options_set_error_if_exists(options, 1); + } 
StartPhase("repair"); { // If we do not compact here, then the lazy deletion of @@ -1034,19 +1071,25 @@ } StartPhase("filter"); - for (run = 0; run <= 2; run++) { - // First run uses custom filter - // Second run uses old block-based bloom filter - // Third run uses full bloom filter + for (run = 0; run <= 4; run++) { + // run=0 uses custom filter + // run=1 uses old block-based bloom filter + // run=2 run uses full bloom filter + // run=3 uses Ribbon + // run=4 uses Ribbon-Bloom hybrid configuration CheckNoError(err); rocksdb_filterpolicy_t* policy; if (run == 0) { policy = rocksdb_filterpolicy_create(NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName); } else if (run == 1) { - policy = rocksdb_filterpolicy_create_bloom(8); + policy = rocksdb_filterpolicy_create_bloom(8.0); + } else if (run == 2) { + policy = rocksdb_filterpolicy_create_bloom_full(8.0); + } else if (run == 3) { + policy = rocksdb_filterpolicy_create_ribbon(8.0); } else { - policy = rocksdb_filterpolicy_create_bloom_full(8); + policy = rocksdb_filterpolicy_create_ribbon_hybrid(8.0, 1); } rocksdb_block_based_options_set_filter_policy(table_options, policy); @@ -1112,10 +1155,12 @@ } else if (run == 1) { // Essentially a fingerprint of the block-based Bloom schema CheckCondition(hits == 241); + } else if (run == 2 || run == 4) { + // Essentially a fingerprint of full Bloom schema, format_version=5 + CheckCondition(hits == 188); } else { - // Essentially a fingerprint of the full Bloom schema(s), - // format_version < 5, which vary for three different CACHE_LINE_SIZEs - CheckCondition(hits == 224 || hits == 180 || hits == 125); + // Essentially a fingerprint of Ribbon schema + CheckCondition(hits == 226); } CheckCondition( (keys_to_query - hits) == @@ -1271,6 +1316,9 @@ CheckPinGetCF(db, roptions, handles[1], "box", "c"); rocksdb_writebatch_destroy(wb); + rocksdb_flush_wal(db, 1, &err); + CheckNoError(err); + const char* keys[3] = { "box", "box", "barfooxx" }; const 
rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] }; const size_t keys_sizes[3] = { 3, 3, 8 }; @@ -1296,6 +1344,29 @@ Free(&vals[i]); } + { + unsigned char value_found = 0; + + CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11, + NULL, NULL, NULL, 0, NULL)); + CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11, + &vals[0], &vals_sizes[0], NULL, 0, + &value_found)); + if (value_found) { + Free(&vals[0]); + } + + CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1], + "invalid_key", 11, NULL, NULL, + NULL, 0, NULL)); + CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1], + "invalid_key", 11, &vals[0], + &vals_sizes[0], NULL, 0, NULL)); + if (value_found) { + Free(&vals[0]); + } + } + rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]); CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_seek_to_first(iter); @@ -1461,6 +1532,1079 @@ rocksdb_cuckoo_options_destroy(cuckoo_options); } + StartPhase("options"); + { + rocksdb_options_t* o; + o = rocksdb_options_create(); + + // Set and check options. 
+ rocksdb_options_set_allow_ingest_behind(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o)); + + rocksdb_options_compaction_readahead_size(o, 10); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o)); + + rocksdb_options_set_create_if_missing(o, 1); + CheckCondition(1 == rocksdb_options_get_create_if_missing(o)); + + rocksdb_options_set_create_missing_column_families(o, 1); + CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o)); + + rocksdb_options_set_error_if_exists(o, 1); + CheckCondition(1 == rocksdb_options_get_error_if_exists(o)); + + rocksdb_options_set_paranoid_checks(o, 1); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(o)); + + rocksdb_options_set_info_log_level(o, 3); + CheckCondition(3 == rocksdb_options_get_info_log_level(o)); + + rocksdb_options_set_write_buffer_size(o, 100); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(o)); + + rocksdb_options_set_db_write_buffer_size(o, 1000); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o)); + + rocksdb_options_set_max_open_files(o, 21); + CheckCondition(21 == rocksdb_options_get_max_open_files(o)); + + rocksdb_options_set_max_file_opening_threads(o, 5); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o)); + + rocksdb_options_set_max_total_wal_size(o, 400); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o)); + + rocksdb_options_set_num_levels(o, 7); + CheckCondition(7 == rocksdb_options_get_num_levels(o)); + + rocksdb_options_set_level0_file_num_compaction_trigger(o, 4); + CheckCondition(4 == + rocksdb_options_get_level0_file_num_compaction_trigger(o)); + + rocksdb_options_set_level0_slowdown_writes_trigger(o, 6); + CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o)); + + rocksdb_options_set_level0_stop_writes_trigger(o, 8); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o)); + + rocksdb_options_set_target_file_size_base(o, 
256); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(o)); + + rocksdb_options_set_target_file_size_multiplier(o, 3); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o)); + + rocksdb_options_set_max_bytes_for_level_base(o, 1024); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o)); + + rocksdb_options_set_level_compaction_dynamic_level_bytes(o, 1); + CheckCondition(1 == + rocksdb_options_get_level_compaction_dynamic_level_bytes(o)); + + rocksdb_options_set_max_bytes_for_level_multiplier(o, 2.0); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(o)); + + rocksdb_options_set_skip_stats_update_on_db_open(o, 1); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); + + rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); + + rocksdb_options_set_max_write_buffer_number(o, 97); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); + + rocksdb_options_set_min_write_buffer_number_to_merge(o, 23); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(o)); + + rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64); + CheckCondition(64 == + rocksdb_options_get_max_write_buffer_number_to_maintain(o)); + + rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(o)); + + rocksdb_options_set_enable_pipelined_write(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o)); + + rocksdb_options_set_unordered_write(o, 1); + CheckCondition(1 == rocksdb_options_get_unordered_write(o)); + + rocksdb_options_set_max_subcompactions(o, 123456); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o)); + + rocksdb_options_set_max_background_jobs(o, 2); + CheckCondition(2 == 
rocksdb_options_get_max_background_jobs(o)); + + rocksdb_options_set_max_background_compactions(o, 3); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(o)); + + rocksdb_options_set_base_background_compactions(o, 4); + CheckCondition(4 == rocksdb_options_get_base_background_compactions(o)); + + rocksdb_options_set_max_background_flushes(o, 5); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(o)); + + rocksdb_options_set_max_log_file_size(o, 6); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(o)); + + rocksdb_options_set_log_file_time_to_roll(o, 7); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o)); + + rocksdb_options_set_keep_log_file_num(o, 8); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o)); + + rocksdb_options_set_recycle_log_file_num(o, 9); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o)); + + rocksdb_options_set_soft_rate_limit(o, 2.0); + CheckCondition(2.0 == rocksdb_options_get_soft_rate_limit(o)); + + rocksdb_options_set_hard_rate_limit(o, 4.0); + CheckCondition(4.0 == rocksdb_options_get_hard_rate_limit(o)); + + rocksdb_options_set_soft_pending_compaction_bytes_limit(o, 10); + CheckCondition(10 == + rocksdb_options_get_soft_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_hard_pending_compaction_bytes_limit(o, 11); + CheckCondition(11 == + rocksdb_options_get_hard_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_rate_limit_delay_max_milliseconds(o, 1); + CheckCondition(1 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(o)); + + rocksdb_options_set_max_manifest_file_size(o, 12); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o)); + + rocksdb_options_set_table_cache_numshardbits(o, 13); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o)); + + rocksdb_options_set_arena_block_size(o, 14); + CheckCondition(14 == rocksdb_options_get_arena_block_size(o)); + + rocksdb_options_set_use_fsync(o, 1); 
+ CheckCondition(1 == rocksdb_options_get_use_fsync(o)); + + rocksdb_options_set_WAL_ttl_seconds(o, 15); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o)); + + rocksdb_options_set_WAL_size_limit_MB(o, 16); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o)); + + rocksdb_options_set_manifest_preallocation_size(o, 17); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o)); + + rocksdb_options_set_allow_mmap_reads(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o)); + + rocksdb_options_set_allow_mmap_writes(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o)); + + rocksdb_options_set_use_direct_reads(o, 1); + CheckCondition(1 == rocksdb_options_get_use_direct_reads(o)); + + rocksdb_options_set_use_direct_io_for_flush_and_compaction(o, 1); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o)); + + rocksdb_options_set_is_fd_close_on_exec(o, 1); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o)); + + rocksdb_options_set_skip_log_error_on_recovery(o, 1); + CheckCondition(1 == rocksdb_options_get_skip_log_error_on_recovery(o)); + + rocksdb_options_set_stats_dump_period_sec(o, 18); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o)); + + rocksdb_options_set_stats_persist_period_sec(o, 5); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o)); + + rocksdb_options_set_advise_random_on_open(o, 1); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); + + rocksdb_options_set_access_hint_on_compaction_start(o, 3); + CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); + + rocksdb_options_set_use_adaptive_mutex(o, 1); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); + + rocksdb_options_set_bytes_per_sync(o, 19); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o)); + + rocksdb_options_set_wal_bytes_per_sync(o, 20); + CheckCondition(20 == 
rocksdb_options_get_wal_bytes_per_sync(o)); + + rocksdb_options_set_writable_file_max_buffer_size(o, 21); + CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o)); + + rocksdb_options_set_allow_concurrent_memtable_write(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o)); + + rocksdb_options_set_enable_write_thread_adaptive_yield(o, 1); + CheckCondition(1 == + rocksdb_options_get_enable_write_thread_adaptive_yield(o)); + + rocksdb_options_set_max_sequential_skip_in_iterations(o, 22); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(o)); + + rocksdb_options_set_disable_auto_compactions(o, 1); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o)); + + rocksdb_options_set_optimize_filters_for_hits(o, 1); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o)); + + rocksdb_options_set_delete_obsolete_files_period_micros(o, 23); + CheckCondition(23 == + rocksdb_options_get_delete_obsolete_files_period_micros(o)); + + rocksdb_options_set_memtable_prefix_bloom_size_ratio(o, 2.0); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(o)); + + rocksdb_options_set_max_compaction_bytes(o, 24); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o)); + + rocksdb_options_set_memtable_huge_page_size(o, 25); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o)); + + rocksdb_options_set_max_successive_merges(o, 26); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(o)); + + rocksdb_options_set_bloom_locality(o, 27); + CheckCondition(27 == rocksdb_options_get_bloom_locality(o)); + + rocksdb_options_set_inplace_update_support(o, 1); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(o)); + + rocksdb_options_set_inplace_update_num_locks(o, 28); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o)); + + rocksdb_options_set_report_bg_io_stats(o, 1); + CheckCondition(1 == 
rocksdb_options_get_report_bg_io_stats(o)); + + rocksdb_options_set_wal_recovery_mode(o, 2); + CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o)); + + rocksdb_options_set_compression(o, 5); + CheckCondition(5 == rocksdb_options_get_compression(o)); + + rocksdb_options_set_bottommost_compression(o, 4); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(o)); + + rocksdb_options_set_compaction_style(o, 2); + CheckCondition(2 == rocksdb_options_get_compaction_style(o)); + + rocksdb_options_set_atomic_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + + rocksdb_options_set_manual_wal_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o)); + + /* Blob Options */ + rocksdb_options_set_enable_blob_files(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_files(o)); + + rocksdb_options_set_min_blob_size(o, 29); + CheckCondition(29 == rocksdb_options_get_min_blob_size(o)); + + rocksdb_options_set_blob_file_size(o, 30); + CheckCondition(30 == rocksdb_options_get_blob_file_size(o)); + + rocksdb_options_set_blob_compression_type(o, 4); + CheckCondition(4 == rocksdb_options_get_blob_compression_type(o)); + + rocksdb_options_set_enable_blob_gc(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o)); + + rocksdb_options_set_blob_gc_age_cutoff(o, 0.5); + CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o)); + + rocksdb_options_set_blob_gc_force_threshold(o, 0.75); + CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o)); + + rocksdb_options_set_blob_compaction_readahead_size(o, 262144); + CheckCondition(262144 == + rocksdb_options_get_blob_compaction_readahead_size(o)); + + // Create a copy that should be equal to the original. 
+ rocksdb_options_t* copy; + copy = rocksdb_options_create_copy(o); + + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(copy)); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(copy)); + CheckCondition(1 == rocksdb_options_get_create_if_missing(copy)); + CheckCondition(1 == + rocksdb_options_get_create_missing_column_families(copy)); + CheckCondition(1 == rocksdb_options_get_error_if_exists(copy)); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(copy)); + CheckCondition(3 == rocksdb_options_get_info_log_level(copy)); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(copy)); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(copy)); + CheckCondition(21 == rocksdb_options_get_max_open_files(copy)); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(copy)); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(copy)); + CheckCondition(7 == rocksdb_options_get_num_levels(copy)); + CheckCondition( + 4 == rocksdb_options_get_level0_file_num_compaction_trigger(copy)); + CheckCondition(6 == + rocksdb_options_get_level0_slowdown_writes_trigger(copy)); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(copy)); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(copy)); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(copy)); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(copy)); + CheckCondition( + 1 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy)); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(copy)); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy)); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(copy)); + CheckCondition( + 64 == 
rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy)); + CheckCondition(1 == rocksdb_options_get_unordered_write(copy)); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(copy)); + CheckCondition(2 == rocksdb_options_get_max_background_jobs(copy)); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(copy)); + CheckCondition(4 == rocksdb_options_get_base_background_compactions(copy)); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(copy)); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(copy)); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(copy)); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(copy)); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(copy)); + CheckCondition(2.0 == rocksdb_options_get_soft_rate_limit(copy)); + CheckCondition(4.0 == rocksdb_options_get_hard_rate_limit(copy)); + CheckCondition( + 10 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy)); + CheckCondition( + 11 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy)); + CheckCondition(1 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(copy)); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(copy)); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(copy)); + CheckCondition(14 == rocksdb_options_get_arena_block_size(copy)); + CheckCondition(1 == rocksdb_options_get_use_fsync(copy)); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(copy)); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(copy)); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(copy)); + CheckCondition(1 == 
rocksdb_options_get_use_direct_reads(copy)); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy)); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(copy)); + CheckCondition(1 == rocksdb_options_get_skip_log_error_on_recovery(copy)); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(copy)); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(copy)); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(copy)); + CheckCondition(3 == + rocksdb_options_get_access_hint_on_compaction_start(copy)); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(copy)); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(copy)); + CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(copy)); + CheckCondition(21 == + rocksdb_options_get_writable_file_max_buffer_size(copy)); + CheckCondition(1 == + rocksdb_options_get_allow_concurrent_memtable_write(copy)); + CheckCondition( + 1 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy)); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(copy)); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(copy)); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(copy)); + CheckCondition( + 23 == rocksdb_options_get_delete_obsolete_files_period_micros(copy)); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy)); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(copy)); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(copy)); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(copy)); + CheckCondition(27 == rocksdb_options_get_bloom_locality(copy)); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(copy)); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(copy)); + CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(copy)); + CheckCondition(2 == 
rocksdb_options_get_wal_recovery_mode(copy)); + CheckCondition(5 == rocksdb_options_get_compression(copy)); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(copy)); + CheckCondition(2 == rocksdb_options_get_compaction_style(copy)); + CheckCondition(1 == rocksdb_options_get_atomic_flush(copy)); + + // Copies should be independent. + rocksdb_options_set_allow_ingest_behind(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_ingest_behind(copy)); + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o)); + + rocksdb_options_compaction_readahead_size(copy, 20); + CheckCondition(20 == rocksdb_options_get_compaction_readahead_size(copy)); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o)); + + rocksdb_options_set_create_if_missing(copy, 0); + CheckCondition(0 == rocksdb_options_get_create_if_missing(copy)); + CheckCondition(1 == rocksdb_options_get_create_if_missing(o)); + + rocksdb_options_set_create_missing_column_families(copy, 0); + CheckCondition(0 == + rocksdb_options_get_create_missing_column_families(copy)); + CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o)); + + rocksdb_options_set_error_if_exists(copy, 0); + CheckCondition(0 == rocksdb_options_get_error_if_exists(copy)); + CheckCondition(1 == rocksdb_options_get_error_if_exists(o)); + + rocksdb_options_set_paranoid_checks(copy, 0); + CheckCondition(0 == rocksdb_options_get_paranoid_checks(copy)); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(o)); + + rocksdb_options_set_info_log_level(copy, 2); + CheckCondition(2 == rocksdb_options_get_info_log_level(copy)); + CheckCondition(3 == rocksdb_options_get_info_log_level(o)); + + rocksdb_options_set_write_buffer_size(copy, 200); + CheckCondition(200 == rocksdb_options_get_write_buffer_size(copy)); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(o)); + + rocksdb_options_set_db_write_buffer_size(copy, 2000); + CheckCondition(2000 == 
rocksdb_options_get_db_write_buffer_size(copy)); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o)); + + rocksdb_options_set_max_open_files(copy, 42); + CheckCondition(42 == rocksdb_options_get_max_open_files(copy)); + CheckCondition(21 == rocksdb_options_get_max_open_files(o)); + + rocksdb_options_set_max_file_opening_threads(copy, 3); + CheckCondition(3 == rocksdb_options_get_max_file_opening_threads(copy)); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o)); + + rocksdb_options_set_max_total_wal_size(copy, 4000); + CheckCondition(4000 == rocksdb_options_get_max_total_wal_size(copy)); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o)); + + rocksdb_options_set_num_levels(copy, 6); + CheckCondition(6 == rocksdb_options_get_num_levels(copy)); + CheckCondition(7 == rocksdb_options_get_num_levels(o)); + + rocksdb_options_set_level0_file_num_compaction_trigger(copy, 14); + CheckCondition( + 14 == rocksdb_options_get_level0_file_num_compaction_trigger(copy)); + CheckCondition(4 == + rocksdb_options_get_level0_file_num_compaction_trigger(o)); + + rocksdb_options_set_level0_slowdown_writes_trigger(copy, 61); + CheckCondition(61 == + rocksdb_options_get_level0_slowdown_writes_trigger(copy)); + CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o)); + + rocksdb_options_set_level0_stop_writes_trigger(copy, 17); + CheckCondition(17 == rocksdb_options_get_level0_stop_writes_trigger(copy)); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o)); + + rocksdb_options_set_target_file_size_base(copy, 128); + CheckCondition(128 == rocksdb_options_get_target_file_size_base(copy)); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(o)); + + rocksdb_options_set_target_file_size_multiplier(copy, 13); + CheckCondition(13 == rocksdb_options_get_target_file_size_multiplier(copy)); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o)); + + 
rocksdb_options_set_max_bytes_for_level_base(copy, 900); + CheckCondition(900 == rocksdb_options_get_max_bytes_for_level_base(copy)); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o)); + + rocksdb_options_set_level_compaction_dynamic_level_bytes(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy)); + CheckCondition(1 == + rocksdb_options_get_level_compaction_dynamic_level_bytes(o)); + + rocksdb_options_set_max_bytes_for_level_multiplier(copy, 8.0); + CheckCondition(8.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(copy)); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(o)); + + rocksdb_options_set_skip_stats_update_on_db_open(copy, 0); + CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); + + rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); + + rocksdb_options_set_max_write_buffer_number(copy, 2000); + CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy)); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); + + rocksdb_options_set_min_write_buffer_number_to_merge(copy, 146); + CheckCondition(146 == + rocksdb_options_get_min_write_buffer_number_to_merge(copy)); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(o)); + + rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128); + CheckCondition( + 128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); + CheckCondition(64 == + rocksdb_options_get_max_write_buffer_number_to_maintain(o)); + + rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000); + CheckCondition(9000 == + 
rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(o)); + + rocksdb_options_set_enable_pipelined_write(copy, 0); + CheckCondition(0 == rocksdb_options_get_enable_pipelined_write(copy)); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o)); + + rocksdb_options_set_unordered_write(copy, 0); + CheckCondition(0 == rocksdb_options_get_unordered_write(copy)); + CheckCondition(1 == rocksdb_options_get_unordered_write(o)); + + rocksdb_options_set_max_subcompactions(copy, 90001); + CheckCondition(90001 == rocksdb_options_get_max_subcompactions(copy)); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o)); + + rocksdb_options_set_max_background_jobs(copy, 12); + CheckCondition(12 == rocksdb_options_get_max_background_jobs(copy)); + CheckCondition(2 == rocksdb_options_get_max_background_jobs(o)); + + rocksdb_options_set_max_background_compactions(copy, 13); + CheckCondition(13 == rocksdb_options_get_max_background_compactions(copy)); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(o)); + + rocksdb_options_set_base_background_compactions(copy, 14); + CheckCondition(14 == rocksdb_options_get_base_background_compactions(copy)); + CheckCondition(4 == rocksdb_options_get_base_background_compactions(o)); + + rocksdb_options_set_max_background_flushes(copy, 15); + CheckCondition(15 == rocksdb_options_get_max_background_flushes(copy)); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(o)); + + rocksdb_options_set_max_log_file_size(copy, 16); + CheckCondition(16 == rocksdb_options_get_max_log_file_size(copy)); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(o)); + + rocksdb_options_set_log_file_time_to_roll(copy, 17); + CheckCondition(17 == rocksdb_options_get_log_file_time_to_roll(copy)); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o)); + + rocksdb_options_set_keep_log_file_num(copy, 18); + 
CheckCondition(18 == rocksdb_options_get_keep_log_file_num(copy)); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o)); + + rocksdb_options_set_recycle_log_file_num(copy, 19); + CheckCondition(19 == rocksdb_options_get_recycle_log_file_num(copy)); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o)); + + rocksdb_options_set_soft_rate_limit(copy, 4.0); + CheckCondition(4.0 == rocksdb_options_get_soft_rate_limit(copy)); + CheckCondition(2.0 == rocksdb_options_get_soft_rate_limit(o)); + + rocksdb_options_set_hard_rate_limit(copy, 2.0); + CheckCondition(2.0 == rocksdb_options_get_hard_rate_limit(copy)); + CheckCondition(4.0 == rocksdb_options_get_hard_rate_limit(o)); + + rocksdb_options_set_soft_pending_compaction_bytes_limit(copy, 110); + CheckCondition( + 110 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy)); + CheckCondition(10 == + rocksdb_options_get_soft_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_hard_pending_compaction_bytes_limit(copy, 111); + CheckCondition( + 111 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy)); + CheckCondition(11 == + rocksdb_options_get_hard_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_rate_limit_delay_max_milliseconds(copy, 0); + CheckCondition(0 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(copy)); + CheckCondition(1 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(o)); + + rocksdb_options_set_max_manifest_file_size(copy, 112); + CheckCondition(112 == rocksdb_options_get_max_manifest_file_size(copy)); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o)); + + rocksdb_options_set_table_cache_numshardbits(copy, 113); + CheckCondition(113 == rocksdb_options_get_table_cache_numshardbits(copy)); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o)); + + rocksdb_options_set_arena_block_size(copy, 114); + CheckCondition(114 == rocksdb_options_get_arena_block_size(copy)); + CheckCondition(14 == 
rocksdb_options_get_arena_block_size(o)); + + rocksdb_options_set_use_fsync(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_fsync(copy)); + CheckCondition(1 == rocksdb_options_get_use_fsync(o)); + + rocksdb_options_set_WAL_ttl_seconds(copy, 115); + CheckCondition(115 == rocksdb_options_get_WAL_ttl_seconds(copy)); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o)); + + rocksdb_options_set_WAL_size_limit_MB(copy, 116); + CheckCondition(116 == rocksdb_options_get_WAL_size_limit_MB(copy)); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o)); + + rocksdb_options_set_manifest_preallocation_size(copy, 117); + CheckCondition(117 == + rocksdb_options_get_manifest_preallocation_size(copy)); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o)); + + rocksdb_options_set_allow_mmap_reads(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_mmap_reads(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o)); + + rocksdb_options_set_allow_mmap_writes(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_mmap_writes(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o)); + + rocksdb_options_set_use_direct_reads(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_direct_reads(copy)); + CheckCondition(1 == rocksdb_options_get_use_direct_reads(o)); + + rocksdb_options_set_use_direct_io_for_flush_and_compaction(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy)); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o)); + + rocksdb_options_set_is_fd_close_on_exec(copy, 0); + CheckCondition(0 == rocksdb_options_get_is_fd_close_on_exec(copy)); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o)); + + rocksdb_options_set_skip_log_error_on_recovery(copy, 0); + CheckCondition(0 == rocksdb_options_get_skip_log_error_on_recovery(copy)); + CheckCondition(1 == 
rocksdb_options_get_skip_log_error_on_recovery(o)); + + rocksdb_options_set_stats_dump_period_sec(copy, 218); + CheckCondition(218 == rocksdb_options_get_stats_dump_period_sec(copy)); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o)); + + rocksdb_options_set_stats_persist_period_sec(copy, 600); + CheckCondition(600 == rocksdb_options_get_stats_persist_period_sec(copy)); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o)); + + rocksdb_options_set_advise_random_on_open(copy, 0); + CheckCondition(0 == rocksdb_options_get_advise_random_on_open(copy)); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); + + rocksdb_options_set_access_hint_on_compaction_start(copy, 2); + CheckCondition(2 == + rocksdb_options_get_access_hint_on_compaction_start(copy)); + CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); + + rocksdb_options_set_use_adaptive_mutex(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_adaptive_mutex(copy)); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); + + rocksdb_options_set_bytes_per_sync(copy, 219); + CheckCondition(219 == rocksdb_options_get_bytes_per_sync(copy)); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o)); + + rocksdb_options_set_wal_bytes_per_sync(copy, 120); + CheckCondition(120 == rocksdb_options_get_wal_bytes_per_sync(copy)); + CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o)); + + rocksdb_options_set_writable_file_max_buffer_size(copy, 121); + CheckCondition(121 == + rocksdb_options_get_writable_file_max_buffer_size(copy)); + CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o)); + + rocksdb_options_set_allow_concurrent_memtable_write(copy, 0); + CheckCondition(0 == + rocksdb_options_get_allow_concurrent_memtable_write(copy)); + CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o)); + + rocksdb_options_set_enable_write_thread_adaptive_yield(copy, 0); + CheckCondition( + 0 
== rocksdb_options_get_enable_write_thread_adaptive_yield(copy)); + CheckCondition(1 == + rocksdb_options_get_enable_write_thread_adaptive_yield(o)); + + rocksdb_options_set_max_sequential_skip_in_iterations(copy, 122); + CheckCondition(122 == + rocksdb_options_get_max_sequential_skip_in_iterations(copy)); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(o)); + + rocksdb_options_set_disable_auto_compactions(copy, 0); + CheckCondition(0 == rocksdb_options_get_disable_auto_compactions(copy)); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o)); + + rocksdb_options_set_optimize_filters_for_hits(copy, 0); + CheckCondition(0 == rocksdb_options_get_optimize_filters_for_hits(copy)); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o)); + + rocksdb_options_set_delete_obsolete_files_period_micros(copy, 123); + CheckCondition( + 123 == rocksdb_options_get_delete_obsolete_files_period_micros(copy)); + CheckCondition(23 == + rocksdb_options_get_delete_obsolete_files_period_micros(o)); + + rocksdb_options_set_memtable_prefix_bloom_size_ratio(copy, 4.0); + CheckCondition(4.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy)); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(o)); + + rocksdb_options_set_max_compaction_bytes(copy, 124); + CheckCondition(124 == rocksdb_options_get_max_compaction_bytes(copy)); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o)); + + rocksdb_options_set_memtable_huge_page_size(copy, 125); + CheckCondition(125 == rocksdb_options_get_memtable_huge_page_size(copy)); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o)); + + rocksdb_options_set_max_successive_merges(copy, 126); + CheckCondition(126 == rocksdb_options_get_max_successive_merges(copy)); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(o)); + + rocksdb_options_set_bloom_locality(copy, 127); + CheckCondition(127 == 
rocksdb_options_get_bloom_locality(copy)); + CheckCondition(27 == rocksdb_options_get_bloom_locality(o)); + + rocksdb_options_set_inplace_update_support(copy, 0); + CheckCondition(0 == rocksdb_options_get_inplace_update_support(copy)); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(o)); + + rocksdb_options_set_inplace_update_num_locks(copy, 128); + CheckCondition(128 == rocksdb_options_get_inplace_update_num_locks(copy)); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o)); + + rocksdb_options_set_report_bg_io_stats(copy, 0); + CheckCondition(0 == rocksdb_options_get_report_bg_io_stats(copy)); + CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o)); + + rocksdb_options_set_wal_recovery_mode(copy, 1); + CheckCondition(1 == rocksdb_options_get_wal_recovery_mode(copy)); + CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o)); + + rocksdb_options_set_compression(copy, 4); + CheckCondition(4 == rocksdb_options_get_compression(copy)); + CheckCondition(5 == rocksdb_options_get_compression(o)); + + rocksdb_options_set_bottommost_compression(copy, 3); + CheckCondition(3 == rocksdb_options_get_bottommost_compression(copy)); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(o)); + + rocksdb_options_set_compaction_style(copy, 1); + CheckCondition(1 == rocksdb_options_get_compaction_style(copy)); + CheckCondition(2 == rocksdb_options_get_compaction_style(o)); + + rocksdb_options_set_atomic_flush(copy, 0); + CheckCondition(0 == rocksdb_options_get_atomic_flush(copy)); + CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + + rocksdb_options_destroy(copy); + rocksdb_options_destroy(o); + } + + StartPhase("read_options"); + { + rocksdb_readoptions_t* ro; + ro = rocksdb_readoptions_create(); + + rocksdb_readoptions_set_verify_checksums(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_verify_checksums(ro)); + + rocksdb_readoptions_set_fill_cache(ro, 1); + CheckCondition(1 == 
rocksdb_readoptions_get_fill_cache(ro)); + + rocksdb_readoptions_set_read_tier(ro, 2); + CheckCondition(2 == rocksdb_readoptions_get_read_tier(ro)); + + rocksdb_readoptions_set_tailing(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_tailing(ro)); + + rocksdb_readoptions_set_readahead_size(ro, 100); + CheckCondition(100 == rocksdb_readoptions_get_readahead_size(ro)); + + rocksdb_readoptions_set_prefix_same_as_start(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_prefix_same_as_start(ro)); + + rocksdb_readoptions_set_pin_data(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_pin_data(ro)); + + rocksdb_readoptions_set_total_order_seek(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_total_order_seek(ro)); + + rocksdb_readoptions_set_max_skippable_internal_keys(ro, 200); + CheckCondition(200 == + rocksdb_readoptions_get_max_skippable_internal_keys(ro)); + + rocksdb_readoptions_set_background_purge_on_iterator_cleanup(ro, 1); + CheckCondition( + 1 == rocksdb_readoptions_get_background_purge_on_iterator_cleanup(ro)); + + rocksdb_readoptions_set_ignore_range_deletions(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro)); + + rocksdb_readoptions_set_deadline(ro, 300); + CheckCondition(300 == rocksdb_readoptions_get_deadline(ro)); + + rocksdb_readoptions_set_io_timeout(ro, 400); + CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro)); + + rocksdb_readoptions_destroy(ro); + } + + StartPhase("write_options"); + { + rocksdb_writeoptions_t* wo; + wo = rocksdb_writeoptions_create(); + + rocksdb_writeoptions_set_sync(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_sync(wo)); + + rocksdb_writeoptions_disable_WAL(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_disable_WAL(wo)); + + rocksdb_writeoptions_set_ignore_missing_column_families(wo, 1); + CheckCondition(1 == + rocksdb_writeoptions_get_ignore_missing_column_families(wo)); + + rocksdb_writeoptions_set_no_slowdown(wo, 1); + CheckCondition(1 == 
rocksdb_writeoptions_get_no_slowdown(wo)); + + rocksdb_writeoptions_set_low_pri(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_low_pri(wo)); + + rocksdb_writeoptions_set_memtable_insert_hint_per_batch(wo, 1); + CheckCondition(1 == + rocksdb_writeoptions_get_memtable_insert_hint_per_batch(wo)); + + rocksdb_writeoptions_destroy(wo); + } + + StartPhase("compact_options"); + { + rocksdb_compactoptions_t* co; + co = rocksdb_compactoptions_create(); + + rocksdb_compactoptions_set_exclusive_manual_compaction(co, 1); + CheckCondition(1 == + rocksdb_compactoptions_get_exclusive_manual_compaction(co)); + + rocksdb_compactoptions_set_bottommost_level_compaction(co, 1); + CheckCondition(1 == + rocksdb_compactoptions_get_bottommost_level_compaction(co)); + + rocksdb_compactoptions_set_change_level(co, 1); + CheckCondition(1 == rocksdb_compactoptions_get_change_level(co)); + + rocksdb_compactoptions_set_target_level(co, 1); + CheckCondition(1 == rocksdb_compactoptions_get_target_level(co)); + + rocksdb_compactoptions_destroy(co); + } + + StartPhase("flush_options"); + { + rocksdb_flushoptions_t* fo; + fo = rocksdb_flushoptions_create(); + + rocksdb_flushoptions_set_wait(fo, 1); + CheckCondition(1 == rocksdb_flushoptions_get_wait(fo)); + + rocksdb_flushoptions_destroy(fo); + } + + StartPhase("cache_options"); + { + rocksdb_cache_t* co; + co = rocksdb_cache_create_lru(100); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_set_capacity(co, 200); + CheckCondition(200 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + } + + StartPhase("jemalloc_nodump_allocator"); + { + rocksdb_memory_allocator_t* allocator; + allocator = rocksdb_jemalloc_nodump_allocator_create(&err); + if (err != NULL) { + // not supported on all platforms, allow unsupported error + const char* ni = "Not implemented: "; + size_t ni_len = strlen(ni); + size_t err_len = strlen(err); + + CheckCondition(err_len >= ni_len); + CheckCondition(memcmp(ni, err, ni_len) == 
0); + Free(&err); + } else { + rocksdb_cache_t* co; + rocksdb_lru_cache_options_t* copts; + + copts = rocksdb_lru_cache_options_create(); + + rocksdb_lru_cache_options_set_capacity(copts, 100); + rocksdb_lru_cache_options_set_memory_allocator(copts, allocator); + + co = rocksdb_cache_create_lru_opts(copts); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + rocksdb_lru_cache_options_destroy(copts); + } + rocksdb_memory_allocator_destroy(allocator); + } + + StartPhase("env"); + { + rocksdb_env_t* e; + e = rocksdb_create_default_env(); + + rocksdb_env_set_background_threads(e, 10); + CheckCondition(10 == rocksdb_env_get_background_threads(e)); + + rocksdb_env_set_high_priority_background_threads(e, 20); + CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e)); + + rocksdb_env_set_low_priority_background_threads(e, 30); + CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e)); + + rocksdb_env_set_bottom_priority_background_threads(e, 40); + CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e)); + + rocksdb_env_destroy(e); + } + + StartPhase("universal_compaction_options"); + { + rocksdb_universal_compaction_options_t* uco; + uco = rocksdb_universal_compaction_options_create(); + + rocksdb_universal_compaction_options_set_size_ratio(uco, 5); + CheckCondition(5 == + rocksdb_universal_compaction_options_get_size_ratio(uco)); + + rocksdb_universal_compaction_options_set_min_merge_width(uco, 15); + CheckCondition( + 15 == rocksdb_universal_compaction_options_get_min_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_merge_width(uco, 25); + CheckCondition( + 25 == rocksdb_universal_compaction_options_get_max_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco, + 35); + CheckCondition( + 35 == + rocksdb_universal_compaction_options_get_max_size_amplification_percent( + uco)); + + 
rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45); + CheckCondition( + 45 == + rocksdb_universal_compaction_options_get_compression_size_percent(uco)); + + rocksdb_universal_compaction_options_set_stop_style(uco, 1); + CheckCondition(1 == + rocksdb_universal_compaction_options_get_stop_style(uco)); + + rocksdb_universal_compaction_options_destroy(uco); + } + + StartPhase("fifo_compaction_options"); + { + rocksdb_fifo_compaction_options_t* fco; + fco = rocksdb_fifo_compaction_options_create(); + + rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000); + CheckCondition( + 100000 == + rocksdb_fifo_compaction_options_get_max_table_files_size(fco)); + + rocksdb_fifo_compaction_options_destroy(fco); + } + + StartPhase("backupable_db_option"); + { + rocksdb_backupable_db_options_t* bdo; + bdo = rocksdb_backupable_db_options_create("path"); + + rocksdb_backupable_db_options_set_share_table_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_share_table_files(bdo)); + + rocksdb_backupable_db_options_set_sync(bdo, 1); + CheckCondition(1 == rocksdb_backupable_db_options_get_sync(bdo)); + + rocksdb_backupable_db_options_set_destroy_old_data(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_destroy_old_data(bdo)); + + rocksdb_backupable_db_options_set_backup_log_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_backup_log_files(bdo)); + + rocksdb_backupable_db_options_set_backup_rate_limit(bdo, 123); + CheckCondition(123 == + rocksdb_backupable_db_options_get_backup_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_restore_rate_limit(bdo, 37); + CheckCondition(37 == + rocksdb_backupable_db_options_get_restore_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_max_background_operations(bdo, 20); + CheckCondition( + 20 == rocksdb_backupable_db_options_get_max_background_operations(bdo)); + + rocksdb_backupable_db_options_set_callback_trigger_interval_size(bdo, 9000); + 
CheckCondition( + 9000 == + rocksdb_backupable_db_options_get_callback_trigger_interval_size(bdo)); + + rocksdb_backupable_db_options_set_max_valid_backups_to_open(bdo, 40); + CheckCondition( + 40 == rocksdb_backupable_db_options_get_max_valid_backups_to_open(bdo)); + + rocksdb_backupable_db_options_set_share_files_with_checksum_naming(bdo, 2); + CheckCondition( + 2 == rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + bdo)); + + rocksdb_backupable_db_options_destroy(bdo); + } + + StartPhase("compression_options"); + { + rocksdb_options_t* co; + co = rocksdb_options_create(); + + rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100); + CheckCondition( + 100 == + rocksdb_options_get_compression_options_zstd_max_train_bytes(co)); + + rocksdb_options_set_compression_options_parallel_threads(co, 2); + CheckCondition( + 2 == rocksdb_options_get_compression_options_parallel_threads(co)); + + rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200); + CheckCondition( + 200 == + rocksdb_options_get_compression_options_max_dict_buffer_bytes(co)); + + rocksdb_options_destroy(co); + } + StartPhase("iterate_upper_bound"); { // Create new empty database @@ -1840,6 +2984,54 @@ CheckNoError(err); } + StartPhase("filter_with_prefix_seek"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_prefix_extractor( + options, rocksdb_slicetransform_create_fixed_prefix(1)); + rocksdb_filterpolicy_t* filter_policy = + rocksdb_filterpolicy_create_bloom_full(8.0); + rocksdb_block_based_options_set_filter_policy(table_options, filter_policy); + rocksdb_options_set_block_based_table_factory(options, table_options); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + int i; + for (i = 0; i < 10; ++i) { + char key = '0' + (char)i; + rocksdb_put(db, woptions, &key, 1, "", 1, &err); + CheckNoError(err); + } + + // Flush to generate an L0 so that filter will be used 
later. + rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_options, 1); + rocksdb_flush(db, flush_options, &err); + rocksdb_flushoptions_destroy(flush_options); + CheckNoError(err); + + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, ropts); + + rocksdb_iter_seek(iter, "0", 1); + int cnt = 0; + while (rocksdb_iter_valid(iter)) { + ++cnt; + rocksdb_iter_next(iter); + } + CheckCondition(10 == cnt); + + rocksdb_iter_destroy(iter); + rocksdb_readoptions_destroy(ropts); + } + + StartPhase("cancel_all_background_work"); + rocksdb_cancel_all_background_work(db, 1); + StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); @@ -1858,7 +3050,7 @@ #else -int main() { +int main(void) { fprintf(stderr, "SKIPPED\n"); return 0; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,9 +12,11 @@ #include #include #include +#include #include #include +#include "db/blob/blob_file_cache.h" #include "db/compaction/compaction_picker.h" #include "db/compaction/compaction_picker_fifo.h" #include "db/compaction/compaction_picker_level.h" @@ -27,13 +29,15 @@ #include "db/version_set.h" #include "db/write_controller.h" #include "file/sst_file_manager_impl.h" -#include "memtable/hash_skiplist_rep.h" +#include "logging/logging.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "port/port.h" -#include "table/block_based/block_based_table_factory.h" +#include "rocksdb/convenience.h" +#include "rocksdb/table.h" #include "table/merging_iterator.h" #include "util/autovector.h" +#include "util/cast_util.h" #include 
"util/compression.h" namespace ROCKSDB_NAMESPACE { @@ -71,11 +75,6 @@ bool defer_purge = db_->immutable_db_options().avoid_unnecessary_blocking_io; db_->PurgeObsoleteFiles(job_context, defer_purge); - if (defer_purge) { - mutex_->Lock(); - db_->SchedulePurge(); - mutex_->Unlock(); - } } job_context.Clean(); } @@ -105,8 +104,9 @@ void GetIntTblPropCollectorFactory( const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories) { + IntTblPropCollectorFactories* int_tbl_prop_collector_factories) { + assert(int_tbl_prop_collector_factories); + auto& collector_factories = ioptions.table_properties_collector_factories; for (size_t i = 0; i < ioptions.table_properties_collector_factories.size(); ++i) { @@ -147,6 +147,16 @@ "should be nonzero if we're using zstd's dictionary generator."); } } + + if (!CompressionTypeSupported(cf_options.blob_compression_type)) { + std::ostringstream oss; + oss << "The specified blob compression type " + << CompressionTypeToString(cf_options.blob_compression_type) + << " is not available."; + + return Status::InvalidArgument(oss.str()); + } + return Status::OK(); } @@ -188,7 +198,7 @@ namespace { const uint64_t kDefaultTtl = 0xfffffffffffffffe; const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; -}; // namespace +} // namespace ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src) { @@ -196,11 +206,13 @@ size_t clamp_max = std::conditional< sizeof(size_t) == 4, std::integral_constant, std::integral_constant>::type::value; - ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, clamp_max); + ClipToRange(&result.write_buffer_size, (static_cast(64)) << 10, + clamp_max); // if user sets arena_block_size, we trust user to use this value. 
Otherwise, // calculate a proper value from writer_buffer_size; if (result.arena_block_size <= 0) { - result.arena_block_size = result.write_buffer_size / 8; + result.arena_block_size = + std::min(size_t{1024 * 1024}, result.write_buffer_size / 8); // Align up to 4k const size_t align = 4 * 1024; @@ -269,7 +281,7 @@ } if (result.level0_file_num_compaction_trigger == 0) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "level0_file_num_compaction_trigger cannot be 0"); result.level0_file_num_compaction_trigger = 1; } @@ -278,7 +290,7 @@ result.level0_slowdown_writes_trigger || result.level0_slowdown_writes_trigger < result.level0_file_num_compaction_trigger) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "This condition must be satisfied: " "level0_stop_writes_trigger(%d) >= " "level0_slowdown_writes_trigger(%d) >= " @@ -295,7 +307,7 @@ result.level0_slowdown_writes_trigger) { result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger; } - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "Adjust the value to " "level0_stop_writes_trigger(%d)" "level0_slowdown_writes_trigger(%d)" @@ -322,7 +334,9 @@ // was not used) auto sfm = static_cast(db_options.sst_file_manager.get()); for (size_t i = 0; i < result.cf_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path); + DeleteScheduler::CleanupDirectory(db_options.env, sfm, + result.cf_paths[i].path) + .PermitUncheckedError(); } #endif @@ -331,12 +345,18 @@ } if (result.level_compaction_dynamic_level_bytes) { - if (result.compaction_style != kCompactionStyleLevel || - result.cf_paths.size() > 1U) { - // 1. level_compaction_dynamic_level_bytes only makes sense for - // level-based compaction. - // 2. we don't yet know how to make both of this feature and multiple - // DB path work. 
+ if (result.compaction_style != kCompactionStyleLevel) { + ROCKS_LOG_WARN(db_options.info_log.get(), + "level_compaction_dynamic_level_bytes only makes sense" + "for level-based compaction"); + result.level_compaction_dynamic_level_bytes = false; + } else if (result.cf_paths.size() > 1U) { + // we don't yet know how to make both of this feature and multiple + // DB path work. + ROCKS_LOG_WARN(db_options.info_log.get(), + "multiple cf_paths/db_paths and" + "level_compaction_dynamic_level_bytes" + "can't be used together"); result.level_compaction_dynamic_level_bytes = false; } } @@ -345,8 +365,8 @@ result.max_compaction_bytes = result.target_file_size_base * 25; } - bool is_block_based_table = - (result.table_factory->Name() == BlockBasedTableFactory().Name()); + bool is_block_based_table = (result.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())); const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; if (result.ttl == kDefaultTtl) { @@ -427,6 +447,9 @@ void SuperVersion::Cleanup() { assert(refs.load(std::memory_order_relaxed) == 0); + // Since this SuperVersion object is being deleted, + // decrement reference to the immutable MemtableList + // this SV object was pointing to. imm->Unref(&to_delete); MemTable* m = mem->Unref(); if (m != nullptr) { @@ -436,9 +459,7 @@ to_delete.push_back(m); } current->Unref(); - if (cfd->Unref()) { - delete cfd; - } + cfd->UnrefAndTryDelete(); } void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, @@ -456,10 +477,10 @@ namespace { void SuperVersionUnrefHandle(void* ptr) { - // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets - // destroyed. When former happens, the thread shouldn't see kSVInUse. - // When latter happens, we are in ~ColumnFamilyData(), no get should happen as - // well. + // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets + // destroyed. When the former happens, the thread shouldn't see kSVInUse. 
+ // When the latter happens, only super_version_ holds a reference + // to ColumnFamilyData, so no further queries are possible. SuperVersion* sv = static_cast(ptr); bool was_last_ref __attribute__((__unused__)); was_last_ref = sv->Unref(); @@ -471,12 +492,25 @@ } } // anonymous namespace +std::vector ColumnFamilyData::GetDbPaths() const { + std::vector paths; + paths.reserve(ioptions_.cf_paths.size()); + for (const DbPath& db_path : ioptions_.cf_paths) { + paths.emplace_back(db_path.path); + } + return paths; +} + +const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId = port::kMaxUint32; + ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, - const FileOptions& file_options, ColumnFamilySet* column_family_set, - BlockCacheTracer* const block_cache_tracer) + const FileOptions* file_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -507,7 +541,23 @@ queued_for_compaction_(false), prev_compaction_needed_bytes_(0), allow_2pc_(db_options.allow_2pc), - last_memtable_id_(0) { + last_memtable_id_(0), + db_paths_registered_(false) { + if (id_ != kDummyColumnFamilyDataId) { + // TODO(cc): RegisterDbPaths can be expensive, considering moving it + // outside of this constructor which might be called with db mutex held. + // TODO(cc): considering using ioptions_.fs, currently some tests rely on + // EnvWrapper, that's the main reason why we use env here. 
+ Status s = ioptions_.env->RegisterDbPaths(GetDbPaths()); + if (s.ok()) { + db_paths_registered_ = true; + } else { + ROCKS_LOG_ERROR( + ioptions_.logger, + "Failed to register data paths of column family (id: %d, name: %s)", + id_, name_.c_str()); + } + } Ref(); // Convert user defined table properties collector factories to internal ones. @@ -516,9 +566,14 @@ // if _dummy_versions is nullptr, then this is a dummy column family. if (_dummy_versions != nullptr) { internal_stats_.reset( - new InternalStats(ioptions_.num_levels, db_options.env, this)); + new InternalStats(ioptions_.num_levels, ioptions_.clock, this)); table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache, - block_cache_tracer)); + block_cache_tracer, io_tracer, + db_session_id)); + blob_file_cache_.reset( + new BlobFileCache(_table_cache, ioptions(), soptions(), id_, + internal_stats_->GetBlobFileReadHist(), io_tracer)); + if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -532,13 +587,13 @@ } else if (ioptions_.compaction_style == kCompactionStyleNone) { compaction_picker_.reset(new NullCompactionPicker( ioptions_, &internal_comparator_)); - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "Column family %s does not use any background compaction. " "Compactions can only be done via CompactFiles\n", GetName().c_str()); #endif // !ROCKSDB_LITE } else { - ROCKS_LOG_ERROR(ioptions_.info_log, + ROCKS_LOG_ERROR(ioptions_.logger, "Unable to recognize the specified compaction style %d. 
" "Column family %s will use kCompactionStyleLevel.\n", ioptions_.compaction_style, GetName().c_str()); @@ -547,12 +602,12 @@ } if (column_family_set_->NumberOfColumnFamilies() < 10) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "--------------- Options for column family [%s]:\n", name.c_str()); - initial_cf_options_.Dump(ioptions_.info_log); + initial_cf_options_.Dump(ioptions_.logger); } else { - ROCKS_LOG_INFO(ioptions_.info_log, "\t(skipping printing options)\n"); + ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n"); } } @@ -587,7 +642,7 @@ if (dummy_versions_ != nullptr) { // List must be empty - assert(dummy_versions_->TEST_Next() == dummy_versions_); + assert(dummy_versions_->Next() == dummy_versions_); bool deleted __attribute__((__unused__)); deleted = dummy_versions_->Unref(); assert(deleted); @@ -601,6 +656,18 @@ for (MemTable* m : to_delete) { delete m; } + + if (db_paths_registered_) { + // TODO(cc): considering using ioptions_.fs, currently some tests rely on + // EnvWrapper, that's the main reason why we use env here. + Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths()); + if (!s.ok()) { + ROCKS_LOG_ERROR( + ioptions_.logger, + "Failed to unregister data paths of column family (id: %d, name: %s)", + id_, name_.c_str()); + } + } } bool ColumnFamilyData::UnrefAndTryDelete() { @@ -617,14 +684,13 @@ // Only the super_version_ holds me SuperVersion* sv = super_version_; super_version_ = nullptr; - // Release SuperVersion reference kept in ThreadLocalPtr. - // This must be done outside of mutex_ since unref handler can lock mutex. - sv->db_mutex->Unlock(); + + // Release SuperVersion references kept in ThreadLocalPtr. 
local_sv_.reset(); - sv->db_mutex->Lock(); if (sv->Unref()) { - // May delete this ColumnFamilyData after calling Cleanup() + // Note: sv will delete this ColumnFamilyData during Cleanup() + assert(sv->cfd == this); sv->Cleanup(); delete sv; return true; @@ -651,9 +717,7 @@ auto current_log = GetLogNumber(); if (allow_2pc_) { - autovector empty_list; - auto imm_prep_log = - imm()->PrecomputeMinLogContainingPrepSection(empty_list); + auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection(); auto mem_prep_log = mem()->GetMinLogContainingPrepSection(); if (imm_prep_log > 0 && imm_prep_log < current_log) { @@ -775,7 +839,8 @@ ColumnFamilyData::GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options) { if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) { return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && @@ -789,7 +854,9 @@ WriteStallCause::kPendingCompactionBytes}; } else if (mutable_cf_options.max_write_buffer_number > 3 && num_unflushed_memtables >= - mutable_cf_options.max_write_buffer_number - 1) { + mutable_cf_options.max_write_buffer_number - 1 && + num_unflushed_memtables - 1 >= + immutable_cf_options.min_write_buffer_number_to_merge) { return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && mutable_cf_options.level0_slowdown_writes_trigger >= 0 && @@ -817,7 +884,8 @@ auto write_stall_condition_and_cause = GetWriteStallConditionAndCause( imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(), - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options); + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options, + *ioptions()); write_stall_condition = 
write_stall_condition_and_cause.first; auto write_stall_cause = write_stall_condition_and_cause.second; @@ -829,7 +897,7 @@ write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), @@ -842,7 +910,7 @@ internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); } else if (write_stall_condition == WriteStallCondition::kStopped && @@ -851,7 +919,7 @@ internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because of estimated pending compaction " "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); @@ -863,7 +931,7 @@ mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d " "rate %" PRIu64, @@ -885,7 +953,7 @@ internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " "rate %" PRIu64, name_.c_str(), vstorage->l0_delay_trigger_count(), @@ -910,7 +978,7 @@ internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " "bytes %" 
PRIu64 " rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), @@ -924,7 +992,7 @@ write_controller_token_ = write_controller->GetCompactionPressureToken(); ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because we have %d level-0 " "files ", name_.c_str(), vstorage->l0_delay_trigger_count()); @@ -938,7 +1006,7 @@ write_controller->GetCompactionPressureToken(); if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because of estimated pending " "compaction " "bytes %" PRIu64, @@ -983,6 +1051,10 @@ return VersionSet::GetTotalSstFilesSize(dummy_versions_); } +uint64_t ColumnFamilyData::GetTotalBlobFileSize() const { + return VersionSet::GetTotalBlobFileSize(dummy_versions_); +} + uint64_t ColumnFamilyData::GetLiveSstFilesSize() const { return current_->GetSstFilesSize(); } @@ -1003,17 +1075,19 @@ } bool ColumnFamilyData::NeedsCompaction() const { - return compaction_picker_->NeedsCompaction(current_->storage_info()); + return !mutable_cf_options_.disable_auto_compactions && + compaction_picker_->NeedsCompaction(current_->storage_info()); } Compaction* ColumnFamilyData::PickCompaction( - const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { + const MutableCFOptions& mutable_options, + const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) { SequenceNumber earliest_mem_seqno = std::min(mem_->GetEarliestSequenceNumber(), imm_.current()->GetEarliestSequenceNumber(false)); auto* result = compaction_picker_->PickCompaction( - GetName(), mutable_options, current_->storage_info(), log_buffer, - earliest_mem_seqno); + GetName(), mutable_options, mutable_db_options, current_->storage_info(), + log_buffer, earliest_mem_seqno); if (result != nullptr) { result->SetInputVersion(current_); } @@ -1029,7 +1103,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( const autovector& 
ranges, SuperVersion* super_version, - bool* overlap) { + bool allow_data_in_errors, bool* overlap) { assert(overlap != nullptr); *overlap = false; // Create an InternalIterator over all unflushed memtables @@ -1048,10 +1122,12 @@ super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq); range_del_agg.AddTombstones( std::unique_ptr(active_range_del_iter)); - super_version->imm->AddRangeTombstoneIterators(read_opts, nullptr /* arena */, - &range_del_agg); - Status status; + status = super_version->imm->AddRangeTombstoneIterators( + read_opts, nullptr /* arena */, &range_del_agg); + // AddRangeTombstoneIterators always return Status::OK. + assert(status.ok()); + for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) { auto* vstorage = super_version->current->storage_info(); auto* ucmp = vstorage->InternalComparator()->user_comparator(); @@ -1060,12 +1136,12 @@ memtable_iter->Seek(range_start.Encode()); status = memtable_iter->status(); ParsedInternalKey seek_result; - if (status.ok()) { - if (memtable_iter->Valid() && - !ParseInternalKey(memtable_iter->key(), &seek_result)) { - status = Status::Corruption("DB have corrupted keys"); - } + + if (status.ok() && memtable_iter->Valid()) { + status = ParseInternalKey(memtable_iter->key(), &seek_result, + allow_data_in_errors); } + if (status.ok()) { if (memtable_iter->Valid() && ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { @@ -1083,14 +1159,16 @@ const int ColumnFamilyData::kCompactToBaseLevel = -2; Compaction* ColumnFamilyData::CompactRange( - const MutableCFOptions& mutable_cf_options, int input_level, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* conflict, uint64_t max_file_num_to_ignore) { auto* result = compaction_picker_->CompactRange( - GetName(), 
mutable_cf_options, current_->storage_info(), input_level, - output_level, compact_range_options, begin, end, compaction_end, conflict, + GetName(), mutable_cf_options, mutable_db_options, + current_->storage_info(), input_level, output_level, + compact_range_options, begin, end, compaction_end, conflict, max_file_num_to_ignore); if (result != nullptr) { result->SetInputVersion(current_); @@ -1133,11 +1211,11 @@ SuperVersion* sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv && sv->Unref()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS); db->mutex()->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. 
@@ -1181,14 +1259,13 @@ void ColumnFamilyData::InstallSuperVersion( SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); - return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_); + return InstallSuperVersion(sv_context, mutable_cf_options_); } void ColumnFamilyData::InstallSuperVersion( - SuperVersionContext* sv_context, InstrumentedMutex* db_mutex, + SuperVersionContext* sv_context, const MutableCFOptions& mutable_cf_options) { SuperVersion* new_superversion = sv_context->new_superversion.release(); - new_superversion->db_mutex = db_mutex; new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(this, mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; @@ -1260,7 +1337,8 @@ } if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) { - if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + if (!cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { return Status::NotSupported( "TTL is only supported in Block-Based Table format. "); } @@ -1268,30 +1346,53 @@ if (cf_options.periodic_compaction_seconds > 0 && cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) { - if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + if (!cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { return Status::NotSupported( "Periodic Compaction is only supported in " "Block-Based Table format. 
"); } } + + if (cf_options.enable_blob_garbage_collection) { + if (cf_options.blob_garbage_collection_age_cutoff < 0.0 || + cf_options.blob_garbage_collection_age_cutoff > 1.0) { + return Status::InvalidArgument( + "The age cutoff for blob garbage collection should be in the range " + "[0.0, 1.0]."); + } + if (cf_options.blob_garbage_collection_force_threshold < 0.0 || + cf_options.blob_garbage_collection_force_threshold > 1.0) { + return Status::InvalidArgument( + "The garbage ratio threshold for forcing blob garbage collection " + "should be in the range [0.0, 1.0]."); + } + } + + if (cf_options.compaction_style == kCompactionStyleFIFO && + db_options.max_open_files != -1 && cf_options.ttl > 0) { + return Status::NotSupported( + "FIFO compaction only supported with max_open_files = -1."); + } + return s; } #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const DBOptions& db_options, + const DBOptions& db_opts, const std::unordered_map& options_map) { - MutableCFOptions new_mutable_cf_options; - Status s = - GetMutableOptionsFromStrings(mutable_cf_options_, options_map, - ioptions_.info_log, &new_mutable_cf_options); + ColumnFamilyOptions cf_opts = + BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); + ConfigOptions config_opts; + config_opts.mutable_options_only = true; + Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map, + &cf_opts); if (s.ok()) { - ColumnFamilyOptions cf_options = - BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); - s = ValidateOptions(db_options, cf_options); + s = ValidateOptions(db_opts, cf_opts); } if (s.ok()) { - mutable_cf_options_ = new_mutable_cf_options; + mutable_cf_options_ = MutableCFOptions(cf_opts); mutable_cf_options_.RefreshDerivedOptions(ioptions_); } return s; @@ -1321,7 +1422,7 @@ } Status ColumnFamilyData::AddDirectories( - std::map>* created_dirs) { + std::map>* created_dirs) { Status s; assert(created_dirs != nullptr); assert(data_dirs_.empty()); @@ 
-1329,8 +1430,9 @@ auto existing_dir = created_dirs->find(p.path); if (existing_dir == created_dirs->end()) { - std::unique_ptr path_directory; - s = DBImpl::CreateAndNewDirectory(ioptions_.env, p.path, &path_directory); + std::unique_ptr path_directory; + s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path, + &path_directory); if (!s.ok()) { return s; } @@ -1345,7 +1447,7 @@ return s; } -Directory* ColumnFamilyData::GetDataDir(size_t path_id) const { +FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const { if (data_dirs_.empty()) { return nullptr; } @@ -1358,21 +1460,26 @@ const ImmutableDBOptions* db_options, const FileOptions& file_options, Cache* table_cache, - WriteBufferManager* write_buffer_manager, - WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer) + WriteBufferManager* _write_buffer_manager, + WriteController* _write_controller, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : max_column_family_(0), + file_options_(file_options), dummy_cfd_(new ColumnFamilyData( - 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, - file_options, nullptr, block_cache_tracer)), + ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr, + nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr, + block_cache_tracer, io_tracer, db_session_id)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), - file_options_(file_options), table_cache_(table_cache), - write_buffer_manager_(write_buffer_manager), - write_controller_(write_controller), - block_cache_tracer_(block_cache_tracer) { + write_buffer_manager_(_write_buffer_manager), + write_controller_(_write_controller), + block_cache_tracer_(block_cache_tracer), + io_tracer_(io_tracer), + db_session_id_(db_session_id) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1438,7 +1545,8 @@ 
assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, file_options_, this, block_cache_tracer_); + *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_, + db_session_id_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); @@ -1454,20 +1562,6 @@ return new_cfd; } -// REQUIRES: DB mutex held -void ColumnFamilySet::FreeDeadColumnFamilies() { - autovector to_delete; - for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { - if (cfd->refs_.load(std::memory_order_relaxed) == 0) { - to_delete.push_back(cfd); - } - } - for (auto cfd : to_delete) { - // this is very rare, so it's not a problem that we do it under a mutex - delete cfd; - } -} - // under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { auto cfd_iter = column_family_data_.find(cfd->GetID()); @@ -1506,7 +1600,7 @@ uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { uint32_t column_family_id = 0; if (column_family != nullptr) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); column_family_id = cfh->GetID(); } return column_family_id; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.h 2025-05-19 16:14:27.000000000 +0000 @@ -44,6 +44,7 @@ class InstrumentedMutex; class InstrumentedMutexLock; struct SuperVersionContext; +class BlobFileCache; extern const double kIncSlowdownRatio; // This file contains a list of data structures for managing column family @@ -207,8 +208,6 @@ uint64_t version_number; 
WriteStallCondition write_stall_condition; - InstrumentedMutex* db_mutex; - // should be called outside the mutex SuperVersion() = default; ~SuperVersion(); @@ -252,13 +251,12 @@ extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src); -// Wrap user defined table proproties collector factories `from cf_options` +// Wrap user defined table properties collector factories `from cf_options` // into internal ones in int_tbl_prop_collector_factories. Add a system internal // one too. extern void GetIntTblPropCollectorFactory( const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories); + IntTblPropCollectorFactories* int_tbl_prop_collector_factories); class ColumnFamilySet; @@ -278,17 +276,6 @@ // holding a DB mutex, or as the leader in a write batch group). void Ref() { refs_.fetch_add(1); } - // Unref decreases the reference count, but does not handle deletion - // when the count goes to 0. If this method returns true then the - // caller should delete the instance immediately, or later, by calling - // FreeDeadColumnFamilies(). Unref() can only be called while holding - // a DB mutex, or during single-threaded recovery. - bool Unref() { - int old_refs = refs_.fetch_sub(1); - assert(old_refs > 0); - return old_refs == 1; - } - // UnrefAndTryDelete() decreases the reference count and do free if needed, // return true if this is freed else false, UnrefAndTryDelete() can only // be called while holding a DB mutex, or during single-threaded recovery. @@ -325,7 +312,7 @@ FlushReason GetFlushReason() const { return flush_reason_; } // thread-safe const FileOptions* soptions() const; - const ImmutableCFOptions* ioptions() const { return &ioptions_; } + const ImmutableOptions* ioptions() const { return &ioptions_; } // REQUIRES: DB mutex held // This returns the MutableCFOptions used by current SuperVersion // You should use this API to reference MutableCFOptions most of the time. 
@@ -359,12 +346,18 @@ MemTableList* imm() { return &imm_; } MemTable* mem() { return mem_; } + + bool IsEmpty() { + return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0; + } + Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } void SetCurrent(Version* _current); uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held + uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held void SetMemtable(MemTable* new_mem) { uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; new_mem->SetID(memtable_id); @@ -381,12 +374,14 @@ SequenceNumber earliest_seq); TableCache* table_cache() const { return table_cache_.get(); } + BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); } // See documentation in compaction_picker.h // REQUIRES: DB mutex held bool NeedsCompaction() const; // REQUIRES: DB mutex held Compaction* PickCompaction(const MutableCFOptions& mutable_options, + const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer); // Check if the passed range overlap with any running compactions. @@ -403,7 +398,8 @@ // // Thread-safe Status RangesOverlapWithMemtables(const autovector& ranges, - SuperVersion* super_version, bool* overlap); + SuperVersion* super_version, + bool allow_data_in_errors, bool* overlap); // A flag to tell a manual compaction is to compact all levels together // instead of a specific level. 
@@ -412,6 +408,7 @@ static const int kCompactToBaseLevel; // REQUIRES: DB mutex held Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, @@ -428,8 +425,7 @@ return internal_comparator_; } - const std::vector>* - int_tbl_prop_collector_factories() const { + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const { return &int_tbl_prop_collector_factories_; } @@ -441,7 +437,7 @@ // Get SuperVersion stored in thread local storage. If it does not exist, // get a reference from a current SuperVersion. SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); - // Try to return SuperVersion back to thread local storage. Retrun true on + // Try to return SuperVersion back to thread local storage. Return true on // success and false on failure. It fails when the thread local storage // contains anything other than SuperVersion::kSVInUse flag. bool ReturnThreadLocalSuperVersion(SuperVersion* sv); @@ -455,7 +451,6 @@ // the clients to allocate SuperVersion outside of mutex. 
// IMPORTANT: Only call this from DBImpl::InstallSuperVersion() void InstallSuperVersion(SuperVersionContext* sv_context, - InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options); void InstallSuperVersion(SuperVersionContext* sv_context, InstrumentedMutex* db_mutex); @@ -475,9 +470,11 @@ kPendingCompactionBytes, }; static std::pair - GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, - uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options); + GetWriteStallConditionAndCause( + int num_unflushed_memtables, int num_l0_files, + uint64_t num_compaction_needed_bytes, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options); // Recalculate some small conditions, which are changed only during // compaction, adding new memtable and/or @@ -500,11 +497,29 @@ // created_dirs remembers directory created, so that we don't need to call // the same data creation operation again. Status AddDirectories( - std::map>* created_dirs); + std::map>* created_dirs); + + FSDirectory* GetDataDir(size_t path_id) const; + + // full_history_ts_low_ can only increase. 
+ void SetFullHistoryTsLow(std::string ts_low) { + assert(!ts_low.empty()); + const Comparator* ucmp = user_comparator(); + assert(ucmp); + if (full_history_ts_low_.empty() || + ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) { + full_history_ts_low_ = std::move(ts_low); + } + } - Directory* GetDataDir(size_t path_id) const; + const std::string& GetFullHistoryTsLow() const { + return full_history_ts_low_; + } ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } + WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } + + static const uint32_t kDummyColumnFamilyDataId; private: friend class ColumnFamilySet; @@ -513,9 +528,13 @@ WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, - const FileOptions& file_options, + const FileOptions* file_options, ColumnFamilySet* column_family_set, - BlockCacheTracer* const block_cache_tracer); + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); + + std::vector GetDbPaths() const; uint32_t id_; const std::string name_; @@ -527,16 +546,16 @@ std::atomic dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; - std::vector> - int_tbl_prop_collector_factories_; + IntTblPropCollectorFactories int_tbl_prop_collector_factories_; const ColumnFamilyOptions initial_cf_options_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; const bool is_delete_range_supported_; std::unique_ptr table_cache_; + std::unique_ptr blob_file_cache_; std::unique_ptr internal_stats_; @@ -592,7 +611,11 @@ std::atomic last_memtable_id_; // Directories corresponding to cf_paths. 
- std::vector> data_dirs_; + std::vector> data_dirs_; + + bool db_paths_registered_; + + std::string full_history_ts_low_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -605,10 +628,8 @@ // held and it needs to be executed from the write thread. SetDropped() also // guarantees that it will be called only from single-threaded LogAndApply(), // but this condition is not that important. -// * Iteration -- hold DB mutex, but you can release it in the body of -// iteration. If you release DB mutex in body, reference the column -// family before the mutex and unreference after you unlock, since the column -// family might get dropped when the DB mutex is released +// * Iteration -- hold DB mutex. If you want to release the DB mutex in the +// body of the iteration, wrap in a RefedColumnFamilySet. // * GetDefault() -- thread safe // * GetColumnFamily() -- either inside of DB mutex or from a write thread // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), @@ -620,17 +641,12 @@ public: explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {} + // NOTE: minimum operators for for-loop iteration iterator& operator++() { - // dropped column families might still be included in this iteration - // (we're only removing them when client drops the last reference to the - // column family). 
- // dummy is never dead, so this will never be infinite - do { - current_ = current_->next_; - } while (current_->refs_.load(std::memory_order_relaxed) == 0); + current_ = current_->next_; return *this; } - bool operator!=(const iterator& other) { + bool operator!=(const iterator& other) const { return this->current_ != other.current_; } ColumnFamilyData* operator*() { return current_; } @@ -642,9 +658,11 @@ ColumnFamilySet(const std::string& dbname, const ImmutableDBOptions* db_options, const FileOptions& file_options, Cache* table_cache, - WriteBufferManager* write_buffer_manager, - WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer); + WriteBufferManager* _write_buffer_manager, + WriteController* _write_controller, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -667,12 +685,12 @@ iterator begin() { return iterator(dummy_cfd_->next_); } iterator end() { return iterator(dummy_cfd_); } - // REQUIRES: DB mutex held - // Don't call while iterating over ColumnFamilySet - void FreeDeadColumnFamilies(); - Cache* get_table_cache() { return table_cache_; } + WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } + + WriteController* write_controller() { return write_controller_; } + private: friend class ColumnFamilyData; // helper function that gets called from cfd destructor @@ -690,6 +708,8 @@ std::unordered_map column_family_data_; uint32_t max_column_family_; + const FileOptions file_options_; + ColumnFamilyData* dummy_cfd_; // We don't hold the refcount here, since default column family always exists // We are also not responsible for cleaning up default_cfd_cache_. 
This is @@ -699,11 +719,61 @@ const std::string db_name_; const ImmutableDBOptions* const db_options_; - const FileOptions file_options_; Cache* table_cache_; WriteBufferManager* write_buffer_manager_; WriteController* write_controller_; BlockCacheTracer* const block_cache_tracer_; + std::shared_ptr io_tracer_; + std::string db_session_id_; +}; + +// A wrapper for ColumnFamilySet that supports releasing DB mutex during each +// iteration over the iterator, because the cfd is Refed and Unrefed during +// each iteration to prevent concurrent CF drop from destroying it (until +// Unref). +class RefedColumnFamilySet { + public: + explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {} + + class iterator { + public: + explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) { + MaybeRef(*wrapped_); + } + ~iterator() { MaybeUnref(*wrapped_); } + inline void MaybeRef(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->Ref(); + } + } + inline void MaybeUnref(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->UnrefAndTryDelete(); + } + } + // NOTE: minimum operators for for-loop iteration + inline iterator& operator++() { + ColumnFamilyData* old = *wrapped_; + ++wrapped_; + // Can only unref & potentially free cfd after accessing its next_ + MaybeUnref(old); + MaybeRef(*wrapped_); + return *this; + } + inline bool operator!=(const iterator& other) const { + return this->wrapped_ != other.wrapped_; + } + inline ColumnFamilyData* operator*() { return *wrapped_; } + + private: + ColumnFamilySet::iterator wrapped_; + }; + + iterator begin() { return iterator(wrapped_->begin()); } + iterator end() { return iterator(wrapped_->end()); } + + private: + ColumnFamilySet* wrapped_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family_test.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,45 +8,37 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include -#include #include #include +#include #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" -#include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/utilities/object_registry.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" #include "util/string_util.h" +#include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { static const int kValueSize = 1000; -namespace { -std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} -} // anonymous namespace - // counts how many operations were performed -class EnvCounter : public EnvWrapper { +class EnvCounter : public SpecialEnv { public: explicit EnvCounter(Env* base) - : EnvWrapper(base), num_new_writable_file_(0) {} + : SpecialEnv(base), num_new_writable_file_(0) {} int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; } @@ -64,33 +56,30 @@ public: explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - 
EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); env_ = new EnvCounter(base_env); + env_->skip_fsync_ = true; dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; db_options_.env = env_; - DestroyDB(dbname_, Options(db_options_, column_family_options_)); + EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); } ~ColumnFamilyTestBase() override { std::vector column_families; for (auto h : handles_) { ColumnFamilyDescriptor cfdescriptor; - h->GetDescriptor(&cfdescriptor); + Status s = h->GetDescriptor(&cfdescriptor); +#ifdef ROCKSDB_LITE + EXPECT_TRUE(s.IsNotSupported()); +#else + EXPECT_OK(s); +#endif // ROCKSDB_LITE column_families.push_back(cfdescriptor); } - Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Destroy(column_families); delete env_; @@ -109,11 +98,11 @@ // preserves the implementation that was in place when all of the // magic values in this file were picked. 
*storage = std::string(kValueSize, ' '); - return Slice(*storage); } else { Random r(k); - return test::RandomString(&r, kValueSize, storage); + *storage = r.RandomString(kValueSize); } + return Slice(*storage); } void Build(int base, int n, int flush_every = 0) { @@ -122,7 +111,7 @@ for (int i = 0; i < n; i++) { if (flush_every != 0 && i != 0 && i % flush_every == 0) { - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); dbi->TEST_FlushMemTable(); } @@ -176,7 +165,7 @@ void Close() { for (auto h : handles_) { if (h) { - db_->DestroyColumnFamilyHandle(h); + ASSERT_OK(db_->DestroyColumnFamilyHandle(h)); } } handles_.clear(); @@ -190,8 +179,8 @@ std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); @@ -202,8 +191,8 @@ std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_, @@ -227,7 +216,7 @@ Open({"default"}); } - DBImpl* dbfull() { return reinterpret_cast(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_); } int GetProperty(int cf, std::string property) { std::string value; @@ -287,7 +276,11 @@ // Verify the CF options of the returned CF handle. 
ColumnFamilyDescriptor desc; ASSERT_OK(handles_[cfi]->GetDescriptor(&desc)); - RocksDBOptionsParser::VerifyCFOptions(desc.options, current_cf_opt); + // Need to sanitize the default column family options before comparing + // them. + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + ConfigOptions(), desc.options, + SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt))); #endif // !ROCKSDB_LITE cfi++; } @@ -313,7 +306,7 @@ void DropColumnFamilies(const std::vector& cfs) { for (auto cf : cfs) { ASSERT_OK(db_->DropColumnFamily(handles_[cf])); - db_->DestroyColumnFamilyHandle(handles_[cf]); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[cf])); handles_[cf] = nullptr; names_[cf] = ""; } @@ -327,14 +320,14 @@ // 10 bytes for key, rest is value if (!save) { ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11), - RandomString(&rnd_, key_value_size - 10))); + rnd_.RandomString(key_value_size - 10))); } else { std::string key = test::RandomKey(&rnd_, 11); keys_[cf].insert(key); - ASSERT_OK(Put(cf, key, RandomString(&rnd_, key_value_size - 10))); + ASSERT_OK(Put(cf, key, rnd_.RandomString(key_value_size - 10))); } } - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(/*sync=*/false)); } #ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite @@ -561,14 +554,14 @@ INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest, testing::Values(test::kDefaultFormatVersion)); INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest, - testing::Values(test::kLatestFormatVersion)); + testing::Values(kLatestFormatVersion)); TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) { for (int iter = 0; iter < 3; ++iter) { Open(); CreateColumnFamilies({"one", "two", "three"}); for (size_t i = 0; i < handles_.size(); ++i) { - auto cfh = reinterpret_cast(handles_[i]); + auto cfh = static_cast_with_check(handles_[i]); ASSERT_EQ(i, cfh->GetID()); } if (iter == 1) { @@ -584,7 +577,7 @@ CreateColumnFamilies({"three2"}); // ID 3 that was used for dropped column family "three" should not 
be // reused - auto cfh3 = reinterpret_cast(handles_[3]); + auto cfh3 = static_cast_with_check(handles_[3]); ASSERT_EQ(4U, cfh3->GetID()); Close(); Destroy(); @@ -652,11 +645,11 @@ // after flushing file B is deleted. At the same time, the min log number of // default CF is not written to manifest. Log file A still remains. // Flushed to SST file Y. - Flush(1); - Flush(0); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ASSERT_OK(Put(1, "bar", "v3")); // seqID 4 ASSERT_OK(Put(1, "foo", "v4")); // seqID 5 - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(/*sync=*/false)); // Preserve file system state up to here to simulate a crash condition. fault_env->SetFilesystemActive(false); @@ -707,19 +700,19 @@ // and is set to current. Both CFs' min log number is set to file C so after // flushing file B is deleted. Log file A still remains. // Flushed to SST file Y. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(0, "bar", "v2")); // seqID 4 ASSERT_OK(Put(2, "bar", "v2")); // seqID 5 ASSERT_OK(Put(1, "bar", "v3")); // seqID 6 // Flushing all column families. This forces all CFs' min log to current. This // is written to the manifest file. Log file C is cleared. - Flush(0); - Flush(1); - Flush(2); + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(2)); // Write to log file D ASSERT_OK(Put(1, "bar", "v4")); // seqID 7 ASSERT_OK(Put(1, "bar", "v5")); // seqID 8 - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(/*sync=*/false)); // Preserve file system state up to here to simulate a crash condition. 
fault_env->SetFilesystemActive(false); std::vector names; @@ -753,8 +746,8 @@ std::make_tuple(test::kDefaultFormatVersion, false))); INSTANTIATE_TEST_CASE_P( FormatLatest, FlushEmptyCFTestWithParam, - testing::Values(std::make_tuple(test::kLatestFormatVersion, true), - std::make_tuple(test::kLatestFormatVersion, false))); + testing::Values(std::make_tuple(kLatestFormatVersion, true), + std::make_tuple(kLatestFormatVersion, false))); TEST_P(ColumnFamilyTest, AddDrop) { Open(); @@ -821,7 +814,7 @@ } TEST_P(ColumnFamilyTest, DropTest) { - // first iteration - dont reopen DB before dropping + // first iteration - don't reopen DB before dropping // second iteration - reopen DB before dropping for (int iter = 0; iter < 2; ++iter) { Open({"default"}); @@ -848,13 +841,15 @@ Open(); CreateColumnFamiliesAndReopen({"one", "two"}); WriteBatch batch; - batch.Put(handles_[0], Slice("existing"), Slice("column-family")); - batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); + ASSERT_OK(batch.Put(handles_[0], Slice("existing"), Slice("column-family"))); + ASSERT_OK( + batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); DropColumnFamilies({1}); WriteOptions woptions_ignore_missing_cf; woptions_ignore_missing_cf.ignore_missing_column_families = true; - batch.Put(handles_[0], Slice("still here"), Slice("column-family")); + ASSERT_OK( + batch.Put(handles_[0], Slice("still here"), Slice("column-family"))); ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch)); ASSERT_EQ("column-family", Get(0, "still here")); Status s = db_->Write(WriteOptions(), &batch); @@ -893,11 +888,9 @@ ASSERT_OK(env_->CreateDirIfMissing(dbname_)); ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); std::vector old_files; - env_->GetChildren(backup_logs, &old_files); + ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." 
&& file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } column_family_options_.merge_operator = @@ -924,11 +917,9 @@ // copy the logs to backup std::vector logs; - env_->GetChildren(db_options_.wal_dir, &logs); + ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -953,9 +944,7 @@ if (iter == 0) { // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); } } } @@ -982,13 +971,14 @@ for (int i = 0; i < 3; ++i) { uint64_t max_total_in_memory_state = MaxTotalInMemoryState(); - Flush(i); + ASSERT_OK(Flush(i)); AssertMaxTotalInMemoryState(max_total_in_memory_state); } ASSERT_OK(Put(1, "foofoo", "bar")); ASSERT_OK(Put(0, "foofoo", "bar")); for (auto* it : iterators) { + ASSERT_OK(it->status()); delete it; } } @@ -1086,10 +1076,10 @@ CreateColumnFamilies({"one"}); WriteBatch batch; - batch.Put(handles_[0], Slice("foo"), Slice("bar")); - batch.Put(handles_[1], Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar"))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); - Flush(0); + ASSERT_OK(Flush(0)); fault_env->SetFilesystemActive(false); std::vector names; @@ -1099,7 +1089,7 @@ } } Close(); - fault_env->DropUnsyncedFileData(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); fault_env->ResetState(); Open(names, {}); @@ -2073,6 +2063,7 @@ if (iter->Valid()) { result = iter->key().ToString() + "->" + iter->value().ToString(); } else { + EXPECT_OK(iter->status()); result = "(invalid)"; } 
return result; @@ -2231,7 +2222,7 @@ // files for column family [one], because it's empty AssertCountLiveFiles(4); - Flush(0); + ASSERT_OK(Flush(0)); ASSERT_EQ(0, dbfull()->TEST_total_log_size()); Close(); } @@ -2287,6 +2278,8 @@ // not a multiple of 4k, round up 4k expected_arena_block_size += 4 * 1024; } + expected_arena_block_size = + std::min(size_t{1024 * 1024}, expected_arena_block_size); ASSERT_EQ(expected_arena_block_size, result.arena_block_size); } } @@ -2327,7 +2320,7 @@ ASSERT_OK(db_->DropColumnFamily(handles_[2])); } else { // delete CF two - db_->DestroyColumnFamilyHandle(handles_[2]); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[2])); handles_[2] = nullptr; } // Make sure iterator created can still be used. @@ -2383,7 +2376,6 @@ // 1MB should create ~10 files for each CF int kKeysNum = 10000; PutRandomData(1, kKeysNum, 100); - { std::unique_ptr iterator( db_->NewIterator(ReadOptions(), handles_[1])); @@ -2430,6 +2422,9 @@ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. + sleeping_task.WaitUntilSleeping(); // 1MB should create ~10 files for each CF int kKeysNum = 10000; @@ -2444,6 +2439,9 @@ // now we sleep again. this is just so we're certain that flush job finished env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. 
+ sleeping_task.WaitUntilSleeping(); sleeping_task.WakeUp(); sleeping_task.WaitUntilDone(); @@ -2977,7 +2975,8 @@ SpecialEnv env(Env::Default()); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); @@ -2993,6 +2992,9 @@ test::SleepingBackgroundTask sleeping_task; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. + sleeping_task.WaitUntilSleeping(); WriteOptions wo; wo.sync = true; @@ -3019,14 +3021,16 @@ SpecialEnv env(Env::Default()); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); // Create an iterator holding the current super version. Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3038,6 +3042,9 @@ test::SleepingBackgroundTask sleeping_task; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. 
+ sleeping_task.WaitUntilSleeping(); WriteOptions wo; wo.sync = true; @@ -3066,7 +3073,8 @@ env.SetBackgroundThreads(2, Env::HIGH); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); @@ -3074,8 +3082,9 @@ ReadOptions ro; ro.background_purge_on_iterator_cleanup = true; Iterator* it = db_->NewIterator(ro, handles_[1]); + ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3123,13 +3132,14 @@ env.SetBackgroundThreads(2, Env::HIGH); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(3)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(3)); column_family_options_.level0_file_num_compaction_trigger = 2; Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); // Create an iterator holding the current super version, as well as // the SST file just flushed. 
@@ -3141,7 +3151,7 @@ ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); WaitForCompaction(); @@ -3168,6 +3178,8 @@ // Deleting the iterator will clear its super version, triggering // closing all files it->Seek(""); + ASSERT_OK(it->status()); + ASSERT_EQ(2, env.num_open_wal_file_.load()); ASSERT_EQ(0, env.delete_count_.load()); @@ -3198,8 +3210,8 @@ Open(); CreateColumnFamiliesAndReopen({"one", "two"}); - Put(0, "", ""); - Put(1, "foo", "bar"); + ASSERT_OK(Put(0, "", "")); + ASSERT_OK(Put(1, "foo", "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1", @@ -3209,12 +3221,12 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - ROCKSDB_NAMESPACE::port::Thread thread([&] { db_->SyncWAL(); }); + ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); }); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1"); - Flush(1); - Put(1, "foo", "bar"); - Flush(1); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Flush(1)); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2"); @@ -3236,7 +3248,7 @@ Build(0, 100); // Flush the 0th column family to force a roll of the wal log - Flush(0); + ASSERT_OK(Flush(0)); // Add some more entries Build(100, 100); @@ -3251,7 +3263,7 @@ FileType type; if (!(ParseFileName(filenames[i], &number, &type))) continue; - if (type != kLogFile) continue; + if (type != kWalFile) continue; logfs.push_back(filenames[i]); } @@ -3296,7 +3308,7 @@ Close(); // cleanup - env_->DeleteDir(backup_logs); + ASSERT_OK(env_->DeleteDir(backup_logs)); } TEST_P(ColumnFamilyTest, DefaultCfPathsTest) { @@ -3312,14 +3324,14 @@ // Fill Column family 1. 
PutRandomData(1, 100, 100); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); // Fill column family 2 PutRandomData(2, 100, 100); - Flush(2); + ASSERT_OK(Flush(2)); // SST from Column family 2 should be generated in // db_paths which is dbname_ in this case. @@ -3338,29 +3350,31 @@ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); PutRandomData(1, 100, 100, true /* save */); - Flush(1); + ASSERT_OK(Flush(1)); // Check that files are generated in appropriate paths. ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); PutRandomData(2, 100, 100, true /* save */); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); // Re-open and verify the keys. Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); for (int cf = 1; cf != 3; ++cf) { ReadOptions read_options; read_options.readahead_size = 0; auto it = dbi->NewIterator(read_options, handles_[cf]); for (it->SeekToFirst(); it->Valid(); it->Next()) { + ASSERT_OK(it->status()); Slice key(it->key()); ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString())); } + ASSERT_OK(it->status()); delete it; for (const auto& key : keys_[cf]) { @@ -3369,15 +3383,55 @@ } } -} // namespace ROCKSDB_NAMESPACE +TEST(ColumnFamilyTest, ValidateBlobGCCutoff) { + DBOptions db_options; -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_age_cutoff = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_age_cutoff = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, 
cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_force_threshold = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_force_threshold = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); +} + +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compact_files_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compact_files_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compact_files_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compact_files_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,6 +16,7 @@ #include "rocksdb/env.h" #include 
"test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/cast_util.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -90,9 +91,9 @@ // create couple files // Background compaction starts and waits in BackgroundCallCompaction:0 for (int i = 0; i < kLevel0Trigger * 4; ++i) { - db->Put(WriteOptions(), ToString(i), ""); - db->Put(WriteOptions(), ToString(100 - i), ""); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Put(WriteOptions(), ToString(100 - i), "")); + ASSERT_OK(db->Flush(FlushOptions())); } ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -117,6 +118,78 @@ delete db; } +TEST_F(CompactFilesTest, MultipleLevel) { + Options options; + options.create_if_missing = true; + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = 6; + // Add listener + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + + // create couple files in L0, L3, L4 and L5 + for (int i = 5; i > 2; --i) { + collector->ClearFlushedFiles(); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Flush(FlushOptions())); + auto l0_files = collector->GetFlushedFiles(); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i)); + + std::string prop; + ASSERT_TRUE( + db->GetProperty("rocksdb.num-files-at-level" + ToString(i), &prop)); + ASSERT_EQ("1", prop); + } + ASSERT_OK(db->Put(WriteOptions(), ToString(0), "")); + ASSERT_OK(db->Flush(FlushOptions())); + + ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + // Compact files except the file in L3 + std::vector files; + for (int i = 0; i < 6; ++i) { + if (i == 3) continue; + for (auto& file : meta.levels[i].files) { + files.push_back(file.db_path + "/" + file.name); + } + } + + 
SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"}, + {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread thread([&] { + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0"); + ASSERT_OK(db->Put(WriteOptions(), "bar", "v2")); + ASSERT_OK(db->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db->Flush(FlushOptions())); + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1"); + }); + + // Compaction cannot move up the data to higher level + // here we have input file from level 5, so the output level has to be >= 5 + for (int invalid_output_level = 0; invalid_output_level < 5; + invalid_output_level++) { + s = db->CompactFiles(CompactionOptions(), files, invalid_output_level); + std::cout << s.ToString() << std::endl; + ASSERT_TRUE(s.IsInvalidArgument()); + } + + ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5)); + SyncPoint::GetInstance()->DisableProcessing(); + thread.join(); + + delete db; +} + TEST_F(CompactFilesTest, ObsoleteFiles) { Options options; // to trigger compaction more easily @@ -137,18 +210,18 @@ DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); - assert(db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); // create couple files for (int i = 1000; i < 2000; ++i) { - db->Put(WriteOptions(), ToString(i), - std::string(kWriteBufferSize / 10, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(kWriteBufferSize / 10, 'a' + (i % 26)))); } auto l0_files = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); - reinterpret_cast(db)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForCompact()); // verify all compaction input files are deleted for (auto fname : l0_files) { @@ -181,15 +254,17 @@ // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), 
ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - reinterpret_cast(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); collector->ClearFlushedFiles(); for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - reinterpret_cast(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_2 = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0)); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0)); @@ -212,13 +287,13 @@ DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // Create 5 files. for (int i = 0; i < 5; ++i) { - db->Put(WriteOptions(), "key" + ToString(i), "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key" + ToString(i), "value")); + ASSERT_OK(db->Flush(FlushOptions())); } auto l0_files = collector->GetFlushedFiles(); @@ -236,8 +311,8 @@ // In the meantime flush another file. TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0"); - db->Put(WriteOptions(), "key5", "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key5", "value")); + ASSERT_OK(db->Flush(FlushOptions())); TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1"); compaction_thread.join(); @@ -248,7 +323,7 @@ // Make sure we can reopen the DB. 
s = DB::Open(options, db_name_, &db); - ASSERT_TRUE(s.ok()); + ASSERT_OK(s); assert(db); delete db; } @@ -292,8 +367,8 @@ cf->SetDB(db); // Write one L0 file - db->Put(WriteOptions(), "K1", "V1"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "K1", "V1")); + ASSERT_OK(db->Flush(FlushOptions())); // Compact all L0 files using CompactFiles ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -336,8 +411,8 @@ DB* db = nullptr; ASSERT_OK(DB::Open(options, db_name_, &db)); - db->Put(WriteOptions(), "key", "val"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db->Flush(FlushOptions())); auto l0_files = collector->GetFlushedFiles(); ASSERT_EQ(1, l0_files.size()); @@ -376,14 +451,15 @@ DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - reinterpret_cast(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); CompactionOptions co; co.compression = CompressionType::kLZ4Compression; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,160 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#ifndef ROCKSDB_LITE -#include "db/compacted_db_impl.h" -#include "db/db_impl/db_impl.h" -#include "db/version_set.h" -#include "table/get_context.h" - -namespace ROCKSDB_NAMESPACE { - -extern void MarkKeyMayExist(void* arg); -extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v, bool hit_and_return); - -CompactedDBImpl::CompactedDBImpl( - const DBOptions& options, const std::string& dbname) - : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr), - user_comparator_(nullptr) { -} - -CompactedDBImpl::~CompactedDBImpl() { -} - -size_t CompactedDBImpl::FindFile(const Slice& key) { - size_t right = files_.num_files - 1; - auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { - return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0; - }; - return static_cast(std::lower_bound(files_.files, - files_.files + right, key, cmp) - files_.files); -} - -Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, - const Slice& key, PinnableSlice* value) { - GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, - GetContext::kNotFound, key, value, nullptr, nullptr, - true, nullptr, nullptr); - LookupKey lkey(key, kMaxSequenceNumber); - files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(), - &get_context, nullptr); - if (get_context.State() == GetContext::kFound) { - return Status::OK(); - } - return Status::NotFound(); -} - -std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, - const std::vector&, - const std::vector& keys, std::vector* values) { - autovector reader_list; - for (const auto& key : keys) { - const FdWithKeyRange& f = files_.files[FindFile(key)]; - if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) { - reader_list.push_back(nullptr); - } else { - LookupKey lkey(key, kMaxSequenceNumber); - f.fd.table_reader->Prepare(lkey.internal_key()); - reader_list.push_back(f.fd.table_reader); - } - } - std::vector 
statuses(keys.size(), Status::NotFound()); - values->resize(keys.size()); - int idx = 0; - for (auto* r : reader_list) { - if (r != nullptr) { - PinnableSlice pinnable_val; - std::string& value = (*values)[idx]; - GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, - GetContext::kNotFound, keys[idx], &pinnable_val, - nullptr, nullptr, true, nullptr, nullptr); - LookupKey lkey(keys[idx], kMaxSequenceNumber); - r->Get(options, lkey.internal_key(), &get_context, nullptr); - value.assign(pinnable_val.data(), pinnable_val.size()); - if (get_context.State() == GetContext::kFound) { - statuses[idx] = Status::OK(); - } - } - ++idx; - } - return statuses; -} - -Status CompactedDBImpl::Init(const Options& options) { - SuperVersionContext sv_context(/* create_superversion */ true); - mutex_.Lock(); - ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, - ColumnFamilyOptions(options)); - Status s = Recover({cf}, true /* read only */, false, true); - if (s.ok()) { - cfd_ = reinterpret_cast( - DefaultColumnFamily())->cfd(); - cfd_->InstallSuperVersion(&sv_context, &mutex_); - } - mutex_.Unlock(); - sv_context.Clean(); - if (!s.ok()) { - return s; - } - NewThreadStatusCfInfo(cfd_); - version_ = cfd_->GetSuperVersion()->current; - user_comparator_ = cfd_->user_comparator(); - auto* vstorage = version_->storage_info(); - if (vstorage->num_non_empty_levels() == 0) { - return Status::NotSupported("no file exists"); - } - const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0); - // L0 should not have files - if (l0.num_files > 1) { - return Status::NotSupported("L0 contain more than 1 file"); - } - if (l0.num_files == 1) { - if (vstorage->num_non_empty_levels() > 1) { - return Status::NotSupported("Both L0 and other level contain files"); - } - files_ = l0; - return Status::OK(); - } - - for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) { - if (vstorage->LevelFilesBrief(i).num_files > 0) { - return Status::NotSupported("Other levels also contain files"); 
- } - } - - int level = vstorage->num_non_empty_levels() - 1; - if (vstorage->LevelFilesBrief(level).num_files > 0) { - files_ = vstorage->LevelFilesBrief(level); - return Status::OK(); - } - return Status::NotSupported("no file exists"); -} - -Status CompactedDBImpl::Open(const Options& options, - const std::string& dbname, DB** dbptr) { - *dbptr = nullptr; - - if (options.max_open_files != -1) { - return Status::InvalidArgument("require max_open_files = -1"); - } - if (options.merge_operator.get() != nullptr) { - return Status::InvalidArgument("merge operator is not supported"); - } - DBOptions db_options(options); - std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); - Status s = db->Init(options); - if (s.ok()) { - db->StartTimedTasks(); - ROCKS_LOG_INFO(db->immutable_db_options_.info_log, - "Opened the db as fully compacted mode"); - LogFlush(db->immutable_db_options_.info_log); - *dbptr = db.release(); - } - return s; -} - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,113 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#pragma once -#ifndef ROCKSDB_LITE -#include -#include -#include "db/db_impl/db_impl.h" - -namespace ROCKSDB_NAMESPACE { - -class CompactedDBImpl : public DBImpl { - public: - CompactedDBImpl(const DBOptions& options, const std::string& dbname); - // No copying allowed - CompactedDBImpl(const CompactedDBImpl&) = delete; - void operator=(const CompactedDBImpl&) = delete; - - virtual ~CompactedDBImpl(); - - static Status Open(const Options& options, const std::string& dbname, - DB** dbptr); - - // Implementations of the DB interface - using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; - using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector&, - const std::vector& keys, std::vector* values) - override; - - using DBImpl::Put; - virtual Status Put(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::Merge; - virtual Status Merge(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::Delete; - virtual Status Delete(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice* /*begin*/, - const Slice* /*end*/) override { - return 
Status::NotSupported("Not supported in compacted db mode."); - } - - virtual Status DisableFileDeletions() override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status EnableFileDeletions(bool /*force*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status GetLiveFiles(std::vector& ret, - uint64_t* manifest_file_size, - bool /*flush_memtable*/) override { - return DBImpl::GetLiveFiles(ret, manifest_file_size, - false /* flush_memtable */); - } - using DBImpl::Flush; - virtual Status Flush(const FlushOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DB::IngestExternalFile; - virtual Status IngestExternalFile( - ColumnFamilyHandle* /*column_family*/, - const std::vector& /*external_files*/, - const IngestExternalFileOptions& /*ingestion_options*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DB::CreateColumnFamilyWithImport; - virtual Status CreateColumnFamilyWithImport( - const ColumnFamilyOptions& /*options*/, - const std::string& /*column_family_name*/, - const ImportColumnFamilyOptions& /*import_options*/, - const ExportImportFilesMetaData& /*metadata*/, - ColumnFamilyHandle** /*handle*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - - private: - friend class DB; - inline size_t FindFile(const Slice& key); - Status Init(const Options& options); - - ColumnFamilyData* cfd_; - Version* version_; - const Comparator* user_comparator_; - LevelFilesBrief files_; -}; -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h 1970-01-01 00:00:00.000000000 
+0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,275 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that wraps another one and ensures that any keys +// returned are strictly within a range [start, end). If the underlying +// iterator has already performed the bounds checking, it relies on that result; +// otherwise, it performs the necessary key comparisons itself. Both bounds +// are optional. +class ClippingIterator : public InternalIterator { + public: + ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end, + const Comparator* cmp) + : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) { + assert(iter_); + assert(cmp_); + assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0); + + UpdateAndEnforceBounds(); + } + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + if (start_) { + iter_->Seek(*start_); + } else { + iter_->SeekToFirst(); + } + + UpdateAndEnforceUpperBound(); + } + + void SeekToLast() override { + if (end_) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + } else { + iter_->SeekToLast(); + } + + UpdateAndEnforceLowerBound(); + } + + void Seek(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + iter_->Seek(*start_); + UpdateAndEnforceUpperBound(); + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + valid_ = false; + return; + } + + iter_->Seek(target); + 
UpdateAndEnforceUpperBound(); + } + + void SeekForPrev(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + valid_ = false; + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + + UpdateAndEnforceLowerBound(); + return; + } + + iter_->SeekForPrev(target); + UpdateAndEnforceLowerBound(); + } + + void Next() override { + assert(valid_); + iter_->Next(); + UpdateAndEnforceUpperBound(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(valid_); + assert(result); + + IterateResult res; + valid_ = iter_->NextAndGetResult(&res); + + if (!valid_) { + return false; + } + + if (end_) { + EnforceUpperBoundImpl(res.bound_check_result); + + if (!valid_) { + return false; + } + } + + res.bound_check_result = IterBoundCheck::kInbound; + *result = res; + + return true; + } + + void Prev() override { + assert(valid_); + iter_->Prev(); + UpdateAndEnforceLowerBound(); + } + + Slice key() const override { + assert(valid_); + return iter_->key(); + } + + Slice user_key() const override { + assert(valid_); + return iter_->user_key(); + } + + Slice value() const override { + assert(valid_); + return iter_->value(); + } + + Status status() const override { return iter_->status(); } + + bool PrepareValue() override { + assert(valid_); + + if (iter_->PrepareValue()) { + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } + + bool MayBeOutOfLowerBound() override { + assert(valid_); + return false; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(valid_); + return IterBoundCheck::kInbound; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(valid_); + 
return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(valid_); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateValid() { + assert(!iter_->Valid() || iter_->status().ok()); + + valid_ = iter_->Valid(); + } + + void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) { + if (bound_check_result == IterBoundCheck::kInbound) { + return; + } + + if (bound_check_result == IterBoundCheck::kOutOfBound) { + valid_ = false; + return; + } + + assert(bound_check_result == IterBoundCheck::kUnknown); + + if (cmp_->Compare(key(), *end_) >= 0) { + valid_ = false; + } + } + + void EnforceUpperBound() { + if (!valid_) { + return; + } + + if (!end_) { + return; + } + + EnforceUpperBoundImpl(iter_->UpperBoundCheckResult()); + } + + void EnforceLowerBound() { + if (!valid_) { + return; + } + + if (!start_) { + return; + } + + if (!iter_->MayBeOutOfLowerBound()) { + return; + } + + if (cmp_->Compare(key(), *start_) < 0) { + valid_ = false; + } + } + + void AssertBounds() { + assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0); + assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0); + } + + void UpdateAndEnforceBounds() { + UpdateValid(); + EnforceUpperBound(); + EnforceLowerBound(); + AssertBounds(); + } + + void UpdateAndEnforceUpperBound() { + UpdateValid(); + EnforceUpperBound(); + AssertBounds(); + } + + void UpdateAndEnforceLowerBound() { + UpdateValid(); + EnforceLowerBound(); + AssertBounds(); + } + + InternalIterator* iter_; + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; + bool valid_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,258 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/compaction/clipping_iterator.h" + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// A vector iterator which does its own bounds checking. This is for testing the +// optimizations in the clipping iterator where we bypass the bounds checking if +// the input iterator has already performed it. +class BoundsCheckingVectorIterator : public VectorIterator { + public: + BoundsCheckingVectorIterator(const std::vector& keys, + const std::vector& values, + const Slice* start, const Slice* end, + const Comparator* cmp) + : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) { + assert(cmp_); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + assert(result); + + Next(); + + if (!Valid()) { + return false; + } + + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = true; + + return true; + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + + if (!start_) { + return false; + } + + return cmp_->Compare(key(), *start_) < 0; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + + if (!end_) { + return IterBoundCheck::kInbound; + } + + return cmp_->Compare(key(), *end_) >= 0 ? 
IterBoundCheck::kOutOfBound + : IterBoundCheck::kInbound; + } + + private: + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; +}; + +class ClippingIteratorTest + : public ::testing::Test, + public ::testing::WithParamInterface> {}; + +TEST_P(ClippingIteratorTest, Clip) { + const std::vector keys{"key0", "key1", "key2", "key3", "key4", + "key5", "key6", "key7", "key8", "key9"}; + const std::vector values{ + "unused0", "value1", "value2", "value3", "unused4", + "unused5", "unused6", "unused7", "unused8", "unused9"}; + + assert(keys.size() == values.size()); + + // Note: the input always contains key1, key2, and key3; however, the clipping + // window is based on the test parameters: its left edge is a value in the + // range [0, 4], and its size is a value in the range [0, 5] + const std::vector input_keys{keys[1], keys[2], keys[3]}; + const std::vector input_values{values[1], values[2], values[3]}; + + const bool use_bounds_checking_vec_it = std::get<0>(GetParam()); + + const size_t clip_start_idx = std::get<1>(GetParam()); + const size_t clip_window_size = std::get<2>(GetParam()); + const size_t clip_end_idx = clip_start_idx + clip_window_size; + + const Slice start(keys[clip_start_idx]); + const Slice end(keys[clip_end_idx]); + + std::unique_ptr input( + use_bounds_checking_vec_it + ? new BoundsCheckingVectorIterator(input_keys, input_values, &start, + &end, BytewiseComparator()) + : new VectorIterator(input_keys, input_values, BytewiseComparator())); + + ClippingIterator clip(input.get(), &start, &end, BytewiseComparator()); + + // The range the clipping iterator should return values from. 
This is + // essentially the intersection of the input range [1, 4) and the clipping + // window [clip_start_idx, clip_end_idx) + const size_t data_start_idx = + std::max(clip_start_idx, static_cast(1)); + const size_t data_end_idx = std::min(clip_end_idx, static_cast(4)); + + // Range is empty; all Seeks should fail + if (data_start_idx >= data_end_idx) { + clip.SeekToFirst(); + ASSERT_FALSE(clip.Valid()); + + clip.SeekToLast(); + ASSERT_FALSE(clip.Valid()); + + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + ASSERT_FALSE(clip.Valid()); + + clip.SeekForPrev(keys[i]); + ASSERT_FALSE(clip.Valid()); + } + + return; + } + + // Range is non-empty; call SeekToFirst and iterate forward + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + clip.Next(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Next(); + ASSERT_FALSE(clip.Valid()); + + // Do it again using NextAndGetResult + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + IterateResult result; + ASSERT_TRUE(clip.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[i]); + ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + 
ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + IterateResult result; + ASSERT_FALSE(clip.NextAndGetResult(&result)); + ASSERT_FALSE(clip.Valid()); + + // Call SeekToLast and iterate backward + clip.SeekToLast(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) { + clip.Prev(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Prev(); + ASSERT_FALSE(clip.Valid()); + + // Call Seek/SeekForPrev for all keys; Seek should return the smallest key + // which is >= the target; SeekForPrev should return the largest key which is + // <= the target + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + + if (i < data_start_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_FALSE(clip.Valid()); + } + + clip.SeekForPrev(keys[i]); + + if (i < data_start_idx) { + ASSERT_FALSE(clip.Valid()); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), 
IterBoundCheck::kInbound); + } else { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + } +} + +INSTANTIATE_TEST_CASE_P( + ClippingIteratorTest, ClippingIteratorTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Range(static_cast(0), static_cast(5)), + ::testing::Range(static_cast(0), static_cast(6)))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,12 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/compaction/compaction.h" + #include #include #include "db/column_family.h" -#include "db/compaction/compaction.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/sst_partitioner.h" #include "test_util/sync_point.h" #include "util/string_util.h" @@ -23,7 +25,7 @@ int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, const InternalKey& b) { - auto c = user_cmp->Compare(a.user_key(), b.user_key()); + auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key()); if (c != 0) { return c; } @@ -202,26 +204,24 @@ return num_files_in_compaction == total_num_files; } -Compaction::Compaction(VersionStorageInfo* vstorage, - const ImmutableCFOptions& _immutable_cf_options, - const MutableCFOptions& _mutable_cf_options, - std::vector _inputs, - int _output_level, uint64_t _target_file_size, - uint64_t _max_compaction_bytes, uint32_t _output_path_id, - CompressionType _compression, - CompressionOptions _compression_opts, - uint32_t _max_subcompactions, - std::vector _grandparents, - bool _manual_compaction, double _score, - bool _deletion_compaction, - CompactionReason _compaction_reason) +Compaction::Compaction( + VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options, + const MutableCFOptions& _mutable_cf_options, + const MutableDBOptions& _mutable_db_options, + std::vector _inputs, int _output_level, + uint64_t _target_file_size, uint64_t _max_compaction_bytes, + uint32_t _output_path_id, CompressionType _compression, + CompressionOptions _compression_opts, Temperature _output_temperature, + uint32_t _max_subcompactions, std::vector _grandparents, + bool _manual_compaction, double _score, bool _deletion_compaction, + CompactionReason _compaction_reason) : input_vstorage_(vstorage), start_level_(_inputs[0].level), output_level_(_output_level), max_output_file_size_(_target_file_size), max_compaction_bytes_(_max_compaction_bytes), max_subcompactions_(_max_subcompactions), - 
immutable_cf_options_(_immutable_cf_options), + immutable_options_(_immutable_options), mutable_cf_options_(_mutable_cf_options), input_version_(nullptr), number_levels_(vstorage->num_levels()), @@ -229,6 +229,7 @@ output_path_id_(_output_path_id), output_compression_(_compression), output_compression_opts_(_compression_opts), + output_temperature_(_output_temperature), deletion_compaction_(_deletion_compaction), inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), @@ -237,19 +238,14 @@ is_full_compaction_(IsFullCompaction(vstorage, inputs_)), is_manual_compaction_(_manual_compaction), is_trivial_move_(false), - compaction_reason_(_compaction_reason) { + compaction_reason_(_compaction_reason), + notify_on_compaction_completion_(false) { MarkFilesBeingCompacted(true); if (is_manual_compaction_) { compaction_reason_ = CompactionReason::kManualCompaction; } if (max_subcompactions_ == 0) { - max_subcompactions_ = immutable_cf_options_.max_subcompactions; - } - if (!bottommost_level_) { - // Currently we only enable dictionary compression during compaction to the - // bottommost level. 
- output_compression_opts_.max_dict_bytes = 0; - output_compression_opts_.zstd_max_train_bytes = 0; + max_subcompactions_ = _mutable_db_options.max_subcompactions; } #ifndef NDEBUG @@ -281,7 +277,7 @@ bool Compaction::InputCompressionMatchesOutput() const { int base_level = input_vstorage_->base_level(); - bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_, + bool matches = (GetCompressionType(immutable_options_, input_vstorage_, mutable_cf_options_, start_level_, base_level) == output_compression_); if (matches) { @@ -306,13 +302,19 @@ } if (is_manual_compaction_ && - (immutable_cf_options_.compaction_filter != nullptr || - immutable_cf_options_.compaction_filter_factory != nullptr)) { + (immutable_options_.compaction_filter != nullptr || + immutable_options_.compaction_filter_factory != nullptr)) { // This is a manual compaction and we have a compaction filter that should // be executed, we cannot do a trivial move return false; } + if (start_level_ == output_level_) { + // It doesn't make sense if compaction picker picks files just to trivial + // move to the same level. 
+ return false; + } + // Used in universal compaction, where trivial move can be done if the // input files are non overlapping if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && @@ -328,6 +330,8 @@ // assert inputs_.size() == 1 + std::unique_ptr partitioner = CreateSstPartitioner(); + for (const auto& file : inputs_.front().files) { std::vector file_grand_parents; if (output_level_ + 1 >= number_levels_) { @@ -340,6 +344,13 @@ if (compaction_size > max_compaction_bytes_) { return false; } + + if (partitioner.get() != nullptr) { + if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), + file->largest.user_key())) { + return false; + } + } } return true; @@ -371,7 +382,13 @@ auto* f = files[level_ptrs->at(lvl)]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // In the presence of user-defined timestamp, we may need to handle + // the case in which f->smallest.user_key() (including ts) has the + // same user key, but the ts part is smaller. If so, + // Compare(user_key, f->smallest.user_key()) returns -1. + // That's why we need CompareWithoutTimestamp(). + if (user_cmp->CompareWithoutTimestamp(user_key, + f->smallest.user_key()) >= 0) { // Key falls in this file's range, so it may // exist beyond output level return false; @@ -500,14 +517,14 @@ } if (max_output_file_size_ != port::kMaxUint64 && - (immutable_cf_options_.compaction_style == kCompactionStyleLevel || + (immutable_options_.compaction_style == kCompactionStyleLevel || output_level() > 0)) { preallocation_size = std::min(max_output_file_size_, preallocation_size); } // Over-estimate slightly so we don't end up just barely crossing // the threshold - // No point to prellocate more than 1GB. + // No point to preallocate more than 1GB. 
return std::min(uint64_t{1073741824}, preallocation_size + (preallocation_size / 10)); } @@ -517,14 +534,35 @@ return nullptr; } + if (!cfd_->ioptions() + ->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kCompaction)) { + return nullptr; + } + CompactionFilter::Context context; context.is_full_compaction = is_full_compaction_; context.is_manual_compaction = is_manual_compaction_; context.column_family_id = cfd_->GetID(); + context.reason = TableFileCreationReason::kCompaction; return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); } +std::unique_ptr Compaction::CreateSstPartitioner() const { + if (!immutable_options_.sst_partitioner_factory) { + return nullptr; + } + + SstPartitioner::Context context; + context.is_full_compaction = is_full_compaction_; + context.is_manual_compaction = is_manual_compaction_; + context.output_level = output_level_; + context.smallest_user_key = smallest_user_key_; + context.largest_user_key = largest_user_key_; + return immutable_options_.sst_partitioner_factory->CreatePartitioner(context); +} + bool Compaction::IsOutputLevelEmpty() const { return inputs_.back().level != output_level_ || inputs_.back().empty(); } @@ -533,6 +571,14 @@ if (max_subcompactions_ <= 1 || cfd_ == nullptr) { return false; } + + // Note: the subcompaction boundary picking logic does not currently guarantee + // that all user keys that differ only by timestamp get processed by the same + // subcompaction. 
+ if (cfd_->user_comparator()->timestamp_size() > 0) { + return false; + } + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && !IsOutputLevelEmpty(); @@ -543,10 +589,42 @@ } } -uint64_t Compaction::MinInputFileOldestAncesterTime() const { +bool Compaction::DoesInputReferenceBlobFiles() const { + assert(input_version_); + + const VersionStorageInfo* storage_info = input_version_->storage_info(); + assert(storage_info); + + if (storage_info->GetBlobFiles().empty()) { + return false; + } + + for (size_t i = 0; i < inputs_.size(); ++i) { + for (const FileMetaData* meta : inputs_[i].files) { + assert(meta); + + if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) { + return true; + } + } + } + + return false; +} + +uint64_t Compaction::MinInputFileOldestAncesterTime( + const InternalKey* start, const InternalKey* end) const { uint64_t min_oldest_ancester_time = port::kMaxUint64; + const InternalKeyComparator& icmp = + column_family_data()->internal_comparator(); for (const auto& level_files : inputs_) { for (const auto& file : level_files.files) { + if (start != nullptr && icmp.Compare(file->largest, *start) < 0) { + continue; + } + if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) { + continue; + } uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); if (oldest_ancester_time != 0) { min_oldest_ancester_time = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,7 @@ #include "db/version_set.h" #include "memory/arena.h" #include "options/cf_options.h" +#include "rocksdb/sst_partitioner.h" #include "util/autovector.h" namespace 
ROCKSDB_NAMESPACE { @@ -69,12 +70,14 @@ class Compaction { public: Compaction(VersionStorageInfo* input_version, - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, std::vector inputs, int output_level, uint64_t target_file_size, uint64_t max_compaction_bytes, uint32_t output_path_id, CompressionType compression, - CompressionOptions compression_opts, uint32_t max_subcompactions, + CompressionOptions compression_opts, + Temperature output_temperature, uint32_t max_subcompactions, std::vector grandparents, bool manual_compaction = false, double score = -1, bool deletion_compaction = false, @@ -160,7 +163,7 @@ CompressionType output_compression() const { return output_compression_; } // What compression options for output - CompressionOptions output_compression_opts() const { + const CompressionOptions& output_compression_opts() const { return output_compression_opts_; } @@ -221,10 +224,10 @@ // How many total levels are there? int number_levels() const { return number_levels_; } - // Return the ImmutableCFOptions that should be used throughout the compaction + // Return the ImmutableOptions that should be used throughout the compaction // procedure - const ImmutableCFOptions* immutable_cf_options() const { - return &immutable_cf_options_; + const ImmutableOptions* immutable_options() const { + return &immutable_options_; } // Return the MutableCFOptions that should be used throughout the compaction @@ -255,12 +258,20 @@ // Create a CompactionFilter from compaction_filter_factory std::unique_ptr CreateCompactionFilter() const; + // Create a SstPartitioner from sst_partitioner_factory + std::unique_ptr CreateSstPartitioner() const; + // Is the input level corresponding to output_level_ empty? bool IsOutputLevelEmpty() const; // Should this compaction be broken up into smaller ones run in parallel? 
bool ShouldFormSubcompactions() const; + // Returns true iff at least one input file references a blob file. + // + // PRE: input version has been set. + bool DoesInputReferenceBlobFiles() const; + // test function to validate the functionality of IsBottommostLevel() // function -- determines if compaction with inputs and storage is bottommost static bool TEST_IsBottommostLevel( @@ -289,9 +300,24 @@ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } + Temperature output_temperature() const { return output_temperature_; } + uint32_t max_subcompactions() const { return max_subcompactions_; } - uint64_t MinInputFileOldestAncesterTime() const; + // start and end are sub compact range. Null if no boundary. + // This is used to filter out some input files' ancester's time range. + uint64_t MinInputFileOldestAncesterTime(const InternalKey* start, + const InternalKey* end) const; + + // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of + // compaction begin and compaction completion callbacks match. + void SetNotifyOnCompactionCompleted() { + notify_on_compaction_completion_ = true; + } + + bool ShouldNotifyOnCompactionCompleted() const { + return notify_on_compaction_completion_; + } private: // mark (or clear) all files that are being compacted @@ -325,7 +351,7 @@ uint64_t max_output_file_size_; uint64_t max_compaction_bytes_; uint32_t max_subcompactions_; - const ImmutableCFOptions immutable_cf_options_; + const ImmutableOptions immutable_options_; const MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit edit_; @@ -336,7 +362,8 @@ const uint32_t output_path_id_; CompressionType output_compression_; CompressionOptions output_compression_opts_; - // If true, then the comaction can be done by simply deleting input files. + Temperature output_temperature_; + // If true, then the compaction can be done by simply deleting input files. const bool deletion_compaction_; // Compaction input files organized by level. 
Constant after construction @@ -376,6 +403,10 @@ // Reason for compaction CompactionReason compaction_reason_; + + // Notify on compaction completion only if listener was notified on compaction + // begin. + bool notify_on_compaction_completion_; }; // Return sum of sizes of all files in `files`. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,12 @@ #pragma once +#include + #include "rocksdb/rocksdb_namespace.h" +namespace ROCKSDB_NAMESPACE { + struct CompactionIterationStats { // Compaction statistics @@ -34,4 +38,12 @@ // Single-Delete diagnostics for exceptional situations uint64_t num_single_del_fallthru = 0; uint64_t num_single_del_mismatch = 0; + + // Blob related statistics + uint64_t num_blobs_read = 0; + uint64_t total_blob_bytes_read = 0; + uint64_t num_blobs_relocated = 0; + uint64_t total_blob_bytes_relocated = 0; }; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,53 +3,48 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include - #include "db/compaction/compaction_iterator.h" + +#include +#include + +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_builder.h" +#include "db/blob/blob_index.h" +#include "db/blob/prefetch_buffer_collection.h" #include "db/snapshot_checker.h" +#include "logging/logging.h" #include "port/likely.h" #include "rocksdb/listener.h" #include "table/internal_iterator.h" #include "test_util/sync_point.h" -#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ - ((seq) <= (snapshot) && \ - (snapshot_checker_ == nullptr || \ - LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ - SnapshotCheckerResult::kInSnapshot))) - -#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \ - ((seq) > (snapshot) || \ - (snapshot_checker_ != nullptr && \ - UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ - SnapshotCheckerResult::kNotInSnapshot))) - -#define IN_EARLIEST_SNAPSHOT(seq) \ - ((seq) <= earliest_snapshot_ && \ - (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))) - namespace ROCKSDB_NAMESPACE { - CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction, - const CompactionFilter* compaction_filter, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + const Compaction* compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const 
std::string* full_history_ts_low) : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, report_detailed_time, expect_valid_internal_key, range_del_agg, + blob_file_builder, allow_data_in_errors, std::unique_ptr( - compaction ? new CompactionProxy(compaction) : nullptr), + compaction ? new RealCompaction(compaction) : nullptr), compaction_filter, shutting_down, preserve_deletes_seqnum, - manual_compaction_paused, info_log) {} + manual_compaction_paused, manual_compaction_canceled, info_log, + full_history_ts_low) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -58,36 +53,54 @@ const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) - : input_(input), + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const std::string* full_history_ts_low) + : input_(input, cmp, + !compaction || compaction->DoesInputReferenceBlobFiles()), cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), env_(env), + clock_(env_->GetSystemClock().get()), report_detailed_time_(report_detailed_time), expect_valid_internal_key_(expect_valid_internal_key), range_del_agg_(range_del_agg), + blob_file_builder_(blob_file_builder), compaction_(std::move(compaction)), compaction_filter_(compaction_filter), shutting_down_(shutting_down), 
manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), + info_log_(info_log), + allow_data_in_errors_(allow_data_in_errors), + timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0), + full_history_ts_low_(full_history_ts_low), current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), + blob_garbage_collection_cutoff_file_number_( + ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())), + blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())), + prefetch_buffers_( + CreatePrefetchBufferCollectionIfNeeded(compaction_.get())), current_key_committed_(false), - info_log_(info_log) { - assert(compaction_filter_ == nullptr || compaction_ != nullptr); + cmp_with_history_ts_low_(0), + level_(compaction_ == nullptr ? 0 : compaction_->level()) { assert(snapshots_ != nullptr); - bottommost_level_ = - compaction_ == nullptr ? false : compaction_->bottommost_level(); + bottommost_level_ = compaction_ == nullptr + ? 
false + : compaction_->bottommost_level() && + !compaction_->allow_ingest_behind(); if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } @@ -108,14 +121,16 @@ for (size_t i = 1; i < snapshots_->size(); ++i) { assert(snapshots_->at(i - 1) < snapshots_->at(i)); } + assert(timestamp_size_ == 0 || !full_history_ts_low_ || + timestamp_size_ == full_history_ts_low_->size()); #endif - input_->SetPinnedItersMgr(&pinned_iters_mgr_); + input_.SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } CompactionIterator::~CompactionIterator() { - // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime - input_->SetPinnedItersMgr(nullptr); + // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime + input_.SetPinnedItersMgr(nullptr); } void CompactionIterator::ResetRecordCounts() { @@ -142,14 +157,13 @@ if (merge_out_iter_.Valid()) { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - bool valid_key __attribute__((__unused__)); - valid_key = ParseInternalKey(key_, &ikey_); + Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. - assert(valid_key); - if (!valid_key) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. %s", + s.getState()); } // Keep current_key_ in sync. @@ -169,7 +183,7 @@ // Only advance the input iterator if there is no merge output and the // iterator is not already at the next record. 
if (!at_next_) { - input_->Next(); + AdvanceInputIter(); } NextFromInput(); } @@ -182,90 +196,191 @@ PrepareOutput(); } -void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, +bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until) { - if (compaction_filter_ != nullptr && - (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. If the return value of the compaction filter is true, - // replace the entry with a deletion marker. - CompactionFilter::Decision filter; - compaction_filter_value_.clear(); - compaction_filter_skip_until_.Clear(); - CompactionFilter::ValueType value_type = - ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : CompactionFilter::ValueType::kBlobIndex; - // Hack: pass internal key to BlobIndexCompactionFilter since it needs - // to get sequence number. - Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; - { - StopWatchNano timer(env_, report_detailed_time_); + if (!compaction_filter_ || + (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) { + return true; + } + bool error = false; + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. + CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. 
+ assert(compaction_filter_); + Slice& filter_key = + (ikey_.type == kTypeValue || + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) + ? ikey_.user_key + : key_; + { + StopWatchNano timer(clock_, report_detailed_time_); + if (kTypeBlobIndex == ikey_.type) { + blob_value_.Reset(); + filter = compaction_filter_->FilterBlobByKey( + level_, filter_key, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); + if (CompactionFilter::Decision::kUndetermined == filter && + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + if (compaction_ == nullptr) { + status_ = + Status::Corruption("Unexpected blob index outside of compaction"); + valid_ = false; + return false; + } + + // For integrated BlobDB impl, CompactionIterator reads blob value. + // For Stacked BlobDB impl, the corresponding CompactionFilter's + // FilterV2 method should read the blob value. + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value_); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher_); + + s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index, + prefetch_buffer, &blob_value_, + &bytes_read); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_type = CompactionFilter::ValueType::kValue; + } + } + if (CompactionFilter::Decision::kUndetermined == filter) { filter = compaction_filter_->FilterV2( - compaction_->level(), filter_key, value_type, value_, - &compaction_filter_value_, compaction_filter_skip_until_.rep()); - iter_stats_.total_filter_time += - env_ != nullptr && report_detailed_time_ ? 
timer.ElapsedNanos() : 0; - } - - if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && - cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= - 0) { - // Can't skip to a key smaller than the current one. - // Keep the key as per FilterV2 documentation. - filter = CompactionFilter::Decision::kKeep; - } - - if (filter == CompactionFilter::Decision::kRemove) { - // convert the current key to a delete; key_ is pointing into - // current_key_ at this point, so updating current_key_ updates key() - ikey_.type = kTypeDeletion; - current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); - // no value associated with delete - value_.clear(); - iter_stats_.num_record_drop_user++; - } else if (filter == CompactionFilter::Decision::kChangeValue) { - value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { - *need_skip = true; - compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, - kValueTypeForSeek); - *skip_until = compaction_filter_skip_until_.Encode(); + level_, filter_key, value_type, + blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); } + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } + + if (CompactionFilter::Decision::kUndetermined == filter) { + // Should not reach here, since FilterV2 should never return kUndetermined. + status_ = + Status::NotSupported("FilterV2() should never return kUndetermined"); + valid_ = false; + return false; + } + + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. 
+ filter = CompactionFilter::Decision::kKeep; } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + if (ikey_.type == kTypeBlobIndex) { + // value transfer from blob file to inlined data + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { + // Only the StackableDB-based BlobDB impl's compaction filter should return + // kChangeBlobIndex. Decision about rewriting blob and changing blob index + // in the integrated BlobDB impl is made in subsequent call to + // PrepareOutput() and its callees. 
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "Only stacked BlobDB's internal compaction filter can return " + "kChangeBlobIndex."); + valid_ = false; + return false; + } + if (ikey_.type == kTypeValue) { + // value transfer from inlined data to blob file + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kIOError) { + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "CompactionFilter for integrated BlobDB should not return kIOError"); + valid_ = false; + return false; + } + status_ = Status::IOError("Failed to access blob during compaction filter"); + error = true; + } + return !error; } void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; - while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + while (!valid_ && input_.Valid() && !IsPausingManualCompaction() && !IsShuttingDown()) { - key_ = input_->key(); - value_ = input_->value(); + key_ = input_.key(); + value_ = input_.value(); iter_stats_.num_input_records++; - if (!ParseInternalKey(key_, &ikey_)) { + Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + if (!pik_status.ok()) { + iter_stats_.num_input_corrupt_records++; + // If `expect_valid_internal_key_` is false, return the corrupted key // and let the caller decide what to do with it. - // TODO(noetzli): We should have a more elegant solution for this. 
if (expect_valid_internal_key_) { - assert(!"Corrupted internal key not expected."); - status_ = Status::Corruption("Corrupted internal key not expected."); - break; + status_ = pik_status; + return; } key_ = current_key_.SetInternalKey(key_); has_current_user_key_ = false; current_user_key_sequence_ = kMaxSequenceNumber; current_user_key_snapshot_ = 0; - iter_stats_.num_input_corrupt_records++; valid_ = true; break; } TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); // Update input statistics - if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || + ikey_.type == kTypeDeletionWithTimestamp) { iter_stats_.num_input_deletion_records++; } iter_stats_.total_input_raw_key_bytes += key_.size(); @@ -278,25 +393,71 @@ // merge_helper_->compaction_filter_skip_until_. Slice skip_until; + bool user_key_equal_without_ts = false; + int cmp_ts = 0; + if (has_current_user_key_) { + user_key_equal_without_ts = + cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_); + // if timestamp_size_ > 0, then curr_ts_ has been initialized by a + // previous key. + cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp( + ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_), + curr_ts_) + : 0; + } + // Check whether the user key changed. After this if statement current_key_ // is a copy of the current input key (maybe converted to a delete by the // compaction filter). ikey_.user_key is pointing to the copy. - if (!has_current_user_key_ || - !cmp_->Equal(ikey_.user_key, current_user_key_)) { + if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) { // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); + + int prev_cmp_with_ts_low = + !full_history_ts_low_ ? 0 + : curr_ts_.empty() + ? 
0 + : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_); + + // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use + // in next iteration to compare with the timestamp of next key. + UpdateTimestampAndCompareWithFullHistoryLow(); + + // If + // (1) !has_current_user_key_, OR + // (2) timestamp is disabled, OR + // (3) all history will be preserved, OR + // (4) user key (excluding timestamp) is different from previous key, OR + // (5) timestamp is NO older than *full_history_ts_low_, OR + // (6) timestamp is the largest one older than full_history_ts_low_, + // then current_user_key_ must be treated as a different user key. + // This means, if a user key (excluding ts) is the same as the previous + // user key, and its ts is older than *full_history_ts_low_, then we + // consider this key for GC, e.g. it may be dropped if certain conditions + // match. + if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ || + !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 || + prev_cmp_with_ts_low >= 0) { + // Initialize for future comparison for rule (A) and etc. + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + has_current_user_key_ = true; + } current_user_key_ = ikey_.user_key; - has_current_user_key_ = true; + has_outputted_key_ = false; - current_user_key_sequence_ = kMaxSequenceNumber; - current_user_key_snapshot_ = 0; + + last_key_seq_zeroed_ = false; + current_key_committed_ = KeyCommitted(ikey_.sequence); // Apply the compaction filter to the first committed version of the user // key. 
- if (current_key_committed_) { - InvokeFilterIfNeeded(&need_skip, &skip_until); + if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; } } else { // Update the current key to reflect the new sequence number/type without @@ -316,8 +477,9 @@ current_key_committed_ = KeyCommitted(ikey_.sequence); // Apply the compaction filter to the first committed version of the // user key. - if (current_key_committed_) { - InvokeFilterIfNeeded(&need_skip, &skip_until); + if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; } } } @@ -331,8 +493,7 @@ // If there are no snapshots, then this kv affect visibility at tip. // Otherwise, search though all existing snapshots to find the earliest // snapshot that is affected by this kv. - SequenceNumber last_sequence __attribute__((__unused__)); - last_sequence = current_user_key_sequence_; + SequenceNumber last_sequence = current_user_key_sequence_; current_user_key_sequence_ = ikey_.sequence; SequenceNumber last_snapshot = current_user_key_snapshot_; SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot @@ -347,20 +508,25 @@ // In the previous iteration we encountered a single delete that we could // not compact out. We will keep this Put, but can drop it's data. // (See Optimization 3, below.) 
- assert(ikey_.type == kTypeValue); - if (ikey_.type != kTypeValue) { + assert(ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex); + if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex) { ROCKS_LOG_FATAL(info_log_, "Unexpected key type %d for compaction output", ikey_.type); } - assert(current_user_key_snapshot_ == last_snapshot); - if (current_user_key_snapshot_ != last_snapshot) { + assert(current_user_key_snapshot_ >= last_snapshot); + if (current_user_key_snapshot_ < last_snapshot) { ROCKS_LOG_FATAL(info_log_, "current_user_key_snapshot_ (%" PRIu64 - ") != last_snapshot (%" PRIu64 ")", + ") < last_snapshot (%" PRIu64 ")", current_user_key_snapshot_, last_snapshot); } + if (ikey_.type == kTypeBlobIndex) { + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_.clear(); valid_ = true; clear_and_output_next_key_ = false; @@ -372,6 +538,25 @@ // 2) We've already returned a record in this snapshot -OR- // there are no earlier earliest_write_conflict_snapshot. // + // A note about 2) above: + // we try to determine whether there is any earlier write conflict + // checking snapshot by calling DefinitelyInSnapshot() with seq and + // earliest_write_conflict_snapshot as arguments. For write-prepared + // and write-unprepared transactions, if earliest_write_conflict_snapshot + // is evicted from WritePreparedTxnDB::commit_cache, then + // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns + // false, even if the seq is actually visible within + // earliest_write_conflict_snapshot. Consequently, CompactionIterator + // may try to zero out its sequence number, thus hitting assertion error + // in debug mode or cause incorrect DBIter return result. + // We observe that earliest_write_conflict_snapshot >= earliest_snapshot, + // and the seq zeroing logic depends on + // DefinitelyInSnapshot(seq, earliest_snapshot). 
Therefore, if we cannot + // determine whether seq is **definitely** in + // earliest_write_conflict_snapshot, then we can additionally check if + // seq is definitely in earliest_snapshot. If the latter holds, then the + // former holds too. + // // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to // allow Transactions to do write-conflict checking (if we compacted away // all keys, then we wouldn't know that a write happened in this @@ -396,33 +581,78 @@ // we can choose how to handle such a combinations of operations. We will // try to compact out as much as we can in these cases. // We will report counts on these anomalous cases. + // + // Note: If timestamp is enabled, then record will be eligible for + // deletion, only if, along with above conditions (Rule 1 and Rule 2) + // full_history_ts_low_ is specified and timestamp for that key is less + // than *full_history_ts_low_. If it's not eligible for deletion, then we + // will output the SingleDelete. For Optimization 3 also, if + // full_history_ts_low_ is specified and timestamp for the key is less + // than *full_history_ts_low_ then only optimization will be applied. // The easiest way to process a SingleDelete during iteration is to peek // ahead at the next key. + const bool is_timestamp_eligible_for_gc = + (timestamp_size_ == 0 || + (full_history_ts_low_ && cmp_with_history_ts_low_ < 0)); + ParsedInternalKey next_ikey; - input_->Next(); + AdvanceInputIter(); // Check whether the next key exists, is not corrupt, and is the same key // as the single delete. - if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { - // Check whether the next key belongs to the same snapshot as the - // SingleDelete. - if (prev_snapshot == 0 || - DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) { - if (next_ikey.type == kTypeSingleDeletion) { - // We encountered two SingleDeletes in a row. 
This could be due to - // unexpected user input. - // Skip the first SingleDelete and let the next iteration decide how - // to handle the second SingleDelete + if (input_.Valid() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { +#ifndef NDEBUG + const Compaction* c = + compaction_ ? compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:1", + const_cast(c)); + if (last_key_seq_zeroed_) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); + AdvanceInputIter(); + } else if (prev_snapshot == 0 || + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) { + // Check whether the next key belongs to the same snapshot as the + // SingleDelete. + + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:2", nullptr); + if (next_ikey.type == kTypeSingleDeletion || + next_ikey.type == kTypeDeletion) { + // We encountered two SingleDeletes for same key in a row. This + // could be due to unexpected user input. If write-(un)prepared + // transaction is used, this could also be due to releasing an old + // snapshot between a Put and its matching SingleDelete. + // Furthermore, if write-(un)prepared transaction is rolled back + // after prepare, we will write a Delete to cancel a prior Put. If + // old snapshot is released between a later Put and its matching + // SingleDelete, we will end up with a Delete followed by + // SingleDelete. + // Skip the first SingleDelete and let the next iteration decide + // how to handle the second SingleDelete or Delete. // First SingleDelete has been skipped since we already called - // input_->Next(). + // input_.Next(). 
++iter_stats_.num_record_drop_obsolete; ++iter_stats_.num_single_del_mismatch; + } else if (!is_timestamp_eligible_for_gc) { + // We cannot drop the SingleDelete as timestamp is enabled, and + // timestamp of this key is greater than or equal to + // *full_history_ts_low_. We will output the SingleDelete. + valid_ = true; } else if (has_outputted_key_ || - DEFINITELY_IN_SNAPSHOT( - ikey_.sequence, earliest_write_conflict_snapshot_)) { + DefinitelyInSnapshot(ikey_.sequence, + earliest_write_conflict_snapshot_) || + (earliest_snapshot_ < earliest_write_conflict_snapshot_ && + DefinitelyInSnapshot(ikey_.sequence, + earliest_snapshot_))) { // Found a matching value, we can drop the single delete and the // value. It is safe to drop both records since we've already // outputted a key in this snapshot, or there is no earlier @@ -439,9 +669,9 @@ ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_obsolete; - // Already called input_->Next() once. Call it a second time to + // Already called input_.Next() once. Call it a second time to // skip past the second key. - input_->Next(); + AdvanceInputIter(); } else { // Found a matching value, but we cannot drop both keys since // there is an earlier snapshot and we need to leave behind a record @@ -455,11 +685,17 @@ // Set up the Put to be outputted in the next iteration. // (Optimization 3). clear_and_output_next_key_ = true; + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:KeepSDForWW", + /*arg=*/nullptr); } } else { // We hit the next snapshot without hitting a put, so the iterator // returns the single delete. valid_ = true; + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:3", + const_cast(c)); } } else { // We are at the end of the input, could not parse the next key, or hit @@ -470,9 +706,11 @@ // iteration. If the next key is corrupt, we return before the // comparison, so the value of has_current_user_key does not matter. 
has_current_user_key_ = false; - if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && + if (compaction_ != nullptr && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, - &level_ptrs_)) { + &level_ptrs_) && + is_timestamp_eligible_for_gc) { // Key doesn't exist outside of this range. // Can compact out this SingleDelete. ++iter_stats_.num_record_drop_obsolete; @@ -480,6 +718,11 @@ if (!bottommost_level_) { ++iter_stats_.num_optimized_del_drop_obsolete; } + } else if (last_key_seq_zeroed_) { + // Skip. + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); } else { // Output SingleDelete valid_ = true; @@ -508,10 +751,13 @@ last_sequence, current_user_key_sequence_); } - ++iter_stats_.num_record_drop_hidden; // (A) - input_->Next(); - } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion && - IN_EARLIEST_SNAPSHOT(ikey_.sequence) && + ++iter_stats_.num_record_drop_hidden; // rule (A) + AdvanceInputIter(); + } else if (compaction_ != nullptr && + (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && ikeyNotNeededForIncrementalSnapshot() && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_)) { @@ -534,30 +780,54 @@ // given that: // (1) The deletion is earlier than earliest_write_conflict_snapshot, and // (2) No value exist earlier than the deletion. + // + // Note also that a deletion marker of type kTypeDeletionWithTimestamp + // will be treated as a different user key unless the timestamp is older + // than *full_history_ts_low_. 
++iter_stats_.num_record_drop_obsolete; if (!bottommost_level_) { ++iter_stats_.num_optimized_del_drop_obsolete; } - input_->Next(); - } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && - ikeyNotNeededForIncrementalSnapshot()) { + AdvanceInputIter(); + } else if ((ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + bottommost_level_ && ikeyNotNeededForIncrementalSnapshot()) { // Handle the case where we have a delete key at the bottom most level // We can skip outputting the key iff there are no subsequent puts for this // key + assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( + ikey_.user_key, &level_ptrs_)); ParsedInternalKey next_ikey; - input_->Next(); - // Skip over all versions of this key that happen to occur in the same snapshot - // range as the delete - while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + AdvanceInputIter(); +#ifndef NDEBUG + const Compaction* c = + compaction_ ? compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:BottommostDelete:1", + const_cast(c)); + // Skip over all versions of this key that happen to occur in the same + // snapshot range as the delete. + // + // Note that a deletion marker of type kTypeDeletionWithTimestamp will be + // considered to have a different user key unless the timestamp is older + // than *full_history_ts_low_. 
+ while (!IsPausingManualCompaction() && !IsShuttingDown() && + input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) && (prev_snapshot == 0 || - DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) { - input_->Next(); + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) { + AdvanceInputIter(); } // If you find you still need to output a row with this key, we need to output the // delete too - if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + if (input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { valid_ = true; at_next_ = true; } @@ -569,12 +839,15 @@ } pinned_iters_mgr_.StartPinning(); + // We know the merge type entry is not hidden, otherwise we would // have hit (A) // We encapsulate the merge related state machine in a different // object to minimize change to the existing flow. - Status s = merge_helper_->MergeUntil(input_, range_del_agg_, - prev_snapshot, bottommost_level_); + Status s = merge_helper_->MergeUntil( + &input_, range_del_agg_, prev_snapshot, bottommost_level_, + allow_data_in_errors_, blob_fetcher_.get(), prefetch_buffers_.get(), + &iter_stats_); merge_out_iter_.SeekToFirst(); if (!s.ok() && !s.IsMergeInProgress()) { @@ -585,14 +858,13 @@ // These will be correctly set below. key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - bool valid_key __attribute__((__unused__)); - valid_key = ParseInternalKey(key_, &ikey_); + pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. 
- assert(valid_key); - if (!valid_key) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + assert(pik_status.ok()); + if (!pik_status.ok()) { + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. %s", + pik_status.getState()); } // Keep current_key_ in sync. current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); @@ -618,14 +890,14 @@ if (should_delete) { ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_range_del; - input_->Next(); + AdvanceInputIter(); } else { valid_ = true; } } if (need_skip) { - input_->Seek(skip_until); + SkipUntil(skip_until); } } @@ -638,25 +910,144 @@ } } -void CompactionIterator::PrepareOutput() { - if (valid_) { - if (compaction_filter_ && ikey_.type == kTypeBlobIndex) { - const auto blob_decision = compaction_filter_->PrepareBlobOutput( - user_key(), value_, &compaction_filter_value_); - - if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { - status_ = Status::Corruption( - "Corrupted blob reference encountered during GC"); +bool CompactionIterator::ExtractLargeValueIfNeededImpl() { + if (!blob_file_builder_) { + return false; + } + + blob_index_.clear(); + const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return false; + } + + if (blob_index_.empty()) { + return false; + } + + value_ = blob_index_; + + return true; +} + +void CompactionIterator::ExtractLargeValueIfNeeded() { + assert(ikey_.type == kTypeValue); + + if (!ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); +} + +void CompactionIterator::GarbageCollectBlobIfNeeded() { + assert(ikey_.type == kTypeBlobIndex); + + if (!compaction_) { + return; + } + + // GC for integrated BlobDB + if (compaction_->enable_blob_garbage_collection()) { + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value_); + + if 
(!s.ok()) { + status_ = s; valid_ = false; - } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) { - status_ = Status::IOError("Could not relocate blob during GC"); + + return; + } + } + + if (blob_index.file_number() >= + blob_garbage_collection_cutoff_file_number_) { + return; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + { + assert(blob_fetcher_); + + const Status s = blob_fetcher_->FetchBlob( + user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read); + + if (!s.ok()) { + status_ = s; valid_ = false; - } else if (blob_decision == - CompactionFilter::BlobDecision::kChangeValue) { - value_ = compaction_filter_value_; + + return; } } + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + ++iter_stats_.num_blobs_relocated; + iter_stats_.total_blob_bytes_relocated += blob_index.size(); + + value_ = blob_value_; + + if (ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + + return; + } + + // GC for stacked BlobDB + if (compaction_filter_ && + compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, &compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = + Status::Corruption("Corrupted blob reference encountered during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + + return; + } + } +} + +void CompactionIterator::PrepareOutput() { + if (valid_) { + if 
(ikey_.type == kTypeValue) { + ExtractLargeValueIfNeeded(); + } else if (ikey_.type == kTypeBlobIndex) { + GarbageCollectBlobIfNeeded(); + } + // Zeroing out the sequence number leads to better compression. // If this is the bottommost level (no files in lower levels) // and the earliest snapshot is larger than this seqno @@ -671,15 +1062,34 @@ if (valid_ && compaction_ != nullptr && !compaction_->allow_ingest_behind() && ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ && - IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) { - assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); - if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + ikey_.type != kTypeMerge) { + assert(ikey_.type != kTypeDeletion); + assert(ikey_.type != kTypeSingleDeletion || + (timestamp_size_ || full_history_ts_low_)); + if (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeSingleDeletion && + (!timestamp_size_ || !full_history_ts_low_))) { ROCKS_LOG_FATAL(info_log_, "Unexpected key type %d for seq-zero optimization", ikey_.type); } ikey_.sequence = 0; - current_key_.UpdateInternalKey(0, ikey_.type); + last_key_seq_zeroed_ = true; + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq", + &ikey_); + if (!timestamp_size_) { + current_key_.UpdateInternalKey(0, ikey_.type); + } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) { + // We can also zero out timestamp for better compression. + // For the same user key (excluding timestamp), the timestamp-based + // history can be collapsed to save some space if the timestamp is + // older than *full_history_ts_low_. 
+ const std::string kTsMin(timestamp_size_, static_cast(0)); + const Slice ts_slice = kTsMin; + ikey_.SetTimestamp(ts_slice); + current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice); + } } } } @@ -736,39 +1146,68 @@ (ikey_.sequence < preserve_deletes_seqnum_); } -bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { - assert(snapshot_checker_ != nullptr); - bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber || - (earliest_snapshot_iter_ != snapshots_->end() && - *earliest_snapshot_iter_ == earliest_snapshot_)); - assert(pre_condition); - if (!pre_condition) { - ROCKS_LOG_FATAL(info_log_, - "Pre-Condition is not hold in IsInEarliestSnapshot"); +uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction) { + if (!compaction) { + return 0; } - auto in_snapshot = - snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); - while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) { - // Avoid the the current earliest_snapshot_ being return as - // earliest visible snapshot for the next value. So if a value's sequence - // is zero-ed out by PrepareOutput(), the next value will be compact out. 
- released_snapshots_.insert(earliest_snapshot_); - earliest_snapshot_iter_++; - if (earliest_snapshot_iter_ == snapshots_->end()) { - earliest_snapshot_ = kMaxSequenceNumber; - } else { - earliest_snapshot_ = *earliest_snapshot_iter_; - } - in_snapshot = - snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); + if (!compaction->enable_blob_garbage_collection()) { + return 0; } - assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased); - if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) { - ROCKS_LOG_FATAL(info_log_, - "Unexpected released snapshot in IsInEarliestSnapshot"); + + const Version* const version = compaction->input_version(); + assert(version); + + const VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + auto it = blob_files.begin(); + std::advance( + it, compaction->blob_garbage_collection_age_cutoff() * blob_files.size()); + + return it != blob_files.end() ? 
it->first + : std::numeric_limits::max(); +} + +std::unique_ptr CompactionIterator::CreateBlobFetcherIfNeeded( + const CompactionProxy* compaction) { + if (!compaction) { + return nullptr; + } + + const Version* const version = compaction->input_version(); + if (!version) { + return nullptr; } - return in_snapshot == SnapshotCheckerResult::kInSnapshot; + + return std::unique_ptr(new BlobFetcher(version, ReadOptions())); +} + +std::unique_ptr +CompactionIterator::CreatePrefetchBufferCollectionIfNeeded( + const CompactionProxy* compaction) { + if (!compaction) { + return nullptr; + } + + if (!compaction->input_version()) { + return nullptr; + } + + if (compaction->allow_mmap_reads()) { + return nullptr; + } + + const uint64_t readahead_size = compaction->blob_compaction_readahead_size(); + if (!readahead_size) { + return nullptr; + } + + return std::unique_ptr( + new PrefetchBufferCollection(readahead_size)); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -21,39 +22,153 @@ namespace ROCKSDB_NAMESPACE { +class BlobFileBuilder; +class BlobFetcher; +class PrefetchBufferCollection; + +// A wrapper of internal iterator whose purpose is to count how +// many entries there are in the iterator. 
+class SequenceIterWrapper : public InternalIterator { + public: + SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp, + bool need_count_entries) + : icmp_(cmp, /*named=*/false), + inner_iter_(iter), + need_count_entries_(need_count_entries) {} + bool Valid() const override { return inner_iter_->Valid(); } + Status status() const override { return inner_iter_->status(); } + void Next() override { + num_itered_++; + inner_iter_->Next(); + } + void Seek(const Slice& target) override { + if (!need_count_entries_) { + inner_iter_->Seek(target); + } else { + // For flush cases, we need to count total number of entries, so we + // do Next() rather than Seek(). + while (inner_iter_->Valid() && + icmp_.Compare(inner_iter_->key(), target) < 0) { + Next(); + } + } + } + Slice key() const override { return inner_iter_->key(); } + Slice value() const override { return inner_iter_->value(); } + + // Unused InternalIterator methods + void SeekToFirst() override { assert(false); } + void Prev() override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + uint64_t num_itered() const { return num_itered_; } + + private: + InternalKeyComparator icmp_; + InternalIterator* inner_iter_; // not owned + uint64_t num_itered_ = 0; + bool need_count_entries_; +}; + class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what // CompactionIterator uses. Tests can override it. 
class CompactionProxy { public: - explicit CompactionProxy(const Compaction* compaction) - : compaction_(compaction) {} - virtual ~CompactionProxy() = default; - virtual int level(size_t /*compaction_input_level*/ = 0) const { - return compaction_->level(); - } + + virtual int level() const = 0; + virtual bool KeyNotExistsBeyondOutputLevel( - const Slice& user_key, std::vector* level_ptrs) const { + const Slice& user_key, std::vector* level_ptrs) const = 0; + + virtual bool bottommost_level() const = 0; + + virtual int number_levels() const = 0; + + virtual Slice GetLargestUserKey() const = 0; + + virtual bool allow_ingest_behind() const = 0; + + virtual bool preserve_deletes() const = 0; + + virtual bool allow_mmap_reads() const = 0; + + virtual bool enable_blob_garbage_collection() const = 0; + + virtual double blob_garbage_collection_age_cutoff() const = 0; + + virtual uint64_t blob_compaction_readahead_size() const = 0; + + virtual const Version* input_version() const = 0; + + virtual bool DoesInputReferenceBlobFiles() const = 0; + + virtual const Compaction* real_compaction() const = 0; + }; + + class RealCompaction : public CompactionProxy { + public: + explicit RealCompaction(const Compaction* compaction) + : compaction_(compaction) { + assert(compaction_); + assert(compaction_->immutable_options()); + assert(compaction_->mutable_cf_options()); + } + + int level() const override { return compaction_->level(); } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector* level_ptrs) const override { return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs); } - virtual bool bottommost_level() const { + + bool bottommost_level() const override { return compaction_->bottommost_level(); } - virtual int number_levels() const { return compaction_->number_levels(); } - virtual Slice GetLargestUserKey() const { + + int number_levels() const override { return compaction_->number_levels(); } + + Slice GetLargestUserKey() const override 
{ return compaction_->GetLargestUserKey(); } - virtual bool allow_ingest_behind() const { - return compaction_->immutable_cf_options()->allow_ingest_behind; + + bool allow_ingest_behind() const override { + return compaction_->immutable_options()->allow_ingest_behind; + } + + bool preserve_deletes() const override { + return compaction_->immutable_options()->preserve_deletes; + } + + bool allow_mmap_reads() const override { + return compaction_->immutable_options()->allow_mmap_reads; + } + + bool enable_blob_garbage_collection() const override { + return compaction_->mutable_cf_options()->enable_blob_garbage_collection; } - virtual bool preserve_deletes() const { - return compaction_->immutable_cf_options()->preserve_deletes; + + double blob_garbage_collection_age_cutoff() const override { + return compaction_->mutable_cf_options() + ->blob_garbage_collection_age_cutoff; } - protected: - CompactionProxy() = default; + uint64_t blob_compaction_readahead_size() const override { + return compaction_->mutable_cf_options()->blob_compaction_readahead_size; + } + + const Version* input_version() const override { + return compaction_->input_version(); + } + + bool DoesInputReferenceBlobFiles() const override { + return compaction_->DoesInputReferenceBlobFiles(); + } + + const Compaction* real_compaction() const override { return compaction_; } private: const Compaction* compaction_; @@ -66,12 +181,15 @@ const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + const std::atomic* manual_compaction_paused = nullptr, + const 
std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); // Constructor with custom CompactionProxy, used for tests. CompactionIterator( @@ -81,12 +199,15 @@ const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); ~CompactionIterator(); @@ -110,18 +231,39 @@ bool Valid() const { return valid_; } const Slice& user_key() const { return current_user_key_; } const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint64_t num_input_entry_scanned() const { return input_.num_itered(); } private: // Processes the input stream to find the next output void NextFromInput(); - // Do last preparations before presenting the output to the callee. At this - // point this only zeroes out the sequence number if possible for better - // compression. + // Do final preparations before presenting the output to the callee. void PrepareOutput(); + // Passes the output value to the blob file builder (if any), and replaces it + // with the corresponding blob reference if it has been actually written to a + // blob file (i.e. if it passed the value size check). Returns true if the + // value got extracted to a blob file, false otherwise. 
+ bool ExtractLargeValueIfNeededImpl(); + + // Extracts large values as described above, and updates the internal key's + // type to kTypeBlobIndex if the value got extracted. Should only be called + // for regular values (kTypeValue). + void ExtractLargeValueIfNeeded(); + + // Relocates valid blobs residing in the oldest blob files if garbage + // collection is enabled. Relocated blobs are written to new blob files or + // inlined in the LSM tree depending on the current settings (i.e. + // enable_blob_files and min_blob_size). Should only be called for blob + // references (kTypeBlobIndex). + // + // Note: the stacked BlobDB implementation's compaction filter based GC + // algorithm is also called from here. + void GarbageCollectBlobIfNeeded(); + // Invoke compaction filter if needed. - void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); + // Return true on success, false on failures (e.g.: kIOError). + bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); // Given a sequence number, return the sequence number of the // earliest snapshot that this sequence number is visible in. @@ -143,9 +285,32 @@ SnapshotCheckerResult::kInSnapshot; } - bool IsInEarliestSnapshot(SequenceNumber sequence); + bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + // Extract user-defined timestamp from user key if possible and compare it + // with *full_history_ts_low_ if applicable. 
+ inline void UpdateTimestampAndCompareWithFullHistoryLow() { + if (!timestamp_size_) { + return; + } + Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_); + curr_ts_.assign(ts.data(), ts.size()); + if (full_history_ts_low_) { + cmp_with_history_ts_low_ = + cmp_->CompareTimestamp(ts, *full_history_ts_low_); + } + } - InternalIterator* input_; + static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction); + static std::unique_ptr CreateBlobFetcherIfNeeded( + const CompactionProxy* compaction); + static std::unique_ptr + CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction); + + SequenceIterWrapper input_; const Comparator* cmp_; MergeHelper* merge_helper_; const std::vector* snapshots_; @@ -159,13 +324,16 @@ const SequenceNumber earliest_write_conflict_snapshot_; const SnapshotChecker* const snapshot_checker_; Env* env_; + SystemClock* clock_; bool report_detailed_time_; bool expect_valid_internal_key_; CompactionRangeDelAggregator* range_del_agg_; + BlobFileBuilder* blob_file_builder_; std::unique_ptr compaction_; const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; - const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; bool valid_ = false; @@ -173,6 +341,20 @@ SequenceNumber earliest_snapshot_; SequenceNumber latest_snapshot_; + std::shared_ptr info_log_; + + bool allow_data_in_errors_; + + // Comes from comparator. + const size_t timestamp_size_; + + // Lower bound timestamp to retain full history in terms of user-defined + // timestamp. If a key's timestamp is older than full_history_ts_low_, then + // the key *may* be eligible for garbage collection (GC). The skipping logic + // is in `NextFromInput()` and `PrepareOutput()`. 
+ // If nullptr, NO GC will be performed and all history will be preserved. + const std::string* const full_history_ts_low_; + // State // // Points to a copy of the current compaction iterator output (current_key_) @@ -191,11 +373,13 @@ // Stores whether ikey_.user_key is valid. If set to false, the user key is // not compared against the current key in the underlying iterator. bool has_current_user_key_ = false; - bool at_next_ = false; // If false, the iterator - // Holds a copy of the current compaction iterator output (or current key in - // the underlying iterator during NextFromInput()). + // If false, the iterator holds a copy of the current compaction iterator + // output (or current key in the underlying iterator during NextFromInput()). + bool at_next_ = false; + IterKey current_key_; Slice current_user_key_; + std::string curr_ts_; SequenceNumber current_user_key_sequence_; SequenceNumber current_user_key_snapshot_; @@ -210,6 +394,14 @@ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading // merge operands and then releasing them after consuming them. PinnedIteratorsManager pinned_iters_mgr_; + + uint64_t blob_garbage_collection_cutoff_file_number_; + + std::unique_ptr blob_fetcher_; + std::unique_ptr prefetch_buffers_; + + std::string blob_index_; + PinnableSlice blob_value_; std::string compaction_filter_value_; InternalKey compaction_filter_skip_until_; // "level_ptrs" holds indices that remember which file of an associated @@ -224,7 +416,19 @@ // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - std::shared_ptr info_log_; + + // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_) + int cmp_with_history_ts_low_; + + const int level_; + + // True if the previous internal key (same user key)'s sequence number has + // just been zeroed out during bottommost compaction. 
+ bool last_key_seq_zeroed_{false}; + + void AdvanceInputIter() { input_.Next(); } + + void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); } bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. @@ -233,8 +437,27 @@ bool IsPausingManualCompaction() { // This is a best-effort facility, so memory_order_relaxed is sufficient. - return manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed); + return (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)); } }; + +inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq, + SequenceNumber snapshot) { + return ((seq) <= (snapshot) && + (snapshot_checker_ == nullptr || + LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kInSnapshot))); +} + +inline bool CompactionIterator::DefinitelyNotInSnapshot( + SequenceNumber seq, SequenceNumber snapshot) { + return ((seq) > (snapshot) || + (snapshot_checker_ != nullptr && + UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kNotInSnapshot))); +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,15 +3,17 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "db/compaction/compaction_iterator.h" #include #include -#include "db/compaction/compaction_iterator.h" +#include "db/dbformat.h" #include "port/port.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" +#include "util/vector_iterator.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -38,7 +40,7 @@ // Compaction filter that gets stuck when it sees a particular key, // then gets unstuck when told to. -// Always returns Decition::kRemove. +// Always returns Decision::kRemove. class StallingFilter : public CompactionFilter { public: Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, @@ -86,7 +88,7 @@ const char* Name() const override { return "AllKeysCompactionFilter"; } }; -class LoggingForwardVectorIterator : public InternalIterator { +class LoggingForwardVectorIterator : public VectorIterator { public: struct Action { enum class Type { @@ -108,22 +110,19 @@ LoggingForwardVectorIterator(const std::vector& keys, const std::vector& values) - : keys_(keys), values_(values), current_(keys.size()) { - assert(keys_.size() == values_.size()); + : VectorIterator(keys, values) { + current_ = keys_.size(); } - bool Valid() const override { return current_ < keys_.size(); } - void SeekToFirst() override { log.emplace_back(Action::Type::SEEK_TO_FIRST); - current_ = 0; + VectorIterator::SeekToFirst(); } void SeekToLast() override { assert(false); } void Seek(const Slice& target) override { log.emplace_back(Action::Type::SEEK, target.ToString()); - current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - - keys_.begin(); + VectorIterator::Seek(target); } void SeekForPrev(const Slice& /*target*/) override { assert(false); } @@ -131,54 +130,66 @@ void Next() override { assert(Valid()); log.emplace_back(Action::Type::NEXT); - current_++; + VectorIterator::Next(); } void Prev() override { assert(false); } Slice key() const override { assert(Valid()); - return 
Slice(keys_[current_]); + return VectorIterator::key(); } Slice value() const override { assert(Valid()); - return Slice(values_[current_]); + return VectorIterator::value(); } - Status status() const override { return Status::OK(); } - std::vector log; - - private: - std::vector keys_; - std::vector values_; - size_t current_; }; class FakeCompaction : public CompactionIterator::CompactionProxy { public: - FakeCompaction() = default; + int level() const override { return 0; } - int level(size_t /*compaction_input_level*/) const override { return 0; } bool KeyNotExistsBeyondOutputLevel( const Slice& /*user_key*/, std::vector* /*level_ptrs*/) const override { return is_bottommost_level || key_not_exists_beyond_output_level; } + bool bottommost_level() const override { return is_bottommost_level; } + int number_levels() const override { return 1; } + Slice GetLargestUserKey() const override { return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; } - bool allow_ingest_behind() const override { return false; } + + bool allow_ingest_behind() const override { return is_allow_ingest_behind; } bool preserve_deletes() const override { return false; } + bool allow_mmap_reads() const override { return false; } + + bool enable_blob_garbage_collection() const override { return false; } + + double blob_garbage_collection_age_cutoff() const override { return 0.0; } + + uint64_t blob_compaction_readahead_size() const override { return 0; } + + const Version* input_version() const override { return nullptr; } + + bool DoesInputReferenceBlobFiles() const override { return false; } + + const Compaction* real_compaction() const override { return nullptr; } + bool key_not_exists_beyond_output_level = false; bool is_bottommost_level = false; + + bool is_allow_ingest_behind = false; }; -// A simplifed snapshot checker which assumes each snapshot has a global +// A simplified snapshot checker which assumes each snapshot has a global // last visible sequence. 
class TestSnapshotChecker : public SnapshotChecker { public: @@ -214,6 +225,9 @@ CompactionIteratorTest() : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + explicit CompactionIteratorTest(const Comparator* ucmp) + : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {} + void InitIterators( const std::vector& ks, const std::vector& vs, const std::vector& range_del_ks, @@ -222,9 +236,11 @@ SequenceNumber last_committed_sequence = kMaxSequenceNumber, MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { std::unique_ptr unfragmented_range_del_iter( - new test::VectorIterator(range_del_ks, range_del_vs)); + new VectorIterator(range_del_ks, range_del_vs, &icmp_)); auto tombstone_list = std::make_shared( std::move(unfragmented_range_del_iter), icmp_); std::unique_ptr range_del_iter( @@ -234,9 +250,12 @@ range_del_agg_->AddTombstones(std::move(range_del_iter)); std::unique_ptr compaction; - if (filter || bottommost_level) { + if (filter || bottommost_level || key_not_exists_beyond_output_level) { compaction_proxy_ = new FakeCompaction(); compaction_proxy_->is_bottommost_level = bottommost_level; + compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind(); + compaction_proxy_->key_not_exists_beyond_output_level = + key_not_exists_beyond_output_level; compaction.reset(compaction_proxy_); } bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); @@ -249,13 +268,23 @@ 0 /*latest_snapshot*/, snapshot_checker_.get(), 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + if (c_iter_) { + // Since iter_ is still used in ~CompactionIterator(), we call + // ~CompactionIterator() first. 
+ c_iter_.reset(); + } iter_.reset(new LoggingForwardVectorIterator(ks, vs)); iter_->SeekToFirst(); c_iter_.reset(new CompactionIterator( iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, earliest_write_conflict_snapshot, snapshot_checker_.get(), Env::Default(), false /* report_detailed_time */, false, - range_del_agg_.get(), std::move(compaction), filter, &shutting_down_)); + range_del_agg_.get(), nullptr /* blob_file_builder */, + true /*allow_data_in_errors*/, std::move(compaction), filter, + &shutting_down_, /*preserve_deletes_seqnum=*/0, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*info_log=*/nullptr, + full_history_ts_low)); } void AddSnapshot(SequenceNumber snapshot, @@ -266,6 +295,8 @@ virtual bool UseSnapshotChecker() const { return false; } + virtual bool AllowIngestBehind() const { return false; } + void RunTest( const std::vector& input_keys, const std::vector& input_values, @@ -275,10 +306,13 @@ MergeOperator* merge_operator = nullptr, CompactionFilter* compaction_filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, last_committed_seq, merge_operator, compaction_filter, - bottommost_level, earliest_write_conflict_snapshot); + bottommost_level, earliest_write_conflict_snapshot, + key_not_exists_beyond_output_level, full_history_ts_low); c_iter_->SeekToFirst(); for (size_t i = 0; i < expected_keys.size(); i++) { std::string info = "i = " + ToString(i); @@ -288,9 +322,15 @@ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info; c_iter_->Next(); } + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } + void ClearSnapshots() { + snapshots_.clear(); + 
snapshot_map_.clear(); + } + const Comparator* cmp_; const InternalKeyComparator icmp_; std::vector snapshots_; @@ -312,6 +352,7 @@ test::KeyStr("a", 3, kTypeValue)}, {"", "val"}, {}, {}, 5); c_iter_->SeekToFirst(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -333,6 +374,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -349,6 +391,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -370,6 +413,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -463,6 +507,7 @@ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString()); ASSERT_EQ("hv91", c_iter_->value().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); // Check that the compaction iterator did the correct sequence of calls on @@ -656,6 +701,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ("bv1bv2", c_iter_->value().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); } @@ -666,7 +712,7 @@ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, {"v1", "v2"}, {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, - {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -675,15 +721,14 @@ // permanently. 
TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { AddSnapshot(1); - RunTest({test::KeyStr("a", 1, kTypeDeletion), - test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 1, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 0, kTypeValue)}, - {"", ""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, - nullptr /*compaction_filter*/, true /*bottommost_level*/); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } // In bottommost level, single deletions earlier than earliest snapshot can be @@ -693,10 +738,22 @@ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), test::KeyStr("b", 2, kTypeSingleDeletion)}, {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } +TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge), + test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)}, + {"a4", "a3", "a2", "b1"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)}, + {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/, + merge_op.get(), nullptr /*compaction_filter*/, + true /*bottomost_level*/); +} + INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest, testing::Values(true, false)); @@ -838,7 +895,7 @@ {"v1", "v2", "v3"}, {test::KeyStr("a", 0, kTypeValue), 
test::KeyStr("b", 2, kTypeValue), test::KeyStr("c", 3, kTypeValue)}, - {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -849,9 +906,7 @@ RunTest( {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), test::KeyStr("c", 3, kTypeDeletion)}, - {"", "", ""}, - {}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -859,15 +914,14 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, NotRemoveDeletionIfValuePresentToEarlierSnapshot) { AddSnapshot(2,1); - RunTest( - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, - nullptr /*merge_operator*/, nullptr /*compaction_filter*/, - true /*bottommost_level*/); + RunTest({test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } TEST_F(CompactionIteratorWithSnapshotCheckerTest, @@ -879,7 +933,7 @@ {"", "", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion), test::KeyStr("c", 3, kTypeSingleDeletion)}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -913,9 +967,24 @@ 2 
/*earliest_write_conflict_snapshot*/); } +// Same as above but with a blob index. In addition to the value getting +// trimmed, the type of the KV is changed to kTypeValue. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking_BlobIndex) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeBlobIndex)}, + {"", "fake_blob_index"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + // Compaction filter should keep uncommitted key as-is, and -// * Convert the latest velue to deletion, and/or -// * if latest value is a merge, apply filter to all suequent merges. +// * Convert the latest value to deletion, and/or +// * if latest value is a merge, apply filter to all subsequent merges. TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { std::unique_ptr compaction_filter( @@ -968,6 +1037,323 @@ compaction_filter.get()); } +// Tests how CompactionIterator work together with AllowIngestBehind. +class CompactionIteratorWithAllowIngestBehindTest + : public CompactionIteratorTest { + public: + bool AllowIngestBehind() const override { return true; } +}; + +// When allow_ingest_behind is set, compaction iterator is not targeting +// the bottommost level since there is no guarantee there won't be further +// data ingested under the compaction output in future. 
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge), + test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)}, + {"a4", "a3", "a2", "b1"}, + {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)}, + {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/, + merge_op.get(), nullptr /*compaction_filter*/, + true /*bottomost_level*/); +} + +TEST_P(CompactionIteratorWithAllowIngestBehindTest, + MergeToPutIfEncounteredPutAtBottom) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge), + test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)}, + {"a4", "a3", "a2", "b1"}, + {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)}, + {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/, + merge_op.get(), nullptr /*compaction_filter*/, + true /*bottomost_level*/); +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance, + CompactionIteratorWithAllowIngestBehindTest, + testing::Values(true, false)); + +class CompactionIteratorTsGcTest : public CompactionIteratorTest { + public: + CompactionIteratorTsGcTest() + : CompactionIteratorTest(test::ComparatorWithU64Ts()) {} +}; + +TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"a3", "", "b2"}; + std::string full_history_ts_low; + // All keys' timestamps are newer than or equal to 102, thus none of them + // will be 
eligible for GC. + PutFixed64(&full_history_ts_low, 102); + const std::vector& expected_keys = input_keys; + const std::vector& expected_values = input_values; + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. 
+ AddSnapshot(3); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[3]}; + const std::vector expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot, the deletion marker should be preserved because the user + // key may appear beyond output level. + const std::vector expected_keys = {input_keys[0], + input_keys[3]}; + const std::vector expected_values = {"", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } + { + // No snapshot, the deletion marker can be dropped because the user key + // does not appear in higher levels. 
+ const std::vector expected_keys = {input_keys[3]}; + const std::vector expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "a0"}; + { + std::string full_history_ts_low; + // Keys whose timestamps larger than or equal to 102 will be preserved. 
+ PutFixed64(&full_history_ts_low, 102); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2]}; + const std::vector expected_values = {"", input_values[1], + input_values[2]}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, DropTombstones) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = {input_keys[0], input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + + { + // Non-bottommost level, but key does not exist beyond output level. 
+ std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_sequence=*/kMaxSequenceNumber, + /*merge_op=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } + { + // Bottommost level + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, RewriteTs) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2], + test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)}; + const std::vector expected_values = {"", "a2", "", "a0"}; + + AddSnapshot(1); + AddSnapshot(2); + + { + // Bottommost level and need to rewrite both ts and seq. 
+ std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)}; + const std::vector input_values = {"", "a3", "b2"}; + std::string full_history_ts_low; + // All keys' timestamps are newer than or equal to 102, thus none of them + // will be eligible for GC. + PutFixed64(&full_history_ts_low, 102); + const std::vector& expected_keys = input_keys; + const std::vector& expected_values = input_values; + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, 
kTypeSingleDeletion), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = {input_keys[0], input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + { + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"", "a2", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. 
+ AddSnapshot(3); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2]}; + const std::vector expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot. + const std::vector expected_keys = {input_keys[2]}; + const std::vector expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance, + CompactionIteratorTsGcTest, + testing::Values(true, false)); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/compaction/compaction_job.h" + #include #include #include @@ -18,8 +20,12 @@ #include #include +#include "db/blob/blob_counting_iterator.h" +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_builder.h" +#include "db/blob/blob_garbage_meter.h" #include "db/builder.h" -#include "db/compaction/compaction_job.h" +#include "db/compaction/clipping_iterator.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -31,6 +37,7 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "db/output_validator.h" #include "db/range_del_aggregator.h" #include "db/version_set.h" #include "file/filename.h" @@ -42,18 +49,23 @@ #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/sst_partitioner.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "test_util/sync_point.h" #include "util/coding.h" +#include "util/hash.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" @@ -95,6 +107,10 @@ return "ExternalSstIngestion"; case CompactionReason::kPeriodicCompaction: return "PeriodicCompaction"; + case CompactionReason::kChangeTemperature: + return "ChangeTemperature"; + case CompactionReason::kForcedBlobGC: + return "ForcedBlobGC"; case CompactionReason::kNumOfReasons: // fall through default: @@ -116,23 +132,37 @@ // The return status of this subcompaction Status status; + // The return IO Status of this subcompaction + IOStatus io_status; + // Files produced by 
this subcompaction struct Output { + Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, + bool _enable_order_check, bool _enable_hash, bool _finished = false, + uint64_t precalculated_hash = 0) + : meta(std::move(_meta)), + validator(_icmp, _enable_order_check, _enable_hash, + precalculated_hash), + finished(_finished) {} FileMetaData meta; + OutputValidator validator; bool finished; std::shared_ptr table_properties; }; // State kept for output being generated std::vector outputs; + std::vector blob_file_additions; + std::unique_ptr blob_garbage_meter; std::unique_ptr outfile; std::unique_ptr builder; + Output* current_output() { if (outputs.empty()) { - // This subcompaction's outptut could be empty if compaction was aborted + // This subcompaction's output could be empty if compaction was aborted // before this subcompaction had a chance to generate any output files. // When subcompactions are executed sequentially this is more likely and - // will be particulalry likely for the later subcompactions to be empty. + // will be particularly likely for the later subcompactions to be empty. // Once they are run in parallel however it should be much rarer. return nullptr; } else { @@ -140,13 +170,20 @@ } } - uint64_t current_output_file_size; + // Some identified files with old oldest ancester time and the range should be + // isolated out so that the output file(s) in that range can be merged down + // for TTL and clear the timestamps for the range. + std::vector files_to_cut_for_ttl; + int cur_files_to_cut_for_ttl = -1; + int next_files_to_cut_for_ttl = 0; + + uint64_t current_output_file_size = 0; // State during the subcompaction - uint64_t total_bytes; - uint64_t num_output_records; + uint64_t total_bytes = 0; + uint64_t num_output_records = 0; CompactionJobStats compaction_job_stats; - uint64_t approx_size; + uint64_t approx_size = 0; // An index that used to speed up ShouldStopBefore(). 
size_t grandparent_index = 0; // The number of bytes overlapping between the current output and @@ -154,49 +191,35 @@ uint64_t overlapped_bytes = 0; // A flag determine whether the key has been seen in ShouldStopBefore() bool seen_key = false; + // sub compaction job id, which is used to identify different sub-compaction + // within the same compaction job. + const uint32_t sub_job_id; - SubcompactionState(Compaction* c, Slice* _start, Slice* _end, - uint64_t size = 0) + SubcompactionState(Compaction* c, Slice* _start, Slice* _end, uint64_t size, + uint32_t _sub_job_id) : compaction(c), start(_start), end(_end), - outfile(nullptr), - builder(nullptr), - current_output_file_size(0), - total_bytes(0), - num_output_records(0), approx_size(size), - grandparent_index(0), - overlapped_bytes(0), - seen_key(false) { + sub_job_id(_sub_job_id) { assert(compaction != nullptr); } - SubcompactionState(SubcompactionState&& o) { *this = std::move(o); } - - SubcompactionState& operator=(SubcompactionState&& o) { - compaction = std::move(o.compaction); - start = std::move(o.start); - end = std::move(o.end); - status = std::move(o.status); - outputs = std::move(o.outputs); - outfile = std::move(o.outfile); - builder = std::move(o.builder); - current_output_file_size = std::move(o.current_output_file_size); - total_bytes = std::move(o.total_bytes); - num_output_records = std::move(o.num_output_records); - compaction_job_stats = std::move(o.compaction_job_stats); - approx_size = std::move(o.approx_size); - grandparent_index = std::move(o.grandparent_index); - overlapped_bytes = std::move(o.overlapped_bytes); - seen_key = std::move(o.seen_key); - return *this; + // Adds the key and value to the builder + // If paranoid is true, adds the key-value to the paranoid hash + Status AddToBuilder(const Slice& key, const Slice& value) { + auto curr = current_output(); + assert(builder != nullptr); + assert(curr != nullptr); + Status s = curr->validator.Add(key, value); + if (!s.ok()) { + return 
s; + } + builder->Add(key, value); + return Status::OK(); } - // Because member std::unique_ptrs do not have these. - SubcompactionState(const SubcompactionState&) = delete; - - SubcompactionState& operator=(const SubcompactionState&) = delete; + void FillFilesToCutForTtl(); // Returns true iff we should stop building the current output // before processing "internal_key". @@ -205,6 +228,7 @@ &compaction->column_family_data()->internal_comparator(); const std::vector& grandparents = compaction->grandparents(); + bool grandparant_file_switched = false; // Scan to find earliest grandparent file that contains key. while (grandparent_index < grandparents.size() && icmp->Compare(internal_key, @@ -212,6 +236,7 @@ 0) { if (seen_key) { overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize(); + grandparant_file_switched = true; } assert(grandparent_index + 1 >= grandparents.size() || icmp->Compare( @@ -221,17 +246,99 @@ } seen_key = true; - if (overlapped_bytes + curr_file_size > - compaction->max_compaction_bytes()) { + if (grandparant_file_switched && overlapped_bytes + curr_file_size > + compaction->max_compaction_bytes()) { // Too much overlap for current output; start new output overlapped_bytes = 0; return true; } + if (!files_to_cut_for_ttl.empty()) { + if (cur_files_to_cut_for_ttl != -1) { + // Previous key is inside the range of a file + if (icmp->Compare(internal_key, + files_to_cut_for_ttl[cur_files_to_cut_for_ttl] + ->largest.Encode()) > 0) { + next_files_to_cut_for_ttl = cur_files_to_cut_for_ttl + 1; + cur_files_to_cut_for_ttl = -1; + return true; + } + } else { + // Look for the key position + while (next_files_to_cut_for_ttl < + static_cast(files_to_cut_for_ttl.size())) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl[next_files_to_cut_for_ttl] + ->smallest.Encode()) >= 0) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl[next_files_to_cut_for_ttl] + ->largest.Encode()) <= 0) { + // With in the current file + 
cur_files_to_cut_for_ttl = next_files_to_cut_for_ttl; + return true; + } + // Beyond the current file + next_files_to_cut_for_ttl++; + } else { + // Still fall into the gap + break; + } + } + } + } + return false; } + + Status ProcessOutFlowIfNeeded(const Slice& key, const Slice& value) { + if (!blob_garbage_meter) { + return Status::OK(); + } + + return blob_garbage_meter->ProcessOutFlow(key, value); + } }; +void CompactionJob::SubcompactionState::FillFilesToCutForTtl() { + if (compaction->immutable_options()->compaction_style != + CompactionStyle::kCompactionStyleLevel || + compaction->immutable_options()->compaction_pri != + CompactionPri::kMinOverlappingRatio || + compaction->mutable_cf_options()->ttl == 0 || + compaction->num_input_levels() < 2 || compaction->bottommost_level()) { + return; + } + + // We define new file with oldest ancestor time to be younger than 1/4 TTL, + // and an old one to be older than 1/2 TTL time. + int64_t temp_current_time; + auto get_time_status = compaction->immutable_options()->clock->GetCurrentTime( + &temp_current_time); + if (!get_time_status.ok()) { + return; + } + uint64_t current_time = static_cast(temp_current_time); + if (current_time < compaction->mutable_cf_options()->ttl) { + return; + } + uint64_t old_age_thres = + current_time - compaction->mutable_cf_options()->ttl / 2; + + const std::vector& olevel = + *(compaction->inputs(compaction->num_input_levels() - 1)); + for (FileMetaData* file : olevel) { + // Worth filtering out by start and end? + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + // We put old files if they are not too small to prevent a flood + // of small files. 
+ if (oldest_ancester_time < old_age_thres && + file->fd.GetFileSize() > + compaction->mutable_cf_options()->target_file_size_base / 2) { + files_to_cut_for_ttl.push_back(file); + } + } +} + // Maintains state for the entire compaction struct CompactionJob::CompactionState { Compaction* const compaction; @@ -241,21 +348,13 @@ std::vector sub_compact_states; Status status; - uint64_t total_bytes; - uint64_t num_output_records; - - explicit CompactionState(Compaction* c) - : compaction(c), - total_bytes(0), - num_output_records(0) {} + size_t num_output_files = 0; + uint64_t total_bytes = 0; + size_t num_blob_output_files = 0; + uint64_t total_blob_bytes = 0; + uint64_t num_output_records = 0; - size_t NumOutputFiles() { - size_t total = 0; - for (auto& s : sub_compact_states) { - total += s.outputs.size(); - } - return total; - } + explicit CompactionState(Compaction* c) : compaction(c) {} Slice SmallestUserKey() { for (const auto& sub_compact_state : sub_compact_states) { @@ -282,49 +381,78 @@ }; void CompactionJob::AggregateStatistics() { + assert(compact_); + for (SubcompactionState& sc : compact_->sub_compact_states) { + auto& outputs = sc.outputs; + + if (!outputs.empty() && !outputs.back().meta.fd.file_size) { + // An error occurred, so ignore the last output. 
+ outputs.pop_back(); + } + + compact_->num_output_files += outputs.size(); compact_->total_bytes += sc.total_bytes; - compact_->num_output_records += sc.num_output_records; - } - if (compaction_job_stats_) { - for (SubcompactionState& sc : compact_->sub_compact_states) { - compaction_job_stats_->Add(sc.compaction_job_stats); + + const auto& blobs = sc.blob_file_additions; + + compact_->num_blob_output_files += blobs.size(); + + for (const auto& blob : blobs) { + compact_->total_blob_bytes += blob.GetTotalBlobBytes(); } + + compact_->num_output_records += sc.num_output_records; + + compaction_job_stats_->Add(sc.compaction_job_stats); } } CompactionJob::CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const FileOptions& file_options, VersionSet* versions, - const std::atomic* shutting_down, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - Directory* db_directory, Directory* output_directory, Statistics* stats, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, const std::atomic* manual_compaction_paused) - : job_id_(job_id), - compact_(new CompactionState(compaction)), - compaction_job_stats_(compaction_job_stats), + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::string& db_id, const std::string& 
db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) + : compact_(new CompactionState(compaction)), compaction_stats_(compaction->compaction_reason(), 1), - dbname_(dbname), db_options_(db_options), + mutable_db_options_copy_(mutable_db_options), + log_buffer_(log_buffer), + output_directory_(output_directory), + stats_(stats), + bottommost_level_(false), + write_hint_(Env::WLTH_NOT_SET), + job_id_(job_id), + compaction_job_stats_(compaction_job_stats), + dbname_(dbname), + db_id_(db_id), + db_session_id_(db_session_id), file_options_(file_options), env_(db_options.env), - fs_(db_options.fs.get()), + io_tracer_(io_tracer), + fs_(db_options.fs, io_tracer), file_options_for_read_( fs_->OptimizeForCompactionTableRead(file_options, db_options_)), versions_(versions), shutting_down_(shutting_down), manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), - log_buffer_(log_buffer), db_directory_(db_directory), - output_directory_(output_directory), - stats_(stats), + blob_output_directory_(blob_output_directory), db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), @@ -332,11 +460,12 @@ snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), event_logger_(event_logger), - bottommost_level_(false), paranoid_file_checks_(paranoid_file_checks), measure_io_stats_(measure_io_stats), - write_hint_(Env::WLTH_NOT_SET), - thread_pri_(thread_pri) { + thread_pri_(thread_pri), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { + assert(compaction_job_stats_ != nullptr); assert(log_buffer_ != nullptr); const auto* cfd = compact_->compaction->column_family_data(); ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, @@ -388,17 +517,16 @@ // to ensure GetThreadList() can always show them all together. 
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); - if (compaction_job_stats_) { - compaction_job_stats_->is_manual_compaction = - compaction->is_manual_compaction(); - } + compaction_job_stats_->is_manual_compaction = + compaction->is_manual_compaction(); + compaction_job_stats_->is_full_compaction = compaction->is_full_compaction(); } void CompactionJob::Prepare() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PREPARE); - // Generate file_levels_ for compaction berfore making Iterator + // Generate file_levels_ for compaction before making Iterator auto* c = compact_->compaction; assert(c->column_family_data() != nullptr); assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( @@ -410,7 +538,7 @@ if (c->ShouldFormSubcompactions()) { { - StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME); + StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); GenSubcompactionBoundaries(); } assert(sizes_.size() == boundaries_.size() + 1); @@ -418,12 +546,18 @@ for (size_t i = 0; i <= boundaries_.size(); i++) { Slice* start = i == 0 ? nullptr : &boundaries_[i - 1]; Slice* end = i == boundaries_.size() ? 
nullptr : &boundaries_[i]; - compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]); + compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i], + static_cast(i)); } RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, compact_->sub_compact_states.size()); } else { - compact_->sub_compact_states.emplace_back(c, nullptr, nullptr); + constexpr Slice* start = nullptr; + constexpr Slice* end = nullptr; + constexpr uint64_t size = 0; + + compact_->sub_compact_states.emplace_back(c, start, end, size, + /*sub_job_id*/ 0); } } @@ -529,9 +663,10 @@ int base_level = v->storage_info()->base_level(); uint64_t max_output_files = static_cast(std::ceil( sum / min_file_fill_percent / - MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl, - c->immutable_cf_options()->compaction_style, base_level, - c->immutable_cf_options()->level_compaction_dynamic_level_bytes))); + MaxFileSizeForLevel( + *(c->mutable_cf_options()), out_lvl, + c->immutable_options()->compaction_style, base_level, + c->immutable_options()->level_compaction_dynamic_level_bytes))); uint64_t subcompactions = std::min({static_cast(ranges.size()), static_cast(c->max_subcompactions()), @@ -542,7 +677,7 @@ // Greedily add ranges to the subcompaction until the sum of the ranges' // sizes becomes >= the expected mean size of a subcompaction sum = 0; - for (size_t i = 0; i < ranges.size() - 1; i++) { + for (size_t i = 0; i + 1 < ranges.size(); i++) { sum += ranges[i].size; if (subcompactions == 1) { // If there's only one left to schedule then it goes to the end so no @@ -572,7 +707,7 @@ const size_t num_threads = compact_->sub_compact_states.size(); assert(num_threads > 0); - const uint64_t start_micros = env_->NowMicros(); + const uint64_t start_micros = db_options_.clock->NowMicros(); // Launch a thread for each of subcompactions 1...num_threads-1 std::vector thread_pool; @@ -591,7 +726,7 @@ thread.join(); } - compaction_stats_.micros = env_->NowMicros() - start_micros; + 
compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; compaction_stats_.cpu_micros = 0; for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { compaction_stats_.cpu_micros += @@ -606,33 +741,62 @@ // Check if any thread encountered an error during execution Status status; + IOStatus io_s; + bool wrote_new_blob_files = false; + for (const auto& state : compact_->sub_compact_states) { if (!state.status.ok()) { status = state.status; + io_s = state.io_status; break; } + + if (!state.blob_file_additions.empty()) { + wrote_new_blob_files = true; + } } - if (status.ok() && output_directory_) { - status = output_directory_->Fsync(); + if (io_status_.ok()) { + io_status_ = io_s; } + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && + blob_output_directory_ != output_directory_) { + io_s = blob_output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } if (status.ok()) { thread_pool.clear(); - std::vector files_meta; + std::vector files_output; for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { - files_meta.emplace_back(&output.meta); + files_output.emplace_back(&output); } } ColumnFamilyData* cfd = compact_->compaction->column_family_data(); - auto prefix_extractor = - compact_->compaction->mutable_cf_options()->prefix_extractor.get(); - std::atomic next_file_meta_idx(0); + auto& prefix_extractor = + compact_->compaction->mutable_cf_options()->prefix_extractor; + std::atomic next_file_idx(0); auto verify_table = [&](Status& output_status) { while (true) { - size_t file_idx = 
next_file_meta_idx.fetch_add(1); - if (file_idx >= files_meta.size()) { + size_t file_idx = next_file_idx.fetch_add(1); + if (file_idx >= files_output.size()) { break; } // Verify that the table is usable @@ -641,21 +805,40 @@ // No matter whether use_direct_io_for_flush_and_compaction is true, // we will regard this verification as user reads since the goal is // to cache it here for further user reads + ReadOptions read_options; InternalIterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), file_options_, cfd->internal_comparator(), - *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + read_options, file_options_, cfd->internal_comparator(), + files_output[file_idx]->meta, /*range_del_agg=*/nullptr, + prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, /*skip_filters=*/false, compact_->compaction->output_level(), + MaxFileSizeForL0MetaPin( + *compact_->compaction->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} - s = iter->status(); + OutputValidator validator(cfd->internal_comparator(), + /*_enable_order_check=*/true, + /*_enable_hash=*/true); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = validator.Add(iter->key(), iter->value()); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + s = iter->status(); + } + if (s.ok() && + !validator.CompareValidator(files_output[file_idx]->validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } } delete iter; @@ -686,7 +869,7 @@ for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { auto fn = - 
TableFileName(state.compaction->immutable_cf_options()->cf_paths, + TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); tp[fn] = output.table_properties; } @@ -696,6 +879,7 @@ // Finish up all book-keeping to unify the subcompaction results AggregateStatistics(); UpdateCompactionStats(); + RecordCompactionIOStats(); LogFlush(db_options_.info_log); TEST_SYNC_POINT("CompactionJob::Run():End"); @@ -705,17 +889,26 @@ } Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + assert(compact_); + AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_INSTALL); db_mutex_->AssertHeld(); Status status = compact_->status; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd); + cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), thread_pri_, compaction_stats_); if (status.ok()) { status = InstallCompactionResults(mutable_cf_options); } + if (!versions_->io_status().ok()) { + io_status_ = versions_->io_status(); + } + VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); const auto& stats = compaction_stats_; @@ -725,63 +918,86 @@ double bytes_read_per_sec = 0; double bytes_written_per_sec = 0; - if (stats.bytes_read_non_output_levels > 0) { - read_write_amp = (stats.bytes_written + stats.bytes_read_output_level + - stats.bytes_read_non_output_levels) / - static_cast(stats.bytes_read_non_output_levels); - write_amp = stats.bytes_written / - static_cast(stats.bytes_read_non_output_levels); + const uint64_t bytes_read_non_output_and_blob = + stats.bytes_read_non_output_levels + stats.bytes_read_blob; + const uint64_t bytes_read_all = + stats.bytes_read_output_level + bytes_read_non_output_and_blob; + const uint64_t bytes_written_all = + stats.bytes_written + stats.bytes_written_blob; + + if (bytes_read_non_output_and_blob > 0) { + read_write_amp = (bytes_written_all + 
bytes_read_all) / + static_cast(bytes_read_non_output_and_blob); + write_amp = + bytes_written_all / static_cast(bytes_read_non_output_and_blob); } if (stats.micros > 0) { - bytes_read_per_sec = - (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / - static_cast(stats.micros); + bytes_read_per_sec = bytes_read_all / static_cast(stats.micros); bytes_written_per_sec = - stats.bytes_written / static_cast(stats.micros); + bytes_written_all / static_cast(stats.micros); } + const std::string& column_family_name = cfd->GetName(); + + constexpr double kMB = 1048576.0; + ROCKS_LOG_BUFFER( log_buffer_, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " - "files in(%d, %d) out(%d) " - "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s, records in: %" PRIu64 + "files in(%d, %d) out(%d +%d blob) " + "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), " + "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64 ", records dropped: %" PRIu64 " output_compression: %s\n", - cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, - bytes_written_per_sec, compact_->compaction->output_level(), + column_family_name.c_str(), vstorage->LevelSummary(&tmp), + bytes_read_per_sec, bytes_written_per_sec, + compact_->compaction->output_level(), stats.num_input_files_in_non_output_levels, stats.num_input_files_in_output_level, stats.num_output_files, - stats.bytes_read_non_output_levels / 1048576.0, - stats.bytes_read_output_level / 1048576.0, - stats.bytes_written / 1048576.0, read_write_amp, write_amp, - status.ToString().c_str(), stats.num_input_records, + stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB, + stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB, + stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp, + write_amp, status.ToString().c_str(), stats.num_input_records, stats.num_dropped_records, 
CompressionTypeToString(compact_->compaction->output_compression()) .c_str()); + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 + "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } + UpdateCompactionJobStats(stats); - auto stream = event_logger_->LogToBuffer(log_buffer_); + auto stream = event_logger_->LogToBuffer(log_buffer_, 8192); stream << "job" << job_id_ << "event" << "compaction_finished" << "compaction_time_micros" << stats.micros << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" << compact_->compaction->output_level() << "num_output_files" - << compact_->NumOutputFiles() << "total_output_size" - << compact_->total_bytes << "num_input_records" - << stats.num_input_records << "num_output_records" - << compact_->num_output_records << "num_subcompactions" - << compact_->sub_compact_states.size() << "output_compression" - << CompressionTypeToString(compact_->compaction->output_compression()); + << compact_->num_output_files << "total_output_size" + << compact_->total_bytes; - if (compaction_job_stats_ != nullptr) { - stream << "num_single_delete_mismatches" - << compaction_job_stats_->num_single_del_mismatch; - stream << "num_single_delete_fallthrough" - << compaction_job_stats_->num_single_del_fallthru; + if (compact_->num_blob_output_files > 0) { + stream << "num_blob_output_files" << compact_->num_blob_output_files + << "total_blob_output_size" << compact_->total_blob_bytes; } - if (measure_io_stats_ && compaction_job_stats_ != nullptr) { + stream << "num_input_records" << stats.num_input_records + << "num_output_records" << compact_->num_output_records + << "num_subcompactions" << compact_->sub_compact_states.size() + << "output_compression" + << CompressionTypeToString(compact_->compaction->output_compression()); + + stream << "num_single_delete_mismatches" + << 
compaction_job_stats_->num_single_del_mismatch; + stream << "num_single_delete_fallthrough" + << compaction_job_stats_->num_single_del_fallthru; + + if (measure_io_stats_) { stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos; stream << "file_range_sync_nanos" << compaction_job_stats_->file_range_sync_nanos; @@ -797,14 +1013,222 @@ } stream.EndArray(); + if (!blob_files.empty()) { + stream << "blob_file_head" << blob_files.begin()->first; + stream << "blob_file_tail" << blob_files.rbegin()->first; + } + CleanupCompaction(); return status; } +#ifndef ROCKSDB_LITE +CompactionServiceJobStatus +CompactionJob::ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact) { + assert(sub_compact); + assert(sub_compact->compaction); + assert(db_options_.compaction_service); + + const Compaction* compaction = sub_compact->compaction; + CompactionServiceInput compaction_input; + compaction_input.output_level = compaction->output_level(); + + const std::vector& inputs = + *(compact_->compaction->inputs()); + for (const auto& files_per_level : inputs) { + for (const auto& file : files_per_level.files) { + compaction_input.input_files.emplace_back( + MakeTableFileName(file->fd.GetNumber())); + } + } + compaction_input.column_family.name = + compaction->column_family_data()->GetName(); + compaction_input.column_family.options = + compaction->column_family_data()->GetLatestCFOptions(); + compaction_input.db_options = + BuildDBOptions(db_options_, mutable_db_options_copy_); + compaction_input.snapshots = existing_snapshots_; + compaction_input.has_begin = sub_compact->start; + compaction_input.begin = + compaction_input.has_begin ? sub_compact->start->ToString() : ""; + compaction_input.has_end = sub_compact->end; + compaction_input.end = + compaction_input.has_end ? 
sub_compact->end->ToString() : ""; + compaction_input.approx_size = sub_compact->approx_size; + + std::string compaction_input_binary; + Status s = compaction_input.Write(&compaction_input_binary); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + std::ostringstream input_files_oss; + bool is_first_one = true; + for (const auto& file : compaction_input.input_files) { + input_files_oss << (is_first_one ? "" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, + GetCompactionId(sub_compact), thread_pri_); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->StartV2(info, compaction_input_binary); + switch (compaction_status) { + case CompactionServiceJobStatus::kSuccess: + break; + case CompactionServiceJobStatus::kFailure: + sub_compact->status = Status::Incomplete( + "CompactionService failed to start compaction job."); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed to start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + case CompactionServiceJobStatus::kUseLocal: + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API Start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + default: + assert(false); // unknown status + break; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Waiting for remote compaction...", + compaction_input.column_family.name.c_str(), job_id_); + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForCompleteV2( + info, &compaction_result_binary); + + if 
(compaction_status == CompactionServiceJobStatus::kUseLocal) { + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API " + "WaitForComplete.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + + if (compaction_status == CompactionServiceJobStatus::kFailure) { + if (s.ok()) { + if (compaction_result.status.ok()) { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (even though " + "the internal status is okay)."); + } else { + // set the current sub compaction status with the status returned from + // remote + sub_compact->status = compaction_result.status; + } + } else { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (and no valid " + "result is returned)."); + compaction_result.status.PermitUncheckedError(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return CompactionServiceJobStatus::kFailure; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? 
"" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.marked_for_compaction = file.marked_for_compaction; + + auto cfd = compaction->column_family_data(); + sub_compact->outputs.emplace_back(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->num_output_records = compaction_result.num_output_records; + sub_compact->approx_size = compaction_input.approx_size; // is this used? 
+ sub_compact->total_bytes = compaction_result.total_bytes; + RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); + RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, + compaction_result.bytes_written); + return CompactionServiceJobStatus::kSuccess; +} +#endif // !ROCKSDB_LITE + void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { - assert(sub_compact != nullptr); + assert(sub_compact); + assert(sub_compact->compaction); - uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000; +#ifndef ROCKSDB_LITE + if (db_options_.compaction_service) { + CompactionServiceJobStatus comp_status = + ProcessKeyValueCompactionWithCompactionService(sub_compact); + if (comp_status == CompactionServiceJobStatus::kSuccess || + comp_status == CompactionServiceJobStatus::kFailure) { + return; + } + // fallback to local compaction + assert(comp_status == CompactionServiceJobStatus::kUseLocal); + } +#endif // !ROCKSDB_LITE + + uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); @@ -828,10 +1252,63 @@ CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), existing_snapshots_); + const Slice* const start = sub_compact->start; + const Slice* const end = sub_compact->end; + + ReadOptions read_options; + read_options.verify_checksums = true; + read_options.fill_cache = false; + // Compaction iterators shouldn't be confined to a single prefix. + // Compactions use Seek() for + // (a) concurrent compactions, + // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. + read_options.total_order_seek = true; + + // Note: if we're going to support subcompactions for user-defined timestamps, + // the timestamp part will have to be stripped from the bounds here. 
+ assert((!start && !end) || cfd->user_comparator()->timestamp_size() == 0); + read_options.iterate_lower_bound = start; + read_options.iterate_upper_bound = end; + // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. - std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, &range_del_agg, file_options_for_read_)); + std::unique_ptr raw_input( + versions_->MakeInputIterator(read_options, sub_compact->compaction, + &range_del_agg, file_options_for_read_)); + InternalIterator* input = raw_input.get(); + + IterKey start_ikey; + IterKey end_ikey; + Slice start_slice; + Slice end_slice; + + if (start) { + start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); + start_slice = start_ikey.GetInternalKey(); + } + if (end) { + end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek); + end_slice = end_ikey.GetInternalKey(); + } + + std::unique_ptr clip; + if (start || end) { + clip.reset(new ClippingIterator( + raw_input.get(), start ? &start_slice : nullptr, + end ? &end_slice : nullptr, &cfd->internal_comparator())); + input = clip.get(); + } + + std::unique_ptr blob_counter; + + if (sub_compact->compaction->DoesInputReferenceBlobFiles()) { + sub_compact->blob_garbage_meter.reset(new BlobGarbageMeter); + blob_counter.reset( + new BlobCountingIterator(input, sub_compact->blob_garbage_meter.get())); + input = blob_counter.get(); + } + + input->SeekToFirst(); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -857,40 +1334,51 @@ } MergeHelper merge( - env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(), compaction_filter, db_options_.info_log.get(), false /* internal key corruption is expected */, existing_snapshots_.empty() ? 
0 : existing_snapshots_.back(), - snapshot_checker_, compact_->compaction->level(), - db_options_.statistics.get()); + snapshot_checker_, compact_->compaction->level(), db_options_.stats); + + const MutableCFOptions* mutable_cf_options = + sub_compact->compaction->mutable_cf_options(); + assert(mutable_cf_options); + + std::vector blob_file_paths; + + std::unique_ptr blob_file_builder( + mutable_cf_options->enable_blob_files + ? new BlobFileBuilder( + versions_, fs_.get(), + sub_compact->compaction->immutable_options(), + mutable_cf_options, &file_options_, job_id_, cfd->GetID(), + cfd->GetName(), Env::IOPriority::IO_LOW, write_hint_, + io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction, + &blob_file_paths, &sub_compact->blob_file_additions) + : nullptr); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); TEST_SYNC_POINT_CALLBACK( "CompactionJob::Run():PausingManualCompaction:1", reinterpret_cast( - const_cast*>(manual_compaction_paused_))); - - Slice* start = sub_compact->start; - Slice* end = sub_compact->end; - if (start != nullptr) { - IterKey start_iter; - start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); - input->Seek(start_iter.GetInternalKey()); - } else { - input->SeekToFirst(); - } + const_cast*>(manual_compaction_paused_))); Status status; + const std::string* const full_history_ts_low = + full_history_ts_low_.empty() ? 
nullptr : &full_history_ts_low_; sub_compact->c_iter.reset(new CompactionIterator( - input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), + input, cfd->user_comparator(), &merge, versions_->LastSequence(), &existing_snapshots_, earliest_write_conflict_snapshot_, - snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, - &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, - db_options_.info_log)); + snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), + /*expect_valid_internal_key=*/true, &range_del_agg, + blob_file_builder.get(), db_options_.allow_data_in_errors, + sub_compact->compaction, compaction_filter, shutting_down_, + preserve_deletes_seqnum_, manual_compaction_paused_, + manual_compaction_canceled_, db_options_.info_log, full_history_ts_low)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { + sub_compact->FillFilesToCutForTtl(); // ShouldStopBefore() maintains state based on keys processed so far. The // compaction loop always calls it on the "next" key, thus won't tell it the // first key. So we do that here. @@ -899,18 +1387,21 @@ } const auto& c_iter_stats = c_iter->iter_stats(); + std::unique_ptr partitioner = + sub_compact->compaction->output_level() == 0 + ? nullptr + : sub_compact->compaction->CreateSstPartitioner(); + std::string last_key_for_partitioner; + while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() // returns true. 
const Slice& key = c_iter->key(); const Slice& value = c_iter->value(); - // If an end key (exclusive) is specified, check if the current key is - // >= than it and exit if it is because the iterator is out of its range - if (end != nullptr && - cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) { - break; - } + assert(!end || + cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0); + if (c_iter_stats.num_input_records % kRecordStatsEvery == kRecordStatsEvery - 1) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); @@ -925,10 +1416,18 @@ break; } } - assert(sub_compact->builder != nullptr); - assert(sub_compact->current_output() != nullptr); - sub_compact->builder->Add(key, value); - sub_compact->current_output_file_size = sub_compact->builder->FileSize(); + status = sub_compact->AddToBuilder(key, value); + if (!status.ok()) { + break; + } + + status = sub_compact->ProcessOutFlowIfNeeded(key, value); + if (!status.ok()) { + break; + } + + sub_compact->current_output_file_size = + sub_compact->builder->EstimatedFileSize(); const ParsedInternalKey& ikey = c_iter->ikey(); sub_compact->current_output()->meta.UpdateBoundaries( key, value, ikey.sequence, ikey.type); @@ -943,33 +1442,39 @@ // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB // and 0.6MB instead of 1MB and 0.2MB) bool output_file_ended = false; - Status input_status; if (sub_compact->compaction->output_level() != 0 && sub_compact->current_output_file_size >= sub_compact->compaction->max_output_file_size()) { // (1) this key terminates the file. For historical reasons, the iterator // status before advancing will be given to FinishCompactionOutputFile(). 
- input_status = input->status(); output_file_ended = true; } TEST_SYNC_POINT_CALLBACK( "CompactionJob::Run():PausingManualCompaction:2", reinterpret_cast( - const_cast*>(manual_compaction_paused_))); + const_cast*>(manual_compaction_paused_))); + if (partitioner.get()) { + last_key_for_partitioner.assign(c_iter->user_key().data_, + c_iter->user_key().size_); + } c_iter->Next(); if (c_iter->status().IsManualCompactionPaused()) { break; } - if (!output_file_ended && c_iter->Valid() && - sub_compact->compaction->output_level() != 0 && - sub_compact->ShouldStopBefore(c_iter->key(), - sub_compact->current_output_file_size) && - sub_compact->builder != nullptr) { - // (2) this key belongs to the next file. For historical reasons, the - // iterator status after advancing will be given to - // FinishCompactionOutputFile(). - input_status = input->status(); - output_file_ended = true; + if (!output_file_ended && c_iter->Valid()) { + if (((partitioner.get() && + partitioner->ShouldPartition(PartitionerRequest( + last_key_for_partitioner, c_iter->user_key(), + sub_compact->current_output_file_size)) == kRequired) || + (sub_compact->compaction->output_level() != 0 && + sub_compact->ShouldStopBefore( + c_iter->key(), sub_compact->current_output_file_size))) && + sub_compact->builder != nullptr) { + // (2) this key belongs to the next file. For historical reasons, the + // iterator status after advancing will be given to + // FinishCompactionOutputFile(). 
+ output_file_ended = true; + } } if (output_file_ended) { const Slice* next_key = nullptr; @@ -977,14 +1482,18 @@ next_key = &c_iter->key(); } CompactionIterationStats range_del_out_stats; - status = - FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg, - &range_del_out_stats, next_key); + status = FinishCompactionOutputFile(input->status(), sub_compact, + &range_del_agg, &range_del_out_stats, + next_key); RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); } } + sub_compact->compaction_job_stats.num_blobs_read = + c_iter_stats.num_blobs_read; + sub_compact->compaction_job_stats.total_blob_bytes_read = + c_iter_stats.total_blob_bytes_read; sub_compact->compaction_job_stats.num_input_deletion_records = c_iter_stats.num_input_deletion_records; sub_compact->compaction_job_stats.num_corrupt_keys = @@ -1000,6 +1509,16 @@ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, c_iter_stats.total_filter_time); + + if (c_iter_stats.num_blobs_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED, + c_iter_stats.num_blobs_relocated); + } + if (c_iter_stats.total_blob_bytes_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED, + c_iter_stats.total_blob_bytes_relocated); + } + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); RecordCompactionIOStats(); @@ -1012,8 +1531,10 @@ status = Status::ShutdownInProgress("Database shutdown"); } if ((status.ok() || status.IsColumnFamilyDropped()) && - (manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed))) { + ((manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)))) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } if (status.ok()) { @@ -1035,14 +1556,23 @@ CompactionIterationStats range_del_out_stats; Status s = FinishCompactionOutputFile(status, 
sub_compact, &range_del_agg, &range_del_out_stats); - if (status.ok()) { + if (!s.ok() && status.ok()) { status = s; } RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); } + if (blob_file_builder) { + if (status.ok()) { + status = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(status); + } + blob_file_builder.reset(); + } + sub_compact->compaction_job_stats.cpu_micros = - env_->NowCPUNanos() / 1000 - prev_cpu_micros; + db_options_.clock->CPUMicros() - prev_cpu_micros; if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += @@ -1061,12 +1591,28 @@ SetPerfLevel(prev_perf_level); } } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!status.ok()) { + if (sub_compact->c_iter) { + sub_compact->c_iter->status().PermitUncheckedError(); + } + if (input) { + input->status().PermitUncheckedError(); + } + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED sub_compact->c_iter.reset(); - input.reset(); + blob_counter.reset(); + clip.reset(); + raw_input.reset(); sub_compact->status = status; } +uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) { + return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id; +} + void CompactionJob::RecordDroppedKeys( const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats) { @@ -1121,6 +1667,8 @@ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); const Comparator* ucmp = cfd->user_comparator(); + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; // Check for iterator errors Status s = input_status; @@ -1194,6 +1742,7 @@ } else { it->SeekToFirst(); } + TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1"); for (; it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); if (upper_bound != nullptr) { @@ -1221,6 +1770,7 @@ auto kv = tombstone.Serialize(); assert(lower_bound == nullptr || ucmp->Compare(*lower_bound, kv.second) < 0); + // Range 
tombstone is not supported by output validator yet. sub_compact->builder->Add(kv.first.Encode(), kv.second); InternalKey smallest_candidate = std::move(kv.first); if (lower_bound != nullptr && @@ -1277,7 +1827,6 @@ meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, tombstone.seq_, cfd->internal_comparator()); - // The smallest key in a file is used for range tombstone truncation, so // it cannot have a seqnum of 0 (unless the smallest data key in a file // has a seqnum of 0). Otherwise, the truncated tombstone may expose @@ -1286,7 +1835,6 @@ ExtractInternalKeyFooter(meta->smallest.Encode()) != PackSequenceAndType(0, kTypeRangeDeletion)); } - meta->marked_for_compaction = sub_compact->builder->NeedCompact(); } const uint64_t current_entries = sub_compact->builder->NumEntries(); if (s.ok()) { @@ -1294,25 +1842,59 @@ } else { sub_compact->builder->Abandon(); } + IOStatus io_s = sub_compact->builder->io_status(); + if (s.ok()) { + s = io_s; + } const uint64_t current_bytes = sub_compact->builder->FileSize(); if (s.ok()) { - // Add the checksum information to file metadata. - meta->file_checksum = sub_compact->builder->GetFileChecksum(); - meta->file_checksum_func_name = - sub_compact->builder->GetFileChecksumFuncName(); - meta->fd.file_size = current_bytes; + meta->marked_for_compaction = sub_compact->builder->NeedCompact(); + // With accurate smallest and largest key, we can get a slightly more + // accurate oldest ancester time. + // This makes oldest ancester time in manifest more accurate than in + // table properties. Not sure how to resolve it. 
+ if (meta->smallest.size() > 0 && meta->largest.size() > 0) { + uint64_t refined_oldest_ancester_time; + Slice new_smallest = meta->smallest.user_key(); + Slice new_largest = meta->largest.user_key(); + if (!new_largest.empty() && !new_smallest.empty()) { + refined_oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime( + &(meta->smallest), &(meta->largest)); + if (refined_oldest_ancester_time != port::kMaxUint64) { + meta->oldest_ancester_time = refined_oldest_ancester_time; + } + } + } } sub_compact->current_output()->finished = true; sub_compact->total_bytes += current_bytes; // Finish and check for file errors if (s.ok()) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = sub_compact->outfile->Sync(db_options_.use_fsync); + StopWatch sw(db_options_.clock, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + io_s = sub_compact->outfile->Sync(db_options_.use_fsync); + } + if (s.ok() && io_s.ok()) { + io_s = sub_compact->outfile->Close(); + } + if (s.ok() && io_s.ok()) { + // Add the checksum information to file metadata. + meta->file_checksum = sub_compact->outfile->GetFileChecksum(); + meta->file_checksum_func_name = + sub_compact->outfile->GetFileChecksumFuncName(); + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; } if (s.ok()) { - s = sub_compact->outfile->Close(); + s = io_s; + } + if (sub_compact->io_status.ok()) { + sub_compact->io_status = io_s; + // Since this error is really a copy of the + // "normal" status, it does not also need to be checked + sub_compact->io_status.PermitUncheckedError(); } sub_compact->outfile.reset(); @@ -1326,9 +1908,20 @@ // This happens when the output level is bottom level, at the same time // the sub_compact output nothing. 
std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + TableFileName(sub_compact->compaction->immutable_options()->cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); - env_->DeleteFile(fname); + + // TODO(AR) it is not clear if there are any larger implications if + // DeleteFile fails here + Status ds = env_->DeleteFile(fname); + if (!ds.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64 + " at bottom level%s", + cfd->GetName().c_str(), job_id_, output_number, + meta->marked_for_compaction ? " (need compaction)" : ""); + } // Also need to remove the file from outputs, or it will be added to the // VersionEdit. @@ -1352,9 +1945,7 @@ FileDescriptor output_fd; uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; if (meta != nullptr) { - fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - meta->fd.GetNumber(), meta->fd.GetPathId()); + fname = GetTableFileName(meta->fd.GetNumber()); output_fd = meta->fd; oldest_blob_file_number = meta->oldest_blob_file_number; } else { @@ -1363,14 +1954,18 @@ EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, output_fd, oldest_blob_file_number, tp, - TableFileCreationReason::kCompaction, s); + TableFileCreationReason::kCompaction, s, file_checksum, + file_checksum_func_name); #ifndef ROCKSDB_LITE // Report new file to SstFileManagerImpl auto sfm = static_cast(db_options_.sst_file_manager.get()); if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) { - sfm->OnAddFile(fname); + Status add_s = sfm->OnAddFile(fname); + if (!add_s.ok() && s.ok()) { + s = add_s; + } if (sfm->IsMaxAllowedSpaceReached()) { // TODO(ajkr): should we return OK() if max space was reached by the final // compaction output file (similarly to how flush works when full)? 
@@ -1391,49 +1986,86 @@ Status CompactionJob::InstallCompactionResults( const MutableCFOptions& mutable_cf_options) { + assert(compact_); + db_mutex_->AssertHeld(); auto* compaction = compact_->compaction; - // paranoia: verify that the files that we started with - // still exist in the current version and in the same original level. - // This ensures that a concurrent compaction did not erroneously - // pick the same files to compact_. - if (!versions_->VerifyCompactionFileConsistency(compaction)) { - Compaction::InputLevelSummaryBuffer inputs_summary; - - ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted", - compaction->column_family_data()->GetName().c_str(), - job_id_, compaction->InputLevelSummary(&inputs_summary)); - return Status::Corruption("Compaction input files inconsistent"); - } + assert(compaction); { Compaction::InputLevelSummaryBuffer inputs_summary; - ROCKS_LOG_INFO( - db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", - compaction->column_family_data()->GetName().c_str(), job_id_, - compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), + compact_->total_bytes + compact_->total_blob_bytes); } + VersionEdit* const edit = compaction->edit(); + assert(edit); + // Add compaction inputs - compaction->AddInputDeletions(compact_->compaction->edit()); + compaction->AddInputDeletions(edit); + + std::unordered_map blob_total_garbage; for (const auto& sub_compact : compact_->sub_compact_states) { for (const auto& out : sub_compact.outputs) { - compaction->edit()->AddFile(compaction->output_level(), out.meta); + edit->AddFile(compaction->output_level(), out.meta); + } + + for (const auto& blob : sub_compact.blob_file_additions) { + edit->AddBlobFile(blob); } + + if 
(sub_compact.blob_garbage_meter) { + const auto& flows = sub_compact.blob_garbage_meter->flows(); + + for (const auto& pair : flows) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobInOutFlow& flow = pair.second; + + assert(flow.IsValid()); + if (flow.HasGarbage()) { + blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(), + flow.GetGarbageBytes()); + } + } + } + } + + for (const auto& pair : blob_total_garbage) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobStats& stats = pair.second; + + edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(), + stats.GetBytes()); } + return versions_->LogAndApply(compaction->column_family_data(), - mutable_cf_options, compaction->edit(), - db_mutex_, db_directory_); + mutable_cf_options, edit, db_mutex_, + db_directory_); } void CompactionJob::RecordCompactionIOStats() { RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + CompactionReason compaction_reason = + compact_->compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kPeriodicCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kTtl) { + RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written)); + } ThreadStatusUtil::IncreaseThreadOperationProperty( ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read)); IOSTATS_RESET(bytes_read); - RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); 
ThreadStatusUtil::IncreaseThreadOperationProperty( ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); @@ -1445,9 +2077,7 @@ assert(sub_compact->builder == nullptr); // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); - std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - file_number, sub_compact->compaction->output_path_id()); + std::string fname = GetTableFileName(file_number); // Fire events. ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); #ifndef ROCKSDB_LITE @@ -1462,7 +2092,25 @@ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", &syncpoint_arg); #endif - Status s = NewWritableFile(fs_, fname, &writable_file, file_options_); + + // Pass temperature of botommost files to FileSystem. + FileOptions fo_copy = file_options_; + Temperature temperature = sub_compact->compaction->output_temperature(); + if (temperature == Temperature::kUnknown && bottommost_level_) { + temperature = + sub_compact->compaction->mutable_cf_options()->bottommost_temperature; + } + fo_copy.temperature = temperature; + + Status s; + IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); + s = io_s; + if (sub_compact->io_status.ok()) { + sub_compact->io_status = io_s; + // Since this error is really a copy of the io_s that is checked below as s, + // it does not also need to be checked. 
+ sub_compact->io_status.PermitUncheckedError(); + } if (!s.ok()) { ROCKS_LOG_ERROR( db_options_.info_log, @@ -1474,13 +2122,14 @@ EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, - TableProperties(), TableFileCreationReason::kCompaction, s); + TableProperties(), TableFileCreationReason::kCompaction, s, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); return s; } // Try to figure out the output file's oldest ancester time. int64_t temp_current_time = 0; - auto get_time_status = env_->GetCurrentTime(&temp_current_time); + auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed. if (!get_time_status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -1488,50 +2137,62 @@ get_time_status.ToString().c_str()); } uint64_t current_time = static_cast(temp_current_time); + InternalKey tmp_start, tmp_end; + if (sub_compact->start != nullptr) { + tmp_start.SetMinPossibleForUserKey(*(sub_compact->start)); + } + if (sub_compact->end != nullptr) { + tmp_end.SetMinPossibleForUserKey(*(sub_compact->end)); + } uint64_t oldest_ancester_time = - sub_compact->compaction->MinInputFileOldestAncesterTime(); + sub_compact->compaction->MinInputFileOldestAncesterTime( + (sub_compact->start != nullptr) ? &tmp_start : nullptr, + (sub_compact->end != nullptr) ? 
&tmp_end : nullptr); if (oldest_ancester_time == port::kMaxUint64) { oldest_ancester_time = current_time; } // Initialize a SubcompactionState::Output and add it to sub_compact->outputs { - SubcompactionState::Output out; - out.meta.fd = FileDescriptor(file_number, - sub_compact->compaction->output_path_id(), 0); - out.meta.oldest_ancester_time = oldest_ancester_time; - out.meta.file_creation_time = current_time; - out.finished = false; - sub_compact->outputs.push_back(out); + FileMetaData meta; + meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + meta.oldest_ancester_time = oldest_ancester_time; + meta.file_creation_time = current_time; + meta.temperature = temperature; + sub_compact->outputs.emplace_back( + std::move(meta), cfd->internal_comparator(), + /*enable_order_check=*/ + sub_compact->compaction->mutable_cf_options() + ->check_flush_compaction_key_order, + /*enable_hash=*/paranoid_file_checks_); } writable_file->SetIOPriority(Env::IOPriority::IO_LOW); writable_file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = db_options_.checksum_handoff_file_types; writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); const auto& listeners = - sub_compact->compaction->immutable_cf_options()->listeners; - sub_compact->outfile.reset( - new WritableFileWriter(std::move(writable_file), fname, file_options_, - env_, db_options_.statistics.get(), listeners, - db_options_.sst_file_checksum_func.get())); - - // If the Column family flag is to only optimize filters for hits, - // we can skip creating filters if this is the bottommost_level where - // data is going to be found - bool skip_filters = - cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; + sub_compact->compaction->immutable_options()->listeners; + sub_compact->outfile.reset(new WritableFileWriter( + std::move(writable_file), fname, file_options_, db_options_.clock, + io_tracer_, 
db_options_.stats, listeners, + db_options_.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); - sub_compact->builder.reset(NewTableBuilder( + TableBuilderOptions tboptions( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(), sub_compact->compaction->output_compression(), - 0 /*sample_for_compression */, - sub_compact->compaction->output_compression_opts(), - sub_compact->compaction->output_level(), skip_filters, - oldest_ancester_time, 0 /* oldest_key_time */, - sub_compact->compaction->max_output_file_size(), current_time)); + sub_compact->compaction->output_compression_opts(), cfd->GetID(), + cfd->GetName(), sub_compact->compaction->output_level(), + bottommost_level_, TableFileCreationReason::kCompaction, + oldest_ancester_time, 0 /* oldest_key_time */, current_time, db_id_, + db_session_id_, sub_compact->compaction->max_output_file_size(), + file_number); + sub_compact->builder.reset( + NewTableBuilder(tboptions, sub_compact->outfile.get())); LogFlush(db_options_.info_log); return s; } @@ -1554,6 +2215,9 @@ TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber()); } } + // TODO: sub_compact.io_status is not checked like status. Not sure if thats + // intentional. So ignoring the io_status as of now. 
+ sub_compact.io_status.PermitUncheckedError(); } delete compact_; compact_ = nullptr; @@ -1571,6 +2235,8 @@ #endif // !ROCKSDB_LITE void CompactionJob::UpdateCompactionStats() { + assert(compact_); + Compaction* compaction = compact_->compaction; compaction_stats_.num_input_files_in_non_output_levels = 0; compaction_stats_.num_input_files_in_output_level = 0; @@ -1588,27 +2254,20 @@ } } - uint64_t num_output_records = 0; - - for (const auto& sub_compact : compact_->sub_compact_states) { - size_t num_output_files = sub_compact.outputs.size(); - if (sub_compact.builder != nullptr) { - // An error occurred so ignore the last output. - assert(num_output_files > 0); - --num_output_files; - } - compaction_stats_.num_output_files += static_cast(num_output_files); - - num_output_records += sub_compact.num_output_records; - - for (const auto& out : sub_compact.outputs) { - compaction_stats_.bytes_written += out.meta.fd.file_size; - } - } + assert(compaction_job_stats_); + compaction_stats_.bytes_read_blob = + compaction_job_stats_->total_blob_bytes_read; + + compaction_stats_.num_output_files = + static_cast(compact_->num_output_files); + compaction_stats_.num_output_files_blob = + static_cast(compact_->num_blob_output_files); + compaction_stats_.bytes_written = compact_->total_bytes; + compaction_stats_.bytes_written_blob = compact_->total_blob_bytes; - if (compaction_stats_.num_input_records > num_output_records) { + if (compaction_stats_.num_input_records > compact_->num_output_records) { compaction_stats_.num_dropped_records = - compaction_stats_.num_input_records - num_output_records; + compaction_stats_.num_input_records - compact_->num_output_records; } } @@ -1630,32 +2289,31 @@ void CompactionJob::UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const { #ifndef ROCKSDB_LITE - if (compaction_job_stats_) { - compaction_job_stats_->elapsed_micros = stats.micros; + compaction_job_stats_->elapsed_micros = stats.micros; - // input information - 
compaction_job_stats_->total_input_bytes = - stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - compaction_job_stats_->num_input_records = stats.num_input_records; - compaction_job_stats_->num_input_files = - stats.num_input_files_in_non_output_levels + - stats.num_input_files_in_output_level; - compaction_job_stats_->num_input_files_at_output_level = - stats.num_input_files_in_output_level; - - // output information - compaction_job_stats_->total_output_bytes = stats.bytes_written; - compaction_job_stats_->num_output_records = compact_->num_output_records; - compaction_job_stats_->num_output_files = stats.num_output_files; - - if (compact_->NumOutputFiles() > 0U) { - CopyPrefix(compact_->SmallestUserKey(), - CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->smallest_output_key_prefix); - CopyPrefix(compact_->LargestUserKey(), - CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->largest_output_key_prefix); - } + // input information + compaction_job_stats_->total_input_bytes = + stats.bytes_read_non_output_levels + stats.bytes_read_output_level; + compaction_job_stats_->num_input_records = stats.num_input_records; + compaction_job_stats_->num_input_files = + stats.num_input_files_in_non_output_levels + + stats.num_input_files_in_output_level; + compaction_job_stats_->num_input_files_at_output_level = + stats.num_input_files_in_output_level; + + // output information + compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; + compaction_job_stats_->num_output_records = compact_->num_output_records; + compaction_job_stats_->num_output_files = stats.num_output_files; + compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; + + if (stats.num_output_files > 0) { + CopyPrefix(compact_->SmallestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->smallest_output_key_prefix); + 
CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->largest_output_key_prefix); } #else (void)stats; @@ -1697,4 +2355,629 @@ } } +std::string CompactionJob::GetTableFileName(uint64_t file_number) { + return TableFileName(compact_->compaction->immutable_options()->cf_paths, + file_number, compact_->compaction->output_path_id()); +} + +#ifndef ROCKSDB_LITE +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +void CompactionServiceCompactionJob::RecordCompactionIOStats() { + compaction_result_->bytes_read += IOSTATS(bytes_read); + compaction_result_->bytes_written += IOSTATS(bytes_written); + CompactionJob::RecordCompactionIOStats(); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, 0, log_buffer, nullptr, output_directory, + nullptr, stats, db_mutex, db_error_handler, existing_snapshots, + kMaxSequenceNumber, nullptr, table_cache, event_logger, + compaction->mutable_cf_options()->paranoid_file_checks, + compaction->mutable_cf_options()->report_bg_io_stats, dbname, + 
&(compaction_service_result->stats), Env::Priority::USER, io_tracer, + nullptr, nullptr, db_id, db_session_id, + compaction->column_family_data()->GetFullHistoryTsLow()), + output_path_(output_path), + compaction_input_(compaction_service_input), + compaction_result_(compaction_service_result) {} + +Status CompactionServiceCompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + + auto* c = compact_->compaction; + assert(c->column_family_data() != nullptr); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + Slice begin = compaction_input_.begin; + Slice end = compaction_input_.end; + compact_->sub_compact_states.emplace_back( + c, compaction_input_.has_begin ? &begin : nullptr, + compaction_input_.has_end ? &end : nullptr, compaction_input_.approx_size, + /*sub_job_id*/ 0); + + log_buffer_->FlushBufferToLog(); + LogCompaction(); + const uint64_t start_micros = db_options_.clock->NowMicros(); + // Pick the only sub-compaction we should have + assert(compact_->sub_compact_states.size() == 1); + SubcompactionState* sub_compact = compact_->sub_compact_states.data(); + + ProcessKeyValueCompaction(sub_compact); + + compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; + compaction_stats_.cpu_micros = sub_compact->compaction_job_stats.cpu_micros; + + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.cpu_micros); + + Status status = sub_compact->status; + IOStatus io_s = sub_compact->io_status; + + if (io_status_.ok()) { + io_status_ = io_s; + } + + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg, + 
DirFsyncOptions()); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } + if (status.ok()) { + // TODO: Add verify_table() + } + + // Finish up all book-keeping to unify the subcompaction results + AggregateStatistics(); + UpdateCompactionStats(); + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + compact_->status = status; + compact_->status.PermitUncheckedError(); + + // Build compaction result + compaction_result_->output_level = compact_->compaction->output_level(); + compaction_result_->output_path = output_path_; + for (const auto& output_file : sub_compact->outputs) { + auto& meta = output_file.meta; + compaction_result_->output_files.emplace_back( + MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.smallest.Encode().ToString(), + meta.largest.Encode().ToString(), meta.oldest_ancester_time, + meta.file_creation_time, output_file.validator.GetHash(), + meta.marked_for_compaction); + } + compaction_result_->num_output_records = sub_compact->num_output_records; + compaction_result_->total_bytes = sub_compact->total_bytes; + + return status; +} + +void CompactionServiceCompactionJob::CleanupCompaction() { + CompactionJob::CleanupCompaction(); +} + +// Internal binary format for the input and result data +enum BinaryFormatVersion : uint32_t { + kOptionsString = 1, // Use string format similar to Option string format +}; + +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyDescriptor::options) +// This call will return the offset of options in ColumnFamilyDescriptor class +// +// This is the same as offsetof() but allow us to work with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +static ColumnFamilyDescriptor dummy_cfd("", ColumnFamilyOptions()); +template +int offset_of(T1 
ColumnFamilyDescriptor::*member) { + return int(size_t(&(dummy_cfd.*member)) - size_t(&dummy_cfd)); +} + +static CompactionServiceInput dummy_cs_input; +template +int offset_of(T1 CompactionServiceInput::*member) { + return int(size_t(&(dummy_cs_input.*member)) - size_t(&dummy_cs_input)); +} + +static std::unordered_map cfd_type_info = { + {"name", + {offset_of(&ColumnFamilyDescriptor::name), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"options", + {offset_of(&ColumnFamilyDescriptor::options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto cf_options = static_cast(addr); + return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(), + value, cf_options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto cf_options = static_cast(addr); + std::string result; + auto status = + GetStringFromColumnFamilyOptions(opts, *cf_options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = CFOptionsAsConfigurable(*this_one); + auto that_conf = CFOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, +}; + +static std::unordered_map cs_input_type_info = { + {"column_family", + OptionTypeInfo::Struct("column_family", &cfd_type_info, + offset_of(&CompactionServiceInput::column_family), + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, + {"db_options", + {offset_of(&CompactionServiceInput::db_options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto options = static_cast(addr); + return GetDBOptionsFromString(opts, DBOptions(), value, options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto options = static_cast(addr); + std::string result; + auto status = GetStringFromDBOptions(opts, *options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = DBOptionsAsConfigurable(*this_one); + auto that_conf = DBOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, + {"snapshots", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::snapshots), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, + {"input_files", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::input_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kEncodedString})}, + {"output_level", + {offset_of(&CompactionServiceInput::output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_begin", + {offset_of(&CompactionServiceInput::has_begin), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"begin", + {offset_of(&CompactionServiceInput::begin), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_end", + {offset_of(&CompactionServiceInput::has_end), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"end", + {offset_of(&CompactionServiceInput::end), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"approx_size", + {offset_of(&CompactionServiceInput::approx_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + cs_output_file_type_info = { + {"file_name", + {offsetof(struct CompactionServiceOutputFile, file_name), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_seqno", + {offsetof(struct CompactionServiceOutputFile, smallest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_seqno", + {offsetof(struct CompactionServiceOutputFile, largest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_internal_key", + {offsetof(struct CompactionServiceOutputFile, smallest_internal_key), + 
OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_internal_key", + {offsetof(struct CompactionServiceOutputFile, largest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"oldest_ancester_time", + {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_creation_time", + {offsetof(struct CompactionServiceOutputFile, file_creation_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_hash", + {offsetof(struct CompactionServiceOutputFile, paranoid_hash), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"marked_for_compaction", + {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + compaction_job_stats_type_info = { + {"elapsed_micros", + {offsetof(struct CompactionJobStats, elapsed_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cpu_micros", + {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_input_records", + {offsetof(struct CompactionJobStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_blobs_read", + {offsetof(struct CompactionJobStats, num_blobs_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files", + {offsetof(struct CompactionJobStats, num_input_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_at_output_level", + {offsetof(struct CompactionJobStats, num_input_files_at_output_level), + 
OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionJobStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct CompactionJobStats, num_output_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct CompactionJobStats, num_output_files_blob), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_full_compaction", + {offsetof(struct CompactionJobStats, is_full_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_manual_compaction", + {offsetof(struct CompactionJobStats, is_manual_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_bytes", + {offsetof(struct CompactionJobStats, total_input_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_blob_bytes_read", + {offsetof(struct CompactionJobStats, total_blob_bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes", + {offsetof(struct CompactionJobStats, total_output_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes_blob", + {offsetof(struct CompactionJobStats, total_output_bytes_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_records_replaced", + {offsetof(struct CompactionJobStats, num_records_replaced), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_key_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"total_input_raw_value_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_deletion_records", + {offsetof(struct CompactionJobStats, num_input_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_expired_deletion_records", + {offsetof(struct CompactionJobStats, num_expired_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_corrupt_keys", + {offsetof(struct CompactionJobStats, num_corrupt_keys), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_write_nanos", + {offsetof(struct CompactionJobStats, file_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_range_sync_nanos", + {offsetof(struct CompactionJobStats, file_range_sync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_fsync_nanos", + {offsetof(struct CompactionJobStats, file_fsync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_prepare_write_nanos", + {offsetof(struct CompactionJobStats, file_prepare_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_output_key_prefix", + {offsetof(struct CompactionJobStats, smallest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_output_key_prefix", + {offsetof(struct CompactionJobStats, largest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_fallthru", + {offsetof(struct CompactionJobStats, num_single_del_fallthru), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"num_single_del_mismatch", + {offsetof(struct CompactionJobStats, num_single_del_mismatch), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +namespace { +// this is a helper struct to serialize and deserialize class Status, because +// Status's members are not public. +struct StatusSerializationAdapter { + uint8_t code; + uint8_t subcode; + uint8_t severity; + std::string message; + + StatusSerializationAdapter() {} + explicit StatusSerializationAdapter(const Status& s) { + code = s.code(); + subcode = s.subcode(); + severity = s.severity(); + auto msg = s.getState(); + message = msg ? msg : ""; + } + + Status GetStatus() { + return Status(static_cast(code), + static_cast(subcode), + static_cast(severity), message); + } +}; +} // namespace + +static std::unordered_map + status_adapter_type_info = { + {"code", + {offsetof(struct StatusSerializationAdapter, code), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"subcode", + {offsetof(struct StatusSerializationAdapter, subcode), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"severity", + {offsetof(struct StatusSerializationAdapter, severity), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"message", + {offsetof(struct StatusSerializationAdapter, message), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map cs_result_type_info = { + {"status", + {offsetof(struct CompactionServiceResult, status), + OptionType::kCustomizable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter; + Status s = OptionTypeInfo::ParseType( + opts, value, status_adapter_type_info, &adapter); + *status_obj = adapter.GetStatus(); + 
return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter(*status_obj); + std::string result; + Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info, + &adapter, &result); + *value = "{" + result + "}"; + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr1, const void* addr2, std::string* mismatch) { + const auto status1 = static_cast(addr1); + const auto status2 = static_cast(addr2); + StatusSerializationAdapter adatper1(*status1); + StatusSerializationAdapter adapter2(*status2); + return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info, + &adatper1, &adapter2, mismatch); + }}}, + {"output_files", + OptionTypeInfo::Vector( + offsetof(struct CompactionServiceResult, output_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone))}, + {"output_level", + {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"output_path", + {offsetof(struct CompactionServiceResult, output_path), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionServiceResult, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_bytes", + {offsetof(struct CompactionServiceResult, total_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read", + {offsetof(struct CompactionServiceResult, bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_written", + {offsetof(struct CompactionServiceResult, bytes_written), 
+ OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"stats", OptionTypeInfo::Struct( + "stats", &compaction_job_stats_type_info, + offsetof(struct CompactionServiceResult, stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +}; + +Status CompactionServiceInput::Read(const std::string& data_str, + CompactionServiceInput* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceInput string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Input data version not supported: " + + ToString(format_version)); + } +} + +Status CompactionServiceInput::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output); +} + +Status CompactionServiceResult::Read(const std::string& data_str, + CompactionServiceResult* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceResult string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Result data version not supported: " + + ToString(format_version)); + } +} + +Status 
CompactionServiceResult::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output); +} + +#ifndef NDEBUG +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other, + mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other, + mismatch); +} +#endif // NDEBUG +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,9 +17,9 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" #include "db/compaction/compaction_iterator.h" -#include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" #include "db/job_context.h" @@ -50,6 +50,7 @@ class ErrorHandler; class MemTable; class SnapshotChecker; +class SystemClock; class TableCache; class 
Version; class VersionEdit; @@ -62,25 +63,29 @@ // if needed. class CompactionJob { public: - CompactionJob(int job_id, Compaction* compaction, - const ImmutableDBOptions& db_options, - const FileOptions& file_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, - CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, - const std::atomic* manual_compaction_paused = nullptr); + CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* 
blob_callback = nullptr); - ~CompactionJob(); + virtual ~CompactionJob(); // no copy/move CompactionJob(CompactionJob&& job) = delete; @@ -100,11 +105,39 @@ // Add compaction input/output to the current version Status Install(const MutableCFOptions& mutable_cf_options); - private: + // Return the IO status + IOStatus io_status() const { return io_status_; } + + protected: struct SubcompactionState; + // CompactionJob state + struct CompactionState; void AggregateStatistics(); + void UpdateCompactionStats(); + void LogCompaction(); + virtual void RecordCompactionIOStats(); + void CleanupCompaction(); + + // Call compaction filter. Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + CompactionState* compact_; + InternalStats::CompactionStats compaction_stats_; + const ImmutableDBOptions& db_options_; + const MutableDBOptions mutable_db_options_copy_; + LogBuffer* log_buffer_; + FSDirectory* output_directory_; + Statistics* stats_; + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + + Env::WriteLifeTimeHint write_hint_; + + IOStatus io_status_; + + private: // Generates a histogram representing potential divisions of key ranges from // the input. It adds the starting and/or ending keys of certain input files // to the working set and then finds the approximate size of data in between @@ -112,12 +145,12 @@ // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); + CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact); + // update the thread status for starting a compaction. void ReportStartedCompaction(Compaction* compaction); void AllocateCompactionOutputFileNumbers(); - // Call compaction filter. 
Then iterate through input and compact the - // kv-pairs - void ProcessKeyValueCompaction(SubcompactionState* sub_compact); Status FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, @@ -125,45 +158,37 @@ CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key = nullptr); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); - void RecordCompactionIOStats(); Status OpenCompactionOutputFile(SubcompactionState* sub_compact); - void CleanupCompaction(); void UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats = nullptr); - void UpdateCompactionStats(); void UpdateCompactionInputStatsHelper( int* num_files, uint64_t* bytes_read, int input_level); - void LogCompaction(); - - int job_id_; + uint32_t job_id_; - // CompactionJob state - struct CompactionState; - CompactionState* compact_; CompactionJobStats* compaction_job_stats_; - InternalStats::CompactionStats compaction_stats_; // DBImpl state const std::string& dbname_; - const ImmutableDBOptions& db_options_; + const std::string db_id_; + const std::string db_session_id_; const FileOptions file_options_; Env* env_; - FileSystem* fs_; + std::shared_ptr io_tracer_; + FileSystemPtr fs_; // env_option optimized for compaction table reads FileOptions file_options_for_read_; VersionSet* versions_; const std::atomic* shutting_down_; - const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; - LogBuffer* log_buffer_; - Directory* db_directory_; - Directory* output_directory_; - Statistics* stats_; + FSDirectory* db_directory_; + FSDirectory* blob_output_directory_; InstrumentedMutex* db_mutex_; ErrorHandler* db_error_handler_; // If there were two snapshots with seq numbers s1 
and @@ -183,16 +208,158 @@ EventLogger* event_logger_; - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; // Stores the Slices that designate the boundaries for each subcompaction std::vector boundaries_; // Stores the approx size of keys covered in the range of each subcompaction std::vector sizes_; - Env::WriteLifeTimeHint write_hint_; Env::Priority thread_pri_; + std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; + + uint64_t GetCompactionId(SubcompactionState* sub_compact); + + // Get table file name in where it's outputting to, which should also be in + // `output_directory_`. + virtual std::string GetTableFileName(uint64_t file_number); +}; + +// CompactionServiceInput is used the pass compaction information between two +// db instances. It contains the information needed to do a compaction. It +// doesn't contain the LSM tree information, which is passed though MANIFEST +// file. +struct CompactionServiceInput { + ColumnFamilyDescriptor column_family; + + DBOptions db_options; + + std::vector snapshots; + + // SST files for compaction, it should already be expended to include all the + // files needed for this compaction, for both input level files and output + // level files. 
+ std::vector input_files; + int output_level; + + // information for subcompaction + bool has_begin = false; + std::string begin; + bool has_end = false; + std::string end; + uint64_t approx_size = 0; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceInput* obj); + Status Write(std::string* output); + + // Initialize a dummy ColumnFamilyDescriptor + CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {} + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceInput* other); + bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceOutputFile is the metadata for the output SST file +struct CompactionServiceOutputFile { + std::string file_name; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; + std::string smallest_internal_key; + std::string largest_internal_key; + uint64_t oldest_ancester_time; + uint64_t file_creation_time; + uint64_t paranoid_hash; + bool marked_for_compaction; + + CompactionServiceOutputFile() = default; + CompactionServiceOutputFile( + const std::string& name, SequenceNumber smallest, SequenceNumber largest, + std::string _smallest_internal_key, std::string _largest_internal_key, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + uint64_t _paranoid_hash, bool _marked_for_compaction) + : file_name(name), + smallest_seqno(smallest), + largest_seqno(largest), + smallest_internal_key(std::move(_smallest_internal_key)), + largest_internal_key(std::move(_largest_internal_key)), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + paranoid_hash(_paranoid_hash), + marked_for_compaction(_marked_for_compaction) {} +}; + +// CompactionServiceResult contains the compaction result from a different db +// instance, with these information, the primary db instance with write +// permission is able to install the result to the DB. 
+struct CompactionServiceResult { + Status status; + std::vector output_files; + int output_level; + + // location of the output files + std::string output_path; + + // some statistics about the compaction + uint64_t num_output_records = 0; + uint64_t total_bytes = 0; + uint64_t bytes_read = 0; + uint64_t bytes_written = 0; + CompactionJobStats stats; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceResult* obj); + Status Write(std::string* output); + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceResult* other); + bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceCompactionJob is an read-only compaction job, it takes +// input information from `compaction_service_input` and put result information +// in `compaction_service_result`, the SST files are generated to `output_path`. +class CompactionServiceCompactionJob : private CompactionJob { + public: + CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, LogBuffer* log_buffer, + FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result); + + // Run the compaction in current thread and return the result + Status Run(); + + void CleanupCompaction(); + + IOStatus io_status() const { return CompactionJob::io_status(); } + + protected: + void RecordCompactionIOStats() 
override; + + private: + // Get table file name in output_path + std::string GetTableFileName(uint64_t file_number) override; + // Specific the compaction output path, otherwise it uses default DB path + const std::string output_path_; + + // Compaction job input + const CompactionServiceInput& compaction_input_; + + // Compaction job result + CompactionServiceResult* compaction_result_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -24,8 +24,6 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" -#include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" #include "port/stack_trace.h" @@ -52,6 +50,7 @@ #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/compression.h" #include "util/hash.h" #include "util/mutexlock.h" @@ -126,9 +125,7 @@ static void SetUpTestCase() {} static void TearDownTestCase() {} - DBImpl* dbfull() { - return reinterpret_cast(db_); - } + DBImpl* dbfull() { return static_cast_with_check(db_); } void CreateColumnFamilies(const std::vector& cfs, const Options& options) { @@ -271,10 +268,10 @@ if (cf == 0) { // default cfd EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + "rocksdb.num-files-at-level" + ToString(level), &property)); } else { EXPECT_TRUE(db_->GetProperty( - handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), + handles_[cf], 
"rocksdb.num-files-at-level" + ToString(level), &property)); } return atoi(property.c_str()); @@ -299,15 +296,14 @@ return result; } - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { + Status Size(uint64_t* size, const Slice& start, const Slice& limit, + int cf = 0) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void Compact(int cf, const Slice& start, const Slice& limit, @@ -460,6 +456,7 @@ ASSERT_EQ(current_stats.num_output_files, stats.num_output_files); + ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction); ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction); @@ -572,7 +569,7 @@ uint64_t num_input_records, size_t key_size, size_t value_size, size_t num_output_files, uint64_t num_output_records, double compression_ratio, uint64_t num_records_replaced, - bool is_manual = true) { + bool is_full = false, bool is_manual = true) { CompactionJobStats stats; stats.Reset(); @@ -596,6 +593,7 @@ stats.total_input_raw_value_bytes = num_input_records * value_size; + stats.is_full_compaction = is_full; stats.is_manual_compaction = is_manual; stats.num_records_replaced = num_records_replaced; @@ -797,7 +795,7 @@ } ASSERT_OK(Flush(1)); - reinterpret_cast(db_)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); stats_checker->set_verify_next_comp_io_stats(true); std::atomic first_prepare_write(true); @@ -895,7 +893,7 @@ CompactRangeOptions cr_options; cr_options.change_level = true; cr_options.target_level = 2; - db_->CompactRange(cr_options, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr)); ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); // Stage 2: Generate files including keys from the 
entire key range @@ -982,26 +980,21 @@ if (num_input_units == 0) { continue; } + // A full compaction only happens when the number of flushes equals to + // the number of compaction input runs. + bool is_full = num_flushes == num_input_units; // The following statement determines the expected smallest key - // based on whether it is a full compaction. A full compaction only - // happens when the number of flushes equals to the number of compaction - // input runs. - uint64_t smallest_key = - (num_flushes == num_input_units) ? - key_base : key_base * (num_flushes - 1); + // based on whether it is a full compaction. + uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1); - stats_checker->AddExpectedStats( - NewManualCompactionJobStats( - Key(smallest_key, 10), - Key(smallest_key + key_base * num_input_units - key_interval, 10), - num_input_units, - num_input_units > 2 ? num_input_units / 2 : 0, - num_keys_per_table * num_input_units, - kKeySize, kValueSize, - num_input_units, - num_keys_per_table * num_input_units, - 1.0, 0, false)); - dbfull()->TEST_WaitForCompact(); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + Key(smallest_key, 10), + Key(smallest_key + key_base * num_input_units - key_interval, 10), + num_input_units, num_input_units > 2 ? 
num_input_units / 2 : 0, + num_keys_per_table * num_input_units, kKeySize, kValueSize, + num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full, + false)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U); @@ -1012,7 +1005,7 @@ &rnd, start_key, start_key + key_base - 1, kKeySize, kValueSize, key_interval, compression_ratio, 1); - reinterpret_cast(db_)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); } ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ #ifndef ROCKSDB_LITE +#include "db/compaction/compaction_job.h" + #include #include #include @@ -12,15 +14,16 @@ #include #include -#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/column_family.h" -#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" @@ -67,30 +70,42 @@ } // namespace -// TODO(icanadi) Make it simpler once we mock out VersionSet -class CompactionJobTest : public testing::Test { - public: - CompactionJobTest() - : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("compaction_job_test")), +class CompactionJobTestBase : public testing::Test { + protected: + 
CompactionJobTestBase(std::string dbname, const Comparator* ucmp, + std::function encode_u64_ts) + : dbname_(std::move(dbname)), + ucmp_(ucmp), db_options_(), mutable_cf_options_(cf_options_), + mutable_db_options_(), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr)), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), - error_handler_(nullptr, db_options_, &mutex_) { + error_handler_(nullptr, db_options_, &mutex_), + encode_u64_ts_(std::move(encode_u64_ts)) { + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + env_ = base_env; + fs_ = env_->GetFileSystem(); + } + + void SetUp() override { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.env = env_; db_options_.fs = fs_; db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); + cf_options_.comparator = ucmp_; + cf_options_.table_factory = mock_table_factory_; } std::string GenerateFileName(uint64_t file_number) { @@ -101,9 +116,10 @@ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); } - static std::string KeyStr(const std::string& user_key, - const SequenceNumber seq_num, const ValueType t) { - return InternalKey(user_key, seq_num, t).Encode().ToString(); + std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num, + const ValueType t, uint64_t ts = 0) { + std::string user_key_with_ts = user_key + encode_u64_ts_(ts); + return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString(); } static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, @@ -129,7 +145,7 @@ return blob_index; } - void AddMockFile(const stl_wrappers::KVMap& 
contents, int level = 0) { + void AddMockFile(const mock::KVVector& contents, int level = 0) { assert(contents.size() > 0); bool first_key = true; @@ -143,7 +159,8 @@ std::string skey; std::string value; std::tie(skey, value) = kv; - bool parsed = ParseInternalKey(skey, &key); + const Status pik_status = + ParseInternalKey(skey, &key, true /* log_err_key */); smallest_seqno = std::min(smallest_seqno, key.sequence); largest_seqno = std::max(largest_seqno, key.sequence); @@ -161,7 +178,7 @@ first_key = false; - if (parsed && key.type == kTypeBlobIndex) { + if (pik_status.ok() && key.type == kTypeBlobIndex) { BlobIndex blob_index; const Status s = blob_index.DecodeFrom(value); if (!s.ok()) { @@ -186,13 +203,16 @@ VersionEdit edit; edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key, - smallest_seqno, largest_seqno, false, oldest_blob_file_number, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + smallest_seqno, largest_seqno, false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); mutex_.Lock(); - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_); + EXPECT_OK( + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_)); mutex_.Unlock(); } @@ -203,11 +223,11 @@ } // returns expected result after compaction - stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) { - auto expected_results = mock::MakeMockFile(); - const int kKeysPerFile = 10000; - const int kCorruptKeysPerFile = 200; - const int kMatchingKeys = kKeysPerFile / 2; + mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) { + stl_wrappers::KVMap expected_results; + constexpr int kKeysPerFile = 10000; + constexpr int kCorruptKeysPerFile = 200; + constexpr int 
kMatchingKeys = kKeysPerFile / 2; SequenceNumber sequence_number = 0; auto corrupt_id = [&](int id) { @@ -230,49 +250,51 @@ test::CorruptKeyType(&internal_key); test::CorruptKeyType(&bottommost_internal_key); } - contents.insert({ internal_key.Encode().ToString(), value }); + contents.push_back({internal_key.Encode().ToString(), value}); if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) { expected_results.insert( - { bottommost_internal_key.Encode().ToString(), value }); + {bottommost_internal_key.Encode().ToString(), value}); } } + mock::SortKVVector(&contents, ucmp_); AddMockFile(contents); } SetLastSequence(sequence_number); - return expected_results; + mock::KVVector expected_results_kvvector; + for (auto& kv : expected_results) { + expected_results_kvvector.push_back({kv.first, kv.second}); + } + + return expected_results_kvvector; } void NewDB() { - DestroyDB(dbname_, Options()); + EXPECT_OK(DestroyDB(dbname_, Options())); EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - /*block_cache_tracer=*/nullptr)); + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); compaction_job_stats_.Reset(); - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); - } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + 
std::unique_ptr file_writer; + const auto& fs = env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); + ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; @@ -281,21 +303,22 @@ } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1, nullptr); + s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + + ASSERT_OK(s); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; cf_options_.merge_operator = merge_op_; cf_options_.compaction_filter = compaction_filter_.get(); + std::vector column_families; column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); - EXPECT_OK(versions_->Recover(column_families, false)); + ASSERT_OK(versions_->Recover(column_families, false)); cfd_ = versions_->GetColumnFamilySet()->GetDefault(); } void RunCompaction( const std::vector>& input_files, - const stl_wrappers::KVMap& expected_results, + const mock::KVVector& expected_results, const std::vector& snapshots = {}, SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, int output_level = 1, bool verify = true, @@ -314,11 +337,12 @@ num_input_files += level_files.size(); } - Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), - *cfd->GetLatestMutableCFOptions(), - compaction_input_files, output_level, 1024 * 1024, - 10 * 1024 * 1024, 0, kNoCompression, - cfd->ioptions()->compression_opts, 0, {}, true); + Compaction compaction( + cfd->current()->storage_info(), *cfd->ioptions(), + *cfd->GetLatestMutableCFOptions(), mutable_db_options_, + compaction_input_files, output_level, 1024 * 1024, 10 * 1024 * 1024, 0, + kNoCompression, cfd->GetLatestMutableCFOptions()->compression_opts, + Temperature::kUnknown, 0, 
{}, true); compaction.SetInputVersion(cfd->current()); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); @@ -326,22 +350,28 @@ EventLogger event_logger(db_options_.info_log.get()); // TODO(yiwu) add a mock snapshot checker and add test for it. SnapshotChecker* snapshot_checker = nullptr; + ASSERT_TRUE(full_history_ts_low_.empty() || + ucmp_->timestamp_size() == full_history_ts_low_.size()); CompactionJob compaction_job( - 0, &compaction, db_options_, env_options_, versions_.get(), - &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr, - nullptr, nullptr, &mutex_, &error_handler_, snapshots, + 0, &compaction, db_options_, mutable_db_options_, env_options_, + versions_.get(), &shutting_down_, preserve_deletes_seqnum_, &log_buffer, + nullptr, nullptr, nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER); + Env::Priority::USER, nullptr /* IOTracer */, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); mutex_.Unlock(); - Status s; - s = compaction_job.Run(); + Status s = compaction_job.Run(); ASSERT_OK(s); + ASSERT_OK(compaction_job.io_status()); mutex_.Lock(); ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); + ASSERT_OK(compaction_job.io_status()); mutex_.Unlock(); if (verify) { @@ -363,13 +393,16 @@ } } + std::shared_ptr env_guard_; Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; MutableCFOptions mutable_cf_options_; + MutableDBOptions mutable_db_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager 
write_buffer_manager_; @@ -383,6 +416,17 @@ std::unique_ptr compaction_filter_; std::shared_ptr merge_op_; ErrorHandler error_handler_; + std::string full_history_ts_low_; + const std::function encode_u64_ts_; +}; + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest : public CompactionJobTestBase { + public: + CompactionJobTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_test"), + BytewiseComparator(), + [](uint64_t /*ts*/) { return ""; }) {} }; TEST_F(CompactionJobTest, Simple) { @@ -395,7 +439,7 @@ RunCompaction({ files }, expected_results); } -TEST_F(CompactionJobTest, SimpleCorrupted) { +TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) { NewDB(); auto expected_results = CreateTwoFiles(true); @@ -636,7 +680,7 @@ SetLastSequence(11U); auto files = cfd_->current()->storage_info()->LevelFiles(0); - stl_wrappers::KVMap empty_map; + mock::KVVector empty_map; RunCompaction({files}, empty_map); } @@ -989,7 +1033,7 @@ // single deletion and the (single) deletion gets removed while the corrupt key // gets written out. TODO(noetzli): We probably want a better way to treat // corrupt keys. 
-TEST_F(CompactionJobTest, CorruptionAfterDeletion) { +TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) { NewDB(); auto file1 = @@ -1063,10 +1107,312 @@ /* expected_oldest_blob_file_number */ 19); } +TEST_F(CompactionJobTest, InputSerialization) { + // Setup a random CompactionServiceInput + CompactionServiceInput input; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + input.column_family.options.comparator = ReverseBytewiseComparator(); + input.column_family.options.max_bytes_for_level_base = + rnd64.Uniform(UINT64_MAX); + input.column_family.options.disable_auto_compactions = rnd.OneIn(2); + input.column_family.options.compression = kZSTD; + input.column_family.options.compression_opts.level = 4; + input.db_options.max_background_flushes = 10; + input.db_options.paranoid_checks = rnd.OneIn(2); + input.db_options.statistics = CreateDBStatistics(); + input.db_options.env = env_; + while (!rnd.OneIn(10)) { + input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); + } + while (!rnd.OneIn(10)) { + input.input_files.emplace_back(rnd.RandomString( + rnd.Uniform(kStrMaxLen - 1) + + 1)); // input file name should have at least one character + } + input.output_level = 4; + input.has_begin = rnd.OneIn(2); + if (input.has_begin) { + input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.has_end = rnd.OneIn(2); + if (input.has_end) { + input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.approx_size = rnd64.Uniform(UINT64_MAX); + + std::string output; + ASSERT_OK(input.Write(&output)); + + // Test deserialization + CompactionServiceInput deserialized1; + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&input)); + + // Test mismatch + deserialized1.db_options.max_background_flushes += 10; + std::string mismatch; + 
ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "db_options.max_background_flushes"); + + // Test unknown field + CompactionServiceInput deserialized2; + output.clear(); + ASSERT_OK(input.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&input)); + + // Test missing field + CompactionServiceInput deserialized3; + deserialized3.output_level = 0; + std::string to_remove = "output_level=4;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "output_level"); + + // manually set the value back, should match the original structure + deserialized3.output_level = 4; + ASSERT_TRUE(deserialized3.TEST_Equals(&input)); + + // Test invalid version + output.clear(); + ASSERT_OK(input.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceInput::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_F(CompactionJobTest, ResultSerialization) { + // Setup a random CompactionServiceResult + CompactionServiceResult result; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + std::vector status_list = { + Status::OK(), + Status::InvalidArgument("invalid option"), + Status::Aborted("failed to run"), + Status::NotSupported("not supported option"), + }; + result.status = + 
status_list.at(rnd.Uniform(static_cast(status_list.size()))); + while (!rnd.OneIn(10)) { + result.output_files.emplace_back( + rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), rnd.OneIn(2)); + } + result.output_level = rnd.Uniform(10); + result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + result.num_output_records = rnd64.Uniform(UINT64_MAX); + result.total_bytes = rnd64.Uniform(UINT64_MAX); + result.bytes_read = 123; + result.bytes_written = rnd64.Uniform(UINT64_MAX); + result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX); + result.stats.num_output_files = rnd.Uniform(1000); + result.stats.is_full_compaction = rnd.OneIn(2); + result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX); + result.stats.num_input_files = 9; + + std::string output; + ASSERT_OK(result.Write(&output)); + + // Test deserialization + CompactionServiceResult deserialized1; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&result)); + + // Test mismatch + deserialized1.stats.num_input_files += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "stats.num_input_files"); + + // Test unknown field + CompactionServiceResult deserialized2; + output.clear(); + ASSERT_OK(result.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&result)); + + // Test missing field + CompactionServiceResult deserialized3; + deserialized3.bytes_read = 0; + std::string to_remove = "bytes_read=123;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + 
ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "bytes_read"); + + deserialized3.bytes_read = 123; + ASSERT_TRUE(deserialized3.TEST_Equals(&result)); + + // Test invalid version + output.clear(); + ASSERT_OK(result.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceResult::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); + for (const auto& item : status_list) { + item.PermitUncheckedError(); + } +} + +class CompactionJobTimestampTest : public CompactionJobTestBase { + public: + CompactionJobTimestampTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"), + test::ComparatorWithU64Ts(), test::EncodeInt) {} +}; + +TEST_F(CompactionJobTimestampTest, GCDisabled) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}}); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + AddMockFile(file2); + + SetLastSequence(10); + + auto expected_results = mock::MakeMockFile( + {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, 
ValueType::kTypeValue, 98), "b8"}, + {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, NoKeyExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + AddMockFile(file2); + + SetLastSequence(101); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, AllKeysExpired) { + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""}, + {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""}, + {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"}, + {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"}, + {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}}); + 
AddMockFile(file2); + + SetLastSequence(7); + + auto expected_results = + mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(std::numeric_limits::max()); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, SomeKeysExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"}, + {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"}, + {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(49); + RunCompaction({files}, expected_results); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,9 +15,11 @@ #include #include #include + #include "db/column_family.h" #include "file/filename.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/statistics.h" #include 
"test_util/sync_point.h" #include "util/random.h" @@ -110,9 +112,9 @@ // If bottommost_compression is set and we are compacting to the // bottommost level then we should use it. - if (ioptions.bottommost_compression != kDisableCompressionOption && + if (mutable_cf_options.bottommost_compression != kDisableCompressionOption && level >= (vstorage->num_non_empty_levels() - 1)) { - return ioptions.bottommost_compression; + return mutable_cf_options.bottommost_compression; } // If the user has specified a different compression level for each level, // then pick the compression for that level. @@ -132,25 +134,23 @@ } } -CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions, +CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options, const VersionStorageInfo* vstorage, int level, const bool enable_compression) { if (!enable_compression) { - return ioptions.compression_opts; + return cf_options.compression_opts; } - // If bottommost_compression is set and we are compacting to the - // bottommost level then we should use the specified compression options - // for the bottmomost_compression. - if (ioptions.bottommost_compression != kDisableCompressionOption && - level >= (vstorage->num_non_empty_levels() - 1) && - ioptions.bottommost_compression_opts.enabled) { - return ioptions.bottommost_compression_opts; + // If bottommost_compression_opts is enabled and we are compacting to the + // bottommost level then we should use the specified compression options. 
+ if (level >= (vstorage->num_non_empty_levels() - 1) && + cf_options.bottommost_compression_opts.enabled) { + return cf_options.bottommost_compression_opts; } - return ioptions.compression_opts; + return cf_options.compression_opts; } -CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, +CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : ioptions_(ioptions), icmp_(icmp) {} @@ -332,7 +332,7 @@ const CompactionOptions& compact_options, const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, - uint32_t output_path_id) { + const MutableDBOptions& mutable_db_options, uint32_t output_path_id) { assert(input_files.size()); // This compaction output should not overlap with a running compaction as // `SanitizeCompactionInputFiles` should've checked earlier and db mutex @@ -356,11 +356,11 @@ compression_type = compact_options.compression; } auto c = new Compaction( - vstorage, ioptions_, mutable_cf_options, input_files, output_level, - compact_options.output_file_size_limit, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files, + output_level, compact_options.output_file_size_limit, mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, - GetCompressionOptions(ioptions_, vstorage, output_level), - compact_options.max_subcompactions, + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_options.max_subcompactions, /* grandparents */ {}, true); RegisterCompaction(c); return c; @@ -532,7 +532,7 @@ } } if (expand_inputs) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", @@ -554,16 +554,21 @@ InternalKey start, limit; GetRange(inputs, output_level_inputs, 
&start, &limit); // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (output_level_inputs.level + 1 < NumberLevels()) { - vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start, - &limit, grandparents); + // (parent == level+1; grandparent == level+2 or the first + // level after that has overlapping files) + for (int level = output_level_inputs.level + 1; level < NumberLevels(); + level++) { + vstorage->GetOverlappingInputs(level, &start, &limit, grandparents); + if (!grandparents->empty()) { + break; + } } } Compaction* CompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, uint64_t max_file_num_to_ignore) { @@ -626,18 +631,20 @@ } Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), - output_level, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options, output_level, ioptions_.compaction_style), /* max_compaction_bytes */ LLONG_MAX, compact_range_options.target_path_id, GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, 1), - GetCompressionOptions(ioptions_, vstorage, output_level), - compact_range_options.max_subcompactions, /* grandparents */ {}, + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + /* grandparents */ {}, /* is manual */ true); RegisterCompaction(c); + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); return c; } @@ -670,17 +677,41 @@ // two 
files overlap. if (input_level > 0) { const uint64_t limit = mutable_cf_options.max_compaction_bytes; - uint64_t total = 0; + uint64_t input_level_total = 0; + int hint_index = -1; + InternalKey* smallest = nullptr; + InternalKey* largest = nullptr; for (size_t i = 0; i + 1 < inputs.size(); ++i) { + if (!smallest) { + smallest = &inputs[i]->smallest; + } + largest = &inputs[i]->largest; + uint64_t s = inputs[i]->compensated_file_size; - total += s; - if (total >= limit) { + uint64_t output_level_total = 0; + if (output_level < vstorage->num_non_empty_levels()) { + std::vector files; + vstorage->GetOverlappingInputsRangeBinarySearch( + output_level, smallest, largest, &files, hint_index, &hint_index); + for (const auto& file : files) { + output_level_total += file->compensated_file_size; + } + } + + input_level_total += s; + + if (input_level_total + output_level_total >= limit) { covering_the_whole_range = false; + // still include the current file, so the compaction could be larger + // than max_compaction_bytes, which is also to make sure the compaction + // can make progress even `max_compaction_bytes` is small (e.g. smaller + // than an SST file). 
inputs.files.resize(i + 1); break; } } } + assert(compact_range_options.target_path_id < static_cast(ioptions_.cf_paths.size())); @@ -778,8 +809,8 @@ std::vector grandparents; GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents); Compaction* compaction = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs), - output_level, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(compaction_inputs), output_level, MaxFileSizeForLevel(mutable_cf_options, output_level, ioptions_.compaction_style, vstorage->base_level(), ioptions_.level_compaction_dynamic_level_bytes), @@ -787,8 +818,9 @@ compact_range_options.target_path_id, GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, vstorage->base_level()), - GetCompressionOptions(ioptions_, vstorage, output_level), - compact_range_options.max_subcompactions, std::move(grandparents), + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + std::move(grandparents), /* is manual compaction */ true); TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction); @@ -1004,6 +1036,7 @@ // any currently-existing files. 
for (auto file_num : *input_files) { bool found = false; + int input_file_level = -1; for (const auto& level_meta : cf_meta.levels) { for (const auto& file_meta : level_meta.files) { if (file_num == TableFileNameToNumber(file_meta.name)) { @@ -1013,6 +1046,7 @@ " is already being compacted."); } found = true; + input_file_level = level_meta.level; break; } } @@ -1025,6 +1059,13 @@ "Specified compaction input file " + MakeTableFileName("", file_num) + " does not exist in column family " + cf_meta.name + "."); } + if (input_file_level > output_level) { + return Status::InvalidArgument( + "Cannot compact file to up level, input file: " + + MakeTableFileName("", file_num) + " level " + + ToString(input_file_level) + " > output level " + + ToString(output_level)); + } } return Status::OK(); @@ -1043,6 +1084,8 @@ level0_compactions_in_progress_.insert(c); } compactions_in_progress_.insert(c); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered", + c); } void CompactionPicker::UnregisterCompaction(Compaction* c) { @@ -1085,6 +1128,8 @@ Random64 rnd(/* seed */ reinterpret_cast(vstorage)); size_t random_file_index = static_cast(rnd.Uniform( static_cast(vstorage->FilesMarkedForCompaction().size()))); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction", + &random_file_index); if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) { // found the compaction! diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h 2025-05-19 16:14:27.000000000 +0000 @@ -46,7 +46,7 @@ // compaction style specific logic for them. 
class CompactionPicker { public: - CompactionPicker(const ImmutableCFOptions& ioptions, + CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp); virtual ~CompactionPicker(); @@ -56,7 +56,8 @@ // describes the compaction. Caller should delete the result. virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; // Return a compaction object for compacting the range [begin,end] in @@ -72,7 +73,8 @@ // *compaction_end should point to valid InternalKey! virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, @@ -113,6 +115,7 @@ const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, uint32_t output_path_id); // Converts a set of compaction input file numbers into @@ -215,7 +218,7 @@ } protected: - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; // A helper function to SanitizeCompactionInputFiles() that // sanitizes "input_files" by adding necessary files. @@ -241,7 +244,7 @@ // compaction. 
class NullCompactionPicker : public CompactionPicker { public: - NullCompactionPicker(const ImmutableCFOptions& ioptions, + NullCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual ~NullCompactionPicker() {} @@ -250,6 +253,7 @@ Compaction* PickCompaction( const std::string& /*cf_name*/, const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, SequenceNumber /* earliest_memtable_seqno */) override { return nullptr; @@ -258,6 +262,7 @@ // Always return "nullptr" Compaction* CompactRange(const std::string& /*cf_name*/, const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, VersionStorageInfo* /*vstorage*/, int /*input_level*/, int /*output_level*/, const CompactRangeOptions& /*compact_range_options*/, @@ -305,9 +310,9 @@ int level, int base_level, const bool enable_compression = true); -CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions, - const VersionStorageInfo* vstorage, - int level, - const bool enable_compression = true); +CompressionOptions GetCompressionOptions( + const MutableCFOptions& mutable_cf_options, + const VersionStorageInfo* vstorage, int level, + const bool enable_compression = true); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,8 +13,10 @@ #include #include #include + #include "db/column_family.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "util/string_util.h" namespace 
ROCKSDB_NAMESPACE { @@ -36,7 +38,8 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { assert(mutable_cf_options.ttl > 0); const int kLevel0 = 0; @@ -44,7 +47,7 @@ uint64_t total_size = GetTotalFilesSize(level_files); int64_t _current_time; - auto status = ioptions_.env->GetCurrentTime(&_current_time); + auto status = ioptions_.clock->GetCurrentTime(&_current_time); if (!status.ok()) { ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: Couldn't get current time: %s. " @@ -70,18 +73,18 @@ // avoid underflow if (current_time > mutable_cf_options.ttl) { for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { - auto f = *ritr; - if (f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = + FileMetaData* f = *ritr; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + uint64_t creation_time = f->fd.table_reader->GetTableProperties()->creation_time; if (creation_time == 0 || creation_time >= (current_time - mutable_cf_options.ttl)) { break; } - total_size -= f->compensated_file_size; - inputs[0].files.push_back(f); } + total_size -= f->compensated_file_size; + inputs[0].files.push_back(f); } } @@ -96,24 +99,31 @@ } for (const auto& f : inputs[0].files) { + uint64_t creation_time = 0; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + creation_time = f->fd.table_reader->GetTableProperties()->creation_time; + } ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with creation time %" PRIu64 " for deletion", - cf_name.c_str(), f->fd.GetNumber(), - f->fd.table_reader->GetTableProperties()->creation_time); + cf_name.c_str(), f->fd.GetNumber(), creation_time); } Compaction* c 
= new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, - kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0, - {}, /* is manual */ false, vstorage->CompactionScore(0), + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0, 0, 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + vstorage->CompactionScore(0), /* is deletion compaction */ true, CompactionReason::kFIFOTtl); return c; } Compaction* FIFOCompactionPicker::PickSizeCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { const int kLevel0 = 0; const std::vector& level_files = vstorage->LevelFiles(kLevel0); uint64_t total_size = GetTotalFilesSize(level_files); @@ -142,11 +152,12 @@ max_compact_bytes_per_del_file, mutable_cf_options.max_compaction_bytes, &comp_inputs)) { Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, - 16 * 1024 * 1024 /* output file size limit */, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, mutable_cf_options.compression, - ioptions_.compression_opts, 0 /* max_subcompactions */, {}, + mutable_cf_options.compression_opts, Temperature::kUnknown, + 0 /* max_subcompactions */, {}, /* is manual */ false, vstorage->CompactionScore(0), /* is deletion compaction */ false, CompactionReason::kFIFOReduceNumFiles); @@ -193,25 +204,139 @@ } Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, - kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0, - {}, /* is manual */ false, 
vstorage->CompactionScore(0), + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0, 0, 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + vstorage->CompactionScore(0), /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize); return c; } +Compaction* FIFOCompactionPicker::PickCompactionToWarm( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) { + return nullptr; + } + + const int kLevel0 = 0; + const std::vector& level_files = vstorage->LevelFiles(kLevel0); + + int64_t _current_time; + auto status = ioptions_.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: Couldn't get current time: %s. " + "Not doing compactions based on warm threshold. ", + cf_name.c_str(), status.ToString().c_str()); + return nullptr; + } + const uint64_t current_time = static_cast(_current_time); + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. Parallel " + "compactions are not supported", + cf_name.c_str()); + return nullptr; + } + + std::vector inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + // avoid underflow + if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) { + uint64_t create_time_threshold = + current_time - mutable_cf_options.compaction_options_fifo.age_for_warm; + uint64_t compaction_size = 0; + // We will ideally identify a file qualifying for warm tier by knowing + // the timestamp for the youngest entry in the file. However, right now + // we don't have the information. 
We infer it by looking at timestamp + // of the next file's (which is just younger) oldest entry's timestamp. + FileMetaData* prev_file = nullptr; + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + FileMetaData* f = *ritr; + assert(f); + if (f->being_compacted) { + // Right now this probably won't happen as we never try to schedule + // two compactions in parallel, so here we just simply don't schedule + // anything. + return nullptr; + } + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time == kUnknownOldestAncesterTime) { + // Older files might not have enough information. It is possible to + // handle these files by looking at newer files, but maintaining the + // logic isn't worth it. + break; + } + if (oldest_ancester_time > create_time_threshold) { + // The previous file (which has slightly older data) doesn't qualify + // for warm tier. + break; + } + if (prev_file != nullptr) { + compaction_size += prev_file->fd.GetFileSize(); + if (compaction_size > mutable_cf_options.max_compaction_bytes) { + break; + } + inputs[0].files.push_back(prev_file); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with next file's oldest time %" PRIu64 " for warm", + cf_name.c_str(), prev_file->fd.GetNumber(), + oldest_ancester_time); + } + if (f->temperature == Temperature::kUnknown || + f->temperature == Temperature::kHot) { + prev_file = f; + } else if (!inputs[0].files.empty()) { + // A warm file newer than files picked. 
+ break; + } else { + assert(prev_file == nullptr); + } + } + } + + if (inputs[0].files.empty()) { + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, + mutable_cf_options.compression, mutable_cf_options.compression_opts, + Temperature::kWarm, + /* max_subcompactions */ 0, {}, /* is manual */ false, + vstorage->CompactionScore(0), + /* is deletion compaction */ false, CompactionReason::kChangeTemperature); + return c; +} + Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, - SequenceNumber /*earliest_memtable_seqno*/) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) { assert(vstorage->num_levels() == 1); Compaction* c = nullptr; if (mutable_cf_options.ttl > 0) { - c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); } if (c == nullptr) { - c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); } RegisterCompaction(c); return c; @@ -219,7 +344,8 @@ Compaction* FIFOCompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& /*compact_range_options*/, 
const InternalKey* /*begin*/, const InternalKey* /*end*/, InternalKey** compaction_end, bool* /*manual_conflict*/, @@ -231,9 +357,9 @@ assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); - Compaction* c = - PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); + Compaction* c = PickCompaction(cf_name, mutable_cf_options, + mutable_db_options, vstorage, &log_buffer); log_buffer.FlushBufferToLog(); return c; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,18 +15,20 @@ namespace ROCKSDB_NAMESPACE { class FIFOCompactionPicker : public CompactionPicker { public: - FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + FIFOCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* version, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, 
const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, @@ -41,13 +43,21 @@ private: Compaction* PickTTLCompaction(const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, LogBuffer* log_buffer); Compaction* PickSizeCompaction(const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, LogBuffer* log_buffer); + + Compaction* PickCompactionToWarm(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); }; } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc 2025-05-19 16:14:27.000000000 +0000 @@ -31,6 +31,9 @@ if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } + if (!vstorage->FilesMarkedForForcedBlobGC().empty()) { + return true; + } for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { if (vstorage->CompactionScore(i) >= 1) { return true; @@ -49,14 +52,16 @@ CompactionPicker* compaction_picker, LogBuffer* log_buffer, const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions) + const ImmutableOptions& ioptions, + const MutableDBOptions& mutable_db_options) : cf_name_(cf_name), vstorage_(vstorage), earliest_mem_seqno_(earliest_mem_seqno), compaction_picker_(compaction_picker), log_buffer_(log_buffer), mutable_cf_options_(mutable_cf_options), - ioptions_(ioptions) {} + ioptions_(ioptions), + mutable_db_options_(mutable_db_options) {} // 
Pick and return a compaction. Compaction* PickCompaction(); @@ -93,9 +98,13 @@ // otherwise, returns false. bool PickIntraL0Compaction(); - void PickExpiredTtlFiles(); - - void PickFilesMarkedForPeriodicCompaction(); + // Picks a file from level_files to compact. + // level_files is a vector of (level, file metadata) in ascending order of + // level. If compact_to_next_level is true, compact the file to the next + // level, otherwise, compact to the same level as the input file. + void PickFileToCompact( + const autovector>& level_files, + bool compact_to_next_level); const std::string& cf_name_; VersionStorageInfo* vstorage_; @@ -115,7 +124,8 @@ CompactionReason compaction_reason_ = CompactionReason::kUnknown; const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; + const MutableDBOptions& mutable_db_options_; // Pick a path ID to place a newly generated file, with its level static uint32_t GetPathId(const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, @@ -124,72 +134,34 @@ static const int kMinFilesForIntraL0Compaction = 4; }; -void LevelCompactionBuilder::PickExpiredTtlFiles() { - if (vstorage_->ExpiredTtlFiles().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { +void LevelCompactionBuilder::PickFileToCompact( + const autovector>& level_files, + bool compact_to_next_level) { + for (auto& level_file : level_files) { // If it's being compacted it has nothing to do here. // If this assert() fails that means that some function marked some // files as being_compacted, but didn't call ComputeCompactionScore() assert(!level_file.second->being_compacted); start_level_ = level_file.first; - output_level_ = - (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; - - if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + if ((compact_to_next_level && + start_level_ == vstorage_->num_non_empty_levels() - 1) || (start_level_ == 0 && !compaction_picker_->level0_compactions_in_progress()->empty())) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->ExpiredTtlFiles()) { - if (continuation(level_file)) { - // found the compaction! - return; + continue; } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { - if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - output_level_ = start_level_ = level_file.first; - - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - return false; + if (compact_to_next_level) { + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + } else { + output_level_ = start_level_; } - start_level_inputs_.files = {level_file.second}; start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { - if (continuation(level_file)) { - // found the compaction! 
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { return; } } - start_level_inputs_.files.clear(); } @@ -238,64 +210,53 @@ } } } + } else { + // Compaction scores are sorted in descending order, no further scores + // will be >= 1. + break; } } + if (!start_level_inputs_.empty()) { + return; + } // if we didn't find a compaction, check if there are any files marked for // compaction - if (start_level_inputs_.empty()) { - parent_index_ = base_index_ = -1; + parent_index_ = base_index_ = -1; - compaction_picker_->PickFilesMarkedForCompaction( - cf_name_, vstorage_, &start_level_, &output_level_, - &start_level_inputs_); - if (!start_level_inputs_.empty()) { - is_manual_ = true; - compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; - return; - } + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; } // Bottommost Files Compaction on deleting tombstones - if (start_level_inputs_.empty()) { - size_t i; - for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); - ++i) { - auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; - assert(!level_and_file.second->being_compacted); - start_level_inputs_.level = output_level_ = start_level_ = - level_and_file.first; - start_level_inputs_.files = {level_and_file.second}; - if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_)) { - break; - } - } - if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { - start_level_inputs_.clear(); - } else { - assert(!start_level_inputs_.empty()); - compaction_reason_ = CompactionReason::kBottommostFiles; - return; - } + PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = 
CompactionReason::kBottommostFiles; + return; } // TTL Compaction - if (start_level_inputs_.empty()) { - PickExpiredTtlFiles(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kTtl; - return; - } + PickFileToCompact(vstorage_->ExpiredTtlFiles(), true); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; } // Periodic Compaction - if (start_level_inputs_.empty()) { - PickFilesMarkedForPeriodicCompaction(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kPeriodicCompaction; - return; - } + PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + + // Forced blob garbage collection + PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kForcedBlobGC; + return; } } @@ -375,8 +336,8 @@ Compaction* LevelCompactionBuilder::GetCompaction() { auto c = new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), - output_level_, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(compaction_inputs_), output_level_, MaxFileSizeForLevel(mutable_cf_options_, output_level_, ioptions_.compaction_style, vstorage_->base_level(), ioptions_.level_compaction_dynamic_level_bytes), @@ -384,7 +345,8 @@ GetPathId(ioptions_, mutable_cf_options_, output_level_), GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, output_level_, vstorage_->base_level()), - GetCompressionOptions(ioptions_, vstorage_, output_level_), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_), + Temperature::kUnknown, /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, start_level_score_, false /* deletion_compaction */, compaction_reason_); @@ -433,7 +395,7 @@ if 
(ioptions.level_compaction_dynamic_level_bytes) { // Currently, level_compaction_dynamic_level_bytes is ignored when // multiple db paths are specified. https://github.com/facebook/ - // rocksdb/blob/master/db/column_family.cc. + // rocksdb/blob/main/db/column_family.cc. // Still, adding this check to avoid accidentally using // max_bytes_for_level_multiplier_additional level_size = static_cast( @@ -549,10 +511,11 @@ Compaction* LevelCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, - SequenceNumber earliest_mem_seqno) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) { LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, - log_buffer, mutable_cf_options, ioptions_); + log_buffer, mutable_cf_options, ioptions_, + mutable_db_options); return builder.PickCompaction(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,12 +17,13 @@ // for description of Leveled compaction. 
class LevelCompactionPicker : public CompactionPicker { public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, + LevelCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual bool NeedsCompaction( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,16 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
- #include #include #include + #include "db/compaction/compaction.h" #include "db/compaction/compaction_picker_fifo.h" #include "db/compaction/compaction_picker_level.h" #include "db/compaction/compaction_picker_universal.h" - -#include "logging/logging.h" +#include "db/compaction/file_pri.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -31,8 +30,9 @@ const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; + MutableDBOptions mutable_db_options_; LevelCompactionPicker level_compaction_picker; std::string cf_name_; CountingLogger logger_; @@ -52,6 +52,7 @@ icmp_(ucmp_), ioptions_(options_), mutable_cf_options_(options_), + mutable_db_options_(), level_compaction_picker(ioptions_, &icmp_), cf_name_("dummy"), log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), @@ -78,8 +79,17 @@ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); } + // Create a new VersionStorageInfo object so we can add mode files and then + // merge it with the existing VersionStorageInfo + void AddVersionStorage() { + temp_vstorage_.reset(new VersionStorageInfo( + &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style, + vstorage_.get(), false)); + } + void DeleteVersionStorage() { vstorage_.reset(); + temp_vstorage_.reset(); files_.clear(); file_map_.clear(); input_files_.clear(); @@ -88,18 +98,28 @@ void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 1, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, - size_t compensated_file_size = 0) { - assert(level < vstorage_->num_levels()); + size_t compensated_file_size = 0, bool marked_for_compact = false, + Temperature temperature = Temperature::kUnknown, + uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime) { + VersionStorageInfo* vstorage; + if (temp_vstorage_) { + 
vstorage = temp_vstorage_.get(); + } else { + vstorage = vstorage_.get(); + } + assert(level < vstorage->num_levels()); FileMetaData* f = new FileMetaData( file_number, path_id, file_size, InternalKey(smallest, smallest_seq, kTypeValue), InternalKey(largest, largest_seq, kTypeValue), smallest_seq, - largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; - vstorage_->AddFile(level, f); + f->oldest_ancester_time = oldest_ancestor_time; + vstorage->AddFile(level, f); files_.emplace_back(f); file_map_.insert({file_number, {f, level}}); } @@ -122,8 +142,14 @@ } void UpdateVersionStorageInfo() { + if (temp_vstorage_) { + VersionBuilder builder(FileOptions(), &ioptions_, nullptr, + vstorage_.get(), nullptr); + ASSERT_OK(builder.SaveTo(temp_vstorage_.get())); + vstorage_ = std::move(temp_vstorage_); + } vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); - vstorage_->UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_->UpdateFilesByCompactionPri(ioptions_, mutable_cf_options_); vstorage_->UpdateNumNonEmptyLevels(); vstorage_->GenerateFileIndexer(); vstorage_->GenerateLevelFilesBrief(); @@ -132,13 +158,36 @@ vstorage_->ComputeFilesMarkedForCompaction(); vstorage_->SetFinalized(); } + void AddFileToVersionStorage(int level, uint32_t file_number, + const char* smallest, const char* largest, + uint64_t file_size = 1, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100, + size_t compensated_file_size = 0, + bool marked_for_compact = false) { + VersionStorageInfo* base_vstorage = vstorage_.release(); + vstorage_.reset(new 
VersionStorageInfo(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleUniversal, + base_vstorage, false)); + Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq, + largest_seq, compensated_file_size, marked_for_compact); + + VersionBuilder builder(FileOptions(), &ioptions_, nullptr, base_vstorage, + nullptr); + builder.SaveTo(vstorage_.get()); + UpdateVersionStorageInfo(); + } + + private: + std::unique_ptr temp_vstorage_; }; TEST_F(CompactionPickerTest, Empty) { NewVersionStorage(6, kCompactionStyleLevel); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -149,7 +198,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -162,7 +212,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -175,7 +226,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); @@ -193,7 +245,8 @@ 
UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->num_input_files(1)); @@ -224,7 +277,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); @@ -271,7 +325,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -295,7 +350,8 @@ ASSERT_EQ(vstorage_->base_level(), num_levels - 2); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -320,7 +376,8 @@ ASSERT_EQ(vstorage_->base_level(), num_levels - 3); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); 
ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -349,7 +406,8 @@ ASSERT_EQ(vstorage_->base_level(), num_levels - 3); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -371,8 +429,8 @@ mutable_cf_options_.max_bytes_for_level_multiplier = 10; NewVersionStorage(num_levels, kCompactionStyleLevel); Add(0, 1U, "150", "200"); - Add(num_levels - 1, 3U, "200", "250", 300U); - Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 1, 2U, "200", "250", 300U); + Add(num_levels - 1, 3U, "300", "350", 3000U); Add(num_levels - 1, 4U, "400", "450", 3U); Add(num_levels - 2, 5U, "150", "180", 300U); Add(num_levels - 2, 6U, "181", "350", 500U); @@ -381,7 +439,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); @@ -438,7 +497,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); // output level should be the one above the bottom-most ASSERT_EQ(1, compaction->output_level()); @@ -472,7 +532,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, 
vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(!compaction->is_trivial_move()); } @@ -498,7 +559,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction->is_trivial_move()); } @@ -526,7 +588,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); @@ -556,7 +619,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_FALSE(compaction); } @@ -582,14 +646,15 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_FALSE(compaction); } TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { // The case where universal periodic compaction couldn't form - // a compaction that inlcudes any file marked for periodic compaction. + // a compaction that includes any file marked for periodic compaction. // Right now we form the compaction anyway if it is more than one // sorted run. Just put the case here to validate that it doesn't // crash. 
@@ -612,7 +677,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(!compaction || compaction->start_level() != compaction->output_level()); } @@ -632,7 +698,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction); ASSERT_EQ(0, compaction->start_level()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -656,7 +723,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->start_level()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -665,6 +733,221 @@ ASSERT_EQ(4, compaction->output_level()); } +TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 555555; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "310", "380", kFileSize, 0, 200, 251); + Add(3, 6U, "410", "880", kFileSize, 0, 200, 251); + Add(3, 7U, "910", "980", 1, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", 
"750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + // Add(4, 15U, "960", "970", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + // ASSERT_EQ(4U, compaction->num_input_files(1)); + ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber()); + ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 400000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(1, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(2, 5U, "310", "380", kFileSize, 0, 200, 251); + Add(2, 6U, "410", "880", kFileSize, 0, 200, 251); + Add(2, 7U, "910", "980", kFileSize, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", "750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + + 
UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(2, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) { + // Test bottom level files falling between gaps between two upper level + // files + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 300000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "000", "180", kFileSize, 0, 200, 251); + Add(3, 6U, "181", "190", kFileSize, 0, 200, 251); + Add(3, 7U, "710", "810", kFileSize, 0, 200, 251); + Add(3, 8U, "820", "830", kFileSize, 0, 200, 251); + Add(3, 9U, "900", "991", kFileSize, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", "750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, 
compaction->output_level()); + ASSERT_EQ(2, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(0, compaction->num_input_files(2)); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) { + // Test compaction candidates always cover many files. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 3200000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + + // Generate files like following: + // L3: (1101, 1180) (1201, 1280) ... (7901, 7908) + // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... 
(7960, 8010) + for (int i = 11; i < 79; i++) { + Add(3, 100 + i * 3, ToString(i * 100).c_str(), + ToString(i * 100 + 80).c_str(), kFileSize, 0, 200, 251); + // Add a tie breaker + if (i == 66) { + Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251); + } + + Add(4, 100 + i * 3 + 1, ToString(i * 100 + 30).c_str(), + ToString(i * 100 + 50).c_str(), kFileSize, 0, 200, 251); + Add(4, 100 + i * 3 + 2, ToString(i * 100 + 60).c_str(), + ToString(i * 100 + 110).c_str(), kFileSize, 0, 200, 251); + } + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(6U, compaction->num_input_files(0)); + ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber()); + ASSERT_EQ(11, compaction->num_input_files(1)); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) { + // Test compaction candidates always cover many files with some single + // files larger than size threshold. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 3200000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + + // Generate files like following: + // L3: (1101, 1180) (1201, 1280) ... (7901, 7908) + // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... 
(7960, 8010) + for (int i = 11; i < 70; i++) { + Add(3, 100 + i * 3, ToString(i * 100).c_str(), + ToString(i * 100 + 80).c_str(), + i % 10 == 9 ? kFileSize * 100 : kFileSize, 0, 200, 251); + + Add(4, 100 + i * 3 + 1, ToString(i * 100 + 30).c_str(), + ToString(i * 100 + 50).c_str(), kFileSize, 0, 200, 251); + Add(4, 100 + i * 3 + 2, ToString(i * 100 + 60).c_str(), + ToString(i * 100 + 110).c_str(), kFileSize, 0, 200, 251); + } + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(6U, compaction->num_input_files(0)); + ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber()); + ASSERT_EQ(13, compaction->num_input_files(1)); +} + TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { NewVersionStorage(1, kCompactionStyleFIFO); const int kFileCount = @@ -681,18 +964,255 @@ // verify whether compaction is needed based on the current // size of L0 files. 
- uint64_t current_size = 0; for (int i = 1; i <= kFileCount; ++i) { NewVersionStorage(1, kCompactionStyleFIFO); Add(0, i, ToString((i + 100) * 1000).c_str(), - ToString((i + 100) * 1000 + 999).c_str(), - kFileSize, 0, i * 100, i * 100 + 99); - current_size += kFileSize; + ToString((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100, + i * 100 + 99); UpdateVersionStorageInfo(); ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), vstorage_->CompactionScore(0) >= 1); } } + +TEST_F(CompactionPickerTest, FIFOToWarm1) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + 
ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarm2) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) { + NewVersionStorage(1, kCompactionStyleFIFO); + const 
uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 9; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kUnknown, threshold_time - 5000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t 
kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kWarm, threshold_time - 5000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + 
fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kWarm, threshold_time - 5000); + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + // Stop if a file is being compacted + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 
2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kWarm, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kWarm, threshold_time - 5000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + // Stop if a file is being compacted + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); +} + #endif // ROCKSDB_LITE TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { @@ -716,7 +1236,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Pick file 8 because it overlaps with 0 files on level 3. 
@@ -735,11 +1256,11 @@ Add(2, 6U, "150", "175", 60000000U); // Overlaps with file 26, 27, total size 521M Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size - // 520M, the smalelst overlapping + // 520M, the smallest overlapping Add(2, 8U, "201", "300", 60000000U); // Overlaps with file 28, 29, total size 521M - Add(3, 26U, "100", "110", 261000000U); + Add(3, 25U, "100", "110", 261000000U); Add(3, 26U, "150", "170", 261000000U); Add(3, 27U, "171", "179", 260000000U); Add(3, 28U, "191", "220", 260000000U); @@ -748,7 +1269,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 7 because overlapping ratio is the biggest. @@ -775,7 +1297,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 8 because overlapping ratio is the biggest. @@ -804,7 +1327,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 8 because overlapping ratio is the biggest. 
@@ -831,7 +1355,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); } // This test checks ExpandWhileOverlapping() by having overlapping user keys @@ -848,7 +1373,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -867,7 +1393,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -894,7 +1421,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -924,7 +1452,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -947,7 +1476,8 @@ 
UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -968,7 +1498,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -988,7 +1519,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_GE(1U, compaction->num_input_files(0)); @@ -1016,7 +1548,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(3U, compaction->num_input_files(0)); @@ -1048,7 +1581,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -1088,7 +1622,8 @@ UpdateVersionStorageInfo(); std::unique_ptr 
compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1126,7 +1661,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1135,6 +1671,66 @@ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); } +TEST_F(CompactionPickerTest, FileTtlBooster) { + // Set TTL to 2048 + // TTL boosting for all levels starts at 1024, + // Whole TTL range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960. + // From second last level (L5), range starts at + // 1024 + 480, 1024 + 240, 1024 + 120 (which is L3). 
+ // Boosting step 124 / 16 = 7.75 -> 7 + // + const uint64_t kCurrentTime = 1000000; + FileMetaData meta; + + { + FileTtlBooster booster(kCurrentTime, 2048, 7, 3); + + // Not triggering if the file is younger than ttl/2 + meta.oldest_ancester_time = kCurrentTime - 1023; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - 1024; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime + 10; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + + // Within one boosting step + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + + // One boosting step + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7); + ASSERT_EQ(2, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8); + ASSERT_EQ(2, booster.GetBoostScore(&meta)); + + // Multiple boosting steps + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30); + ASSERT_EQ(5, booster.GetBoostScore(&meta)); + + // Very high boosting steps + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700); + ASSERT_EQ(101, booster.GetBoostScore(&meta)); + } + { + // Test second last level + FileTtlBooster booster(kCurrentTime, 2048, 7, 5); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60); + ASSERT_EQ(3, booster.GetBoostScore(&meta)); + } + { + // Test last level + FileTtlBooster booster(kCurrentTime, 2048, 7, 6); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - 3000; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + } +} + TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { NewVersionStorage(6, kCompactionStyleLevel); 
mutable_cf_options_.level0_file_num_compaction_trigger = 2; @@ -1148,7 +1744,7 @@ Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. Add(1, 4U, "050", "300", 1000000000U, 0, 0); file_map_[4u].first->being_compacted = true; Add(1, 5U, "301", "350", 1000000000U, 0, 0); @@ -1163,7 +1759,8 @@ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -1180,7 +1777,7 @@ Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. 
Add(1, 4U, "050", "300", 1000000000U, 0, 0); Add(1, 5U, "301", "350", 1000000000U, 0, 0); @@ -1193,7 +1790,8 @@ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); } @@ -1226,7 +1824,8 @@ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0)); ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1)); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); } @@ -1255,7 +1854,7 @@ // Size ratio L4/L3 is 9.9 // After merge from L3, L4 size is 1000900 Add(4, 11U, "400", "500", 999900); - Add(5, 11U, "400", "500", 8007200); + Add(5, 12U, "400", "500", 8007200); UpdateVersionStorageInfo(); @@ -1520,7 +2119,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1544,7 +2144,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(3U, compaction->num_input_files(0)); @@ -1568,16 +2169,43 @@ Add(3, 5U, "120", "130", 7000U); Add(3, 6U, "170", "180", 
7000U); - Add(3, 5U, "220", "230", 7000U); - Add(3, 5U, "270", "280", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); } +TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) { + mutable_cf_options_.max_bytes_for_level_base = 10000u; + mutable_cf_options_.max_compaction_bytes = 10001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(1); + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 + Add(1, 1U, "100", "150", 3000U); + Add(1, 2U, "151", "200", 3001U); + Add(1, 3U, "201", "250", 3000U); + Add(1, 4U, "251", "300", 3000U); + + Add(3, 5U, "120", "130", 7000U); + Add(3, 6U, "170", "180", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + // No trivial move, because partitioning is applied + ASSERT_TRUE(!compaction->IsTrivialMove()); +} + TEST_F(CompactionPickerTest, IsTrivialMoveOff) { mutable_cf_options_.max_bytes_for_level_base = 1000000u; mutable_cf_options_.max_compaction_bytes = 10000u; @@ -1594,7 +2222,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); 
ASSERT_FALSE(compaction->IsTrivialMove()); } @@ -1619,7 +2248,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1628,7 +2258,8 @@ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */)); compaction.reset(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1637,7 +2268,8 @@ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */)); compaction.reset(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } @@ -1662,7 +2294,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -1692,7 +2325,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != 
nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(4U, compaction->num_input_files(0)); @@ -1724,7 +2358,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_, 107)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(4U, compaction->num_input_files(0)); @@ -1733,6 +2368,336 @@ ASSERT_EQ(0, compaction->output_level()); } +#ifndef ROCKSDB_LITE +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a "regular" universal compaction is + // scheduled first, followed by a delete triggered compaction. The latter + // should fail + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300); + Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a compaction to reduce sorted runs + ASSERT_EQ(CompactionReason::kUniversalSortedRunNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + + AddVersionStorage(); + // Simulate a flush and mark the file for compaction 
+ Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); +} + +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled first, followed by a "regular" compaction. The latter + // should fail + NewVersionStorage(5, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250); + Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(3, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + + AddVersionStorage(); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); +} + +TEST_F(CompactionPickerTest, 
UniversalMarkedCompactionStartOutputOverlap) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + + bool input_level_overlap = false; + bool output_level_overlap = false; + // Let's mark 2 files in 2 different levels for compaction. The + // compaction picker will randomly pick one, so use the sync point to + // ensure a deterministic order. Loop until both cases are covered + size_t random_index = 0; + SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) { + size_t* index = static_cast(arg); + *index = random_index; + }); + SyncPoint::GetInstance()->EnableProcessing(); + while (!input_level_overlap || !output_level_overlap) { + // Ensure that the L0 file gets picked first + random_index = !input_level_overlap ? 0 : 1; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248); + Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249); + Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250); + Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true); + Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_TRUE(compaction->start_level() == 0 || + compaction->start_level() == 3); + if (compaction->start_level() == 0) { + // The L0 file was 
picked. The next compaction will detect an + // overlap on its input level + input_level_overlap = true; + ASSERT_EQ(3, compaction->output_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->num_input_files(1)); + } else { + // The level 3 file was picked. The next compaction will pick + // the L0 file and will detect overlap when adding output + // level inputs + output_level_overlap = true; + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + } + + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + // After recomputing the compaction score, only one marked file will remain + random_index = 0; + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); + DeleteVersionStorage(); + } +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled and should result in a full compaction + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + 
compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[6].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a file is being compacted, and a + // delete triggered compaction is then scheduled. The latter should stop + // at the first file being compacted + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + file_map_[3].first->being_compacted = true; + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker 
universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled first, followed by a "regular" compaction. The latter + // should fail + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[5].first->being_compacted); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[6].first->being_compacted); + + AddVersionStorage(); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction2); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[1].first->being_compacted); + ASSERT_TRUE(file_map_[2].first->being_compacted); + ASSERT_TRUE(file_map_[4].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) { + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + + // This test makes sure the `files_marked_for_compaction_` is updated after + // creating manual 
compaction. + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + + // Add 3 files marked for compaction + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); + UpdateVersionStorageInfo(); + + // All 3 files are marked for compaction + ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size()); + + bool manual_conflict = false; + InternalKey* manual_end = NULL; + std::unique_ptr compaction( + universal_compaction_picker.CompactRange( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(), NULL, + NULL, &manual_end, &manual_conflict, port::kMaxUint64)); + + ASSERT_TRUE(compaction); + + ASSERT_EQ(CompactionReason::kManualCompaction, + compaction->compaction_reason()); + ASSERT_EQ(kNumLevels - 1, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); + + // After creating the manual compaction, all files should be cleared from + // `FilesMarkedForCompaction`. So they won't be picked by others. 
+ ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size()); +} + +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,9 +15,11 @@ #include #include #include + #include "db/column_family.h" #include "file/filename.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/statistics.h" #include "test_util/sync_point.h" #include "util/random.h" @@ -31,17 +33,16 @@ // PickCompaction(). class UniversalCompactionBuilder { public: - UniversalCompactionBuilder(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp, - const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - UniversalCompactionPicker* picker, - LogBuffer* log_buffer) + UniversalCompactionBuilder( + const ImmutableOptions& ioptions, const InternalKeyComparator* icmp, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, LogBuffer* log_buffer) : ioptions_(ioptions), icmp_(icmp), cf_name_(cf_name), mutable_cf_options_(mutable_cf_options), + mutable_db_options_(mutable_db_options), vstorage_(vstorage), picker_(picker), log_buffer_(log_buffer) {} @@ -88,6 +89,14 @@ // Pick Universal compaction to limit space amplification. Compaction* PickCompactionToReduceSizeAmp(); + // Try to pick incremental compaction to reduce space amplification. + // It will return null if it cannot find a fanout within the threshold. 
+ // Fanout is defined as + // total size of files to compact at output level + // -------------------------------------------------- + // total size of files to compact at other levels + Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold); + Compaction* PickDeleteTriggeredCompaction(); // Form a compaction from the sorted run indicated by start_index to the @@ -103,25 +112,27 @@ // because some files are being compacted. Compaction* PickPeriodicCompaction(); - // Used in universal compaction when the enabled_trivial_move + // Used in universal compaction when the allow_trivial_move // option is set. Checks whether there are any overlapping files // in the input. Returns true if the input files are non // overlapping. bool IsInputFilesNonOverlapping(Compaction* c); - const ImmutableCFOptions& ioptions_; + uint64_t GetMaxOverlappingBytes() const; + + const ImmutableOptions& ioptions_; const InternalKeyComparator* icmp_; double score_; std::vector sorted_runs_; const std::string& cf_name_; const MutableCFOptions& mutable_cf_options_; + const MutableDBOptions& mutable_db_options_; VersionStorageInfo* vstorage_; UniversalCompactionPicker* picker_; LogBuffer* log_buffer_; static std::vector CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options); + const VersionStorageInfo& vstorage); // Pick a path ID to place a newly generated file, with its estimated file // size. 
@@ -158,9 +169,9 @@ const Comparator* ucmp_; }; -typedef std::priority_queue, - SmallestKeyHeapComparator> - SmallestKeyHeap; +using SmallestKeyHeap = + std::priority_queue, + SmallestKeyHeapComparator>; // This function creates the heap that is used to find if the files are // overlapping during universal compaction when the allow_trivial_move @@ -278,11 +289,11 @@ Compaction* UniversalCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, - SequenceNumber /* earliest_memtable_seqno */) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) { UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, - mutable_cf_options, vstorage, this, - log_buffer); + mutable_cf_options, mutable_db_options, + vstorage, this, log_buffer); return builder.PickCompaction(); } @@ -325,8 +336,7 @@ std::vector UniversalCompactionBuilder::CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/, - const MutableCFOptions& mutable_cf_options) { + const VersionStorageInfo& vstorage) { std::vector ret; for (FileMetaData* f : vstorage.LevelFiles(0)) { ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, @@ -336,27 +346,16 @@ uint64_t total_compensated_size = 0U; uint64_t total_size = 0U; bool being_compacted = false; - bool is_first = true; for (FileMetaData* f : vstorage.LevelFiles(level)) { total_compensated_size += f->compensated_file_size; total_size += f->fd.GetFileSize(); - if (mutable_cf_options.compaction_options_universal.allow_trivial_move == - true) { - if (f->being_compacted) { - being_compacted = f->being_compacted; - } - } else { - // Compaction always includes all files for a non-zero level, so for a - // non-zero level, all the files should share the same being_compacted - // value. 
- // This assumption is only valid when - // mutable_cf_options.compaction_options_universal.allow_trivial_move - // is false - assert(is_first || f->being_compacted == being_compacted); - } - if (is_first) { + // Size amp, read amp and periodic compactions always include all files + // for a non-zero level. However, a delete triggered compaction and + // a trivial move might pick a subset of files in a sorted run. So + // always check all files in a sorted run and mark the entire run as + // being compacted if one or more files are being compacted + if (f->being_compacted) { being_compacted = f->being_compacted; - is_first = false; } } if (total_compensated_size > 0) { @@ -372,8 +371,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { const int kLevel0 = 0; score_ = vstorage_->CompactionScore(kLevel0); - sorted_runs_ = - CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_); + sorted_runs_ = CalculateSortedRuns(*vstorage_); if (sorted_runs_.size() == 0 || (vstorage_->FilesMarkedForPeriodicCompaction().empty() && @@ -389,7 +387,7 @@ VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER_MAX_SZ( log_buffer_, 3072, - "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n", + "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n", cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp)); Compaction* c = nullptr; @@ -475,7 +473,6 @@ // validate that all the chosen files of L0 are non overlapping in time #ifndef NDEBUG - SequenceNumber prev_smallest_seqno = 0U; bool is_first = true; size_t level_index = 0U; @@ -485,7 +482,6 @@ if (is_first) { is_first = false; } - prev_smallest_seqno = f->fd.smallest_seqno; } level_index = 1U; } @@ -497,22 +493,16 @@ &largest_seqno); if (is_first) { is_first = false; - } else if (prev_smallest_seqno > 0) { - // A level is considered as the bottommost level if there are - // no files in higher levels or if files in higher levels do - // not overlap with the files being compacted. 
Sequence numbers - // of files in bottommost level can be set to 0 to help - // compression. As a result, the following assert may not hold - // if the prev_smallest_seqno is 0. - assert(prev_smallest_seqno > largest_seqno); } - prev_smallest_seqno = smallest_seqno; } } #endif // update statistics - RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); + size_t num_files = 0; + for (auto& each_level : *c->inputs()) { + num_files += each_level.files.size(); + } + RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files); picker_->RegisterCompaction(c); vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); @@ -737,6 +727,19 @@ cf_name_.c_str(), file_num_buf); } + std::vector grandparents; + // Include grandparents for potential file cutting in incremental + // mode. It is for aligning file cutting boundaries across levels, + // so that subsequent compactions can pick files with aligned + // buffer. + // Single files are only picked up in incremental mode, so that + // there is no need for full range. 
+ if (mutable_cf_options_.compaction_options_universal.incremental && + first_index_after < sorted_runs_.size() && + sorted_runs_[first_index_after].level > 1) { + grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level); + } + CompactionReason compaction_reason; if (max_number_of_files_to_compact == UINT_MAX) { compaction_reason = CompactionReason::kUniversalSizeRatio; @@ -744,21 +747,22 @@ compaction_reason = CompactionReason::kUniversalSortedRunNum; } return new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), - output_level, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), - LLONG_MAX, path_id, + GetMaxOverlappingBytes(), path_id, GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, 1, enable_compression), - GetCompressionOptions(ioptions_, vstorage_, start_level, + GetCompressionOptions(mutable_cf_options_, vstorage_, start_level, enable_compression), - /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, - score_, false /* deletion_compaction */, compaction_reason); + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, /* is manual */ false, score_, + false /* deletion_compaction */, compaction_reason); } // Look at overall size amplification. If size amplification -// exceeeds the configured value, then do a compaction +// exceeds the configured value, then do a compaction // of the candidate files all the way upto the earliest // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). 
@@ -779,7 +783,7 @@ } // Skip files that are already being compacted - for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) { + for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { sr = &sorted_runs_[loop]; if (!sr->being_compacted) { start_index = loop; // Consider this as the first candidate. @@ -807,9 +811,11 @@ } // keep adding up all the remaining files - for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) { + for (size_t loop = start_index; loop + 1 < sorted_runs_.size(); loop++) { sr = &sorted_runs_[loop]; if (sr->being_compacted) { + // TODO with incremental compaction is supported, we might want to + // schedule some incremental compactions in parallel if needed. char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER( @@ -843,34 +849,288 @@ " earliest-file-size %" PRIu64, cf_name_.c_str(), candidate_size, earliest_file_size); } + // Since incremental compaction can't include more than second last + // level, it can introduce penalty, compared to full compaction. We + // hard code the pentalty to be 80%. If we end up with a compaction + // fanout higher than 80% of full level compactions, we fall back + // to full level compaction. + // The 80% threshold is arbitrary and can be adjusted or made + // configurable in the future. + // This also prevent the case when compaction falls behind and we + // need to compact more levels for compactions to catch up. + if (mutable_cf_options_.compaction_options_universal.incremental) { + double fanout_threshold = static_cast(earliest_file_size) / + static_cast(candidate_size) * 1.8; + Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold); + if (picked != nullptr) { + // As the feature is still incremental, picking incremental compaction + // might fail and we will fall bck to compacting full level. 
+ return picked; + } + } return PickCompactionToOldest(start_index, CompactionReason::kUniversalSizeAmplification); } +Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( + double fanout_threshold) { + // Try find all potential compactions with total size just over + // options.max_compaction_size / 2, and take the one with the lowest + // fanout (defined in declaration of the function). + // This is done by having a sliding window of the files at the second + // lowest level, and keep expanding while finding overlapping in the + // last level. Once total size exceeds the size threshold, calculate + // the fanout value. And then shrinking from the small side of the + // window. Keep doing it until the end. + // Finally, we try to include upper level files if they fall into + // the range. + // + // Note that it is a similar problem as leveled compaction's + // kMinOverlappingRatio priority, but instead of picking single files + // we expand to a target compaction size. The reason is that in + // leveled compaction, actual fanout value tends to high, e.g. 10, so + // even with single file in down merging level, the extra size + // compacted in boundary files is at a lower ratio. But here users + // often have size of second last level size to be 1/4, 1/3 or even + // 1/2 of the bottommost level, so picking single file in second most + // level will cause significant waste, which is not desirable. + // + // This algorithm has lots of room to improve to pick more efficient + // compactions. + assert(sorted_runs_.size() >= 2); + int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level; + if (second_last_level == 0) { + // Can't split Level 0. 
+ return nullptr; + } + int output_level = sorted_runs_.back().level; + const std::vector& bottom_files = + vstorage_->LevelFiles(output_level); + const std::vector& files = + vstorage_->LevelFiles(second_last_level); + assert(!bottom_files.empty()); + assert(!files.empty()); + + // std::unordered_map file_to_order; + + int picked_start_idx = 0; + int picked_end_idx = 0; + double picked_fanout = fanout_threshold; + + // Use half target compaction bytes as anchor to stop growing second most + // level files, and reserve growing space for more overlapping bottom level, + // clean cut, files from other levels, etc. + uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2; + int start_idx = 0; + int bottom_end_idx = 0; + int bottom_start_idx = 0; + uint64_t non_bottom_size = 0; + uint64_t bottom_size = 0; + bool end_bottom_size_counted = false; + for (int end_idx = 0; end_idx < static_cast(files.size()); end_idx++) { + FileMetaData* end_file = files[end_idx]; + + // Include bottom most level files smaller than the current second + // last level file. + int num_skipped = 0; + while (bottom_end_idx < static_cast(bottom_files.size()) && + icmp_->Compare(bottom_files[bottom_end_idx]->largest, + end_file->smallest) < 0) { + if (!end_bottom_size_counted) { + bottom_size += bottom_files[bottom_end_idx]->fd.file_size; + } + bottom_end_idx++; + end_bottom_size_counted = false; + num_skipped++; + } + + if (num_skipped > 1) { + // At least a file in the bottom most level falls into the file gap. No + // reason to include the file. We cut the range and start a new sliding + // window. + start_idx = end_idx; + } + + if (start_idx == end_idx) { + // new sliding window. + non_bottom_size = 0; + bottom_size = 0; + bottom_start_idx = bottom_end_idx; + end_bottom_size_counted = false; + } + + non_bottom_size += end_file->fd.file_size; + + // Include all overlapping files in bottom level. 
+ while (bottom_end_idx < static_cast(bottom_files.size()) && + icmp_->Compare(bottom_files[bottom_end_idx]->smallest, + end_file->largest) < 0) { + if (!end_bottom_size_counted) { + bottom_size += bottom_files[bottom_end_idx]->fd.file_size; + end_bottom_size_counted = true; + } + if (icmp_->Compare(bottom_files[bottom_end_idx]->largest, + end_file->largest) > 0) { + // next level file cross large boundary of current file. + break; + } + bottom_end_idx++; + end_bottom_size_counted = false; + } + + if ((non_bottom_size + bottom_size > comp_thres_size || + end_idx == static_cast(files.size()) - 1) && + non_bottom_size > 0) { // Do we alow 0 size file at all? + // If it is a better compaction, remember it in picked* variables. + double fanout = static_cast(bottom_size) / + static_cast(non_bottom_size); + if (fanout < picked_fanout) { + picked_start_idx = start_idx; + picked_end_idx = end_idx; + picked_fanout = fanout; + } + // Shrink from the start end to under comp_thres_size + while (non_bottom_size + bottom_size > comp_thres_size && + start_idx <= end_idx) { + non_bottom_size -= files[start_idx]->fd.file_size; + start_idx++; + if (start_idx < static_cast(files.size())) { + while (bottom_start_idx <= bottom_end_idx && + icmp_->Compare(bottom_files[bottom_start_idx]->largest, + files[start_idx]->smallest) < 0) { + bottom_size -= bottom_files[bottom_start_idx]->fd.file_size; + bottom_start_idx++; + } + } + } + } + } + + if (picked_fanout >= fanout_threshold) { + assert(picked_fanout == fanout_threshold); + return nullptr; + } + + std::vector inputs; + CompactionInputFiles bottom_level_inputs; + CompactionInputFiles second_last_level_inputs; + second_last_level_inputs.level = second_last_level; + bottom_level_inputs.level = output_level; + for (int i = picked_start_idx; i <= picked_end_idx; i++) { + if (files[i]->being_compacted) { + return nullptr; + } + second_last_level_inputs.files.push_back(files[i]); + } + assert(!second_last_level_inputs.empty()); + if 
(!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &second_last_level_inputs, + /*next_smallest=*/nullptr)) { + return nullptr; + } + // We might be able to avoid this binary search if we save and expand + // from bottom_start_idx and bottom_end_idx, but for now, we use + // SetupOtherInputs() for simplicity. + int parent_index = -1; // Create and use bottom_start_idx? + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &second_last_level_inputs, + &bottom_level_inputs, &parent_index, + /*base_index=*/-1)) { + return nullptr; + } + + // Try to include files in upper levels if they fall into the range. + // Since we need to go from lower level up and this is in the reverse + // order, compared to level order, we first write to an reversed + // data structure and finally copy them to compaction inputs. + InternalKey smallest, largest; + picker_->GetRange(second_last_level_inputs, &smallest, &largest); + std::vector inputs_reverse; + for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) { + SortedRun& sr = *it; + if (sr.level == 0) { + break; + } + std::vector level_inputs; + vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest, + &level_inputs); + if (!level_inputs.empty()) { + inputs_reverse.push_back({}); + inputs_reverse.back().level = sr.level; + inputs_reverse.back().files = level_inputs; + picker_->GetRange(inputs_reverse.back(), &smallest, &largest); + } + } + for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) { + inputs.push_back(*it); + } + + inputs.push_back(second_last_level_inputs); + inputs.push_back(bottom_level_inputs); + + // TODO support multi paths? 
+ uint32_t path_id = 0; + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + GetMaxOverlappingBytes(), path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1, true /* enable_compression */), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, + true /* enable_compression */), + Temperature::kUnknown, + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score_, false /* deletion_compaction */, + CompactionReason::kUniversalSizeAmplification); +} + // Pick files marked for compaction. Typically, files are marked by // CompactOnDeleteCollector due to the presence of tombstones. Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { CompactionInputFiles start_level_inputs; int output_level; std::vector inputs; + std::vector grandparents; if (vstorage_->num_levels() == 1) { // This is single level universal. Since we're basically trying to reclaim // space by processing files marked for compaction due to high tombstone // density, let's do the same thing as compaction to reduce size amp which // has the same goals. - bool compact = false; + int start_index = -1; start_level_inputs.level = 0; start_level_inputs.files.clear(); output_level = 0; - for (FileMetaData* f : vstorage_->LevelFiles(0)) { - if (f->marked_for_compaction) { - compact = true; + // Find the first file marked for compaction. Ignore the last file + for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { + SortedRun* sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + continue; } - if (compact) { + FileMetaData* f = vstorage_->LevelFiles(0)[loop]; + if (f->marked_for_compaction) { start_level_inputs.files.push_back(f); + start_index = + static_cast(loop); // Consider this as the first candidate. 
+ break; + } + } + if (start_index < 0) { + // Either no file marked, or they're already being compacted + return nullptr; + } + + for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) { + SortedRun* sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + break; } + + FileMetaData* f = vstorage_->LevelFiles(0)[loop]; + start_level_inputs.files.push_back(f); } if (start_level_inputs.size() <= 1) { // If only the last file in L0 is marked for compaction, ignore it @@ -939,6 +1199,9 @@ if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) { return nullptr; } + + picker_->GetGrandparents(vstorage_, start_level_inputs, + output_level_inputs, &grandparents); } else { inputs.push_back(start_level_inputs); } @@ -952,16 +1215,17 @@ uint32_t path_id = GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); return new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), - output_level, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), - /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, + /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id, GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, output_level, 1), - GetCompressionOptions(ioptions_, vstorage_, output_level), - /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true, - score_, false /* deletion_compaction */, + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level), + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, /* is manual */ false, score_, + false /* deletion_compaction */, CompactionReason::kFilesMarkedForCompaction); } @@ -1001,6 +1265,9 @@ comp_reason_print_string = "size amp"; } else { assert(false); + comp_reason_print_string = "unknown: "; + comp_reason_print_string.append( + std::to_string(static_cast(compaction_reason))); } 
char file_num_buf[256]; @@ -1022,15 +1289,16 @@ // compaction_options_universal.compression_size_percent, // because we always compact all the files, so always compress. return new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), - output_level, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), - LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, - 1, true /* enable_compression */), - GetCompressionOptions(ioptions_, vstorage_, start_level, + GetMaxOverlappingBytes(), path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1, true /* enable_compression */), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, true /* enable_compression */), + Temperature::kUnknown, /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, score_, false /* deletion_compaction */, compaction_reason); } @@ -1100,6 +1368,17 @@ return c; } + +uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const { + if (!mutable_cf_options_.compaction_options_universal.incremental) { + return port::kMaxUint64; + } else { + // Try to align cutting boundary with files at the next level if the + // file isn't end up with 1/2 of target size, or it would overlap + // with two full size files at the next level. 
+ return mutable_cf_options_.target_file_size_base / 2 * 3; + } +} } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,12 +15,13 @@ namespace ROCKSDB_NAMESPACE { class UniversalCompactionPicker : public CompactionPicker { public: - UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,825 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class TestCompactionServiceBase { + public: + virtual int GetCompactionNum() = 0; + + void OverrideStartStatus(CompactionServiceJobStatus s) { + is_override_start_status = true; + override_start_status = s; + } + + void OverrideWaitStatus(CompactionServiceJobStatus s) { + is_override_wait_status = true; + override_wait_status = s; + } + + void OverrideWaitResult(std::string str) { + is_override_wait_result = true; + override_wait_result = std::move(str); + } + + void ResetOverride() { + is_override_wait_result = false; + is_override_start_status = false; + is_override_wait_status = false; + } + + virtual ~TestCompactionServiceBase() = default; + + protected: + bool is_override_start_status = false; + CompactionServiceJobStatus override_start_status = + CompactionServiceJobStatus::kFailure; + bool is_override_wait_status = false; + CompactionServiceJobStatus override_wait_status = + CompactionServiceJobStatus::kFailure; + bool is_override_wait_result = false; + std::string override_wait_result; +}; + +class MyTestCompactionServiceLegacy : public CompactionService, + public TestCompactionServiceBase { + public: + MyTestCompactionServiceLegacy(std::string db_path, Options& options, + std::shared_ptr& statistics) + : db_path_(std::move(db_path)), + options_(options), + statistics_(statistics) {} + + static const char* kClassName() { return "MyTestCompactionServiceLegacy"; } + + const char* Name() const override { return kClassName(); } + + CompactionServiceJobStatus Start(const std::string& compaction_service_input, + uint64_t job_id) override { + InstrumentedMutexLock l(&mutex_); + jobs_.emplace(job_id, compaction_service_input); + CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; + if (is_override_start_status) { + return override_start_status; + } + return s; + } + + CompactionServiceJobStatus WaitForComplete( + uint64_t 
job_id, std::string* compaction_service_result) override { + std::string compaction_input; + { + InstrumentedMutexLock l(&mutex_); + auto i = jobs_.find(job_id); + if (i == jobs_.end()) { + return CompactionServiceJobStatus::kFailure; + } + compaction_input = std::move(i->second); + jobs_.erase(i); + } + + if (is_override_wait_status) { + return override_wait_status; + } + + CompactionServiceOptionsOverride options_override; + options_override.env = options_.env; + options_override.file_checksum_gen_factory = + options_.file_checksum_gen_factory; + options_override.comparator = options_.comparator; + options_override.merge_operator = options_.merge_operator; + options_override.compaction_filter = options_.compaction_filter; + options_override.compaction_filter_factory = + options_.compaction_filter_factory; + options_override.prefix_extractor = options_.prefix_extractor; + options_override.table_factory = options_.table_factory; + options_override.sst_partitioner_factory = options_.sst_partitioner_factory; + options_override.statistics = statistics_; + + Status s = DB::OpenAndCompact( + db_path_, db_path_ + "/" + ROCKSDB_NAMESPACE::ToString(job_id), + compaction_input, compaction_service_result, options_override); + if (is_override_wait_result) { + *compaction_service_result = override_wait_result; + } + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + int GetCompactionNum() override { return compaction_num_.load(); } + + private: + InstrumentedMutex mutex_; + std::atomic_int compaction_num_{0}; + std::map jobs_; + const std::string db_path_; + Options options_; + std::shared_ptr statistics_; +}; + +class MyTestCompactionService : public CompactionService, + public TestCompactionServiceBase { + public: + MyTestCompactionService(std::string db_path, Options& options, + std::shared_ptr& statistics) + : db_path_(std::move(db_path)), + options_(options), + 
statistics_(statistics), + start_info_("na", "na", "na", 0, Env::TOTAL), + wait_info_("na", "na", "na", 0, Env::TOTAL) {} + + static const char* kClassName() { return "MyTestCompactionService"; } + + const char* Name() const override { return kClassName(); } + + CompactionServiceJobStatus StartV2( + const CompactionServiceJobInfo& info, + const std::string& compaction_service_input) override { + InstrumentedMutexLock l(&mutex_); + start_info_ = info; + assert(info.db_name == db_path_); + jobs_.emplace(info.job_id, compaction_service_input); + CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; + if (is_override_start_status) { + return override_start_status; + } + return s; + } + + CompactionServiceJobStatus WaitForCompleteV2( + const CompactionServiceJobInfo& info, + std::string* compaction_service_result) override { + std::string compaction_input; + assert(info.db_name == db_path_); + { + InstrumentedMutexLock l(&mutex_); + wait_info_ = info; + auto i = jobs_.find(info.job_id); + if (i == jobs_.end()) { + return CompactionServiceJobStatus::kFailure; + } + compaction_input = std::move(i->second); + jobs_.erase(i); + } + + if (is_override_wait_status) { + return override_wait_status; + } + + CompactionServiceOptionsOverride options_override; + options_override.env = options_.env; + options_override.file_checksum_gen_factory = + options_.file_checksum_gen_factory; + options_override.comparator = options_.comparator; + options_override.merge_operator = options_.merge_operator; + options_override.compaction_filter = options_.compaction_filter; + options_override.compaction_filter_factory = + options_.compaction_filter_factory; + options_override.prefix_extractor = options_.prefix_extractor; + options_override.table_factory = options_.table_factory; + options_override.sst_partitioner_factory = options_.sst_partitioner_factory; + options_override.statistics = statistics_; + + Status s = DB::OpenAndCompact( + db_path_, db_path_ + "/" + 
ROCKSDB_NAMESPACE::ToString(info.job_id), + compaction_input, compaction_service_result, options_override); + if (is_override_wait_result) { + *compaction_service_result = override_wait_result; + } + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + int GetCompactionNum() override { return compaction_num_.load(); } + + CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; } + CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; } + + private: + InstrumentedMutex mutex_; + std::atomic_int compaction_num_{0}; + std::map jobs_; + const std::string db_path_; + Options options_; + std::shared_ptr statistics_; + CompactionServiceJobInfo start_info_; + CompactionServiceJobInfo wait_info_; +}; + +// This is only for listing test classes +enum TestCompactionServiceType { + MyTestCompactionServiceType, + MyTestCompactionServiceLegacyType, +}; + +class CompactionServiceTest + : public DBTestBase, + public testing::WithParamInterface { + public: + explicit CompactionServiceTest() + : DBTestBase("compaction_service_test", true) {} + + protected: + void ReopenWithCompactionService(Options* options) { + options->env = env_; + primary_statistics_ = CreateDBStatistics(); + options->statistics = primary_statistics_; + compactor_statistics_ = CreateDBStatistics(); + TestCompactionServiceType cs_type = GetParam(); + switch (cs_type) { + case MyTestCompactionServiceType: + compaction_service_ = std::make_shared( + dbname_, *options, compactor_statistics_); + break; + case MyTestCompactionServiceLegacyType: + compaction_service_ = std::make_shared( + dbname_, *options, compactor_statistics_); + break; + default: + assert(false); + } + options->compaction_service = compaction_service_; + DestroyAndReopen(*options); + } + + Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); } + + Statistics* GetPrimaryStatistics() 
{ return primary_statistics_.get(); } + + TestCompactionServiceBase* GetCompactionService() { + CompactionService* cs = compaction_service_.get(); + return dynamic_cast(cs); + } + + void GenerateTestData() { + // Generate 20 files @ L2 + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + // Generate 10 files @ L1 overlap with all 20 files @ L2 + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + ASSERT_EQ(FilesPerLevel(), "0,10,20"); + } + + void VerifyTestData() { + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + } + + private: + std::shared_ptr compactor_statistics_; + std::shared_ptr primary_statistics_; + std::shared_ptr compaction_service_; +}; + +TEST_P(CompactionServiceTest, BasicCompactions) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + Statistics* primary_statistics = GetPrimaryStatistics(); + Statistics* compactor_statistics = GetCompactorStatistics(); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + 
} + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + + // make sure the compaction statistics is only recorded on the remote side + ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1); + ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1); + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0); + // even with remote compaction, primary host still needs to read SST files to + // `verify_table()`. + ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1); + // all the compaction write happens on the remote side + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES)); + ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES), + primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES)); + // compactor is already the remote side, which doesn't have remote + ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0); + ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + 0); + + // Test failed compaction + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) { + // override job status + auto s = static_cast(status); + *s = Status::Aborted("MyTestCompactionService failed to compact!"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s; + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + s = Put(Key(key_id), "value_new" + ToString(key_id)); + if (s.IsAborted()) { + break; + } + } + if (s.IsAborted()) { + break; + } + s = Flush(); + if (s.IsAborted()) { + break; + } + s = dbfull()->TEST_WaitForCompact(); + if (s.IsAborted()) { + break; + } + } + ASSERT_TRUE(s.IsAborted()); +} + +TEST_P(CompactionServiceTest, ManualCompaction) { + 
Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + start_str = Key(120); + start = start_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + end_str = Key(92); + end = end_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); +} + +TEST_P(CompactionServiceTest, FailedToStart) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + + auto my_cs = GetCompactionService(); + my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_TRUE(s.IsIncomplete()); +} + +TEST_P(CompactionServiceTest, InvalidResult) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + + auto my_cs = GetCompactionService(); + my_cs->OverrideWaitResult("Invalid Str"); + + std::string start_str = Key(15); + 
std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_FALSE(s.ok()); +} + +TEST_P(CompactionServiceTest, SubCompaction) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + VerifyTestData(); + + auto my_cs = GetCompactionService(); + int compaction_num_before = my_cs->GetCompactionNum(); + + auto cro = CompactRangeOptions(); + cro.max_subcompactions = 10; + Status s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(s); + VerifyTestData(); + int compaction_num = my_cs->GetCompactionNum() - compaction_num_before; + // make sure there's sub-compaction by checking the compaction number + ASSERT_GE(compaction_num, 2); +} + +class PartialDeleteCompactionFilter : public CompactionFilter { + public: + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int i = std::stoi(key.ToString().substr(3)); + if (i > 5 && i <= 105) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + const char* Name() const override { return "PartialDeleteCompactionFilter"; } +}; + +TEST_P(CompactionServiceTest, CompactionFilter) { + Options options = CurrentOptions(); + std::unique_ptr delete_comp_filter( + new PartialDeleteCompactionFilter()); + options.compaction_filter = delete_comp_filter.get(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 
20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i > 5 && i <= 105) { + ASSERT_EQ(result, "NOT_FOUND"); + } else if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); +} + +TEST_P(CompactionServiceTest, Snapshot) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), "value2")); + ASSERT_OK(Put(Key(3), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + ASSERT_EQ("value1", Get(Key(1), s1)); + ASSERT_EQ("value2", Get(Key(1))); + db_->ReleaseSnapshot(s1); +} + +TEST_P(CompactionServiceTest, ConcurrentCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; + options.max_background_jobs = 20; + ReopenWithCompactionService(&options); + GenerateTestData(); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + std::vector threads; + for (const auto& file : meta.levels[1].files) { + threads.push_back(std::thread([&]() { + std::string fname = file.db_path + "/" + file.name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2)); + })); + } + + for (auto& thread : threads) { + thread.join(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, 
"value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_EQ(my_cs->GetCompactionNum(), 10); + ASSERT_EQ(FilesPerLevel(), "0,0,10"); +} + +TEST_P(CompactionServiceTest, CompactionInfo) { + // only test compaction info for new compaction service interface + if (GetParam() != MyTestCompactionServiceType) { + return; + } + + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + auto my_cs = + static_cast_with_check(GetCompactionService()); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_GE(comp_num, 1); + + CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(dbname_, info.db_name); + std::string db_id, db_session_id; + ASSERT_OK(db_->GetDbIdentity(db_id)); + ASSERT_EQ(db_id, info.db_id); + ASSERT_OK(db_->GetDbSessionId(db_session_id)); + ASSERT_EQ(db_session_id, info.db_session_id); + ASSERT_EQ(Env::LOW, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(dbname_, info.db_name); + ASSERT_EQ(db_id, info.db_id); + ASSERT_EQ(db_session_id, info.db_session_id); + ASSERT_EQ(Env::LOW, info.priority); + + // Test priority USER + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + SstFileMetaData file = meta.levels[1].files[0]; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), + {file.db_path + "/" + file.name}, 2)); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(Env::USER, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(Env::USER, info.priority); + + 
// Test priority BOTTOM + env_->SetBackgroundThreads(1, Env::BOTTOM); + options.num_levels = 2; + ReopenWithCompactionService(&options); + my_cs = + static_cast_with_check(GetCompactionService()); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(Env::BOTTOM, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(Env::BOTTOM, info.priority); +} + +TEST_P(CompactionServiceTest, FallbackLocalAuto) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + auto my_cs = GetCompactionService(); + Statistics* compactor_statistics = GetCompactorStatistics(); + Statistics* primary_statistics = GetPrimaryStatistics(); + uint64_t compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + uint64_t primary_write_bytes = + primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + + 
ASSERT_EQ(my_cs->GetCompactionNum(), 0); + + // make sure the compaction statistics is only recorded on the local side + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0); +} + +TEST_P(CompactionServiceTest, FallbackLocalManual) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + VerifyTestData(); + + auto my_cs = GetCompactionService(); + Statistics* compactor_statistics = GetCompactorStatistics(); + Statistics* primary_statistics = GetPrimaryStatistics(); + uint64_t compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + uint64_t primary_write_bytes = + primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + // re-enable remote compaction + my_cs->ResetOverride(); + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + // make sure the compaction statistics is only recorded on the remote side + ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES)); + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + + // return run local again with API WaitForComplete + my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal); + start_str = Key(120); + start = start_str; + comp_num = 
my_cs->GetCompactionNum(); + compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(my_cs->GetCompactionNum(), + comp_num); // no remote compaction is run + // make sure the compaction statistics is only recorded on the local side + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_write_bytes); + + // verify result after 2 manual compactions + VerifyTestData(); +} + +INSTANTIATE_TEST_CASE_P( + CompactionServiceTest, CompactionServiceTest, + ::testing::Values( + TestCompactionServiceType::MyTestCompactionServiceType, + TestCompactionServiceType::MyTestCompactionServiceLegacyType)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/file_pri.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/file_pri.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/file_pri.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/file_pri.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once +#include + +#include "db/version_edit.h" + +namespace ROCKSDB_NAMESPACE { +// We boost files that are closer to TTL limit. This boosting could be +// through FileMetaData.compensated_file_size but this compensated size +// is widely used as something similar to file size so dramatically boost +// the value might cause unintended consequences. +// +// This boosting algorithm can go very fancy, but here we use a simple +// formula which can satisify: +// (1) Different levels are triggered slightly differently to avoid +// too many cascading cases +// (2) Files in the same level get boosting more when TTL gets closer. +// +// Don't do any boosting before TTL has past by half. This is to make +// sure lower write amp for most of the case. And all levels should be +// fully boosted when total TTL compaction threshold triggers. +// Differientiate boosting ranges of each level by 1/2. This will make +// range for each level exponentially increasing. We could do it by +// having them to be equal, or go even fancier. We can adjust it after +// we observe the behavior in production. +// The threshold starting boosting: +// +------------------------------------------------------------------ + +// ^ ^ ^ ^ ^ ^ +// Age 0 ... | | second last level thresold +// | | +// | third last level +// | +// forth last level +// +// We arbitrarily set with 0 when a file is aged boost_age_start and +// grow linearly. The ratio is arbitrarily set so that when the next level +// starts to boost, the previous level's boosting amount is 16. 
+class FileTtlBooster { + public: + FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels, + int level) + : current_time_(current_time) { + if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) { + enabled_ = false; + boost_age_start_ = 0; + boost_step_ = 1; + } else { + enabled_ = true; + uint64_t all_boost_start_age = ttl / 2; + uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age; + uint64_t boost_age_range = + all_boost_age_range >> (num_non_empty_levels - level - 1); + boost_age_start_ = all_boost_start_age + boost_age_range; + const uint64_t kBoostRatio = 16; + // prevent 0 value to avoid divide 0 error. + boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1}); + } + } + + uint64_t GetBoostScore(FileMetaData* f) { + if (!enabled_) { + return 1; + } + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time >= current_time_) { + return 1; + } + uint64_t age = current_time_ - oldest_ancester_time; + if (age > boost_age_start_) { + // Use integer just for convenience. + // We could make all file_to_order double if we want. + // Technically this can overflow if users override timing and + // give a very high current time. Ignore the case for simplicity. + // Boosting is addition to current value, so +1. This will effectively + // make boosting to kick in after the first boost_step_ is reached. 
+ return (age - boost_age_start_) / boost_step_ + 1; + } + return 1; + } + + private: + bool enabled_; + uint64_t current_time_; + uint64_t boost_age_start_; + uint64_t boost_step_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "rocksdb/sst_partitioner.h" + +#include + +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +static std::unordered_map + sst_fixed_prefix_type_info = { +#ifndef ROCKSDB_LITE + {"length", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len) + : len_(len) { + RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info); +} + +PartitionerResult SstPartitionerFixedPrefix::ShouldPartition( + const PartitionerRequest& request) { + Slice last_key_fixed(*request.prev_user_key); + if (last_key_fixed.size() > len_) { + last_key_fixed.size_ = len_; + } + Slice current_key_fixed(*request.current_user_key); + if (current_key_fixed.size() > len_) { + current_key_fixed.size_ = len_; + } + return last_key_fixed.compare(current_key_fixed) != 0 ? 
kRequired + : kNotRequired; +} + +bool SstPartitionerFixedPrefix::CanDoTrivialMove( + const Slice& smallest_user_key, const Slice& largest_user_key) { + return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key, + 0)) == kNotRequired; +} + +std::unique_ptr +SstPartitionerFixedPrefixFactory::CreatePartitioner( + const SstPartitioner::Context& /* context */) const { + return std::unique_ptr(new SstPartitionerFixedPrefix(len_)); +} + +std::shared_ptr NewSstPartitionerFixedPrefixFactory( + size_t prefix_len) { + return std::make_shared(prefix_len); +} + +#ifndef ROCKSDB_LITE +namespace { +static int RegisterSstPartitionerFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + SstPartitionerFixedPrefixFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new SstPartitionerFixedPrefixFactory(0)); + return guard->get(); + }); + return 1; +} +} // namespace +#endif // ROCKSDB_LITE + +Status SstPartitionerFactory::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(options, value, nullptr, + result); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/comparator_db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/comparator_db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/comparator_db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/comparator_db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,10 +13,10 @@ #include "test_util/testutil.h" #include "util/hash.h" #include "util/kv_map.h" +#include "util/random.h" #include "util/string_util.h" #include "utilities/merge_operators.h" -using 
std::unique_ptr; namespace ROCKSDB_NAMESPACE { namespace { @@ -317,7 +317,7 @@ INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest, testing::Values(test::kDefaultFormatVersion)); INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest, - testing::Values(test::kLatestFormatVersion)); + testing::Values(kLatestFormatVersion)); TEST_P(ComparatorDBTest, Bytewise) { for (int rand_seed = 301; rand_seed < 306; rand_seed++) { @@ -342,12 +342,12 @@ std::vector source_prefixes; // Randomly generate 5 prefixes for (int i = 0; i < 5; i++) { - source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8)); + source_prefixes.push_back(rnd.HumanReadableString(8)); } for (int j = 0; j < 20; j++) { int prefix_index = rnd.Uniform(static_cast(source_prefixes.size())); std::string key = source_prefixes[prefix_index] + - test::RandomHumanReadableString(&rnd, rnd.Uniform(8)); + rnd.HumanReadableString(rnd.Uniform(8)); source_strings.push_back(key); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/convenience.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/convenience.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/convenience.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/convenience.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { void CancelAllBackgroundWork(DB* db, bool wait) { - (static_cast_with_check(db->GetRootDB())) + (static_cast_with_check(db->GetRootDB())) ->CancelAllBackgroundWork(wait); } @@ -28,7 +28,7 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - return (static_cast_with_check(db->GetRootDB())) + return (static_cast_with_check(db->GetRootDB())) ->DeleteFilesInRanges(column_family, ranges, n, include_end); } @@ -44,7 +44,7 @@ std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); - ImmutableCFOptions ioptions(options); + ImmutableOptions 
ioptions(options); Status s = ioptions.fs->NewRandomAccessFile(file_path, FileOptions(env_options), @@ -59,9 +59,10 @@ new RandomAccessFileReader(std::move(file), file_path)); const bool kImmortal = true; s = ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, options.prefix_extractor.get(), env_options, + TableReaderOptions(ioptions, options.prefix_extractor, env_options, internal_comparator, false /* skip_filters */, - !kImmortal, -1 /* level */), + !kImmortal, false /* force_direct_prefetch */, + -1 /* level */), std::move(file_reader), file_size, &table_reader, false /* prefetch_index_and_filter_in_cache */); if (!s.ok()) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/corruption_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/corruption_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/corruption_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/corruption_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,37 +9,65 @@ #ifndef ROCKSDB_LITE -#include "rocksdb/db.h" - -#include #include #include #include + #include + #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/log_format.h" #include "db/version_set.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" +#include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" +#include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -static const int kValueSize = 1000; +static constexpr int kValueSize = 1000; +namespace { +// A wrapper that allows injection of errors. 
+class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + explicit ErrorEnv(Env* _target) + : EnvWrapper(_target), + writable_file_error_(false), + num_writable_file_errors_(0) {} + const char* Name() const override { return "ErrorEnv"; } + + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; +} // namespace class CorruptionTest : public testing::Test { public: - test::ErrorEnv env_; + std::shared_ptr env_guard_; + ErrorEnv* env_; std::string dbname_; std::shared_ptr tiny_cache_; Options options_; @@ -50,10 +78,16 @@ // set it to 0), test SequenceNumberRecovery will fail, likely because of a // bug in recovery code. Keep it 4 for now to make the test passes. tiny_cache_ = NewLRUCache(100, 4); + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(base_env, nullptr); + env_ = new ErrorEnv(base_env); options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; - options_.env = &env_; - dbname_ = test::PerThreadDBPath("corruption_test"); - DestroyDB(dbname_, options_); + options_.env = env_; + dbname_ = test::PerThreadDBPath(env_, "corruption_test"); + Status s = DestroyDB(dbname_, options_); + EXPECT_OK(s); db_ = nullptr; options_.create_if_missing = true; @@ -65,8 +99,19 @@ } ~CorruptionTest() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); delete db_; - DestroyDB(dbname_, Options()); + db_ = nullptr; + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still at %s\n", dbname_.c_str()); + } else { + Options opts; + opts.env = env_->target(); + 
EXPECT_OK(DestroyDB(dbname_, opts)); + } + delete env_; } void CloseDb() { @@ -81,7 +126,7 @@ if (opt.env == Options().env) { // If env is not overridden, replace it with ErrorEnv. // Otherwise, the test already uses a non-default Env. - opt.env = &env_; + opt.env = env_; } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; @@ -101,22 +146,24 @@ ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_)); } - void Build(int n, int flush_every = 0) { + void Build(int n, int start, int flush_every) { std::string key_space, value_space; WriteBatch batch; for (int i = 0; i < n; i++) { if (flush_every != 0 && i != 0 && i % flush_every == 0) { - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); } //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); - Slice key = Key(i, &key_space); + Slice key = Key(i + start, &key_space); batch.Clear(); - batch.Put(key, Value(i, &value_space)); + ASSERT_OK(batch.Put(key, Value(i + start, &value_space))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); } } + void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); } + void Check(int min_expected, int max_expected) { uint64_t next_expected = 0; uint64_t missed = 0; @@ -131,6 +178,7 @@ // occurred. 
Iterator* iter = db_->NewIterator(ReadOptions(false, true)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); uint64_t key; Slice in(iter->key()); if (!ConsumeDecimalNumber(&in, &key) || @@ -147,6 +195,7 @@ correct++; } } + iter->status().PermitUncheckedError(); delete iter; fprintf(stderr, @@ -157,47 +206,10 @@ ASSERT_GE(max_expected, correct); } - void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - const char* msg = strerror(errno); - FAIL() << fname << ": " << msg; - } - - if (offset < 0) { - // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { - offset = 0; - } else { - offset = static_cast(sbuf.st_size + offset); - } - } - if (offset > sbuf.st_size) { - offset = static_cast(sbuf.st_size); - } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = static_cast(sbuf.st_size - offset); - } - - // Do it - std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - ASSERT_TRUE(s.ok()) << s.ToString(); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; - } - s = WriteStringToFile(Env::Default(), contents, fname); - ASSERT_TRUE(s.ok()) << s.ToString(); - Options options; - EnvOptions env_options; - options.file_system.reset(new LegacyFileSystemWrapper(options.env)); - ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname)); - } - void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { // Pick file to corrupt std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); uint64_t number; FileType type; std::string fname; @@ -212,7 +224,7 @@ } ASSERT_TRUE(!fname.empty()) << filetype; - CorruptFile(fname, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); } // corrupts exactly one file at level `level`. 
if no file found at level, @@ -222,7 +234,8 @@ db_->GetLiveFilesMetaData(&metadata); for (const auto& m : metadata) { if (m.level == level) { - CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset, + bytes_to_corrupt)); return; } } @@ -256,11 +269,11 @@ // preserves the implementation that was in place when all of the // magic values in this file were picked. *storage = std::string(kValueSize, ' '); - return Slice(*storage); } else { Random r(k); - return test::RandomString(&r, kValueSize, storage); + *storage = r.RandomString(kValueSize); } + return Slice(*storage); } }; @@ -277,8 +290,8 @@ // is not available for WAL though. CloseDb(); #endif - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record + Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block ASSERT_TRUE(!TryReopen().ok()); options_.paranoid_checks = false; Reopen(&options_); @@ -288,14 +301,14 @@ } TEST_F(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; Status s = TryReopen(); ASSERT_TRUE(!s.ok()); } TEST_F(CorruptionTest, NewFileErrorDuringWrite) { // Do enough writing to force minor compaction - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; const int num = static_cast(3 + (Options().write_buffer_size / kValueSize)); std::string value_storage; @@ -303,7 +316,7 @@ bool failed = false; for (int i = 0; i < num; i++) { WriteBatch batch; - batch.Put("a", Value(100, &value_storage)); + ASSERT_OK(batch.Put("a", Value(100, &value_storage))); s = db_->Write(WriteOptions(), &batch); if (!s.ok()) { failed = true; @@ -311,17 +324,17 @@ ASSERT_TRUE(!failed || !s.ok()); } ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; + 
ASSERT_GE(env_->num_writable_file_errors_, 1); + env_->writable_file_error_ = false; Reopen(); } TEST_F(CorruptionTest, TableFile) { Build(100); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); - dbi->TEST_CompactRange(1, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); Corrupt(kTableFile, 100, 1); Check(99, 99); @@ -330,7 +343,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) { Options options; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_->target()); options.env = &senv; // Disable block cache as we are going to check checksum for // the same file twice and measure number of reads. @@ -341,10 +354,10 @@ Reopen(&options); Build(10000); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); - dbi->TEST_CompactRange(1, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); senv.count_random_reads_ = true; senv.random_read_counter_.Reset(); @@ -388,14 +401,14 @@ Reopen(&options); // build 2 tables, flush at 5000 Build(10000, 5000); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); // corrupt an index block of an entire file Corrupt(kTableFile, -2000, 500); options.paranoid_checks = false; Reopen(&options); - dbi = reinterpret_cast(db_); + dbi = static_cast_with_check(db_); // one full file may be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted Check(0, 5000); @@ -435,9 +448,9 @@ TEST_F(CorruptionTest, CorruptedDescriptor) { 
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); Corrupt(kDescriptorFile, 0, 1000); Status s = TryReopen(); @@ -452,12 +465,13 @@ TEST_F(CorruptionTest, CompactionInputError) { Options options; + options.env = env_; Reopen(&options); Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); - dbi->TEST_CompactRange(1, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); ASSERT_EQ(1, Property("rocksdb.num-files-at-level2")); Corrupt(kTableFile, 100, 1); @@ -472,29 +486,30 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { Options options; + options.env = env_; options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; Reopen(&options); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); // Fill levels >= 1 for (int level = 1; level < dbi->NumberLevels(); level++) { - dbi->Put(WriteOptions(), "", "begin"); - dbi->Put(WriteOptions(), "~", "end"); - dbi->TEST_FlushMemTable(); + ASSERT_OK(dbi->Put(WriteOptions(), "", "begin")); + ASSERT_OK(dbi->Put(WriteOptions(), "~", "end")); + ASSERT_OK(dbi->TEST_FlushMemTable()); for (int comp_level = 0; comp_level < dbi->NumberLevels() - level; ++comp_level) { - dbi->TEST_CompactRange(comp_level, nullptr, nullptr); + ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr)); } } Reopen(&options); - dbi = reinterpret_cast(db_); + dbi = static_cast_with_check(db_); Build(10); - dbi->TEST_FlushMemTable(); - dbi->TEST_WaitForCompact(); + 
ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForCompact()); ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); CorruptTableFileAtLevel(0, 100, 1); @@ -518,8 +533,8 @@ TEST_F(CorruptionTest, UnrelatedKeys) { Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); Corrupt(kTableFile, 100, 1); ASSERT_NOK(dbi->VerifyChecksum()); @@ -528,7 +543,7 @@ std::string v; ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); ASSERT_EQ(Value(1000, &tmp2).ToString(), v); - dbi->TEST_FlushMemTable(); + ASSERT_OK(dbi->TEST_FlushMemTable()); ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); ASSERT_EQ(Value(1000, &tmp2).ToString(), v); } @@ -542,37 +557,40 @@ ASSERT_EQ(static_cast(1), metadata.size()); std::string filename = dbname_ + metadata[0].name; - std::unique_ptr file; - ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions())); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - filename)); + FileOptions file_opts; + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts, + &file_reader, nullptr)); uint64_t file_size; - ASSERT_OK(options_.env->GetFileSize(filename, &file_size)); + ASSERT_OK( + fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); BlockHandle range_del_handle; - ASSERT_OK(FindMetaBlock( + ASSERT_OK(FindMetaBlockInFile( file_reader.get(), file_size, kBlockBasedTableMagicNumber, - ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle)); + ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle)); ASSERT_OK(TryReopen()); - CorruptFile(filename, static_cast(range_del_handle.offset()), 1); + ASSERT_OK(test::CorruptFile(env_, filename, + static_cast(range_del_handle.offset()), 1)); ASSERT_TRUE(TryReopen().IsCorruption()); } 
TEST_F(CorruptionTest, FileSystemStateCorrupted) { for (int iter = 0; iter < 2; ++iter) { Options options; + options.env = env_; options.paranoid_checks = true; options.create_if_missing = true; Reopen(&options); Build(10); ASSERT_OK(db_->Flush(FlushOptions())); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); std::vector metadata; dbi->GetLiveFilesMetaData(&metadata); - ASSERT_GT(metadata.size(), size_t(0)); + ASSERT_GT(metadata.size(), 0); std::string filename = dbname_ + metadata[0].name; delete db_; @@ -580,25 +598,326 @@ if (iter == 0) { // corrupt file size std::unique_ptr file; - env_.NewWritableFile(filename, &file, EnvOptions()); - file->Append(Slice("corrupted sst")); + ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions())); + ASSERT_OK(file->Append(Slice("corrupted sst"))); file.reset(); Status x = TryReopen(&options); ASSERT_TRUE(x.IsCorruption()); } else { // delete the file - env_.DeleteFile(filename); + ASSERT_OK(env_->DeleteFile(filename)); Status x = TryReopen(&options); - ASSERT_TRUE(x.IsPathNotFound()); + ASSERT_TRUE(x.IsCorruption()); + } + + ASSERT_OK(DestroyDB(dbname_, options_)); + } +} + +static const auto& corruption_modes = { + mock::MockTableFactory::kCorruptNone, mock::MockTableFactory::kCorruptKey, + mock::MockTableFactory::kCorruptValue, + mock::MockTableFactory::kCorruptReorderKey}; + +TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + db_ = nullptr; + s = DestroyDB(dbname_, options); + ASSERT_OK(s); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + mock->SetCorruptionMode(mode); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10); + s = 
db_->Flush(FlushOptions()); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { + Options options; + options.env = env_; + options.paranoid_file_checks = true; + options.create_if_missing = true; + options.check_flush_compaction_key_order = false; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + db_ = nullptr; + s = DestroyDB(dbname_, options); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(100, 2); + // ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + mock->SetCorruptionMode(mode); + s = dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string start, end; + assert(db_ != nullptr); // suppress false clang-analyze report + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + Build(10); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = 
static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); } + db_->ReleaseSnapshot(snap); + } +} - DestroyDB(dbname_, options_); +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10, 0, 0); + std::string start, end; + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(5, &start), Key(15, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(12, &start), Key(17, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(4, &end))); + Build(10, 10, 0); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); } } +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + std::string start, end; + Build(10); + 
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(6, &start), Key(8, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { + Options options; + options.env = env_; + options.create_if_missing = true; + options.allow_data_in_errors = true; + auto mode = mock::MockTableFactory::kCorruptKey; + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + + std::shared_ptr mock = + std::make_shared(); + mock->SetCorruptionMode(mode); + options.table_factory = mock; + + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(100, 2); + + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + Status s = dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(CorruptionTest, CompactionKeyOrderCheck) { + Options options; + options.env = env_; + options.paranoid_file_checks = false; + options.create_if_missing = true; + options.check_flush_compaction_key_order = false; + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey); + Build(100, 2); + 
DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone); + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); + ASSERT_NOK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); +} + +TEST_F(CorruptionTest, FlushKeyOrderCheck) { + Options options; + options.env = env_; + options.paranoid_file_checks = false; + options.create_if_missing = true; + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); + + ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1")); + + int cnt = 0; + // Generate some out of order keys from the memtable + SyncPoint::GetInstance()->SetCallBack( + "MemTableIterator::Next:0", [&](void* arg) { + MemTableRep::Iterator* mem_iter = + static_cast(arg); + if (++cnt == 3) { + mem_iter->Prev(); + mem_iter->Prev(); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Status s = static_cast_with_check(db_)->TEST_FlushMemTable(); + ASSERT_NOK(s); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(CorruptionTest, DisableKeyOrderCheck) { + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}})); + DBImpl* dbi = static_cast_with_check(db_); + + SyncPoint::GetInstance()->SetCallBack( + "OutputValidator::Add:order_check", + [&](void* /*arg*/) { ASSERT_TRUE(false); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1")); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1")); + ASSERT_OK(dbi->TEST_FlushMemTable()); + 
ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(CorruptionTest, VerifyWholeTableChecksum) { + CloseDb(); + Options options; + options.env = env_; + ASSERT_OK(DestroyDB(dbname_, options)); + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(&options); + + Build(10, 5); + + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + CloseDb(); + + // Corrupt the first byte of each table file, this must be data block. + Corrupt(kTableFile, 0, 1); + + ASSERT_OK(TryReopen(&options)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + auto* s = reinterpret_cast(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "table/cuckoo/cuckoo_table_factory.h" @@ -13,6 +14,7 @@ 
#include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -46,9 +48,7 @@ return options; } - DBImpl* dbfull() { - return reinterpret_cast(db_); - } + DBImpl* dbfull() { return static_cast_with_check(db_); } // The following util methods are copied from plain_table_db_test. void Reopen(Options* options = nullptr) { @@ -64,6 +64,15 @@ ASSERT_OK(DB::Open(opts, dbname_, &db_)); } + void DestroyAndReopen(Options* options) { + assert(options); + ASSERT_OK(db_->Close()); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + Reopen(options); + } + Status Put(const Slice& k, const Slice& v) { return db_->Put(WriteOptions(), k, v); } @@ -86,8 +95,8 @@ int NumTableFilesAtLevel(int level) { std::string property; - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + EXPECT_TRUE(db_->GetProperty("rocksdb.num-files-at-level" + ToString(level), + &property)); return atoi(property.c_str()); } @@ -121,10 +130,11 @@ ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(3U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); @@ -138,9 +148,10 @@ ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key5", "v5")); ASSERT_OK(Put("key6", "v6")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(2U, ptc.size()); auto row = ptc.begin(); 
ASSERT_EQ(3U, row->second->num_entries); @@ -156,8 +167,9 @@ ASSERT_OK(Delete("key6")); ASSERT_OK(Delete("key5")); ASSERT_OK(Delete("key4")); - dbfull()->TEST_FlushMemTable(); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(3U, ptc.size()); row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -178,10 +190,11 @@ ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key1", "v3")); // Duplicate - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(2U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); @@ -206,12 +219,12 @@ TEST_F(CuckooTableDBTest, Uint64Comparator) { Options options = CurrentOptions(); options.comparator = test::Uint64Comparator(); - Reopen(&options); + DestroyAndReopen(&options); ASSERT_OK(Put(Uint64Key(1), "v1")); ASSERT_OK(Put(Uint64Key(2), "v2")); ASSERT_OK(Put(Uint64Key(3), "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("v2", Get(Uint64Key(2))); @@ -220,10 +233,10 @@ // Add more keys. ASSERT_OK(Delete(Uint64Key(2))); // Delete. - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. 
ASSERT_OK(Put(Uint64Key(4), "v4")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2))); ASSERT_EQ("v0", Get(Uint64Key(3))); @@ -243,11 +256,11 @@ for (int idx = 0; idx < 28; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */)); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 28; ++idx) { ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); @@ -266,15 +279,15 @@ for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a'))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); // Generate one more file in level-0, and should trigger level-0 compaction for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel()); for (int idx = 0; idx < 11; ++idx) { @@ -295,7 +308,7 @@ ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using plain table. 
std::shared_ptr block_based_factory( @@ -311,7 +324,7 @@ Reopen(&options); ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key1", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using block based table. options.table_factory.reset(NewAdaptiveTableFactory( @@ -320,7 +333,7 @@ Reopen(&options); ASSERT_OK(Put("key5", "v6")); ASSERT_OK(Put("key2", "v7")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("key1")); ASSERT_EQ("v7", Get("key2")); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_basic_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,30 +6,44 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + #include "db/db_test_util.h" +#include "options/options_helper.h" #include "port/stack_trace.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/perf_context.h" +#include "rocksdb/table.h" #include "rocksdb/utilities/debug.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" #endif +#include "util/file_checksum_helper.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" namespace ROCKSDB_NAMESPACE { class DBBasicTest : public DBTestBase { public: - DBBasicTest() : DBTestBase("/db_basic_test") {} + DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {} }; TEST_F(DBBasicTest, OpenWhenOpen) { Options options = CurrentOptions(); options.env = env_; - ROCKSDB_NAMESPACE::DB* db2 = nullptr; - ROCKSDB_NAMESPACE::Status s = DB::Open(options, dbname_, &db2); - + DB* db2 = nullptr; + Status s = DB::Open(options, dbname_, &db2); + ASSERT_NOK(s) << [db2]() { + delete db2; + return "db2 open: ok"; + }(); ASSERT_EQ(Status::Code::kIOError, s.code()); ASSERT_EQ(Status::SubCode::kNone, s.subcode()); ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr); @@ -37,6 +51,62 @@ delete db2; } +TEST_F(DBBasicTest, UniqueSession) { + Options options = CurrentOptions(); + std::string sid1, sid2, sid3, sid4; + + ASSERT_OK(db_->GetDbSessionId(sid1)); + Reopen(options); + ASSERT_OK(db_->GetDbSessionId(sid2)); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(db_->GetDbSessionId(sid4)); + Reopen(options); + ASSERT_OK(db_->GetDbSessionId(sid3)); + + ASSERT_NE(sid1, sid2); + ASSERT_NE(sid1, sid3); + ASSERT_NE(sid2, sid3); + + ASSERT_EQ(sid2, sid4); + + // Expected compact format for session ids (see notes in implementation) + TestRegex 
expected("[0-9A-Z]{20}"); + EXPECT_MATCHES_REGEX(sid1, expected); + EXPECT_MATCHES_REGEX(sid2, expected); + EXPECT_MATCHES_REGEX(sid3, expected); + +#ifndef ROCKSDB_LITE + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(db_->GetDbSessionId(sid1)); + // Test uniqueness between readonly open (sid1) and regular open (sid3) + ASSERT_NE(sid1, sid3); + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(db_->GetDbSessionId(sid2)); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->GetDbSessionId(sid3)); + + ASSERT_NE(sid1, sid2); + + ASSERT_EQ(sid2, sid3); +#endif // ROCKSDB_LITE + + CreateAndReopenWithCF({"goku"}, options); + ASSERT_OK(db_->GetDbSessionId(sid1)); + ASSERT_OK(Put("bar", "e1")); + ASSERT_OK(db_->GetDbSessionId(sid2)); + ASSERT_EQ("e1", Get("bar")); + ASSERT_OK(db_->GetDbSessionId(sid3)); + ReopenWithColumnFamilies({"default", "goku"}, options); + ASSERT_OK(db_->GetDbSessionId(sid4)); + + ASSERT_EQ(sid1, sid2); + ASSERT_EQ(sid2, sid3); + + ASSERT_NE(sid1, sid4); +} + #ifndef ROCKSDB_LITE TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); @@ -44,29 +114,46 @@ ASSERT_OK(Put("foo", "v3")); Close(); + auto verify_one_iter = [&](Iterator* iter) { + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } + // Always expect two keys: "foo" and "bar" + ASSERT_EQ(count, 2); + }; + + auto verify_all_iters = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + verify_one_iter(iter); + delete iter; + + std::vector iters; + ASSERT_OK(db_->NewIterators(ReadOptions(), + {dbfull()->DefaultColumnFamily()}, &iters)); + ASSERT_EQ(static_cast(1), iters.size()); + verify_one_iter(iters[0]); + delete iters[0]; + }; + auto options = CurrentOptions(); assert(options.env == env_); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); - Iterator* iter = db_->NewIterator(ReadOptions()); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); 
iter->Next()) { - ASSERT_OK(iter->status()); - ++count; - } - ASSERT_EQ(count, 2); - delete iter; + verify_all_iters(); Close(); // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); + verify_all_iters(); ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); } @@ -81,7 +168,7 @@ assert(options.env == env_); ASSERT_OK(ReadOnlyReopen(options)); std::string db_id1; - db_->GetDbIdentity(db_id1); + ASSERT_OK(db_->GetDbIdentity(db_id1)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); Iterator* iter = db_->NewIterator(ReadOptions()); @@ -96,7 +183,7 @@ // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -104,7 +191,7 @@ ASSERT_EQ("v2", Get("bar")); ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); std::string db_id2; - db_->GetDbIdentity(db_id2); + ASSERT_OK(db_->GetDbIdentity(db_id2)); ASSERT_EQ(db_id1, db_id2); } @@ -119,7 +206,7 @@ Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); Status s = Put("new", "value"); @@ -137,12 +224,12 @@ Reopen(options); // Add more L0 files ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); @@ -159,7 +246,7 @@ ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h'))); ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i'))); ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j'))); - 
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(3, NumTableFilesAtLevel(1)); Close(); @@ -217,8 +304,8 @@ int i = 0; while (NumTableFilesAtLevel(2, 1) == 0) { ASSERT_OK(Put(1, Key(i++), value)); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } options.num_levels = 1; @@ -272,8 +359,8 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "a", Slice()); - SingleDelete(1, "a"); + ASSERT_OK(Put(1, "a", Slice())); + ASSERT_OK(SingleDelete(1, "a")); ASSERT_OK(Flush(1)); ASSERT_EQ("[ ]", AllEntriesFor("a", 1)); @@ -319,12 +406,19 @@ TEST_F(DBBasicTest, CheckLock) { do { - DB* localdb; + DB* localdb = nullptr; Options options = CurrentOptions(); ASSERT_OK(TryReopen(options)); // second open should fail - ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); + Status s = DB::Open(options, dbname_, &localdb); + ASSERT_NOK(s) << [localdb]() { + delete localdb; + return "localdb open: ok"; + }(); +#ifdef OS_LINUX + ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos); +#endif // OS_LINUX } while (ChangeCompactOptions()); } @@ -392,7 +486,7 @@ sleeping_task_low.WaitUntilDone(); } -TEST_F(DBBasicTest, FLUSH) { +TEST_F(DBBasicTest, Flush) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); WriteOptions writeOpt = WriteOptions(); @@ -513,29 +607,30 @@ #ifndef ROCKSDB_LITE TEST_F(DBBasicTest, Snapshot) { + env_->SetMockSleep(); anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - Put(0, "foo", "0v1"); - Put(1, "foo", "1v1"); + ASSERT_OK(Put(0, "foo", "0v1")); + ASSERT_OK(Put(1, "foo", "1v1")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_EQ(1U, GetNumSnapshots()); uint64_t time_snap1 = 
GetTimeOldestSnapshots(); ASSERT_GT(time_snap1, 0U); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v2"); - Put(1, "foo", "1v2"); + ASSERT_OK(Put(0, "foo", "0v2")); + ASSERT_OK(Put(1, "foo", "1v2")); - env_->addon_time_.fetch_add(1); + env_->MockSleepForSeconds(1); const Snapshot* s2 = db_->GetSnapshot(); ASSERT_EQ(2U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v3"); - Put(1, "foo", "1v3"); + ASSERT_OK(Put(0, "foo", "0v3")); + ASSERT_OK(Put(1, "foo", "1v3")); { ManagedSnapshot s3(db_); @@ -543,8 +638,8 @@ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v4"); - Put(1, "foo", "1v4"); + ASSERT_OK(Put(0, "foo", "0v4")); + ASSERT_OK(Put(1, "foo", "1v4")); ASSERT_EQ("0v1", Get(0, "foo", s1)); ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); @@ -584,60 +679,79 @@ #endif // ROCKSDB_LITE -TEST_F(DBBasicTest, CompactBetweenSnapshots) { +class DBBasicMultiConfigs : public DBBasicTest, + public ::testing::WithParamInterface { + public: + DBBasicMultiConfigs() { option_config_ = GetParam(); } + + static std::vector GenerateOptionConfigs() { + std::vector option_configs; + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) { + option_configs.push_back(option_config); + } + } + return option_configs; + } +}; + +TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, options); - Random rnd(301); - FillLevels("a", "z", 1); + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + 
DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + FillLevels("a", "z", 1); - Put(1, "foo", "first"); - const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "second"); - Put(1, "foo", "third"); - Put(1, "foo", "fourth"); - const Snapshot* snapshot2 = db_->GetSnapshot(); - Put(1, "foo", "fifth"); - Put(1, "foo", "sixth"); - - // All entries (including duplicates) exist - // before any compaction or flush is triggered. - ASSERT_EQ(AllEntriesFor("foo", 1), - "[ sixth, fifth, fourth, third, second, first ]"); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ("first", Get(1, "foo", snapshot1)); + ASSERT_OK(Put(1, "foo", "first")); + const Snapshot* snapshot1 = db_->GetSnapshot(); + ASSERT_OK(Put(1, "foo", "second")); + ASSERT_OK(Put(1, "foo", "third")); + ASSERT_OK(Put(1, "foo", "fourth")); + const Snapshot* snapshot2 = db_->GetSnapshot(); + ASSERT_OK(Put(1, "foo", "fifth")); + ASSERT_OK(Put(1, "foo", "sixth")); + + // All entries (including duplicates) exist + // before any compaction or flush is triggered. + ASSERT_EQ(AllEntriesFor("foo", 1), + "[ sixth, fifth, fourth, third, second, first ]"); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ("first", Get(1, "foo", snapshot1)); - // After a flush, "second", "third" and "fifth" should - // be removed - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); + // After a flush, "second", "third" and "fifth" should + // be removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); - // after we release the snapshot1, only two values left - db_->ReleaseSnapshot(snapshot1); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - - // We have only one valid snapshot snapshot2. 
Since snapshot1 is - // not valid anymore, "first" should be removed by a compaction. - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); - - // after we release the snapshot2, only one value should be left - db_->ReleaseSnapshot(snapshot2); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); - } while (ChangeOptions(kSkipFIFOCompaction)); + // after we release the snapshot1, only two values left + db_->ReleaseSnapshot(snapshot1); + FillLevels("a", "z", 1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + + // We have only one valid snapshot snapshot2. Since snapshot1 is + // not valid anymore, "first" should be removed by a compaction. + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); + + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z", 1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); } +INSTANTIATE_TEST_CASE_P( + DBBasicMultiConfigs, DBBasicMultiConfigs, + ::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs())); + TEST_F(DBBasicTest, DBOpen_Options) { Options options = CurrentOptions(); Close(); @@ -685,18 +799,18 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); + ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); // Write two new keys - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + 
ASSERT_OK(Flush(1)); // Case1: Delete followed by a put - Delete(1, "foo"); - Put(1, "foo", "v2"); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); // After the current memtable is flushed, the DEL should @@ -704,66 +818,66 @@ ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); // Case 2: Delete followed by another delete - Delete(1, "foo"); - Delete(1, "foo"); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 3: Put followed by a delete - Put(1, "foo", "v3"); - Delete(1, "foo"); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 4: Put followed by another Put - Put(1, "foo", "v4"); - Put(1, "foo", "v5"); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_OK(Put(1, "foo", "v5")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, 
nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: Put followed by snapshot followed by another Put // Both puts should remain. - Put(1, "foo", "v6"); + ASSERT_OK(Put(1, "foo", "v6")); const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "v7"); + ASSERT_OK(Put(1, "foo", "v7")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); db_->ReleaseSnapshot(snapshot); // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: snapshot followed by a put followed by another Put // Only the last put should remain. 
const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "v8"); - Put(1, "foo", "v9"); + ASSERT_OK(Put(1, "foo", "v8")); + ASSERT_OK(Put(1, "foo", "v9")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); db_->ReleaseSnapshot(snapshot1); @@ -786,7 +900,7 @@ ASSERT_OK(Put(7, "popovich", "popovich")); for (int i = 0; i < 8; ++i) { - Flush(i); + ASSERT_OK(Flush(i)); auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), i + 1U); } @@ -859,16 +973,24 @@ } while (ChangeCompactOptions()); } -TEST_F(DBBasicTest, ChecksumTest) { +class DBBlockChecksumTest : public DBBasicTest, + public testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); + +TEST_P(DBBlockChecksumTest, BlockChecksumTest) { BlockBasedTableOptions table_options; + table_options.format_version = GetParam(); Options options = CurrentOptions(); - // change when new checksum type added - int max_checksum = static_cast(kxxHash64); const int kNumPerFile = 2; + const auto algs = GetSupportedChecksums(); + const int algs_size = static_cast(algs.size()); + // generate one table with each type of checksum - for (int i = 0; i <= max_checksum; ++i) { - table_options.checksum = static_cast(i); + for (int i = 0; i < algs_size; ++i) { + table_options.checksum = algs[i]; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); for (int j = 0; j < kNumPerFile; ++j) { @@ -878,15 +1000,20 @@ } // with each valid checksum type setting... 
- for (int i = 0; i <= max_checksum; ++i) { - table_options.checksum = static_cast(i); + for (int i = 0; i < algs_size; ++i) { + table_options.checksum = algs[i]; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); // verify every type of checksum (should be regardless of that setting) - for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) { + for (int j = 0; j < algs_size * kNumPerFile; ++j) { ASSERT_EQ(Key(j), Get(Key(j))); } } + + // Now test invalid checksum type + table_options.checksum = static_cast(123); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); } // On Windows you can have either memory mapped file or a file @@ -919,44 +1046,46 @@ #endif class TestEnv : public EnvWrapper { - public: - explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {} + public: + explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {} + static const char* kClassName() { return "TestEnv"; } + const char* Name() const override { return kClassName(); } - class TestLogger : public Logger { - public: - using Logger::Logv; - explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } - ~TestLogger() override { - if (!closed_) { - CloseHelper(); - } - } - void Logv(const char* /*format*/, va_list /*ap*/) override {} - - protected: - Status CloseImpl() override { return CloseHelper(); } - - private: - Status CloseHelper() { - env->CloseCountInc(); - ; - return Status::IOError(); - } - TestEnv* env; - }; - - void CloseCountInc() { close_count++; } - - int GetCloseCount() { return close_count; } - - Status NewLogger(const std::string& /*fname*/, - std::shared_ptr* result) override { - result->reset(new TestLogger(this)); - return Status::OK(); + class TestLogger : public Logger { + public: + using Logger::Logv; + explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + 
CloseHelper().PermitUncheckedError(); + } } + void Logv(const char* /*format*/, va_list /*ap*/) override {} + + protected: + Status CloseImpl() override { return CloseHelper(); } private: - int close_count; + Status CloseHelper() { + env->CloseCountInc(); + ; + return Status::IOError(); + } + TestEnv* env; + }; + + void CloseCountInc() { close_count++; } + + int GetCloseCount() { return close_count; } + + Status NewLogger(const std::string& /*fname*/, + std::shared_ptr* result) override { + result->reset(new TestLogger(this)); + return Status::OK(); + } + + private: + int close_count; }; TEST_F(DBBasicTest, DBClose) { @@ -1008,7 +1137,7 @@ Options options = GetDefaultOptions(); options.create_if_missing = true; options.manual_wal_flush = true; - options.write_buffer_size=100; + options.write_buffer_size = 100; options.env = fault_injection_env.get(); Reopen(options); @@ -1018,9 +1147,15 @@ ASSERT_OK(Put("key3", "value3")); fault_injection_env->SetFilesystemActive(false); Status s = dbfull()->Close(); + ASSERT_NE(s, Status::OK()); + // retry should return the same error + s = dbfull()->Close(); + ASSERT_NE(s, Status::OK()); fault_injection_env->SetFilesystemActive(true); + // retry close() is no-op even the system is back. 
Could be improved if + // Close() is retry-able: #9029 + s = dbfull()->Close(); ASSERT_NE(s, Status::OK()); - Destroy(options); } @@ -1048,7 +1183,7 @@ } int get_sv_count = 0; - ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast(db_); + ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { @@ -1066,7 +1201,7 @@ } if (get_sv_count == 11) { for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( + auto* cfd = static_cast_with_check( db->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); @@ -1117,9 +1252,10 @@ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2"); for (int cf = 0; cf < 8; ++cf) { - auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(cf)) - ->cfd(); + auto* cfd = + static_cast_with_check( + static_cast_with_check(db_)->GetColumnFamilyHandle(cf)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); } @@ -1179,9 +1315,10 @@ "cf" + std::to_string(j) + "_val" + std::to_string(retries)); } for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(i)) - ->cfd(); + auto* cfd = + static_cast_with_check( + static_cast_with_check(db_)->GetColumnFamilyHandle(i)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } } @@ -1198,7 +1335,7 @@ } int get_sv_count = 0; - ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast(db_); + ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { @@ -1210,7 +1347,7 @@ } if (get_sv_count == 8) { for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( + auto* cfd = static_cast_with_check( 
db->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_TRUE( @@ -1238,13 +1375,36 @@ ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val"); } for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(i)) - ->cfd(); + auto* cfd = + static_cast_with_check( + static_cast_with_check(db_)->GetColumnFamilyHandle(i)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } } +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"one", "two"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(2, "baz", "xyz")); + ASSERT_OK(Put(1, "abc", "def")); + + // Note: keys for the same CF do not form a consecutive range + std::vector cfs{1, 2, 1}; + std::vector keys{"foo", "baz", "abc"}; + std::vector values; + + values = + MultiGet(cfs, keys, /* snapshot */ nullptr, /* batched */ GetParam()); + + ASSERT_EQ(values.size(), 3); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "xyz"); + ASSERT_EQ(values[2], "def"); +} + INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam, testing::Bool()); @@ -1289,14 +1449,18 @@ } while (ChangeCompactOptions()); } -TEST_F(DBBasicTest, MultiGetBatchedSimpleSorted) { +TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); SetPerfLevel(kEnableCount); + // To expand the power of this test, generate > 1 table file and + // mix with memtable ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k5", "v5")); ASSERT_OK(Delete(1, "no_key")); @@ -1327,7 +1491,58 @@ ASSERT_TRUE(s[5].IsNotFound()); SetPerfLevel(kDisable); - } while (ChangeCompactOptions()); + } while (ChangeOptions()); +} + +TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) { + Options opts = CurrentOptions(); 
+ opts.merge_operator = MergeOperators::CreateStringAppendOperator(); + CreateAndReopenWithCF({"pikachu"}, opts); + SetPerfLevel(kEnableCount); + // To expand the power of this test, generate > 1 table file and + // mix with memtable + ASSERT_OK(Merge(1, "k1", "v1")); + ASSERT_OK(Merge(1, "k2", "v2")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k3", "v3")); + ASSERT_OK(Merge(1, "k4", "v4")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k4", "v4_2")); + ASSERT_OK(Merge(1, "k6", "v6")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k7", "v7")); + ASSERT_OK(Merge(1, "k8", "v8")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + get_perf_context()->Reset(); + + std::vector keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(), + values.data(), s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8"); + ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1"); + ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3"); + ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes); + + for (Status& status : s) { + ASSERT_OK(status); + } + + SetPerfLevel(kDisable); } TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { @@ -1340,12 +1555,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys 
= 0; } MoveFilesToLevel(2); @@ -1354,12 +1569,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1368,12 +1583,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1419,12 +1634,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1433,12 +1648,12 @@ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1447,18 +1662,19 @@ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); for (int i = 0; i < 128; i += 9) { - ASSERT_OK(Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + ASSERT_OK( + Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); } std::vector keys; @@ -1490,6 +1706,310 @@ } } +TEST_F(DBBasicTest, MultiGetBatchedValueSizeInMemory) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); + ASSERT_OK(Put(1, "k1", "v_1")); + ASSERT_OK(Put(1, "k2", "v_2")); + ASSERT_OK(Put(1, "k3", "v_3")); + ASSERT_OK(Put(1, "k4", "v_4")); + ASSERT_OK(Put(1, "k5", "v_5")); + ASSERT_OK(Put(1, "k6", "v_6")); + 
std::vector keys = {"k1", "k2", "k3", "k4", "k5", "k6"}; + std::vector values(keys.size()); + std::vector s(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + + get_perf_context()->Reset(); + ReadOptions ro; + ro.value_size_soft_limit = 11; + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + for (unsigned int i = 0; i < 4; i++) { + ASSERT_EQ(std::string(values[i].data(), values[i].size()), + "v_" + std::to_string(i + 1)); + } + + for (unsigned int i = 4; i < 6; i++) { + ASSERT_TRUE(s[i].IsAborted()); + } + + ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes); + SetPerfLevel(kDisable); +} + +TEST_F(DBBasicTest, MultiGetBatchedValueSize) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); + + ASSERT_OK(Put(1, "k6", "v6")); + ASSERT_OK(Put(1, "k7", "v7_")); + ASSERT_OK(Put(1, "k3", "v3_")); + ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Delete(1, "k4")); + ASSERT_OK(Put(1, "k11", "v11")); + ASSERT_OK(Delete(1, "no_key")); + ASSERT_OK(Put(1, "k8", "v8_")); + ASSERT_OK(Put(1, "k13", "v13")); + ASSERT_OK(Put(1, "k14", "v14")); + ASSERT_OK(Put(1, "k15", "v15")); + ASSERT_OK(Put(1, "k16", "v16")); + ASSERT_OK(Put(1, "k17", "v17")); + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "k1", "v1_")); + ASSERT_OK(Put(1, "k2", "v2_")); + ASSERT_OK(Put(1, "k5", "v5_")); + ASSERT_OK(Put(1, "k9", "v9_")); + ASSERT_OK(Put(1, "k10", "v10")); + ASSERT_OK(Delete(1, "k2")); + ASSERT_OK(Delete(1, "k6")); + + get_perf_context()->Reset(); + + std::vector keys({"k1", "k10", "k11", "k12", "k13", "k14", "k15", + "k16", "k17", "k2", "k3", "k4", "k5", "k6", "k7", + "k8", "k9", "no_key"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + ro.value_size_soft_limit = 20; + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), 
false); + + ASSERT_EQ(values.size(), keys.size()); + + // In memory keys + ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10"); + ASSERT_TRUE(s[9].IsNotFound()); // k2 + ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_"); + ASSERT_TRUE(s[13].IsNotFound()); // k6 + ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_"); + + // In sst files + ASSERT_EQ(std::string(values[2].data(), values[1].size()), "v11"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13"); + ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14"); + + // Remaining aborted after value_size exceeds. + ASSERT_TRUE(s[3].IsAborted()); + ASSERT_TRUE(s[6].IsAborted()); + ASSERT_TRUE(s[7].IsAborted()); + ASSERT_TRUE(s[8].IsAborted()); + ASSERT_TRUE(s[10].IsAborted()); + ASSERT_TRUE(s[11].IsAborted()); + ASSERT_TRUE(s[14].IsAborted()); + ASSERT_TRUE(s[15].IsAborted()); + ASSERT_TRUE(s[17].IsAborted()); + + // 6 kv pairs * 3 bytes per value (i.e. 
18) + ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes); + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + int num_keys = 0; + + for (int i = 0; i < 64; ++i) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(2); + + for (int i = 0; i < 64; i += 3) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(1); + + for (int i = 0; i < 64; i += 5) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + ASSERT_EQ(0, num_keys); + + for (int i = 0; i < 64; i += 9) { + ASSERT_OK( + Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + } + + std::vector keys_str; + for (int i = 10; i < 50; ++i) { + keys_str.push_back("key_" + std::to_string(i)); + } + + std::vector keys(keys_str.size()); + for (int i = 0; i < 40; i++) { + keys[i] = Slice(keys_str[i]); + } + + std::vector values(keys_str.size()); + std::vector statuses(keys_str.size()); + ReadOptions read_options; + read_options.verify_checksums = true; + read_options.value_size_soft_limit = 380; + db_->MultiGet(read_options, 
dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + + ASSERT_EQ(values.size(), keys.size()); + + for (unsigned int j = 0; j < 26; ++j) { + int key = j + 10; + std::string value; + value.append("val_l2_" + std::to_string(key)); + if (key % 3 == 0) { + value.append(","); + value.append("val_l1_" + std::to_string(key)); + } + if (key % 5 == 0) { + value.append(","); + value.append("val_l0_" + std::to_string(key)); + } + if (key % 9 == 0) { + value.append(","); + value.append("val_mem_" + std::to_string(key)); + } + ASSERT_EQ(values[j], value); + ASSERT_OK(statuses[j]); + } + + // All remaning keys status is set Status::Abort + for (unsigned int j = 26; j < 40; j++) { + ASSERT_TRUE(statuses[j].IsAborted()); + } +} + +TEST_F(DBBasicTest, MultiGetStats) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + int total_keys = 2000; + std::vector keys_str(total_keys); + std::vector keys(total_keys); + static size_t kMultiGetBatchSize = 100; + std::vector values(kMultiGetBatchSize); + std::vector s(kMultiGetBatchSize); + ReadOptions read_opts; + + Random rnd(309); + // Create Multiple SST files at multiple levels. 
+ for (int i = 0; i < 500; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + ASSERT_OK(Flush(1)); + } + } + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + for (int i = 501; i < 1000; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + ASSERT_OK(Flush(1)); + } + } + + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + for (int i = 1001; i < total_keys; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + ASSERT_OK(Flush(1)); + } + } + ASSERT_OK(Flush(1)); + MoveFilesToLevel(1, 1); + Close(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(options.statistics->Reset()); + + db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250], + values.data(), s.data(), false); + + ASSERT_EQ(values.size(), kMultiGetBatchSize); + HistogramData hist_data_blocks; + HistogramData hist_index_and_filter_blocks; + HistogramData hist_sst; + + options.statistics->histogramData(NUM_DATA_BLOCKS_READ_PER_LEVEL, + &hist_data_blocks); + options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + &hist_index_and_filter_blocks); + options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst); + + // Maximum number of blocks read from a file system in a level. + ASSERT_EQ(hist_data_blocks.max, 32); + ASSERT_GT(hist_index_and_filter_blocks.max, 0); + // Maximum number of sst files read from file system in a level. + ASSERT_EQ(hist_sst.max, 2); + + // Minimun number of blocks read in a level. + ASSERT_EQ(hist_data_blocks.min, 4); + ASSERT_GT(hist_index_and_filter_blocks.min, 0); + // Minimun number of sst files read in a level. 
+ ASSERT_EQ(hist_sst.min, 1); +} + // Test class for batched MultiGet with prefix extractor // Param bool - If true, use partitioned filters // If false, use full filter block @@ -1565,11 +2085,11 @@ ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k5", "v5")); const Snapshot* snap1 = dbfull()->GetSnapshot(); ASSERT_OK(Delete(1, "k4")); - Flush(1); + ASSERT_OK(Flush(1)); const Snapshot* snap2 = dbfull()->GetSnapshot(); get_perf_context()->Reset(); @@ -1674,13 +2194,13 @@ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); // Check non-default column family - for (size_t i = 0; i != kNumInserts - 1; ++i) { + for (size_t i = 0; i + 1 != kNumInserts; ++i) { ASSERT_OK(Put(1, std::to_string(i), "value")); } - for (size_t i = 0; i != kNumUpdates - 1; ++i) { + for (size_t i = 0; i + 1 != kNumUpdates; ++i) { ASSERT_OK(Put(1, std::to_string(i), "value1")); } - for (size_t i = 0; i != kNumDeletes - 1; ++i) { + for (size_t i = 0; i + 1 != kNumDeletes; ++i) { ASSERT_OK(Delete(1, std::to_string(i))); } ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions( @@ -1696,19 +2216,19 @@ BlockBasedTableOptions table_options; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.block_size = 16 * 1024; - assert(table_options.block_size > - BlockBasedTable::kMultiGetReadStackBufSize); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + ASSERT_TRUE(table_options.block_size > + BlockBasedTable::kMultiGetReadStackBufSize); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string zero_str(128, '\0'); for (int i = 0; i < 100; ++i) { // Make the value compressible. 
A purely random string doesn't compress // and the resultant data block will not be compressed - std::string value(RandomString(&rnd, 128) + zero_str); + std::string value(rnd.RandomString(128) + zero_str); assert(Put(Key(i), value) == Status::OK()); } - Flush(); + ASSERT_OK(Flush()); std::vector key_data(10); std::vector keys; @@ -1729,15 +2249,451 @@ keys.data(), values.data(), statuses.data(), true); } -class DBBasicTestWithParallelIO - : public DBTestBase, - public testing::WithParamInterface> { +TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t cf = 0; cf != num_cfs; ++cf) { + for (size_t i = 0; i != 10000; ++i) { + std::string key_str = Key(static_cast(i)); + std::string value_str = std::to_string(cf) + "_" + std::to_string(i); + + ASSERT_OK(Put(static_cast(cf), key_str, value_str)); + if (0 == (i % 1000)) { + ASSERT_OK(Flush(static_cast(cf))); + } + } + } + for (size_t cf = 0; cf != num_cfs; ++cf) { + ASSERT_OK(Flush(static_cast(cf))); + } + Close(); + options.best_efforts_recovery = true; + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + for (size_t cf = 0; cf != num_cfs; ++cf) { + for (int i = 0; i != 10000; ++i) { + std::string key_str = Key(static_cast(i)); + std::string expected_value_str = + std::to_string(cf) + "_" + std::to_string(i); + ASSERT_EQ(expected_value_str, Get(static_cast(cf), key_str)); + } + } +} + +TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + 
SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + options.best_efforts_recovery = true; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +#ifndef ROCKSDB_LITE +namespace { +class TableFileListener : public EventListener { public: - DBBasicTestWithParallelIO() : DBTestBase("/db_basic_test_with_parallel_io") { - bool compressed_cache = std::get<0>(GetParam()); - bool uncompressed_cache = std::get<1>(GetParam()); - compression_enabled_ = std::get<2>(GetParam()); - fill_cache_ = std::get<3>(GetParam()); + void OnTableFileCreated(const TableFileCreationInfo& info) override { + InstrumentedMutexLock lock(&mutex_); + cf_to_paths_[info.cf_name].push_back(info.file_path); + } + std::vector& GetFiles(const std::string& cf_name) { + InstrumentedMutexLock lock(&mutex_); + return cf_to_paths_[cf_name]; + } + + private: + InstrumentedMutex mutex_; + std::unordered_map> cf_to_paths_; +}; +} // namespace + +TEST_F(DBBasicTest, LastSstFileNotInManifest) { + // If the last sst file is not tracked in MANIFEST, + // or the VersionEdit for the last sst file is not synced, + // on recovery, the last sst file should be deleted, + // and new sst files shouldn't reuse its file number. + Options options = CurrentOptions(); + DestroyAndReopen(options); + Close(); + + // Manually add a sst file. 
+ constexpr uint64_t kSstFileNumber = 100; + const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); + ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + /* fname = */ kSstFile, + /* should_sync = */ true)); + ASSERT_OK(env_->FileExists(kSstFile)); + + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + Reopen(options); + // kSstFile should already be deleted. + ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound()); + + ASSERT_OK(Put("k", "v")); + ASSERT_OK(Flush()); + // New sst file should have file number > kSstFileNumber. + std::vector& files = + listener->GetFiles(kDefaultColumnFamilyName); + ASSERT_EQ(files.size(), 1); + const std::string fname = files[0].erase(0, (dbname_ + "/").size()); + uint64_t number = 0; + FileType type = kTableFile; + ASSERT_TRUE(ParseFileName(fname, &number, &type)); + ASSERT_EQ(type, kTableFile); + ASSERT_GT(number, kSstFileNumber); +} + +TEST_F(DBBasicTest, RecoverWithMissingFiles) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + TableFileListener* listener = new TableFileListener(); + // Disable auto compaction to simplify SST file name tracking. 
+ options.disable_auto_compactions = true; + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector all_cf_names = {kDefaultColumnFamilyName, "pikachu", + "eevee"}; + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + for (size_t cf = 0; cf != num_cfs; ++cf) { + ASSERT_OK(Put(static_cast(cf), "a", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + ASSERT_OK(Put(static_cast(cf), "b", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + ASSERT_OK(Put(static_cast(cf), "c", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + } + + // Delete and corrupt files + for (size_t i = 0; i < all_cf_names.size(); ++i) { + std::vector& files = listener->GetFiles(all_cf_names[i]); + ASSERT_EQ(3, files.size()); + std::string corrupted_data; + ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data)); + ASSERT_OK(WriteStringToFile( + env_, corrupted_data.substr(0, corrupted_data.size() - 2), + files[files.size() - 1], /*should_sync=*/true)); + for (int j = static_cast(files.size() - 2); j >= static_cast(i); + --j) { + ASSERT_OK(env_->DeleteFile(files[j])); + } + } + options.best_efforts_recovery = true; + ReopenWithColumnFamilies(all_cf_names, options); + // Verify data + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[1])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[2])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("b", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } +} + 
+TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value0")); + ASSERT_OK(Flush()); + Close(); + { + // Hack by adding a new MANIFEST with high file number + std::string garbage(10, '\0'); + ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000", + /*should_sync=*/true)); + } + { + // Hack by adding a corrupted SST not referenced by any MANIFEST + std::string garbage(10, '\0'); + ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst", + /*should_sync=*/true)); + } + + options.best_efforts_recovery = true; + + Reopen(options); + ASSERT_OK(Put("bar", "value")); +} + +TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + options.best_efforts_recovery = true; + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put(1, "bar", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(Flush(1)); + Close(); + ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_))); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + std::vector cf_names; + ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names)); + ASSERT_EQ(2, cf_names.size()); + for (const auto& name : cf_names) { + ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu"); + } +} + +TEST_F(DBBasicTest, RecoverWithNoManifest) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + Close(); + { + // Delete all MANIFEST. 
+ std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kWalFile; + if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { + ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file)); + } + } + } + options.best_efforts_recovery = true; + options.create_if_missing = false; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsInvalidArgument()); + options.create_if_missing = true; + Reopen(options); + // Since no MANIFEST exists, best-efforts recovery creates a new, empty db. + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"pikachu"}, options); + std::vector kAllCfNames = {kDefaultColumnFamilyName, "pikachu"}; + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + for (int cf = 0; cf < static_cast(kAllCfNames.size()); ++cf) { + ASSERT_OK(Put(cf, "a", "0_value")); + ASSERT_OK(Flush(cf)); + ASSERT_OK(Put(cf, "b", "0_value")); + } + // Delete files + for (size_t i = 0; i < kAllCfNames.size(); ++i) { + std::vector& files = listener->GetFiles(kAllCfNames[i]); + ASSERT_EQ(1, files.size()); + for (int j = static_cast(files.size() - 1); j >= static_cast(i); + --j) { + ASSERT_OK(env_->DeleteFile(files[j])); + } + } + options.best_efforts_recovery = true; + ReopenWithColumnFamilies(kAllCfNames, options); + // Verify WAL is not applied + ReadOptions read_opts; + read_opts.total_order_seek = true; + std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[1])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + 
ASSERT_OK(iter->status()); +} + +TEST_F(DBBasicTest, DisableTrackWal) { + // If WAL tracking was enabled, and then disabled during reopen, + // the previously tracked WALs should be removed from MANIFEST. + + Options options = CurrentOptions(); + options.track_and_verify_wals_in_manifest = true; + // extremely small write buffer size, + // so that new WALs are created more frequently. + options.write_buffer_size = 100; + options.env = env_; + DestroyAndReopen(options); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i))); + } + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->SyncWAL()); + // Some WALs are tracked. + ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Disable WAL tracking. + options.track_and_verify_wals_in_manifest = false; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + // Previously tracked WALs are cleared. + ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Re-enable WAL tracking again. 
+ options.track_and_verify_wals_in_manifest = true; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBasicTest, ManifestChecksumMismatch) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) { + auto* crc = reinterpret_cast(arg); + *crc = *crc + 1; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions write_opts; + write_opts.disableWAL = true; + Status s = db_->Put(write_opts, "foo", "value"); + ASSERT_OK(s); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(Put("foo", "value1")); + ASSERT_OK(Flush()); + s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBBasicTest, ConcurrentlyCloseDB) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + std::vector workers; + for (int i = 0; i < 10; i++) { + workers.push_back(std::thread([&]() { + auto s = db_->Close(); + ASSERT_OK(s); + })); + } + for (auto& w : workers) { + w.join(); + } +} + +#ifndef ROCKSDB_LITE +class DBBasicTestTrackWal : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestTrackWal() + : DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {} + + int CountWalFiles() { + VectorLogPtr log_files; + EXPECT_OK(dbfull()->GetSortedWalFiles(log_files)); + return static_cast(log_files.size()); + }; +}; + +TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) { + // If a WAL becomes obsolete after flushing, but is not deleted from disk yet, + // then if SyncWAL is called afterwards, the obsolete WAL should not be + // tracked in 
MANIFEST. + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.track_and_verify_wals_in_manifest = true; + options.atomic_flush = GetParam(); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf"}, options); + ASSERT_EQ(handles_.size(), 2); // default, cf + // Do not delete WALs. + ASSERT_OK(db_->DisableFileDeletions()); + constexpr int n = 10; + std::vector> wals(n); + for (size_t i = 0; i < n; i++) { + // Generate a new WAL for each key-value. + const int cf = i % 2; + ASSERT_OK(db_->GetCurrentWalFile(&wals[i])); + ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i))); + ASSERT_OK(Flush({0, 1})); + } + ASSERT_EQ(CountWalFiles(), n); + // Since all WALs are obsolete, no WAL should be tracked in MANIFEST. + ASSERT_OK(db_->SyncWAL()); + + // Manually delete all WALs. + Close(); + for (const auto& wal : wals) { + ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber()))); + } + + // If SyncWAL tracks the obsolete WALs in MANIFEST, + // reopen will fail because the WALs are missing from disk. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options)); + Destroy(options); +} + +INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal, + testing::Bool()); +#endif // ROCKSDB_LITE + +class DBBasicTestMultiGet : public DBTestBase { + public: + DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache, + bool uncompressed_cache, bool _compression_enabled, + bool _fill_cache, uint32_t compression_parallel_threads) + : DBTestBase(test_dir, /*env_do_fsync=*/false) { + compression_enabled_ = _compression_enabled; + fill_cache_ = _fill_cache; if (compressed_cache) { std::shared_ptr cache = NewLRUCache(1048576); @@ -1760,10 +2716,17 @@ compression_types = GetSupportedCompressions(); // Not every platform may have compression libraries available, so // dynamically pick based on what's available - if (compression_types.size() == 0) { - compression_enabled_ = false; + CompressionType tmp_type = kNoCompression; + for (auto c_type : compression_types) { + if (c_type != kNoCompression) { + tmp_type = c_type; + break; + } + } + if (tmp_type != kNoCompression) { + options.compression = tmp_type; } else { - options.compression = compression_types[0]; + compression_enabled_ = false; } } #else @@ -1771,7 +2734,7 @@ if (!Snappy_Supported()) { compression_enabled_ = false; } -#endif //ROCKSDB_LITE +#endif // ROCKSDB_LITE table_options.block_cache = uncompressed_cache_; if (table_options.block_cache == nullptr) { @@ -1782,28 +2745,57 @@ table_options.block_cache_compressed = compressed_cache_; table_options.flush_block_policy_factory.reset( new MyFlushBlockPolicyFactory()); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); if (!compression_enabled_) { options.compression = kNoCompression; + } else { + options.compression_opts.parallel_threads = compression_parallel_threads; } + options_ = options; Reopen(options); + if (num_cfs > 1) { + for (int 
cf = 0; cf < num_cfs; ++cf) { + cf_names_.emplace_back("cf" + std::to_string(cf)); + } + CreateColumnFamilies(cf_names_, options); + cf_names_.emplace_back("default"); + } + std::string zero_str(128, '\0'); - for (int i = 0; i < 100; ++i) { - // Make the value compressible. A purely random string doesn't compress - // and the resultant data block will not be compressed - values_.emplace_back(RandomString(&rnd, 128) + zero_str); - assert(Put(Key(i), values_[i]) == Status::OK()); - } - Flush(); - - for (int i = 0; i < 100; ++i) { - // block cannot gain space by compression - uncompressable_values_.emplace_back(RandomString(&rnd, 256) + '\0'); - std::string tmp_key = "a" + Key(i); - assert(Put(tmp_key, uncompressable_values_[i]) == Status::OK()); + for (int cf = 0; cf < num_cfs; ++cf) { + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + values_.emplace_back(rnd.RandomString(128) + zero_str); + assert(((num_cfs == 1) ? Put(Key(i), values_[i]) + : Put(cf, Key(i), values_[i])) == Status::OK()); + } + if (num_cfs == 1) { + EXPECT_OK(Flush()); + } else { + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); + } + + for (int i = 0; i < 100; ++i) { + // block cannot gain space by compression + uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0'); + std::string tmp_key = "a" + Key(i); + assert(((num_cfs == 1) ? 
Put(tmp_key, uncompressable_values_[i]) + : Put(cf, tmp_key, uncompressable_values_[i])) == + Status::OK()); + } + if (num_cfs == 1) { + EXPECT_OK(Flush()); + } else { + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); + } + } + // Clear compressed cache, which is always pre-populated + if (compressed_cache_) { + compressed_cache_->SetCapacity(0); + compressed_cache_->SetCapacity(1048576); } - Flush(); } bool CheckValue(int i, const std::string& value) { @@ -1820,6 +2812,8 @@ return false; } + const std::vector& GetCFNames() const { return cf_names_; } + int num_lookups() { return uncompressed_cache_->num_lookups(); } int num_found() { return uncompressed_cache_->num_found(); } int num_inserts() { return uncompressed_cache_->num_inserts(); } @@ -1832,11 +2826,12 @@ bool compression_enabled() { return compression_enabled_; } bool has_compressed_cache() { return compressed_cache_ != nullptr; } bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } + Options get_options() { return options_; } static void SetUpTestCase() {} static void TearDownTestCase() {} - private: + protected: class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { public: MyFlushBlockPolicyFactory() {} @@ -1877,23 +2872,27 @@ const BlockBuilder& data_block_builder_; }; - class MyBlockCache : public Cache { + class MyBlockCache : public CacheWrapper { public: - explicit MyBlockCache(std::shared_ptr& target) - : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} - - virtual const char* Name() const override { return "MyBlockCache"; } - - virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { + explicit MyBlockCache(std::shared_ptr target) + : CacheWrapper(target), + num_lookups_(0), + num_found_(0), + num_inserts_(0) {} + + const char* Name() const override { return "MyBlockCache"; } + + using Cache::Insert; 
+ Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { num_inserts_++; return target_->Insert(key, value, charge, deleter, handle, priority); } - virtual Handle* Lookup(const Slice& key, - Statistics* stats = nullptr) override { + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { num_lookups_++; Handle* handle = target_->Lookup(key, stats); if (handle != nullptr) { @@ -1901,57 +2900,6 @@ } return handle; } - - virtual bool Ref(Handle* handle) override { return target_->Ref(handle); } - - virtual bool Release(Handle* handle, bool force_erase = false) override { - return target_->Release(handle, force_erase); - } - - virtual void* Value(Handle* handle) override { - return target_->Value(handle); - } - - virtual void Erase(const Slice& key) override { target_->Erase(key); } - virtual uint64_t NewId() override { return target_->NewId(); } - - virtual void SetCapacity(size_t capacity) override { - target_->SetCapacity(capacity); - } - - virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { - target_->SetStrictCapacityLimit(strict_capacity_limit); - } - - virtual bool HasStrictCapacityLimit() const override { - return target_->HasStrictCapacityLimit(); - } - - virtual size_t GetCapacity() const override { - return target_->GetCapacity(); - } - - virtual size_t GetUsage() const override { return target_->GetUsage(); } - - virtual size_t GetUsage(Handle* handle) const override { - return target_->GetUsage(handle); - } - - virtual size_t GetPinnedUsage() const override { - return target_->GetPinnedUsage(); - } - - virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; } - - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override { - return target_->ApplyToAllCacheEntries(callback, thread_safe); - } - - virtual void 
EraseUnRefEntries() override { - return target_->EraseUnRefEntries(); - } - int num_lookups() { return num_lookups_; } int num_found() { return num_found_; } @@ -1959,7 +2907,6 @@ int num_inserts() { return num_inserts_; } private: - std::shared_ptr target_; int num_lookups_; int num_found_; int num_inserts_; @@ -1967,10 +2914,24 @@ std::shared_ptr compressed_cache_; std::shared_ptr uncompressed_cache_; + Options options_; bool compression_enabled_; std::vector values_; std::vector uncompressable_values_; bool fill_cache_; + std::vector cf_names_; +}; + +class DBBasicTestWithParallelIO + : public DBBasicTestMultiGet, + public testing::WithParamInterface< + std::tuple> { + public: + DBBasicTestWithParallelIO() + : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1, + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam()), + std::get<4>(GetParam())) {} }; TEST_P(DBBasicTestWithParallelIO, MultiGet) { @@ -2096,6 +3057,125 @@ } } +#ifndef ROCKSDB_LITE +TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { + class FakeDirectIOEnv : public EnvWrapper { + class FakeDirectIOSequentialFile; + class FakeDirectIORandomAccessFile; + + public: + FakeDirectIOEnv(Env* env) : EnvWrapper(env) {} + static const char* kClassName() { return "FakeDirectIOEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { + std::unique_ptr file; + assert(options.use_direct_reads); + EnvOptions opts = options; + opts.use_direct_reads = false; + Status s = target()->NewRandomAccessFile(fname, &file, opts); + if (!s.ok()) { + return s; + } + result->reset(new FakeDirectIORandomAccessFile(std::move(file))); + return s; + } + + private: + class FakeDirectIOSequentialFile : public SequentialFileWrapper { + public: + FakeDirectIOSequentialFile(std::unique_ptr&& file) + : SequentialFileWrapper(file.get()), 
file_(std::move(file)) {} + ~FakeDirectIOSequentialFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + + class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper { + public: + FakeDirectIORandomAccessFile(std::unique_ptr&& file) + : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {} + ~FakeDirectIORandomAccessFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + }; + + std::unique_ptr env(new FakeDirectIOEnv(env_)); + Options opts = get_options(); + opts.env = env.get(); + opts.use_direct_reads = true; + Reopen(opts); + + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + if (uncompressed_cache_) { + uncompressed_cache_->SetCapacity(0); + uncompressed_cache_->SetCapacity(1048576); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, 
values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + bool read_from_cache = false; + if (fill_cache()) { + if (has_uncompressed_cache()) { + read_from_cache = true; + } else if (has_compressed_cache() && compression_enabled()) { + read_from_cache = true; + } + } + + int expected_reads = random_reads; + if (!compression_enabled() || !has_compressed_cache()) { + expected_reads += 2; + } else { + expected_reads += (read_from_cache ? 0 : 2); + } + if (env_->random_read_counter_.Read() != expected_reads) { + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + } + Close(); +} +#endif // ROCKSDB_LITE + TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { std::vector key_data(10); std::vector keys; @@ -2108,13 +3188,13 @@ ro.fill_cache = fill_cache(); SyncPoint::GetInstance()->SetCallBack( - "RetrieveMultipleBlocks:VerifyChecksum", [&](void *status) { - Status* s = static_cast(status); - read_count++; - if (read_count == 2) { - *s = Status::Corruption(); - } - }); + "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) { + Status* s = static_cast(status); + read_count++; + if (read_count == 2) { + *s = Status::Corruption(); + } + }); SyncPoint::GetInstance()->EnableProcessing(); // Warm up the cache first @@ -2127,7 +3207,7 @@ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(0, values[0].ToString())); - //ASSERT_TRUE(CheckValue(50, values[1].ToString())); + // ASSERT_TRUE(CheckValue(50, values[1].ToString())); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::Corruption()); @@ -2145,10 +3225,10 @@ ro.fill_cache = fill_cache(); SyncPoint::GetInstance()->SetCallBack( - "TableCache::MultiGet:FindTable", [&](void *status) { - Status* s = static_cast(status); - *s = Status::IOError(); - }); + "TableCache::MultiGet:FindTable", [&](void* status) { + Status* s = static_cast(status); + *s = Status::IOError(); + 
}); // DB open will create table readers unless we reduce the table cache // capacity. // SanitizeOptions will set max_open_files to minimum of 20. Table cache @@ -2157,10 +3237,10 @@ // prevent file open during DB open and force the file to be opened // during MultiGet SyncPoint::GetInstance()->SetCallBack( - "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void *arg) { - int* max_open_files = (int*)arg; - *max_open_files = 11; - }); + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); SyncPoint::GetInstance()->EnableProcessing(); Reopen(CurrentOptions()); @@ -2180,362 +3260,645 @@ SyncPoint::GetInstance()->DisableProcessing(); } -INSTANTIATE_TEST_CASE_P( - ParallelIO, DBBasicTestWithParallelIO, - // Params are as follows - - // Param 0 - Compressed cache enabled - // Param 1 - Uncompressed cache enabled - // Param 2 - Data compression enabled - // Param 3 - ReadOptions::fill_cache - ::testing::Combine(::testing::Bool(), ::testing::Bool(), - ::testing::Bool(), ::testing::Bool())); - -class DBBasicTestWithTimestampBase : public DBTestBase { - public: - explicit DBBasicTestWithTimestampBase(const std::string& dbname) - : DBTestBase(dbname) {} +INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO, + // Params are as follows - + // Param 0 - Compressed cache enabled + // Param 1 - Uncompressed cache enabled + // Param 2 - Data compression enabled + // Param 3 - ReadOptions::fill_cache + // Param 4 - CompressionOptions::parallel_threads + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool(), + ::testing::Values(1, 4))); - protected: - class TestComparatorBase : public Comparator { - public: - explicit TestComparatorBase(size_t ts_sz) : Comparator(ts_sz) {} +// Forward declaration +class DeadlineFS; - const char* Name() const override { return "TestComparator"; } +class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: 
+ DeadlineRandomAccessFile(DeadlineFS& fs, + std::unique_ptr& file) + : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {} + + IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - void FindShortSuccessor(std::string*) const override {} + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; - void FindShortestSeparator(std::string*, const Slice&) const override {} + private: + DeadlineFS& fs_; + std::unique_ptr file_; +}; - int Compare(const Slice& a, const Slice& b) const override { - int r = CompareWithoutTimestamp(a, b); - if (r != 0 || 0 == timestamp_size()) { - return r; +class DeadlineFS : public FileSystemWrapper { + public: + // The error_on_delay parameter specifies whether a IOStatus::TimedOut() + // status should be returned after delaying the IO to exceed the timeout, + // or to simply delay but return success anyway. The latter mimics the + // behavior of PosixFileSystem, which does not enforce any timeout + explicit DeadlineFS(SpecialEnv* env, bool error_on_delay) + : FileSystemWrapper(env->GetFileSystem()), + deadline_(std::chrono::microseconds::zero()), + io_timeout_(std::chrono::microseconds::zero()), + env_(env), + timedout_(false), + ignore_deadline_(false), + error_on_delay_(error_on_delay) {} + + static const char* kClassName() { return "DeadlineFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + EXPECT_OK(s); + result->reset(new DeadlineRandomAccessFile(*this, file)); + + const std::chrono::microseconds deadline = GetDeadline(); + const std::chrono::microseconds io_timeout = GetIOTimeout(); + if (deadline.count() || 
io_timeout.count()) { + AssertDeadline(deadline, io_timeout, opts.io_options); + } + return ShouldDelay(opts.io_options); + } + + // Set a vector of {IO counter, delay in microseconds, return status} tuples + // that control when to inject a delay and duration of the delay + void SetDelayTrigger(const std::chrono::microseconds deadline, + const std::chrono::microseconds io_timeout, + const int trigger) { + delay_trigger_ = trigger; + io_count_ = 0; + deadline_ = deadline; + io_timeout_ = io_timeout; + timedout_ = false; + } + + // Increment the IO counter and return a delay in microseconds + IOStatus ShouldDelay(const IOOptions& opts) { + if (timedout_) { + return IOStatus::TimedOut(); + } else if (!deadline_.count() && !io_timeout_.count()) { + return IOStatus::OK(); + } + if (!ignore_deadline_ && delay_trigger_ == io_count_++) { + env_->SleepForMicroseconds(static_cast(opts.timeout.count() + 1)); + timedout_ = true; + if (error_on_delay_) { + return IOStatus::TimedOut(); } - return CompareTimestamp( - Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), - Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); } + return IOStatus::OK(); + } - virtual int CompareImpl(const Slice& a, const Slice& b) const = 0; + const std::chrono::microseconds GetDeadline() { + return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_; + } - int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { - assert(a.size() >= timestamp_size()); - assert(b.size() >= timestamp_size()); - Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); - Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + const std::chrono::microseconds GetIOTimeout() { + return ignore_deadline_ ? 
std::chrono::microseconds::zero() : io_timeout_; + } - return CompareImpl(k1, k2); - } + bool TimedOut() { return timedout_; } - int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { - if (!ts1.data() && !ts2.data()) { - return 0; - } else if (ts1.data() && !ts2.data()) { - return 1; - } else if (!ts1.data() && ts2.data()) { - return -1; - } - assert(ts1.size() == ts2.size()); - uint64_t low1 = 0; - uint64_t low2 = 0; - uint64_t high1 = 0; - uint64_t high2 = 0; - auto* ptr1 = const_cast(&ts1); - auto* ptr2 = const_cast(&ts2); - if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || - !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { - assert(false); - } - if (high1 < high2) { - return 1; - } else if (high1 > high2) { - return -1; - } - if (low1 < low2) { - return 1; - } else if (low1 > low2) { - return -1; + void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; } + + void AssertDeadline(const std::chrono::microseconds deadline, + const std::chrono::microseconds io_timeout, + const IOOptions& opts) const { + // Give a leeway of +- 10us as it can take some time for the Get/ + // MultiGet call to reach here, in order to avoid false alarms + std::chrono::microseconds now = + std::chrono::microseconds(env_->NowMicros()); + std::chrono::microseconds timeout; + if (deadline.count()) { + timeout = deadline - now; + if (io_timeout.count()) { + timeout = std::min(timeout, io_timeout); } - return 0; + } else { + timeout = io_timeout; + } + if (opts.timeout != timeout) { + ASSERT_EQ(timeout, opts.timeout); } - }; - - Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { - assert(nullptr != ts); - ts->clear(); - PutFixed64(ts, low); - PutFixed64(ts, high); - assert(ts->size() == sizeof(low) + sizeof(high)); - return Slice(*ts); } + + private: + // The number of IOs to trigger the delay after + int delay_trigger_; + // Current IO count + int io_count_; + // ReadOptions deadline for the Get/MultiGet/Iterator + 
std::chrono::microseconds deadline_; + // ReadOptions io_timeout for the Get/MultiGet/Iterator + std::chrono::microseconds io_timeout_; + SpecialEnv* env_; + // Flag to indicate whether we injected a delay + bool timedout_; + // Temporarily ignore deadlines/timeouts + bool ignore_deadline_; + // Return IOStatus::TimedOut() or IOStatus::OK() + bool error_on_delay_; }; -class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { +IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len, + const IOOptions& opts, Slice* result, + char* scratch, + IODebugContext* dbg) const { + const std::chrono::microseconds deadline = fs_.GetDeadline(); + const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); + IOStatus s; + if (deadline.count() || io_timeout.count()) { + fs_.AssertDeadline(deadline, io_timeout, opts); + } + if (s.ok()) { + s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch, + dbg); + } + if (s.ok()) { + s = fs_.ShouldDelay(opts); + } + return s; +} + +IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs, + size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + const std::chrono::microseconds deadline = fs_.GetDeadline(); + const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); + IOStatus s; + if (deadline.count() || io_timeout.count()) { + fs_.AssertDeadline(deadline, io_timeout, options); + } + if (s.ok()) { + s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg); + } + if (s.ok()) { + s = fs_.ShouldDelay(options); + } + return s; +} + +// A test class for intercepting random reads and injecting artificial +// delays. 
Used for testing the MultiGet deadline feature +class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet { public: - DBBasicTestWithTimestamp() - : DBBasicTestWithTimestampBase("/db_basic_test_with_timestamp") {} - - protected: - class TestComparator : public TestComparatorBase { - public: - const int kKeyPrefixLength = - 3; // 3: length of "key" in generated keys ("key" + std::to_string(j)) - explicit TestComparator(size_t ts_sz) : TestComparatorBase(ts_sz) {} - - int CompareImpl(const Slice& a, const Slice& b) const override { - int n1 = atoi( - std::string(a.data() + kKeyPrefixLength, a.size() - kKeyPrefixLength) - .c_str()); - int n2 = atoi( - std::string(b.data() + kKeyPrefixLength, b.size() - kKeyPrefixLength) - .c_str()); - return (n1 < n2) ? -1 : (n1 > n2) ? 1 : 0; + DBBasicTestMultiGetDeadline() + : DBBasicTestMultiGet( + "db_basic_test_multiget_deadline" /*Test dir*/, + 10 /*# of column families*/, false /*compressed cache enabled*/, + true /*uncompressed cache enabled*/, true /*compression enabled*/, + true /*ReadOptions.fill_cache*/, + 1 /*# of parallel compression threads*/) {} + + inline void CheckStatus(std::vector& statuses, size_t num_ok) { + for (size_t i = 0; i < statuses.size(); ++i) { + if (i < num_ok) { + EXPECT_OK(statuses[i]); + } else { + if (statuses[i] != Status::TimedOut()) { + EXPECT_EQ(statuses[i], Status::TimedOut()); + } + } } - }; + } }; -#ifndef ROCKSDB_LITE -// A class which remembers the name of each flushed file. 
-class FlushedFileCollector : public EventListener { - public: - FlushedFileCollector() {} - ~FlushedFileCollector() override {} +TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { + std::shared_ptr fs = std::make_shared(env_, false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + Options options = CurrentOptions(); - void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { - InstrumentedMutexLock lock(&mutex_); - flushed_files_.push_back(info.file_path); + std::shared_ptr cache = NewLRUCache(1048576); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = env.get(); + SetTimeElapseOnlySleepOnReopen(&options); + ReopenWithColumnFamilies(GetCFNames(), options); + + // Test the non-batched version of MultiGet with multiple column + // families + std::vector key_str; + size_t i; + for (i = 0; i < 5; ++i) { + key_str.emplace_back(Key(static_cast(i))); + } + std::vector cfs(key_str.size()); + ; + std::vector keys(key_str.size()); + std::vector values(key_str.size()); + for (i = 0; i < key_str.size(); ++i) { + cfs[i] = handles_[i]; + keys[i] = Slice(key_str[i].data(), key_str[i].size()); } - std::vector GetFlushedFiles() { - std::vector result; - { - InstrumentedMutexLock lock(&mutex_); - result = flushed_files_; - } - return result; + ReadOptions ro; + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + // Delay the first IO + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0); + + std::vector statuses = dbfull()->MultiGet(ro, cfs, keys, &values); + // The first key is successful because we check after the lookup, but + // subsequent keys fail due to deadline exceeded + CheckStatus(statuses, 1); + + // Clear the cache + cache->SetCapacity(0); + cache->SetCapacity(1048576); + // Test non-batched Multiget with multiple column families and + // introducing an IO delay in one of the middle CFs + 
key_str.clear(); + for (i = 0; i < 10; ++i) { + key_str.emplace_back(Key(static_cast(i))); + } + cfs.resize(key_str.size()); + keys.resize(key_str.size()); + values.resize(key_str.size()); + for (i = 0; i < key_str.size(); ++i) { + // 2 keys per CF + cfs[i] = handles_[i / 2]; + keys[i] = Slice(key_str[i].data(), key_str[i].size()); + } + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1); + statuses = dbfull()->MultiGet(ro, cfs, keys, &values); + CheckStatus(statuses, 3); + + // Test batched MultiGet with an IO delay in the first data block read. + // Both keys in the first CF should succeed as they're in the same data + // block and would form one batch, and we check for deadline between + // batches. + std::vector pin_values(keys.size()); + cache->SetCapacity(0); + cache->SetCapacity(1048576); + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0); + dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 2); + + // Similar to the previous one, but an IO delay in the third CF data block + // read + for (PinnableSlice& value : pin_values) { + value.Reset(); + } + cache->SetCapacity(0); + cache->SetCapacity(1048576); + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2); + dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 6); + + // Similar to the previous one, but an IO delay in the last but one CF + for (PinnableSlice& value : pin_values) { + value.Reset(); + } + cache->SetCapacity(0); + cache->SetCapacity(1048576); + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = 
std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3); + dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 8); + + // Test batched MultiGet with single CF and lots of keys. Inject delay + // into the second batch of keys. As each batch is 32, the first 64 keys, + // i.e first two batches, should succeed and the rest should time out + for (PinnableSlice& value : pin_values) { + value.Reset(); + } + cache->SetCapacity(0); + cache->SetCapacity(1048576); + key_str.clear(); + for (i = 0; i < 100; ++i) { + key_str.emplace_back(Key(static_cast(i))); + } + keys.resize(key_str.size()); + pin_values.clear(); + pin_values.resize(key_str.size()); + for (i = 0; i < key_str.size(); ++i) { + keys[i] = Slice(key_str[i].data(), key_str[i].size()); } + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1); + dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 64); + Close(); +} - void ClearFlushedFiles() { - InstrumentedMutexLock lock(&mutex_); - flushed_files_.clear(); +TEST_F(DBBasicTest, ManifestWriteFailure) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto* s = reinterpret_cast(arg); + ASSERT_OK(*s); + // Manually overwrite return status + *s = Status::IOError(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("key", "value")); + 
ASSERT_NOK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); +} + +TEST_F(DBBasicTest, DestroyDefaultCfHandle) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + for (const auto* h : handles_) { + ASSERT_NE(db_->DefaultColumnFamily(), h); } - private: - std::vector flushed_files_; - InstrumentedMutex mutex_; -}; + // We have two handles to the default column family. The two handles point to + // different ColumnFamilyHandle objects. + assert(db_->DefaultColumnFamily()); + ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID()); + assert(handles_[0]); + ASSERT_EQ(0U, handles_[0]->GetID()); + + // You can destroy handles_[...]. + for (auto* h : handles_) { + ASSERT_OK(db_->DestroyColumnFamilyHandle(h)); + } + handles_.clear(); + + // But you should not destroy db_->DefaultColumnFamily(), since it's going to + // be deleted in `DBImpl::CloseHelper()`. Before that, it may be used + // elsewhere internally too. 
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument()); +} -TEST_F(DBBasicTestWithTimestamp, PutAndGetWithCompaction) { - const int kNumKeysPerFile = 8192; - const size_t kNumTimestamps = 2; - const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps; - const size_t kSplitPosBase = kNumKeysPerTimestamp / 2; - Options options = CurrentOptions(); +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, VerifyFileChecksums) { + Options options = GetDefaultOptions(); options.create_if_missing = true; options.env = env_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); - FlushedFileCollector* collector = new FlushedFileCollector(); - options.listeners.emplace_back(collector); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + Reopen(options); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); - std::string tmp; - size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); - TestComparator test_cmp(ts_sz); - options.comparator = &test_cmp; - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy( - 10 /*bits_per_key*/, false /*use_block_based_builder*/)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - size_t num_cfs = handles_.size(); - ASSERT_EQ(2, num_cfs); - std::vector write_ts_strs(kNumTimestamps); - std::vector read_ts_strs(kNumTimestamps); - std::vector write_ts_list; - std::vector read_ts_list; - - for (size_t i = 0; i != kNumTimestamps; ++i) { - write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); - read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); - const Slice& write_ts = 
write_ts_list.back(); - WriteOptions wopts; - wopts.timestamp = &write_ts; - for (int cf = 0; cf != static_cast(num_cfs); ++cf) { - for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { - ASSERT_OK(Put(cf, "key" + std::to_string(j), - "value_" + std::to_string(j) + "_" + std::to_string(i), - wopts)); - if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) { - // flush all keys with the same timestamp to two sst files, split at - // incremental positions such that lowerlevel[1].smallest.userkey == - // higherlevel[0].largest.userkey - ASSERT_OK(Flush(cf)); - - // compact files (2 at each level) to a lower level such that all keys - // with the same timestamp is at one level, with newer versions at - // higher levels. - CompactionOptions compact_opt; - compact_opt.compression = kNoCompression; - db_->CompactFiles(compact_opt, handles_[cf], - collector->GetFlushedFiles(), - static_cast(kNumTimestamps - i)); - collector->ClearFlushedFiles(); - } - } - } - } - const auto& verify_db_func = [&]() { - for (size_t i = 0; i != kNumTimestamps; ++i) { - ReadOptions ropts; - ropts.timestamp = &read_ts_list[i]; - for (int cf = 0; cf != static_cast(num_cfs); ++cf) { - ColumnFamilyHandle* cfh = handles_[cf]; - for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { - std::string value; - ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); - ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), - value); - } - } + // Write an L0 with checksum computed. + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + + // Does the right thing but with the wrong name -- using it should lead to an + // error. 
+ class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c { + public: + MisnamedFileChecksumGenerator(const FileChecksumGenContext& context) + : FileChecksumGenCrc32c(context) {} + + const char* Name() const override { return "sha1"; } + }; + + class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + return std::unique_ptr( + new MisnamedFileChecksumGenerator(context)); } }; - verify_db_func(); + + options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory()); + Reopen(options); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); } #endif // !ROCKSDB_LITE -class DBBasicTestWithTimestampWithParam - : public DBBasicTestWithTimestampBase, - public testing::WithParamInterface { - public: - DBBasicTestWithTimestampWithParam() - : DBBasicTestWithTimestampBase( - "/db_basic_test_with_timestamp_with_param") {} +// A test class for intercepting random reads and injecting artificial +// delays. 
Used for testing the deadline/timeout feature +class DBBasicTestDeadline + : public DBBasicTest, + public testing::WithParamInterface> {}; + +TEST_P(DBBasicTestDeadline, PointLookupDeadline) { + std::shared_ptr fs = std::make_shared(env_, true); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + bool set_deadline = std::get<0>(GetParam()); + bool set_timeout = std::get<1>(GetParam()); + + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) { + continue; + } + option_config_ = option_config; + Options options = CurrentOptions(); + if (options.use_direct_reads) { + continue; + } + options.env = env.get(); + options.disable_auto_compactions = true; + Cache* block_cache = nullptr; + // Fileter block reads currently don't cause the request to get + // aborted on a read timeout, so its possible those block reads + // may get issued even if the deadline is past + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Get:BeforeFilterMatch", + [&](void* /*arg*/) { fs->IgnoreDeadline(true); }); + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Get:AfterFilterMatch", + [&](void* /*arg*/) { fs->IgnoreDeadline(false); }); + // DB open will create table readers unless we reduce the table cache + // capacity. + // SanitizeOptions will set max_open_files to minimum of 20. Table cache + // is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 11 so table cache capacity will become 1. 
This will + // prevent file open during DB open and force the file to be opened + // during MultiGet + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + SyncPoint::GetInstance()->EnableProcessing(); - protected: - class TestComparator : public TestComparatorBase { - private: - const Comparator* cmp_without_ts_; + SetTimeElapseOnlySleepOnReopen(&options); + Reopen(options); - public: - explicit TestComparator(size_t ts_sz) - : TestComparatorBase(ts_sz), cmp_without_ts_(nullptr) { - cmp_without_ts_ = BytewiseComparator(); + if (options.table_factory) { + block_cache = options.table_factory->GetOptions( + TableFactory::kBlockCacheOpts()); } - int CompareImpl(const Slice& a, const Slice& b) const override { - return cmp_without_ts_->Compare(a, b); + Random rnd(301); + for (int i = 0; i < 400; ++i) { + std::string key = "k" + ToString(i); + ASSERT_OK(Put(key, rnd.RandomString(100))); } - }; -}; + ASSERT_OK(Flush()); -TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { - const int kNumKeysPerFile = 8192; - const size_t kNumTimestamps = 6; - bool memtable_only = GetParam(); - Options options = CurrentOptions(); - options.create_if_missing = true; - options.env = env_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); - std::string tmp; - size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); - TestComparator test_cmp(ts_sz); - options.comparator = &test_cmp; - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy( - 10 /*bits_per_key*/, false /*use_block_based_builder*/)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + bool timedout = true; + // A timeout will be forced when the IO counter reaches this value + int io_deadline_trigger = 0; + // Keep incrementing io_deadline_trigger and call Get() until there is an + // iteration that doesn't cause a 
timeout. This ensures that we cover + // all file reads in the point lookup path that can potentially timeout + // and cause the Get() to fail. + while (timedout) { + ReadOptions ro; + if (set_deadline) { + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + } + if (set_timeout) { + ro.io_timeout = std::chrono::microseconds{5000}; + } + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger); - std::vector compression_types; - compression_types.push_back(kNoCompression); - if (Zlib_Supported()) { - compression_types.push_back(kZlibCompression); - } -#if LZ4_VERSION_NUMBER >= 10400 // r124+ - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 - if (ZSTD_Supported()) { - compression_types.push_back(kZSTD); - } - - // Switch compression dictionary on/off to check key extraction - // correctness in kBuffered state - std::vector max_dict_bytes_list = {0, 1 << 14}; // 0 or 16KB - - for (auto compression_type : compression_types) { - for (uint32_t max_dict_bytes : max_dict_bytes_list) { - options.compression = compression_type; - options.compression_opts.max_dict_bytes = max_dict_bytes; - if (compression_type == kZSTD) { - options.compression_opts.zstd_max_train_bytes = max_dict_bytes; - } - options.target_file_size_base = 1 << 26; // 64MB - - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - size_t num_cfs = handles_.size(); - ASSERT_EQ(2, num_cfs); - std::vector write_ts_strs(kNumTimestamps); - std::vector read_ts_strs(kNumTimestamps); - std::vector write_ts_list; - std::vector read_ts_list; - - for (size_t i = 0; i != kNumTimestamps; ++i) { - write_ts_list.emplace_back( - EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); - read_ts_list.emplace_back( - EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); - const Slice& write_ts = write_ts_list.back(); - WriteOptions wopts; - wopts.timestamp = &write_ts; - for (int cf = 0; cf != 
static_cast(num_cfs); ++cf) { - for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { - ASSERT_OK(Put( - cf, "key" + std::to_string(j), - "value_" + std::to_string(j) + "_" + std::to_string(i), wopts)); - } - if (!memtable_only) { - ASSERT_OK(Flush(cf)); - } - } + block_cache->SetCapacity(0); + block_cache->SetCapacity(1048576); + + std::string value; + Status s = dbfull()->Get(ro, "k50", &value); + if (fs->TimedOut()) { + ASSERT_EQ(s, Status::TimedOut()); + } else { + timedout = false; + ASSERT_OK(s); } - const auto& verify_db_func = [&]() { - for (size_t i = 0; i != kNumTimestamps; ++i) { - ReadOptions ropts; - ropts.timestamp = &read_ts_list[i]; - for (int cf = 0; cf != static_cast(num_cfs); ++cf) { - ColumnFamilyHandle* cfh = handles_[cf]; - for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; - ++j) { - std::string value; - ASSERT_OK( - db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); - ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), - value); - } - } - } - }; - verify_db_func(); + io_deadline_trigger++; } + // Reset the delay sequence in order to avoid false alarms during Reopen + fs->SetDelayTrigger(std::chrono::microseconds::zero(), + std::chrono::microseconds::zero(), 0); } + Close(); } -INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, - ::testing::Bool()); +TEST_P(DBBasicTestDeadline, IteratorDeadline) { + std::shared_ptr fs = std::make_shared(env_, true); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + bool set_deadline = std::get<0>(GetParam()); + bool set_timeout = std::get<1>(GetParam()); + + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) { + continue; + } + Options options = CurrentOptions(); + if (options.use_direct_reads) { + continue; + } + options.env = env.get(); + options.disable_auto_compactions = true; + Cache* block_cache = nullptr; + // DB open will 
create table readers unless we reduce the table cache + // capacity. + // SanitizeOptions will set max_open_files to minimum of 20. Table cache + // is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 11 so table cache capacity will become 1. This will + // prevent file open during DB open and force the file to be opened + // during MultiGet + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + SyncPoint::GetInstance()->EnableProcessing(); -} // namespace ROCKSDB_NAMESPACE + SetTimeElapseOnlySleepOnReopen(&options); + Reopen(options); + + if (options.table_factory) { + block_cache = options.table_factory->GetOptions( + TableFactory::kBlockCacheOpts()); + } + + Random rnd(301); + for (int i = 0; i < 400; ++i) { + std::string key = "k" + ToString(i); + ASSERT_OK(Put(key, rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + bool timedout = true; + // A timeout will be forced when the IO counter reaches this value + int io_deadline_trigger = 0; + // Keep incrementing io_deadline_trigger and call Get() until there is an + // iteration that doesn't cause a timeout. 
This ensures that we cover + // all file reads in the point lookup path that can potentially timeout + while (timedout) { + ReadOptions ro; + if (set_deadline) { + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + } + if (set_timeout) { + ro.io_timeout = std::chrono::microseconds{5000}; + } + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger); -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); + block_cache->SetCapacity(0); + block_cache->SetCapacity(1048576); + + Iterator* iter = dbfull()->NewIterator(ro); + int count = 0; + iter->Seek("k50"); + while (iter->Valid() && count++ < 100) { + iter->Next(); + } + if (fs->TimedOut()) { + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(iter->status(), Status::TimedOut()); + } else { + timedout = false; + ASSERT_OK(iter->status()); + } + delete iter; + io_deadline_trigger++; + } + // Reset the delay sequence in order to avoid false alarms during Reopen + fs->SetDelayTrigger(std::chrono::microseconds::zero(), + std::chrono::microseconds::zero(), 0); + } + Close(); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +// Param 0: If true, set read_options.deadline +// Param 1: If true, set read_options.io_timeout +INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline, + ::testing::Values(std::make_tuple(true, false), + std::make_tuple(false, true), + std::make_tuple(true, true))); +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_blob_index_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_blob_index_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_blob_index_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_blob_index_test.cc 1970-01-01 
00:00:00.000000000 +0000 @@ -1,436 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include -#include - -#include "db/arena_wrapped_db_iter.h" -#include "db/column_family.h" -#include "db/db_iter.h" -#include "db/db_test_util.h" -#include "db/dbformat.h" -#include "db/write_batch_internal.h" -#include "port/port.h" -#include "port/stack_trace.h" -#include "util/string_util.h" -#include "utilities/merge_operators.h" - -namespace ROCKSDB_NAMESPACE { - -// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb -// should accept the value type on write, and report not supported value -// for reads, unless caller request for it explicitly. The base rocksdb -// doesn't understand format of actual blob index (the value). 
-class DBBlobIndexTest : public DBTestBase { - public: - enum Tier { - kMemtable = 0, - kImmutableMemtables = 1, - kL0SstFile = 2, - kLnSstFile = 3, - }; - const std::vector kAllTiers = {Tier::kMemtable, - Tier::kImmutableMemtables, - Tier::kL0SstFile, Tier::kLnSstFile}; - - DBBlobIndexTest() : DBTestBase("/db_blob_index_test") {} - - ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } - - ColumnFamilyData* cfd() { - return reinterpret_cast(cfh())->cfd(); - } - - Status PutBlobIndex(WriteBatch* batch, const Slice& key, - const Slice& blob_index) { - return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key, - blob_index); - } - - Status Write(WriteBatch* batch) { - return dbfull()->Write(WriteOptions(), batch); - } - - std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr, - const Snapshot* snapshot = nullptr) { - ReadOptions read_options; - read_options.snapshot = snapshot; - PinnableSlice value; - DBImpl::GetImplOptions get_impl_options; - get_impl_options.column_family = cfh(); - get_impl_options.value = &value; - get_impl_options.is_blob_index = is_blob_index; - auto s = dbfull()->GetImpl(read_options, key, get_impl_options); - if (s.IsNotFound()) { - return "NOT_FOUND"; - } - if (s.IsNotSupported()) { - return "NOT_SUPPORTED"; - } - if (!s.ok()) { - return s.ToString(); - } - return value.ToString(); - } - - std::string GetBlobIndex(const Slice& key, - const Snapshot* snapshot = nullptr) { - bool is_blob_index = false; - std::string value = GetImpl(key, &is_blob_index, snapshot); - if (!is_blob_index) { - return "NOT_BLOB"; - } - return value; - } - - ArenaWrappedDBIter* GetBlobIterator() { - return dbfull()->NewIteratorImpl( - ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), - nullptr /*read_callback*/, true /*allow_blob*/); - } - - Options GetTestOptions() { - Options options; - options.create_if_missing = true; - options.num_levels = 2; - options.disable_auto_compactions = true; - // Disable auto flushes. 
- options.max_write_buffer_number = 10; - options.min_write_buffer_number_to_merge = 10; - options.merge_operator = MergeOperators::CreateStringAppendOperator(); - return options; - } - - void MoveDataTo(Tier tier) { - switch (tier) { - case Tier::kMemtable: - break; - case Tier::kImmutableMemtables: - ASSERT_OK(dbfull()->TEST_SwitchMemtable()); - break; - case Tier::kL0SstFile: - ASSERT_OK(Flush()); - break; - case Tier::kLnSstFile: - ASSERT_OK(Flush()); - ASSERT_OK(Put("a", "dummy")); - ASSERT_OK(Put("z", "dummy")); - ASSERT_OK(Flush()); - ASSERT_OK( - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); -#ifndef ROCKSDB_LITE - ASSERT_EQ("0,1", FilesPerLevel()); -#endif // !ROCKSDB_LITE - break; - } - } -}; - -// Should be able to write kTypeBlobIndex to memtables and SST files. -TEST_F(DBBlobIndexTest, Write) { - for (auto tier : kAllTiers) { - DestroyAndReopen(GetTestOptions()); - for (int i = 1; i <= 5; i++) { - std::string index = ToString(i); - WriteBatch batch; - ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index)); - ASSERT_OK(Write(&batch)); - } - MoveDataTo(tier); - for (int i = 1; i <= 5; i++) { - std::string index = ToString(i); - ASSERT_EQ("blob" + index, GetBlobIndex("key" + index)); - } - } -} - -// Get should be able to return blob index if is_blob_index is provided, -// otherwise return Status::NotSupported status. 
-TEST_F(DBBlobIndexTest, Get) { - for (auto tier : kAllTiers) { - DestroyAndReopen(GetTestOptions()); - WriteBatch batch; - ASSERT_OK(batch.Put("key", "value")); - ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index")); - ASSERT_OK(Write(&batch)); - MoveDataTo(tier); - // Verify normal value - bool is_blob_index = false; - PinnableSlice value; - ASSERT_EQ("value", Get("key")); - ASSERT_EQ("value", GetImpl("key")); - ASSERT_EQ("value", GetImpl("key", &is_blob_index)); - ASSERT_FALSE(is_blob_index); - // Verify blob index - ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); - ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); - ASSERT_TRUE(is_blob_index); - } -} - -// Get should NOT return Status::NotSupported if blob index is updated with -// a normal value. -TEST_F(DBBlobIndexTest, Updated) { - for (auto tier : kAllTiers) { - DestroyAndReopen(GetTestOptions()); - WriteBatch batch; - for (int i = 0; i < 10; i++) { - ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index")); - } - ASSERT_OK(Write(&batch)); - // Avoid blob values from being purged. 
- const Snapshot* snapshot = dbfull()->GetSnapshot(); - ASSERT_OK(Put("key1", "new_value")); - ASSERT_OK(Merge("key2", "a")); - ASSERT_OK(Merge("key2", "b")); - ASSERT_OK(Merge("key2", "c")); - ASSERT_OK(Delete("key3")); - ASSERT_OK(SingleDelete("key4")); - ASSERT_OK(Delete("key5")); - ASSERT_OK(Merge("key5", "a")); - ASSERT_OK(Merge("key5", "b")); - ASSERT_OK(Merge("key5", "c")); - ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9")); - MoveDataTo(tier); - for (int i = 0; i < 10; i++) { - ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); - } - ASSERT_EQ("new_value", Get("key1")); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); - ASSERT_EQ("NOT_FOUND", Get("key3")); - ASSERT_EQ("NOT_FOUND", Get("key4")); - ASSERT_EQ("a,b,c", GetImpl("key5")); - for (int i = 6; i < 9; i++) { - ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i))); - } - ASSERT_EQ("blob_index", GetBlobIndex("key9")); - dbfull()->ReleaseSnapshot(snapshot); - } -} - -// Iterator should get blob value if allow_blob flag is set, -// otherwise return Status::NotSupported status. 
-TEST_F(DBBlobIndexTest, Iterate) { - const std::vector> data = { - /*00*/ {kTypeValue}, - /*01*/ {kTypeBlobIndex}, - /*02*/ {kTypeValue}, - /*03*/ {kTypeBlobIndex, kTypeValue}, - /*04*/ {kTypeValue}, - /*05*/ {kTypeValue, kTypeBlobIndex}, - /*06*/ {kTypeValue}, - /*07*/ {kTypeDeletion, kTypeBlobIndex}, - /*08*/ {kTypeValue}, - /*09*/ {kTypeSingleDeletion, kTypeBlobIndex}, - /*10*/ {kTypeValue}, - /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex}, - /*12*/ {kTypeValue}, - /*13*/ - {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex}, - /*14*/ {kTypeValue}, - /*15*/ {kTypeBlobIndex}, - /*16*/ {kTypeValue}, - }; - - auto get_key = [](int index) { - char buf[20]; - snprintf(buf, sizeof(buf), "%02d", index); - return "key" + std::string(buf); - }; - - auto get_value = [&](int index, int version) { - return get_key(index) + "_value" + ToString(version); - }; - - auto check_iterator = [&](Iterator* iterator, Status::Code expected_status, - const Slice& expected_value) { - ASSERT_EQ(expected_status, iterator->status().code()); - if (expected_status == Status::kOk) { - ASSERT_TRUE(iterator->Valid()); - ASSERT_EQ(expected_value, iterator->value()); - } else { - ASSERT_FALSE(iterator->Valid()); - } - }; - - auto create_normal_iterator = [&]() -> Iterator* { - return dbfull()->NewIterator(ReadOptions()); - }; - - auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); }; - - auto check_is_blob = [&](bool is_blob) { - return [is_blob](Iterator* iterator) { - ASSERT_EQ(is_blob, - reinterpret_cast(iterator)->IsBlob()); - }; - }; - - auto verify = [&](int index, Status::Code expected_status, - const Slice& forward_value, const Slice& backward_value, - std::function create_iterator, - std::function extra_check = nullptr) { - // Seek - auto* iterator = create_iterator(); - ASSERT_OK(iterator->Refresh()); - iterator->Seek(get_key(index)); - check_iterator(iterator, expected_status, forward_value); - if (extra_check) { - 
extra_check(iterator); - } - delete iterator; - - // Next - iterator = create_iterator(); - ASSERT_OK(iterator->Refresh()); - iterator->Seek(get_key(index - 1)); - ASSERT_TRUE(iterator->Valid()); - iterator->Next(); - check_iterator(iterator, expected_status, forward_value); - if (extra_check) { - extra_check(iterator); - } - delete iterator; - - // SeekForPrev - iterator = create_iterator(); - ASSERT_OK(iterator->Refresh()); - iterator->SeekForPrev(get_key(index)); - check_iterator(iterator, expected_status, backward_value); - if (extra_check) { - extra_check(iterator); - } - delete iterator; - - // Prev - iterator = create_iterator(); - iterator->Seek(get_key(index + 1)); - ASSERT_TRUE(iterator->Valid()); - iterator->Prev(); - check_iterator(iterator, expected_status, backward_value); - if (extra_check) { - extra_check(iterator); - } - delete iterator; - }; - - for (auto tier : {Tier::kMemtable} /*kAllTiers*/) { - // Avoid values from being purged. - std::vector snapshots; - DestroyAndReopen(GetTestOptions()); - - // fill data - for (int i = 0; i < static_cast(data.size()); i++) { - for (int j = static_cast(data[i].size()) - 1; j >= 0; j--) { - std::string key = get_key(i); - std::string value = get_value(i, j); - WriteBatch batch; - switch (data[i][j]) { - case kTypeValue: - ASSERT_OK(Put(key, value)); - break; - case kTypeDeletion: - ASSERT_OK(Delete(key)); - break; - case kTypeSingleDeletion: - ASSERT_OK(SingleDelete(key)); - break; - case kTypeMerge: - ASSERT_OK(Merge(key, value)); - break; - case kTypeBlobIndex: - ASSERT_OK(PutBlobIndex(&batch, key, value)); - ASSERT_OK(Write(&batch)); - break; - default: - assert(false); - }; - } - snapshots.push_back(dbfull()->GetSnapshot()); - } - ASSERT_OK( - dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16))); - snapshots.push_back(dbfull()->GetSnapshot()); - MoveDataTo(tier); - - // Normal iterator - verify(1, Status::kNotSupported, "", "", create_normal_iterator); - verify(3, Status::kNotSupported, 
"", "", create_normal_iterator); - verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), - create_normal_iterator); - verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), - create_normal_iterator); - verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), - create_normal_iterator); - verify(11, Status::kNotSupported, "", "", create_normal_iterator); - verify(13, Status::kOk, - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - create_normal_iterator); - verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), - create_normal_iterator); - - // Iterator with blob support - verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), - create_blob_iterator, check_is_blob(true)); - verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), - create_blob_iterator, check_is_blob(true)); - verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), - create_blob_iterator, check_is_blob(false)); - verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), - create_blob_iterator, check_is_blob(false)); - verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), - create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); - verify(13, Status::kOk, - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - create_blob_iterator, check_is_blob(false)); - verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), - create_blob_iterator, check_is_blob(false)); - -#ifndef ROCKSDB_LITE - // Iterator with blob support and using seek. 
- ASSERT_OK(dbfull()->SetOptions( - cfh(), {{"max_sequential_skip_in_iterations", "0"}})); - verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), - create_blob_iterator, check_is_blob(true)); - verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), - create_blob_iterator, check_is_blob(true)); - verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), - create_blob_iterator, check_is_blob(false)); - verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), - create_blob_iterator, check_is_blob(false)); - verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), - create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); - verify(13, Status::kOk, - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - create_blob_iterator, check_is_blob(false)); - verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), - create_blob_iterator, check_is_blob(false)); -#endif // !ROCKSDB_LITE - - for (auto* snapshot : snapshots) { - dbfull()->ReleaseSnapshot(snapshot); - } - } -} - -} // namespace ROCKSDB_NAMESPACE - -int main(int argc, char** argv) { - ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_block_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_block_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_block_cache_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_block_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,21 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include +#include +#include + +#include "cache/cache_entry_roles.h" #include "cache/lru_cache.h" +#include "db/column_family.h" #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" #include "util/compression.h" +#include "util/defer.h" +#include "util/random.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -32,7 +43,8 @@ const size_t kNumBlocks = 10; const size_t kValueSize = 100; - DBBlockCacheTest() : DBTestBase("/db_block_cache_test") {} + DBBlockCacheTest() + : DBTestBase("db_block_cache_test", /*env_do_fsync=*/true) {} BlockBasedTableOptions GetTableOptions() { BlockBasedTableOptions table_options; @@ -47,7 +59,7 @@ options.avoid_flush_during_recovery = false; // options.compression = kNoCompression; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); return options; } @@ -144,6 +156,19 @@ compressed_insert_count_ = new_insert_count; compressed_failure_count_ = new_failure_count; } + +#ifndef ROCKSDB_LITE + const std::array GetCacheEntryRoleCountsBg() { + // Verify in cache entry role stats + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats(); + InternalStats::CacheEntryRoleStats stats; + internal_stats_ptr->TEST_GetCacheEntryRoleStats(&stats, + /*foreground=*/false); + return stats.entry_counts; + } +#endif // ROCKSDB_LITE }; TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) { @@ -153,9 +178,15 @@ auto options = GetOptions(table_options); InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + // Needed not to count entry stats collector + 
co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); @@ -177,9 +208,15 @@ auto options = GetOptions(table_options); InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + // Needed not to count entry stats collector + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); @@ -187,7 +224,7 @@ Iterator* iter = nullptr; // Load blocks into cache. - for (size_t i = 0; i < kNumBlocks - 1; i++) { + for (size_t i = 0; i + 1 < kNumBlocks; i++) { iter = db_->NewIterator(read_options); iter->Seek(ToString(i)); ASSERT_OK(iter->status()); @@ -209,12 +246,12 @@ iter = nullptr; // Release iterators and access cache again. 
- for (size_t i = 0; i < kNumBlocks - 1; i++) { + for (size_t i = 0; i + 1 < kNumBlocks; i++) { iterators[i].reset(); CheckCacheCounters(options, 0, 0, 0, 0); } ASSERT_EQ(0, cache->GetPinnedUsage()); - for (size_t i = 0; i < kNumBlocks - 1; i++) { + for (size_t i = 0; i + 1 < kNumBlocks; i++) { iter = db_->NewIterator(read_options); iter->Seek(ToString(i)); ASSERT_OK(iter->status()); @@ -225,34 +262,54 @@ #ifdef SNAPPY TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { - ReadOptions read_options; - auto table_options = GetTableOptions(); - auto options = GetOptions(table_options); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.block_cache_compressed = nullptr; + table_options.block_size = 1; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = CompressionType::kSnappyCompression; - InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + } + + ReadOptions read_options; std::shared_ptr compressed_cache = NewLRUCache(1 << 25, 0, false); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + // Needed not to count entry stats collector + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; + table_options.no_block_cache = false; table_options.block_cache_compressed = compressed_cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + table_options.max_auto_readahead_size = 0; + 
table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); - std::vector> iterators(kNumBlocks - 1); - Iterator* iter = nullptr; - // Load blocks into cache. for (size_t i = 0; i < kNumBlocks - 1; i++) { - iter = db_->NewIterator(read_options); - iter->Seek(ToString(i)); - ASSERT_OK(iter->status()); + ASSERT_EQ(value, Get(ToString(i))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - iterators[i].reset(iter); } + size_t usage = cache->GetUsage(); - ASSERT_LT(0, usage); + ASSERT_EQ(0, usage); ASSERT_EQ(usage, cache->GetPinnedUsage()); size_t compressed_usage = compressed_cache->GetUsage(); ASSERT_LT(0, compressed_usage); @@ -264,24 +321,158 @@ cache->SetCapacity(usage); cache->SetStrictCapacityLimit(true); ASSERT_EQ(usage, cache->GetPinnedUsage()); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_TRUE(iter->status().IsIncomplete()); + + // Load last key block. + ASSERT_EQ("Result incomplete: Insert failed due to LRU cache being full.", + Get(ToString(kNumBlocks - 1))); + // Failure will also record the miss counter. CheckCacheCounters(options, 1, 0, 0, 1); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - delete iter; - iter = nullptr; // Clear strict capacity limit flag. This time we shall hit compressed block - // cache. + // cache and load into block cache. cache->SetStrictCapacityLimit(false); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_OK(iter->status()); + // Load last key block. 
+ ASSERT_EQ(value, Get(ToString(kNumBlocks - 1))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 0, 1, 0, 0); - delete iter; - iter = nullptr; +} + +namespace { +class PersistentCacheFromCache : public PersistentCache { + public: + PersistentCacheFromCache(std::shared_ptr cache, bool read_only) + : cache_(cache), read_only_(read_only) {} + + Status Insert(const Slice& key, const char* data, + const size_t size) override { + if (read_only_) { + return Status::NotSupported(); + } + std::unique_ptr copy{new char[size]}; + std::copy_n(data, size, copy.get()); + Status s = cache_->Insert( + key, copy.get(), size, + GetCacheEntryDeleterForRole()); + if (s.ok()) { + copy.release(); + } + return s; + } + + Status Lookup(const Slice& key, std::unique_ptr* data, + size_t* size) override { + auto handle = cache_->Lookup(key); + if (handle) { + char* ptr = static_cast(cache_->Value(handle)); + *size = cache_->GetCharge(handle); + data->reset(new char[*size]); + std::copy_n(ptr, *size, data->get()); + cache_->Release(handle); + return Status::OK(); + } else { + return Status::NotFound(); + } + } + + bool IsCompressed() override { return false; } + + StatsType Stats() override { return StatsType(); } + + std::string GetPrintableOptions() const override { return ""; } + + uint64_t NewId() override { return cache_->NewId(); } + + private: + std::shared_ptr cache_; + bool read_only_; +}; + +class ReadOnlyCacheWrapper : public CacheWrapper { + using CacheWrapper::CacheWrapper; + + using Cache::Insert; + Status Insert(const Slice& /*key*/, void* /*value*/, size_t /*charge*/, + void (*)(const Slice& key, void* value) /*deleter*/, + Handle** /*handle*/, Priority /*priority*/) override { + return Status::NotSupported(); + } +}; + +} // namespace + +TEST_F(DBBlockCacheTest, TestWithSameCompressed) { + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr 
rw_cache{NewLRUCache(1000000)}; + std::shared_ptr rw_pcache{ + new PersistentCacheFromCache(rw_cache, /*read_only*/ false)}; + // Exercise some obscure behavior with read-only wrappers + std::shared_ptr ro_cache{new ReadOnlyCacheWrapper(rw_cache)}; + std::shared_ptr ro_pcache{ + new PersistentCacheFromCache(rw_cache, /*read_only*/ true)}; + + // Simple same pointer + table_options.block_cache = rw_cache; + table_options.block_cache_compressed = rw_cache; + table_options.persistent_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache same as block_cache_compressed not " + "currently supported, and would be bad for performance anyway"); + + // Other cases + table_options.block_cache = ro_cache; + table_options.block_cache_compressed = rw_cache; + table_options.persistent_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache and block_cache_compressed share " + "the same key space, which is not supported"); + + table_options.block_cache = rw_cache; + table_options.block_cache_compressed = ro_cache; + table_options.persistent_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache_compressed and block_cache share " + "the same key space, which is not supported"); + + table_options.block_cache = ro_cache; + table_options.block_cache_compressed.reset(); + table_options.persistent_cache = rw_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache and persistent_cache share the same " + "key space, which is not supported"); + + table_options.block_cache = rw_cache; + table_options.block_cache_compressed.reset(); + table_options.persistent_cache = 
ro_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: persistent_cache and block_cache share the same " + "key space, which is not supported"); + + table_options.block_cache.reset(); + table_options.no_block_cache = true; + table_options.block_cache_compressed = ro_cache; + table_options.persistent_cache = rw_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache_compressed and persistent_cache " + "share the same key space, which is not supported"); + + table_options.block_cache.reset(); + table_options.no_block_cache = true; + table_options.block_cache_compressed = rw_cache; + table_options.persistent_cache = ro_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: persistent_cache and block_cache_compressed " + "share the same key space, which is not supported"); } #endif // SNAPPY @@ -296,7 +487,7 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); @@ -352,7 +543,7 @@ std::shared_ptr cache = NewLRUCache(10, 0, true); table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); ASSERT_OK(Put("key1", "val1")); ASSERT_OK(Put("key2", "val2")); @@ -390,7 +581,7 @@ std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; table_options.filter_policy.reset(NewBloomFilterPolicy(20, true)); - 
options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "longer_key", "val")); @@ -429,6 +620,183 @@ // filter_bytes_insert); } +#if (defined OS_LINUX || defined OS_WIN) +TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + table_options.cache_index_and_filter_blocks = false; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 1; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT)); + } + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); +} + +// This test cache data, index and filter blocks during flush. 
+class DBBlockCacheTest1 : public DBTestBase, + public ::testing::WithParamInterface { + public: + const size_t kNumBlocks = 10; + const size_t kValueSize = 100; + DBBlockCacheTest1() : DBTestBase("db_block_cache_test1", true) {} +}; + +INSTANTIATE_TEST_CASE_P(DBBlockCacheTest1, DBBlockCacheTest1, + ::testing::Values(1, 2, 3)); + +TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + + uint32_t filter_type = GetParam(); + switch (filter_type) { + case 1: // partition_filter + table_options.partition_filters = true; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + break; + case 2: // block-based filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + break; + case 3: // full filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + break; + default: + assert(false); + } + + table_options.cache_index_and_filter_blocks = true; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 1; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + if (filter_type == 1) { + ASSERT_EQ(2 * i, + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(2 * i, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } else { + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(i, 
options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } + ASSERT_EQ(value, Get(ToString(i))); + + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(i * 3, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT)); + if (filter_type == 1) { + ASSERT_EQ(i * 3, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + } else { + ASSERT_EQ(i * 2, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + } + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + } + + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + // Index and filter blocks are automatically warmed when the new table file + // is automatically opened at the end of compaction. This is not easily + // disabled so results in the new index and filter blocks being warmed. 
+ if (filter_type == 1) { + EXPECT_EQ(2 * (1 + kNumBlocks), + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(2 * (1 + kNumBlocks), + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } else { + EXPECT_EQ(1 + kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(1 + kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } +} + +TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + table_options.cache_index_and_filter_blocks = false; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + + for (size_t i = 1; i <= 5; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(1, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ( + 0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(1, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + } + + ASSERT_OK(dbfull()->SetOptions( + {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}})); + + for (size_t i = 6; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(1, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ( + 1, 
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + } +} +#endif + namespace { // A mock cache wraps LRUCache, and record how many entries have been @@ -443,15 +811,18 @@ false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) { } - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), Handle** handle, - Priority priority) override { + using ShardedCache::Insert; + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper_cb, size_t charge, + Handle** handle, Priority priority) override { + DeleterFn delete_cb = helper_cb->del_cb; if (priority == Priority::LOW) { low_pri_insert_count++; } else { high_pri_insert_count++; } - return LRUCache::Insert(key, value, charge, deleter, handle, priority); + return LRUCache::Insert(key, value, charge, delete_cb, handle, priority); } }; @@ -471,7 +842,7 @@ table_options.filter_policy.reset(NewBloomFilterPolicy(20)); table_options.cache_index_and_filter_blocks_with_high_priority = priority == Cache::Priority::HIGH ? true : false; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); MockCache::high_pri_insert_count = 0; @@ -517,6 +888,140 @@ } } +namespace { + +// An LRUCache wrapper that can falsely report "not found" on Lookup. +// This allows us to manipulate BlockBasedTableReader into thinking +// another thread inserted the data in between Lookup and Insert, +// while mostly preserving the LRUCache interface/behavior. 
+class LookupLiarCache : public CacheWrapper { + int nth_lookup_not_found_ = 0; + + public: + explicit LookupLiarCache(std::shared_ptr target) + : CacheWrapper(std::move(target)) {} + + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats) override { + if (nth_lookup_not_found_ == 1) { + nth_lookup_not_found_ = 0; + return nullptr; + } + if (nth_lookup_not_found_ > 1) { + --nth_lookup_not_found_; + } + return CacheWrapper::Lookup(key, stats); + } + + // 1 == next lookup, 2 == after next, etc. + void SetNthLookupNotFound(int n) { nth_lookup_not_found_ = n; } +}; + +} // anonymous namespace + +TEST_F(DBBlockCacheTest, AddRedundantStats) { + const size_t capacity = size_t{1} << 25; + const int num_shard_bits = 0; // 1 shard + int iterations_tested = 0; + for (std::shared_ptr base_cache : + {NewLRUCache(capacity, num_shard_bits), + NewClockCache(capacity, num_shard_bits)}) { + if (!base_cache) { + // Skip clock cache when not supported + continue; + } + ++iterations_tested; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + std::shared_ptr cache = + std::make_shared(base_cache); + + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = cache; + table_options.filter_policy.reset(NewBloomFilterPolicy(50)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create a new table. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // Normal access filter+index+data. 
+ ASSERT_EQ("value", Get("foo")); + + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + ASSERT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + + // Againt access filter+index+data, but force redundant load+insert on index + cache->SetNthLookupNotFound(2); + ASSERT_EQ("value", Get("bar")); + + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + ASSERT_EQ(4, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + + // Access just filter (with high probability), and force redundant + // load+insert + cache->SetNthLookupNotFound(1); + ASSERT_EQ("NOT_FOUND", Get("this key was not added")); + + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + EXPECT_EQ(5, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + EXPECT_EQ(1, TestGetTickerCount(options, 
BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + EXPECT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + + // Access just data, forcing redundant load+insert + ReadOptions read_options; + std::unique_ptr iter{db_->NewIterator(read_options)}; + cache->SetNthLookupNotFound(1); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), "bar"); + + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + EXPECT_EQ(6, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + EXPECT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + } + EXPECT_GE(iterations_tested, 1); +} + TEST_F(DBBlockCacheTest, ParanoidFileChecks) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -526,7 +1031,7 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = false; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "1_key", "val")); @@ -541,7 +1046,7 @@ // Create a new SST file. This will further trigger a compaction // and generate another file. 
ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -556,7 +1061,7 @@ ASSERT_OK(Put(1, "1_key4", "val4")); ASSERT_OK(Put(1, "9_key4", "val4")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); } @@ -631,7 +1136,7 @@ std::string str; for (int i = 0; i < num_iter; i++) { if (i % 4 == 0) { // high compression ratio - str = RandomString(&rnd, 1000); + str = rnd.RandomString(1000); } values.push_back(str); ASSERT_OK(Put(1, Key(i), values[i])); @@ -701,8 +1206,9 @@ Random rnd(301); for (auto compression_type : compression_types) { Options options = CurrentOptions(); - options.compression = compression_type; - options.compression_opts.max_dict_bytes = 4096; + options.bottommost_compression = compression_type; + options.bottommost_compression_opts.max_dict_bytes = 4096; + options.bottommost_compression_opts.enabled = true; options.create_if_missing = true; options.num_levels = 2; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); @@ -710,7 +1216,7 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.block_cache.reset(new MockCache()); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); RecordCacheCountersForCompressionDict(options); @@ -718,12 +1224,12 @@ for (int i = 0; i < kNumFiles; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); for (int j = 0; j < kNumEntriesPerFile; ++j) { - std::string value = RandomString(&rnd, kNumBytesPerEntry); + std::string value = rnd.RandomString(kNumBytesPerEntry); ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str())); } ASSERT_OK(Flush()); } - 
dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); @@ -750,8 +1256,628 @@ } } +static void ClearCache(Cache* cache) { + auto roles = CopyCacheDeleterRoleMap(); + std::deque keys; + Cache::ApplyToAllEntriesOptions opts; + auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/, + Cache::DeleterFn deleter) { + if (roles.find(deleter) == roles.end()) { + // Keep the stats collector + return; + } + keys.push_back(key.ToString()); + }; + cache->ApplyToAllEntries(callback, opts); + for (auto& k : keys) { + cache->Erase(k); + } +} + +TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { + const size_t capacity = size_t{1} << 25; + int iterations_tested = 0; + for (bool partition : {false, true}) { + for (std::shared_ptr cache : + {NewLRUCache(capacity), NewClockCache(capacity)}) { + if (!cache) { + // Skip clock cache when not supported + continue; + } + ++iterations_tested; + + Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.max_open_files = 13; + options.table_cache_numshardbits = 0; + // If this wakes up, it could interfere with test + options.stats_dump_period_sec = 0; + + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(50)); + if (partition) { + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + table_options.metadata_cache_options.top_level_index_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.partition_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.unpartitioned_pinning = + PinningTier::kNone; + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create a new table. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("zfoo", "value")); + ASSERT_OK(Put("zbar", "value")); + ASSERT_OK(Flush()); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + + // Fresh cache + ClearCache(cache.get()); + + std::array expected{}; + // For CacheEntryStatsCollector + expected[static_cast(CacheEntryRole::kMisc)] = 1; + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + std::array prev_expected = expected; + + // First access only filters + ASSERT_EQ("NOT_FOUND", Get("different from any key added")); + expected[static_cast(CacheEntryRole::kFilterBlock)] += 2; + if (partition) { + expected[static_cast(CacheEntryRole::kFilterMetaBlock)] += 2; + } + // Within some time window, we will get cached entry stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Not enough to force a miss + env_->MockSleepForSeconds(45); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + // Now access index and data block + ASSERT_EQ("value", Get("foo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Enough to force a miss + env_->MockSleepForSeconds(601); + // But inject a simulated long scan so that we need a longer + // interval to force a miss next time. 
+ SyncPoint::GetInstance()->SetCallBack( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", + [this](void*) { + // To spend no more than 0.2% of time scanning, we would need + // interval of at least 10000s + env_->MockSleepForSeconds(20); + }); + SyncPoint::GetInstance()->EnableProcessing(); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // The same for other file + ASSERT_EQ("value", Get("zfoo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Because of the simulated long scan, this is not enough to force + // a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // But this is enough + env_->MockSleepForSeconds(10000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + + // Also check the GetProperty interface + std::map values; + ASSERT_TRUE( + db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values)); + + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kIndexBlock)]), + values["count.index-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kDataBlock)]), + values["count.data-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kFilterBlock)]), + values["count.filter-block"]); + EXPECT_EQ( + ToString( + prev_expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + EXPECT_EQ(ToString(expected[static_cast(CacheEntryRole::kMisc)]), + values["count.misc"]); + + // Add one for kWriteBuffer + { + WriteBufferManager wbm(size_t{1} << 20, cache); + wbm.ReserveMem(1024); + expected[static_cast(CacheEntryRole::kWriteBuffer)]++; + // Now we check that the GetProperty interface is more agressive about + 
// re-scanning stats, but not totally aggressive. + // Within some time window, we will get cached entry stats + env_->MockSleepForSeconds(1); + EXPECT_EQ(ToString(prev_expected[static_cast( + CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + // Not enough for a "background" miss but enough for a "foreground" miss + env_->MockSleepForSeconds(45); + + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, + &values)); + EXPECT_EQ( + ToString( + expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + } + prev_expected = expected; + + // With collector pinned in cache, we should be able to hit + // even if the cache is full + ClearCache(cache.get()); + Cache::Handle* h = nullptr; + ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1, + GetNoopDeleterForRole(), + &h, Cache::Priority::HIGH)); + ASSERT_GT(cache->GetUsage(), cache->GetCapacity()); + expected = {}; + // For CacheEntryStatsCollector + expected[static_cast(CacheEntryRole::kMisc)] = 1; + // For Fill-it-up + expected[static_cast(CacheEntryRole::kMisc)]++; + // Still able to hit on saved stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(1000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + cache->Release(h); + + // Now we test that the DB mutex is not held during scans, for the ways + // we know how to (possibly) trigger them. Without a better good way to + // check this, we simply inject an acquire & release of the DB mutex + // deep in the stat collection code. If we were already holding the + // mutex, that is UB that would at least be found by TSAN. 
+ int scan_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", + [this, &scan_count](void*) { + dbfull()->TEST_LockMutex(); + dbfull()->TEST_UnlockMutex(); + ++scan_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Different things that might trigger a scan, with mock sleeps to + // force a miss. + env_->MockSleepForSeconds(10000); + dbfull()->DumpStats(); + ASSERT_EQ(scan_count, 1); + + env_->MockSleepForSeconds(10000); + ASSERT_TRUE( + db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values)); + ASSERT_EQ(scan_count, 2); + + env_->MockSleepForSeconds(10000); + std::string value_str; + ASSERT_TRUE( + db_->GetProperty(DB::Properties::kBlockCacheEntryStats, &value_str)); + ASSERT_EQ(scan_count, 3); + + env_->MockSleepForSeconds(10000); + ASSERT_TRUE(db_->GetProperty(DB::Properties::kCFStats, &value_str)); + // To match historical speed, querying this property no longer triggers + // a scan, even if results are old. But periodic dump stats should keep + // things reasonably updated. + ASSERT_EQ(scan_count, /*unchanged*/ 3); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + EXPECT_GE(iterations_tested, 1); + } +} + #endif // ROCKSDB_LITE +class DBBlockCacheKeyTest + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBBlockCacheKeyTest() + : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {} + + void SetUp() override { + use_compressed_cache_ = std::get<0>(GetParam()); + exclude_file_numbers_ = std::get<1>(GetParam()); + } + + bool use_compressed_cache_; + bool exclude_file_numbers_; +}; + +// Disable LinkFile so that we can physically copy a DB using Checkpoint. +// Disable file GetUniqueId to enable stable cache keys. 
+class StableCacheKeyTestFS : public FaultInjectionTestFS { + public: + explicit StableCacheKeyTestFS(const std::shared_ptr& base) + : FaultInjectionTestFS(base) { + SetFailGetUniqueId(true); + } + + virtual ~StableCacheKeyTestFS() override {} + + IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&, + IODebugContext*) override { + return IOStatus::NotSupported("Disabled"); + } +}; + +TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { + std::shared_ptr test_fs{ + new StableCacheKeyTestFS(env_->GetFileSystem())}; + std::unique_ptr test_env{ + new CompositeEnvWrapper(env_, test_fs)}; + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.env = test_env.get(); + + BlockBasedTableOptions table_options; + + int key_count = 0; + uint64_t expected_stat = 0; + + std::function verify_stats; + if (use_compressed_cache_) { + if (!Snappy_Supported()) { + ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support"); + return; + } + options.compression = CompressionType::kSnappyCompression; + table_options.no_block_cache = true; + table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false); + verify_stats = [&options, &expected_stat] { + // One for ordinary SST file and one for external SST file + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD)); + }; + } else { + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + verify_stats = [&options, &expected_stat] { + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + }; + } + + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"koko"}, options); + + if (exclude_file_numbers_) { + // Simulate something like old behavior without file numbers in properties. + // This is a "control" side of the test that also ensures safely degraded + // behavior on old files. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", + [&](void* arg) { + TableProperties* props = reinterpret_cast(arg); + props->orig_file_number = 0; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + std::function perform_gets = [&key_count, &expected_stat, this]() { + if (exclude_file_numbers_) { + // No cache key reuse should happen, because we can't rely on current + // file number being stable + expected_stat += key_count; + } else { + // Cache keys should be stable + expected_stat = key_count; + } + for (int i = 0; i < key_count; ++i) { + ASSERT_EQ(Get(1, Key(i)), "abc"); + } + }; + + // Ordinary SST files with same session id + const std::string something_compressible(500U, 'x'); + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put(1, Key(key_count), "abc")); + ASSERT_OK(Put(1, Key(key_count) + "a", something_compressible)); + ASSERT_OK(Flush(1)); + ++key_count; + } + +#ifndef ROCKSDB_LITE + // Save an export of those ordinary SST files for later + std::string export_files_dir = dbname_ + "/exported"; + ExportImportFilesMetaData* metadata_ptr_ = nullptr; + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + checkpoint = nullptr; + + // External SST files with same session id + SstFileWriter sst_file_writer(EnvOptions(), options); + std::vector external; + for (int i = 0; i < 2; ++i) { + std::string f = dbname_ + "/external" + ToString(i) + ".sst"; + 
external.push_back(f); + ASSERT_OK(sst_file_writer.Open(f)); + ASSERT_OK(sst_file_writer.Put(Key(key_count), "abc")); + ASSERT_OK( + sst_file_writer.Put(Key(key_count) + "a", something_compressible)); + ++key_count; + ExternalSstFileInfo external_info; + ASSERT_OK(sst_file_writer.Finish(&external_info)); + IngestExternalFileOptions ingest_opts; + ASSERT_OK(db_->IngestExternalFile(handles_[1], {f}, ingest_opts)); + } + + if (exclude_file_numbers_) { + // FIXME(peterd): figure out where these extra ADDs are coming from + options.statistics->recordTick(BLOCK_CACHE_COMPRESSED_ADD, + uint64_t{0} - uint64_t{2}); + } +#endif + + perform_gets(); + verify_stats(); + + // Make sure we can cache hit after re-open + ReopenWithColumnFamilies({"default", "koko"}, options); + + perform_gets(); + verify_stats(); + + // Make sure we can cache hit even on a full copy of the DB. Using + // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link. + // (Checkpoint not available in LITE mode to test this.) 
+#ifndef ROCKSDB_LITE + auto db_copy_name = dbname_ + "-copy"; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name)); + delete checkpoint; + + Close(); + Destroy(options); + + // Switch to the DB copy + SaveAndRestore save_dbname(&dbname_, db_copy_name); + ReopenWithColumnFamilies({"default", "koko"}, options); + + perform_gets(); + verify_stats(); + + // And ensure that re-importing + ingesting the same files into a + // different DB uses same cache keys + DestroyAndReopen(options); + + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + delete cfh; + cfh = nullptr; + delete metadata_ptr_; + metadata_ptr_ = nullptr; + + DestroyDB(export_files_dir, options); + + ReopenWithColumnFamilies({"default", "yoyo"}, options); + + IngestExternalFileOptions ingest_opts; + ASSERT_OK(db_->IngestExternalFile(handles_[1], {external}, ingest_opts)); + + perform_gets(); + verify_stats(); +#endif // !ROCKSDB_LITE + + Close(); + Destroy(options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + +class DBBlockCachePinningTest + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBBlockCachePinningTest() + : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {} + + void SetUp() override { + partition_index_and_filters_ = std::get<0>(GetParam()); + top_level_index_pinning_ = std::get<1>(GetParam()); + partition_pinning_ = std::get<2>(GetParam()); + unpartitioned_pinning_ = std::get<3>(GetParam()); + } + + bool partition_index_and_filters_; + PinningTier top_level_index_pinning_; + PinningTier partition_pinning_; + PinningTier unpartitioned_pinning_; +}; + +TEST_P(DBBlockCachePinningTest, 
TwoLevelDB) { + // Creates one file in L0 and one file in L1. Both files have enough data that + // their index and filter blocks are partitioned. The L1 file will also have + // a compression dictionary (those are trained only during compaction), which + // must be unpartitioned. + const int kKeySize = 32; + const int kBlockSize = 128; + const int kNumBlocksPerFile = 128; + const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize; + + Options options = CurrentOptions(); + // `kNoCompression` makes the unit test more portable. But it relies on the + // current behavior of persisting/accessing dictionary even when there's no + // (de)compression happening, which seems fairly likely to change over time. + options.compression = kNoCompression; + options.compression_opts.max_dict_bytes = 4 << 10; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 20 /* capacity */); + table_options.block_size = kBlockSize; + table_options.metadata_block_size = kBlockSize; + table_options.cache_index_and_filter_blocks = true; + table_options.metadata_cache_options.top_level_index_pinning = + top_level_index_pinning_; + table_options.metadata_cache_options.partition_pinning = partition_pinning_; + table_options.metadata_cache_options.unpartitioned_pinning = + unpartitioned_pinning_; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10 /* bits_per_key */)); + if (partition_index_and_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize))); + } + ASSERT_OK(Flush()); + if (i == 0) { + // Prevent trivial move so file will 
be rewritten with dictionary and + // reopened with L1's pinning settings. + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + } + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. + table_options.block_cache->EraseUnRefEntries(); + + // Get base cache values + uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t compression_dict_misses = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + + // Read a key from the L0 file + Get(Key(kNumKeysPerFile)); + uint64_t expected_filter_misses = filter_misses; + uint64_t expected_index_misses = index_misses; + uint64_t expected_compression_dict_misses = compression_dict_misses; + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. 
+ table_options.block_cache->EraseUnRefEntries(); + + // Read a key from the L1 file + Get(Key(0)); + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone || + top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone || + partition_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); +} + +INSTANTIATE_TEST_CASE_P( + DBBlockCachePinningTest, DBBlockCachePinningTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,19 @@ // 
Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" #include "db/db_test_util.h" +#include "options/options_helper.h" #include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/perf_context.h" #include "table/block_based/filter_policy_internal.h" +#include "test_util/testutil.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -22,7 +31,8 @@ class DBBloomFilterTest : public DBTestBase { public: - DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {} + DBBloomFilterTest() + : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {} }; class DBBloomFilterTestWithParam : public DBTestBase, @@ -35,7 +45,8 @@ uint32_t format_version_; public: - DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {} + DBBloomFilterTestWithParam() + : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {} ~DBBloomFilterTestWithParam() override {} @@ -80,13 +91,16 @@ options_override.partition_filters = partition_filters_; options_override.metadata_block_size = 32; Options options = CurrentOptions(options_override); - if (partition_filters_ && - static_cast( - options.table_factory->GetOptions()) - ->index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) { - // In the current implementation partitioned filters depend on partitioned - // indexes - continue; + if (partition_filters_) { + auto* table_options = + options.table_factory->GetOptions(); + if (table_options != nullptr && + table_options->index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // In the current implementation partitioned filters depend on + // partitioned indexes + continue; + } } options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); @@ -122,8 +136,8 @@ ASSERT_EQ(cache_added, 
TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); @@ -172,7 +186,7 @@ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -238,7 +252,7 @@ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -291,7 +305,7 @@ // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foo")); @@ -322,7 +336,7 @@ // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Reopen with both of whole key off and prefix extractor enabled. // Still no bloom filter should be used. @@ -345,7 +359,7 @@ // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); options.prefix_extractor.reset(); bbto.whole_key_filtering = true; @@ -358,7 +372,7 @@ // not filtered out by key ranges. 
ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - Flush(); + ASSERT_OK(Flush()); // Now we have two files: // File 1: An older file with prefix bloom. @@ -461,7 +475,7 @@ for (int i = 0; i < N; i += 100) { ASSERT_OK(Put(1, Key(i), Key(i))); } - Flush(1); + ASSERT_OK(Flush(1)); // Prevent auto compactions triggered by seeks env_->delay_sstable_sync_.store(true, std::memory_order_release); @@ -497,36 +511,50 @@ ASSERT_LE(reads, 3 * N / 100); } +#ifndef ROCKSDB_LITE + // Sanity check some table properties + std::map props; + ASSERT_TRUE(db_->GetMapProperty( + handles_[1], DB::Properties::kAggregatedTableProperties, &props)); + uint64_t nkeys = N + N / 100; + uint64_t filter_size = ParseUint64(props["filter_size"]); + EXPECT_LE(filter_size, + (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ 8); + EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8); + + uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); + EXPECT_EQ(num_filter_entries, nkeys); +#endif // ROCKSDB_LITE + env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestDefFormatVersion, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); 
+ std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, ::testing::Values( - std::make_tuple(BFP::kDeprecatedBlock, false, - test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion))); -#endif // ROCKSDB_VALGRIND_RUN + std::make_tuple(BFP::kDeprecatedBlock, false, kLatestFormatVersion), + std::make_tuple(BFP::kAutoBloom, true, kLatestFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, kLatestFormatVersion))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBBloomFilterTest, BloomFilterRate) { while (ChangeFilterOptions()) { @@ -641,6 +669,439 @@ } } +/* + * A cache wrapper that tracks peaks and increments of filter + * construction cache reservation. + * p0 + * / \ p1 + * / \ /\ + * / \/ \ + * a / b \ + * peaks = {p0, p1} + * increments = {p1-a, p2-b} + */ +class FilterConstructResPeakTrackingCache : public CacheWrapper { + public: + explicit FilterConstructResPeakTrackingCache(std::shared_ptr target) + : CacheWrapper(std::move(target)), + cur_cache_res_(0), + cache_res_peak_(0), + cache_res_increment_(0), + last_peak_tracked_(false), + cache_res_increments_sum_(0) {} + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + Status s = target_->Insert(key, value, charge, deleter, handle, priority); + if (deleter == kNoopDeleterForFilterConstruction) { + if (last_peak_tracked_) { + cache_res_peak_ = 0; + cache_res_increment_ = 0; + last_peak_tracked_ = false; + } + cur_cache_res_ += charge; + cache_res_peak_ = std::max(cache_res_peak_, cur_cache_res_); + cache_res_increment_ += charge; + } + return s; + } + + 
using Cache::Release; + bool Release(Handle* handle, bool force_erase = false) override { + auto deleter = GetDeleter(handle); + if (deleter == kNoopDeleterForFilterConstruction) { + if (!last_peak_tracked_) { + cache_res_peaks_.push_back(cache_res_peak_); + cache_res_increments_sum_ += cache_res_increment_; + last_peak_tracked_ = true; + } + cur_cache_res_ -= GetCharge(handle); + } + bool is_successful = target_->Release(handle, force_erase); + return is_successful; + } + + std::deque GetReservedCachePeaks() { return cache_res_peaks_; } + + std::size_t GetReservedCacheIncrementSum() { + return cache_res_increments_sum_; + } + + private: + static const Cache::DeleterFn kNoopDeleterForFilterConstruction; + + std::size_t cur_cache_res_; + std::size_t cache_res_peak_; + std::size_t cache_res_increment_; + bool last_peak_tracked_; + std::deque cache_res_peaks_; + std::size_t cache_res_increments_sum_; +}; + +const Cache::DeleterFn + FilterConstructResPeakTrackingCache::kNoopDeleterForFilterConstruction = + CacheReservationManager::TEST_GetNoopDeleterForRole< + CacheEntryRole::kFilterConstruction>(); + +// To align with the type of hash entry being reserved in implementation. +using FilterConstructionReserveMemoryHash = uint64_t; + +class DBFilterConstructionReserveMemoryTestWithParam + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBFilterConstructionReserveMemoryTestWithParam() + : DBTestBase("db_bloom_filter_tests", + /*env_do_fsync=*/true), + num_key_(0), + reserve_table_builder_memory_(std::get<0>(GetParam())), + policy_(std::get<1>(GetParam())), + partition_filters_(std::get<2>(GetParam())) { + if (!reserve_table_builder_memory_ || + policy_ == BloomFilterPolicy::Mode::kDeprecatedBlock || + policy_ == BloomFilterPolicy::Mode::kLegacyBloom) { + // For these cases, we only interested in whether filter construction + // cache resevation happens instead of its accuracy. Therefore we don't + // need many keys. 
+ num_key_ = 5; + } else if (partition_filters_) { + // For PartitionFilter case, since we set + // table_options.metadata_block_size big enough such that each partition + // trigger at least 1 dummy entry reservation each for hash entries and + // final filter, we need a large number of keys to ensure we have at least + // two partitions. + num_key_ = 18 * CacheReservationManager::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + } else if (policy_ == BloomFilterPolicy::Mode::kFastLocalBloom) { + // For Bloom Filter + FullFilter case, since we design the num_key_ to + // make hash entry cache reservation be a multiple of dummy entries, the + // correct behavior of charging final filter on top of it will trigger at + // least another dummy entry insertion. Therefore we can assert that + // behavior and we don't need a large number of keys to verify we + // indeed charge the final filter for cache reservation, even though final + // filter is a lot smaller than hash entries. + num_key_ = 1 * CacheReservationManager::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + } else { + // For Ribbon Filter + FullFilter case, we need a large enough number of + // keys so that charging final filter after releasing the hash entries + // reservation will trigger at least another dummy entry (or equivalently + // to saying, causing another peak in cache reservation) as banding + // reservation might not be a multiple of dummy entry. + num_key_ = 12 * CacheReservationManager::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + } + } + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions table_options; + + // We set cache capacity big enough to prevent cache full for convenience in + // calculation. 
+ constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024; + + table_options.reserve_table_builder_memory = reserve_table_builder_memory_; + table_options.filter_policy.reset(new BloomFilterPolicy(10, policy_)); + table_options.partition_filters = partition_filters_; + if (table_options.partition_filters) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + // We set table_options.metadata_block_size big enough so that each + // partition trigger at least 1 dummy entry insertion each for hash + // entries and final filter. + table_options.metadata_block_size = 409000; + } + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + cache_ = std::make_shared( + (NewLRUCache(lo))); + table_options.block_cache = cache_; + + return table_options; + } + + std::size_t GetNumKey() { return num_key_; } + + bool ReserveTableBuilderMemory() { return reserve_table_builder_memory_; } + + BloomFilterPolicy::Mode GetFilterPolicy() { return policy_; } + + bool PartitionFilters() { return partition_filters_; } + + std::shared_ptr + GetFilterConstructResPeakTrackingCache() { + return cache_; + } + + private: + std::size_t num_key_; + bool reserve_table_builder_memory_; + BloomFilterPolicy::Mode policy_; + bool partition_filters_; + std::shared_ptr cache_; +}; + +INSTANTIATE_TEST_CASE_P( + BlockBasedTableOptions, DBFilterConstructionReserveMemoryTestWithParam, + ::testing::Values( + std::make_tuple(false, BloomFilterPolicy::Mode::kFastLocalBloom, false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, true), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + false), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + true), + std::make_tuple(true, BloomFilterPolicy::Mode::kDeprecatedBlock, false), + std::make_tuple(true, 
BloomFilterPolicy::Mode::kLegacyBloom, false))); + +// TODO: Speed up this test. +// The current test inserts many keys (on the scale of dummy entry size) +// in order to make small memory user (e.g, final filter, partitioned hash +// entries/filter/banding) , which is proportional to the number of +// keys, big enough so that its cache reservation triggers dummy entry insertion +// and becomes observable in the test. +// +// However, inserting that many keys slows down this test and leaves future +// developers an opportunity to speed it up. +// +// Possible approaches & challenges: +// 1. Use sync point during cache reservation of filter construction +// +// Benefit: It does not rely on triggering dummy entry insertion +// but the sync point to verify small memory user is charged correctly. +// +// Challenge: this approach is intrusive. +// +// 2. Make dummy entry size configurable and set it small in the test +// +// Benefit: It increases the precision of cache reservation and therefore +// small memory usage can still trigger insertion of dummy entry. +// +// Challenge: change CacheReservationManager related APIs and a hack +// might be needed to control the size of dummmy entry of +// CacheReservationManager used in filter construction for testing +// since CacheReservationManager is not exposed at the high level. 
+// +TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { + Options options = CurrentOptions(); + // We set write_buffer_size big enough so that in the case where there is + // filter construction cache reservation, flush won't be triggered before we + // manually trigger it for clean testing + options.write_buffer_size = 640 << 20; + options.table_factory.reset( + NewBlockBasedTableFactory(GetBlockBasedTableOptions())); + std::shared_ptr cache = + GetFilterConstructResPeakTrackingCache(); + options.create_if_missing = true; + // Disable auto compaction to prevent its unexpected side effect + // to the number of keys per partition designed by us in the test + options.disable_auto_compactions = true; + DestroyAndReopen(options); + int num_key = static_cast(GetNumKey()); + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + ASSERT_EQ(cache->GetReservedCacheIncrementSum(), 0) + << "Flush was triggered too early in the test case with filter " + "construction cache reservation - please make sure no flush triggered " + "during the key insertions above"; + + ASSERT_OK(Flush()); + + bool reserve_table_builder_memory = ReserveTableBuilderMemory(); + BloomFilterPolicy::Mode policy = GetFilterPolicy(); + bool partition_filters = PartitionFilters(); + + std::deque filter_construction_cache_res_peaks = + cache->GetReservedCachePeaks(); + std::size_t filter_construction_cache_res_increments_sum = + cache->GetReservedCacheIncrementSum(); + + if (!reserve_table_builder_memory) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0); + return; + } + + if (policy == BloomFilterPolicy::Mode::kDeprecatedBlock || + policy == BloomFilterPolicy::Mode::kLegacyBloom) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0) + << "There shouldn't be filter construction cache reservation as this " + "feature does not support BloomFilterPolicy::Mode::kDeprecatedBlock " + "nor BloomFilterPolicy::Mode::kLegacyBloom"; + return; + } + + const 
std::size_t kDummyEntrySize = + CacheReservationManager::GetDummyEntrySize(); + + const std::size_t predicted_hash_entries_cache_res = + num_key * sizeof(FilterConstructionReserveMemoryHash); + ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0) + << "It's by this test's design that predicted_hash_entries_cache_res is " + "a multipe of dummy entry"; + + const std::size_t predicted_hash_entries_cache_res_dummy_entry_num = + predicted_hash_entries_cache_res / kDummyEntrySize; + const std::size_t predicted_final_filter_cache_res = + static_cast(std::ceil( + 1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 * + (policy == BloomFilterPolicy::Mode::kStandard128Ribbon ? 0.7 : 1))) * + kDummyEntrySize; + const std::size_t predicted_banding_cache_res = + static_cast( + std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) * + kDummyEntrySize; + + if (policy == BloomFilterPolicy::Mode::kFastLocalBloom) { + /* BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter + * p0 + * / \ + * b / \ + * / \ + * / \ + * 0/ \ + * hash entries = b - 0, final filter = p0 - b + * p0 = hash entries + final filter + * + * The test is designed in a way such that the reservation for b is a + * multiple of dummy entries so that reservation for (p0 - b) + * will trigger at least another dummy entry insertion. 
+ * + * BloomFilterPolicy::Mode::kFastLocalBloom + PartitionedFilter + * p1 + * / \ + * p0 b'/ \ + * / \ / \ + * b / \ / \ + * / \ / \ + * / a \ + * 0/ \ + * partitioned hash entries1 = b - 0, partitioned hash entries1 = b' - a + * parittioned final filter1 = p0 - b, parittioned final filter2 = p1 - b' + * + * (increment p0 - 0) + (increment p1 - a) + * = partitioned hash entries1 + partitioned hash entries2 + * + parittioned final filter1 + parittioned final filter2 + * = hash entries + final filter + * + */ + if (!partition_filters) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1) + << "Filter construction cache reservation should have only 1 peak in " + "case: BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter"; + std::size_t filter_construction_cache_res_peak = + filter_construction_cache_res_peaks[0]; + EXPECT_GT(filter_construction_cache_res_peak, + predicted_hash_entries_cache_res) + << "The testing number of hash entries is designed to make hash " + "entries cache reservation be multiples of dummy entries" + " so the correct behavior of charging final filter on top of it" + " should've triggered at least another dummy entry insertion"; + + std::size_t predicted_filter_construction_cache_res_peak = + predicted_hash_entries_cache_res + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 0.9); + EXPECT_LE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 1.1); + return; + } else { + EXPECT_GE(filter_construction_cache_res_peaks.size(), 2) + << "Filter construction cache reservation should have multiple peaks " + "in case: BloomFilterPolicy::Mode::kFastLocalBloom + " + "PartitionedFilter"; + std::size_t predicted_filter_construction_cache_res_increments_sum = + predicted_hash_entries_cache_res + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_increments_sum, + 
predicted_filter_construction_cache_res_increments_sum * 0.9); + EXPECT_LE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 1.1); + return; + } + } + + if (policy == BloomFilterPolicy::Mode::kStandard128Ribbon) { + /* BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter + * p0 + * / \ p1 + * / \/\ + * b / b' \ + * / \ + * 0/ \ + * hash entries = b - 0, banding = p0 - b, final filter = p1 - b' + * p0 = hash entries + banding + * + * The test is designed in a way such that the reservation for (p1 - b') + * will trigger at least another dummy entry insertion + * (or equivelantly to saying, creating another peak). + * + * BloomFilterPolicy::Mode::kStandard128Ribbon + PartitionedFilter + * p3 + * p0 /\ p4 + * / \ p1 / \ /\ + * / \/\ b''/ a' \ + * b / b' \ / \ + * / \ / \ + * 0/ a \ + * partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a + * partitioned banding1 = p0 - b, partitioned banding2 = p3 - b'' + * parittioned final filter1 = p1 - b',parittioned final filter2 = p4 - a' + * + * (increment p0 - 0) + (increment p1 - b') + * + (increment p3 - a) + (increment p4 - a') + * = partitioned hash entries1 + partitioned hash entries2 + * + parittioned banding1 + parittioned banding2 + * + parittioned final filter1 + parittioned final filter2 + * = hash entries + banding + final filter + */ + if (!partition_filters) { + ASSERT_GE(std::floor(1.0 * predicted_final_filter_cache_res / + CacheReservationManager::GetDummyEntrySize()), + 1) + << "Final filter cache reservation too small for this test - please " + "increase the number of keys"; + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2) + << "Filter construction cache reservation should have 2 peaks in " + "case: BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter. 
" + "The second peak is resulted from charging the final filter after " + "decreasing the hash entry reservation since the testing final " + "filter reservation is designed to be at least 1 dummy entry size"; + + std::size_t filter_construction_cache_res_peak = + filter_construction_cache_res_peaks[0]; + std::size_t predicted_filter_construction_cache_res_peak = + predicted_hash_entries_cache_res + predicted_banding_cache_res; + EXPECT_GE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 0.9); + EXPECT_LE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 1.1); + return; + } else { + EXPECT_GE(filter_construction_cache_res_peaks.size(), 3) + << "Filter construction cache reservation should have more than 3 " + "peaks " + "in case: BloomFilterPolicy::Mode::kStandard128Ribbon + " + "PartitionedFilter"; + std::size_t predicted_filter_construction_cache_res_increments_sum = + predicted_hash_entries_cache_res + predicted_banding_cache_res + + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 0.9); + EXPECT_LE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 1.1); + return; + } + } +} + namespace { // A wrapped bloom over block-based FilterPolicy class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy { @@ -765,6 +1226,14 @@ const std::unique_ptr policy_otherwise_; }; +static std::map + table_file_creation_reason_to_string{ + {TableFileCreationReason::kCompaction, "kCompaction"}, + {TableFileCreationReason::kFlush, "kFlush"}, + {TableFileCreationReason::kMisc, "kMisc"}, + {TableFileCreationReason::kRecovery, "kRecovery"}, + }; + class TestingContextCustomFilterPolicy : public LevelAndStyleCustomFilterPolicy { public: @@ -777,11 +1246,17 @@ const FilterBuildingContext& context) const override { test_report_ += "cf="; test_report_ += 
context.column_family_name; - test_report_ += ",cs="; + test_report_ += ",s="; test_report_ += OptionsHelper::compaction_style_to_string[context.compaction_style]; - test_report_ += ",lv="; - test_report_ += std::to_string(context.level_at_creation); + test_report_ += ",n="; + test_report_ += ToString(context.num_levels); + test_report_ += ",l="; + test_report_ += ToString(context.level_at_creation); + test_report_ += ",b="; + test_report_ += ToString(int{context.is_bottommost}); + test_report_ += ",r="; + test_report_ += table_file_creation_reason_to_string[context.reason]; test_report_ += "\n"; return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); @@ -799,18 +1274,21 @@ } // namespace TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { + auto policy = std::make_shared(15, 8, 5); + Options options; for (bool fifo : {true, false}) { - Options options = CurrentOptions(); + options = CurrentOptions(); + options.max_open_files = fifo ? -1 : options.max_open_files; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.compaction_style = fifo ? kCompactionStyleFIFO : kCompactionStyleLevel; BlockBasedTableOptions table_options; - auto policy = std::make_shared(15, 8, 5); table_options.filter_policy = policy; table_options.format_version = 5; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + TryReopen(options); CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); const int maxKey = 10000; @@ -821,16 +1299,16 @@ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); for (int i = maxKey / 2; i < maxKey; i++) { ASSERT_OK(Put(1, Key(i), Key(i))); } Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? 
"cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); // Check that they can be found for (int i = 0; i < maxKey; i++) { @@ -858,7 +1336,7 @@ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); EXPECT_EQ(policy->DumpTestReport(), - "cf=bob,cs=kCompactionStyleLevel,lv=1\n"); + "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n"); // Check that we now have one filter, about 9.2% FP rate (5 bits per key) for (int i = 0; i < maxKey; i++) { @@ -870,11 +1348,25 @@ EXPECT_GE(useful_count, maxKey * 0.90); EXPECT_LE(useful_count, maxKey * 0.91); } + } else { +#ifndef ROCKSDB_LITE + // Also try external SST file + { + std::string file_path = dbname_ + "/external.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("key", "value")); + ASSERT_OK(sst_file_writer.Finish()); + } + // Note: kCompactionStyleLevel is default, ignored if num_levels == -1 + EXPECT_EQ(policy->DumpTestReport(), + "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n"); +#endif } // Destroy ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; } } @@ -1010,6 +1502,63 @@ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); } +TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { + Options options = CurrentOptions(); + options.memtable_prefix_bloom_size_ratio = 0.015; + options.memtable_whole_key_filtering = true; + Reopen(options); + std::string key1("AA"); + std::string key2("BB"); + std::string key3("CC"); + std::string key4("DD"); + std::string key_not("EE"); + std::string value1("Value1"); + std::string value2("Value2"); + std::string value3("Value3"); + 
std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key2, value2, WriteOptions())); + ASSERT_OK(Flush()); + ASSERT_OK(Put(key3, value3, WriteOptions())); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put(key4, value4, WriteOptions())); + + // Delete key2 and key3 + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ")); + + // Read without snapshot + auto results = MultiGet({key_not, key1, key2, key3, key4}); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], "NOT_FOUND"); + ASSERT_EQ(results[3], "NOT_FOUND"); + ASSERT_EQ(results[4], value4); + + // Also check Get + ASSERT_EQ(Get(key1), value1); + ASSERT_EQ(Get(key2), "NOT_FOUND"); + ASSERT_EQ(Get(key3), "NOT_FOUND"); + ASSERT_EQ(Get(key4), value4); + + // Read with snapshot + results = MultiGet({key_not, key1, key2, key3, key4}, snapshot); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], value2); + ASSERT_EQ(results[3], value3); + ASSERT_EQ(results[4], "NOT_FOUND"); + + // Also check Get + ASSERT_EQ(Get(key1, snapshot), value1); + ASSERT_EQ(Get(key2, snapshot), value2); + ASSERT_EQ(Get(key3, snapshot), value3); + ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND"); + + db_->ReleaseSnapshot(snapshot); +} + TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { constexpr size_t kPrefixSize = 8; const std::string kKey = "key"; @@ -1029,6 +1578,215 @@ ASSERT_EQ(kKey, iter->key()); } +class DBBloomFilterTestVaryPrefixAndFormatVer + : public DBTestBase, + public testing::WithParamInterface> { + protected: + bool use_prefix_; + uint32_t format_version_; + + public: + DBBloomFilterTestVaryPrefixAndFormatVer() + : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {} + + ~DBBloomFilterTestVaryPrefixAndFormatVer() override {} + + void SetUp() override { + use_prefix_ = std::get<0>(GetParam()); + format_version_ = std::get<1>(GetParam()); + } 
+ + static std::string UKey(uint32_t i) { return Key(static_cast(i)); } +}; + +TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { + Options options = CurrentOptions(); + if (use_prefix_) { + // Entire key from UKey() + options.prefix_extractor.reset(NewCappedPrefixTransform(9)); + } + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(20)); + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.whole_key_filtering = !use_prefix_; + if (use_prefix_) { // (not related to prefix, just alternating between) + // Make sure code appropriately deals with metadata block size setting + // that is "too small" (smaller than minimum size for filter builder) + bbto.metadata_block_size = 63; + } else { + // Make sure the test will work even on platforms with large minimum + // filter size, due to large cache line size. + // (Largest cache line size + 10+% overhead.) 
+ bbto.metadata_block_size = 290; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + ReadOptions ropts; + + constexpr uint32_t N = 12000; + // Add N/2 evens + for (uint32_t i = 0; i < N; i += 2) { + ASSERT_OK(Put(UKey(i), UKey(i))); + } + ASSERT_OK(Flush()); +#ifndef ROCKSDB_LITE + ASSERT_EQ(TotalTableFiles(), 1); +#endif + + constexpr uint32_t Q = 29; + // MultiGet In + std::array keys; + std::array key_slices; + std::array column_families; + // MultiGet Out + std::array statuses; + std::array values; + + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL); + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED); + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE); + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE); + + // Check that initial clump of keys only loads one partition filter from + // block cache. + // And that spread out keys load many partition filters. + // In both cases, mix present vs. not present keys. + for (uint32_t stride : {uint32_t{1}, (N / Q) | 1}) { + for (uint32_t i = 0; i < Q; ++i) { + keys[i] = UKey(i * stride); + key_slices[i] = Slice(keys[i]); + column_families[i] = db_->DefaultColumnFamily(); + statuses[i] = Status(); + values[i] = PinnableSlice(); + } + + db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], + /*timestamps=*/nullptr, &statuses[0], true); + + // Confirm correct status results + uint32_t number_not_found = 0; + for (uint32_t i = 0; i < Q; ++i) { + if ((i * stride % 2) == 0) { + ASSERT_OK(statuses[i]); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + ++number_not_found; + } + } + + // Confirm correct Bloom stats (no FPs) + uint64_t filter_useful = TestGetAndResetTickerCount( + options, + use_prefix_ ? 
BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); + uint64_t filter_checked = + TestGetAndResetTickerCount(options, use_prefix_ + ? BLOOM_FILTER_PREFIX_CHECKED + : BLOOM_FILTER_FULL_POSITIVE) + + (use_prefix_ ? 0 : filter_useful); + EXPECT_EQ(filter_useful, number_not_found); + EXPECT_EQ(filter_checked, Q); + if (!use_prefix_) { + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + Q - number_not_found); + } + + // Confirm no duplicate loading same filter partition + uint64_t filter_accesses = + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + if (stride == 1) { + EXPECT_EQ(filter_accesses, 1); + } else { + // for large stride + EXPECT_GE(filter_accesses, Q / 2 + 1); + } + } + + // Check that a clump of keys (present and not) works when spanning + // two partitions + int found_spanning = 0; + for (uint32_t start = 0; start < N / 2;) { + for (uint32_t i = 0; i < Q; ++i) { + keys[i] = UKey(start + i); + key_slices[i] = Slice(keys[i]); + column_families[i] = db_->DefaultColumnFamily(); + statuses[i] = Status(); + values[i] = PinnableSlice(); + } + + db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], + /*timestamps=*/nullptr, &statuses[0], true); + + // Confirm correct status results + uint32_t number_not_found = 0; + for (uint32_t i = 0; i < Q; ++i) { + if (((start + i) % 2) == 0) { + ASSERT_OK(statuses[i]); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + ++number_not_found; + } + } + + // Confirm correct Bloom stats (might see some FPs) + uint64_t filter_useful = TestGetAndResetTickerCount( + options, + use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); + uint64_t filter_checked = + TestGetAndResetTickerCount(options, use_prefix_ + ? BLOOM_FILTER_PREFIX_CHECKED + : BLOOM_FILTER_FULL_POSITIVE) + + (use_prefix_ ? 
0 : filter_useful); + EXPECT_GE(filter_useful, number_not_found - 2); // possible FP + EXPECT_EQ(filter_checked, Q); + if (!use_prefix_) { + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + Q - number_not_found); + } + + // Confirm no duplicate loading of same filter partition + uint64_t filter_accesses = + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + if (filter_accesses == 2) { + // Spanned across partitions. + ++found_spanning; + if (found_spanning >= 2) { + break; + } else { + // Ensure that at least once we have at least one present and + // one non-present key on both sides of partition boundary. + start += 2; + } + } else { + EXPECT_EQ(filter_accesses, 1); + // See explanation at "start += 2" + start += Q - 4; + } + } + EXPECT_TRUE(found_spanning >= 2); +} + +INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer, + DBBloomFilterTestVaryPrefixAndFormatVer, + ::testing::Values( + // (use_prefix, format_version) + std::make_tuple(false, 2), + std::make_tuple(false, 3), + std::make_tuple(false, 4), + std::make_tuple(false, 5), + std::make_tuple(true, 2), + std::make_tuple(true, 3), + std::make_tuple(true, 4), + std::make_tuple(true, 5))); + #ifndef ROCKSDB_LITE namespace { namespace BFP2 { @@ -1229,9 +1987,9 @@ snprintf(buf, sizeof(buf), "%02d______:end", 10); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, - nullptr); // move to level 1 + ASSERT_OK(dbtest->Flush()); + ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr)); // move to level 1 // GROUP 1 for (int i = 1; i <= small_range_sstfiles; i++) { @@ -1343,27 +2101,26 @@ for (int i = 0; i < numkeys; i += 2) { keys.push_back(i); } - std::random_shuffle(std::begin(keys), std::end(keys)); - + RandomShuffle(std::begin(keys), std::end(keys)); int 
num_inserted = 0; for (int key : keys) { ASSERT_OK(Put(1, Key(key), "val")); if (++num_inserted % 1000 == 0) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0, 1) == 0) { // No Level 0 file. Create one. ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } for (int i = 1; i < numkeys; i += 2) { @@ -1468,7 +2225,8 @@ BottommostLevelCompaction::kSkip; compact_options.change_level = true; compact_options.target_level = 7; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) + .IsNotSupported()); ASSERT_EQ(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); @@ -1500,10 +2258,10 @@ int CountIter(std::unique_ptr& iter, const Slice& key) { int count = 0; - for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); - iter->Next()) { + for (iter->Seek(key); iter->Valid(); iter->Next()) { count++; } + EXPECT_OK(iter->status()); return count; } @@ -1516,6 +2274,7 @@ int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; + options.env = CurrentOptions().env; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); @@ -1532,7 +2291,7 @@ ASSERT_OK(Put("abcdxxx1", "val2")); ASSERT_OK(Put("abcdxxx2", "val3")); ASSERT_OK(Put("abcdxxx3", "val4")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // prefix_extractor has not changed, BF will always be read 
Slice upper_bound("abce"); @@ -1553,8 +2312,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.FixedPrefix.5")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.FixedPrefix.5"); { // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read Slice upper_bound("abce"); @@ -1646,6 +2405,7 @@ for (auto bfp_impl : BFP::kAllFixedImpls) { int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -1672,8 +2432,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 2); @@ -1689,7 +2449,7 @@ ASSERT_OK(Put("foo4", "bar4")); ASSERT_OK(Put("foq5", "bar5")); ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is cappped:3 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -1706,14 +2466,14 @@ } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); // third SST with fixed:2 BF ASSERT_OK(Put("foo6", "bar6")); ASSERT_OK(Put("foo7", "bar7")); ASSERT_OK(Put("foq8", "bar8")); 
ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is fixed:2 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -1754,8 +2514,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); { std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 6); @@ -1795,9 +2555,8 @@ // create a new CF and set prefix_extractor dynamically options.prefix_extractor.reset(NewCappedPrefixTransform(3)); CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); ASSERT_OK(Put(2, "foo3", "bar3")); ASSERT_OK(Put(2, "foo4", "bar4")); ASSERT_OK(Put(2, "foo5", "bar5")); @@ -1813,9 +2572,8 @@ } ASSERT_OK( dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); { std::unique_ptr iter( db_->NewIterator(read_options, handles_[2])); @@ -1824,10 +2582,10 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); - dbfull()->DestroyColumnFamilyHandle(handles_[2]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2])); handles_[2] = nullptr; ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + 
ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; iteration++; } @@ -1838,6 +2596,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { for (auto bfp_impl : BFP::kAllFixedImpls) { Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -1879,8 +2638,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); { std::unique_ptr iter(db_->NewIterator(read_options)); // "fp*" should be skipped @@ -1899,6 +2658,55 @@ } } +TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { + Options options = CurrentOptions(); + constexpr size_t kNumKeys = 10000; + static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000"); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeys + 10)); + options.create_if_missing = true; + constexpr size_t kPrefixLength = 4; + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(50)); + bbto.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + bbto.block_size = 128; + bbto.metadata_block_size = 128; + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const std::string value(64, '\0'); + + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t i = 0; i < kNumKeys; ++i) { + std::ostringstream oss; + oss << std::setfill('0') << 
std::setw(4) << std::fixed << i; + ASSERT_OK(db_->Put(write_opts, oss.str(), value)); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + // Use legacy, implicit prefix seek + read_opts.total_order_seek = false; + read_opts.auto_prefix_mode = false; + std::unique_ptr it(db_->NewIterator(read_opts)); + for (size_t i = 0; i < kNumKeys; ++i) { + // Seek with a key after each one added but with same prefix. One will + // surely cross a partition boundary. + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a"; + it->SeekForPrev(oss.str()); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + } + it.reset(); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -21,7 +21,8 @@ class DBTestCompactionFilter : public DBTestBase { public: - DBTestCompactionFilter() : DBTestBase("/db_compaction_filter_test") {} + DBTestCompactionFilter() + : DBTestBase("db_compaction_filter_test", /*env_do_fsync=*/true) {} }; // Param variant of DBTestBase::ChangeCompactOptions @@ -41,11 +42,11 @@ option_config_ == kUniversalSubcompactions) { assert(options.max_subcompactions > 1); } - TryReopen(options); + Reopen(options); } }; -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) INSTANTIATE_TEST_CASE_P( CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam, ::testing::Values(DBTestBase::OptionConfig::kDefault, @@ -54,11 +55,11 @@ DBTestBase::OptionConfig::kLevelSubcompactions, DBTestBase::OptionConfig::kUniversalSubcompactions)); #else -// Run fewer cases in valgrind +// Run fewer cases in 
non-full valgrind to save time. INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam, ::testing::Values(DBTestBase::OptionConfig::kDefault)); -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) class KeepFilter : public CompactionFilter { public: @@ -81,6 +82,11 @@ return true; } + bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const override { + return true; + } + const char* Name() const override { return "DeleteFilter"; } }; @@ -126,22 +132,6 @@ const char* Name() const override { return "DeleteFilter"; } }; -class DelayFilter : public CompactionFilter { - public: - explicit DelayFilter(DBTestBase* d) : db_test(d) {} - bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, - std::string* /*new_value*/, - bool* /*value_changed*/) const override { - db_test->env_->addon_time_.fetch_add(1000); - return true; - } - - const char* Name() const override { return "DelayFilter"; } - - private: - DBTestBase* db_test; -}; - class ConditionalFilter : public CompactionFilter { public: explicit ConditionalFilter(const std::string* filtered_value) @@ -205,18 +195,36 @@ bool compaction_filter_created_; }; +// This filter factory is configured with a `TableFileCreationReason`. Only +// table files created for that reason will undergo filtering. This +// configurability makes it useful to tests for filtering non-compaction table +// files, such as "CompactionFilterFlush" and "CompactionFilterRecovery". 
class DeleteFilterFactory : public CompactionFilterFactory { public: + explicit DeleteFilterFactory(TableFileCreationReason reason) + : reason_(reason) {} + std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override { - if (context.is_manual_compaction) { - return std::unique_ptr(new DeleteFilter()); - } else { + EXPECT_EQ(reason_, context.reason); + if (context.reason == TableFileCreationReason::kCompaction && + !context.is_manual_compaction) { + // Table files created by automatic compaction do not undergo filtering. + // Presumably some tests rely on this. return std::unique_ptr(nullptr); } + return std::unique_ptr(new DeleteFilter()); + } + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + return reason_ == reason; } const char* Name() const override { return "DeleteFilterFactory"; } + + private: + const TableFileCreationReason reason_; }; // Delete Filter Factory which ignores snapshots @@ -248,20 +256,6 @@ const char* Name() const override { return "SkipEvenFilterFactory"; } }; -class DelayFilterFactory : public CompactionFilterFactory { - public: - explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& /*context*/) override { - return std::unique_ptr(new DelayFilter(db_test)); - } - - const char* Name() const override { return "DelayFilterFactory"; } - - private: - DBTestBase* db_test; -}; - class ConditionalFilterFactory : public CompactionFilterFactory { public: explicit ConditionalFilterFactory(const Slice& filtered_value) @@ -305,7 +299,7 @@ for (int i = 0; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } ASSERT_OK(Flush(1)); @@ -313,10 +307,10 @@ // the compaction is each level invokes the filter for // all the keys in that level. 
cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -336,19 +330,21 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); + ReadOptions read_options; ScopedArenaIterator iter(dbfull()->NewInternalIterator( - &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); + read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); total++; if (ikey.sequence != 0) { count++; } iter->Next(); } + ASSERT_OK(iter->status()); } ASSERT_EQ(total, 100000); ASSERT_EQ(count, 0); @@ -365,10 +361,10 @@ // means that all keys should pass at least once // via the compaction filter cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); @@ -376,7 +372,8 @@ // create a new database with the compaction // filter in such a way that it deletes all keys - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + 
TableFileCreationReason::kCompaction); options.create_if_missing = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -397,10 +394,10 @@ // verify that at the end of the compaction process, // nothing is left. cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 0); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); @@ -415,6 +412,7 @@ count++; iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 0); } @@ -426,13 +424,14 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); + ReadOptions read_options; ScopedArenaIterator iter(dbfull()->NewInternalIterator( - &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); + read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_NE(ikey.sequence, (unsigned)0); count++; iter->Next(); @@ -446,7 +445,8 @@ // entries in VersionEdit, but none of the 'AddFile's. 
TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { Options options = CurrentOptions(); - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + TableFileCreationReason::kCompaction); options.disable_auto_compactions = true; options.create_if_missing = true; DestroyAndReopen(options); @@ -454,9 +454,9 @@ // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); } // this will produce empty file (delete compaction filter) @@ -467,6 +467,7 @@ Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); + ASSERT_OK(itr->status()); // empty db ASSERT_TRUE(!itr->Valid()); @@ -474,6 +475,64 @@ } #endif // ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilterFlush) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by flush. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kFlush); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + +TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by recovery. 
+ Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kRecovery); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + TEST_P(DBTestCompactionFilterWithCompactParam, CompactionFilterWithValueChange) { Options options = CurrentOptions(); @@ -490,25 +549,25 @@ for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // re-write all data again for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels. 
This should @@ -516,11 +575,11 @@ ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // verify that all keys now have the new value that @@ -558,7 +617,7 @@ ASSERT_OK(Flush()); std::string newvalue = Get("foo"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foo"); ASSERT_EQ(newvalue, three); @@ -566,12 +625,12 @@ // merge keys. 
ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ("NOT_FOUND", newvalue); ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ(two, two); @@ -582,7 +641,7 @@ ASSERT_OK(Flush()); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); @@ -595,7 +654,7 @@ ASSERT_OK(Flush()); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); } @@ -617,21 +676,21 @@ for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(key, value); + ASSERT_OK(Put(key, value)); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Make sure next file is much smaller so automatic compaction will not // be triggered. 
num_keys_per_file /= 2; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Force a manual compaction cfilter_count = 0; filter->expect_manual_compaction_.store(true); filter->expect_full_compaction_.store(true); filter->expect_cf_id_.store(0); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(cfilter_count, 700); ASSERT_EQ(NumSortedRuns(0), 1); ASSERT_TRUE(filter->compaction_filter_created()); @@ -644,13 +703,14 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* snapshots */); + ReadOptions read_options; ScopedArenaIterator iter(dbfull()->NewInternalIterator( - &arena, &range_del_agg, kMaxSequenceNumber)); + read_options, &arena, &range_del_agg, kMaxSequenceNumber)); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); total++; if (ikey.sequence != 0) { count++; @@ -680,14 +740,14 @@ for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } - Flush(1); + ASSERT_OK(Flush(1)); // Make sure next file is much smaller so automatic compaction will not // be triggered. 
num_keys_per_file /= 2; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(filter->compaction_filter_created()); } @@ -706,9 +766,9 @@ const Snapshot* snapshot = nullptr; for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); if (table == 0) { snapshot = db_->GetSnapshot(); @@ -728,6 +788,7 @@ read_options.snapshot = snapshot; std::unique_ptr iter(db_->NewIterator(read_options)); iter->SeekToFirst(); + ASSERT_OK(iter->status()); int count = 0; while (iter->Valid()) { count++; @@ -736,6 +797,7 @@ ASSERT_EQ(count, 6); read_options.snapshot = nullptr; std::unique_ptr iter1(db_->NewIterator(read_options)); + ASSERT_OK(iter1->status()); iter1->SeekToFirst(); count = 0; while (iter1->Valid()) { @@ -766,9 +828,9 @@ for (int i = table * 6; i < 39 + table * 11; ++i) { char key[100]; snprintf(key, sizeof(key), "%010d", table * 100 + i); - Put(key, std::to_string(table * 1000 + i)); + ASSERT_OK(Put(key, std::to_string(table * 1000 + i))); } - Flush(); + ASSERT_OK(Flush()); } cfilter_skips = 0; @@ -807,10 +869,10 @@ options.create_if_missing = true; DestroyAndReopen(options); - Put("0000000010", "v10"); - Put("0000000020", "v20"); // skipped - Put("0000000050", "v50"); - Flush(); + ASSERT_OK(Put("0000000010", "v10")); + ASSERT_OK(Put("0000000020", "v20")); // skipped + ASSERT_OK(Put("0000000050", "v50")); + ASSERT_OK(Flush()); cfilter_skips = 0; EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -848,13 +910,13 @@ options.compaction_filter = new TestNotSupportedFilter(); DestroyAndReopen(options); - Put("a", "v10"); - Put("z", "v20"); - Flush(); - - Put("a", "v10"); - Put("z", "v20"); - Flush(); + ASSERT_OK(Put("a", "v10")); + ASSERT_OK(Put("z", "v20")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("a", "v10")); + ASSERT_OK(Put("z", "v20")); + ASSERT_OK(Flush()); // 
Comapction should fail because IgnoreSnapshots() = false EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr) @@ -863,6 +925,49 @@ delete options.compaction_filter; } +class TestNotSupportedFilterFactory : public CompactionFilterFactory { + public: + explicit TestNotSupportedFilterFactory(TableFileCreationReason reason) + : reason_(reason) {} + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + return reason_ == reason; + } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /* context */) override { + return std::unique_ptr(new TestNotSupportedFilter()); + } + + const char* Name() const override { return "TestNotSupportedFilterFactory"; } + + private: + const TableFileCreationReason reason_; +}; + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) { + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kFlush); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(Flush().IsNotSupported()); +} + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) { + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kRecovery); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,16 +7,23 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include + +#include "db/blob/blob_index.h" #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" #include "rocksdb/sst_file_writer.h" #include "rocksdb/utilities/convenience.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" +#include "test_util/testutil.h" #include "util/concurrent_task_limiter_impl.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -25,14 +32,16 @@ class DBCompactionTest : public DBTestBase { public: - DBCompactionTest() : DBTestBase("/db_compaction_test") {} + DBCompactionTest() + : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {} }; class DBCompactionTestWithParam : public DBTestBase, public testing::WithParamInterface> { public: - DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") { + DBCompactionTestWithParam() + : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { max_subcompactions_ = std::get<0>(GetParam()); exclusive_manual_compaction_ = std::get<1>(GetParam()); } @@ -45,12 +54,34 @@ bool exclusive_manual_compaction_; }; +class DBCompactionTestWithBottommostParam + : public DBTestBase, + public testing::WithParamInterface { + public: + DBCompactionTestWithBottommostParam() + : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { + bottommost_level_compaction_ = GetParam(); + } + + BottommostLevelCompaction bottommost_level_compaction_; +}; + class DBCompactionDirectIOTest : public DBCompactionTest, public ::testing::WithParamInterface { public: DBCompactionDirectIOTest() : DBCompactionTest() {} }; +// Param = true : target level is non-empty +// Param = false: level between target level and source level +// is not empty. 
+class ChangeLevelConflictsWithAuto + : public DBCompactionTest, + public ::testing::WithParamInterface { + public: + ChangeLevelConflictsWithAuto() : DBCompactionTest() {} +}; + namespace { class FlushedFileCollector : public EventListener { @@ -151,27 +182,28 @@ options.target_file_size_base * options.target_file_size_multiplier; options.max_bytes_for_level_multiplier = 2; options.disable_auto_compactions = false; + options.compaction_options_universal.max_size_amplification_percent = 100; return options; } bool HaveOverlappingKeyRanges( const Comparator* c, const SstFileMetaData& a, const SstFileMetaData& b) { - if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { - if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { // b.smallestkey <= a.smallestkey <= b.largestkey return true; } - } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { // a.smallestkey < b.smallestkey <= a.largestkey return true; } - if (c->Compare(a.largestkey, b.largestkey) <= 0) { - if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) { + if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { // b.smallestkey <= a.largestkey <= b.largestkey return true; } - } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { // a.smallestkey <= b.largestkey < a.largestkey return true; } @@ -226,7 +258,7 @@ const CompactionStatsCollector& collector) { #ifndef NDEBUG InternalStats* internal_stats_ptr = cfd.internal_stats(); - ASSERT_TRUE(internal_stats_ptr != nullptr); + ASSERT_NE(internal_stats_ptr, nullptr); const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); const int num_of_reasons = 
static_cast(CompactionReason::kNumOfReasons); @@ -270,7 +302,7 @@ } } // anonymous namespace -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) // All the TEST_P tests run once with sub_compactions disabled (i.e. // options.max_subcompactions = 1) and once with it enabled TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { @@ -295,25 +327,47 @@ const int kTestSize = kCDTKeysPerBuffer * 1024; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); - // must have much smaller db size. - ASSERT_GT(db_size[0] / 3, db_size[1]); + if (options.compaction_style == kCompactionStyleUniversal) { + // Claim: in universal compaction none of the original data will remain + // once compactions settle. + // + // Proof: The compensated size of the file containing the most tombstones + // is enough on its own to trigger size amp compaction. Size amp + // compaction is a full compaction, so all tombstones meet the obsolete + // keys they cover. + ASSERT_EQ(0, db_size[1]); + } else { + // Claim: in level compaction at most `db_size[0] / 2` of the original + // data will remain once compactions settle. + // + // Proof: Assume the original data is all in the bottom level. 
If it were + // not, it would meet its tombstone sooner. The original data size is + // large enough to require fanout to bottom level to be greater than + // `max_bytes_for_level_multiplier == 2`. In the level just above, + // tombstones must cover less than `db_size[0] / 4` bytes since fanout >= + // 2 and file size is compensated by doubling the size of values we expect + // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in + // levels above must cover less than `db_size[0] / 8` bytes of original + // data, `db_size[0] / 16`, and so on. + ASSERT_GT(db_size[0] / 2, db_size[1]); + } } } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { // For each options type we test following @@ -343,7 +397,7 @@ const int kTestSize = kCDTKeysPerBuffer; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } @@ -357,8 +411,9 @@ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_TRUE( + dbfull()->CompactRange(cro, nullptr, nullptr).IsInvalidArgument()); // check that normal user iterator doesn't see anything Iterator* db_iter = dbfull()->NewIterator(ReadOptions()); @@ -366,6 +421,7 @@ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; } + ASSERT_OK(db_iter->status()); ASSERT_EQ(i, 0); delete db_iter; @@ -373,6 +429,7 @@ ReadOptions ro; ro.iter_start_seqnum=1; db_iter = dbfull()->NewIterator(ro); + ASSERT_OK(db_iter->status()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -382,9 +439,10 @@ // now all deletes should be gone SetPreserveDeletesSequenceNumber(100000000); 
- dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); db_iter = dbfull()->NewIterator(ro); + ASSERT_TRUE(db_iter->status().IsInvalidArgument()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -408,7 +466,7 @@ const int kTestSize = kCDTKeysPerBuffer * 512; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } @@ -446,6 +504,10 @@ options.new_table_reader_for_compaction_inputs = true; options.max_open_files = 20; options.level0_file_num_compaction_trigger = 3; + // Avoid many shards with small max_open_files, where as little as + // two table insertions could lead to an LRU eviction, depending on + // hash values. + options.table_cache_numshardbits = 2; DestroyAndReopen(options); Random rnd(301); @@ -470,8 +532,8 @@ ASSERT_OK(Put(Key(10 - k), "bar")); if (k < options.level0_file_num_compaction_trigger - 1) { num_table_cache_lookup = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // preloading iterator issues one table cache lookup and create // a new table reader, if not preloaded. int old_num_table_cache_lookup = num_table_cache_lookup; @@ -489,8 +551,8 @@ num_table_cache_lookup = 0; num_new_table_reader = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Preloading iterator issues one table cache lookup and creates // a new table reader. One file is created for flush and one for compaction. 
// Compaction inputs make no table cache look-up for data/range deletion @@ -517,7 +579,7 @@ cro.change_level = true; cro.target_level = 2; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // Only verifying compaction outputs issues one table cache lookup // for both data block and range deletion block). // May preload table cache too. @@ -555,12 +617,12 @@ const int kTestSize = kCDTKeysPerBuffer * 512; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -571,11 +633,10 @@ for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. 
@@ -585,14 +646,86 @@ for (int k = 0; k < kTestSize / 10; ++k) { ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); // this time we're expecting significant drop in size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. In addition to that, this + // test inserts `db_size[0] / 10` to push the tombstones into SST files and + // then through automatic compactions. So in total `3 * db_size[0] / 5` of + // the original data may remain. + ASSERT_GT(3 * db_size[0] / 5, db_size[2]); } } +TEST_F(DBCompactionTest, CompactRangeBottomPri) { + ASSERT_OK(Put(Key(50), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(100), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(200), "")); + ASSERT_OK(Flush()); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,3", FilesPerLevel(0)); + + ASSERT_OK(Put(Key(1), "")); + ASSERT_OK(Put(Key(199), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(2), "")); + ASSERT_OK(Put(Key(199), "")); + ASSERT_OK(Flush()); + ASSERT_EQ("2,0,3", FilesPerLevel(0)); + + // Now we have 2 L0 files, and 3 L2 files, and a manual compaction will + // be triggered. + // Two compaction jobs will run. One compacts 2 L0 files in Low Pri Pool + // and one compact to L2 in bottom pri pool. + int low_pri_count = 0; + int bottom_pri_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) { + Env::Priority* pri = reinterpret_cast(arg); + // First time is low pri pool in the test case. 
+ if (low_pri_count == 0 && bottom_pri_count == 0) { + ASSERT_EQ(Env::Priority::LOW, *pri); + } + if (*pri == Env::Priority::LOW) { + low_pri_count++; + } else { + bottom_pri_count++; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(1, low_pri_count); + ASSERT_EQ(1, bottom_pri_count); + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // Recompact bottom most level uses bottom pool + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ(1, low_pri_count); + ASSERT_EQ(2, bottom_pri_count); + + env_->SetBackgroundThreads(0, Env::Priority::BOTTOM); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + // Low pri pool is used if bottom pool has size 0. + ASSERT_EQ(2, low_pri_count); + ASSERT_EQ(2, bottom_pri_count); + + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { uint64_t db_size[3]; for (int test = 0; test < 2; ++test) { @@ -607,12 +740,19 @@ const int kTestSize = kCDTKeysPerBuffer * 512; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // L1 and L2 can fit deletions iff size compensation does not take effect, + // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining + // files at or above L2 down to L3 to ensure obsolete data does not + // accidentally meet its tombstone above L3. 
This makes the final size more + // deterministic and easy to see whether size compensation for deletions + // took effect. + MoveFilesToLevel(3 /* level */); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -625,27 +765,33 @@ for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. options.disable_auto_compactions = false; Reopen(options); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); if (options.skip_stats_update_on_db_open) { // If update stats on DB::Open is disable, we don't expect // deletion entries taking effect. - ASSERT_LT(db_size[0] / 3, db_size[2]); + // + // The deletions are small enough to fit in L1 and L2, and obsolete keys + // were moved to L3+, so none of the original data should have been + // dropped. + ASSERT_LE(db_size[0], db_size[2]); } else { // Otherwise, we should see a significant drop in db size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. 
+ ASSERT_GT(db_size[0] / 2, db_size[2]); } } } @@ -660,7 +806,8 @@ options.num_levels = 3; options.level0_file_num_compaction_trigger = 3; options.max_subcompactions = max_subcompactions_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -670,24 +817,24 @@ std::vector values; // Write 100KB (100 values, each 1K) for (int i = 0; i < kNumKeysPerFile; i++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(1, Key(i), values[i])); } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } // generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < kNumKeysPerFile; i++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(1, Key(i), values[i])); } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); @@ -707,7 +854,8 @@ options.level0_slowdown_writes_trigger = 20; options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large options.max_background_compactions = 3; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); // Block all threads in thread pool. 
const size_t kTotalTasks = 4; @@ -729,7 +877,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -746,7 +894,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(2, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 2)); } @@ -757,7 +905,7 @@ sleeping_tasks[i].WakeUp(); sleeping_tasks[i].WaitUntilDone(); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify number of compactions allowed will come back to 1. @@ -774,7 +922,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -801,14 +949,14 @@ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); + values.push_back(rnd.RandomString(100000)); ASSERT_OK(Put(1, Key(i), values[i])); } // Reopening moves updates to level-0 ReopenWithColumnFamilies({"default", "pikachu"}, options); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); @@ -852,27 +1000,27 @@ DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - Delete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + 
ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } @@ -885,31 +1033,85 @@ DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - SingleDelete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(SingleDelete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } +TEST_F(DBCompactionTest, CompactionSstPartitioner) { + Options options = CurrentOptions(); + 
options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(4)); + options.sst_partitioner_factory = factory; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("aaaa1", "A2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // move both files down to l1 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A2", Get("aaaa1")); + ASSERT_EQ("B", Get("bbbb1")); +} + +TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 1; + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(4)); + options.sst_partitioner_factory = factory; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A", Get("aaaa1")); + ASSERT_EQ("B", Get("bbbb1")); +} + TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleLevel; @@ -931,22 +1133,23 @@ // create first file and flush to l0 for (auto& key : {"1", "2", "3", "3", "3", "3"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + 
ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // create second file and flush to l0 for (auto& key : {"3", "4", "5", "6", "7", "8"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // move both files down to l1 - dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1); + ASSERT_OK( + dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1)); // release snap so that first instance of key(3) can have seqId=0 for (auto snap : snaps) { @@ -955,12 +1158,12 @@ // create 3 files in l0 so to trigger compaction for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { - Put("2", std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(Put("", "")); } @@ -975,12 +1178,12 @@ for (int i = 0; i < 2; ++i) { for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) { // make l0 files' ranges overlap to avoid trivial move - Put(std::to_string(2 * i), std::string(1, 'A')); - Put(std::to_string(2 * i + 1), std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A'))); + ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1); } @@ -996,7 +1199,7 @@ // note CompactionOptions::output_file_size_limit is unset. 
CompactionOptions compact_opt; compact_opt.compression = kNoCompression; - dbfull()->CompactFiles(compact_opt, input_filenames, 1); + ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1)); } // Check that writes done during a memtable compaction are recovered @@ -1039,7 +1242,7 @@ Random rnd(301); std::vector values; for (int i = 0; i < num_keys; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(Key(i), values[i])); } @@ -1057,7 +1260,7 @@ cro.exclusive_manual_compaction = exclusive_manual_compaction_; // Compaction will initiate a trivial move from L0 to L1 - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); // File moved From L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0 @@ -1111,7 +1314,7 @@ std::map values; for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } ASSERT_OK(Flush()); @@ -1126,7 +1329,7 @@ // Since data is non-overlapping we expect compaction to initiate // a trivial move - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // We expect that all the files were trivially moved from L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files); @@ -1157,13 +1360,13 @@ }; for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } ASSERT_OK(Flush()); } - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= 
ranges[i].second; j++) { @@ -1202,14 +1405,14 @@ // file 1 [0 => 300] for (int32_t i = 0; i <= 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [600 => 700] for (int32_t i = 600; i <= 700; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1283,14 +1486,14 @@ // file 1 [0 => 100] for (int32_t i = 0; i < 100; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [100 => 300] for (int32_t i = 100; i < 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1311,7 +1514,7 @@ // file 3 [ 0 => 200] for (int32_t i = 0; i < 200; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1343,21 +1546,21 @@ TEST_SYNC_POINT("DBCompaction::ManualPartial:1"); // file 4 [300 => 400) for (int32_t i = 300; i <= 400; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 5 [400 => 500) for (int32_t i = 400; i <= 500; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 6 [500 => 600) for (int32_t i = 500; i <= 600; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } // Second non-trivial compaction is triggered @@ -1367,8 +1570,8 @@ ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0)); TEST_SYNC_POINT("DBCompaction::ManualPartial:5"); - 
dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // After two non-trivial compactions are installed, there is 1 file in L6, and // 1 file in L1 ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0)); @@ -1425,14 +1628,14 @@ // file 1 [0 => 100] for (int32_t i = 0; i < 100; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [100 => 300] for (int32_t i = 100; i < 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1451,7 +1654,7 @@ // file 3 [ 0 => 200] for (int32_t i = 0; i < 200; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1481,9 +1684,9 @@ for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } } @@ -1497,8 +1700,8 @@ } TEST_SYNC_POINT("DBCompaction::PartialFill:2"); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); threads.join(); for (int32_t i = 0; i < 4300; i++) { @@ -1516,12 +1719,12 @@ Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Flush()); - Put("bar", "v1"); + ASSERT_OK(Put("bar", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("foo", "v2"); }); + port::Thread writer([&]() { 
ASSERT_OK(Put("foo", "v2")); }); TEST_SYNC_POINT( "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"); @@ -1554,14 +1757,14 @@ // file 1 [0 => 100] for (int32_t i = 0; i < 100; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [100 => 300] for (int32_t i = 100; i < 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1577,7 +1780,7 @@ // file 3 [ 0 => 200] for (int32_t i = 0; i < 200; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1587,15 +1790,15 @@ for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } } ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify level sizes uint64_t target_size = 4 * options.max_bytes_for_level_base; @@ -1605,7 +1808,7 @@ options.max_bytes_for_level_multiplier); } - size_t old_num_files = CountFiles(); + const size_t old_num_files = CountFiles(); std::string begin_string = Key(1000); std::string end_string = Key(2000); Slice begin(begin_string); @@ -1640,7 +1843,7 @@ compact_options.change_level = true; compact_options.target_level = 1; ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK( DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); @@ -1649,12 +1852,11 @@ for (int32_t i = 0; 
i < 4300; i++) { ReadOptions roptions; std::string result; - Status s = db_->Get(roptions, Key(i), &result); - ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound()); deleted_count2++; } ASSERT_GT(deleted_count2, deleted_count); - size_t new_num_files = CountFiles(); + const size_t new_num_files = CountFiles(); ASSERT_GT(old_num_files, new_num_files); } @@ -1676,7 +1878,7 @@ for (auto i = 0; i < 10; i++) { for (auto j = 0; j < 100; j++) { auto k = i * 100 + j; - values[k] = RandomString(&rnd, value_size); + values[k] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(k), values[k])); } ASSERT_OK(Flush()); @@ -1808,15 +2010,15 @@ // would cause `1 -> vals[0]` (an older key) to reappear. std::string vals[kNumL0Files]; for (int i = 0; i < kNumL0Files; ++i) { - vals[i] = RandomString(&rnd, kValSize); - Put(Key(i), vals[i]); - Put(Key(i + 1), vals[i]); - Flush(); + vals[i] = rnd.RandomString(kValSize); + ASSERT_OK(Put(Key(i), vals[i])); + ASSERT_OK(Put(Key(i + 1), vals[i])); + ASSERT_OK(Flush()); if (i == 0) { snapshot = db_->GetSnapshot(); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify `DeleteFilesInRange` can't drop only file 0 which would cause // "1 -> vals[0]" to reappear. 
@@ -1850,7 +2052,7 @@ std::vector values; // File with keys [ 0 => 99 ] for (int i = 0; i < 100; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1868,7 +2070,7 @@ // File with keys [ 100 => 199 ] for (int i = 100; i < 200; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1895,7 +2097,7 @@ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -1903,16 +2105,8 @@ options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. 
- for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2012,7 +2206,7 @@ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -2020,16 +2214,8 @@ options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2130,7 +2316,7 @@ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -2149,7 +2335,7 @@ option_vector.emplace_back(DBOptions(options), cf_opt1); CreateColumnFamilies({"one"},option_vector[1]); - // Configura CF2 specific paths. + // Configure CF2 specific paths. 
cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024); cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024); cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024); @@ -2204,13 +2390,16 @@ // Check that default column family uses db_paths. // And Column family "one" uses cf_paths. - // First three 110KB files are not going to second path. - // After that, (100K, 200K) + // The compaction in level0 outputs the sst files in level1. + // The first path cannot hold level1's data(400KB+400KB > 500KB), + // so every compaction move a sst file to second path. Please + // refer to LevelCompactionBuilder::GetPathId. for (int num = 0; num < 3; num++) { generate_file(); } + check_sstfilecount(0, 1); + check_sstfilecount(1, 2); - // Another 110KB triggers a compaction to 400K file to fill up first path generate_file(); check_sstfilecount(1, 3); @@ -2263,10 +2452,10 @@ for (int i = 0; i <= max_key_level_insert; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(TotalTableFiles(1, 4), 1); int non_level0_num_files = 0; @@ -2302,7 +2491,8 @@ compact_options.bottommost_level_compaction = BottommostLevelCompaction::kForce; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); // Only 1 file in L0 ASSERT_EQ("1", FilesPerLevel(1)); @@ -2321,11 +2511,11 @@ ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } - dbfull()->Flush(FlushOptions()); + 
ASSERT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 1; i < options.num_levels; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -2335,6 +2525,7 @@ // compaction style std::string keys_in_db; Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { keys_in_db.append(iter->key().ToString()); keys_in_db.push_back(','); @@ -2372,24 +2563,24 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "e"); - Put(1, "", ""); + ASSERT_OK(Delete(1, "e")); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "c", "cv"); + ASSERT_OK(Put(1, "c", "cv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "d", "dv"); + ASSERT_OK(Put(1, "d", "dv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "d"); - Delete(1, "b"); + ASSERT_OK(Delete(1, "d")); + ASSERT_OK(Delete(1, "b")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(->)(c->cv)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish @@ -2406,34 +2597,35 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put(1, "foo", ""); - Put(1, "bar", ""); 
- Flush(1); - Put(1, "foo", ""); - Put(1, "bar", ""); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); // Generate four files in CF 0, which should trigger an auto compaction - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); // The auto compaction is scheduled but waited until here TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); // The auto compaction will wait until the manual compaction is registerd // before processing so that it will be cancelled. - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); ASSERT_EQ("0,1", FilesPerLevel(1)); // Eventually the cancelled compaction will be rescheduled and executed. 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -2459,7 +2651,7 @@ ASSERT_EQ("1,1,1", FilesPerLevel(1)); // Compaction range overlaps files - Compact(1, "p1", "p9"); + Compact(1, "p", "q"); ASSERT_EQ("0,0,1", FilesPerLevel(1)); // Populate a different range @@ -2478,7 +2670,7 @@ options.statistics->getTickerCount(BLOCK_CACHE_ADD); CompactRangeOptions cro; cro.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(cro, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); // Verify manual compaction doesn't fill block cache ASSERT_EQ(prev_block_cache_add, options.statistics->getTickerCount(BLOCK_CACHE_ADD)); @@ -2526,7 +2718,7 @@ ASSERT_EQ("3", FilesPerLevel(1)); // Compaction range overlaps files - Compact(1, "p1", "p9", 1); + Compact(1, "p", "q", 1); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -2559,7 +2751,8 @@ CompactRangeOptions compact_options; compact_options.target_path_id = 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); @@ -2616,10 +2809,10 @@ Random rnd(301); for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(1, ToString(key), rnd.RandomString(kTestValueSize))); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyMetaData cf_meta; 
dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); @@ -2692,13 +2885,13 @@ std::vector keys; std::vector values; for (int k = 0; k < kNumInsertedKeys; ++k) { - keys.emplace_back(RandomString(&rnd, kKeySize)); - values.emplace_back(RandomString(&rnd, kKvSize - kKeySize)); + keys.emplace_back(rnd.RandomString(kKeySize)); + values.emplace_back(rnd.RandomString(kKvSize - kKeySize)); ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // Make sure the number of L0 files can trigger compaction. ASSERT_GE(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); @@ -2759,12 +2952,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute L0->L1 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); // block compactions @@ -2781,7 +2974,7 @@ sleeping_task.WaitUntilDone(); // this should execute L1->L2 (move) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); @@ -2794,12 +2987,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->L2 (merge with previous file) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,2", FilesPerLevel(0)); @@ -2807,6 +3000,7 @@ ASSERT_OK(env_->FileExists(dbname_ + moved_file_name)); listener->SetExpectedFileName(dbname_ + 
moved_file_name); + ASSERT_OK(iterator->status()); iterator.reset(); // this file should have been compacted away @@ -2821,7 +3015,7 @@ } Options options = CurrentOptions(); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -2969,7 +3163,7 @@ for (int num = 0; num < 10; num++) { GenerateNewRandomFile(&rnd); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"CompactionJob::Run():Start", @@ -2990,7 +3184,7 @@ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"); GenerateNewRandomFile(&rnd, /* nowait */ true); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; num++) { @@ -3000,7 +3194,7 @@ TEST_SYNC_POINT( "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } static std::string ShortKey(int i) { @@ -3052,7 +3246,7 @@ std::vector values; // File with keys [ 0 => 99 ] for (int i = 0; i < 100; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -3069,7 +3263,7 @@ // File with keys [ 100 => 199 ] for (int i = 100; i < 200; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -3087,7 +3281,7 @@ // File with keys [ 200 => 299 ] for (int i = 200; i < 300; i++) 
{ - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -3118,14 +3312,28 @@ options.level0_file_num_compaction_trigger = 5; options.max_background_compactions = 2; options.max_subcompactions = max_subcompactions_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.write_buffer_size = 2 << 20; // 2MB + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(64 << 20); // 64MB + table_options.cache_index_and_filter_blocks = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); const size_t kValueSize = 1 << 20; Random rnd(301); - std::string value(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + // The L0->L1 must be picked before we begin flushing files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompactionBySize:0", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::IntraL0Compaction:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -3143,13 +3351,14 @@ for (int i = 0; i < 10; ++i) { ASSERT_OK(Put(Key(0), "")); // prevents trivial move if (i == 5) { + TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready"); ASSERT_OK(Put(Key(i + 1), value + value)); } else { ASSERT_OK(Put(Key(i + 1), value)); } ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3162,6 +3371,16 @@ for (int i = 0; i < 2; ++i) { ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21); } + + // The index/filter in the file produced by intra-L0 should not be pinned. + // That means clearing unref'd entries in block cache and re-accessing the + // file produced by intra-L0 should bump the index block miss count. + uint64_t prev_index_misses = + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + table_options.block_cache->EraseUnRefEntries(); + ASSERT_EQ("", Get(Key(0))); + ASSERT_EQ(prev_index_misses + 1, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); } TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { @@ -3176,10 +3395,16 @@ const size_t kValueSize = 1 << 20; Random rnd(301); - std::string value(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + // The L0->L1 must be picked before we begin flushing files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompactionBySize:0", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:" + "L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -3203,10 +3428,15 @@ } else { ASSERT_OK(Delete(Key(0))); } + if (i == 5) { + TEST_SYNC_POINT( + "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:" + "L0ToL1Ready"); + } ASSERT_OK(Put(Key(i + 1), value)); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3254,7 +3484,7 @@ int key_idx = 0; GenerateNewFile(&rnd, &key_idx); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, num_bottom_pri_compactions); @@ -3278,8 +3508,8 @@ // So key 0, 2, and 4+ fall outside these levels' key-ranges. for (int level = 2; level >= 1; --level) { for (int i = 0; i < 2; ++i) { - Put(Key(2 * i + 1), "val"); - Flush(); + ASSERT_OK(Put(Key(2 * i + 1), "val")); + ASSERT_OK(Flush()); } MoveFilesToLevel(level); ASSERT_EQ(2, NumTableFilesAtLevel(level)); @@ -3289,11 +3519,11 @@ // - Tombstones for keys 2 and 4 can be dropped early. // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges. 
for (int i = 0; i < kNumL0Files; ++i) { - Put(Key(0), "val"); // sentinel to prevent trivial move - Delete(Key(i + 1)); - Flush(); + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Delete(Key(i + 1))); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNumL0Files; ++i) { std::string value; @@ -3357,10 +3587,10 @@ TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { // Regression test for bug of not pulling in L0 files that overlap the user- // specified input files in time- and key-ranges. - Put(Key(0), "old_val"); - Flush(); - Put(Key(0), "new_val"); - Flush(); + ASSERT_OK(Put(Key(0), "old_val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(0), "new_val")); + ASSERT_OK(Flush()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); @@ -3376,6 +3606,41 @@ ASSERT_EQ("new_val", Get(Key(0))); } +TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + const Snapshot* snapshot = nullptr; + const int kMaxKey = 10; + + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + ASSERT_OK(Delete(Key(i))); + if (!snapshot) { + snapshot = db_->GetSnapshot(); + } + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey))); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // test DeleteFilesInRange() deletes the files already picked for compaction + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply:WriteManifestStart", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Finish", + "VersionSet::LogAndApply:WriteManifestDone"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // release snapshot which mark bottommost file for compaction + db_->ReleaseSnapshot(snapshot); + std::string begin_string = Key(0); + std::string end_string = Key(kMaxKey + 1); + Slice 
begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { // bottom-level files may contain deletions due to snapshots protecting the // deleted keys. Once the snapshot is released, we should see files with many @@ -3395,7 +3660,7 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } if (i == kNumLevelFiles - 1) { snapshot = db_->GetSnapshot(); @@ -3406,12 +3671,12 @@ ASSERT_OK(Delete(Key(j))); } } - Flush(); + ASSERT_OK(Flush()); if (i < kNumLevelFiles - 1) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); std::vector pre_release_metadata, post_release_metadata; @@ -3432,7 +3697,7 @@ CompactionReason::kBottommostFiles); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetLiveFilesMetaData(&post_release_metadata); ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); @@ -3448,6 +3713,76 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) { + // bottom-level files may contain deletions due to snapshots protecting the + // deleted keys. Once the snapshot is released, we should see files with many + // such deletions undergo single-file compactions. But when disabling auto + // compactions, it shouldn't be triggered which may causing too many + // background jobs. 
+ const int kNumKeysPerFile = 1024; + const int kNumLevelFiles = 4; + const int kValueSize = 128; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.level0_file_num_compaction_trigger = kNumLevelFiles; + // inflate it a bit to account for key/metadata overhead + options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100; + Reopen(options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + if (i == kNumLevelFiles - 1) { + snapshot = db_->GetSnapshot(); + // delete every other key after grabbing a snapshot, so these deletions + // and the keys they cover can't be dropped until after the snapshot is + // released. + for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) { + ASSERT_OK(Delete(Key(j))); + } + } + ASSERT_OK(Flush()); + if (i < kNumLevelFiles - 1) { + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } + } + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr)); + ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); + + std::vector pre_release_metadata, post_release_metadata; + db_->GetLiveFilesMetaData(&pre_release_metadata); + // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST + // files does not need to be preserved in case of a future snapshot. + ASSERT_OK(Put(Key(0), "val")); + + // release snapshot and no compaction should be triggered. 
+ std::atomic num_compactions{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void* /*arg*/) { num_compactions.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + db_->ReleaseSnapshot(snapshot); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, num_compactions); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + db_->GetLiveFilesMetaData(&post_release_metadata); + ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); + for (size_t i = 0; i < pre_release_metadata.size(); ++i) { + const auto& pre_file = pre_release_metadata[i]; + const auto& post_file = post_release_metadata[i]; + ASSERT_EQ(1, pre_file.level); + ASSERT_EQ(1, post_file.level); + // each file is same as before with deletion markers/deleted keys. + ASSERT_EQ(post_file.size, pre_file.size); + } +} + TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { const int kNumKeysPerFile = 32; const int kNumLevelFiles = 2; @@ -3457,21 +3792,22 @@ options.compression = kNoCompression; options.ttl = 24 * 60 * 60; // 24 hours options.max_open_files = -1; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); Random rnd(301); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3480,44 +3816,45 @@ for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - 
dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); - env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours + env_->MockSleepForSeconds(36 * 60 * 60); // 36 hours ASSERT_EQ("0,2,0,2", FilesPerLevel()); // Just do a simple write + flush so that the Ttl expired files get // compacted. ASSERT_OK(Put("a", "1")); - Flush(); + ASSERT_OK(Flush()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { Compaction* compaction = reinterpret_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); // Test dynamically changing ttl. 
- env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3526,19 +3863,19 @@ for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); // Move time forward by 12 hours, and make sure that compaction still doesn't // trigger as ttl is set to 24 hours. - env_->addon_time_.fetch_add(12 * 60 * 60); + env_->MockSleepForSeconds(12 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,2,0,2", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -3551,13 +3888,14 @@ // Dynamically change ttl to 10 hours. // This should trigger a ttl compaction, as 12 hours have already passed. ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. 
ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { + env_->SetMockSleep(); const int kValueSize = 100; for (bool if_restart : {false, true}) { @@ -3588,10 +3926,10 @@ } }); - env_->time_elapse_only_sleep_ = false; options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int ttl_compactions = 0; @@ -3608,9 +3946,9 @@ // Add two L6 files with key ranges: [1 .. 100], [101 .. 200]. Random rnd(301); for (int i = 1; i <= 100; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Get the first file's creation time. This will be the oldest file in the // DB. Compactions inolving this file's descendents should keep getting // this time. @@ -3619,35 +3957,35 @@ &level_to_files); uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time; // Add 1 hour and do another flush. - env_->addon_time_.fetch_add(1 * 60 * 60); + env_->MockSleepForSeconds(1 * 60 * 60); for (int i = 101; i <= 200; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(6); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); - env_->addon_time_.fetch_add(1 * 60 * 60); + env_->MockSleepForSeconds(1 * 60 * 60); // Add two L4 files with key ranges: [1 .. 50], [51 .. 150]. 
for (int i = 1; i <= 50; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); - env_->addon_time_.fetch_add(1 * 60 * 60); + ASSERT_OK(Flush()); + env_->MockSleepForSeconds(1 * 60 * 60); for (int i = 51; i <= 150; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(4); ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel()); - env_->addon_time_.fetch_add(1 * 60 * 60); + env_->MockSleepForSeconds(1 * 60 * 60); // Add one L1 file with key range: [26, 75]. for (int i = 26; i <= 75; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(1); ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel()); @@ -3671,15 +4009,15 @@ // 4. A TTL compaction happens between L5 and L6 files. Ouptut in L6. 
// Add 25 hours and do a write - env_->addon_time_.fetch_add(25 * 60 * 60); + env_->MockSleepForSeconds(25 * 60 * 60); ASSERT_OK(Put(Key(1), "1")); if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(5, ttl_compactions); @@ -3687,14 +4025,14 @@ &level_to_files); ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time); - env_->addon_time_.fetch_add(25 * 60 * 60); + env_->MockSleepForSeconds(25 * 60 * 60); ASSERT_OK(Put(Key(2), "1")); if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_GE(ttl_compactions, 6); @@ -3704,6 +4042,7 @@ } TEST_F(DBCompactionTest, LevelPeriodicCompaction) { + env_->SetMockSleep(); const int kNumKeysPerFile = 32; const int kNumLevelFiles = 2; const int kValueSize = 100; @@ -3735,10 +4074,10 @@ } }); - env_->time_elapse_only_sleep_ = false; options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int periodic_compactions = 0; @@ -3755,21 +4094,21 @@ Random rnd(301); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { - ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), - RandomString(&rnd, kValueSize))); + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); // Add 50 hours and do a write - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -3779,24 +4118,24 @@ ASSERT_EQ("0,3", FilesPerLevel()); // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("b", "2")); if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,3", FilesPerLevel()); // The three old files now go through the periodic compaction process. 2 // + 3. ASSERT_EQ(5, periodic_compactions); // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "3")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,3", FilesPerLevel()); // The four old files now go through the periodic compaction process. 5 // + 4. @@ -3817,10 +4156,11 @@ const int kValueSize = 100; Options options = CurrentOptions(); - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int periodic_compactions = 0; @@ -3850,9 +4190,9 @@ for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Move the first two files to L2. if (i == 1) { MoveFilesToLevel(2); @@ -3868,7 +4208,7 @@ set_file_creation_time_to_zero = false; // Forward the clock by 2 days. 
- env_->addon_time_.fetch_add(2 * 24 * 60 * 60); + env_->MockSleepForSeconds(2 * 24 * 60 * 60); options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day Reopen(options); @@ -3889,10 +4229,11 @@ options.ttl = 10 * 60 * 60; // 10 hours options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days options.max_open_files = -1; // needed for both periodic and ttl compactions - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int periodic_compactions = 0; @@ -3913,11 +4254,11 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); @@ -3926,20 +4267,20 @@ ASSERT_EQ(0, ttl_compactions); // Add some time greater than periodic_compaction_time. - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Files in the bottom level go through periodic compactions. ASSERT_EQ("1,0,0,2", FilesPerLevel()); ASSERT_EQ(2, periodic_compactions); ASSERT_EQ(0, ttl_compactions); // Add a little more time than ttl - env_->addon_time_.fetch_add(11 * 60 * 60); + env_->MockSleepForSeconds(11 * 60 * 60); ASSERT_OK(Put("b", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Notice that the previous file in level 1 falls down to the bottom level // due to ttl compactions, one level at a time. // And bottom level files don't get picked up for ttl compactions. 
@@ -3948,10 +4289,10 @@ ASSERT_EQ(3, ttl_compactions); // Add some time greater than periodic_compaction_time. - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Previous L0 file falls one level at a time to bottom level due to ttl. // And all 4 bottom files go through periodic compactions. ASSERT_EQ("1,0,0,4", FilesPerLevel()); @@ -3961,6 +4302,67 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, LevelTtlBooster) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 3; + const int kValueSize = 1000; + + Options options = CurrentOptions(); + options.ttl = 10 * 60 * 60; // 10 hours + options.periodic_compaction_seconds = 480 * 60 * 60; // very long + options.level0_file_num_compaction_trigger = 2; + options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize}; + options.max_open_files = -1; // needed for both periodic and ttl compactions + options.compaction_pri = CompactionPri::kMinOverlappingRatio; + env_->SetMockSleep(); + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + MoveFilesToLevel(2); + + ASSERT_EQ("0,0,3", FilesPerLevel()); + + // Create some files for L1 + for (int i = 0; i < 2; i++) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + ASSERT_EQ("0,1,3", FilesPerLevel()); + + // Make the new L0 files qualify TTL boosting and 
generate one more to trigger + // L1 -> L2 compaction. Old files will be picked even if their priority is + // lower without boosting. + env_->MockSleepForSeconds(8 * 60 * 60); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i), + rnd.RandomString(kValueSize * 2))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_EQ("0,1,2", FilesPerLevel()); + + ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize); +} + TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { class TestCompactionFilter : public CompactionFilter { const char* Name() const override { return "TestCompactionFilter"; } @@ -3981,9 +4383,10 @@ Options options = CurrentOptions(); TestCompactionFilter test_compaction_filter; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env enum CompactionFilterType { kUseCompactionFilter, @@ -4024,20 +4427,20 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); // Add 31 days and do a write - env_->addon_time_.fetch_add(31 * 24 * 60 * 60); + env_->MockSleepForSeconds(31 * 24 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -4084,18 +4487,18 @@ Random rnd(301); for (int j = 0; j < 
kNumL0FilesLimit - 1; ++j) { for (int k = 0; k < 2; ++k) { - ASSERT_OK(Put(Key(k), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(k), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4138,21 +4541,21 @@ Random rnd(301); for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) { - ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(0), rnd.RandomString(1024))); FlushOptions flush_opts; flush_opts.wait = false; flush_opts.allow_write_stall = true; - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4186,14 +4589,13 @@ Random rnd(301); for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { for (int k = 0; k < 2; ++k) { - ASSERT_OK(Put(1, Key(k), RandomString(&rnd, 1024))); + ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024))); } - Flush(1); + ASSERT_OK(Flush(1)); } auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); if (i == 0) { 
ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) .IsColumnFamilyDropped()); @@ -4213,7 +4615,7 @@ manual_compaction_thread.join(); TEST_SYNC_POINT( "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } @@ -4246,27 +4648,28 @@ flush_opts.allow_write_stall = true; for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); - Put(ToString(0), RandomString(&rnd, 1024)); - dbfull()->Flush(flush_opts); - Put(ToString(0), RandomString(&rnd, 1024)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); + ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); manual_compaction_thread.join(); // If CompactRange's flush was skipped, the final Put above will still be // in the active memtable. 
std::string num_keys_in_memtable; - db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_keys_in_memtable); + ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, + &num_keys_in_memtable)); ASSERT_EQ(ToString(1), num_keys_in_memtable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4324,7 +4727,7 @@ } else { ASSERT_EQ(2, num_memtable_entries); // flush anyways to prepare for next iteration - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } } } @@ -4339,12 +4742,12 @@ for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); ColumnFamilyData* cfd = cfh->cfd(); @@ -4429,7 +4832,7 @@ ASSERT_OK(Delete("b")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 0); @@ -4476,7 +4879,8 @@ options.level0_slowdown_writes_trigger = 64; options.level0_stop_writes_trigger = 64; options.max_background_jobs = kMaxBackgroundThreads; // Enough threads - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); options.max_write_buffer_number = 10; // Enough memtables DestroyAndReopen(options); @@ -4562,7 +4966,7 @@ } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } } @@ -4580,7 +4984,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(0, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + 
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 0)); } @@ -4595,7 +4999,7 @@ } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -4617,7 +5021,7 @@ // put extra key to trigger flush ASSERT_OK(Put(cf_test, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test])); ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); Compact(cf_test, Key(0), Key(keyIndex)); @@ -4636,7 +5040,7 @@ options.create_if_missing = true; options.disable_auto_compactions = true; options.use_direct_io_for_flush_and_compaction = GetParam(); - options.env = new MockEnv(Env::Default()); + options.env = MockEnv::Create(Env::Default()); Reopen(options); bool readahead = false; SyncPoint::GetInstance()->SetCallBack( @@ -4655,7 +5059,7 @@ CreateAndReopenWithCF({"pikachu"}, options); MakeTables(3, "p", "q", 1); ASSERT_EQ("1,1,1", FilesPerLevel(1)); - Compact(1, "p1", "p9"); + Compact(1, "p", "q"); ASSERT_EQ(readahead, options.use_direct_reads); ASSERT_EQ("0,0,1", FilesPerLevel(1)); Destroy(options); @@ -4668,7 +5072,8 @@ class CompactionPriTest : public DBTestBase, public testing::WithParamInterface { public: - CompactionPriTest() : DBTestBase("/compaction_pri_test") { + CompactionPriTest() + : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) { compaction_pri_ = GetParam(); } @@ -4696,13 +5101,13 @@ for (int i = 0; i < kNKeys; i++) { keys[i] = i; } - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); for (int i = 0; i < kNKeys; i++) { - ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102))); } - dbfull()->TEST_WaitForCompact(); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNKeys; i++) { ASSERT_NE("NOT_FOUND", Get(Key(i))); } @@ -4741,9 +5146,9 @@ Random rnd(301); for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { - Merge("foo", RandomString(&rnd, 1024)); + ASSERT_OK(Merge("foo", rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -4756,7 +5161,7 @@ CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); } TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { @@ -4764,7 +5169,7 @@ // is in read-only mode. Verify it now at least returns, despite failing. const int kNumL0Files = 4; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options opts = CurrentOptions(); opts.disable_auto_compactions = true; opts.env = mock_env.get(); @@ -4773,9 +5178,9 @@ Random rnd(301); for (int i = 0; i < kNumL0Files; ++i) { // Make sure files are overlapping in key-range to prevent trivial move. - Put("key1", RandomString(&rnd, 1024)); - Put("key2", RandomString(&rnd, 1024)); - Flush(); + ASSERT_OK(Put("key1", rnd.RandomString(1024))); + ASSERT_OK(Put("key2", rnd.RandomString(1024))); + ASSERT_OK(Flush()); } ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0)); @@ -4783,7 +5188,7 @@ mock_env->SetFilesystemActive(false); // Make sure this is outside `CompactRange`'s range so that it doesn't fail // early trying to flush memtable. 
- ASSERT_NOK(Put("key3", RandomString(&rnd, 1024))); + ASSERT_NOK(Put("key3", rnd.RandomString(1024))); // In the bug scenario, the first manual compaction would fail and forget to // unregister itself, causing the second one to hang forever due to conflict @@ -4822,9 +5227,9 @@ for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { ASSERT_OK( - Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -4832,9 +5237,9 @@ for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { ASSERT_OK( - Put("bar" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); @@ -4843,7 +5248,7 @@ CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); const std::vector& comp_stats2 = internal_stats_ptr->TEST_GetCompactionStats(); @@ -4851,6 +5256,97 @@ ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, ManualCompactionMax) { + uint64_t l1_avg_size = 0, l2_avg_size = 0; + auto generate_sst_func = [&]() { + Random rnd(301); + for (auto i = 0; i < 100; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + + uint64_t total = 0; + for (const auto& file : level_to_files[1]) { + total += 
file.compensated_file_size; + } + l1_avg_size = total / level_to_files[1].size(); + + total = 0; + for (const auto& file : level_to_files[2]) { + total += file.compensated_file_size; + } + l2_avg_size = total / level_to_files[2].size(); + }; + + std::atomic_int num_compactions(0); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + + // with default setting (1.6G by default), it should cover all files in 1 + // compaction + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == 1); + + // split the compaction to 5 + int num_split = 5; + DestroyAndReopen(opts); + generate_sst_func(); + uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + opts.max_compaction_bytes = total_size / num_split; + opts.target_file_size_base = total_size / num_split; + Reopen(opts); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); + + // very small max_compaction_bytes, it should still move forward + opts.max_compaction_bytes = l1_avg_size / 2; + opts.target_file_size_base = l1_avg_size / 2; + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() > 10); + + // dynamically set the option + num_split = 2; + opts.max_compaction_bytes = 0; + DestroyAndReopen(opts); + generate_sst_func(); + total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + Status s = db_->SetOptions( + {{"max_compaction_bytes", std::to_string(total_size / num_split)}, + {"target_file_size_base", std::to_string(total_size / num_split)}}); + ASSERT_OK(s); + + num_compactions.store(0); + 
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); +} + TEST_F(DBCompactionTest, CompactionDuringShutdown) { Options opts = CurrentOptions(); opts.level0_file_num_compaction_trigger = 2; @@ -4866,16 +5362,17 @@ for (auto i = 0; i < 2; ++i) { for (auto j = 0; j < 10; ++j) { ASSERT_OK( - Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsShutdownInProgress()); ASSERT_OK(dbfull()->error_handler_.GetBGError()); } @@ -4889,7 +5386,7 @@ // Generate an external SST file containing a single key, i.e. 
99 std::string sst_files_dir = dbname_ + "/sst_files/"; - test::DestroyDir(env_, sst_files_dir); + ASSERT_OK(DestroyDir(env_, sst_files_dir)); ASSERT_OK(env_->CreateDir(sst_files_dir)); SstFileWriter sst_writer(EnvOptions(), options); const std::string sst_file_path = sst_files_dir + "test.sst"; @@ -4909,14 +5406,15 @@ options.level0_file_num_compaction_trigger = options.level0_stop_writes_trigger; options.max_subcompactions = max_subcompactions_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); DestroyAndReopen(options); Random rnd(301); // Generate level0_stop_writes_trigger L0 files to trigger write stop for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { for (int j = 0; j != kNumKeysPerFile; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(j), rnd.RandomString(990))); } if (0 == i) { // When we reach here, the memtables have kNumKeysPerFile keys. Note that @@ -4928,7 +5426,7 @@ // extra key to trigger flush. 
ASSERT_OK(Put("", "")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1); } // When we reach this point, there will be level0_stop_writes_trigger L0 @@ -4958,10 +5456,11 @@ TEST_F(DBCompactionTest, ConsistencyFailTest) { Options options = CurrentOptions(); + options.force_consistency_checks = true; DestroyAndReopen(options); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "VersionBuilder::CheckConsistency", [&](void* arg) { + "VersionBuilder::CheckConsistency0", [&](void* arg) { auto p = reinterpret_cast*>(arg); // just swap the two FileMetaData so that we hit error @@ -4975,11 +5474,59 @@ for (int k = 0; k < 2; ++k) { ASSERT_OK(Put("foo", "bar")); - Flush(); + Status s = Flush(); + if (k < 1) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsCorruption()); + } } ASSERT_NOK(Put("foo", "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBCompactionTest, ConsistencyFailTest2) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.target_file_size_base = 1000; + options.level0_file_num_compaction_trigger = 2; + BlockBasedTableOptions bbto; + bbto.block_size = 400; // small block size + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistency1", [&](void* arg) { + auto p = + reinterpret_cast*>(arg); + // just swap the two FileMetaData so that we hit error + // in CheckConsistency funcion + FileMetaData* temp = *(p->first); + *(p->first) = *(p->second); + *(p->second) = temp; + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + std::string value = rnd.RandomString(1000); + + ASSERT_OK(Put("foo1", value)); + ASSERT_OK(Put("z", "")); + ASSERT_OK(Flush()); + 
ASSERT_OK(Put("foo2", value)); + ASSERT_OK(Put("z", "")); + Status s = Flush(); + ASSERT_TRUE(s.ok() || s.IsCorruption()); + + // This probably returns non-OK, but we rely on the next Put() + // to determine the DB is frozen. + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); + ASSERT_NOK(Put("foo", "bar")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); } void IngestOneKeyValue(DBImpl* db, const std::string& key, @@ -5012,10 +5559,16 @@ const size_t kValueSize = 1 << 20; Random rnd(301); std::atomic pick_intra_l0_count(0); - std::string value(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + // The L0->L1 must be picked before we begin ingesting files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBCompactionTestWithParam::FlushAfterIntraL0:1", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTestWithParam::" + "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "FindIntraL0Compaction", @@ -5043,19 +5596,20 @@ ASSERT_OK(Put(Key(0), "a")); ASSERT_EQ(5, NumTableFilesAtLevel(0)); + TEST_SYNC_POINT( + "DBCompactionTestWithParam::" + "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"); // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction. for (int i = 5; i < 10; i++) { + ASSERT_EQ(i, NumTableFilesAtLevel(0)); IngestOneKeyValue(dbfull(), Key(i), value, options); - ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - TEST_SYNC_POINT("DBCompactionTestWithParam::FlushAfterIntraL0:1"); // Put one key, to make biggest log sequence number in this memtable is bigger // than sst which would be ingested in next step. 
ASSERT_OK(Put(Key(2), "b")); - ASSERT_EQ(10, NumTableFilesAtLevel(0)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), @@ -5080,8 +5634,8 @@ const size_t kValueSize = 1 << 20; Random rnd(301); - std::string value(RandomString(&rnd, kValueSize)); - std::string value2(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + std::string value2(rnd.RandomString(kValueSize)); std::string bigvalue = value + value; // prevents trivial move @@ -5093,8 +5647,14 @@ ASSERT_EQ(0, NumTableFilesAtLevel(0)); std::atomic pick_intra_l0_count(0); + // The L0->L1 must be picked before we begin ingesting files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTestWithParam::" + "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "FindIntraL0Compaction", @@ -5125,18 +5685,19 @@ } ASSERT_EQ(6, NumTableFilesAtLevel(0)); + TEST_SYNC_POINT( + "DBCompactionTestWithParam::" + "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"); // ingest file to trigger IntraL0Compaction for (int i = 6; i < 10; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0)); IngestOneKeyValue(dbfull(), Key(i), value2, options); } - ASSERT_EQ(10, NumTableFilesAtLevel(0)); // Wake up flush job sleeping_tasks.WakeUp(); sleeping_tasks.WaitUntilDone(); - TEST_SYNC_POINT("DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1"); - dbfull()->TEST_WaitForCompact(); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); uint64_t error_count = 0; @@ -5151,7 +5712,1668 @@ } } -#endif // !defined(ROCKSDB_LITE) +TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { + constexpr int kSstNum = 10; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Generate some sst files on level 0 with sequence keys (no overlap) + for (int i = 0; i < kSstNum; i++) { + for (int j = 1; j < UCHAR_MAX; j++) { + auto key = std::string(kSstNum, '\0'); + key[kSstNum - i] += static_cast(j); + ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_EQ(ToString(kSstNum), FilesPerLevel(0)); + + auto cro = CompactRangeOptions(); + cro.bottommost_level_compaction = bottommost_level_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce || + bottommost_level_compaction_ == + BottommostLevelCompaction::kForceOptimized) { + // Real compaction to compact all sst files from level 0 to 1 file on level + // 1 + ASSERT_EQ("0,1", FilesPerLevel(0)); + } else { + // Just trivial move from level 0 -> 1 + ASSERT_EQ("0," + ToString(kSstNum), FilesPerLevel(0)); + } +} + +INSTANTIATE_TEST_CASE_P( + DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam, + ::testing::Values(BottommostLevelCompaction::kSkip, + BottommostLevelCompaction::kIfHaveCompactionFilter, + BottommostLevelCompaction::kForce, + BottommostLevelCompaction::kForceOptimized)); + +TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + bool has_compaction = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + 
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 10); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10); + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); + + has_compaction = false; + ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}})); + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 2); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); +} + +TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.compaction_style = kCompactionStyleUniversal; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + bool has_compaction = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 10); + 
has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); + has_compaction = false; + + ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}})); + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 2); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); +} + +TEST_P(ChangeLevelConflictsWithAuto, TestConflict) { + // A `CompactRange()` may race with an automatic compaction, we'll need + // to make sure it doesn't corrupte the data. + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // Run a qury to refitting to level 1 while another thread writing to + // the same level. 
+ SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. + { + "DBImpl::CompactRange:BeforeRefit:1", + "AutoCompactionFinished1", + }, + { + "AutoCompactionFinished2", + "DBImpl::CompactRange:BeforeRefit:2", + }, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread auto_comp([&] { + TEST_SYNC_POINT("AutoCompactionFinished1"); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bar", "v3")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + TEST_SYNC_POINT("AutoCompactionFinished2"); + }); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = GetParam() ? 1 : 0; + // This should return non-OK, but it's more important for the test to + // make sure that the DB is not corrupted. + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + auto_comp.join(); + // Refitting didn't happen. + SyncPoint::GetInstance()->DisableProcessing(); + + // Write something to DB just make sure that consistency check didn't + // fail and make the DB readable. +} + +INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto, + ChangeLevelConflictsWithAuto, testing::Bool()); + +TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) { + // A `CompactRange()` with `change_level == true` needs to execute its final + // step, `ReFitLevel()`, in isolation. Previously there was a bug where + // refitting could target the same level as an ongoing manual compaction, + // leading to overlapping files in that level. + // + // This test ensures that case is not possible by verifying any manual + // compaction issued during the `ReFitLevel()` phase fails with + // `Status::Incomplete`. 
+ Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with three levels populated. + Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the + // foreground thread will try to simultaneously compact L0->L1. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. + { + "DBImpl::RunManualCompaction()::1", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "PutFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "FlushedFG", + "DBImpl::RunManualCompaction()::2", + }, + // The next two dependencies ensure the foreground invokes + // `CompactRange()` while the background is refitting. The + // foreground's `CompactRange()` is guaranteed to attempt an L0->L1 + // as we set it up with an empty memtable and a new L0 file. 
+ { + "DBImpl::CompactRange:PreRefitLevel", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG", + "DBImpl::CompactRange:PostRefitLevel", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG"); + // Make sure we have something new to compact in the foreground. + // Note key 1 is carefully chosen as it ensures the file we create here + // overlaps with one of the files being refitted L2->L1 in the background. + // If we chose key 0, the file created here would not overlap. + ASSERT_OK(Put(Key(1), "val")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG"); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG"); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG"); + refit_level_thread.join(); +} + +TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { + // This test is added to ensure that RefitLevel() error paths are clearing + // internal flags and to test that subsequent valid RefitLevel() calls + // succeeds + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + ASSERT_EQ("", FilesPerLevel(0)); + + // Setup an LSM with three levels populated. 
+ Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1", FilesPerLevel(0)); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + auto start_idx = key_idx; + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + auto end_idx = key_idx - 1; + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // Next two CompactRange() calls are used to test exercise error paths within + // RefitLevel() before triggering a valid RefitLevel() call + + // Trigger a refit to L1 first + { + std::string begin_string = Key(start_idx); + std::string end_string = Key(end_idx); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a refit from L2->L1 - this should fail and exercise error paths in + // RefitLevel() + { + // Select key range that matches the bottom most level (L2) + std::string begin_string = Key(0); + std::string end_string = Key(start_idx - 1); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a valid Refit request to ensure, the path is still working + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,5", FilesPerLevel(0)); +} + +TEST_F(DBCompactionTest, CompactionWithBlob) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = 
"first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), third_value); + ASSERT_EQ(Get(second_key), third_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_EQ(l1_files.size(), 1); + + const FileMetaData* const table_file = l1_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), first_key); + ASSERT_EQ(table_file->largest.user_key(), second_key); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 2); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + 
ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1); +} + +class DBCompactionTestBlobError + : public DBCompactionTest, + public testing::WithParamInterface { + public: + DBCompactionTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBCompactionTestBlobError, CompactionError) { + Options options; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + VersionSet* const versions = 
dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_TRUE(l1_files.empty()); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_TRUE(blob_files.empty()); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 0); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } +} + +class DBCompactionTestBlobGC + : public DBCompactionTest, + public testing::WithParamInterface> { + public: + DBCompactionTestBlobGC() + : blob_gc_age_cutoff_(std::get<0>(GetParam())), + updated_enable_blob_files_(std::get<1>(GetParam())) {} + + double blob_gc_age_cutoff_; + bool updated_enable_blob_files_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC, + ::testing::Combine(::testing::Values(0.0, 0.5, 1.0), + ::testing::Bool())); + +TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) { + Options 
options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.blob_file_size = 32; // one blob per file + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 4); + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. 
+ size_t expected_number_of_files = original_blob_files.size(); + + if (!updated_enable_blob_files_) { + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + + expected_number_of_files -= cutoff_index; + } + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (blob_gc_age_cutoff_ > 0.0) { + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + + if (updated_enable_blob_files_) { + // GC relocated some blobs to new blob files + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, + compaction_stats[1].bytes_written_blob); + } else { + // GC moved some blobs back to the LSM, no new blob files + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } + } else { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } +} + +TEST_F(DBCompactionTest, 
CompactionWithBlobGCError_CorruptIndex) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char corrupt_blob_index[] = "foobar"; + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, + corrupt_blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, 
third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + + // Fake a blob index referencing a non-existent blob file. 
+ std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
+ ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // options is not set, the checksum handoff will not be triggered + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + 
{{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr 
fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + 
new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, FIFOWarm) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleFIFO; + options.num_levels = 1; + options.max_open_files = -1; + 
options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + CompactionOptionsFIFO fifo_options; + fifo_options.age_for_warm = 1000; + fifo_options.max_table_files_size = 100000000; + options.compaction_options_fifo = fifo_options; + env_->SetMockSleep(); + Reopen(options); + + int total_warm = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile::FileOptions.temperature", [&](void* arg) { + Temperature temperature = *(static_cast(arg)); + if (temperature == Temperature::kWarm) { + total_warm++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(4, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature); + ASSERT_EQ(2, total_warm); + + Destroy(options); +} + +TEST_F(DBCompactionTest, DisableMultiManualCompaction) { + const int kNumL0Files = 10; + + Options options = 
CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Generate 2 levels of file to make sure the manual compaction is not skipped + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(2); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(1); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + port::Thread compact_thread1([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(0); + std::string end_str = Key(3); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + port::Thread compact_thread2([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(4); + std::string end_str = Key(7); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + // Disable manual compaction should cancel both manual compactions and both + // compaction should return incomplete. 
+ db_->DisableManualCompaction(); + + compact_thread1.join(); + compact_thread2.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); +} + +TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) { + const int kNumL0Files = 4; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + // make sure the manual compaction background is started but not yet set the + // status to in_progress, then cancel the manual compaction, which should not + // result in segfault + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", + "DBCompactionTest::DisableJustStartedManualCompaction:" + "PreDisableManualCompaction"}, + {"DBImpl::RunManualCompaction:Unscheduled", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + TEST_SYNC_POINT( + "DBCompactionTest::DisableJustStartedManualCompaction:" + "PreDisableManualCompaction"); + db_->DisableManualCompaction(); + + compact_thread.join(); +} + +TEST_F(DBCompactionTest, DisableInProgressManualCompaction) { + const int kNumL0Files = 4; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction:InProgress", + "DBCompactionTest::DisableInProgressManualCompaction:" + "PreDisableManualCompaction"}, + {"DBImpl::RunManualCompaction:Unscheduled", + "CompactionJob::Run():Start"}}); + 
SyncPoint::GetInstance()->EnableProcessing(); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableInProgressManualCompaction:" + "PreDisableManualCompaction"); + db_->DisableManualCompaction(); + + compact_thread.join(); +} + +TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. 
Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + db_->DisableManualCompaction(); + + // CompactRange should return before the compaction has the chance to run + compact_thread.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_EQ("0,1", FilesPerLevel(0)); +} + +TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. 
Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + db_->DisableManualCompaction(); + + // CompactRange should return before the compaction has the chance to run + compact_thread.join(); + + // Try close DB while manual compaction is canceled but still in the queue. + // And an auto-triggered compaction is also in the queue. + auto s = db_->Close(); + ASSERT_OK(s); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST_F(DBCompactionTest, DBCloseWithManualCompaction) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. 
Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + // Close DB with manual compaction and auto triggered compaction in the queue. + auto s = db_->Close(); + ASSERT_OK(s); + + // manual compaction thread should return with Incomplete(). + compact_thread.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST_F(DBCompactionTest, + DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) { + // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait + // for automatic compactions to drain before starting the manual compaction. + // This test verifies `DisableManualCompaction()` can cancel such a compaction + // without waiting for the drain to complete. + const int kNumL0Files = 4; + + // Enforces manual compaction enters wait loop due to pending automatic + // compaction. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"}, + {"DBImpl::RunManualCompaction:WaitScheduled", + "BackgroundCallCompaction:0"}}); + // The automatic compaction will cancel the waiting manual compaction. + // Completing this implies the cancellation did not wait on automatic + // compactions to finish. 
+ bool callback_completed = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void* /*arg*/) { + db_->DisableManualCompaction(); + callback_completed = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(callback_completed); +} + +TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) { + Options options = CurrentOptions(); + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with L2 populated. + Random rnd(301); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the foreground thread will + // attempt to run a compaction on new data. The following dependencies + // ensure the background manual compaction's refitting phase disables manual + // compaction immediately before the foreground manual compaction can register + // itself. Manual compaction is kept disabled until the foreground manual + // checks for the failure once. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // Only do Put()s for foreground CompactRange() once the background + // CompactRange() has reached the refitting phase. 
+ { + "DBImpl::CompactRange:BeforeRefit:1", + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange", + }, + // Right before we register the manual compaction, proceed with + // the refitting phase so manual compactions are disabled. Stay in + // the refitting phase with manual compactions disabled until it is + // noticed. + { + "DBImpl::RunManualCompaction:0", + "DBImpl::CompactRange:BeforeRefit:2", + }, + { + "DBImpl::CompactRange:PreRefitLevel", + "DBImpl::RunManualCompaction:1", + }, + { + "DBImpl::RunManualCompaction:PausedAtStart", + "DBImpl::CompactRange:PostRefitLevel", + }, + // If compaction somehow were scheduled, let's let it run after reenabling + // manual compactions. This dependency is not expected to be hit but is + // here for speculatively coercing future bugs. + { + "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled", + "BackgroundCallCompaction:0", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange"); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + + refit_level_thread.join(); +} + +TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) { + // Flushes several files to trigger compaction while lock is released during + // a bottom-pri compaction. Verifies it does not get scheduled to thread pool + // because per-DB limit for compaction parallelism is one (default). 
+ const int kNumL0Files = 4; + const int kNumLevels = 3; + + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // Setup last level to be non-empty since it's a bit unclear whether + // compaction to an empty level would be considered "bottommost". + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(kNumLevels - 1); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkBottomCompaction", + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"}, + {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_range_thread([&] { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + cro.exclusive_manual_compaction = false; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + // Sleep in the low-pri thread so any newly scheduled compaction will be + // queued. Otherwise it might finish before we check its existence. 
+ test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"); + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction"); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + compact_range_thread.join(); +} + +#endif // !defined(ROCKSDB_LITE) + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,11 +15,14 @@ #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/env.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBTestDynamicLevel : public DBTestBase { public: - DBTestDynamicLevel() : DBTestBase("/db_dynamic_level_test") {} + DBTestDynamicLevel() + : DBTestBase("db_dynamic_level_test", /*env_do_fsync=*/true) {} }; TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) { @@ -27,7 +30,7 @@ return; } // Use InMemoryEnv, or it would be too slow. 
- std::unique_ptr env(new MockEnv(env_)); + std::unique_ptr env(NewMemEnv(env_)); const int kNKeys = 1000; int keys[kNKeys]; @@ -50,7 +53,7 @@ keys[i] = i; } if (ordered_insert == 0) { - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); } for (int max_background_compactions = 1; max_background_compactions < 4; max_background_compactions += 2) { @@ -80,9 +83,9 @@ for (int i = 0; i < kNKeys; i++) { int key = keys[i]; - ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(key), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(kNKeys + key), rnd.RandomString(102))); + ASSERT_OK(Put(Key(key), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kNKeys * 2 + key), rnd.RandomString(102))); ASSERT_OK(Delete(Key(kNKeys + keys[i / 10]))); env_->SleepForMicroseconds(5000); } @@ -100,7 +103,8 @@ } // Test compact range works - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All data should be in the last level. 
ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); @@ -139,6 +143,7 @@ options.max_background_compactions = 2; options.num_levels = 5; options.max_compaction_bytes = 0; // Force not expanding in compactions + options.db_host_id = ""; // Setting this messes up the file size calculation BlockBasedTableOptions table_options; table_options.block_size = 1024; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -158,13 +163,13 @@ // Put about 28K to L0 for (int i = 0; i < 70; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(4U, int_prop); @@ -175,14 +180,14 @@ })); for (int i = 0; i < 70; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop)); @@ -197,13 +202,13 @@ // Write about 40K more for (int i = 0; i < 100; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); @@ -216,7 +221,7 @@ // Each file is about 11KB, with 9KB of data. 
for (int i = 0; i < 1300; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } // Make sure that the compaction starts before the last bit of data is @@ -231,8 +236,8 @@ })); TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(2U, int_prop); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -257,11 +262,11 @@ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1"); for (int i = 0; i < 2; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2"); - Flush(); + ASSERT_OK(Flush()); thread.join(); @@ -299,7 +304,7 @@ DestroyAndReopen(options); // Compact against empty DB - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); uint64_t int_prop; std::string str_prop; @@ -310,16 +315,16 @@ // Put about 7K to L0 for (int i = 0; i < 140; i++) { - ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); + ASSERT_OK( + Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { // Make sure level 0 is not empty - ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); - Flush(); + ASSERT_OK( + Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); + ASSERT_OK(Flush()); } ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); @@ -340,7 +345,7 @@ }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + 
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(output_levels.size(), 2); ASSERT_TRUE(output_levels.find(3) != output_levels.end()); ASSERT_TRUE(output_levels.find(4) != output_levels.end()); @@ -382,12 +387,12 @@ const int total_keys = 3000; const int random_part_size = 100; for (int i = 0; i < total_keys; i++) { - std::string value = RandomString(&rnd, random_part_size); + std::string value = rnd.RandomString(random_part_size); PutFixed32(&value, static_cast(i)); ASSERT_OK(Put(Key(i), value)); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(non_trivial, 0); @@ -441,12 +446,12 @@ int total_keys = 1000; for (int i = 0; i < total_keys; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(i), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102))); ASSERT_OK(Delete(Key(i / 10))); } verify_func(total_keys, false); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); options.level_compaction_dynamic_level_bytes = true; options.disable_auto_compactions = true; @@ -461,7 +466,7 @@ CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = options.num_levels - 1; - dbfull()->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); compaction_finished.store(true); }); do { @@ -475,13 +480,13 @@ int total_keys2 = 2000; for (int i = total_keys; i < total_keys2; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(i), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102))); ASSERT_OK(Delete(Key(i / 10))); } verify_func(total_keys2, false); 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(total_keys2, false); // Base level is not level 1 diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_encryption_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_encryption_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_encryption_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_encryption_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,7 +16,15 @@ class DBEncryptionTest : public DBTestBase { public: - DBEncryptionTest() : DBTestBase("/db_encryption_test") {} + DBEncryptionTest() + : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {} + Env* GetTargetEnv() { + if (encrypted_env_ != nullptr) { + return (static_cast(encrypted_env_))->target(); + } else { + return env_; + } + } }; #ifndef ROCKSDB_LITE @@ -33,20 +41,20 @@ auto status = env_->GetChildren(dbname_, &fileNames); ASSERT_OK(status); - auto defaultEnv = Env::Default(); + Env* target = GetTargetEnv(); int hits = 0; for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) { - if ((*it == "..") || (*it == ".")) { + if (*it == "LOCK") { continue; } auto filePath = dbname_ + "/" + *it; std::unique_ptr seqFile; auto envOptions = EnvOptions(CurrentOptions()); - status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + status = target->NewSequentialFile(filePath, &seqFile, envOptions); ASSERT_OK(status); uint64_t fileSize; - status = defaultEnv->GetFileSize(filePath, &fileSize); + status = target->GetFileSize(filePath, &fileSize); ASSERT_OK(status); std::string scratch; @@ -84,7 +92,7 @@ } TEST_F(DBEncryptionTest, ReadEmptyFile) { - auto defaultEnv = Env::Default(); + auto defaultEnv = GetTargetEnv(); // create empty file for reading it back in later auto envOptions = EnvOptions(CurrentOptions()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_filesnapshot.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_filesnapshot.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_filesnapshot.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_filesnapshot.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,72 +6,62 @@ #ifndef ROCKSDB_LITE -#include #include -#include +#include +#include #include +#include + #include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" #include "file/file_util.h" #include "file/filename.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/metadata.h" +#include "rocksdb/types.h" #include "test_util/sync_point.h" +#include "util/file_checksum_helper.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -Status DBImpl::DisableFileDeletions() { - InstrumentedMutexLock l(&mutex_); - ++disable_delete_obsolete_files_; - if (disable_delete_obsolete_files_ == 1) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Disabled, but already disabled. 
Counter: %d", - disable_delete_obsolete_files_); - } - return Status::OK(); -} +Status DBImpl::FlushForGetLiveFiles() { + mutex_.AssertHeld(); -Status DBImpl::EnableFileDeletions(bool force) { - // Job id == 0 means that this is not our background process, but rather - // user thread - JobContext job_context(0); - bool file_deletion_enabled = false; - { - InstrumentedMutexLock l(&mutex_); - if (force) { - // if force, we need to enable file deletions right away - disable_delete_obsolete_files_ = 0; - } else if (disable_delete_obsolete_files_ > 0) { - --disable_delete_obsolete_files_; - } - if (disable_delete_obsolete_files_ == 0) { - file_deletion_enabled = true; - FindObsoleteFiles(&job_context, true); - bg_cv_.SignalAll(); - } - } - if (file_deletion_enabled) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); - if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); + // flush all dirty data to disk. + Status status; + if (immutable_db_options_.atomic_flush) { + autovector cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + status = + AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles); + if (status.IsColumnFamilyDropped()) { + status = Status::OK(); } + mutex_.Lock(); } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Enable, but not really enabled. 
Counter: %d", - disable_delete_obsolete_files_); + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + mutex_.Unlock(); + status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); + mutex_.Lock(); + if (!status.ok() && !status.IsColumnFamilyDropped()) { + break; + } else if (status.IsColumnFamilyDropped()) { + status = Status::OK(); + } + } } - job_context.Clean(); - LogFlush(immutable_db_options_.info_log); - return Status::OK(); -} - -int DBImpl::IsFileDeletionsEnabled() const { - return !disable_delete_obsolete_files_; + return status; } Status DBImpl::GetLiveFiles(std::vector& ret, @@ -82,34 +72,7 @@ mutex_.Lock(); if (flush_memtable) { - // flush all dirty data to disk. - Status status; - if (immutable_db_options_.atomic_flush) { - autovector cfds; - SelectColumnFamiliesForAtomicFlush(&cfds); - mutex_.Unlock(); - status = AtomicFlushMemTables(cfds, FlushOptions(), - FlushReason::kGetLiveFiles); - mutex_.Lock(); - } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - cfd->Ref(); - mutex_.Unlock(); - status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); - if (!status.ok()) { - break; - } - } - } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - + Status status = FlushForGetLiveFiles(); if (!status.ok()) { mutex_.Unlock(); ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n", @@ -118,27 +81,40 @@ } } - // Make a set of all of the live *.sst files - std::vector live; + // Make a set of all of the live table and blob files + std::vector live_table_files; + std::vector live_blob_files; for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - 
cfd->current()->AddLiveFiles(&live); + cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files); } ret.clear(); - ret.reserve(live.size() + 3); // *.sst + CURRENT + MANIFEST + OPTIONS + ret.reserve(live_table_files.size() + live_blob_files.size() + + 3); // for CURRENT + MANIFEST + OPTIONS // create names of the live files. The names are not absolute // paths, instead they are relative to dbname_; - for (const auto& live_file : live) { - ret.push_back(MakeTableFileName("", live_file.GetNumber())); + for (const auto& table_file_number : live_table_files) { + ret.emplace_back(MakeTableFileName("", table_file_number)); } - ret.push_back(CurrentFileName("")); - ret.push_back(DescriptorFileName("", versions_->manifest_file_number())); - ret.push_back(OptionsFileName("", versions_->options_file_number())); + for (const auto& blob_file_number : live_blob_files) { + ret.emplace_back(BlobFileName("", blob_file_number)); + } + + ret.emplace_back(CurrentFileName("")); + ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number())); + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. + if (versions_->options_file_number() != 0) { + ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + } // find length of manifest file while holding the mutex lock *manifest_file_size = versions_->manifest_file_size(); @@ -148,19 +124,33 @@ } Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { + // If caller disabled deletions, this function should return files that are + // guaranteed not to be deleted until deletions are re-enabled. We need to + // wait for pending purges to finish since WalManager doesn't know which + // files are going to be purged. 
Additional purges won't be scheduled as + // long as deletions are disabled (so the below loop must terminate). + // Also note that we disable deletions anyway to avoid the case where a + // file is deleted in the middle of the scan, causing IO error. + Status deletions_disabled = DisableFileDeletions(); { - // If caller disabled deletions, this function should return files that are - // guaranteed not to be deleted until deletions are re-enabled. We need to - // wait for pending purges to finish since WalManager doesn't know which - // files are going to be purged. Additional purges won't be scheduled as - // long as deletions are disabled (so the below loop must terminate). InstrumentedMutexLock l(&mutex_); - while (disable_delete_obsolete_files_ > 0 && - pending_purge_obsolete_files_ > 0) { + while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) { bg_cv_.Wait(); } } - return wal_manager_.GetSortedWalFiles(files); + + Status s = wal_manager_.GetSortedWalFiles(files); + + // DisableFileDeletions / EnableFileDeletions not supported in read-only DB + if (deletions_disabled.ok()) { + Status s2 = EnableFileDeletions(/*force*/ false); + assert(s2.ok()); + s2.PermitUncheckedError(); + } else { + assert(deletions_disabled.IsNotSupported()); + } + + return s; } Status DBImpl::GetCurrentWalFile(std::unique_ptr* current_log_file) { @@ -172,6 +162,245 @@ return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file); } + +Status DBImpl::GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) { + // To avoid returning partial results, only move to ouput on success + assert(files); + files->clear(); + std::vector results; + + // NOTE: This implementation was largely migrated from Checkpoint. 
+ + Status s; + VectorLogPtr live_wal_files; + bool flush_memtable = true; + if (!immutable_db_options_.allow_2pc) { + if (opts.wal_size_for_flush == port::kMaxUint64) { + flush_memtable = false; + } else if (opts.wal_size_for_flush > 0) { + // If out standing log files are small, we skip the flush. + s = GetSortedWalFiles(live_wal_files); + + if (!s.ok()) { + return s; + } + + // Don't flush column families if total log size is smaller than + // log_size_for_flush. We copy the log files instead. + // We may be able to cover 2PC case too. + uint64_t total_wal_size = 0; + for (auto& wal : live_wal_files) { + total_wal_size += wal->SizeFileBytes(); + } + if (total_wal_size < opts.wal_size_for_flush) { + flush_memtable = false; + } + live_wal_files.clear(); + } + } + + // This is a modified version of GetLiveFiles, to get access to more + // metadata. + mutex_.Lock(); + if (flush_memtable) { + Status status = FlushForGetLiveFiles(); + if (!status.ok()) { + mutex_.Unlock(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n", + status.ToString().c_str()); + return status; + } + } + + // Make a set of all of the live table and blob files + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + VersionStorageInfo& vsi = *cfd->current()->storage_info(); + auto& cf_paths = cfd->ioptions()->cf_paths; + + auto GetDir = [&](size_t path_id) { + // Matching TableFileName() behavior + if (path_id >= cf_paths.size()) { + assert(false); + return cf_paths.back().path; + } else { + return cf_paths[path_id].path; + } + }; + + for (int level = 0; level < vsi.num_levels(); ++level) { + const auto& level_files = vsi.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = MakeTableFileName(meta->fd.GetNumber()); + info.directory = GetDir(meta->fd.GetPathId()); + info.file_number = meta->fd.GetNumber(); + 
info.file_type = kTableFile; + info.size = meta->fd.GetFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->file_checksum_func_name; + info.file_checksum = meta->file_checksum; + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + info.temperature = meta->temperature; + } + } + const auto& blob_files = vsi.GetBlobFiles(); + for (const auto& pair : blob_files) { + const auto& meta = pair.second; + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = BlobFileName(meta->GetBlobFileNumber()); + info.directory = GetName(); // TODO?: support db_paths/cf_paths + info.file_number = meta->GetBlobFileNumber(); + info.file_type = kBlobFile; + info.size = meta->GetBlobFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->GetChecksumMethod(); + info.file_checksum = meta->GetChecksumValue(); + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + // TODO?: info.temperature + } + } + + // Capture some final info before releasing mutex + const uint64_t manifest_number = versions_->manifest_file_number(); + const uint64_t manifest_size = versions_->manifest_file_size(); + const uint64_t options_number = versions_->options_file_number(); + const uint64_t options_size = versions_->options_file_size_; + const uint64_t min_log_num = MinLogNumberToKeep(); + + mutex_.Unlock(); + + std::string manifest_fname = DescriptorFileName(manifest_number); + { // MANIFEST + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = manifest_fname; + info.directory = GetName(); + info.file_number = manifest_number; + info.file_type = kDescriptorFile; + info.size = manifest_size; + info.trim_to_size = true; + if 
(opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + { // CURRENT + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = kCurrentFileName; + info.directory = GetName(); + info.file_type = kCurrentFile; + // CURRENT could be replaced so we have to record the contents we want + // for it + info.replacement_contents = manifest_fname + "\n"; + info.size = manifest_fname.size() + 1; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. 
+ if (options_number != 0) { + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = OptionsFileName(options_number); + info.directory = GetName(); + info.file_number = options_number; + info.file_type = kOptionsFile; + info.size = options_size; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + // Some legacy testing stuff TODO: carefully clean up obsolete parts + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone"); + + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); + + if (s.ok()) { + s = FlushWAL(false /* sync */); + } + + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"); + + // if we have more than one column family, we need to also get WAL files + if (s.ok()) { + s = GetSortedWalFiles(live_wal_files); + } + if (!s.ok()) { + return s; + } + + size_t wal_size = live_wal_files.size(); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size()); + + // Link WAL files. Copy exact size of last one because it is the only one + // that has changes after the last flush. 
+ auto wal_dir = immutable_db_options_.GetWalDir(); + for (size_t i = 0; s.ok() && i < wal_size; ++i) { + if ((live_wal_files[i]->Type() == kAliveLogFile) && + (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) { + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + auto f = live_wal_files[i]->PathName(); + assert(!f.empty() && f[0] == '/'); + info.relative_filename = f.substr(1); + info.directory = wal_dir; + info.file_number = live_wal_files[i]->LogNumber(); + info.file_type = kWalFile; + info.size = live_wal_files[i]->SizeFileBytes(); + // Only last should need to be trimmed + info.trim_to_size = (i + 1 == wal_size); + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + } + + if (s.ok()) { + // Only move output on success + *files = std::move(results); + } + return s; +} + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_flush_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_flush_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_flush_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_flush_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,21 +8,31 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include +#include #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "env/mock_env.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" +#include "rocksdb/utilities/transaction_db.h" #include "test_util/sync_point.h" +#include "test_util/testutil.h" #include "util/cast_util.h" #include "util/mutexlock.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { +// This is a static filter used for filtering +// kvs during the compaction process. +static std::string NEW_VALUE = "NewValue"; + class DBFlushTest : public DBTestBase { public: - DBFlushTest() : DBTestBase("/db_flush_test") {} + DBFlushTest() : DBTestBase("db_flush_test", /*env_do_fsync=*/true) {} }; class DBFlushDirectIOTest : public DBFlushTest, @@ -62,7 +72,7 @@ ASSERT_OK(Put("bar", "v")); ASSERT_OK(dbfull()->Flush(no_wait)); // If the issue is hit we will wait here forever. 
- dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ(2, TotalTableFiles()); #endif // ROCKSDB_LITE @@ -78,41 +88,26 @@ options.env = fault_injection_env.get(); SyncPoint::GetInstance()->LoadDependency( - {{"DBFlushTest::SyncFail:GetVersionRefCount:1", - "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"}, - {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", - "DBFlushTest::SyncFail:GetVersionRefCount:2"}, - {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, + {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}}); SyncPoint::GetInstance()->EnableProcessing(); CreateAndReopenWithCF({"pikachu"}, options); - Put("key", "value"); - auto* cfd = - reinterpret_cast(db_->DefaultColumnFamily()) - ->cfd(); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; ASSERT_OK(dbfull()->Flush(flush_options)); // Flush installs a new super-version. Get the ref count after that. - auto current_before = cfd->current(); - int refs_before = cfd->current()->TEST_refs(); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1"); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2"); - int refs_after_picking_memtables = cfd->current()->TEST_refs(); - ASSERT_EQ(refs_before + 1, refs_after_picking_memtables); fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBFlushTest::SyncFail:1"); TEST_SYNC_POINT("DBFlushTest::SyncFail:2"); fault_injection_env->SetFilesystemActive(true); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + // Returns the IO error happend during flush. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ("", FilesPerLevel()); // flush failed. #endif // ROCKSDB_LITE - // Backgroun flush job should release ref count to current version. 
- ASSERT_EQ(current_before, cfd->current()); - ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); Destroy(options); } @@ -125,7 +120,7 @@ SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; @@ -135,7 +130,7 @@ TEST_SYNC_POINT("DBFlushTest::SyncSkip:2"); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); Destroy(options); } @@ -145,7 +140,7 @@ // scheduled in the low-pri (compaction) thread pool. Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(1)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); Reopen(options); env_->SetBackgroundThreads(0, Env::HIGH); @@ -170,13 +165,73 @@ ASSERT_OK(Put("key", "val")); for (int i = 0; i < 4; ++i) { ASSERT_OK(Put("key", "val")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(4, num_flushes); ASSERT_EQ(1, num_compactions); } +// Test when flush job is submitted to low priority thread pool and when DB is +// closed in the meanwhile, CloseHelper doesn't hang. 
+TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) { + Options options = CurrentOptions(); + options.max_background_flushes = 1; + options.max_total_wal_size = 8192; + + DestroyAndReopen(options); + CreateColumnFamilies({"cf1", "cf2"}, options); + + env_->SetBackgroundThreads(0, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + int num_flushes = 0; + + SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush", + [&](void* /*arg*/) { ++num_flushes; }); + + int num_low_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) { + num_low_flush_unscheduled++; + // There should be one flush job in low pool that needs to be + // unscheduled + ASSERT_EQ(num_low_flush_unscheduled, 1); + }); + + int num_high_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) { + num_high_flush_unscheduled++; + // There should be no flush job in high pool + ASSERT_EQ(num_high_flush_unscheduled, 0); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(0, "key1", DummyString(8192))); + // Block thread so that flush cannot be run and can be removed from the queue + // when called Unschedule. + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + + // Trigger flush and flush job will be scheduled to LOW priority thread. + ASSERT_OK(Put(0, "key2", DummyString(8192))); + + // Close DB and flush job in low priority queue will be removed without + // running. 
+ Close(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_EQ(0, num_flushes); + + TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options); + ASSERT_OK(Put(0, "key3", DummyString(8192))); + ASSERT_OK(Flush(0)); + ASSERT_EQ(1, num_flushes); +} + TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) { Options options = CurrentOptions(); options.write_buffer_size = 100; @@ -236,13 +291,1096 @@ SyncPoint::GetInstance()->ClearAllCallBacks(); } +// The following 3 tests are designed for testing garbage statistics at flush +// time. +// +// ======= General Information ======= (from GitHub Wiki). +// There are three scenarios where memtable flush can be triggered: +// +// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size +// after a write. +// 2 - Total memtable size across all column families exceeds +// DBOptions::db_write_buffer_size, +// or DBOptions::write_buffer_manager signals a flush. In this scenario +// the largest memtable will be flushed. +// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size. +// In this scenario the memtable with the oldest data will be flushed, +// in order to allow the WAL file with data from this memtable to be +// purged. +// +// As a result, a memtable can be flushed before it is full. This is one +// reason the generated SST file can be smaller than the corresponding +// memtable. Compression is another factor to make SST file smaller than +// corresponding memtable, since data in memtable is uncompressed. + +TEST_F(DBFlushTest, StatisticsGarbageBasic) { + Options options = CurrentOptions(); + + // The following options are used to enforce several values that + // may already exist as default values to make this test resilient + // to default value updates in the future. + options.statistics = CreateDBStatistics(); + + // Record all statistics. 
+ options.statistics->set_stats_level(StatsLevel::kAll); + + // create the DB if it's not already present + options.create_if_missing = true; + + // Useful for now as we are trying to compare uncompressed data savings on + // flush(). + options.compression = kNoCompression; + + // Prevent memtable in place updates. Should already be disabled + // (from Wiki: + // In place updates can be enabled by toggling on the bool + // inplace_update_support flag. However, this flag is by default set to + // false + // because this thread-safe in-place update support is not compatible + // with concurrent memtable writes. Note that the bool + // allow_concurrent_memtable_write is set to true by default ) + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 64 << 20; + + ASSERT_OK(TryReopen(options)); + + // Put multiple times the same key-values. + // The encoded length of a db entry in the memtable is + // defined in db/memtable.cc (MemTable::Add) as the variable: + // encoded_len= VarintLength(internal_key_size) --> = + // log_256(internal_key). + // Min # of bytes + // necessary to + // store + // internal_key_size. + // + internal_key_size --> = actual key string, + // (size key_size: w/o term null char) + // + 8 bytes for + // fixed uint64 "seq + // number + // + + // insertion type" + // + VarintLength(val_size) --> = min # of bytes to + // store val_size + // + val_size --> = actual value + // string + // For example, in our situation, "key1" : size 4, "value1" : size 6 + // (the terminating null characters are not copied over to the memtable). + // And therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry. 
+ // However in terms of raw data contained in the memtable, and written + // over to the SSTable, we only count internal_key_size and val_size, + // because this is the only raw chunk of bytes that contains everything + // necessary to reconstruct a user entry: sequence number, insertion type, + // key, and value. + + // To test the relevance of our Memtable garbage statistics, + // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + // we insert K-V pairs with 3 distinct keys (of length 4), + // and random values of arbitrary length RAND_VALUES_LENGTH, + // and we repeat this step NUM_REPEAT times total. + // At the end, we insert 3 final K-V pairs with the same 3 keys + // and known values (these will be the final values, of length 6). + // I chose NUM_REPEAT=2,000 such that no automatic flush is + // triggered (the number of bytes in the memtable is therefore + // well below any meaningful heuristic for a memtable of size 64MB). + // As a result, since each K-V pair is inserted as a payload + // of N meaningful bytes (sequence number, insertion type, + // key, and value = 8 + 4 + RAND_VALUE_LENGTH), + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to 2,000 * N bytes + // and MEMTABLE_PAYLAOD_BYTES_AT_FLUSH = MEMTABLE_GARBAGE_BYTES_AT_FLUSH + + // (3*(8 + 4 + 6)) bytes. For RAND_VALUE_LENGTH = 172 (arbitrary value), we + // expect: + // N = 8 + 4 + 172 = 184 bytes + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 2,000 * 184 = 368,000 bytes. + // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 368,000 + 3*18 = 368,054 bytes. 
+ + const size_t NUM_REPEAT = 2000; + const size_t RAND_VALUES_LENGTH = 172; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string VALUE1 = "value1"; + const std::string VALUE2 = "value2"; + const std::string VALUE3 = "value3"; + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + ASSERT_OK(Put(KEY1, VALUE1)); + ASSERT_OK(Put(KEY2, VALUE2)); + ASSERT_OK(Put(KEY3, VALUE3)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() + + VALUE3.size() + 3 * sizeof(uint64_t); + + // We assert that the last K-V pairs have been successfully inserted, + // and that the valid values are VALUE1, VALUE2, VALUE3. 
+ PinnableSlice value; + ASSERT_OK(Get(KEY1, &value)); + ASSERT_EQ(value.ToString(), VALUE1); + ASSERT_OK(Get(KEY2, &value)); + ASSERT_EQ(value.ToString(), VALUE2); + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value.ToString(), VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. + uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) { + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + options.write_buffer_size = 67108864; + + ASSERT_OK(TryReopen(options)); + + const size_t NUM_REPEAT = 2000; + const size_t RAND_VALUES_LENGTH = 37; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string KEY4 = "key4"; + const std::string KEY5 = "key5"; + const std::string KEY6 = "key6"; + + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + WriteBatch batch; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + ASSERT_OK(Delete(KEY1)); + ASSERT_OK(Delete(KEY2)); + ASSERT_OK(Delete(KEY3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + // Note : one set of delete for KEY1, KEY2, KEY3 is written to + // SSTable to propagate the delete operations to K-V pairs + // that could have been inserted into the database during past Flush + // opeartions. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= + KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t); + + // Additional useful paylaod. + ASSERT_OK(Delete(KEY4)); + ASSERT_OK(Delete(KEY5)); + ASSERT_OK(Delete(KEY6)); + + // // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t); + + // We assert that the K-V pairs have been successfully deleted. + PinnableSlice value; + ASSERT_NOK(Get(KEY1, &value)); + ASSERT_NOK(Get(KEY2, &value)); + ASSERT_NOK(Get(KEY3, &value)); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. 
+ uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) { + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + options.write_buffer_size = 67108864; + + ASSERT_OK(TryReopen(options)); + + const size_t NUM_REPEAT = 1000; + const size_t RAND_VALUES_LENGTH = 42; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string KEY4 = "key4"; + const std::string KEY5 = "key5"; + const std::string KEY6 = "key6"; + const std::string VALUE3 = "value3"; + + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + // Also insert DeleteRange + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1, + KEY2)); + // Note: DeleteRange have an exclusive upper bound, e.g. here: [KEY2,KEY3) + // is deleted. + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2, + KEY3)); + // Delete ranges are stored as a regular K-V pair, with key=STARTKEY, + // value=ENDKEY. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written + // to SSTable to propagate the deleteRange operations to K-V pairs that could + // have been inserted into the database during past Flush opeartions. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + + // Overwrite KEY3 with known value (VALUE3) + // Note that during the whole time KEY3 has never been deleted + // by the RangeDeletes. + ASSERT_OK(Put(KEY3, VALUE3)); + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY3.size() + VALUE3.size() + sizeof(uint64_t); + + // Additional useful paylaod. 
+ ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5)); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + (KEY4.size() + KEY5.size() + sizeof(uint64_t)) + + (KEY5.size() + KEY6.size() + sizeof(uint64_t)); + + // We assert that the K-V pairs have been successfully deleted. + PinnableSlice value; + ASSERT_NOK(Get(KEY1, &value)); + ASSERT_NOK(Get(KEY2, &value)); + // And that KEY3's value is correct. + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value, VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. + uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +#ifndef ROCKSDB_LITE +// This simple Listener can only handle one flush at a time. +class TestFlushListener : public EventListener { + public: + TestFlushListener(Env* env, DBFlushTest* test) + : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { + db_closed = false; + } + + ~TestFlushListener() override { + prev_fc_info_.status.PermitUncheckedError(); // Ignore the status + } + + void OnTableFileCreated(const TableFileCreationInfo& info) override { + // remember the info for later checking the FlushJobInfo. 
+ prev_fc_info_ = info; + ASSERT_GT(info.db_name.size(), 0U); + ASSERT_GT(info.cf_name.size(), 0U); + ASSERT_GT(info.file_path.size(), 0U); + ASSERT_GT(info.job_id, 0); + ASSERT_GT(info.table_properties.data_size, 0U); + ASSERT_GT(info.table_properties.raw_key_size, 0U); + ASSERT_GT(info.table_properties.raw_value_size, 0U); + ASSERT_GT(info.table_properties.num_data_blocks, 0U); + ASSERT_GT(info.table_properties.num_entries, 0U); + ASSERT_EQ(info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); + } + + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + flushed_dbs_.push_back(db); + flushed_column_family_names_.push_back(info.cf_name); + if (info.triggered_writes_slowdown) { + slowdown_count++; + } + if (info.triggered_writes_stop) { + stop_count++; + } + // verify whether the previously created file matches the flushed file. + ASSERT_EQ(prev_fc_info_.db_name, db->GetName()); + ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name); + ASSERT_EQ(prev_fc_info_.job_id, info.job_id); + ASSERT_EQ(prev_fc_info_.file_path, info.file_path); + ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number); + + // Note: the following chunk relies on the notification pertaining to the + // database pointed to by DBTestBase::db_, and is thus bypassed when + // that assumption does not hold (see the test case MultiDBMultiListeners + // below). 
+ ASSERT_TRUE(test_); + if (db == test_->db_) { + std::vector> files_by_level; + test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(), + &files_by_level); + + ASSERT_FALSE(files_by_level.empty()); + auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(), + [&](const FileMetaData& meta) { + return meta.fd.GetNumber() == info.file_number; + }); + ASSERT_NE(it, files_by_level[0].end()); + ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number); + } + + ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id); + ASSERT_GT(info.thread_id, 0U); + } + + std::vector flushed_column_family_names_; + std::vector flushed_dbs_; + int slowdown_count; + int stop_count; + bool db_closing; + std::atomic_bool db_closed; + TableFileCreationInfo prev_fc_info_; + + protected: + Env* env_; + DBFlushTest* test_; +}; +#endif // !ROCKSDB_LITE + +TEST_F(DBFlushTest, MemPurgeBasic) { + Options options = CurrentOptions(); + + // The following options are used to enforce several values that + // may already exist as default values to make this test resilient + // to default value updates in the future. + options.statistics = CreateDBStatistics(); + + // Record all statistics. + options.statistics->set_stats_level(StatsLevel::kAll); + + // create the DB if it's not already present + options.create_if_missing = true; + + // Useful for now as we are trying to compare uncompressed data savings on + // flush(). + options.compression = kNoCompression; + + // Prevent memtable in place updates. Should already be disabled + // (from Wiki: + // In place updates can be enabled by toggling on the bool + // inplace_update_support flag. However, this flag is by default set to + // false + // because this thread-safe in-place update support is not compatible + // with concurrent memtable writes. 
Note that the bool + // allow_concurrent_memtable_write is set to true by default ) + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 1 << 20; + // Activate the MemPurge prototype. + options.experimental_mempurge_threshold = 1.0; +#ifndef ROCKSDB_LITE + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); +#endif // !ROCKSDB_LITE + ASSERT_OK(TryReopen(options)); + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::string KEY1 = "IamKey1"; + std::string KEY2 = "IamKey2"; + std::string KEY3 = "IamKey3"; + std::string KEY4 = "IamKey4"; + std::string KEY5 = "IamKey5"; + std::string KEY6 = "IamKey6"; + std::string KEY7 = "IamKey7"; + std::string KEY8 = "IamKey8"; + std::string KEY9 = "IamKey9"; + std::string RNDKEY1, RNDKEY2, RNDKEY3; + const std::string NOT_FOUND = "NOT_FOUND"; + + // Heavy overwrite workload, + // more than would fit in maximum allowed memtables. + Random rnd(719); + const size_t NUM_REPEAT = 100; + const size_t RAND_KEYS_LENGTH = 57; + const size_t RAND_VALUES_LENGTH = 10240; + std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9, p_rv1, + p_rv2, p_rv3; + + // Insert a very first set of keys that will be + // mempurged at least once. 
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v4 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + ASSERT_OK(Put(KEY4, p_v4)); + ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + + // Insertion of of K-V pairs, multiple times (overwrites). + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v6 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v7 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v8 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v9 = rnd.RandomString(RAND_VALUES_LENGTH); + + ASSERT_OK(Put(KEY5, p_v5)); + ASSERT_OK(Put(KEY6, p_v6)); + ASSERT_OK(Put(KEY7, p_v7)); + ASSERT_OK(Put(KEY8, p_v8)); + ASSERT_OK(Put(KEY9, p_v9)); + + ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + ASSERT_EQ(Get(KEY6), p_v6); + ASSERT_EQ(Get(KEY7), p_v7); + ASSERT_EQ(Get(KEY8), p_v8); + ASSERT_EQ(Get(KEY9), p_v9); + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. + const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + + // Insertion of of K-V pairs, no overwrites. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ RNDKEY1 = rnd.RandomString(RAND_KEYS_LENGTH); + RNDKEY2 = rnd.RandomString(RAND_KEYS_LENGTH); + RNDKEY3 = rnd.RandomString(RAND_KEYS_LENGTH); + p_rv1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_rv2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_rv3 = rnd.RandomString(RAND_VALUES_LENGTH); + + ASSERT_OK(Put(RNDKEY1, p_rv1)); + ASSERT_OK(Put(RNDKEY2, p_rv2)); + ASSERT_OK(Put(RNDKEY3, p_rv3)); + + ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + ASSERT_EQ(Get(KEY6), p_v6); + ASSERT_EQ(Get(KEY7), p_v7); + ASSERT_EQ(Get(KEY8), p_v8); + ASSERT_EQ(Get(KEY9), p_v9); + ASSERT_EQ(Get(RNDKEY1), p_rv1); + ASSERT_EQ(Get(RNDKEY2), p_rv2); + ASSERT_EQ(Get(RNDKEY3), p_rv3); + } + + // Assert that at least one flush to storage has been performed + EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT); + // (which will consequently increase the number of mempurges recorded too). + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + + // Assert that there is no data corruption, even with + // a flush to storage. 
+ ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + ASSERT_EQ(Get(KEY6), p_v6); + ASSERT_EQ(Get(KEY7), p_v7); + ASSERT_EQ(Get(KEY8), p_v8); + ASSERT_EQ(Get(KEY9), p_v9); + ASSERT_EQ(Get(RNDKEY1), p_rv1); + ASSERT_EQ(Get(RNDKEY2), p_rv2); + ASSERT_EQ(Get(RNDKEY3), p_rv3); + + Close(); +} + +TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) { + Options options = CurrentOptions(); + + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; +#ifndef ROCKSDB_LITE + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); +#endif // !ROCKSDB_LITE + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 1 << 20; + // Activate the MemPurge prototype. 
+ options.experimental_mempurge_threshold = 1.0; + + ASSERT_OK(TryReopen(options)); + + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::string KEY1 = "ThisIsKey1"; + std::string KEY2 = "ThisIsKey2"; + std::string KEY3 = "ThisIsKey3"; + std::string KEY4 = "ThisIsKey4"; + std::string KEY5 = "ThisIsKey5"; + const std::string NOT_FOUND = "NOT_FOUND"; + + Random rnd(117); + const size_t NUM_REPEAT = 100; + const size_t RAND_VALUES_LENGTH = 10240; + + std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5; + int count = 0; + const int EXPECTED_COUNT_FORLOOP = 3; + const int EXPECTED_COUNT_END = 4; + + ReadOptions ropt; + ropt.pin_data = true; + ropt.total_order_seek = true; + Iterator* iter = nullptr; + + // Insertion of of K-V pairs, multiple times. + // Also insert DeleteRange + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3b = rnd.RandomString(RAND_VALUES_LENGTH); + p_v4 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + ASSERT_OK(Put(KEY4, p_v4)); + ASSERT_OK(Put(KEY5, p_v5)); + ASSERT_OK(Delete(KEY2)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2, + KEY4)); + ASSERT_OK(Put(KEY3, p_v3b)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1, + KEY3)); + ASSERT_OK(Delete(KEY1)); + + ASSERT_EQ(Get(KEY1), NOT_FOUND); + ASSERT_EQ(Get(KEY2), NOT_FOUND); + ASSERT_EQ(Get(KEY3), p_v3b); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + + iter = db_->NewIterator(ropt); + iter->SeekToFirst(); + count = 0; + for (; iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + key = (iter->key()).ToString(false); + value = (iter->value()).ToString(false); + if (key.compare(KEY3) == 0) + ASSERT_EQ(value, p_v3b); + else if (key.compare(KEY4) == 0) + ASSERT_EQ(value, p_v4); + else if (key.compare(KEY5) == 0) + ASSERT_EQ(value, p_v5); + else + ASSERT_EQ(value, NOT_FOUND); + count++; + } + + // Expected count here is 3: KEY3, KEY4, KEY5. + ASSERT_EQ(count, EXPECTED_COUNT_FORLOOP); + if (iter) { + delete iter; + } + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. + const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + + // Additional test for the iterator+memPurge. 
+ ASSERT_OK(Put(KEY2, p_v2)); + iter = db_->NewIterator(ropt); + iter->SeekToFirst(); + ASSERT_OK(Put(KEY4, p_v4)); + count = 0; + for (; iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + key = (iter->key()).ToString(false); + value = (iter->value()).ToString(false); + if (key.compare(KEY2) == 0) + ASSERT_EQ(value, p_v2); + else if (key.compare(KEY3) == 0) + ASSERT_EQ(value, p_v3b); + else if (key.compare(KEY4) == 0) + ASSERT_EQ(value, p_v4); + else if (key.compare(KEY5) == 0) + ASSERT_EQ(value, p_v5); + else + ASSERT_EQ(value, NOT_FOUND); + count++; + } + + // Expected count here is 4: KEY2, KEY3, KEY4, KEY5. + ASSERT_EQ(count, EXPECTED_COUNT_END); + if (iter) delete iter; + + Close(); +} + +// Create a Compaction Fitler that will be invoked +// at flush time and will update the value of a KV pair +// if the key string is "lower" than the filter_key_ string. +class ConditionalUpdateFilter : public CompactionFilter { + public: + explicit ConditionalUpdateFilter(const std::string* filtered_key) + : filtered_key_(filtered_key) {} + bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/, + std::string* new_value, bool* value_changed) const override { + // If key CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr( + new ConditionalUpdateFilter(&filtered_key_)); + } + + const char* Name() const override { return "ConditionalUpdateFilterFactory"; } + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + // This compaction filter will be invoked + // at flush time (and therefore at MemPurge time). 
+ return (reason == TableFileCreationReason::kFlush); + } + + private: + std::string filtered_key_; +}; + +TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) { + Options options = CurrentOptions(); + + std::string KEY1 = "ThisIsKey1"; + std::string KEY2 = "ThisIsKey2"; + std::string KEY3 = "ThisIsKey3"; + std::string KEY4 = "ThisIsKey4"; + std::string KEY5 = "ThisIsKey5"; + std::string KEY6 = "ThisIsKey6"; + std::string KEY7 = "ThisIsKey7"; + std::string KEY8 = "ThisIsKey8"; + std::string KEY9 = "ThisIsKey9"; + const std::string NOT_FOUND = "NOT_FOUND"; + + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; +#ifndef ROCKSDB_LITE + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); +#endif // !ROCKSDB_LITE + // Create a ConditionalUpdate compaction filter + // that will update all the values of the KV pairs + // where the keys are "lower" than KEY4. + options.compaction_filter_factory = + std::make_shared(KEY4); + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 1 << 20; + // Activate the MemPurge prototype. 
+ options.experimental_mempurge_threshold = 1.0; + + ASSERT_OK(TryReopen(options)); + + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(53); + const size_t NUM_REPEAT = 1000; + const size_t RAND_VALUES_LENGTH = 10240; + std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9; + + p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v4 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + ASSERT_OK(Put(KEY4, p_v4)); + ASSERT_OK(Put(KEY5, p_v5)); + ASSERT_OK(Delete(KEY1)); + + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary + // length RAND_VALUES_LENGTH bytes. + p_v6 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v7 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v8 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v9 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY6, p_v6)); + ASSERT_OK(Put(KEY7, p_v7)); + ASSERT_OK(Put(KEY8, p_v8)); + ASSERT_OK(Put(KEY9, p_v9)); + + ASSERT_OK(Delete(KEY7)); + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. 
+ const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + + // Verify that the ConditionalUpdateCompactionFilter + // updated the values of KEY2 and KEY3, and not KEY4 and KEY5. + ASSERT_EQ(Get(KEY1), NOT_FOUND); + ASSERT_EQ(Get(KEY2), NEW_VALUE); + ASSERT_EQ(Get(KEY3), NEW_VALUE); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); +} + +TEST_F(DBFlushTest, DISABLED_MemPurgeWALSupport) { + Options options = CurrentOptions(); + + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 128KB. + options.write_buffer_size = 128 << 10; + // Activate the MemPurge prototype. + options.experimental_mempurge_threshold = 1.0; + + ASSERT_OK(TryReopen(options)); + + const size_t KVSIZE = 10; + + do { + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(0, "bar", "v2")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector keys; + for (size_t k = 0; k < KVSIZE; k++) { + keys.push_back("IamKey" + std::to_string(k)); + } + + std::string 
RNDKEY, RNDVALUE; + const std::string NOT_FOUND = "NOT_FOUND"; + + // Heavy overwrite workload, + // more than would fit in maximum allowed memtables. + Random rnd(719); + const size_t NUM_REPEAT = 100; + const size_t RAND_KEY_LENGTH = 4096; + const size_t RAND_VALUES_LENGTH = 1024; + std::vector values_default(KVSIZE), values_pikachu(KVSIZE); + + // Insert a very first set of keys that will be + // mempurged at least once. + for (size_t k = 0; k < KVSIZE / 2; k++) { + values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH); + values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH); + } + + // Insert keys[0:KVSIZE/2] to + // both 'default' and 'pikachu' CFs. + for (size_t k = 0; k < KVSIZE / 2; k++) { + ASSERT_OK(Put(0, keys[k], values_default[k])); + ASSERT_OK(Put(1, keys[k], values_pikachu[k])); + } + + // Check that the insertion was seamless. + for (size_t k = 0; k < KVSIZE / 2; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + + // Insertion of of K-V pairs, multiple times (overwrites) + // into 'default' CF. Will trigger mempurge. + for (size_t j = 0; j < NUM_REPEAT; j++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH); + } + + // Insert K-V into default CF. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + ASSERT_OK(Put(0, keys[k], values_default[k])); + } + + // Check key validity, for all keys, both in + // default and pikachu CFs. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + } + // Note that at this point, only keys[0:KVSIZE/2] + // have been inserted into Pikachu. + for (size_t k = 0; k < KVSIZE / 2; k++) { + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + } + + // Insertion of of K-V pairs, multiple times (overwrites) + // into 'pikachu' CF. Will trigger mempurge. 
+ // Check that we keep the older logs for 'default' imm(). + for (size_t j = 0; j < NUM_REPEAT; j++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH); + } + + // Insert K-V into pikachu CF. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + ASSERT_OK(Put(1, keys[k], values_pikachu[k])); + } + + // Check key validity, for all keys, + // both in default and pikachu. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. + const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + if (options.experimental_mempurge_threshold == + std::numeric_limits::max()) { + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + } + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Check that there was no data corruption anywhere, + // not in 'default' nor in 'Pikachu' CFs. + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + // Check keys in 'Default' and 'Pikachu'. + // keys[0:KVSIZE/2] were for sure contained + // in the imm() at Reopen/recovery time. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + // Insertion of random K-V pairs to trigger + // a flush in the Pikachu CF. + for (size_t j = 0; j < NUM_REPEAT; j++) { + RNDKEY = rnd.RandomString(RAND_KEY_LENGTH); + RNDVALUE = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(1, RNDKEY, RNDVALUE)); + } + // ASsert than there was at least one flush to storage. 
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + // Since values in default are held in mutable mem() + // and imm(), check if the flush in pikachu didn't + // affect these values. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + ASSERT_EQ(Get(1, RNDKEY), RNDVALUE); + } while (ChangeWalOptions()); +} + TEST_P(DBFlushDirectIOTest, DirectIO) { Options options; options.create_if_missing = true; options.disable_auto_compactions = true; options.max_background_flushes = 2; options.use_direct_io_for_flush_and_compaction = GetParam(); - options.env = new MockEnv(Env::Default()); + options.env = MockEnv::Create(Env::Default()); SyncPoint::GetInstance()->SetCallBack( "BuildTable:create_file", [&](void* arg) { bool* use_direct_writes = static_cast(arg); @@ -305,7 +1443,8 @@ // mode. fault_injection_env->SetFilesystemActive(false); ASSERT_OK(db_->ContinueBackgroundWork()); - dbfull()->TEST_WaitForFlushMemTable(); + // We ingested the error to env, so the returned status is not OK. 
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE uint64_t num_bg_errors; ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors, @@ -379,9 +1518,9 @@ DBImpl* db_impl = static_cast_with_check(db); InstrumentedMutex* mutex = db_impl->mutex(); mutex->Lock(); - auto* cfd = - reinterpret_cast(db->DefaultColumnFamily()) - ->cfd(); + auto* cfd = static_cast_with_check( + db->DefaultColumnFamily()) + ->cfd(); ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber()); mutex->Unlock(); } @@ -394,7 +1533,7 @@ std::shared_ptr listener = std::make_shared(); SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BackgroundCallFlush:start", + {{"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"}, {"DBImpl::FlushMemTableToOutputFile:Finish", "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}}); @@ -443,6 +1582,568 @@ } #endif // !ROCKSDB_LITE +TEST_F(DBFlushTest, FlushWithBlob) { + constexpr uint64_t min_blob_size = 10; + + Options options; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + constexpr char short_value[] = "short"; + static_assert(sizeof(short_value) - 1 < min_blob_size, + "short_value too long"); + + constexpr char long_value[] = "long_value"; + static_assert(sizeof(long_value) - 1 >= min_blob_size, + "long_value too short"); + + ASSERT_OK(Put("key1", short_value)); + ASSERT_OK(Put("key2", long_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("key1"), short_value); + ASSERT_EQ(Get("key2"), long_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + 
assert(storage_info); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_EQ(l0_files.size(), 1); + + const FileMetaData* const table_file = l0_files[0]; + assert(table_file); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + assert(blob_file); + + ASSERT_EQ(table_file->smallest.user_key(), "key1"); + ASSERT_EQ(table_file->largest.user_key(), "key2"); + ASSERT_EQ(table_file->fd.smallest_seqno, 1); + ASSERT_EQ(table_file->fd.largest_seqno, 2); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 1); + +#ifndef ROCKSDB_LITE + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_FALSE(compaction_stats.empty()); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); + + const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); +#endif // ROCKSDB_LITE +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = 
fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
+ ASSERT_OK(Put("key3", "value3")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, PickRightMemtables) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + options.create_if_missing = true; + + const std::string test_cf_name = "test_cf"; + options.max_write_buffer_number = 128; + CreateColumnFamilies({test_cf_name}, options); + + Close(); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, test_cf_name}, options); + + ASSERT_OK(db_->Put(WriteOptions(), "key", "value")); + + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "key", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:BeforeReLock", [&](void* /*arg*/) { + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "what", "v")); + auto* cfhi = + static_cast_with_check(handles_[1]); + assert(cfhi); + ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfhi->cfd())); + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", [&](void* arg) { + auto* job = reinterpret_cast(arg); + assert(job); + const auto& mems = job->GetMemTables(); + assert(mems.size() == 1); + assert(mems[0]); + ASSERT_EQ(1, mems[0]->GetID()); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class DBFlushTestBlobError : public DBFlushTest, + public 
testing::WithParamInterface { + public: + DBFlushTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBFlushTestBlobError, FlushError) { + Options options; + options.enable_blob_files = true; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + ASSERT_OK(Put("key", "blob")); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_NOK(Flush()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_TRUE(l0_files.empty()); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_TRUE(blob_files.empty()); + + // Make sure the files generated by the failed job have been deleted + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kTableFile; + + if (!ParseFileName(file, &number, &type)) { + continue; + } + + ASSERT_NE(type, kTableFile); + ASSERT_NE(type, kBlobFile); + } + +#ifndef ROCKSDB_LITE + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + 
ASSERT_FALSE(compaction_stats.empty()); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 0); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_GT(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } + + const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) { + class SimpleTestFlushListener : public EventListener { + public: + explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {} + ~SimpleTestFlushListener() override {} + + void OnFlushBegin(DB* db, const FlushJobInfo& info) override { + ASSERT_EQ(static_cast(0), info.cf_id); + + ASSERT_OK(db->Delete(WriteOptions(), "foo")); + snapshot_ = db->GetSnapshot(); + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + + auto* dbimpl = static_cast_with_check(db); + assert(dbimpl); + + ColumnFamilyHandle* cfh = db->DefaultColumnFamily(); + auto* cfhi = static_cast_with_check(cfh); + assert(cfhi); + ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd())); + } + + DBFlushTest* test_ = nullptr; + const Snapshot* snapshot_ = nullptr; + }; + + Options options = CurrentOptions(); + options.create_if_missing = true; + auto* listener = new SimpleTestFlushListener(this); + options.listeners.emplace_back(listener); + DestroyAndReopen(options); + + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0")); + + ManagedSnapshot 
snapshot_guard(db_); + + ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + ASSERT_OK(db_->Flush(FlushOptions(), default_cf)); + + const Snapshot* snapshot = listener->snapshot_; + assert(snapshot); + + ReadOptions read_opts; + read_opts.snapshot = snapshot; + + // Using snapshot should not see "foo". + { + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + } + + db_->ReleaseSnapshot(snapshot); +} + +TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.allow_2pc = true; + options.atomic_flush = GetParam(); + // 64MB so that memtable flush won't be trigger by the small writes. + options.write_buffer_size = (static_cast(64) << 20); + + // Destroy the DB to recreate as a TransactionDB. + Close(); + Destroy(options, true); + + // Create a TransactionDB. + TransactionDB* txn_db = nullptr; + TransactionDBOptions txn_db_opts; + txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); + ASSERT_NE(txn_db, nullptr); + db_ = txn_db; + + // Create two more columns other than default CF. + std::vector cfs = {"puppy", "kitty"}; + CreateColumnFamilies(cfs, options); + ASSERT_EQ(handles_.size(), 2); + ASSERT_EQ(handles_[0]->GetName(), cfs[0]); + ASSERT_EQ(handles_[1]->GetName(), cfs[1]); + const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1; + + WriteOptions wopts; + TransactionOptions txn_opts; + // txn1 only prepare, but does not commit. + // The WAL containing the prepared but uncommitted data must be kept. + Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + // txn2 not only prepare, but also commit. 
+ Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + ASSERT_NE(txn1, nullptr); + ASSERT_NE(txn2, nullptr); + for (size_t i = 0; i < kNumCfToFlush; i++) { + ASSERT_OK(txn1->Put(handles_[i], "k1", "v1")); + ASSERT_OK(txn2->Put(handles_[i], "k2", "v2")); + } + // A txn must be named before prepare. + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn2->SetName("txn2")); + // Prepare writes to WAL, but not to memtable. (WriteCommitted) + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn2->Prepare()); + // Commit writes to memtable. + ASSERT_OK(txn2->Commit()); + delete txn1; + delete txn2; + + // There are still data in memtable not flushed. + // But since data is small enough to reside in the active memtable, + // there are no immutable memtable. + for (size_t i = 0; i < kNumCfToFlush; i++) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + + // Atomic flush memtables, + // the min log with prepared data should be written to MANIFEST. + std::vector cfs_to_flush(kNumCfToFlush); + for (size_t i = 0; i < kNumCfToFlush; i++) { + cfs_to_flush[i] = handles_[i]; + } + ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush)); + + // There are no remaining data in memtable after flush. + for (size_t i = 0; i < kNumCfToFlush; i++) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush); + } + + // The recovered min log number with prepared data should be non-zero. + // In 2pc mode, MinLogNumberToKeep returns the + // VersionSet::min_log_number_to_keep_2pc recovered from MANIFEST, if it's 0, + // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST. 
+ cfs.push_back(kDefaultColumnFamilyName); + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + DBImpl* db_impl = reinterpret_cast(db_); + ASSERT_TRUE(db_impl->allow_2pc()); + ASSERT_NE(db_impl->MinLogNumberToKeep(), 0); +} +#endif // ROCKSDB_LITE + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -457,18 +2158,84 @@ for (size_t i = 0; i != num_cfs; ++i) { ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); } + + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + std::vector cf_ids; for (size_t i = 0; i != num_cfs; ++i) { cf_ids.emplace_back(static_cast(i)); } ASSERT_OK(Flush(cf_ids)); + for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); + ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); } } +TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + options.write_buffer_size = (static_cast(64) << 20); + CreateAndReopenWithCF({"pikachu"}, options); + + const size_t num_cfs = handles_.size(); + ASSERT_EQ(num_cfs, 2); + WriteOptions wopts; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); + } + + { + // Flush the default CF only. 
+ std::vector cf_ids{0}; + ASSERT_OK(Flush(cf_ids)); + + autovector flushed_cfds; + autovector> flush_edits; + auto flushed_cfh = static_cast(handles_[0]); + flushed_cfds.push_back(flushed_cfh->cfd()); + flush_edits.push_back({}); + auto unflushed_cfh = static_cast(handles_[1]); + + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(), + flushed_cfds, flush_edits), + unflushed_cfh->cfd()->GetLogNumber()); + } + + { + // Flush all CFs. + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + cf_ids.emplace_back(static_cast(i)); + } + ASSERT_OK(Flush(cf_ids)); + uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber(); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + autovector flushed_cfds; + autovector> flush_edits; + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + flushed_cfds.push_back(cfh->cfd()); + flush_edits.push_back({}); + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber()); + } + ASSERT_EQ(min_log_number_to_keep, log_num_after_flush); + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(), + flushed_cfds, flush_edits), + min_log_number_to_keep); + } +} + TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -499,13 +2266,13 @@ TEST_SYNC_POINT( "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"); if (options.atomic_flush) { - for (size_t i = 0; i != num_cfs - 1; ++i) { + for (size_t i = 0; i + 1 != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); } } else { - for (size_t i = 0; i != num_cfs - 1; ++i) { + for (size_t i = 0; i + 1 != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); @@ -549,7 +2316,8 @@ 
fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2"); for (auto* cfh : handles_) { - dbfull()->TEST_WaitForFlushMemTable(cfh); + // Returns the IO error happend during flush. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh)); } for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); @@ -651,7 +2419,7 @@ options.create_if_missing = true; options.atomic_flush = atomic_flush; options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysTriggerFlush)); + test::NewSpecialSkipListFactory(kNumKeysTriggerFlush)); CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i != kNumKeysTriggerFlush; ++i) { @@ -770,6 +2538,122 @@ SyncPoint::GetInstance()->ClearAllCallBacks(); } +// In atomic flush, concurrent bg flush threads commit to the MANIFEST in +// serial, in the order of their picked memtables for each column family. +// Only when a bg flush thread finds out that its memtables are the earliest +// unflushed ones for all the included column families will this bg flush +// thread continue to commit to MANIFEST. +// This unit test uses sync point to coordinate the execution of two bg threads +// executing the same sequence of functions. The interleaving are as follows. 
+// time bg1 bg2 +// | pick memtables to flush +// | flush memtables cf1_m1, cf2_m1 +// | join MANIFEST write queue +// | pick memtabls to flush +// | flush memtables cf1_(m1+1) +// | join MANIFEST write queue +// | wait to write MANIFEST +// | write MANIFEST +// | IO error +// | detect IO error and stop waiting +// V +TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + auto fault_injection_env = std::make_shared(env_); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.atomic_flush = true; + options.env = fault_injection_env.get(); + // Set a larger value than default so that RocksDB can schedule concurrent + // background flush threads. + options.max_background_jobs = 8; + options.max_write_buffer_number = 8; + CreateAndReopenWithCF({"pikachu"}, options); + + assert(2 == handles_.size()); + + WriteOptions write_opts; + write_opts.disableWAL = true; + + ASSERT_OK(Put(0, "a", "v_0_a", write_opts)); + ASSERT_OK(Put(1, "a", "v_1_a", write_opts)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + SyncPoint::GetInstance()->LoadDependency({ + {"BgFlushThr2:WaitToCommit", "BgFlushThr1:BeforeWriteManifest"}, + }); + + std::thread::id bg_flush_thr1, bg_flush_thr2; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCallFlush:start", [&](void*) { + if (bg_flush_thr1 == std::thread::id()) { + bg_flush_thr1 = std::this_thread::get_id(); + } else if (bg_flush_thr2 == std::thread::id()) { + bg_flush_thr2 = std::this_thread::get_id(); + } + }); + + int called = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", [&](void* arg) { + if (std::this_thread::get_id() == bg_flush_thr2) { + const auto* ptr = reinterpret_cast*>(arg); + assert(ptr); + if (0 == called) { + // When bg flush thread 2 reaches here for the first time. 
+ ASSERT_OK(ptr->first); + ASSERT_TRUE(ptr->second); + } else if (1 == called) { + // When bg flush thread 2 reaches here for the second time. + ASSERT_TRUE(ptr->first.IsIOError()); + ASSERT_FALSE(ptr->second); + } + ++called; + TEST_SYNC_POINT("BgFlushThr2:WaitToCommit"); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + [&](void*) { + if (std::this_thread::get_id() == bg_flush_thr1) { + TEST_SYNC_POINT("BgFlushThr1:BeforeWriteManifest"); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (std::this_thread::get_id() != bg_flush_thr1) { + return; + } + ASSERT_OK(db_->Put(write_opts, "b", "v_1_b")); + + FlushOptions flush_opts; + flush_opts.wait = false; + std::vector cfhs(1, db_->DefaultColumnFamily()); + ASSERT_OK(dbfull()->Flush(flush_opts, cfhs)); + }); + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { + auto* ptr = reinterpret_cast(arg); + assert(ptr); + *ptr = IOStatus::IOError("Injected failure"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(dbfull()->Flush(FlushOptions(), handles_).IsIOError()); + + Close(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,173 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "db/db_impl/compacted_db_impl.h" + +#include "db/db_impl/db_impl.h" +#include "db/version_set.h" +#include "logging/logging.h" +#include "table/get_context.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +extern void MarkKeyMayExist(void* arg); +extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v, bool hit_and_return); + +CompactedDBImpl::CompactedDBImpl(const DBOptions& options, + const std::string& dbname) + : DBImpl(options, dbname, /*seq_per_batch*/ false, +/*batch_per_txn*/ true, + /*read_only*/ true), + cfd_(nullptr), + version_(nullptr), + user_comparator_(nullptr) {} + +CompactedDBImpl::~CompactedDBImpl() { +} + +size_t CompactedDBImpl::FindFile(const Slice& key) { + size_t right = files_.num_files - 1; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0; + }; + return static_cast(std::lower_bound(files_.files, + files_.files + right, key, cmp) - files_.files); +} + +Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, + const Slice& key, PinnableSlice* value) { + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, key, value, nullptr, nullptr, + nullptr, true, nullptr, nullptr); + LookupKey lkey(key, kMaxSequenceNumber); + Status s = files_.files[FindFile(key)].fd.table_reader->Get( + options, lkey.internal_key(), &get_context, nullptr); + if (!s.ok() && !s.IsNotFound()) { + return s; + } + if (get_context.State() == GetContext::kFound) { + return Status::OK(); + } + return Status::NotFound(); +} + +std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* 
values) { + autovector reader_list; + for (const auto& key : keys) { + const FdWithKeyRange& f = files_.files[FindFile(key)]; + if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) { + reader_list.push_back(nullptr); + } else { + LookupKey lkey(key, kMaxSequenceNumber); + f.fd.table_reader->Prepare(lkey.internal_key()); + reader_list.push_back(f.fd.table_reader); + } + } + std::vector statuses(keys.size(), Status::NotFound()); + values->resize(keys.size()); + int idx = 0; + for (auto* r : reader_list) { + if (r != nullptr) { + PinnableSlice pinnable_val; + std::string& value = (*values)[idx]; + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, keys[idx], &pinnable_val, + nullptr, nullptr, nullptr, true, nullptr, nullptr); + LookupKey lkey(keys[idx], kMaxSequenceNumber); + Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr); + assert(static_cast(idx) < statuses.size()); + if (!s.ok() && !s.IsNotFound()) { + statuses[idx] = s; + } else { + value.assign(pinnable_val.data(), pinnable_val.size()); + if (get_context.State() == GetContext::kFound) { + statuses[idx] = Status::OK(); + } + } + } + ++idx; + } + return statuses; +} + +Status CompactedDBImpl::Init(const Options& options) { + SuperVersionContext sv_context(/* create_superversion */ true); + mutex_.Lock(); + ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, + ColumnFamilyOptions(options)); + Status s = Recover({cf}, true /* read only */, false, true); + if (s.ok()) { + cfd_ = static_cast_with_check(DefaultColumnFamily()) + ->cfd(); + cfd_->InstallSuperVersion(&sv_context, &mutex_); + } + mutex_.Unlock(); + sv_context.Clean(); + if (!s.ok()) { + return s; + } + NewThreadStatusCfInfo(cfd_); + version_ = cfd_->GetSuperVersion()->current; + user_comparator_ = cfd_->user_comparator(); + auto* vstorage = version_->storage_info(); + if (vstorage->num_non_empty_levels() == 0) { + return Status::NotSupported("no file exists"); + } + const 
LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0); + // L0 should not have files + if (l0.num_files > 1) { + return Status::NotSupported("L0 contain more than 1 file"); + } + if (l0.num_files == 1) { + if (vstorage->num_non_empty_levels() > 1) { + return Status::NotSupported("Both L0 and other level contain files"); + } + files_ = l0; + return Status::OK(); + } + + for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) { + if (vstorage->LevelFilesBrief(i).num_files > 0) { + return Status::NotSupported("Other levels also contain files"); + } + } + + int level = vstorage->num_non_empty_levels() - 1; + if (vstorage->LevelFilesBrief(level).num_files > 0) { + files_ = vstorage->LevelFilesBrief(level); + return Status::OK(); + } + return Status::NotSupported("no file exists"); +} + +Status CompactedDBImpl::Open(const Options& options, + const std::string& dbname, DB** dbptr) { + *dbptr = nullptr; + + if (options.max_open_files != -1) { + return Status::InvalidArgument("require max_open_files = -1"); + } + if (options.merge_operator.get() != nullptr) { + return Status::InvalidArgument("merge operator is not supported"); + } + DBOptions db_options(options); + std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); + Status s = db->Init(options); + if (s.ok()) { + db->StartPeriodicWorkScheduler(); + ROCKS_LOG_INFO(db->immutable_db_options_.info_log, + "Opened the db as fully compacted mode"); + LogFlush(db->immutable_db_options_.info_log); + *dbptr = db.release(); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,118 @@ +// Copyright (c) 
2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE +#include +#include +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class CompactedDBImpl : public DBImpl { + public: + CompactedDBImpl(const DBOptions& options, const std::string& dbname); + // No copying allowed + CompactedDBImpl(const CompactedDBImpl&) = delete; + void operator=(const CompactedDBImpl&) = delete; + + ~CompactedDBImpl() override; + + static Status Open(const Options& options, const std::string& dbname, + DB** dbptr); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* values) + override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported in compacted db 
mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); + } + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + private: + friend class DB; + inline size_t FindFile(const Slice& key); + Status Init(const Options& options); + + ColumnFamilyData* cfd_; + Version* version_; + const Comparator* user_comparator_; + LevelFilesBrief 
files_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -18,10 +18,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -45,6 +45,7 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "db/periodic_work_scheduler.h" #include "db/range_tombstone_fragmenter.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" @@ -52,7 +53,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" -#include "env/composite_env_wrapper.h" +#include "env/unique_id_gen.h" #include "file/file_util.h" #include "file/filename.h" #include "file/random_access_file_reader.h" @@ -60,9 +61,8 @@ #include "logging/auto_roll_logger.h" #include "logging/log_buffer.h" #include "logging/logging.h" -#include "memtable/hash_linklist_rep.h" -#include "memtable/hash_skiplist_rep.h" #include "monitoring/in_memory_stats_history.h" +#include "monitoring/instrumented_mutex.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/persistent_stats_history.h" @@ -82,25 +82,29 @@ #include "rocksdb/stats_history.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/merging_iterator.h" #include "table/multiget_context.h" +#include "table/sst_file_dumper.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include 
"table/unique_id_impl.h" #include "test_util/sync_point.h" -#include "tools/sst_dump_tool_imp.h" +#include "trace_replay/trace_replay.h" #include "util/autovector.h" -#include "util/build_version.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/defer.h" #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "utilities/trace/replayer_impl.h" namespace ROCKSDB_NAMESPACE { @@ -146,26 +150,31 @@ } // namespace DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch, const bool batch_per_txn) + const bool seq_per_batch, const bool batch_per_txn, + bool read_only) : dbname_(dbname), own_info_log_(options.info_log == nullptr), - initial_db_options_(SanitizeOptions(dbname, options)), + initial_db_options_(SanitizeOptions(dbname, options, read_only)), env_(initial_db_options_.env), - fs_(initial_db_options_.file_system), + io_tracer_(std::make_shared()), immutable_db_options_(initial_db_options_), + fs_(immutable_db_options_.fs, io_tracer_), mutable_db_options_(initial_db_options_), - stats_(immutable_db_options_.statistics.get()), - mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, + stats_(immutable_db_options_.stats), + mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), default_cf_handle_(nullptr), + error_handler_(this, immutable_db_options_, &mutex_), + event_logger_(immutable_db_options_.info_log.get()), max_total_in_memory_state_(0), file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite( file_options_, immutable_db_options_)), seq_per_batch_(seq_per_batch), batch_per_txn_(batch_per_txn), - db_lock_(nullptr), + next_job_id_(1), shutting_down_(false), + db_lock_(nullptr), manual_compaction_paused_(false), bg_cv_(&mutex_), logfile_number_(0), @@ -190,20 +199,22 @@ 
bg_purge_scheduled_(0), disable_delete_obsolete_files_(0), pending_purge_obsolete_files_(0), - delete_obsolete_files_last_run_(env_->NowMicros()), + delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()), last_stats_dump_time_microsec_(0), - next_job_id_(1), has_unpersisted_data_(false), unable_to_release_oldest_log_(false), num_running_ingest_file_(0), #ifndef ROCKSDB_LITE - wal_manager_(immutable_db_options_, file_options_, seq_per_batch), + wal_manager_(immutable_db_options_, file_options_, io_tracer_, + seq_per_batch), #endif // ROCKSDB_LITE - event_logger_(immutable_db_options_.info_log.get()), bg_work_paused_(0), bg_compaction_paused_(0), refitting_level_(false), opened_successfully_(false), +#ifndef ROCKSDB_LITE + periodic_work_scheduler_(nullptr), +#endif // ROCKSDB_LITE two_write_queues_(options.two_write_queues), manual_wal_flush_(options.manual_wal_flush), // last_sequencee_ is always maintained by the main queue that also writes @@ -225,12 +236,15 @@ own_sfm_(options.sst_file_manager == nullptr), preserve_deletes_(options.preserve_deletes), closed_(false), - error_handler_(this, immutable_db_options_, &mutex_), - atomic_flush_install_cv_(&mutex_) { + atomic_flush_install_cv_(&mutex_), + blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_, + &error_handler_, &event_logger_, + immutable_db_options_.listeners, dbname_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. assert(batch_per_txn_ || seq_per_batch_); - env_->GetAbsolutePath(dbname, &db_absolute_path_); + // TODO: Check for an error here + env_->GetAbsolutePath(dbname, &db_absolute_path_).PermitUncheckedError(); // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. 
@@ -242,15 +256,18 @@ co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; co.metadata_charge_policy = kDontChargeCacheMetadata; table_cache_ = NewLRUCache(co); + SetDbSessionId(); + assert(!db_session_id_.empty()); versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_, table_cache_.get(), write_buffer_manager_, - &write_controller_, &block_cache_tracer_)); + &write_controller_, &block_cache_tracer_, + io_tracer_, db_session_id_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); DumpRocksDBBuildVersion(immutable_db_options_.info_log.get()); - DumpDBFileSummary(immutable_db_options_, dbname_); + DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_); immutable_db_options_.Dump(immutable_db_options_.info_log.get()); mutable_db_options_.Dump(immutable_db_options_.info_log.get()); DumpSupportInfo(immutable_db_options_.info_log.get()); @@ -259,6 +276,10 @@ // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber() // is called by client and this seqnum is advanced. preserve_deletes_seqnum_.store(0); + + if (write_buffer_manager_) { + wbm_stall_.reset(new WBMStallInterface()); + } } Status DBImpl::Resume() { @@ -294,22 +315,59 @@ // 4. Schedule compactions if needed for all the CFs. 
This is needed as the // flush in the prior step might have been a no-op for some CFs, which // means a new super version wouldn't have been installed -Status DBImpl::ResumeImpl() { +Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); WaitForBackgroundWork(); - Status bg_error = error_handler_.GetBGError(); Status s; if (shutdown_initiated_) { // Returning shutdown status to SFM during auto recovery will cause it // to abort the recovery and allow the shutdown to progress s = Status::ShutdownInProgress(); } - if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "DB resume requested but failed due to Fatal/Unrecoverable error"); - s = bg_error; + + if (s.ok()) { + Status bg_error = error_handler_.GetBGError(); + if (bg_error.severity() > Status::Severity::kHardError) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but failed due to Fatal/Unrecoverable error"); + s = bg_error; + } + } + + // Make sure the IO Status stored in version set is set to OK. + bool file_deletion_disabled = !IsFileDeletionsEnabled(); + if (s.ok()) { + IOStatus io_s = versions_->io_status(); + if (io_s.IsIOError()) { + // If resuming from IOError resulted from MANIFEST write, then assert + // that we must have already set the MANIFEST writer to nullptr during + // clean-up phase MANIFEST writing. We must have also disabled file + // deletions. + assert(!versions_->descriptor_log_); + assert(file_deletion_disabled); + // Since we are trying to recover from MANIFEST write error, we need to + // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted. + // Therefore, force writing a dummy version edit because we do not know + // whether there are flush jobs with non-empty data to flush, triggering + // appends to MANIFEST. 
+ VersionEdit edit; + auto cfh = + static_cast_with_check(default_cf_handle_); + assert(cfh); + ColumnFamilyData* cfd = cfh->cfd(); + const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); + s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_, + directories_.GetDbDir()); + if (!s.ok()) { + io_s = versions_->io_status(); + if (!io_s.ok()) { + s = error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWrite); + } + } + } } // We cannot guarantee consistency of the WAL. So force flush Memtables of @@ -322,18 +380,15 @@ autovector cfds; SelectColumnFamiliesForAtomicFlush(&cfds); mutex_.Unlock(); - s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery); + s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason); mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - cfd->Ref(); - mutex_.Unlock(); - s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); + InstrumentedMutexUnlock u(&mutex_); + s = FlushMemTable(cfd, flush_opts, context.flush_reason); if (!s.ok()) { break; } @@ -348,9 +403,6 @@ JobContext job_context(0); FindObsoleteFiles(&job_context, true); - if (s.ok()) { - s = error_handler_.ClearBGError(); - } mutex_.Unlock(); job_context.manifest_file_number = 1; @@ -360,9 +412,42 @@ job_context.Clean(); if (s.ok()) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + assert(versions_->io_status().ok()); + // If we reach here, we should re-enable file deletions if it was disabled + // during previous error handling. 
+ if (file_deletion_disabled) { + // Always return ok + s = EnableFileDeletions(/*force=*/true); + if (!s.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but could not enable file deletions [%s]", + s.ToString().c_str()); + assert(false); + } + } } + mutex_.Lock(); + if (s.ok()) { + // This will notify and unblock threads waiting for error recovery to + // finish. Those previouly waiting threads can now proceed, which may + // include closing the db. + s = error_handler_.ClearBGError(); + } else { + // NOTE: this is needed to pass ASSERT_STATUS_CHECKED + // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test. + // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952 + error_handler_.GetRecoveryError().PermitUncheckedError(); + } + + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]", + s.ToString().c_str()); + } + // Check for shutdown again before scheduling further compactions, // since we released and re-acquired the lock above if (shutdown_initiated_) { @@ -396,14 +481,12 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown: canceling all background work"); - if (thread_dump_stats_ != nullptr) { - thread_dump_stats_->cancel(); - thread_dump_stats_.reset(); - } - if (thread_persist_stats_ != nullptr) { - thread_persist_stats_->cancel(); - thread_persist_stats_.reset(); +#ifndef ROCKSDB_LITE + if (periodic_work_scheduler_ != nullptr) { + periodic_work_scheduler_->Unregister(this); } +#endif // !ROCKSDB_LITE + InstrumentedMutexLock l(&mutex_); if (!shutting_down_.load(std::memory_order_acquire) && has_unpersisted_data_.load(std::memory_order_relaxed) && @@ -412,20 +495,19 @@ autovector cfds; SelectColumnFamiliesForAtomicFlush(&cfds); mutex_.Unlock(); - AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + Status s = + AtomicFlushMemTables(cfds, FlushOptions(), 
FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { - cfd->Ref(); - mutex_.Unlock(); - FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); + InstrumentedMutexUnlock u(&mutex_); + Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? } } } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); } shutting_down_.store(true, std::memory_order_release); @@ -447,19 +529,29 @@ } mutex_.Unlock(); + // Below check is added as recovery_error_ is not checked and it causes crash + // in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when space limit is + // reached. + error_handler_.GetRecoveryError().PermitUncheckedError(); + // CancelAllBackgroundWork called with false means we just set the shutdown // marker. 
After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) CancelAllBackgroundWork(false); - int bottom_compactions_unscheduled = - env_->UnSchedule(this, Env::Priority::BOTTOM); - int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); - int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); - Status ret; + + // Cancel manual compaction if there's any + if (HasPendingManualCompaction()) { + DisableManualCompaction(); + } mutex_.Lock(); - bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled; - bg_compaction_scheduled_ -= compactions_unscheduled; - bg_flush_scheduled_ -= flushes_unscheduled; + // Unschedule all tasks for this DB + for (uint8_t i = 0; i < static_cast(TaskType::kCount); i++) { + env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM); + env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW); + env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH); + } + + Status ret = Status::OK(); // Wait for background work to finish while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || @@ -475,12 +567,45 @@ flush_scheduler_.Clear(); trim_history_scheduler_.Clear(); + // For now, simply trigger a manual flush at close time + // on all the column families. + // TODO(bjlemaire): Check if this is needed. Also, in the + // future we can contemplate doing a more fine-grained + // flushing by first checking if there is a need for + // flushing (but need to implement something + // else than imm()->IsFlushPending() because the output + // memtables added to imm() dont trigger flushes). 
+ if (immutable_db_options_.experimental_mempurge_threshold > 0.0) { + Status flush_ret; + mutex_.Unlock(); + for (ColumnFamilyData* cf : *versions_->GetColumnFamilySet()) { + if (immutable_db_options_.atomic_flush) { + flush_ret = AtomicFlushMemTables({cf}, FlushOptions(), + FlushReason::kManualFlush); + if (!flush_ret.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Atomic flush memtables failed upon closing (mempurge)."); + } + } else { + flush_ret = + FlushMemTable(cf, FlushOptions(), FlushReason::kManualFlush); + if (!flush_ret.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Flush memtables failed upon closing (mempurge)."); + } + } + } + mutex_.Lock(); + } + while (!flush_queue_.empty()) { const FlushRequest& flush_req = PopFirstFromFlushQueue(); for (const auto& iter : flush_req) { iter.first->UnrefAndTryDelete(); } } + while (!compaction_queue_.empty()) { auto cfd = PopFirstFromCompactionQueue(); cfd->UnrefAndTryDelete(); @@ -533,7 +658,7 @@ ROCKS_LOG_WARN( immutable_db_options_.info_log, "Unable to Sync WAL file %s with error -- %s", - LogFileName(immutable_db_options_.wal_dir, log_number).c_str(), + LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(), s.ToString().c_str()); // Retain the first error if (ret.ok()) { @@ -567,7 +692,8 @@ versions_.reset(); mutex_.Unlock(); if (db_lock_ != nullptr) { - env_->UnlockFile(db_lock_); + // TODO: Check for unlock error + env_->UnlockFile(db_lock_).PermitUncheckedError(); } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete"); @@ -586,11 +712,15 @@ if (immutable_db_options_.info_log && own_info_log_) { Status s = immutable_db_options_.info_log->Close(); - if (ret.ok()) { + if (!s.ok() && !s.IsNotSupported() && ret.ok()) { ret = s; } } + if (write_buffer_manager_ && wbm_stall_) { + write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get()); + } + if (ret.IsAborted()) { // Reserve IsAborted() error for those where users didn't release // certain resource and they 
can release them and come back and @@ -603,9 +733,11 @@ Status DBImpl::CloseImpl() { return CloseHelper(); } DBImpl::~DBImpl() { + InstrumentedMutexLock closing_lock_guard(&closing_mutex_); if (!closed_) { closed_ = true; - CloseHelper(); + closing_status_ = CloseHelper(); + closing_status_.PermitUncheckedError(); } } @@ -620,44 +752,48 @@ } const Status DBImpl::CreateArchivalDirectory() { - if (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0) { - std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir); + if (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0) { + std::string archivalPath = + ArchivalDirectory(immutable_db_options_.GetWalDir()); return env_->CreateDirIfMissing(archivalPath); } return Status::OK(); } void DBImpl::PrintStatistics() { - auto dbstats = immutable_db_options_.statistics.get(); + auto dbstats = immutable_db_options_.stats; if (dbstats) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s", dbstats->ToString().c_str()); } } -void DBImpl::StartTimedTasks() { - unsigned int stats_dump_period_sec = 0; - unsigned int stats_persist_period_sec = 0; +void DBImpl::StartPeriodicWorkScheduler() { +#ifndef ROCKSDB_LITE + +#ifndef NDEBUG + // It only used by test to disable scheduler + bool disable_scheduler = false; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", + &disable_scheduler); + if (disable_scheduler) { + return; + } +#endif // !NDEBUG + { InstrumentedMutexLock l(&mutex_); - stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec; - if (stats_dump_period_sec > 0) { - if (!thread_dump_stats_) { - thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - static_cast(stats_dump_period_sec) * kMicrosInSecond)); - } - } - stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; - if 
(stats_persist_period_sec > 0) { - if (!thread_persist_stats_) { - thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - static_cast(stats_persist_period_sec) * kMicrosInSecond)); - } - } + periodic_work_scheduler_ = PeriodicWorkScheduler::Default(); + TEST_SYNC_POINT_CALLBACK("DBImpl::StartPeriodicWorkScheduler:Init", + &periodic_work_scheduler_); } + + periodic_work_scheduler_->Register( + this, mutable_db_options_.stats_dump_period_sec, + mutable_db_options_.stats_persist_period_sec); +#endif // !ROCKSDB_LITE } // esitmate the total size of stats_history_ @@ -683,8 +819,11 @@ if (shutdown_initiated_) { return; } - uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; - Statistics* statistics = immutable_db_options_.statistics.get(); + TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning"); + uint64_t now_seconds = + immutable_db_options_.clock->NowMicros() / kMicrosInSecond; + + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return; } @@ -703,29 +842,34 @@ if (immutable_db_options_.persist_stats_to_disk) { WriteBatch batch; + Status s = Status::OK(); if (stats_slice_initialized_) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Reading %" ROCKSDB_PRIszt " stats from statistics\n", stats_slice_.size()); for (const auto& stat : stats_map) { - char key[100]; - int length = - EncodePersistentStatsKey(now_seconds, stat.first, 100, key); - // calculate the delta from last time - if (stats_slice_.find(stat.first) != stats_slice_.end()) { - uint64_t delta = stat.second - stats_slice_[stat.first]; - batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), - ToString(delta)); + if (s.ok()) { + char key[100]; + int length = + EncodePersistentStatsKey(now_seconds, stat.first, 100, key); + // calculate the delta from last time + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + uint64_t delta = stat.second - stats_slice_[stat.first]; + s = 
batch.Put(persist_stats_cf_handle_, + Slice(key, std::min(100, length)), ToString(delta)); + } } } } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); - WriteOptions wo; - wo.low_pri = true; - wo.no_slowdown = true; - wo.sync = false; - Status s = Write(wo, &batch); + if (s.ok()) { + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } if (!s.ok()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Writing to persistent stats CF failed -- %s", @@ -774,6 +918,7 @@ " bytes, slice count: %" ROCKSDB_PRIszt, stats_history_size, stats_history_.size()); } + TEST_SYNC_POINT("DBImpl::PersistStats:End"); #endif // !ROCKSDB_LITE } @@ -817,31 +962,50 @@ void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:1"); #ifndef ROCKSDB_LITE - const DBPropertyInfo* cf_property_info = - GetPropertyInfo(DB::Properties::kCFStats); - assert(cf_property_info != nullptr); - const DBPropertyInfo* db_property_info = - GetPropertyInfo(DB::Properties::kDBStats); - assert(db_property_info != nullptr); - std::string stats; if (shutdown_initiated_) { return; } + + TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); { InstrumentedMutexLock l(&mutex_); - default_cf_internal_stats_->GetStringProperty( - *db_property_info, DB::Properties::kDBStats, &stats); + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (cfd->initialized()) { + // Release DB mutex for gathering cache entry stats. Pass over all + // column families for this first so that other stats are dumped + // near-atomically. 
+ InstrumentedMutexUnlock u(&mutex_); + cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false); + } + } + + const std::string* property = &DB::Properties::kDBStats; + const DBPropertyInfo* property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + assert(!property_info->need_out_of_mutex); + default_cf_internal_stats_->GetStringProperty(*property_info, *property, + &stats); + + property = &DB::Properties::kCFStatsNoFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + assert(!property_info->need_out_of_mutex); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } + + property = &DB::Properties::kCFFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + assert(!property_info->need_out_of_mutex); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } } @@ -863,12 +1027,18 @@ PrintStatistics(); } +void DBImpl::FlushInfoLog() { + if (shutdown_initiated_) { + return; + } + TEST_SYNC_POINT("DBImpl::FlushInfoLog:StartRunning"); + LogFlush(immutable_db_options_.info_log); +} + Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, int max_entries_to_print, std::string* out_str) { - auto* cfh = - static_cast_with_check( - column_family); + auto* cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); @@ -890,9 +1060,9 @@ } } -Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { 
+FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { assert(cfd); - Directory* ret_dir = cfd->GetDataDir(path_id); + FSDirectory* ret_dir = cfd->GetDataDir(path_id); if (ret_dir == nullptr) { return directories_.GetDataDir(path_id); } @@ -907,7 +1077,8 @@ (void)options_map; return Status::NotSupported("Not supported in ROCKSDB LITE"); #else - auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* cfd = + static_cast_with_check(column_family)->cfd(); if (options_map.empty()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetOptions() on column family [%s], empty input", @@ -918,6 +1089,7 @@ MutableCFOptions new_options; Status s; Status persist_options_status; + persist_options_status.PermitUncheckedError(); // Allow uninitialized access SuperVersionContext sv_context(/* create_superversion */ true); { auto db_options = GetDBOptions(); @@ -927,8 +1099,8 @@ new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. VersionEdit dummy_edit; - versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. 
@@ -978,16 +1150,26 @@ MutableDBOptions new_options; Status s; - Status persist_options_status; + Status persist_options_status = Status::OK(); bool wal_changed = false; WriteContext write_context; { InstrumentedMutexLock l(&mutex_); s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, &new_options); + if (new_options.bytes_per_sync == 0) { new_options.bytes_per_sync = 1024 * 1024; } + + if (MutableDBOptionsAreEqual(mutable_db_options_, new_options)) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "SetDBOptions(), input option value is not changed, " + "skipping updating."); + persist_options_status.PermitUncheckedError(); + return s; + } + DBOptions new_db_options = BuildDBOptions(immutable_db_options_, new_options); if (s.ok()) { @@ -1006,12 +1188,12 @@ } if (s.ok()) { const BGJobLimits current_bg_job_limits = - GetBGJobLimits(immutable_db_options_.max_background_flushes, + GetBGJobLimits(mutable_db_options_.max_background_flushes, mutable_db_options_.max_background_compactions, mutable_db_options_.max_background_jobs, /* parallelize_compactions */ true); const BGJobLimits new_bg_job_limits = GetBGJobLimits( - immutable_db_options_.max_background_flushes, + new_options.max_background_flushes, new_options.max_background_compactions, new_options.max_background_jobs, /* parallelize_compactions */ true); @@ -1036,36 +1218,15 @@ } if (new_options.stats_dump_period_sec != - mutable_db_options_.stats_dump_period_sec) { - if (thread_dump_stats_) { - mutex_.Unlock(); - thread_dump_stats_->cancel(); - mutex_.Lock(); - } - if (new_options.stats_dump_period_sec > 0) { - thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - static_cast(new_options.stats_dump_period_sec) * - kMicrosInSecond)); - } else { - thread_dump_stats_.reset(); - } - } - if (new_options.stats_persist_period_sec != - mutable_db_options_.stats_persist_period_sec) { - if (thread_persist_stats_) { - mutex_.Unlock(); - 
thread_persist_stats_->cancel(); - mutex_.Lock(); - } - if (new_options.stats_persist_period_sec > 0) { - thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - static_cast(new_options.stats_persist_period_sec) * - kMicrosInSecond)); - } else { - thread_persist_stats_.reset(); - } + mutable_db_options_.stats_dump_period_sec || + new_options.stats_persist_period_sec != + mutable_db_options_.stats_persist_period_sec) { + mutex_.Unlock(); + periodic_work_scheduler_->Unregister(this); + periodic_work_scheduler_->Register( + this, new_options.stats_dump_period_sec, + new_options.stats_persist_period_sec); + mutex_.Lock(); } write_controller_.set_max_delayed_write_rate( new_options.delayed_write_rate); @@ -1097,6 +1258,10 @@ persist_options_status = WriteOptionsFile( false /*need_mutex_lock*/, false /*need_enter_write_thread*/); write_thread_.ExitUnbatched(&w); + } else { + // To get here, we must have had invalid options and will not attempt to + // persist the options, which means the status is "OK/Uninitialized. 
+ persist_options_status.PermitUncheckedError(); } } ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:"); @@ -1147,25 +1312,25 @@ Status DBImpl::FlushWAL(bool sync) { if (manual_wal_flush_) { - Status s; + IOStatus io_s; { // We need to lock log_write_mutex_ since logs_ might change concurrently InstrumentedMutexLock wl(&log_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; - s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(); } - if (!s.ok()) { + if (!io_s.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", - s.ToString().c_str()); + io_s.ToString().c_str()); // In case there is a fs error we should set it globally to prevent the // future writes - WriteStatusCheck(s); + IOStatusCheck(io_s); // whether sync or not, we should abort the rest of function upon error - return s; + return std::move(io_s); } if (!sync) { ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false"); - return s; + return std::move(io_s); } } if (!sync) { @@ -1217,21 +1382,36 @@ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1"); RecordTick(stats_, WAL_FILE_SYNCED); Status status; + IOStatus io_s; for (log::Writer* log : logs_to_sync) { - status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); - if (!status.ok()) { + io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); + if (!io_s.ok()) { + status = io_s; break; } } + if (!io_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL Sync error %s", + io_s.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + IOStatusCheck(io_s); + } if (status.ok() && need_log_dir_sync) { - status = directories_.GetWalDir()->Fsync(); + status = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); 
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); { InstrumentedMutexLock l(&mutex_); - MarkLogsSynced(current_log_number, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(current_log_number, need_log_dir_sync); + } else { + MarkLogsNotSynced(current_log_number); + } } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -1249,7 +1429,7 @@ // future writes WriteStatusCheck(status); } - return status; + return std::move(status); } Status DBImpl::UnlockWAL() { @@ -1257,27 +1437,54 @@ return Status::OK(); } -void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, - const Status& status) { +Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { mutex_.AssertHeld(); - if (synced_dir && logfile_number_ == up_to && status.ok()) { + if (synced_dir && logfile_number_ == up_to) { log_dir_synced_ = true; } + VersionEdit synced_wals; for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { - auto& log = *it; - assert(log.getting_synced); - if (status.ok() && logs_.size() > 1) { - logs_to_free_.push_back(log.ReleaseWriter()); + auto& wal = *it; + assert(wal.getting_synced); + if (logs_.size() > 1) { + if (immutable_db_options_.track_and_verify_wals_in_manifest && + wal.writer->file()->GetFileSize() > 0) { + synced_wals.AddWal(wal.number, + WalMetadata(wal.writer->file()->GetFileSize())); + } + logs_to_free_.push_back(wal.ReleaseWriter()); // To modify logs_ both mutex_ and log_write_mutex_ must be held InstrumentedMutexLock l(&log_write_mutex_); it = logs_.erase(it); } else { - log.getting_synced = false; + wal.getting_synced = false; ++it; } } - assert(!status.ok() || logs_.empty() || logs_[0].number > up_to || + assert(logs_.empty() || logs_[0].number > up_to || (logs_.size() == 1 && !logs_[0].getting_synced)); + + Status s; + if (synced_wals.IsWalAddition()) { + // not empty, write to MANIFEST. 
+ s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + } + log_sync_cv_.SignalAll(); + return s; +} + +void DBImpl::MarkLogsNotSynced(uint64_t up_to) { + mutex_.AssertHeld(); + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to; + ++it) { + auto& wal = *it; + assert(wal.getting_synced); + wal.getting_synced = false; + } log_sync_cv_.SignalAll(); } @@ -1298,23 +1505,49 @@ } } -InternalIterator* DBImpl::NewInternalIterator( - Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, - ColumnFamilyHandle* column_family) { +Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) { + if (ts_low == nullptr) { + return Status::InvalidArgument("ts_low is nullptr"); + } + ColumnFamilyData* cfd = nullptr; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); + cfd = cfh->cfd(); + } + assert(cfd != nullptr && cfd->user_comparator() != nullptr); + if (cfd->user_comparator()->timestamp_size() == 0) { + return Status::InvalidArgument( + "Timestamp is not enabled in this column family"); + } + InstrumentedMutexLock l(&mutex_); + *ts_low = cfd->GetFullHistoryTsLow(); + assert(cfd->user_comparator()->timestamp_size() == ts_low->size()); + return Status::OK(); +} + +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence, + ColumnFamilyHandle* column_family, + bool allow_unprepared_value) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } mutex_.Lock(); SuperVersion* 
super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); - ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg, - sequence); + return NewInternalIterator(read_options, cfd, super_version, arena, + range_del_agg, sequence, allow_unprepared_value); } void DBImpl::SchedulePurge() { @@ -1346,6 +1579,8 @@ mutex_.Lock(); } + assert(bg_purge_scheduled_ > 0); + // Can't use iterator to go over purge_files_ because inside the loop we're // unlocking the mutex that protects purge_files_. while (!purge_files_.empty()) { @@ -1413,17 +1648,7 @@ delete state->super_version; } if (job_context.HaveSomethingToDelete()) { - if (state->background_purge) { - // PurgeObsoleteFiles here does not delete files. Instead, it adds the - // files to be deleted to a job queue, and deletes it in a separate - // background thread. - state->db->PurgeObsoleteFiles(job_context, true /* schedule only */); - state->mu->Lock(); - state->db->SchedulePurge(); - state->mu->Unlock(); - } else { - state->db->PurgeObsoleteFiles(job_context); - } + state->db->PurgeObsoleteFiles(job_context, state->background_purge); } job_context.Clean(); } @@ -1437,7 +1662,8 @@ SuperVersion* super_version, Arena* arena, RangeDelAggregator* range_del_agg, - SequenceNumber sequence) { + SequenceNumber sequence, + bool allow_unprepared_value) { InternalIterator* internal_iter; assert(arena != nullptr); assert(range_del_agg != nullptr); @@ -1469,7 +1695,8 @@ // Collect iterators for files in L0 - Ln if (read_options.read_tier != kMemtableTier) { super_version->current->AddIterators(read_options, file_options_, - &merge_iter_builder, range_del_agg); + &merge_iter_builder, range_del_agg, + allow_unprepared_value); } internal_iter = merge_iter_builder.Finish(); IterState* cleanup = @@ -1496,22 +1723,57 @@ Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + return Get(read_options, column_family, 
key, value, /*timestamp=*/nullptr); +} + +Status DBImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) { GetImplOptions get_impl_options; get_impl_options.column_family = column_family; get_impl_options.value = value; - return GetImpl(read_options, key, get_impl_options); + get_impl_options.timestamp = timestamp; + Status s = GetImpl(read_options, key, get_impl_options); + return s; } +namespace { +class GetWithTimestampReadCallback : public ReadCallback { + public: + explicit GetWithTimestampReadCallback(SequenceNumber seq) + : ReadCallback(seq) {} + bool IsVisibleFullCheck(SequenceNumber seq) override { + return seq <= max_visible_seq_; + } +}; +} // namespace + Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, - GetImplOptions get_impl_options) { + GetImplOptions& get_impl_options) { assert(get_impl_options.value != nullptr || get_impl_options.merge_operands != nullptr); - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_GET); + + assert(get_impl_options.column_family); + const Comparator* ucmp = get_impl_options.column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + GetWithTimestampReadCallback read_cb(0); // Will call Refresh + +#ifndef NDEBUG + if (ts_sz > 0) { + assert(read_options.timestamp); + assert(read_options.timestamp->size() == ts_sz); + } else { + assert(!read_options.timestamp); + } +#endif // NDEBUG + + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); - auto cfh = - reinterpret_cast(get_impl_options.column_family); + auto cfh = static_cast_with_check( + get_impl_options.column_family); auto cfd = cfh->cfd(); if (tracer_) { @@ -1519,7 +1781,8 @@ // tracing is enabled. 
InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(get_impl_options.column_family, key); + // TODO: maybe handle the tracing status? + tracer_->Get(get_impl_options.column_family, key).PermitUncheckedError(); } } @@ -1544,9 +1807,11 @@ // data for the snapshot, so the reader would see neither data that was be // visible to the snapshot before compaction nor the newer data inserted // afterwards. - snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + snapshot = versions_->LastSequence(); + } else { + snapshot = versions_->LastPublishedSequence(); + } if (get_impl_options.callback) { // The unprep_seqs are not published for write unprepared, so it could be // that max_visible_seq is larger. Seek to the std::max of the two. @@ -1566,6 +1831,16 @@ snapshot = get_impl_options.callback->max_visible_seq(); } } + // If timestamp is used, we use read callback to ensure is returned + // only if t <= read_opts.timestamp and s <= snapshot. + // HACK: temporarily overwrite input struct field but restore + SaveAndRestore restore_callback(&get_impl_options.callback); + if (ts_sz > 0) { + assert(!get_impl_options + .callback); // timestamp with callback is not supported + read_cb.Refresh(snapshot); + get_impl_options.callback = &read_cb; + } TEST_SYNC_POINT("DBImpl::GetImpl:3"); TEST_SYNC_POINT("DBImpl::GetImpl:4"); @@ -1583,10 +1858,11 @@ bool skip_memtable = (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; + std::string* timestamp = ts_sz > 0 ? 
get_impl_options.timestamp : nullptr; if (!skip_memtable) { // Get value associated with key if (get_impl_options.get_value) { - if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), timestamp, &s, &merge_context, &max_covering_tombstone_seq, read_options, get_impl_options.callback, get_impl_options.is_blob_index)) { @@ -1594,9 +1870,10 @@ get_impl_options.value->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, - &merge_context, &max_covering_tombstone_seq, - read_options, get_impl_options.callback, + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), + timestamp, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + get_impl_options.callback, get_impl_options.is_blob_index)) { done = true; get_impl_options.value->PinSelf(); @@ -1605,9 +1882,9 @@ } else { // Get Merge Operands associated with key, Merge Operands should not be // merged and raw values should be returned to the user. - if (sv->mem->Get(lkey, nullptr, &s, &merge_context, - &max_covering_tombstone_seq, read_options, nullptr, - nullptr, false)) { + if (sv->mem->Get(lkey, /*value*/ nullptr, /*timestamp=*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, + read_options, nullptr, nullptr, false)) { done = true; RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && @@ -1623,11 +1900,12 @@ return s; } } + PinnedIteratorsManager pinned_iters_mgr; if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get( - read_options, lkey, get_impl_options.value, &s, &merge_context, - &max_covering_tombstone_seq, + read_options, lkey, get_impl_options.value, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, get_impl_options.get_value ? get_impl_options.value_found : nullptr, nullptr, nullptr, get_impl_options.get_value ? 
get_impl_options.callback : nullptr, @@ -1675,17 +1953,49 @@ const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + return MultiGet(read_options, column_family, keys, values, + /*timestamps=*/nullptr); +} + +std::vector DBImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values, + std::vector* timestamps) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); +#ifndef NDEBUG + for (const auto* cfh : column_family) { + assert(cfh); + const Comparator* const ucmp = cfh->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + assert(read_options.timestamp); + assert(ucmp->timestamp_size() == read_options.timestamp->size()); + } else { + assert(!read_options.timestamp); + } + } +#endif // NDEBUG + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(column_family, keys).PermitUncheckedError(); + } + } + SequenceNumber consistent_seqnum; - ; std::unordered_map multiget_cf_data( column_family.size()); for (auto cf : column_family) { - auto cfh = reinterpret_cast(cf); + auto cfh = static_cast_with_check(cf); auto cfd = cfh->cfd(); if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { multiget_cf_data.emplace(cfd->GetID(), @@ -1704,6 +2014,9 @@ read_options, nullptr, iter_deref_lambda, &multiget_cf_data, &consistent_seqnum); + TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1"); + TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2"); + // Contain a list of merge operations if merge occurs. 
MergeContext merge_context; @@ -1711,6 +2024,9 @@ size_t num_keys = keys.size(); std::vector stat_list(num_keys); values->resize(num_keys); + if (timestamps) { + timestamps->resize(num_keys); + } // Keep track of bytes that we read for statistics-recording later uint64_t bytes_read = 0; @@ -1721,13 +2037,25 @@ // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. size_t num_found = 0; - for (size_t i = 0; i < num_keys; ++i) { - merge_context.Clear(); - Status& s = stat_list[i]; - std::string* value = &(*values)[i]; + size_t keys_read; + uint64_t curr_value_size = 0; - LookupKey lkey(keys[i], consistent_seqnum); - auto cfh = reinterpret_cast(column_family[i]); + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = nullptr; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + for (keys_read = 0; keys_read < num_keys; ++keys_read) { + merge_context.Clear(); + Status& s = stat_list[keys_read]; + std::string* value = &(*values)[keys_read]; + std::string* timestamp = timestamps ? 
&(*timestamps)[keys_read] : nullptr; + + LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp); + auto cfh = + static_cast_with_check(column_family[keys_read]); SequenceNumber max_covering_tombstone_seq = 0; auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); assert(mgd_iter != multiget_cf_data.end()); @@ -1738,13 +2066,15 @@ has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; if (!skip_memtable) { - if (super_version->mem->Get(lkey, value, &s, &merge_context, - &max_covering_tombstone_seq, read_options)) { + if (super_version->mem->Get(lkey, value, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + read_callback)) { done = true; RecordTick(stats_, MEMTABLE_HIT); - } else if (super_version->imm->Get(lkey, value, &s, &merge_context, + } else if (super_version->imm->Get(lkey, value, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, - read_options)) { + read_options, read_callback)) { done = true; RecordTick(stats_, MEMTABLE_HIT); } @@ -1752,8 +2082,13 @@ if (!done) { PinnableSlice pinnable_val; PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(read_options, lkey, &pinnable_val, &s, - &merge_context, &max_covering_tombstone_seq); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, &pinnable_val, timestamp, + &s, &merge_context, + &max_covering_tombstone_seq, + &pinned_iters_mgr, /*value_found=*/nullptr, + /*key_exists=*/nullptr, + /*seq=*/nullptr, read_callback); value->assign(pinnable_val.data(), pinnable_val.size()); RecordTick(stats_, MEMTABLE_MISS); } @@ -1761,6 +2096,28 @@ if (s.ok()) { bytes_read += value->size(); num_found++; + curr_value_size += value->size(); + if (curr_value_size > read_options.value_size_soft_limit) { + while (++keys_read < num_keys) { + stat_list[keys_read] = Status::Aborted(); + } + break; + } + } + if (read_options.deadline.count() && + immutable_db_options_.clock->NowMicros() > + 
static_cast(read_options.deadline.count())) { + break; + } + } + + if (keys_read < num_keys) { + // The only reason to break out of the loop is when the deadline is + // exceeded + assert(immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())); + for (++keys_read; keys_read < num_keys; ++keys_read) { + stat_list[keys_read] = Status::TimedOut(); } } @@ -1827,16 +2184,18 @@ // version because a flush happening in between may compact away data for // the snapshot, but the snapshot is earlier than the data overwriting it, // so users may see wrong results. - *snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + *snapshot = versions_->LastSequence(); + } else { + *snapshot = versions_->LastPublishedSequence(); + } } } else { // If we end up with the same issue of memtable geting sealed during 2 // consecutive retries, it means the write rate is very high. In that case // its probably ok to take the mutex on the 3rd try so we can succeed for // sure - static const int num_retries = 3; + constexpr int num_retries = 3; for (int i = 0; i < num_retries; ++i) { last_try = (i == num_retries - 1); bool retry = false; @@ -1860,12 +2219,15 @@ // acquire the lock so we're sure to succeed mutex_.Lock(); } - *snapshot = last_seq_same_as_publish_seq_ - ? 
versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + *snapshot = versions_->LastSequence(); + } else { + *snapshot = versions_->LastPublishedSequence(); + } } else { - *snapshot = reinterpret_cast(read_options.snapshot) - ->number_; + *snapshot = + static_cast_with_check(read_options.snapshot) + ->number_; } for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); ++cf_iter) { @@ -1915,14 +2277,49 @@ ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + return MultiGet(read_options, num_keys, column_families, keys, values, + /*timestamps=*/nullptr, statuses, sorted_input); +} + +void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) { if (num_keys == 0) { return; } + +#ifndef NDEBUG + for (size_t i = 0; i < num_keys; ++i) { + ColumnFamilyHandle* cfh = column_families[i]; + assert(cfh); + const Comparator* const ucmp = cfh->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + assert(read_options.timestamp); + assert(read_options.timestamp->size() == ucmp->timestamp_size()); + } else { + assert(!read_options.timestamp); + } + } +#endif // NDEBUG + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError(); + } + } + autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { key_context.emplace_back(column_families[i], keys[i], &values[i], + timestamps ? 
×tamps[i] : nullptr, &statuses[i]); } for (size_t i = 0; i < num_keys; ++i) { @@ -1934,20 +2331,18 @@ multiget_cf_data; size_t cf_start = 0; ColumnFamilyHandle* cf = sorted_keys[0]->column_family; + for (size_t i = 0; i < num_keys; ++i) { KeyContext* key_ctx = sorted_keys[i]; if (key_ctx->column_family != cf) { - multiget_cf_data.emplace_back( - MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr)); + multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr); cf_start = i; cf = key_ctx->column_family; } } - { - // multiget_cf_data.emplace_back( - // MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr)); - multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); - } + + multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); + std::function::iterator&)> @@ -1963,14 +2358,38 @@ read_options, nullptr, iter_deref_lambda, &multiget_cf_data, &consistent_seqnum); - for (auto cf_iter = multiget_cf_data.begin(); - cf_iter != multiget_cf_data.end(); ++cf_iter) { - MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys, - cf_iter->super_version, consistent_seqnum, nullptr, nullptr); + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = nullptr; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + Status s; + auto cf_iter = multiget_cf_data.begin(); + for (; cf_iter != multiget_cf_data.end(); ++cf_iter) { + s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, + &sorted_keys, cf_iter->super_version, consistent_seqnum, + read_callback); + if (!s.ok()) { + break; + } + } + if (!s.ok()) { + assert(s.IsTimedOut() || s.IsAborted()); + for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) { + for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys; + ++i) { + *sorted_keys[i]->s = s; + } + } + } + + for 
(const auto& iter : multiget_cf_data) { if (!unref_only) { - ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version); + ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version); } else { - cf_iter->cfd->GetSuperVersion()->Unref(); + iter.cfd->GetSuperVersion()->Unref(); } } } @@ -1983,7 +2402,7 @@ static_cast(lhs->column_family); uint32_t cfd_id1 = cfh->cfd()->GetID(); const Comparator* comparator = cfh->cfd()->user_comparator(); - cfh = static_cast(lhs->column_family); + cfh = static_cast(rhs->column_family); uint32_t cfd_id2 = cfh->cfd()->GetID(); if (cfd_id1 < cfd_id2) { @@ -1993,7 +2412,8 @@ } // Both keys are from the same column family - int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); + int cmp = comparator->CompareWithoutTimestamp( + *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false); if (cmp < 0) { return true; } @@ -2006,48 +2426,47 @@ void DBImpl::PrepareMultiGetKeys( size_t num_keys, bool sorted_input, autovector* sorted_keys) { -#ifndef NDEBUG if (sorted_input) { - for (size_t index = 0; index < sorted_keys->size(); ++index) { - if (index > 0) { - KeyContext* lhs = (*sorted_keys)[index - 1]; - KeyContext* rhs = (*sorted_keys)[index]; - ColumnFamilyHandleImpl* cfh = - reinterpret_cast(lhs->column_family); - uint32_t cfd_id1 = cfh->cfd()->GetID(); - const Comparator* comparator = cfh->cfd()->user_comparator(); - cfh = reinterpret_cast(lhs->column_family); - uint32_t cfd_id2 = cfh->cfd()->GetID(); - - assert(cfd_id1 <= cfd_id2); - if (cfd_id1 < cfd_id2) { - continue; - } - - // Both keys are from the same column family - int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); - assert(cmp <= 0); - } - index++; - } - } +#ifndef NDEBUG + assert(std::is_sorted(sorted_keys->begin(), sorted_keys->end(), + CompareKeyContext())); #endif - if (!sorted_input) { - CompareKeyContext sort_comparator; - std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, - sort_comparator); + return; } + + 
std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, + CompareKeyContext()); } void DBImpl::MultiGet(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + return MultiGet(read_options, column_family, num_keys, keys, values, + /*timestamp=*/nullptr, statuses, sorted_input); +} + +void DBImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + std::string* timestamps, Status* statuses, + const bool sorted_input) { + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError(); + } + } autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { - key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]); + key_context.emplace_back(column_family, keys[i], &values[i], + timestamps ? 
×tamps[i] : nullptr, + &statuses[i]); } for (size_t i = 0; i < num_keys; ++i) { sorted_keys[i] = &key_context[i]; @@ -2100,33 +2519,61 @@ consistent_seqnum = callback->max_visible_seq(); } - MultiGetImpl(read_options, 0, num_keys, sorted_keys, - multiget_cf_data[0].super_version, consistent_seqnum, nullptr, - nullptr); + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = callback; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + assert(!read_callback); // timestamp with callback is not supported + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys, + multiget_cf_data[0].super_version, consistent_seqnum, + read_callback); + assert(s.ok() || s.IsTimedOut() || s.IsAborted()); ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, multiget_cf_data[0].super_version); } -void DBImpl::MultiGetImpl( +// The actual implementation of batched MultiGet. Parameters - +// start_key - Index in the sorted_keys vector to start processing from +// num_keys - Number of keys to lookup, starting with sorted_keys[start_key] +// sorted_keys - The entire batch of sorted keys for this CF +// +// The per key status is returned in the KeyContext structures pointed to by +// sorted_keys. An overall Status is also returned, with the only possible +// values being Status::OK() and Status::TimedOut(). 
The latter indicates +// that the call exceeded read_options.deadline +Status DBImpl::MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, SuperVersion* super_version, SequenceNumber snapshot, - ReadCallback* callback, bool* is_blob_index) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + ReadCallback* callback) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. size_t keys_left = num_keys; + Status s; + uint64_t curr_value_size = 0; while (keys_left) { + if (read_options.deadline.count() && + immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())) { + s = Status::TimedOut(); + break; + } + size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE) ? 
MultiGetContext::MAX_BATCH_SIZE : keys_left; MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left, - batch_size, snapshot); + batch_size, snapshot, read_options); MultiGetRange range = ctx.GetMultiGetRange(); + range.AddValueSize(curr_value_size); bool lookup_current = false; keys_left -= batch_size; @@ -2140,11 +2587,9 @@ (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); if (!skip_memtable) { - super_version->mem->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->mem->MultiGet(read_options, &range, callback); if (!range.empty()) { - super_version->imm->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->imm->MultiGet(read_options, &range, callback); } if (!range.empty()) { lookup_current = true; @@ -2154,8 +2599,12 @@ } if (lookup_current) { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->current->MultiGet(read_options, &range, callback); + } + curr_value_size = range.GetValueSize(); + if (curr_value_size > read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; } } @@ -2163,13 +2612,21 @@ PERF_TIMER_GUARD(get_post_process_time); size_t num_found = 0; uint64_t bytes_read = 0; - for (size_t i = start_key; i < start_key + num_keys; ++i) { + for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) { KeyContext* key = (*sorted_keys)[i]; if (key->s->ok()) { bytes_read += key->value->size(); num_found++; } } + if (keys_left) { + assert(s.IsTimedOut() || s.IsAborted()); + for (size_t i = start_key + num_keys - keys_left; i < start_key + num_keys; + ++i) { + KeyContext* key = (*sorted_keys)[i]; + *key->s = s; + } + } RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); @@ -2178,6 +2635,8 @@ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); PERF_COUNTER_ADD(multiget_read_bytes, 
bytes_read); PERF_TIMER_STOP(get_post_process_time); + + return s; } Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, @@ -2252,7 +2711,6 @@ const std::string& column_family_name, ColumnFamilyHandle** handle) { Status s; - Status persist_options_status; *handle = nullptr; DBOptions db_options = @@ -2301,7 +2759,7 @@ auto* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); - std::map> dummy_created_dirs; + std::map> dummy_created_dirs; s = cfd->AddDirectories(&dummy_created_dirs); } if (s.ok()) { @@ -2333,7 +2791,7 @@ // this is outside the mutex if (s.ok()) { NewThreadStatusCfInfo( - reinterpret_cast(*handle)->cfd()); + static_cast_with_check(*handle)->cfd()); } return s; } @@ -2370,7 +2828,7 @@ } Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (cfd->GetID() == 0) { return Status::InvalidArgument("Can't drop default column family"); @@ -2436,7 +2894,8 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, bool* value_found) { + std::string* value, std::string* timestamp, + bool* value_found) { assert(value != nullptr); if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value @@ -2449,6 +2908,7 @@ get_impl_options.column_family = column_family; get_impl_options.value = &pinnable_val; get_impl_options.value_found = value_found; + get_impl_options.timestamp = timestamp; auto s = GetImpl(roptions, key, get_impl_options); value->assign(pinnable_val.data(), pinnable_val.size()); @@ -2471,6 +2931,13 @@ } // if iterator wants internal keys, we can only proceed if // we can guarantee the deletes haven't been processed yet + if (read_options.iter_start_seqnum > 0 && + !iter_start_seqnum_deprecation_warned_.exchange(true)) { + ROCKS_LOG_WARN( + 
immutable_db_options_.info_log, + "iter_start_seqnum is deprecated, will be removed in a future release. " + "Please try using user-defined timestamp instead."); + } if (immutable_db_options_.preserve_deletes && read_options.iter_start_seqnum > 0 && read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) { @@ -2478,8 +2945,9 @@ "Iterator requested internal keys which are too old and are not" " guaranteed to be preserved, try larger iter_start_seqnum opt.")); } - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); + auto cfh = static_cast_with_check(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + assert(cfd != nullptr); ReadCallback* read_callback = nullptr; // No read callback provided. if (read_options.tailing) { #ifdef ROCKSDB_LITE @@ -2488,10 +2956,11 @@ #else SuperVersion* sv = cfd->GetReferencedSuperVersion(this); - auto iter = new ForwardIterator(this, read_options, cfd, sv); + auto iter = new ForwardIterator(this, read_options, cfd, sv, + /* allow_unprepared_value */ true); result = NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd); #endif @@ -2499,10 +2968,11 @@ // Note: no need to consider the special case of // last_seq_same_as_publish_seq_==false since NewIterator is overridden in // WritePreparedTxnDB - auto snapshot = read_options.snapshot != nullptr - ? read_options.snapshot->GetSequenceNumber() - : versions_->LastSequence(); - result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + result = NewIteratorImpl(read_options, cfd, + (read_options.snapshot != nullptr) + ? 
read_options.snapshot->GetSequenceNumber() + : kMaxSequenceNumber, + read_callback); } return result; } @@ -2511,10 +2981,28 @@ ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob, + bool expose_blob_index, bool allow_refresh) { SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + TEST_SYNC_POINT("DBImpl::NewIterator:1"); + TEST_SYNC_POINT("DBImpl::NewIterator:2"); + + if (snapshot == kMaxSequenceNumber) { + // Note that the snapshot is assigned AFTER referencing the super + // version because otherwise a flush happening in between may compact away + // data for the snapshot, so the reader would see neither data that was be + // visible to the snapshot before compaction nor the newer data inserted + // afterwards. + // Note that the super version might not contain all the data available + // to this snapshot, but in that case it can see all the data in the + // super version, which is a valid consistent state after the user + // calls NewIterator(). + snapshot = versions_->LastSequence(); + TEST_SYNC_POINT("DBImpl::NewIterator:3"); + TEST_SYNC_POINT("DBImpl::NewIterator:4"); + } + // Try to generate a DB iterator tree in continuous memory area to be // cache friendly. Here is an example of result: // +-------------------------------+ @@ -2558,14 +3046,15 @@ // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, this, cfd, allow_blob, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, this, cfd, expose_blob_index, read_options.snapshot != nullptr ? 
false : allow_refresh); - InternalIterator* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), snapshot); + InternalIterator* internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; @@ -2591,12 +3080,13 @@ "Tailing iterator not supported in RocksDB lite"); #else for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); + auto cfd = static_cast_with_check(cfh)->cfd(); SuperVersion* sv = cfd->GetReferencedSuperVersion(this); - auto iter = new ForwardIterator(this, read_options, cfd, sv); + auto iter = new ForwardIterator(this, read_options, cfd, sv, + /* allow_unprepared_value */ true); iterators->push_back(NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd)); } @@ -2610,7 +3100,8 @@ : versions_->LastSequence(); for (size_t i = 0; i < column_families.size(); ++i) { auto* cfd = - reinterpret_cast(column_families[i])->cfd(); + static_cast_with_check(column_families[i]) + ->cfd(); iterators->push_back( NewIteratorImpl(read_options, cfd, snapshot, read_callback)); } @@ -2630,7 +3121,8 @@ SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, bool lock) { int64_t unix_time = 0; - env_->GetCurrentTime(&unix_time); // Ignore error + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error SnapshotImpl* s = new SnapshotImpl; if (lock) { @@ -2656,7 +3148,7 @@ } namespace { -typedef autovector CfdList; +using CfdList = autovector; bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) { for (const ColumnFamilyData* t : 
list) { if (t == cfd) { @@ -2668,15 +3160,23 @@ } // namespace void DBImpl::ReleaseSnapshot(const Snapshot* s) { + if (s == nullptr) { + // DBImpl::GetSnapshot() can return nullptr when snapshot + // not supported by specifying the condition: + // inplace_update_support enabled. + return; + } const SnapshotImpl* casted_s = reinterpret_cast(s); { InstrumentedMutexLock l(&mutex_); snapshots_.Delete(casted_s); uint64_t oldest_snapshot; if (snapshots_.empty()) { - oldest_snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + oldest_snapshot = versions_->LastSequence(); + } else { + oldest_snapshot = versions_->LastPublishedSequence(); + } } else { oldest_snapshot = snapshots_.oldest()->number_; } @@ -2717,7 +3217,7 @@ #ifndef ROCKSDB_LITE Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); // Increment the ref count @@ -2739,7 +3239,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); // Increment the ref count @@ -2765,17 +3265,37 @@ Env* DBImpl::GetEnv() const { return env_; } FileSystem* DB::GetFileSystem() const { - static LegacyFileSystemWrapper fs_wrap(GetEnv()); - return &fs_wrap; + const auto& fs = GetEnv()->GetFileSystem(); + return fs.get(); } FileSystem* DBImpl::GetFileSystem() const { return immutable_db_options_.fs.get(); } +SystemClock* DBImpl::GetSystemClock() const { + return immutable_db_options_.clock; +} + +#ifndef ROCKSDB_LITE + +Status DBImpl::StartIOTrace(const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + assert(trace_writer != 
nullptr); + return io_tracer_->StartIOTrace(GetSystemClock(), trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndIOTrace() { + io_tracer_->EndIOTrace(); + return Status::OK(); +} + +#endif // ROCKSDB_LITE + Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { InstrumentedMutexLock l(&mutex_); - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), cfh->cfd()->GetLatestCFOptions()); } @@ -2789,7 +3309,8 @@ const Slice& property, std::string* value) { const DBPropertyInfo* property_info = GetPropertyInfo(property); value->clear(); - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); if (property_info == nullptr) { return false; } else if (property_info->handle_int) { @@ -2801,16 +3322,21 @@ } return ret_value; } else if (property_info->handle_string) { - InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetStringProperty(*property_info, property, - value); + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } } else if (property_info->handle_string_dbimpl) { - std::string tmp_value; - bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value); - if (ret_value) { - *value = tmp_value; + if (property_info->need_out_of_mutex) { + return (this->*(property_info->handle_string_dbimpl))(value); + } else { + InstrumentedMutexLock l(&mutex_); + return (this->*(property_info->handle_string_dbimpl))(value); } - return ret_value; } // Shouldn't reach here since exactly one of handle_string and handle_int // should be non-nullptr. 
@@ -2823,13 +3349,19 @@ std::map* value) { const DBPropertyInfo* property_info = GetPropertyInfo(property); value->clear(); - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); if (property_info == nullptr) { return false; } else if (property_info->handle_map) { - InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetMapProperty(*property_info, property, - value); + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } } // If we reach this point it means that handle_map is not provided for the // requested property @@ -2842,7 +3374,8 @@ if (property_info == nullptr || property_info->handle_int == nullptr) { return false; } - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); return GetIntPropertyInternal(cfd, *property_info, false, value); } @@ -2860,17 +3393,17 @@ } } else { SuperVersion* sv = nullptr; - if (!is_locked) { - sv = GetAndRefSuperVersion(cfd); - } else { - sv = cfd->GetSuperVersion(); + if (is_locked) { + mutex_.Unlock(); } + sv = GetAndRefSuperVersion(cfd); bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( property_info, sv->current, value); - if (!is_locked) { - ReturnAndCleanupSuperVersion(cfd, sv); + ReturnAndCleanupSuperVersion(cfd, sv); + if (is_locked) { + mutex_.Lock(); } return ret; @@ -2879,7 +3412,7 @@ bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { assert(value != nullptr); - Statistics* statistics = immutable_db_options_.statistics.get(); + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return false; } @@ -2907,23 +3440,28 @@ } uint64_t sum = 0; + bool ret = true; { // Needs mutex to protect the list of column families. 
InstrumentedMutexLock l(&mutex_); uint64_t value; - for (auto* cfd : *versions_->GetColumnFamilySet()) { + for (auto* cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->initialized()) { continue; } - if (GetIntPropertyInternal(cfd, *property_info, true, &value)) { + ret = GetIntPropertyInternal(cfd, *property_info, true, &value); + // GetIntPropertyInternal may release db mutex and re-acquire it. + mutex_.AssertHeld(); + if (ret) { sum += value; } else { - return false; + ret = false; + break; } } } *aggregated_value = sum; - return true; + return ret; } SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { @@ -3015,7 +3553,7 @@ uint64_t* const count, uint64_t* const size) { ColumnFamilyHandleImpl* cfh = - reinterpret_cast(column_family); + static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -3039,16 +3577,34 @@ return Status::InvalidArgument("Invalid options"); } + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + Version* v; - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); SuperVersion* sv = GetAndRefSuperVersion(cfd); v = sv->current; for (int i = 0; i < n; i++) { + Slice start = range[i].start; + Slice limit = range[i].limit; + + // Add timestamp if needed + std::string start_with_ts, limit_with_ts; + if (ts_sz > 0) { + // Maximum timestamp means including all key with any timestamp + AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz); + // Append a maximum timestamp as the range limit is exclusive: + // [start, limit) + AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz); + start = start_with_ts; + limit = limit_with_ts; + } // Convert user_key into a corresponding internal key. 
- InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( @@ -3100,14 +3656,13 @@ FileType type; WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || - (type != kTableFile && type != kLogFile)) { + (type != kTableFile && type != kWalFile)) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } - Status status; - if (type == kLogFile) { + if (type == kWalFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, @@ -3115,7 +3670,7 @@ name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = wal_manager_.DeleteFile(name, number); + Status status = wal_manager_.DeleteFile(name, number); if (!status.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), @@ -3124,6 +3679,7 @@ return status; } + Status status; int level; FileMetaData* metadata; ColumnFamilyData* cfd; @@ -3197,8 +3753,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - Status status; - auto cfh = reinterpret_cast(column_family); + Status status = Status::OK(); + auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); VersionEdit edit; std::set deleted_files; @@ -3252,11 +3808,13 @@ deleted_files.insert(level_file); level_file->being_compacted = true; } + vstorage->ComputeCompactionScore(*cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); } } if (edit.GetDeletedFiles().empty()) { job_context.Clean(); - return Status::OK(); + 
return status; } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), @@ -3288,10 +3846,16 @@ versions_->GetLiveFilesMetaData(metadata); } +Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { + InstrumentedMutexLock l(&mutex_); + return versions_->GetLiveFilesChecksumInfo(checksum_list); +} + void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* cf_meta) { assert(column_family); - auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* cfd = + static_cast_with_check(column_family)->cfd(); auto* sv = GetAndRefSuperVersion(cfd); { // Without mutex, Version::GetColumnFamilyMetaData will have data race with @@ -3309,6 +3873,17 @@ ReturnAndCleanupSuperVersion(cfd, sv); } +void DBImpl::GetAllColumnFamilyMetaData( + std::vector* metadata) { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *(versions_->GetColumnFamilySet())) { + { + metadata->emplace_back(); + cfd->current()->GetColumnFamilyMetaData(&metadata->back()); + } + } +} + #endif // ROCKSDB_LITE Status DBImpl::CheckConsistency() { @@ -3400,13 +3975,48 @@ return s; } - // If last character is '\n' remove it from identity + // If last character is '\n' remove it from identity. (Old implementations + // of Env::GenerateUniqueId() would include a trailing '\n'.) if (identity->size() > 0 && identity->back() == '\n') { identity->pop_back(); } return s; } +Status DBImpl::GetDbSessionId(std::string& session_id) const { + session_id.assign(db_session_id_); + return Status::OK(); +} + +namespace { +SemiStructuredUniqueIdGen* DbSessionIdGen() { + static SemiStructuredUniqueIdGen gen; + return &gen; +} +} // namespace + +void DBImpl::TEST_ResetDbSessionIdGen() { DbSessionIdGen()->Reset(); } + +std::string DBImpl::GenerateDbSessionId(Env*) { + // See SemiStructuredUniqueIdGen for its desirable properties. 
+ auto gen = DbSessionIdGen(); + + uint64_t lo, hi; + gen->GenerateNext(&hi, &lo); + if (lo == 0) { + // Avoid emitting session ID with lo==0, so that SST unique + // IDs can be more easily ensured non-zero + gen->GenerateNext(&hi, &lo); + assert(lo != 0); + } + return EncodeSessionId(hi, lo); +} + +void DBImpl::SetDbSessionId() { + db_session_id_ = GenerateDbSessionId(env_); + TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_); +} + // Default implementation -- returns not supported status Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, const std::string& /*column_family_name*/, @@ -3437,6 +4047,10 @@ } Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { + if (DefaultColumnFamily() == column_family) { + return Status::InvalidArgument( + "Cannot destroy the handle returned by DefaultColumnFamily()"); + } delete column_family; return Status::OK(); } @@ -3444,30 +4058,27 @@ DB::~DB() {} Status DBImpl::Close() { - if (!closed_) { - { - InstrumentedMutexLock l(&mutex_); - // If there is unreleased snapshot, fail the close call - if (!snapshots_.empty()) { - return Status::Aborted("Cannot close DB with unreleased snapshot."); - } + InstrumentedMutexLock closing_lock_guard(&closing_mutex_); + if (closed_) { + return closing_status_; + } + { + InstrumentedMutexLock l(&mutex_); + // If there is unreleased snapshot, fail the close call + if (!snapshots_.empty()) { + return Status::Aborted("Cannot close DB with unreleased snapshot."); } - - closed_ = true; - return CloseImpl(); } - return Status::OK(); + closing_status_ = CloseImpl(); + closed_ = true; + return closing_status_; } Status DB::ListColumnFamilies(const DBOptions& db_options, const std::string& name, std::vector* column_families) { - FileSystem* fs = db_options.file_system.get(); - LegacyFileSystemWrapper legacy_fs(db_options.env); - if (!fs) { - fs = &legacy_fs; - } - return VersionSet::ListColumnFamilies(column_families, name, fs); + const 
std::shared_ptr& fs = db_options.env->GetFileSystem(); + return VersionSet::ListColumnFamilies(column_families, name, fs.get()); } Snapshot::~Snapshot() {} @@ -3477,13 +4088,13 @@ ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; - bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions); + bool wal_in_db_path = soptions.IsWalDirSameAsDBPath(); // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal soptions.info_log.reset(); // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames); + env->GetChildren(dbname, &filenames).PermitUncheckedError(); FileLock* lock; const std::string lockname = LockFileName(dbname); @@ -3499,57 +4110,53 @@ std::string path_to_delete = dbname + "/" + fname; if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); - } else if (type == kTableFile || type == kLogFile) { - del = DeleteDBFile(&soptions, path_to_delete, dbname, - /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); + } else if (type == kTableFile || type == kWalFile || + type == kBlobFile) { + del = DeleteDBFile( + &soptions, path_to_delete, dbname, + /*force_bg=*/false, + /*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false); } else { del = env->DeleteFile(path_to_delete); } - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - std::vector paths; - - for (const auto& path : options.db_paths) { - paths.emplace_back(path.path); - } - for (const auto& cf : column_families) { - for (const auto& path : cf.options.cf_paths) { - paths.emplace_back(path.path); + std::set paths; + for (const DbPath& db_path : options.db_paths) { + paths.insert(db_path.path); + } + for (const ColumnFamilyDescriptor& cf : column_families) { + for (const DbPath& cf_path : cf.options.cf_paths) { + paths.insert(cf_path.path); } } - - // Remove duplicate paths. 
- // Note that we compare only the actual paths but not path ids. - // This reason is that same path can appear at different path_ids - // for different column families. - std::sort(paths.begin(), paths.end()); - paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); - for (const auto& path : paths) { if (env->GetChildren(path, &filenames).ok()) { for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && - type == kTableFile) { // Lock file will be deleted at end - std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname, + (type == kTableFile || + type == kBlobFile)) { // Lock file will be deleted at end + std::string file_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, file_path, dbname, /*force_bg=*/false, /*force_fg=*/false); - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - env->DeleteDir(path); + // TODO: Should we return an error if we cannot delete the directory? + env->DeleteDir(path).PermitUncheckedError(); } } std::vector walDirFiles; std::string archivedir = ArchivalDirectory(dbname); bool wal_dir_exists = false; - if (dbname != soptions.wal_dir) { + if (!soptions.IsWalDirSameAsDBPath(dbname)) { wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok(); archivedir = ArchivalDirectory(soptions.wal_dir); } @@ -3561,42 +4168,47 @@ if (env->GetChildren(archivedir, &archiveFiles).ok()) { // Delete archival files. 
for (const auto& file : archiveFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - env->DeleteDir(archivedir); + // Ignore error in case dir contains other files + env->DeleteDir(archivedir).PermitUncheckedError(); } // Delete log files in the WAL dir if (wal_dir_exists) { for (const auto& file : walDirFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), soptions.wal_dir, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - env->DeleteDir(soptions.wal_dir); + // Ignore error in case dir contains other files + env->DeleteDir(soptions.wal_dir).PermitUncheckedError(); } - env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(lockname); + // Ignore error since state is already gone + env->UnlockFile(lock).PermitUncheckedError(); + env->DeleteFile(lockname).PermitUncheckedError(); // sst_file_manager holds a ref to the logger. Make sure the logger is // gone before trying to remove the directory. 
soptions.sst_file_manager.reset(); - env->DeleteDir(dbname); // Ignore error in case dir contains other files + // Ignore error in case dir contains other files + env->DeleteDir(dbname).PermitUncheckedError(); + ; } return result; } @@ -3634,11 +4246,13 @@ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1"); TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2"); + TEST_SYNC_POINT_CALLBACK("DBImpl::WriteOptionsFile:PersistOptions", + &db_options); std::string file_name = TempOptionsFileName(GetName(), versions_->NewFileNumber()); Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name, - GetFileSystem()); + fs_.get()); if (s.ok()) { s = RenameTempFileToOptionsFile(file_name); @@ -3723,15 +4337,29 @@ uint64_t options_file_number = versions_->NewFileNumber(); std::string options_file_name = OptionsFileName(GetName(), options_file_number); - // Retry if the file name happen to conflict with an existing one. - s = GetEnv()->RenameFile(file_name, options_file_name); + uint64_t options_file_size = 0; + s = GetEnv()->GetFileSize(file_name, &options_file_size); + if (s.ok()) { + // Retry if the file name happen to conflict with an existing one. + s = GetEnv()->RenameFile(file_name, options_file_name); + std::unique_ptr dir_obj; + if (s.ok()) { + s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr); + } + if (s.ok()) { + s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + DirFsyncOptions(options_file_name)); + } + } if (s.ok()) { InstrumentedMutexLock l(&mutex_); versions_->options_file_number_ = options_file_number; + versions_->options_file_size_ = options_file_size; } if (0 == disable_delete_obsolete_files_) { - DeleteObsoleteOptionsFiles(); + // TODO: Should we check for errors here? 
+ DeleteObsoleteOptionsFiles().PermitUncheckedError(); } return s; #else @@ -3772,16 +4400,17 @@ // // A global method that can dump out the build version void DumpRocksDBBuildVersion(Logger* log) { -#if !defined(IOS_CROSS_COMPILE) - // if we compile with Xcode, we don't run build_detect_version, so we don't - // generate util/build_version.cc - ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, - ROCKSDB_MINOR, ROCKSDB_PATCH); - ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha); - ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date); -#else - (void)log; // ignore "-Wunused-parameter" -#endif + ROCKS_LOG_HEADER(log, "RocksDB version: %s\n", + GetRocksVersionAsString().c_str()); + const auto& props = GetRocksBuildProperties(); + const auto& sha = props.find("rocksdb_build_git_sha"); + if (sha != props.end()) { + ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str()); + } + const auto date = props.find("rocksdb_build_date"); + if (date != props.end()) { + ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str()); + } } #ifndef ROCKSDB_LITE @@ -3798,29 +4427,41 @@ return earliest_seq; } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE -Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, - SequenceNumber lower_bound_seq, - SequenceNumber* seq, - bool* found_record_for_key, - bool* is_blob_index) { +Status DBImpl::GetLatestSequenceForKey( + SuperVersion* sv, const Slice& key, bool cache_only, + SequenceNumber lower_bound_seq, SequenceNumber* seq, std::string* timestamp, + bool* found_record_for_key, bool* is_blob_index) { Status s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); - LookupKey lkey(key, current_seq); + + ColumnFamilyData* cfd = sv->cfd; + assert(cfd); + const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); 
+ std::string ts_buf; + if (ts_sz > 0) { + assert(timestamp); + ts_buf.assign(ts_sz, '\xff'); + } else { + assert(!timestamp); + } + Slice ts(ts_buf); + + LookupKey lkey(key, current_seq, ts_sz == 0 ? nullptr : &ts); *seq = kMaxSequenceNumber; *found_record_for_key = false; // Check if there is a record for this key in the latest memtable - sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, - seq, read_options, nullptr /*read_callback*/, is_blob_index); + sv->mem->Get(lkey, /*value=*/nullptr, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, seq, read_options, + nullptr /*read_callback*/, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -3830,6 +4471,10 @@ return s; } + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check immutable memtables @@ -3845,8 +4490,9 @@ } // Check if there is a record for this key in the immutable memtables - sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, - seq, read_options, nullptr /*read_callback*/, is_blob_index); + sv->imm->Get(lkey, /*value=*/nullptr, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, seq, read_options, + nullptr /*read_callback*/, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. 
@@ -3857,6 +4503,11 @@ return s; } + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); + if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check memtable history *found_record_for_key = true; @@ -3871,9 +4522,9 @@ } // Check if there is a record for this key in the immutable memtables - sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, - &max_covering_tombstone_seq, seq, read_options, - is_blob_index); + sv->imm->GetFromHistory(lkey, /*value=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, seq, + read_options, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -3885,8 +4536,13 @@ return s; } + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check SST files + assert(0 == ts_sz || *timestamp != std::string(ts_sz, '\xff')); *found_record_for_key = true; return Status::OK(); } @@ -3899,8 +4555,10 @@ // SST files if cache_only=true? if (!cache_only) { // Check tables - sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, - &max_covering_tombstone_seq, nullptr /* value_found */, + PinnedIteratorsManager pinned_iters_mgr; + sv->current->Get(read_options, lkey, /*value=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, + &pinned_iters_mgr, nullptr /* value_found */, found_record_for_key, seq, nullptr /*read_callback*/, is_blob_index); @@ -3944,7 +4602,7 @@ } } // Ingest multiple external SST files atomically. 
- size_t num_cfs = args.size(); + const size_t num_cfs = args.size(); for (size_t i = 0; i != num_cfs; ++i) { if (args[i].external_files.empty()) { char err_msg[128] = {0}; @@ -3981,14 +4639,11 @@ std::vector ingestion_jobs; for (const auto& arg : args) { auto* cfd = static_cast(arg.column_family)->cfd(); - ingestion_jobs.emplace_back( - env_, versions_.get(), cfd, immutable_db_options_, file_options_, - &snapshots_, arg.options, &directories_, &event_logger_); - } - std::vector> exec_results; - for (size_t i = 0; i != num_cfs; ++i) { - exec_results.emplace_back(false, Status::OK()); + ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_, + file_options_, &snapshots_, arg.options, + &directories_, &event_logger_, io_tracer_); } + // TODO(yanqin) maybe make jobs run in parallel uint64_t start_file_number = next_file_number; for (size_t i = 1; i != num_cfs; ++i) { @@ -3996,9 +4651,14 @@ auto* cfd = static_cast(args[i].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[i].second = ingestion_jobs[i].Prepare( - args[i].external_files, start_file_number, super_version); - exec_results[i].first = true; + Status es = ingestion_jobs[i].Prepare( + args[i].external_files, args[i].files_checksums, + args[i].files_checksum_func_names, args[i].file_temperature, + start_file_number, super_version); + // capture first error only + if (!es.ok() && status.ok()) { + status = es; + } CleanupSuperVersion(super_version); } TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); @@ -4007,22 +4667,18 @@ auto* cfd = static_cast(args[0].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[0].second = ingestion_jobs[0].Prepare( - args[0].external_files, next_file_number, super_version); - exec_results[0].first = true; - CleanupSuperVersion(super_version); - } - for (const auto& exec_result : exec_results) { - if (!exec_result.second.ok()) { - status = 
exec_result.second; - break; + Status es = ingestion_jobs[0].Prepare( + args[0].external_files, args[0].files_checksums, + args[0].files_checksum_func_names, args[0].file_temperature, + next_file_number, super_version); + if (!es.ok()) { + status = es; } + CleanupSuperVersion(super_version); } if (!status.ok()) { for (size_t i = 0; i != num_cfs; ++i) { - if (exec_results[i].first) { - ingestion_jobs[i].Cleanup(status); - } + ingestion_jobs[i].Cleanup(status); } InstrumentedMutexLock l(&mutex_); ReleaseFileNumberFromPendingOutputs(pending_output_elem); @@ -4122,14 +4778,11 @@ if (status.ok()) { int consumed_seqno_count = ingestion_jobs[0].ConsumedSequenceNumbersCount(); -#ifndef NDEBUG for (size_t i = 1; i != num_cfs; ++i) { - assert(!!consumed_seqno_count == - !!ingestion_jobs[i].ConsumedSequenceNumbersCount()); - consumed_seqno_count += - ingestion_jobs[i].ConsumedSequenceNumbersCount(); + consumed_seqno_count = + std::max(consumed_seqno_count, + ingestion_jobs[i].ConsumedSequenceNumbersCount()); } -#endif if (consumed_seqno_count > 0) { const SequenceNumber last_seqno = versions_->LastSequence(); versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count); @@ -4184,6 +4837,15 @@ #endif // !NDEBUG } } + } else if (versions_->io_status().IsIOError()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + const IOStatus& io_s = versions_->io_status(); + // Should handle return error? + error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); } // Resume writes to the DB @@ -4243,11 +4905,11 @@ } // Import sst files from metadata. 
- auto cfh = reinterpret_cast(*handle); + auto cfh = static_cast_with_check(*handle); auto cfd = cfh->cfd(); - ImportColumnFamilyJob import_job(env_, versions_.get(), cfd, - immutable_db_options_, file_options_, - import_options, metadata.files); + ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_, + file_options_, import_options, + metadata.files, io_tracer_); SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); VersionEdit dummy_edit; @@ -4338,15 +5000,49 @@ import_job.Cleanup(status); if (!status.ok()) { - DropColumnFamily(*handle); - DestroyColumnFamilyHandle(*handle); + Status temp_s = DropColumnFamily(*handle); + if (!temp_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DropColumnFamily failed with error %s", + temp_s.ToString().c_str()); + } + // Always returns Status::OK() + temp_s = DestroyColumnFamilyHandle(*handle); + assert(temp_s.ok()); *handle = nullptr; } return status; } +Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); +} + Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false); +} + +Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, + bool use_file_checksum) { + // `bytes_read` stat is enabled based on compile-time support and cannot + // be dynamically toggled. So we do not need to worry about `PerfLevel` + // here, unlike many other `IOStatsContext` / `PerfContext` stats. 
+ uint64_t prev_bytes_read = IOSTATS(bytes_read); + Status s; + + if (use_file_checksum) { + FileChecksumGenFactory* const file_checksum_gen_factory = + immutable_db_options_.file_checksum_gen_factory.get(); + if (!file_checksum_gen_factory) { + s = Status::InvalidArgument( + "Cannot verify file checksum if options.file_checksum_gen_factory is " + "null"); + return s; + } + } + + // TODO: simplify using GetRefedColumnFamilySet? std::vector cfd_list; { InstrumentedMutexLock l(&mutex_); @@ -4361,11 +5057,12 @@ for (auto cfd : cfd_list) { sv_list.push_back(cfd->GetReferencedSuperVersion(this)); } + for (auto& sv : sv_list) { VersionStorageInfo* vstorage = sv->current->storage_info(); ColumnFamilyData* cfd = sv->current->cfd(); Options opts; - { + if (!use_file_checksum) { InstrumentedMutexLock l(&mutex_); opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), cfd->GetLatestCFOptions()); @@ -4373,17 +5070,50 @@ for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); j++) { - const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; + const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j]; + const auto& fd = fd_with_krange.fd; + const FileMetaData* fmeta = fd_with_krange.file_metadata; + assert(fmeta); std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); - s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_, - read_options, fname); + if (use_file_checksum) { + s = VerifyFullFileChecksum(fmeta->file_checksum, + fmeta->file_checksum_func_name, fname, + read_options); + } else { + s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_, + read_options, fname); + } + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + prev_bytes_read = IOSTATS(bytes_read); + } + } + + if (s.ok() && use_file_checksum) { + const auto& blob_files = vstorage->GetBlobFiles(); 
+ for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + assert(meta); + const std::string blob_file_name = BlobFileName( + cfd->ioptions()->cf_paths.front().path, blob_file_number); + s = VerifyFullFileChecksum(meta->GetChecksumValue(), + meta->GetChecksumMethod(), blob_file_name, + read_options); + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + prev_bytes_read = IOSTATS(bytes_read); + if (!s.ok()) { + break; + } } } if (!s.ok()) { break; } } + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; { @@ -4405,6 +5135,38 @@ cfd->UnrefAndTryDelete(); } } + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + return s; +} + +Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fname, + const ReadOptions& read_options) { + Status s; + if (file_checksum_expected == kUnknownFileChecksum) { + return s; + } + std::string file_checksum; + std::string func_name; + s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum( + fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), + func_name_expected, &file_checksum, &func_name, + read_options.readahead_size, immutable_db_options_.allow_mmap_reads, + io_tracer_, immutable_db_options_.rate_limiter.get()); + if (s.ok()) { + assert(func_name_expected == func_name); + if (file_checksum != file_checksum_expected) { + std::ostringstream oss; + oss << fname << " file checksum mismatch, "; + oss << "expecting " + << Slice(file_checksum_expected).ToString(/*hex=*/true); + oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true); + s = Status::Corruption(oss.str()); + TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s); + } + } return s; } @@ -4437,7 +5199,8 @@ Status DBImpl::StartTrace(const TraceOptions& trace_options, std::unique_ptr&& 
trace_writer) { InstrumentedMutexLock lock(&trace_mutex_); - tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer))); + tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options, + std::move(trace_writer))); return Status::OK(); } @@ -4448,16 +5211,24 @@ s = tracer_->Close(); tracer_.reset(); } else { - return Status::IOError("No trace file to close"); + s = Status::IOError("No trace file to close"); } return s; } +Status DBImpl::NewDefaultReplayer( + const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) { + replayer->reset(new ReplayerImpl(this, handles, std::move(reader))); + return Status::OK(); +} + Status DBImpl::StartBlockCacheTrace( const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { - return block_cache_tracer_.StartTrace(env_, trace_options, - std::move(trace_writer)); + return block_cache_tracer_.StartTrace(immutable_db_options_.clock, + trace_options, std::move(trace_writer)); } Status DBImpl::EndBlockCacheTrace() { @@ -4465,24 +5236,27 @@ return Status::OK(); } -Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeek(cf_id, key); + s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound); } } return s; } -Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, - const Slice& key) { +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeekForPrev(cf_id, key); + s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound); } } return s; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,8 +20,8 @@ #include #include "db/column_family.h" +#include "db/compaction/compaction_iterator.h" #include "db/compaction/compaction_job.h" -#include "db/dbformat.h" #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/external_sst_file_ingestion_job.h" @@ -50,12 +50,16 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/status.h" +#ifndef ROCKSDB_LITE #include "rocksdb/trace_reader_writer.h" +#endif // ROCKSDB_LITE #include "rocksdb/transaction_log.h" +#ifndef ROCKSDB_LITE +#include "rocksdb/utilities/replayer.h" +#endif // ROCKSDB_LITE #include "rocksdb/write_buffer_manager.h" +#include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" -#include "trace_replay/block_cache_tracer.h" -#include "trace_replay/trace_replay.h" #include "util/autovector.h" #include "util/hash.h" #include "util/repeatable_thread.h" @@ -69,6 +73,10 @@ class InMemoryStatsHistoryIterator; class MemTable; class PersistentStatsHistoryIterator; +class PeriodicWorkScheduler; +#ifndef NDEBUG +class PeriodicWorkTestScheduler; +#endif // !NDEBUG class TableCache; class TaskLimiterToken; class Version; @@ -82,13 +90,13 @@ // Class to maintain directories for all database paths other than main one. 
class Directories { public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths); + IOStatus SetDirectories(FileSystem* fs, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); - Directory* GetDataDir(size_t path_id) const { + FSDirectory* GetDataDir(size_t path_id) const { assert(path_id < data_dirs_.size()); - Directory* ret_dir = data_dirs_[path_id].get(); + FSDirectory* ret_dir = data_dirs_[path_id].get(); if (ret_dir == nullptr) { // Should use db_dir_ return db_dir_.get(); @@ -96,19 +104,19 @@ return ret_dir; } - Directory* GetWalDir() { + FSDirectory* GetWalDir() { if (wal_dir_) { return wal_dir_.get(); } return db_dir_.get(); } - Directory* GetDbDir() { return db_dir_.get(); } + FSDirectory* GetDbDir() { return db_dir_.get(); } private: - std::unique_ptr db_dir_; - std::vector> data_dirs_; - std::unique_ptr wal_dir_; + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; }; // While DB is the public interface of RocksDB, and DBImpl is the actual @@ -127,7 +135,8 @@ class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch = false, const bool batch_per_txn = true); + const bool seq_per_batch = false, const bool batch_per_txn = true, + bool read_only = false); // No copying allowed DBImpl(const DBImpl&) = delete; void operator=(const DBImpl&) = delete; @@ -163,6 +172,9 @@ virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) override; using DB::GetMergeOperands; Status GetMergeOperands(const ReadOptions& options, @@ -185,6 +197,11 @@ const std::vector& column_family, const std::vector& keys, std::vector* values) override; + virtual 
std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values, + std::vector* timestamps) override; // This MultiGet is a batched version, which may be faster than calling Get // multiple times, especially if the keys have some spatial locality that @@ -198,11 +215,22 @@ const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input = false) override; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, + const bool sorted_input = false) override; virtual void MultiGet(const ReadOptions& options, const size_t num_keys, ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input = false) override; + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, + const bool sorted_input = false) override; virtual void MultiGetWithCallback( const ReadOptions& options, ColumnFamilyHandle* column_family, @@ -230,7 +258,7 @@ using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, + std::string* value, std::string* timestamp, bool* value_found = nullptr) override; using DB::NewIterator; @@ -327,16 +355,34 @@ virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; + // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire + // and release db_mutex + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override; + + // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and + // release db_mutex + Status 
GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override; + virtual Status GetDbIdentity(std::string& identity) const override; virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; + virtual Status GetDbSessionId(std::string& session_id) const override; + ColumnFamilyHandle* DefaultColumnFamily() const override; ColumnFamilyHandle* PersistentStatsColumnFamily() const; virtual Status Close() override; + virtual Status DisableFileDeletions() override; + + virtual Status EnableFileDeletions(bool force) override; + + virtual bool IsFileDeletionsEnabled() const; + Status GetStatsHistory( uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; @@ -344,9 +390,6 @@ #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; - virtual Status DisableFileDeletions() override; - virtual Status EnableFileDeletions(bool force) override; - virtual int IsFileDeletionsEnabled() const; // All the returned filenames start with "/" virtual Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, @@ -369,13 +412,21 @@ virtual void GetLiveFilesMetaData( std::vector* metadata) override; + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override; + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override; + // Obtains the meta data of the specified column family of the DB. - // Status::NotFound() will be returned if the current DB does not have - // any column family match the specified name. // TODO(yhchiang): output parameter is placed in the end in this codebase. 
virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) override; + void GetAllColumnFamilyMetaData( + std::vector* metadata) override; + Status SuggestCompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) override; @@ -399,8 +450,29 @@ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_options) override; + using DB::VerifyChecksum; virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; + // Verify the checksums of files in db. Currently only tables are checked. + // + // read_options: controls file I/O behavior, e.g. read ahead size while + // reading all the live table files. + // + // use_file_checksum: if false, verify the block checksums of all live table + // in db. Otherwise, obtain the file checksums and compare + // with the MANIFEST. Currently, file checksums are + // recomputed by reading all table files. + // + // Returns: OK if there is no file whose file or block checksum mismatches. 
+ Status VerifyChecksumInternal(const ReadOptions& read_options, + bool use_file_checksum); + + Status VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fpath, + const ReadOptions& read_options); using DB::StartTrace; virtual Status StartTrace( @@ -410,6 +482,12 @@ using DB::EndTrace; virtual Status EndTrace() override; + using DB::NewDefaultReplayer; + virtual Status NewDefaultReplayer( + const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override; + using DB::StartBlockCacheTrace; Status StartBlockCacheTrace( const TraceOptions& options, @@ -418,6 +496,13 @@ using DB::EndBlockCacheTrace; Status EndBlockCacheTrace() override; + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndIOTrace; + Status EndIOTrace() override; + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( ColumnFamilyHandle* column_family, @@ -429,10 +514,12 @@ #endif // ROCKSDB_LITE // ---- End of implementations of the DB interface ---- + SystemClock* GetSystemClock() const; struct GetImplOptions { ColumnFamilyHandle* column_family = nullptr; PinnableSlice* value = nullptr; + std::string* timestamp = nullptr; bool* value_found = nullptr; ReadCallback* callback = nullptr; bool* is_blob_index = nullptr; @@ -455,13 +542,14 @@ // If get_impl_options.get_value = false get merge operands associated with // get_impl_options.key via get_impl_options.merge_operands Status GetImpl(const ReadOptions& options, const Slice& key, - GetImplOptions get_impl_options); + GetImplOptions& get_impl_options); + // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file. 
ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob = false, + bool expose_blob_index = false, bool allow_refresh = true); virtual SequenceNumber GetLastPublishedSequence() const { @@ -504,9 +592,15 @@ // in the memtables, including memtable history. If cache_only is false, // SST files will also be checked. // + // `key` should NOT have user-defined timestamp appended to user key even if + // timestamp is enabled. + // // If a key is found, *found_record_for_key will be set to true and // *seq will be set to the stored sequence number for the latest - // operation on this key or kMaxSequenceNumber if unknown. + // operation on this key or kMaxSequenceNumber if unknown. If user-defined + // timestamp is enabled for this column family and timestamp is not nullptr, + // then *timestamp will be set to the stored timestamp for the latest + // operation on this key. // If no key is found, *found_record_for_key will be set to false. 
// // Note: If cache_only=false, it is possible for *seq to be set to 0 if @@ -530,12 +624,15 @@ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, bool cache_only, SequenceNumber lower_bound_seq, - SequenceNumber* seq, + SequenceNumber* seq, std::string* timestamp, bool* found_record_for_key, - bool* is_blob_index = nullptr); + bool* is_blob_index); - Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); - Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound); #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -561,9 +658,16 @@ // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. + // If allow_unprepared_value is true, the returned iterator may defer reading + // the value and so will require PrepareValue() to be called before value(); + // allow_unprepared_value = false is convenient when this optimization is not + // useful, e.g. when reading the whole column family. + // @param read_options Must outlive the returned iterator. 
InternalIterator* NewInternalIterator( - Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, - ColumnFamilyHandle* column_family = nullptr); + const ReadOptions& read_options, Arena* arena, + RangeDelAggregator* range_del_agg, SequenceNumber sequence, + ColumnFamilyHandle* column_family = nullptr, + bool allow_unprepared_value = false); LogsWithPrepTracker* logs_with_prep_tracker() { return &logs_with_prep_tracker_; @@ -687,9 +791,14 @@ const WriteController& write_controller() { return write_controller_; } - InternalIterator* NewInternalIterator( - const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version, - Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence); + // @param read_options Must outlive the returned iterator. + InternalIterator* NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence, + bool allow_unprepared_value); // hollow transactions shell used for recovery. // these will then be passed to TransactionDB so that @@ -817,8 +926,8 @@ InstrumentedMutex* mutex() const { return &mutex_; } // Initialize a brand new DB. The DB directory is expected to be empty before - // calling it. - Status NewDB(); + // calling it. Push new manifest file name into `new_filenames`. + Status NewDB(std::vector* new_filenames); // This is to be used only by internal rocksdb classes. 
static Status Open(const DBOptions& db_options, const std::string& name, @@ -826,8 +935,9 @@ std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); - static Status CreateAndNewDirectory(Env* env, const std::string& dirname, - std::unique_ptr* directory); + static IOStatus CreateAndNewDirectory( + FileSystem* fs, const std::string& dirname, + std::unique_ptr* directory); // find stats map from stats_history_ with smallest timestamp in // the range of [start_time, end_time) @@ -842,13 +952,15 @@ int max_entries_to_print, std::string* out_str); + VersionSet* GetVersionSet() const { return versions_.get(); } + #ifndef NDEBUG // Compact any files in the named level that overlap [*begin, *end] Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, ColumnFamilyHandle* column_family = nullptr, bool disallow_trivial_move = false); - void TEST_SwitchWAL(); + Status TEST_SwitchWAL(); bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } @@ -872,6 +984,9 @@ Status TEST_AtomicFlushMemTables(const autovector& cfds, const FlushOptions& flush_opts); + // Wait for background threads to complete scheduled work. + Status TEST_WaitForBackgroundWork(); + // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); @@ -880,9 +995,15 @@ // is only for the special test of CancelledCompactions Status TEST_WaitForCompact(bool waitUnscheduled = false); + // Wait for any background purge + Status TEST_WaitForPurge(); + + // Get the background error status + Status TEST_GetBGError(); + // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes( + uint64_t TEST_MaxNextLevelOverlappingBytes( ColumnFamilyHandle* column_family = nullptr); // Return the current manifest file no. @@ -894,8 +1015,10 @@ // get total level0 file size. Only for testing. 
uint64_t TEST_GetLevel0TotalSize(); - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, - std::vector>* metadata); + void TEST_GetFilesMetaData( + ColumnFamilyHandle* column_family, + std::vector>* metadata, + std::vector>* blob_metadata = nullptr); void TEST_LockMutex(); @@ -938,22 +1061,104 @@ int TEST_BGCompactionsAllowed() const; int TEST_BGFlushesAllowed() const; size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - void TEST_WaitForDumpStatsRun(std::function callback) const; - void TEST_WaitForPersistStatsRun(std::function callback) const; - bool TEST_IsPersistentStatsEnabled() const; + void TEST_WaitForStatsDumpRun(std::function callback) const; size_t TEST_EstimateInMemoryStatsHistorySize() const; + + uint64_t TEST_GetCurrentLogNumber() const { + InstrumentedMutexLock l(mutex()); + assert(!logs_.empty()); + return logs_.back().number; + } + + const std::unordered_set& TEST_GetFilesGrabbedForPurge() const { + return files_grabbed_for_purge_; + } + +#ifndef ROCKSDB_LITE + PeriodicWorkTestScheduler* TEST_GetPeriodicWorkScheduler() const; +#endif // !ROCKSDB_LITE + #endif // NDEBUG + // persist stats to column family "_persistent_stats" + void PersistStats(); + + // dump rocksdb.stats to LOG + void DumpStats(); + + // flush LOG out of application buffer + void FlushInfoLog(); + + // Interface to block and signal the DB in case of stalling writes by + // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. + // When DB needs to be blocked or signalled by WriteBufferManager, + // state_ is changed accordingly. 
+ class WBMStallInterface : public StallInterface { + public: + enum State { + BLOCKED = 0, + RUNNING, + }; + + WBMStallInterface() : state_cv_(&state_mutex_) { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + + void SetState(State state) { + MutexLock lock(&state_mutex_); + state_ = state; + } + + // Change the state_ to State::BLOCKED and wait until its state is + // changed by WriteBufferManager. When stall is cleared, Signal() is + // called to change the state and unblock the DB. + void Block() override { + MutexLock lock(&state_mutex_); + while (state_ == State::BLOCKED) { + TEST_SYNC_POINT("WBMStallInterface::BlockDB"); + state_cv_.Wait(); + } + } + + // Called from WriteBufferManager. This function changes the state_ + // to State::RUNNING indicating the stall is cleared and DB can proceed. + void Signal() override { + { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + state_cv_.Signal(); + } + + private: + // Conditional variable and mutex to block and + // signal the DB during stalling process. + port::Mutex state_mutex_; + port::CondVar state_cv_; + // state represting whether DB is running or blocked because of stall by + // WriteBufferManager. 
+ State state_; + }; + + static void TEST_ResetDbSessionIdGen(); + static std::string GenerateDbSessionId(Env* env); + protected: const std::string dbname_; + // TODO(peterd): unify with VersionSet::db_id_ std::string db_id_; + // db_session_id_ is an identifier that gets reset + // every time the DB is opened + std::string db_session_id_; std::unique_ptr versions_; // Flag to check whether we allocated and own the info log file bool own_info_log_; const DBOptions initial_db_options_; Env* const env_; - std::shared_ptr fs_; + std::shared_ptr io_tracer_; const ImmutableDBOptions immutable_db_options_; + FileSystemPtr fs_; MutableDBOptions mutable_db_options_; Statistics* stats_; std::unordered_map @@ -972,6 +1177,14 @@ ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; + // table_cache_ provides its own synchronization + std::shared_ptr table_cache_; + + ErrorHandler error_handler_; + + // Unified interface for logging events + EventLogger event_logger_; + // only used for dynamically adjusting max_total_wal_size. it is a sum of // [write_buffer_size * max_write_buffer_number] over all column families uint64_t max_total_in_memory_state_; @@ -1002,12 +1215,22 @@ // Default: true const bool batch_per_txn_; + // Each flush or compaction gets its own job id. this counter makes sure + // they're unique + std::atomic next_job_id_; + + std::atomic shutting_down_; + // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. // If need_mutex_lock = false, the method will lock DB mutex. // If need_enter_write_thread = false, the method will enter write thread. Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread); + Status CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); + // The following two functions can only be called when: // 1. WriteThread::Writer::EnterUnbatched() is used. // 2. 
db_mutex is NOT held @@ -1036,6 +1259,8 @@ #ifndef ROCKSDB_LITE void NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job); + + Status FlushForGetLiveFiles(); #endif // !ROCKSDB_LITE void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; @@ -1113,12 +1338,33 @@ // skipped. virtual Status Recover( const std::vector& column_families, - bool read_only = false, bool error_if_log_file_exist = false, - bool error_if_data_exists_in_logs = false, + bool read_only = false, bool error_if_wal_file_exists = false, + bool error_if_data_exists_in_wals = false, uint64_t* recovered_seq = nullptr); virtual bool OwnTablesAndLogs() const { return true; } + // Set DB identity file, and write DB ID to manifest if necessary. + Status SetDBId(bool read_only); + + // REQUIRES: db mutex held when calling this function, but the db mutex can + // be released and re-acquired. Db mutex will be held when the function + // returns. + // After recovery, there may be SST files in db/cf paths that are + // not referenced in the MANIFEST (e.g. + // 1. It's best effort recovery; + // 2. The VersionEdits referencing the SST files are appended to + // MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are + // still not synced to MANIFEST during recovery.) + // We delete these SST files. In the + // meantime, we find out the largest file number present in the paths, and + // bump up the version set's next_file_number_ to be 1 + largest_file_number. 
+ Status DeleteUnreferencedSstFiles(); + + // SetDbSessionId() should be called in the constuctor DBImpl() + // to ensure that db_session_id_ gets updated every time the DB is opened + void SetDbSessionId(); + private: friend class DB; friend class ErrorHandler; @@ -1144,7 +1390,7 @@ friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; - friend class WriteCallbackTest_WriteWithCallbackTest_Test; + friend class WriteCallbackPTest_WriteWithCallbackTest_Test; friend class XFTransactionWriteHandler; friend class DBBlobIndexTest; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; @@ -1171,6 +1417,7 @@ struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + LogFileNumberSize() {} void AddSize(uint64_t new_size) { size += new_size; } uint64_t number; uint64_t size = 0; @@ -1245,21 +1492,34 @@ // Information for a manual compaction struct ManualCompactionState { + ManualCompactionState(ColumnFamilyData* _cfd, int _input_level, + int _output_level, uint32_t _output_path_id, + bool _exclusive, bool _disallow_trivial_move, + std::atomic* _canceled) + : cfd(_cfd), + input_level(_input_level), + output_level(_output_level), + output_path_id(_output_path_id), + exclusive(_exclusive), + disallow_trivial_move(_disallow_trivial_move), + canceled(_canceled) {} + ColumnFamilyData* cfd; int input_level; int output_level; uint32_t output_path_id; Status status; - bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted + bool done = false; + bool in_progress = false; // compaction request being processed? 
+ bool incomplete = false; // only part of requested range compacted bool exclusive; // current behavior of only one manual bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress + const InternalKey* begin = nullptr; // nullptr means beginning of key range + const InternalKey* end = nullptr; // nullptr means end of key range + InternalKey* manual_end = nullptr; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + std::atomic* canceled; // Compaction canceled by the user? }; struct PrepickedCompaction { // background compaction takes ownership of `compaction`. @@ -1276,6 +1536,7 @@ DBImpl* db; // background compaction takes ownership of `prepicked_compaction`. PrepickedCompaction* prepicked_compaction; + Env::Priority compaction_pri_; }; // Initialize the built-in column family for persistent stats. Depending on @@ -1293,7 +1554,7 @@ // Required: DB mutex held Status PersistentStatsProcessFormatVersion(); - Status ResumeImpl(); + Status ResumeImpl(DBRecoverContext context); void MaybeIgnoreError(Status* s) const; @@ -1332,7 +1593,7 @@ void ReleaseFileNumberFromPendingOutputs( std::unique_ptr::iterator>& v); - Status SyncClosedLogs(JobContext* job_context); + IOStatus SyncClosedLogs(JobContext* job_context); // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. 
Then @@ -1370,6 +1631,12 @@ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Get the size of a log file and, if truncate is true, truncate the + // log file to its actual size, thereby freeing preallocated space. + // Return success even if truncate fails + Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log); + // Restore alive_log_files_ and total_log_size_ after recovery. // It needs to run only when there's no flush during recovery // (e.g. avoid_flush_during_recovery=true). May also trigger flush @@ -1380,6 +1647,10 @@ // `num_bytes` going through. Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); + // Begin stalling of writes when memory usage increases beyond a certain + // threshold. + void WriteBufferManagerStallWrites(); + Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch); @@ -1452,6 +1723,25 @@ } } + // TaskType is used to identify tasks in thread-pool, currently only + // differentiate manual compaction, which could be unscheduled from the + // thread-pool. + enum class TaskType : uint8_t { + kDefault = 0, + kManualCompaction = 1, + kCount = 2, + }; + + // Task tag is used to identity tasks in thread-pool, which is + // dbImpl obj address + type + inline void* GetTaskTag(TaskType type) { + return GetTaskTag(static_cast(type)); + } + + inline void* GetTaskTag(uint8_t type) { + return static_cast(static_cast(this)) + type; + } + // REQUIRES: mutex locked and in write thread. void AssignAtomicFlushSeq(const autovector& cfds); @@ -1459,7 +1749,7 @@ Status SwitchWAL(WriteContext* write_context); // REQUIRES: mutex locked and in write thread. 
- Status HandleWriteBufferFull(WriteContext* write_context); + Status HandleWriteBufferManagerFlush(WriteContext* write_context); // REQUIRES: mutex locked Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync, @@ -1469,21 +1759,30 @@ WriteBatch* tmp_batch, size_t* write_with_wal, WriteBatch** to_be_cached_state); - Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, - uint64_t* log_used, uint64_t* log_size); + IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, + uint64_t* log_used, uint64_t* log_size, + bool with_db_mutex = false, bool with_log_mutex = false); + + IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* log_used, + bool need_log_sync, bool need_log_dir_sync, + SequenceNumber sequence); + + IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, + uint64_t* log_used, + SequenceNumber* last_sequence, size_t seq_inc); - Status WriteToWAL(const WriteThread::WriteGroup& write_group, - log::Writer* log_writer, uint64_t* log_used, - bool need_log_sync, bool need_log_dir_sync, - SequenceNumber sequence); - - Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, - uint64_t* log_used, SequenceNumber* last_sequence, - size_t seq_inc); + // Used by WriteImpl to update bg_error_ if paranoid check is enabled. + // Caller must hold mutex_. + void WriteStatusCheckOnLocked(const Status& status); // Used by WriteImpl to update bg_error_ if paranoid check is enabled. void WriteStatusCheck(const Status& status); + // Used by WriteImpl to update bg_error_ when IO error happens, e.g., write + // WAL, sync WAL fails, if paranoid check is enabled. + void IOStatusCheck(const IOStatus& status); + // Used by WriteImpl to update bg_error_ in case of memtable insert error. 
void MemTableInsertStatusCheck(const Status& memtable_insert_status); @@ -1517,7 +1816,7 @@ // specified value, this flush request is considered to have completed its // work of flushing this column family. After completing the work for all // column families in this request, this flush is considered complete. - typedef std::vector> FlushRequest; + using FlushRequest = std::vector>; void GenerateFlushRequest(const autovector& cfds, FlushRequest* req); @@ -1558,18 +1857,12 @@ LogBuffer* log_buffer); // Schedule background tasks - void StartTimedTasks(); + void StartPeriodicWorkScheduler(); void PrintStatistics(); size_t EstimateInMemoryStatsHistorySize() const; - // persist stats to column family "_persistent_stats" - void PersistStats(); - - // dump rocksdb.stats to LOG - void DumpStats(); - // Return the minimum empty level that could hold the total data in the // input level. Return the input level, if such level could not be found. int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, @@ -1591,14 +1884,16 @@ std::unique_ptr* token, LogBuffer* log_buffer); // helper function to call after some of the logs_ were synced - void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); + Status MarkLogsSynced(uint64_t up_to, bool synced_dir); + // WALs with log number up to up_to are not synced successfully. 
+ void MarkLogsNotSynced(uint64_t up_to); SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, bool lock = true); uint64_t GetMaxTotalWalSize() const; - Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; + FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; Status CloseHelper(); @@ -1648,8 +1943,8 @@ size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); + IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); // Validate self-consistency of DB options static Status ValidateOptions(const DBOptions& db_options); @@ -1727,14 +2022,15 @@ // to have acquired the SuperVersion and pass in a snapshot sequence number // in order to construct the LookupKeys. The start_key and num_keys specify // the range of keys in the sorted_keys vector for a single column family. - void MultiGetImpl( + Status MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, - SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, - bool* is_blob_index); + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback); - // table_cache_ provides its own synchronization - std::shared_ptr table_cache_; + Status DisableFileDeletionsWithLock(); + + Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, + std::string ts_low); // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; @@ -1749,8 +2045,13 @@ // mutex_, the order should be first mutex_ and then log_write_mutex_. InstrumentedMutex log_write_mutex_; - std::atomic shutting_down_; - std::atomic manual_compaction_paused_; + // If zero, manual compactions are allowed to proceed. 
If non-zero, manual + // compactions may still be running, but will quickly fail with + // `Status::Incomplete`. The value indicates how many threads have paused + // manual compactions. It is accessed in read mode outside the DB mutex in + // compaction code paths. + std::atomic manual_compaction_paused_; + // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't @@ -1778,7 +2079,7 @@ // accessed from the same write_thread_ without any locks. With // two_write_queues writes, where it can be updated in different threads, // read and writes are protected by log_write_mutex_ instead. This is to avoid - // expesnive mutex_ lock during WAL write, which update log_empty_. + // expensive mutex_ lock during WAL write, which update log_empty_. bool log_empty_; ColumnFamilyHandleImpl* persist_stats_cf_handle_; @@ -1786,12 +2087,15 @@ bool persistent_stats_cfd_exists_ = true; // Without two_write_queues, read and writes to alive_log_files_ are - // protected by mutex_. However since back() is never popped, and push_back() - // is done only from write_thread_, the same thread can access the item - // reffered by back() without mutex_. With two_write_queues_, writes + // protected by mutex_. With two_write_queues_, writes // are protected by locking both mutex_ and log_write_mutex_, and reads must // be under either mutex_ or log_write_mutex_. std::deque alive_log_files_; + // Caching the result of `alive_log_files_.back()` so that we do not have to + // call `alive_log_files_.back()` in the write thread (WriteToWAL()) which + // requires locking db mutex if log_mutex_ is not already held in + // two-write-queues mode. + std::deque::reverse_iterator alive_log_files_tail_; // Log files that aren't fully synced, and the current log file. 
// Synchronization: // - push_back() is done from write_thread_ with locked mutex_ and @@ -1895,7 +2199,7 @@ std::unordered_map purge_files_; // A vector to store the file numbers that have been assigned to certain - // JobContext. Current implementation tracks ssts only. + // JobContext. Current implementation tracks table and blob files only. std::unordered_set files_grabbed_for_purge_; // A queue to store log writers to close @@ -1952,10 +2256,6 @@ // Number of threads intending to write to memtable std::atomic pending_memtable_writes_ = {}; - // Each flush or compaction gets its own job id. this counter makes sure - // they're unique - std::atomic next_job_id_; - // A flag indicating whether the current rocksdb database has any // data that is not yet persisted into either WAL or SST file. // Used when disableWAL is true. @@ -1984,9 +2284,6 @@ WalManager wal_manager_; #endif // ROCKSDB_LITE - // Unified interface for logging events - EventLogger event_logger_; - // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; @@ -2013,15 +2310,15 @@ // Only to be set during initialization std::unique_ptr recoverable_state_pre_release_callback_; - // handle for scheduling stats dumping at fixed intervals - // REQUIRES: mutex locked - std::unique_ptr thread_dump_stats_; - - // handle for scheduling stats snapshoting at fixed intervals - // REQUIRES: mutex locked - std::unique_ptr thread_persist_stats_; +#ifndef ROCKSDB_LITE + // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog(). + // Currently, it always use a global instance from + // PeriodicWorkScheduler::Default(). Only in unittest, it can be overrided by + // PeriodicWorkTestScheduler. + PeriodicWorkScheduler* periodic_work_scheduler_; +#endif - // When set, we use a separate queue for writes that dont write to memtable. + // When set, we use a separate queue for writes that don't write to memtable. // In 2PC these are the writes at Prepare phase. 
const bool two_write_queues_; const bool manual_wal_flush_; @@ -2053,8 +2350,10 @@ // Flag to check whether Close() has been called on this DB bool closed_; - - ErrorHandler error_handler_; + // save the closing status, for re-calling the close() + Status closing_status_; + // mutex for DB::Close() + InstrumentedMutex closing_mutex_; // Conditional variable to coordinate installation of atomic flush results. // With atomic flush, each bg thread installs the result of flushing multiple @@ -2068,11 +2367,22 @@ InstrumentedCondVar atomic_flush_install_cv_; bool wal_in_db_path_; + + BlobFileCompletionCallback blob_callback_; + + // Pointer to WriteBufferManager stalling interface. + std::unique_ptr wbm_stall_; + + // Indicate if deprecation warning message is logged before. Will be removed + // soon with the deprecated feature. + std::atomic_bool iter_start_seqnum_deprecation_warned_{false}; }; -extern Options SanitizeOptions(const std::string& db, const Options& src); +extern Options SanitizeOptions(const std::string& db, const Options& src, + bool read_only = false); -extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); +extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src, + bool read_only = false); extern CompressionType GetCompressionFlush( const ImmutableCFOptions& ioptions, @@ -2084,18 +2394,37 @@ // `memtables_to_flush`) will be flushed and thus will not depend on any WAL // file. // The function is only applicable to 2pc mode. -extern uint64_t PrecomputeMinLogNumberToKeep( +extern uint64_t PrecomputeMinLogNumberToKeep2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, + const autovector& edit_list, const autovector& memtables_to_flush, LogsWithPrepTracker* prep_tracker); +// For atomic flush. 
+extern uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker); + +// In non-2PC mode, WALs with log number < the returned number can be +// deleted after the cfd_to_flush column family is flushed successfully. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list); +// For atomic flush. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists); // `cfd_to_flush` is the column family whose memtable will be flushed and thus // will not depend on any WAL file. nullptr means no memtable is being flushed. // The function is only applicable to 2pc mode. extern uint64_t FindMinPrepLogReferencedByMemTable( - VersionSet* vset, const ColumnFamilyData* cfd_to_flush, - const autovector& memtables_to_flush); + VersionSet* vset, const autovector& memtables_to_flush); +// For atomic flush. +extern uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, + const autovector*>& memtables_to_flush); // Fix user-supplied options to be reasonable template diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,14 +6,15 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl/db_impl.h" - #include +#include #include "db/builder.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" @@ -36,8 +37,10 @@ // Pass the current bg_error_ to SFM so it can decide what checks to // perform. If this DB instance hasn't seen any error yet, the SFM can be // optimistic and not do disk space checks - enough_room = - sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); + Status bg_error = error_handler_.GetBGError(); + enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error); + bg_error.PermitUncheckedError(); // bg_error is just a copy of the Status + // from the error_handler_ if (enough_room) { *sfm_reserved_compact_space = true; } @@ -79,7 +82,7 @@ return false; } -Status DBImpl::SyncClosedLogs(JobContext* job_context) { +IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); mutex_.AssertHeld(); autovector logs_to_sync; @@ -96,42 +99,52 @@ logs_to_sync.push_back(log.writer); } - Status s; + IOStatus io_s; if (!logs_to_sync.empty()) { mutex_.Unlock(); + assert(job_context); + for (log::Writer* log : logs_to_sync) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, log->get_log_number()); - s = log->file()->Sync(immutable_db_options_.use_fsync); - if (!s.ok()) { + io_s = log->file()->Sync(immutable_db_options_.use_fsync); + if (!io_s.ok()) { break; } if (immutable_db_options_.recycle_log_file_num > 0) { - s = log->Close(); - if (!s.ok()) { + io_s = log->Close(); + if (!io_s.ok()) { break; } } } - if (s.ok()) { - s = directories_.GetWalDir()->Fsync(); + if (io_s.ok()) { + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + 
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } + TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock", + /*arg=*/nullptr); mutex_.Lock(); // "number <= current_log_number - 1" is equivalent to // "number < current_log_number". - MarkLogsSynced(current_log_number - 1, true, s); - if (!s.ok()) { - error_handler_.SetBGError(s, BackgroundErrorReason::kFlush); + if (io_s.ok()) { + io_s = status_to_io_status(MarkLogsSynced(current_log_number - 1, true)); + } else { + MarkLogsNotSynced(current_log_number - 1); + } + if (!io_s.ok()) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); - return s; + return io_s; } } - return s; + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end"); + return io_s; } Status DBImpl::FlushMemTableToOutputFile( @@ -143,44 +156,98 @@ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, Env::Priority thread_pri) { mutex_.AssertHeld(); + assert(cfd); + assert(cfd->imm()); assert(cfd->imm()->NumNotFlushed() != 0); assert(cfd->imm()->IsFlushPending()); + assert(versions_); + assert(versions_->GetColumnFamilySet()); + // If there are more than one column families, we need to make sure that + // all the log files except the most recent one are synced. Otherwise if + // the host crashes after flushing and before WAL is persistent, the + // flushed SST may contain data from write batches whose updates to + // other (unflushed) column families are missing. + const bool needs_to_sync_closed_wals = + logfile_number_ > 0 && + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1; + + // If needs_to_sync_closed_wals is true, we need to record the current + // maximum memtable ID of this column family so that a later PickMemtables() + // call will not pick memtables whose IDs are higher. This is due to the fact + // that SyncClosedLogs() may release the db mutex, and memtable switch can + // happen for this column family in the meantime. 
The newly created memtables + // have their data backed by unsynced WALs, thus they cannot be included in + // this flush job. + // Another reason why we must record the current maximum memtable ID of this + // column family: SyncClosedLogs() may release db mutex, thus it's possible + // for application to continue to insert into memtables increasing db's + // sequence number. The application may take a snapshot, but this snapshot is + // not included in `snapshot_seqs` which will be passed to flush job because + // `snapshot_seqs` has already been computed before this function starts. + // Recording the max memtable ID ensures that the flush job does not flush + // a memtable without knowing such snapshot(s). + uint64_t max_memtable_id = needs_to_sync_closed_wals + ? cfd->imm()->GetLatestMemTableID() + : port::kMaxUint64; + + // If needs_to_sync_closed_wals is false, then the flush job will pick ALL + // existing memtables of the column family when PickMemTable() is called + // later. Although we won't call SyncClosedLogs() in this case, we may still + // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also + // releases and re-acquires the db mutex. In the meantime, the application + // can still insert into the memtables and increase the db's sequence number. + // The application can take a snapshot, hoping that the latest visible state + // to this snapshto is preserved. This is hard to guarantee since db mutex + // not held. This newly-created snapshot is not included in `snapshot_seqs` + // and the flush job is unaware of its presence. Consequently, the flush job + // may drop certain keys when generating the L0, causing incorrect data to be + // returned for snapshot read using this snapshot. + // To address this, we make sure NotifyOnFlushBegin() executes after memtable + // picking so that no new snapshot can be taken between the two functions. 
FlushJob flush_job( - dbname_, cfd, immutable_db_options_, mutable_cf_options, - nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(), - &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, job_context, log_buffer, directories_.GetDbDir(), - GetDataDir(cfd, 0U), + dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, + file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, - true /* sync_output_directory */, true /* write_manifest */, thread_pri); - + true /* sync_output_directory */, true /* write_manifest */, thread_pri, + io_tracer_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(), + &blob_callback_); FileMetaData file_meta; + Status s; + bool need_cancel = false; + IOStatus log_io_s = IOStatus::OK(); + if (needs_to_sync_closed_wals) { + // SyncClosedLogs() may unlock and re-lock the db_mutex. + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); + } + } else { + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); + } + s = log_io_s; + + // If the log sync failed, we do not need to pick memtable. Otherwise, + // num_flush_not_started_ needs to be rollback. 
TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); - flush_job.PickMemTable(); - TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); + if (s.ok()) { + flush_job.PickMemTable(); + need_cancel = true; + } + TEST_SYNC_POINT_CALLBACK( + "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job); #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); #endif // ROCKSDB_LITE - Status s; - if (logfile_number_ > 0 && - versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) { - // If there are more than one column families, we need to make sure that - // all the log files except the most recent one are synced. Otherwise if - // the host crashes after flushing and before WAL is persistent, the - // flushed SST may contain data from write batches whose updates to - // other column families are missing. - // SyncClosedLogs() may unlock and re-lock the db_mutex. - s = SyncClosedLogs(job_context); - } else { - TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); - } - + bool switched_to_mempurge = false; // Within flush_job.Run, rocksdb may call event listener to notify // file creation and deletion. // @@ -188,10 +255,19 @@ // and EventListener callback will be called when the db_mutex // is unlocked by the current thread. 
if (s.ok()) { - s = flush_job.Run(&logs_with_prep_tracker_, &file_meta); - } else { + s = flush_job.Run(&logs_with_prep_tracker_, &file_meta, + &switched_to_mempurge); + need_cancel = false; + } + + if (!s.ok() && need_cancel) { flush_job.Cancel(); } + IOStatus io_s = IOStatus::OK(); + io_s = flush_job.io_status(); + if (s.ok()) { + s = io_s; + } if (s.ok()) { InstallSuperVersionAndScheduleWork(cfd, superversion_context, @@ -199,17 +275,66 @@ if (made_progress) { *made_progress = true; } + + const std::string& column_family_name = cfd->GetName(); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", - cfd->GetName().c_str(), - cfd->current()->storage_info()->LevelSummary(&tmp)); + column_family_name.c_str(), + storage_info->LevelSummary(&tmp)); + + const auto& blob_files = storage_info->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 + "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } } if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { - Status new_bg_error = s; - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); - } - if (s.ok()) { + if (!io_s.ok() && !io_s.IsShutdownInProgress() && + !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. 
+ // TODO: distinguish between MANIFEST write and CURRENT renaming + if (!versions_->io_status().ok()) { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. + // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is + // needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); + } else { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); + } + } else { + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + } + } else { + // If we got here, then we decided not to care about the i_os status (either + // from never needing it or ignoring the flush job status + io_s.PermitUncheckedError(); + } + // If flush ran smoothly and no mempurge happened + // install new SST file path. + if (s.ok() && (!switched_to_mempurge)) { #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. NotifyOnFlushCompleted(cfd, mutable_cf_options, @@ -220,7 +345,10 @@ // Notify sst_file_manager that a new file was added std::string file_path = MakeTableFileName( cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. 
+ sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached()) { Status new_bg_error = Status::SpaceLimit("Max allowed space was reached"); @@ -243,30 +371,22 @@ return AtomicFlushMemTablesToOutputFiles( bg_flush_args, made_progress, job_context, log_buffer, thread_pri); } + assert(bg_flush_args.size() == 1); std::vector snapshot_seqs; SequenceNumber earliest_write_conflict_snapshot; SnapshotChecker* snapshot_checker; GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - Status status; - for (auto& arg : bg_flush_args) { - ColumnFamilyData* cfd = arg.cfd_; - MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - SuperVersionContext* superversion_context = arg.superversion_context_; - Status s = FlushMemTableToOutputFile( - cfd, mutable_cf_options, made_progress, job_context, - superversion_context, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, log_buffer, thread_pri); - if (!s.ok()) { - status = s; - if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { - // At this point, DB is not shutting down, nor is cfd dropped. - // Something is wrong, thus we break out of the loop. 
- break; - } - } - } - return status; + const auto& bg_flush_arg = bg_flush_args[0]; + ColumnFamilyData* cfd = bg_flush_arg.cfd_; + MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + SuperVersionContext* superversion_context = + bg_flush_arg.superversion_context_; + Status s = FlushMemTableToOutputFile( + cfd, mutable_cf_options, made_progress, job_context, superversion_context, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + log_buffer, thread_pri); + return s; } /* @@ -301,7 +421,7 @@ GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - autovector distinct_output_dirs; + autovector distinct_output_dirs; autovector distinct_output_dir_paths; std::vector> jobs; std::vector all_mutable_cf_options; @@ -309,7 +429,7 @@ all_mutable_cf_options.reserve(num_cfs); for (int i = 0; i < num_cfs; ++i) { auto cfd = cfds[i]; - Directory* data_dir = GetDataDir(cfd, 0U); + FSDirectory* data_dir = GetDataDir(cfd, 0U); const std::string& curr_path = cfd->ioptions()->cf_paths[0].path; // Add to distinct output directories if eligible. Use linear search. 
Since @@ -329,7 +449,7 @@ all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); - const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); + uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_; jobs.emplace_back(new FlushJob( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_, @@ -338,12 +458,16 @@ data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri)); - jobs.back()->PickMemTable(); + thread_pri, io_tracer_, db_id_, db_session_id_, + cfd->GetFullHistoryTsLow(), &blob_callback_)); } std::vector file_meta(num_cfs); + // Use of deque because vector + // is specific and doesn't allow &v[i]. + std::deque switched_to_mempurge(num_cfs, false); Status s; + IOStatus log_io_s = IOStatus::OK(); assert(num_cfs == static_cast(jobs.size())); #ifndef ROCKSDB_LITE @@ -358,23 +482,48 @@ if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. - s = SyncClosedLogs(job_context); + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + if (total_log_size_ > 0) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); + } else { + // If the WAL is empty, we use different error reason + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL); + } + } } + s = log_io_s; // exec_status stores the execution status of flush_jobs as // autovector> exec_status; + autovector io_status; + std::vector pick_status; for (int i = 0; i != num_cfs; ++i) { // Initially all jobs are not executed, with status OK. 
exec_status.emplace_back(false, Status::OK()); + io_status.emplace_back(IOStatus::OK()); + pick_status.push_back(false); + } + + if (s.ok()) { + for (int i = 0; i != num_cfs; ++i) { + jobs[i]->PickMemTable(); + pick_status[i] = true; + } } if (s.ok()) { + assert(switched_to_mempurge.size() == + static_cast(num_cfs)); // TODO (yanqin): parallelize jobs with threads. for (int i = 1; i != num_cfs; ++i) { exec_status[i].second = - jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]); + jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i], + &(switched_to_mempurge.at(i))); exec_status[i].first = true; + io_status[i] = jobs[i]->io_status(); } if (num_cfs > 1) { TEST_SYNC_POINT( @@ -384,9 +533,11 @@ } assert(exec_status.size() > 0); assert(!file_meta.empty()); - exec_status[0].second = - jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]); + exec_status[0].second = jobs[0]->Run( + &logs_with_prep_tracker_, file_meta.data() /* &file_meta[0] */, + switched_to_mempurge.empty() ? nullptr : &(switched_to_mempurge.at(0))); exec_status[0].first = true; + io_status[0] = jobs[0]->io_status(); Status error_status; for (const auto& e : exec_status) { @@ -405,6 +556,21 @@ s = error_status.ok() ? s : error_status; } + IOStatus io_s = IOStatus::OK(); + if (io_s.ok()) { + IOStatus io_error = IOStatus::OK(); + for (int i = 0; i != static_cast(io_status.size()); i++) { + if (!io_status[i].ok() && !io_status[i].IsShutdownInProgress() && + !io_status[i].IsColumnFamilyDropped()) { + io_error = io_status[i]; + } + } + io_s = io_error; + if (s.ok() && !io_s.ok()) { + s = io_s; + } + } + if (s.IsColumnFamilyDropped()) { s = Status::OK(); } @@ -413,7 +579,9 @@ // Sync on all distinct output directories. 
for (auto dir : distinct_output_dirs) { if (dir != nullptr) { - Status error_status = dir->Fsync(); + Status error_status = dir->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); if (!error_status.ok()) { s = error_status; break; @@ -426,12 +594,12 @@ // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. for (int i = 0; i != num_cfs; ++i) { - if (!exec_status[i].first) { + if (pick_status[i] && !exec_status[i].first) { jobs[i]->Cancel(); } } for (int i = 0; i != num_cfs; ++i) { - if (exec_status[i].first && exec_status[i].second.ok()) { + if (exec_status[i].second.ok() && exec_status[i].first) { auto& mems = jobs[i]->GetMemTables(); cfds[i]->imm()->RollbackMemtableFlush(mems, file_meta[i].fd.GetNumber()); @@ -440,7 +608,15 @@ } if (s.ok()) { - auto wait_to_install_func = [&]() { + const auto wait_to_install_func = + [&]() -> std::pair { + if (!versions_->io_status().ok()) { + // Something went wrong elsewhere, we cannot count on waiting for our + // turn to write/sync to MANIFEST or CURRENT. Just return. 
+ return std::make_pair(versions_->io_status(), false); + } else if (shutting_down_.load(std::memory_order_acquire)) { + return std::make_pair(Status::ShutdownInProgress(), false); + } bool ready = true; for (size_t i = 0; i != cfds.size(); ++i) { const auto& mems = jobs[i]->GetMemTables(); @@ -464,18 +640,40 @@ break; } } - return ready; + return std::make_pair(Status::OK(), !ready); }; bool resuming_from_bg_err = error_handler_.IsDBStopped(); - while ((!error_handler_.IsDBStopped() || - error_handler_.GetRecoveryError().ok()) && - !wait_to_install_func()) { + while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) { + std::pair res = wait_to_install_func(); + + TEST_SYNC_POINT_CALLBACK( + "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", &res); + + if (!res.first.ok()) { + s = res.first; + break; + } else if (!res.second) { + break; + } atomic_flush_install_cv_.Wait(); + + resuming_from_bg_err = error_handler_.IsDBStopped(); } - s = resuming_from_bg_err ? error_handler_.GetRecoveryError() - : error_handler_.GetBGError(); + if (!resuming_from_bg_err) { + // If not resuming from bg err, then we determine future action based on + // whether we hit background error. + if (s.ok()) { + s = error_handler_.GetBGError(); + } + } else if (s.ok()) { + // If resuming from bg err, we still rely on wait_to_install_func()'s + // result to determine future action. If wait_to_install_func() returns + // non-ok already, then we should not proceed to flush result + // installation. 
+ s = error_handler_.GetRecoveryError(); + } } if (s.ok()) { @@ -483,6 +681,8 @@ autovector*> mems_list; autovector mutable_cf_options_list; autovector tmp_file_meta; + autovector>*> + committed_flush_jobs_info; for (int i = 0; i != num_cfs; ++i) { const auto& mems = jobs[i]->GetMemTables(); if (!cfds[i]->IsDropped() && !mems.empty()) { @@ -490,29 +690,54 @@ mems_list.emplace_back(&mems); mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]); tmp_file_meta.emplace_back(&file_meta[i]); +#ifndef ROCKSDB_LITE + committed_flush_jobs_info.emplace_back( + jobs[i]->GetCommittedFlushJobsInfo()); +#endif //! ROCKSDB_LITE } } s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, tmp_file_meta, - &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer); + versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta, + committed_flush_jobs_info, &job_context->memtables_to_free, + directories_.GetDbDir(), log_buffer); } if (s.ok()) { assert(num_cfs == static_cast(job_context->superversion_contexts.size())); for (int i = 0; i != num_cfs; ++i) { + assert(cfds[i]); + if (cfds[i]->IsDropped()) { continue; } InstallSuperVersionAndScheduleWork(cfds[i], &job_context->superversion_contexts[i], all_mutable_cf_options[i]); + + const std::string& column_family_name = cfds[i]->GetName(); + + Version* const current = cfds[i]->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", - cfds[i]->GetName().c_str(), - cfds[i]->current()->storage_info()->LevelSummary(&tmp)); + column_family_name.c_str(), + storage_info->LevelSummary(&tmp)); + + const auto& blob_files = storage_info->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Blob file summary: head=%" PRIu64 + ", tail=%" 
PRIu64 "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } } if (made_progress) { *made_progress = true; @@ -521,7 +746,12 @@ auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); assert(all_mutable_cf_options.size() == static_cast(num_cfs)); - for (int i = 0; i != num_cfs; ++i) { + for (int i = 0; s.ok() && i != num_cfs; ++i) { + // If mempurge happened instead of Flush, + // no NotifyOnFlushCompleted call (no SST file created). + if (switched_to_mempurge[i]) { + continue; + } if (cfds[i]->IsDropped()) { continue; } @@ -530,7 +760,10 @@ if (sfm) { std::string file_path = MakeTableFileName( cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. + sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached() && error_handler_.GetBGError().ok()) { Status new_bg_error = @@ -543,9 +776,35 @@ #endif // ROCKSDB_LITE } - if (!s.ok() && !s.IsShutdownInProgress()) { - Status new_bg_error = s; - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + // Need to undo atomic flush if something went wrong, i.e. s is not OK and + // it is not because of CF drop. + if (!s.ok() && !s.IsColumnFamilyDropped()) { + if (!io_s.ok() && !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + if (!versions_->io_status().ok()) { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. 
+ // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor + // is needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); + } else { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); + } + } else { + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + } } return s; @@ -644,29 +903,128 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { - auto cfh = reinterpret_cast(column_family); + const Slice* begin_without_ts, + const Slice* end_without_ts) { + if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return CompactRangeInternal(options, column_family, begin_without_ts, + end_without_ts); + } + + std::string begin_str; + std::string end_str; + + // CompactRange compact all keys: [begin, end] inclusively. Add maximum + // timestamp to include all `begin` keys, and add minimal timestamp to include + // all `end` keys. + if (begin_without_ts != nullptr) { + AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz); + } + if (end_without_ts != nullptr) { + AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz); + } + Slice begin(begin_str); + Slice end(end_str); + + Slice* begin_with_ts = begin_without_ts ? &begin : nullptr; + Slice* end_with_ts = end_without_ts ? 
&end : nullptr; + + return CompactRangeInternal(options, column_family, begin_with_ts, + end_with_ts); +} + +Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) { + ColumnFamilyData* cfd = nullptr; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); + cfd = cfh->cfd(); + } + assert(cfd != nullptr && cfd->user_comparator() != nullptr); + if (cfd->user_comparator()->timestamp_size() == 0) { + return Status::InvalidArgument( + "Timestamp is not enabled in this column family"); + } + if (cfd->user_comparator()->timestamp_size() != ts_low.size()) { + return Status::InvalidArgument("ts_low size mismatch"); + } + return IncreaseFullHistoryTsLowImpl(cfd, ts_low); +} + +Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, + std::string ts_low) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + edit.SetFullHistoryTsLow(ts_low); + + InstrumentedMutexLock l(&mutex_); + std::string current_ts_low = cfd->GetFullHistoryTsLow(); + const Comparator* ucmp = cfd->user_comparator(); + assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty()); + if (!current_ts_low.empty() && + ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) { + return Status::InvalidArgument( + "Cannot decrease full_history_timestamp_low"); + } + + return versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_); +} + +Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } - bool exclusive = options.exclusive_manual_compaction; - bool flush_needed = true; + + // Update full_history_ts_low if it's set + if 
(options.full_history_ts_low != nullptr && + !options.full_history_ts_low->empty()) { + std::string ts_low = options.full_history_ts_low->ToString(); + if (begin != nullptr || end != nullptr) { + return Status::InvalidArgument( + "Cannot specify compaction range with full_history_ts_low"); + } + Status s = IncreaseFullHistoryTsLowImpl(cfd, ts_low); + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + } + + Status s; if (begin != nullptr && end != nullptr) { // TODO(ajkr): We could also optimize away the flush in certain cases where // one/both sides of the interval are unbounded. But it requires more // changes to RangesOverlapWithMemtables. Range range(*begin, *end); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed); + s = cfd->RangesOverlapWithMemtables( + {range}, super_version, immutable_db_options_.allow_data_in_errors, + &flush_needed); CleanupSuperVersion(super_version); } - Status s; - if (flush_needed) { + if (s.ok() && flush_needed) { FlushOptions fo; fo.allow_write_stall = options.allow_write_stall; if (immutable_db_options_.atomic_flush) { @@ -686,25 +1044,9 @@ } } - int max_level_with_files = 0; - // max_file_num_to_ignore can be used to filter out newly created SST files, - // useful for bottom level compaction in a manual compaction - uint64_t max_file_num_to_ignore = port::kMaxUint64; - uint64_t next_file_number = port::kMaxUint64; - { - InstrumentedMutexLock l(&mutex_); - Version* base = cfd->current(); - for (int level = 1; level < base->storage_info()->num_non_empty_levels(); - level++) { - if (base->storage_info()->OverlapInLevel(level, begin, end)) { - max_level_with_files = level; - } - } - next_file_number = versions_->current_next_file_number(); - } - - int final_output_level = 0; - + constexpr int kInvalidLevel = -1; + int final_output_level = kInvalidLevel; + bool exclusive = options.exclusive_manual_compaction; if 
(cfd->ioptions()->compaction_style == kCompactionStyleUniversal && cfd->NumberLevels() > 1) { // Always compact all files together. @@ -715,70 +1057,132 @@ } s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, final_output_level, options, begin, end, exclusive, - false, max_file_num_to_ignore); + false, port::kMaxUint64); } else { - for (int level = 0; level <= max_level_with_files; level++) { - int output_level; - // in case the compaction is universal or if we're compacting the - // bottom-most level, the output level will be the same as input one. - // level 0 can never be the bottommost level (i.e. if all files are in - // level 0, we will compact to level 1) - if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - output_level = level; - } else if (level == max_level_with_files && level > 0) { - if (options.bottommost_level_compaction == - BottommostLevelCompaction::kSkip) { - // Skip bottommost level compaction - continue; - } else if (options.bottommost_level_compaction == - BottommostLevelCompaction::kIfHaveCompactionFilter && - cfd->ioptions()->compaction_filter == nullptr && - cfd->ioptions()->compaction_filter_factory == nullptr) { - // Skip bottommost level compaction since we don't have a compaction - // filter - continue; + int first_overlapped_level = kInvalidLevel; + int max_overlapped_level = kInvalidLevel; + { + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + Version* current_version = super_version->current; + ReadOptions ro; + ro.total_order_seek = true; + bool overlap; + for (int level = 0; + level < current_version->storage_info()->num_non_empty_levels(); + level++) { + overlap = true; + if (begin != nullptr && end != nullptr) { + Status status = current_version->OverlapWithLevelIterator( + ro, file_options_, *begin, *end, level, &overlap); + if (!status.ok()) { + overlap = current_version->storage_info()->OverlapInLevel( + 
level, begin, end); + } + } else { + overlap = current_version->storage_info()->OverlapInLevel(level, + begin, end); } - output_level = level; - // update max_file_num_to_ignore only for bottom level compaction - // because data in newly compacted files in middle levels may still need - // to be pushed down - max_file_num_to_ignore = next_file_number; - } else { - output_level = level + 1; - if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && - cfd->ioptions()->level_compaction_dynamic_level_bytes && - level == 0) { - output_level = ColumnFamilyData::kCompactToBaseLevel; + if (overlap) { + if (first_overlapped_level == kInvalidLevel) { + first_overlapped_level = level; + } + max_overlapped_level = level; } } - s = RunManualCompaction(cfd, level, output_level, options, begin, end, - exclusive, false, max_file_num_to_ignore); - if (!s.ok()) { - break; - } - if (output_level == ColumnFamilyData::kCompactToBaseLevel) { - final_output_level = cfd->NumberLevels() - 1; - } else if (output_level > final_output_level) { - final_output_level = output_level; + CleanupSuperVersion(super_version); + } + if (s.ok() && first_overlapped_level != kInvalidLevel) { + // max_file_num_to_ignore can be used to filter out newly created SST + // files, useful for bottom level compaction in a manual compaction + uint64_t max_file_num_to_ignore = port::kMaxUint64; + uint64_t next_file_number = versions_->current_next_file_number(); + final_output_level = max_overlapped_level; + int output_level; + for (int level = first_overlapped_level; level <= max_overlapped_level; + level++) { + bool disallow_trivial_move = false; + // in case the compaction is universal or if we're compacting the + // bottom-most level, the output level will be the same as input one. + // level 0 can never be the bottommost level (i.e. 
if all files are in + // level 0, we will compact to level 1) + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + output_level = level; + } else if (level == max_overlapped_level && level > 0) { + if (options.bottommost_level_compaction == + BottommostLevelCompaction::kSkip) { + // Skip bottommost level compaction + continue; + } else if (options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter && + cfd->ioptions()->compaction_filter == nullptr && + cfd->ioptions()->compaction_filter_factory == nullptr) { + // Skip bottommost level compaction since we don't have a compaction + // filter + continue; + } + output_level = level; + // update max_file_num_to_ignore only for bottom level compaction + // because data in newly compacted files in middle levels may still + // need to be pushed down + max_file_num_to_ignore = next_file_number; + } else { + output_level = level + 1; + if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && + cfd->ioptions()->level_compaction_dynamic_level_bytes && + level == 0) { + output_level = ColumnFamilyData::kCompactToBaseLevel; + } + // if it's a BottommostLevel compaction and `kForce*` compaction is + // set, disallow trivial move + if (level == max_overlapped_level && + (options.bottommost_level_compaction == + BottommostLevelCompaction::kForce || + options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized)) { + disallow_trivial_move = true; + } + } + s = RunManualCompaction(cfd, level, output_level, options, begin, end, + exclusive, disallow_trivial_move, + max_file_num_to_ignore); + if (!s.ok()) { + break; + } + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + final_output_level = cfd->NumberLevels() - 1; + } else if (output_level > final_output_level) { + final_output_level = output_level; + } + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); + 
TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); } - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); } } - if (!s.ok()) { + if (!s.ok() || final_output_level == kInvalidLevel) { LogFlush(immutable_db_options_.info_log); return s; } if (options.change_level) { + TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:1"); + TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:2"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "[RefitLevel] waiting for background threads to stop"); + DisableManualCompaction(); s = PauseBackgroundWork(); if (s.ok()) { + TEST_SYNC_POINT("DBImpl::CompactRange:PreRefitLevel"); s = ReFitLevel(cfd, final_output_level, options.target_level); - } - ContinueBackgroundWork(); + TEST_SYNC_POINT("DBImpl::CompactRange:PostRefitLevel"); + // ContinueBackgroundWork always return Status::OK(). + Status temp_s = ContinueBackgroundWork(); + assert(temp_s.ok()); + } + EnableManualCompaction(); + TEST_SYNC_POINT( + "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled"); } LogFlush(immutable_db_options_.info_log); @@ -813,11 +1217,12 @@ return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); } - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); assert(cfd); Status s; - JobContext job_context(0, true); + JobContext job_context(next_job_id_.fetch_add(1), true); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); @@ -884,7 +1289,7 @@ if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } - if (manual_compaction_paused_.load(std::memory_order_acquire)) { + if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) { return Status::Incomplete(Status::SubCode::kManualCompactionPaused); } @@ -946,7 +1351,7 @@ assert(cfd->compaction_picker()); c.reset(cfd->compaction_picker()->CompactFiles( compact_options, input_files, output_level, 
version->storage_info(), - *cfd->GetLatestMutableCFOptions(), output_path_id)); + *cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id)); // we already sanitized the set of input files and checked for conflicts // without releasing the lock, so we're guaranteed a compaction can be formed. assert(c != nullptr); @@ -968,15 +1373,18 @@ assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, + job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, - &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, table_cache_, &event_logger_, + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_); + &compaction_job_stats, Env::Priority::USER, io_tracer_, + &manual_compaction_paused_, nullptr, db_id_, db_session_id_, + c->column_family_data()->GetFullHistoryTsLow(), &blob_callback_); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -990,17 +1398,23 @@ mutex_.Unlock(); TEST_SYNC_POINT("CompactFilesImpl:0"); TEST_SYNC_POINT("CompactFilesImpl:1"); - compaction_job.Run(); + // Ignore the status here, as it will be checked in the Install down below... 
+ compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("CompactFilesImpl:2"); TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); Status status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { + assert(compaction_job.io_status().ok()); InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options()); } + // status above captures any error during compaction_job.Install, so its ok + // not check compaction_job.io_status() explicitly if we're not calling + // SetBGError + compaction_job.io_status().PermitUncheckedError(); c->ReleaseCompactionFiles(s); #ifndef ROCKSDB_LITE // Need to make sure SstFileManager does its bookkeeping @@ -1033,15 +1447,25 @@ "[%s] [JOB %d] Compaction error: %s", c->column_family_data()->GetName().c_str(), job_context->job_id, status.ToString().c_str()); - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + IOStatus io_s = compaction_job.io_status(); + if (!io_s.ok()) { + error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction); + } else { + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + } } if (output_file_names != nullptr) { - for (const auto newf : c->edit()->GetNewFiles()) { - (*output_file_names) - .push_back(TableFileName(c->immutable_cf_options()->cf_paths, - newf.second.fd.GetNumber(), - newf.second.fd.GetPathId())); + for (const auto& newf : c->edit()->GetNewFiles()) { + output_file_names->push_back(TableFileName( + c->immutable_options()->cf_paths, newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); + } + + for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) { + output_file_names->push_back( + BlobFileName(c->immutable_options()->cf_paths.front().path, + blob_file.GetBlobFileNumber())); } } @@ -1099,9 +1523,11 @@ return; } if (c->is_manual_compaction() && - manual_compaction_paused_.load(std::memory_order_acquire)) { + 
manual_compaction_paused_.load(std::memory_order_acquire) > 0) { return; } + + c->SetNotifyOnCompactionCompleted(); Version* current = cfd->current(); current->Ref(); // release lock while notifying events @@ -1109,46 +1535,11 @@ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { CompactionJobInfo info{}; - info.cf_name = cfd->GetName(); - info.status = st; - info.thread_id = env_->GetThreadID(); - info.job_id = job_id; - info.base_input_level = c->start_level(); - info.output_level = c->output_level(); - info.stats = job_stats; - info.table_properties = c->GetOutputTableProperties(); - info.compaction_reason = c->compaction_reason(); - info.compression = c->output_compression(); - for (size_t i = 0; i < c->num_input_levels(); ++i) { - for (const auto fmd : *c->inputs(i)) { - const FileDescriptor& desc = fmd->fd; - const uint64_t file_number = desc.GetNumber(); - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - file_number, desc.GetPathId()); - info.input_files.push_back(fn); - info.input_file_infos.push_back(CompactionFileInfo{ - static_cast(i), file_number, fmd->oldest_blob_file_number}); - if (info.table_properties.count(fn) == 0) { - std::shared_ptr tp; - auto s = current->GetTableProperties(&tp, fmd, &fn); - if (s.ok()) { - info.table_properties[fn] = tp; - } - } - } - } - for (const auto newf : c->edit()->GetNewFiles()) { - const FileMetaData& meta = newf.second; - const FileDescriptor& desc = meta.fd; - const uint64_t file_number = desc.GetNumber(); - info.output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); - info.output_file_infos.push_back(CompactionFileInfo{ - newf.first, file_number, meta.oldest_blob_file_number}); - } + BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info); for (auto listener : immutable_db_options_.listeners) { listener->OnCompactionBegin(this, info); } + info.status.PermitUncheckedError(); } mutex_.Lock(); current->Unref(); @@ 
-1172,10 +1563,11 @@ if (shutting_down_.load(std::memory_order_acquire)) { return; } - if (c->is_manual_compaction() && - manual_compaction_paused_.load(std::memory_order_acquire)) { + + if (c->ShouldNotifyOnCompactionCompleted() == false) { return; } + Version* current = cfd->current(); current->Ref(); // release lock while notifying events @@ -1212,8 +1604,6 @@ SuperVersionContext sv_context(/* create_superversion */ true); - Status status; - InstrumentedMutexLock guard_lock(&mutex_); // only allow one thread refitting @@ -1232,20 +1622,32 @@ } auto* vstorage = cfd->current()->storage_info(); - if (to_level > level) { - if (level == 0) { - return Status::NotSupported( - "Cannot change from level 0 to other levels."); - } - // Check levels are empty for a trivial move - for (int l = level + 1; l <= to_level; l++) { - if (vstorage->NumLevelFiles(l) > 0) { + if (to_level != level) { + if (to_level > level) { + if (level == 0) { + refitting_level_ = false; return Status::NotSupported( - "Levels between source and target are not empty for a move."); + "Cannot change from level 0 to other levels."); + } + // Check levels are empty for a trivial move + for (int l = level + 1; l <= to_level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } + } + } else { + // to_level < level + // Check levels are empty for a trivial move + for (int l = to_level; l < level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } } } - } - if (to_level != level) { ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); @@ -1254,19 +1656,20 @@ edit.SetColumnFamily(cfd->GetID()); for (const auto& f : vstorage->LevelFiles(level)) { edit.DeleteFile(level, 
f->fd.GetNumber()); - edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, - f->file_checksum, f->file_checksum_func_name); + edit.AddFile( + to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), + f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time, f->file_checksum, + f->file_checksum_func_name, f->min_timestamp, f->max_timestamp); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, - directories_.GetDbDir()); + Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, + &mutex_, directories_.GetDbDir()); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", @@ -1277,16 +1680,18 @@ "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } + sv_context.Clean(); + refitting_level_ = false; + + return status; } - sv_context.Clean(); refitting_level_ = false; - - return status; + return Status::OK(); } int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); return cfh->cfd()->NumberLevels(); } @@ -1295,7 +1700,7 @@ } int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); InstrumentedMutexLock l(&mutex_); return cfh->cfd() ->GetSuperVersion() @@ -1304,7 +1709,7 @@ Status DBImpl::Flush(const FlushOptions& 
flush_options, ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", cfh->GetName().c_str()); Status s; @@ -1373,20 +1778,16 @@ input_level >= 0); InternalKey begin_storage, end_storage; - CompactionArg* ca; + CompactionArg* ca = nullptr; bool scheduled = false; + bool unscheduled = false; + Env::Priority thread_pool_priority = Env::Priority::TOTAL; bool manual_conflict = false; - ManualCompactionState manual; - manual.cfd = cfd; - manual.input_level = input_level; - manual.output_level = output_level; - manual.output_path_id = compact_range_options.target_path_id; - manual.done = false; - manual.in_progress = false; - manual.incomplete = false; - manual.exclusive = exclusive; - manual.disallow_trivial_move = disallow_trivial_move; + + ManualCompactionState manual( + cfd, input_level, output_level, compact_range_options.target_path_id, + exclusive, disallow_trivial_move, compact_range_options.canceled); // For universal compaction, we enforce every manual compaction to compact // all files. if (begin == nullptr || @@ -1410,10 +1811,24 @@ TEST_SYNC_POINT("DBImpl::RunManualCompaction:1"); InstrumentedMutexLock l(&mutex_); + if (manual_compaction_paused_ > 0) { + // Does not make sense to `AddManualCompaction()` in this scenario since + // `DisableManualCompaction()` just waited for the manual compaction queue + // to drain. So return immediately. + TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart"); + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + manual.done = true; + return manual.status; + } + // When a manual compaction arrives, temporarily disable scheduling of // non-manual compactions and wait until the number of scheduled compaction - // jobs drops to zero. This is needed to ensure that this manual compaction - // can compact any range of keys/files. 
+ // jobs drops to zero. This used to be needed to ensure that this manual + // compaction can compact any range of keys/files. Now it is optional + // (see `CompactRangeOptions::exclusive_manual_compaction`). The use case for + // `exclusive_manual_compaction=true` (the default) is unclear beyond not + // trusting the new code. // // HasPendingManualCompaction() is true when at least one thread is inside // RunManualCompaction(), i.e. during that time no other compaction will @@ -1427,8 +1842,20 @@ AddManualCompaction(&manual); TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); if (exclusive) { + // Limitation: there's no way to wake up the below loop when user sets + // `*manual.canceled`. So `CompactRangeOptions::exclusive_manual_compaction` + // and `CompactRangeOptions::canceled` might not work well together. while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0) { + if (manual_compaction_paused_ > 0 || + (manual.canceled != nullptr && *manual.canceled == true)) { + // Pretend the error came from compaction so the below cleanup/error + // handling code can process it. 
+ manual.done = true; + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + break; + } TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled"); ROCKS_LOG_INFO( immutable_db_options_.info_log, @@ -1455,9 +1882,9 @@ scheduled || (((manual.manual_end = &manual.tmp_storage1) != nullptr) && ((compaction = manual.cfd->CompactRange( - *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, - manual.output_level, compact_range_options, manual.begin, - manual.end, &manual.manual_end, &manual_conflict, + *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_, + manual.input_level, manual.output_level, compact_range_options, + manual.begin, manual.end, &manual.manual_end, &manual_conflict, max_file_num_to_ignore)) == nullptr && manual_conflict))) { // exclusive manual compactions should not see a conflict during @@ -1465,6 +1892,23 @@ assert(!exclusive || !manual_conflict); // Running either this or some other manual compaction bg_cv_.Wait(); + if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) { + assert(thread_pool_priority != Env::Priority::TOTAL); + // unschedule all manual compactions + auto unscheduled_task_num = env_->UnSchedule( + GetTaskTag(TaskType::kManualCompaction), thread_pool_priority); + if (unscheduled_task_num > 0) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Unscheduled %d number of manual compactions from the " + "thread-pool", + cfd->GetName().c_str(), unscheduled_task_num); + // it may unschedule other manual compactions, notify others. 
+ bg_cv_.SignalAll(); + } + unscheduled = true; + TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled"); + } if (scheduled && manual.incomplete == true) { assert(!manual.in_progress); scheduled = false; @@ -1487,10 +1931,25 @@ assert(false); } manual.incomplete = false; - bg_compaction_scheduled_++; - env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, - &DBImpl::UnscheduleCompactionCallback); + if (compaction->bottommost_level() && + env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + bg_bottom_compaction_scheduled_++; + ca->compaction_pri_ = Env::Priority::BOTTOM; + env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, + Env::Priority::BOTTOM, + GetTaskTag(TaskType::kManualCompaction), + &DBImpl::UnscheduleCompactionCallback); + thread_pool_priority = Env::Priority::BOTTOM; + } else { + bg_compaction_scheduled_++; + ca->compaction_pri_ = Env::Priority::LOW; + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, + GetTaskTag(TaskType::kManualCompaction), + &DBImpl::UnscheduleCompactionCallback); + thread_pool_priority = Env::Priority::LOW; + } scheduled = true; + TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled"); } } @@ -1498,6 +1957,13 @@ assert(!manual.in_progress); assert(HasPendingManualCompaction()); RemoveManualCompaction(&manual); + // if the manual job is unscheduled, try schedule other jobs in case there's + // any unscheduled compaction job which was blocked by exclusive manual + // compaction. + if (manual.status.IsIncomplete() && + manual.status.subcode() == Status::SubCode::kManualCompactionPaused) { + MaybeScheduleFlushOrCompaction(); + } bg_cv_.SignalAll(); return manual.status; } @@ -1519,8 +1985,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& flush_options, FlushReason flush_reason, bool writes_stopped) { + // This method should not be called if atomic_flush is true. 
+ assert(!immutable_db_options_.atomic_flush); Status s; - uint64_t flush_memtable_id = 0; if (!flush_options.allow_write_stall) { bool flush_needed = true; s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); @@ -1529,7 +1996,9 @@ return s; } } - FlushRequest flush_req; + + autovector flush_reqs; + autovector memtable_ids_to_wait; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); @@ -1544,16 +2013,27 @@ } WaitForPendingWrites(); - if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { + if (flush_reason != FlushReason::kErrorRecoveryRetryFlush && + (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) { + // Note that, when flush reason is kErrorRecoveryRetryFlush, during the + // auto retry resume, we want to avoid creating new small memtables. + // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl + // will iterate through all the CFs and call FlushMemtable during auto + // retry resume, it is possible that in some CFs, + // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will + // be created and scheduled, status::OK() will be returned. 
s = SwitchMemtable(cfd, &context); } + const uint64_t flush_memtable_id = port::kMaxUint64; if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - flush_memtable_id = cfd->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd, flush_memtable_id); + FlushRequest req{{cfd, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID()); } - if (immutable_db_options_.persist_stats_to_disk) { + if (immutable_db_options_.persist_stats_to_disk && + flush_reason != FlushReason::kErrorRecoveryRetryFlush) { ColumnFamilyData* cfd_stats = versions_->GetColumnFamilySet()->GetColumnFamily( kPersistentStatsColumnFamilyName); @@ -1576,16 +2056,19 @@ "to avoid holding old logs", cfd->GetName().c_str()); s = SwitchMemtable(cfd_stats, &context); - flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd_stats, flush_memtable_id); + FlushRequest req{{cfd_stats, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back( + cfd->imm()->GetLatestMemTableID()); } } } } - if (s.ok() && !flush_req.empty()) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + if (s.ok() && !flush_reqs.empty()) { + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->imm()->FlushRequested(); } // If the caller wants to wait for this flush to complete, it indicates @@ -1593,12 +2076,15 @@ // other threads which may drop the column family concurrently. // Therefore, we increase the cfd's ref count. 
if (flush_options.wait) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->Ref(); } } - SchedulePendingFlush(flush_req, flush_reason); + for (const auto& req : flush_reqs) { + SchedulePendingFlush(req, flush_reason); + } MaybeScheduleFlushOrCompaction(); } @@ -1614,12 +2100,16 @@ if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; - for (auto& iter : flush_req) { - cfds.push_back(iter.first); - flush_memtable_ids.push_back(&(iter.second)); - } - s = WaitForFlushMemTables(cfds, flush_memtable_ids, - (flush_reason == FlushReason::kErrorRecovery)); + assert(flush_reqs.size() == memtable_ids_to_wait.size()); + for (size_t i = 0; i < flush_reqs.size(); ++i) { + assert(flush_reqs[i].size() == 1); + cfds.push_back(flush_reqs[i][0].first); + flush_memtable_ids.push_back(&(memtable_ids_to_wait[i])); + } + s = WaitForFlushMemTables( + cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery || + flush_reason == FlushReason::kErrorRecoveryRetryFlush)); InstrumentedMutexLock lock_guard(&mutex_); for (auto* tmp_cfd : cfds) { tmp_cfd->UnrefAndTryDelete(); @@ -1677,7 +2167,8 @@ } } for (auto cfd : cfds) { - if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) { + if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) || + flush_reason == FlushReason::kErrorRecoveryRetryFlush) { continue; } cfd->Ref(); @@ -1720,8 +2211,10 @@ for (auto& iter : flush_req) { flush_memtable_ids.push_back(&(iter.second)); } - s = WaitForFlushMemTables(cfds, flush_memtable_ids, - (flush_reason == FlushReason::kErrorRecovery)); + s = WaitForFlushMemTables( + cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery || + flush_reason == FlushReason::kErrorRecoveryRetryFlush)); InstrumentedMutexLock lock_guard(&mutex_); for (auto* cfd : cfds) { 
cfd->UnrefAndTryDelete(); @@ -1793,12 +2286,12 @@ // check whether one extra immutable memtable or an extra L0 file would // cause write stalling mode to be entered. It could still enter stall // mode due to pending compaction bytes, but that's less common - write_stall_condition = - ColumnFamilyData::GetWriteStallConditionAndCause( - cfd->imm()->NumNotFlushed() + 1, - vstorage->l0_delay_trigger_count() + 1, - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) - .first; + write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), + mutable_cf_options, *cfd->ioptions()) + .first; } while (write_stall_condition != WriteStallCondition::kNormal); } return Status::OK(); @@ -1821,16 +2314,29 @@ int num = static_cast(cfds.size()); // Wait until the compaction completes InstrumentedMutexLock l(&mutex_); + Status s; // If the caller is trying to resume from bg error, then // error_handler_.IsDBStopped() is true. while (resuming_from_bg_err || !error_handler_.IsDBStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { - return Status::ShutdownInProgress(); + s = Status::ShutdownInProgress(); + return s; } // If an error has occurred during resumption, then no need to wait. + // But flush operation may fail because of this error, so need to + // return the status. if (!error_handler_.GetRecoveryError().ok()) { + s = error_handler_.GetRecoveryError(); break; } + // If BGWorkStopped, which indicate that there is a BG error and + // 1) soft error but requires no BG work, 2) no in auto_recovery_ + if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() && + error_handler_.GetBGError().severity() < Status::Severity::kHardError) { + s = error_handler_.GetBGError(); + return s; + } + // Number of column families that have been dropped. int num_dropped = 0; // Number of column families that have finished flush. 
@@ -1846,7 +2352,8 @@ } } if (1 == num_dropped && 1 == num) { - return Status::InvalidArgument("Cannot flush a dropped CF"); + s = Status::ColumnFamilyDropped(); + return s; } // Column families involved in this flush request have either been dropped // or finished flush. Then it's time to finish waiting. @@ -1855,7 +2362,6 @@ } bg_cv_.Wait(); } - Status s; // If not resuming from bg error, and an error has caused the DB to stop, // then report the bg error to caller. if (!resuming_from_bg_err && error_handler_.IsDBStopped()) { @@ -1879,11 +2385,25 @@ } void DBImpl::DisableManualCompaction() { - manual_compaction_paused_.store(true, std::memory_order_release); + InstrumentedMutexLock l(&mutex_); + manual_compaction_paused_.fetch_add(1, std::memory_order_release); + + // Wake up manual compactions waiting to start. + bg_cv_.SignalAll(); + + // Wait for any pending manual compactions to finish (typically through + // failing with `Status::Incomplete`) prior to returning. This way we are + // guaranteed no pending manual compaction will commit while manual + // compactions are "disabled". 
+ while (HasPendingManualCompaction()) { + bg_cv_.Wait(); + } } void DBImpl::EnableManualCompaction() { - manual_compaction_paused_.store(false, std::memory_order_release); + InstrumentedMutexLock l(&mutex_); + assert(manual_compaction_paused_ > 0); + manual_compaction_paused_.fetch_sub(1, std::memory_order_release); } void DBImpl::MaybeScheduleFlushOrCompaction() { @@ -1956,10 +2476,12 @@ return; } - while (bg_compaction_scheduled_ < bg_job_limits.max_compactions && + while (bg_compaction_scheduled_ + bg_bottom_compaction_scheduled_ < + bg_job_limits.max_compactions && unscheduled_compactions_ > 0) { CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::LOW; ca->prepicked_compaction = nullptr; bg_compaction_scheduled_++; unscheduled_compactions_--; @@ -1970,7 +2492,7 @@ DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const { mutex_.AssertHeld(); - return GetBGJobLimits(immutable_db_options_.max_background_flushes, + return GetBGJobLimits(mutable_db_options_.max_background_flushes, mutable_db_options_.max_background_compactions, mutable_db_options_.max_background_jobs, write_controller_.NeedSpeedupCompaction()); @@ -2019,6 +2541,17 @@ assert(!flush_queue_.empty()); FlushRequest flush_req = flush_queue_.front(); flush_queue_.pop_front(); + if (!immutable_db_options_.atomic_flush) { + assert(flush_req.size() == 1); + } + for (const auto& elem : flush_req) { + if (!immutable_db_options_.atomic_flush) { + ColumnFamilyData* cfd = elem.first; + assert(cfd); + assert(cfd->queued_for_flush()); + cfd->set_queued_for_flush(false); + } + } // TODO: need to unset flush reason? 
return flush_req; } @@ -2051,19 +2584,47 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, FlushReason flush_reason) { + mutex_.AssertHeld(); if (flush_req.empty()) { return; } - for (auto& iter : flush_req) { - ColumnFamilyData* cfd = iter.first; - cfd->Ref(); - cfd->SetFlushReason(flush_reason); + if (!immutable_db_options_.atomic_flush) { + // For the non-atomic flush case, we never schedule multiple column + // families in the same flush request. + assert(flush_req.size() == 1); + ColumnFamilyData* cfd = flush_req[0].first; + assert(cfd); + // Note: SchedulePendingFlush is always preceded + // with an imm()->FlushRequested() call. However, + // we want to make this code snipper more resilient to + // future changes. Therefore, we add the following if + // statement - note that calling it twice (or more) + // doesn't break anything. + if (immutable_db_options_.experimental_mempurge_threshold > 0.0) { + // If imm() contains silent memtables, + // requesting a flush will mark the imm_needed as true. 
+ cfd->imm()->FlushRequested(); + } + if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { + cfd->Ref(); + cfd->set_queued_for_flush(true); + cfd->SetFlushReason(flush_reason); + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); + } + } else { + for (auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + cfd->SetFlushReason(flush_reason); + } + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); } - ++unscheduled_flushes_; - flush_queue_.push_back(flush_req); } void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + mutex_.AssertHeld(); if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2083,8 +2644,7 @@ IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush"); - static_cast_with_check(fta.db_)->BackgroundCallFlush( - fta.thread_pri_); + static_cast_with_check(fta.db_)->BackgroundCallFlush(fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); } @@ -2095,7 +2655,7 @@ TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); auto prepicked_compaction = static_cast(ca.prepicked_compaction); - static_cast_with_check(ca.db)->BackgroundCallCompaction( + static_cast_with_check(ca.db)->BackgroundCallCompaction( prepicked_compaction, Env::Priority::LOW); delete prepicked_compaction; } @@ -2106,8 +2666,7 @@ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM); TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction"); auto* prepicked_compaction = ca.prepicked_compaction; - assert(prepicked_compaction && prepicked_compaction->compaction && - !prepicked_compaction->manual_compaction_state); + assert(prepicked_compaction && prepicked_compaction->compaction); ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM); delete prepicked_compaction; } @@ -2120,10 +2679,27 @@ } void DBImpl::UnscheduleCompactionCallback(void* arg) { - CompactionArg ca = *(reinterpret_cast(arg)); + CompactionArg* ca_ptr = 
reinterpret_cast(arg); + Env::Priority compaction_pri = ca_ptr->compaction_pri_; + if (Env::Priority::BOTTOM == compaction_pri) { + // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM + ca_ptr->db->bg_bottom_compaction_scheduled_--; + } else if (Env::Priority::LOW == compaction_pri) { + // Decrement bg_compaction_scheduled_ if priority is LOW + ca_ptr->db->bg_compaction_scheduled_--; + } + CompactionArg ca = *(ca_ptr); delete reinterpret_cast(arg); if (ca.prepicked_compaction != nullptr) { + // if it's a manual compaction, set status to ManualCompactionPaused + if (ca.prepicked_compaction->manual_compaction_state) { + ca.prepicked_compaction->manual_compaction_state->done = true; + ca.prepicked_compaction->manual_compaction_state->status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } if (ca.prepicked_compaction->compaction != nullptr) { + ca.prepicked_compaction->compaction->ReleaseCompactionFiles( + Status::Incomplete(Status::SubCode::kManualCompactionPaused)); delete ca.prepicked_compaction->compaction; } delete ca.prepicked_compaction; @@ -2132,6 +2708,14 @@ } void DBImpl::UnscheduleFlushCallback(void* arg) { + // Decrement bg_flush_scheduled_ in flush callback + reinterpret_cast(arg)->db_->bg_flush_scheduled_--; + Env::Priority flush_pri = reinterpret_cast(arg)->thread_pri_; + if (Env::Priority::LOW == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback"); + } else if (Env::Priority::HIGH == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback"); + } delete reinterpret_cast(arg); TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); } @@ -2169,6 +2753,11 @@ for (const auto& iter : flush_req) { ColumnFamilyData* cfd = iter.first; + if (immutable_db_options_.experimental_mempurge_threshold > 0.0) { + // If imm() contains silent memtables, + // requesting a flush will mark the imm_needed as true. 
+ cfd->imm()->FlushRequested(); + } if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { // can't flush this CF, try next one column_families_not_to_flush.push_back(cfd); @@ -2220,10 +2809,12 @@ bool made_progress = false; JobContext job_context(next_job_id_.fetch_add(1), true); - TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start"); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCallFlush:start", nullptr); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1"); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2"); { InstrumentedMutexLock l(&mutex_); assert(bg_flush_scheduled_); @@ -2252,7 +2843,7 @@ s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2325,7 +2916,8 @@ if (s.IsBusy()) { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - env_->SleepForMicroseconds(10000); // prevent hot loop + immutable_db_options_.clock->SleepForMicroseconds( + 10000); // prevent hot loop mutex_.Lock(); } else if (!s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { @@ -2343,9 +2935,10 @@ "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } else if (s.IsManualCompactionPaused()) { + assert(prepicked_compaction); ManualCompactionState* m = prepicked_compaction->manual_compaction_state; assert(m); ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused", @@ -2354,12 +2947,13 @@ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); - // If compaction failed, we want to delete all temporary files that we might - // 
have created (they might not be all recorded in job_context in case of a - // failure). Thus, we force full scan in FindObsoleteFiles() + // If compaction failed, we want to delete all temporary files that we + // might have created (they might not be all recorded in job_context in + // case of a failure). Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && - !s.IsColumnFamilyDropped()); + !s.IsColumnFamilyDropped() && + !s.IsBusy()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2382,6 +2976,7 @@ assert(num_running_compactions_ > 0); num_running_compactions_--; + if (bg_thread_pri == Env::Priority::LOW) { bg_compaction_scheduled_--; } else { @@ -2389,10 +2984,17 @@ bg_bottom_compaction_scheduled_--; } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - // See if there's more work to be done MaybeScheduleFlushOrCompaction(); + + if (prepicked_compaction != nullptr && + prepicked_compaction->task_token != nullptr) { + // Releasing task tokens affects (and asserts on) the DB state, so + // must be done before we potentially signal the DB close process to + // proceed below. 
+ prepicked_compaction->task_token.reset(); + } + if (made_progress || (bg_compaction_scheduled_ == 0 && bg_bottom_compaction_scheduled_ == 0) || @@ -2443,7 +3045,10 @@ if (shutting_down_.load(std::memory_order_acquire)) { status = Status::ShutdownInProgress(); } else if (is_manual && - manual_compaction_paused_.load(std::memory_order_acquire)) { + manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } else if (is_manual && manual_compaction->canceled && + manual_compaction->canceled->load(std::memory_order_acquire)) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } } else { @@ -2474,6 +3079,8 @@ manual_compaction->in_progress = true; } + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress"); + std::unique_ptr task_token; // InternalKey manual_end_storage; @@ -2485,12 +3092,13 @@ if (!c) { m->done = true; m->manual_end = nullptr; - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Manual compaction from level-%d from %s .. " - "%s; nothing to do\n", - m->cfd->GetName().c_str(), m->input_level, - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)")); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Manual compaction from level-%d from %s .. " + "%s; nothing to do\n", + m->cfd->GetName().c_str(), m->input_level, + (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"), + (m->end ? m->end->DebugString(true).c_str() : "(end)")); } else { // First check if we have enough room to do the compaction bool enough_room = EnoughRoomForCompaction( @@ -2509,11 +3117,11 @@ "[%s] Manual compaction from level-%d to level-%d from %s .. " "%s; will stop at %s\n", m->cfd->GetName().c_str(), m->input_level, c->output_level(), - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)"), + (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"), + (m->end ? 
m->end->DebugString(true).c_str() : "(end)"), ((m->done || m->manual_end == nullptr) ? "(end)" - : m->manual_end->DebugString().c_str())); + : m->manual_end->DebugString(true).c_str())); } } } else if (!is_prepicked && !compaction_queue_.empty()) { @@ -2557,7 +3165,8 @@ // compaction is not necessary. Need to make sure mutex is held // until we make a copy in the following code TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); - c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); + c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_, + log_buffer)); TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); if (c != nullptr) { @@ -2570,7 +3179,7 @@ c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2581,8 +3190,12 @@ status = Status::CompactionTooLarge(); } else { // update statistics - RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); + size_t num_files = 0; + for (auto& each_level : *c->inputs()) { + num_files += each_level.files.size(); + } + RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, num_files); + // There are three things that can change compaction score: // 1) When flush or compaction finish. 
This case is covered by // InstallSuperVersionAndScheduleWork @@ -2606,6 +3219,7 @@ } } + IOStatus io_s; if (!c) { // Nothing to do ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); @@ -2630,6 +3244,7 @@ status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); + io_s = versions_->io_status(); InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options()); @@ -2665,13 +3280,13 @@ for (size_t i = 0; i < c->num_input_files(l); i++) { FileMetaData* f = c->input(l, i); c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); - c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), - f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, - f->largest, f->fd.smallest_seqno, - f->fd.largest_seqno, f->marked_for_compaction, - f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, - f->file_checksum_func_name); + c->edit()->AddFile( + c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, f->file_checksum_func_name, + f->min_timestamp, f->max_timestamp); ROCKS_LOG_BUFFER( log_buffer, @@ -2686,6 +3301,7 @@ status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); + io_s = versions_->io_status(); // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], @@ -2727,6 +3343,7 @@ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::BOTTOM; ca->prepicked_compaction = new PrepickedCompaction; 
ca->prepicked_compaction->compaction = c.release(); ca->prepicked_compaction->manual_compaction_state = nullptr; @@ -2750,28 +3367,34 @@ assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, - file_options_for_compaction_, versions_.get(), &shutting_down_, - preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, - &mutex_, &error_handler_, snapshot_seqs, - earliest_write_conflict_snapshot, snapshot_checker, table_cache_, - &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + mutable_db_options_, file_options_for_compaction_, versions_.get(), + &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, + directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, + &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, table_cache_, &event_logger_, + c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, thread_pri, - is_manual ? &manual_compaction_paused_ : nullptr); + &compaction_job_stats, thread_pri, io_tracer_, + is_manual ? &manual_compaction_paused_ : nullptr, + is_manual ? manual_compaction->canceled : nullptr, db_id_, + db_session_id_, c->column_family_data()->GetFullHistoryTsLow(), + &blob_callback_); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); - mutex_.Unlock(); TEST_SYNC_POINT_CALLBACK( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); - compaction_job.Run(); + // Should handle erorr? 
+ compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); status = compaction_job.Install(*c->mutable_cf_options()); + io_s = compaction_job.io_status(); if (status.ok()) { InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], @@ -2781,6 +3404,13 @@ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); } + + if (status.ok() && !io_s.ok()) { + status = io_s; + } else { + io_s.PermitUncheckedError(); + } + if (c != nullptr) { c->ReleaseCompactionFiles(status); *made_progress = true; @@ -2806,7 +3436,19 @@ } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + if (!io_s.ok()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? 
BackgroundErrorReason::kCompaction + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); + } else { + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + } if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { // Put this cfd back in the compaction queue so we can retry after some // time @@ -2817,7 +3459,7 @@ c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); if (!cfd->queued_for_compaction()) { AddToCompactionQueue(cfd); @@ -2873,6 +3515,7 @@ } void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) { + assert(manual_compaction_paused_ == 0); manual_compaction_dequeue_.push_back(m); } @@ -2958,7 +3601,7 @@ if (m->cfd != m1->cfd) { return false; } - return true; + return false; } #ifndef ROCKSDB_LITE @@ -2982,7 +3625,7 @@ for (const auto fmd : *c->inputs(i)) { const FileDescriptor& desc = fmd->fd; const uint64_t file_number = desc.GetNumber(); - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + auto fn = TableFileName(c->immutable_options()->cf_paths, file_number, desc.GetPathId()); compaction_job_info->input_files.push_back(fn); compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ @@ -3001,10 +3644,34 @@ const FileDescriptor& desc = meta.fd; const uint64_t file_number = desc.GetNumber(); compaction_job_info->output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + c->immutable_options()->cf_paths, file_number, desc.GetPathId())); compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ newf.first, file_number, meta.oldest_blob_file_number}); } + compaction_job_info->blob_compression_type = + c->mutable_cf_options()->blob_compression_type; + + // Update BlobFilesInfo. 
+ for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) { + BlobFileAdditionInfo blob_file_addition_info( + BlobFileName(c->immutable_options()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(), + blob_file.GetTotalBlobBytes()); + compaction_job_info->blob_file_addition_infos.emplace_back( + std::move(blob_file_addition_info)); + } + + // Update BlobFilesGarbageInfo. + for (const auto& blob_file : c->edit()->GetBlobFileGarbages()) { + BlobFileGarbageInfo blob_file_garbage_info( + BlobFileName(c->immutable_options()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetGarbageBlobCount(), + blob_file.GetGarbageBlobBytes()); + compaction_job_info->blob_file_garbage_infos.emplace_back( + std::move(blob_file_garbage_info)); + } } #endif @@ -3037,7 +3704,7 @@ if (UNLIKELY(sv_context->new_superversion == nullptr)) { sv_context->NewSuperVersion(); } - cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options); + cfd->InstallSuperVersion(sv_context, mutable_cf_options); // There may be a small data race here. The snapshot tricking bottommost // compaction may already be released here. 
But assuming there will always be diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/error_handler.h" +#include "db/periodic_work_scheduler.h" #include "monitoring/thread_status_updater.h" #include "util/cast_util.h" @@ -21,12 +22,13 @@ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } -void DBImpl::TEST_SwitchWAL() { +Status DBImpl::TEST_SwitchWAL() { WriteContext write_context; InstrumentedMutexLock l(&mutex_); void* writer = TEST_BeginWrite(); - SwitchWAL(&write_context); + auto s = SwitchWAL(&write_context); TEST_EndWrite(writer); + return s; } bool DBImpl::TEST_WALBufferIsEmpty(bool lock) { @@ -41,13 +43,13 @@ return res; } -int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( +uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } InstrumentedMutexLock l(&mutex_); @@ -56,8 +58,9 @@ void DBImpl::TEST_GetFilesMetaData( ColumnFamilyHandle* column_family, - std::vector>* metadata) { - auto cfh = reinterpret_cast(column_family); + std::vector>* metadata, + std::vector>* blob_metadata) { + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); InstrumentedMutexLock l(&mutex_); metadata->resize(NumberLevels()); @@ -70,6 +73,12 @@ (*metadata)[level].push_back(*f); } } + if (blob_metadata != nullptr) { + blob_metadata->clear(); + for (const auto& blob : 
cfd->current()->storage_info()->GetBlobFiles()) { + blob_metadata->push_back(blob.second); + } + } } uint64_t DBImpl::TEST_Current_Manifest_FileNo() { @@ -88,7 +97,7 @@ if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } int output_level = @@ -131,7 +140,7 @@ if (cfh == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfhi = reinterpret_cast(cfh); + auto cfhi = static_cast_with_check(cfh); cfd = cfhi->cfd(); } return FlushMemTable(cfd, fo, FlushReason::kTest); @@ -147,12 +156,18 @@ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); } +Status DBImpl::TEST_WaitForBackgroundWork() { + InstrumentedMutexLock l(&mutex_); + WaitForBackgroundWork(); + return error_handler_.GetBGError(); +} + Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } return WaitForFlushMemTable(cfd, nullptr, false); @@ -169,12 +184,25 @@ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || (wait_unscheduled && unscheduled_compactions_)) && - (error_handler_.GetBGError() == Status::OK())) { + (error_handler_.GetBGError().ok())) { + bg_cv_.Wait(); + } + return error_handler_.GetBGError(); +} + +Status DBImpl::TEST_WaitForPurge() { + InstrumentedMutexLock l(&mutex_); + while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) { bg_cv_.Wait(); } return error_handler_.GetBGError(); } +Status DBImpl::TEST_GetBGError() { + InstrumentedMutexLock l(&mutex_); + return error_handler_.GetBGError(); +} + void DBImpl::TEST_LockMutex() { mutex_.Lock(); } void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } @@ -234,15 +262,14 @@ uint64_t 
DBImpl::TEST_FindMinPrepLogReferencedByMemTable() { autovector empty_list; - return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr, - empty_list); + return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list); } Status DBImpl::TEST_GetLatestMutableCFOptions( ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) { InstrumentedMutexLock l(&mutex_); - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions(); return Status::OK(); } @@ -271,21 +298,18 @@ return GetWalPreallocateBlockSize(write_buffer_size); } -void DBImpl::TEST_WaitForDumpStatsRun(std::function callback) const { - if (thread_dump_stats_ != nullptr) { - thread_dump_stats_->TEST_WaitForRun(callback); - } -} - -void DBImpl::TEST_WaitForPersistStatsRun(std::function callback) const { - if (thread_persist_stats_ != nullptr) { - thread_persist_stats_->TEST_WaitForRun(callback); +#ifndef ROCKSDB_LITE +void DBImpl::TEST_WaitForStatsDumpRun(std::function callback) const { + if (periodic_work_scheduler_ != nullptr) { + static_cast(periodic_work_scheduler_) + ->TEST_WaitForRun(callback); } } -bool DBImpl::TEST_IsPersistentStatsEnabled() const { - return thread_persist_stats_ && thread_persist_stats_->IsRunning(); +PeriodicWorkTestScheduler* DBImpl::TEST_GetPeriodicWorkScheduler() const { + return static_cast(periodic_work_scheduler_); } +#endif // !ROCKSDB_LITE size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { return EstimateInMemoryStatsHistorySize(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,22 +7,23 @@ // 
Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl/db_impl.h" - #include #include #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" +#include "logging/logging.h" #include "rocksdb/status.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); InternalKey start_key, end_key; if (begin != nullptr) { @@ -75,7 +76,8 @@ "PromoteL0 FAILED. Target level %d does not exist\n", target_level); job_context.Clean(); - return Status::InvalidArgument("Target level does not exist"); + status = Status::InvalidArgument("Target level does not exist"); + return status; } // Sort L0 files by range. @@ -95,7 +97,9 @@ "PromoteL0 FAILED. File %" PRIu64 " being compacted\n", f->fd.GetNumber()); job_context.Clean(); - return Status::InvalidArgument("PromoteL0 called during L0 compaction"); + status = + Status::InvalidArgument("PromoteL0 called during L0 compaction"); + return status; } if (i == 0) continue; @@ -106,7 +110,8 @@ " have overlapping ranges\n", prev_f->fd.GetNumber(), f->fd.GetNumber()); job_context.Clean(); - return Status::InvalidArgument("L0 has overlapping files"); + status = Status::InvalidArgument("L0 has overlapping files"); + return status; } } @@ -116,21 +121,23 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "PromoteL0 FAILED. 
Level %d not empty\n", level); job_context.Clean(); - return Status::InvalidArgument( + status = Status::InvalidArgument( "All levels up to target_level " "must be empty"); + return status; } } edit.SetColumnFamily(cfd->GetID()); for (const auto& f : l0_files) { edit.DeleteFile(0, f->fd.GetNumber()); - edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, - f->file_checksum, f->file_checksum_func_name); + edit.AddFile( + target_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, f->file_checksum_func_name, + f->min_timestamp, f->max_timestamp); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,25 +6,24 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl/db_impl.h" - #include #include #include + +#include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "db/memtable_list.h" #include "file/file_util.h" +#include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" +#include "port/port.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { uint64_t DBImpl::MinLogNumberToKeep() { - if (allow_2pc()) { - return versions_->min_log_number_to_keep_2pc(); - } else { - return versions_->MinLogNumberWithUnflushedData(); - } + return versions_->min_log_number_to_keep(); } uint64_t DBImpl::MinObsoleteSstNumberToKeep() { @@ -35,7 +34,71 @@ return std::numeric_limits::max(); } -// * Returns the list of live files in 'sst_live' +Status DBImpl::DisableFileDeletions() { + Status s; + int my_disable_delete_obsolete_files; + { + InstrumentedMutexLock l(&mutex_); + s = DisableFileDeletionsWithLock(); + my_disable_delete_obsolete_files = disable_delete_obsolete_files_; + } + if (my_disable_delete_obsolete_files == 1) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Disabled, but already disabled. 
Counter: %d", + my_disable_delete_obsolete_files); + } + return s; +} + +// FIXME: can be inconsistent with DisableFileDeletions in cases like +// DBImplReadOnly +Status DBImpl::DisableFileDeletionsWithLock() { + mutex_.AssertHeld(); + ++disable_delete_obsolete_files_; + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + int saved_counter; // initialize on all paths + { + InstrumentedMutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + saved_counter = disable_delete_obsolete_files_; + if (saved_counter == 0) { + FindObsoleteFiles(&job_context, true); + bg_cv_.SignalAll(); + } + } + if (saved_counter == 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Enable, but not really enabled. Counter: %d", + saved_counter); + } + job_context.Clean(); + LogFlush(immutable_db_options_.info_log); + return Status::OK(); +} + +bool DBImpl::IsFileDeletionsEnabled() const { + return 0 == disable_delete_obsolete_files_; +} + +// * Returns the list of live files in 'sst_live' and 'blob_live'. // If it's doing full scan: // * Returns the list of all files in the filesystem in // 'full_scan_candidate_files'. 
@@ -62,7 +125,7 @@ mutable_db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = immutable_db_options_.clock->NowMicros(); if ((delete_obsolete_files_last_run_ + mutable_db_options_.delete_obsolete_files_period_micros) < now_micros) { @@ -76,26 +139,26 @@ // Since job_context->min_pending_output is set, until file scan finishes, // mutex_ cannot be released. Otherwise, we might see no min_pending_output // here but later find newer generated unfinalized files while scanning. - if (!pending_outputs_.empty()) { - job_context->min_pending_output = *pending_outputs_.begin(); - } else { - // delete all of them - job_context->min_pending_output = std::numeric_limits::max(); - } + job_context->min_pending_output = MinObsoleteSstNumberToKeep(); // Get obsolete files. This function will also update the list of // pending files in VersionSet(). - versions_->GetObsoleteFiles(&job_context->sst_delete_files, - &job_context->manifest_delete_files, - job_context->min_pending_output); - - // Mark the elements in job_context->sst_delete_files as grabbedForPurge - // so that other threads calling FindObsoleteFiles with full_scan=true - // will not add these files to candidate list for purge. + versions_->GetObsoleteFiles( + &job_context->sst_delete_files, &job_context->blob_delete_files, + &job_context->manifest_delete_files, job_context->min_pending_output); + + // Mark the elements in job_context->sst_delete_files and + // job_context->blob_delete_files as "grabbed for purge" so that other threads + // calling FindObsoleteFiles with full_scan=true will not add these files to + // candidate list for purge. 
for (const auto& sst_to_del : job_context->sst_delete_files) { MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber()); } + for (const auto& blob_file : job_context->blob_delete_files) { + MarkAsGrabbedForPurge(blob_file.GetBlobFileNumber()); + } + // store the current filenum, lognum, etc job_context->manifest_file_number = versions_->manifest_file_number(); job_context->pending_manifest_file_number = @@ -103,7 +166,7 @@ job_context->log_number = MinLogNumberToKeep(); job_context->prev_log_number = versions_->prev_log_number(); - versions_->AddLiveFiles(&job_context->sst_live); + versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live); if (doing_the_full_scan) { InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), dbname_); @@ -133,7 +196,8 @@ // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - env_->GetChildren(path, &files); // Ignore errors + Status s = env_->GetChildren(path, &files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& file : files) { uint64_t number; FileType type; @@ -149,27 +213,30 @@ continue; } - // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + // TODO(icanadi) clean up this mess to avoid having one-off "/" + // prefixes job_context->full_scan_candidate_files.emplace_back("/" + file, path); } } // Add log files in wal_dir - if (immutable_db_options_.wal_dir != dbname_) { + if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) { std::vector log_files; - env_->GetChildren(immutable_db_options_.wal_dir, - &log_files); // Ignore errors + Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? 
for (const std::string& log_file : log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.wal_dir); } } + // Add info log files in db_log_dir if (!immutable_db_options_.db_log_dir.empty() && immutable_db_options_.db_log_dir != dbname_) { std::vector info_log_files; - // Ignore errors - env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + Status s = + env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (std::string& log_file : info_log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.db_log_dir); @@ -250,17 +317,22 @@ return (first.file_path > second.file_path); } } -}; // namespace +} // namespace // Delete obsolete files and log status and information of file deletion void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, const std::string& path_to_sync, FileType type, uint64_t number) { + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", + const_cast(&fname)); + Status file_deletion_status; - if (type == kTableFile || type == kLogFile) { - file_deletion_status = - DeleteDBFile(&immutable_db_options_, fname, path_to_sync, - /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); + if (type == kTableFile || type == kBlobFile || type == kWalFile) { + // Rate limit WAL deletion only if its in the DB dir + file_deletion_status = DeleteDBFile( + &immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, + /*force_fg=*/(type == kWalFile) ? 
!wal_in_db_path_ : false); } else { file_deletion_status = env_->DeleteFile(fname); } @@ -289,6 +361,11 @@ &event_logger_, job_id, number, fname, file_deletion_status, GetName(), immutable_db_options_.listeners); } + if (type == kBlobFile) { + EventHelpers::LogAndNotifyBlobFileDeletion( + &event_logger_, immutable_db_options_.listeners, job_id, number, fname, + file_deletion_status, GetName()); + } } // Diffs the files listed in filenames and those that do not @@ -303,19 +380,19 @@ // FindObsoleteFiles() should've populated this so nonzero assert(state.manifest_file_number != 0); - // Now, convert live list to an unordered map, WITHOUT mutex held; - // set is slow. - std::unordered_map sst_live_map; - for (const FileDescriptor& fd : state.sst_live) { - sst_live_map[fd.GetNumber()] = &fd; - } + // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow. + std::unordered_set sst_live_set(state.sst_live.begin(), + state.sst_live.end()); + std::unordered_set blob_live_set(state.blob_live.begin(), + state.blob_live.end()); std::unordered_set log_recycle_files_set( state.log_recycle_files.begin(), state.log_recycle_files.end()); auto candidate_files = state.full_scan_candidate_files; candidate_files.reserve( candidate_files.size() + state.sst_delete_files.size() + - state.log_delete_files.size() + state.manifest_delete_files.size()); + state.blob_delete_files.size() + state.log_delete_files.size() + + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( @@ -326,10 +403,15 @@ file.DeleteMetadata(); } + for (const auto& blob_file : state.blob_delete_files) { + candidate_files.emplace_back(BlobFileName(blob_file.GetBlobFileNumber()), + blob_file.GetPath()); + } + + auto wal_dir = immutable_db_options_.GetWalDir(); for (auto file_num : state.log_delete_files) { if (file_num > 0) { - candidate_files.emplace_back(LogFileName(file_num), - immutable_db_options_.wal_dir); + candidate_files.emplace_back(LogFileName(file_num), wal_dir); } } for (const auto& filename : state.manifest_delete_files) { @@ -382,7 +464,8 @@ // Close WALs before trying to delete them. for (const auto w : state.logs_to_free) { // TODO: maybe check the return value of Close. - w->Close(); + auto s = w->Close(); + s.PermitUncheckedError(); } bool own_files = OwnTablesAndLogs(); @@ -398,7 +481,7 @@ bool keep = true; switch (type) { - case kLogFile: + case kWalFile: keep = ((number >= state.log_number) || (number == state.prev_log_number) || (log_recycle_files_set.find(number) != @@ -412,12 +495,19 @@ case kTableFile: // If the second condition is not there, this makes // DontDeletePendingOutputs fail - keep = (sst_live_map.find(number) != sst_live_map.end()) || + keep = (sst_live_set.find(number) != sst_live_set.end()) || number >= state.min_pending_output; if (!keep) { files_to_del.insert(number); } break; + case kBlobFile: + keep = number >= state.min_pending_output || + (blob_live_set.find(number) != blob_live_set.end()); + if (!keep) { + files_to_del.insert(number); + } + break; case kTempFile: // Any temp files that are currently being written to must // be recorded in pending_outputs_, which is inserted into "live". @@ -427,7 +517,8 @@ // // TODO(yhchiang): carefully modify the third condition to safely // remove the temp options files. 
- keep = (sst_live_map.find(number) != sst_live_map.end()) || + keep = (sst_live_set.find(number) != sst_live_set.end()) || + (blob_live_set.find(number) != blob_live_set.end()) || (number == state.pending_manifest_file_number) || (to_delete.find(kOptionsFileNamePrefix) != std::string::npos); break; @@ -439,18 +530,11 @@ break; case kOptionsFile: keep = (number >= optsfile_num2); - TEST_SYNC_POINT_CALLBACK( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", - reinterpret_cast(&number)); - TEST_SYNC_POINT_CALLBACK( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", - reinterpret_cast(&keep)); break; case kCurrentFile: case kDBLockFile: case kIdentityFile: case kMetaDatabase: - case kBlobFile: keep = true; break; } @@ -466,9 +550,11 @@ TableCache::Evict(table_cache_.get(), number); fname = MakeTableFileName(candidate_file.file_path, number); dir_to_sync = candidate_file.file_path; + } else if (type == kBlobFile) { + fname = BlobFileName(candidate_file.file_path, number); + dir_to_sync = candidate_file.file_path; } else { - dir_to_sync = - (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_; + dir_to_sync = (type == kWalFile) ? 
wal_dir : dbname_; fname = dir_to_sync + ((!dir_to_sync.empty() && dir_to_sync.back() == '/') || (!to_delete.empty() && to_delete.front() == '/') @@ -478,8 +564,8 @@ } #ifndef ROCKSDB_LITE - if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0)) { + if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0)) { wal_manager_.ArchiveWALFile(fname, number); continue; } @@ -491,7 +577,6 @@ if (!own_files) { continue; } - Status file_deletion_status; if (schedule_only) { InstrumentedMutexLock guard_lock(&mutex_); SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id); @@ -555,6 +640,11 @@ InstrumentedMutexLock l(&mutex_); --pending_purge_obsolete_files_; assert(pending_purge_obsolete_files_ >= 0); + if (schedule_only) { + // Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_ + // while holding mutex (for GetSortedWalFiles() etc.) + SchedulePurge(); + } if (pending_purge_obsolete_files_ == 0) { bg_cv_.SignalAll(); } @@ -568,26 +658,28 @@ mutex_.Unlock(); if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); + bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io; + PurgeObsoleteFiles(job_context, defer_purge); } job_context.Clean(); mutex_.Lock(); } uint64_t FindMinPrepLogReferencedByMemTable( - VersionSet* vset, const ColumnFamilyData* cfd_to_flush, - const autovector& memtables_to_flush) { + VersionSet* vset, const autovector& memtables_to_flush) { uint64_t min_log = 0; // we must look through the memtables for two phase transactions // that have been committed but not yet flushed + std::unordered_set memtables_to_flush_set( + memtables_to_flush.begin(), memtables_to_flush.end()); for (auto loop_cfd : *vset->GetColumnFamilySet()) { - if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) { + if (loop_cfd->IsDropped()) { continue; } auto log = 
loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( - memtables_to_flush); + &memtables_to_flush_set); if (log > 0 && (min_log == 0 || log < min_log)) { min_log = log; @@ -603,16 +695,39 @@ return min_log; } -uint64_t PrecomputeMinLogNumberToKeep( +uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, + const autovector*>& memtables_to_flush) { + uint64_t min_log = 0; + + std::unordered_set memtables_to_flush_set; + for (const autovector* memtables : memtables_to_flush) { + memtables_to_flush_set.insert(memtables->begin(), memtables->end()); + } + for (auto loop_cfd : *vset->GetColumnFamilySet()) { + if (loop_cfd->IsDropped()) { + continue; + } + + auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( + &memtables_to_flush_set); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + + log = loop_cfd->mem()->GetMinLogContainingPrepSection(); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, - const autovector& memtables_to_flush, - LogsWithPrepTracker* prep_tracker) { + const autovector& edit_list) { assert(vset != nullptr); - assert(prep_tracker != nullptr); - // Calculate updated min_log_number_to_keep - // Since the function should only be called in 2pc mode, log number in - // the version edit should be sufficient. // Precompute the min log number containing unflushed data for the column // family being flushed (`cfd_to_flush`). 
@@ -636,6 +751,58 @@ min_log_number_to_keep = std::min(cf_min_log_number_to_keep, min_log_number_to_keep); } + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists) { + assert(vset != nullptr); + assert(!cfds_to_flush.empty()); + assert(cfds_to_flush.size() == edit_lists.size()); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + for (const auto& edit_list : edit_lists) { + uint64_t log = 0; + for (const auto& e : edit_list) { + if (e->HasLogNumber()) { + log = std::max(log, e->GetLogNumber()); + } + } + if (log != 0) { + min_log_number_to_keep = std::min(min_log_number_to_keep, log); + } + } + if (min_log_number_to_keep == port::kMaxUint64) { + min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber(); + for (size_t i = 1; i < cfds_to_flush.size(); i++) { + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber()); + } + } + + std::unordered_set flushed_cfds( + cfds_to_flush.begin(), cfds_to_flush.end()); + min_log_number_to_keep = + std::min(min_log_number_to_keep, + vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds)); + + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list, + const autovector& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + // Calculate updated min_log_number_to_keep + // Since the function should only be called in 2pc mode, log number in + // the version edit should be sufficient. + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list); // if are 2pc we must consider logs containing prepared // sections of outstanding transactions. 
@@ -654,14 +821,162 @@ min_log_number_to_keep = min_log_in_prep_heap; } - uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable( - vset, &cfd_to_flush, memtables_to_flush); + uint64_t min_log_refed_by_mem = + FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush); + + if (min_log_refed_by_mem != 0 && + min_log_refed_by_mem < min_log_number_to_keep) { + min_log_number_to_keep = min_log_refed_by_mem; + } + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + assert(cfds_to_flush.size() == edit_lists.size()); + assert(cfds_to_flush.size() == memtables_to_flush.size()); + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists); + + uint64_t min_log_in_prep_heap = + prep_tracker->FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && + min_log_in_prep_heap < min_log_number_to_keep) { + min_log_number_to_keep = min_log_in_prep_heap; + } + + uint64_t min_log_refed_by_mem = + FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush); if (min_log_refed_by_mem != 0 && min_log_refed_by_mem < min_log_number_to_keep) { min_log_number_to_keep = min_log_refed_by_mem; } + return min_log_number_to_keep; } +Status DBImpl::SetDBId(bool read_only) { + Status s; + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. + if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. 
+ if (s.IsNotFound()) { + // Create a new DB ID, saving to file only if allowed + if (read_only) { + db_id_ = env_->GenerateUniqueId(); + return Status::OK(); + } else { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + s = GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + /* new_descriptor_log */ false); + } + } else if (!read_only) { + s = SetIdentityFile(env_, dbname_, db_id_); + } + return s; +} + +Status DBImpl::DeleteUnreferencedSstFiles() { + mutex_.AssertHeld(); + std::vector paths; + paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator))); + for (const auto& db_path : immutable_db_options_.db_paths) { + paths.push_back( + NormalizePath(db_path.path + std::string(1, kFilePathSeparator))); + } + for (const auto* cfd : *versions_->GetColumnFamilySet()) { + for (const auto& cf_path : cfd->ioptions()->cf_paths) { + paths.push_back( + NormalizePath(cf_path.path + std::string(1, kFilePathSeparator))); + } + } + // Dedup paths + std::sort(paths.begin(), paths.end()); + paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + + uint64_t next_file_number = versions_->current_next_file_number(); + uint64_t largest_file_number = next_file_number; + std::set files_to_delete; + Status s; + for (const auto& path : paths) { + std::vector files; + s = env_->GetChildren(path, &files); + if (!s.ok()) { + break; + } + for (const auto& fname : files) { + uint64_t number = 0; + FileType type; + if (!ParseFileName(fname, &number, &type)) { + continue; + } + // path ends with '/' or '\\' + const std::string normalized_fpath = path + fname; + 
largest_file_number = std::max(largest_file_number, number); + if (type == kTableFile && number >= next_file_number && + files_to_delete.find(normalized_fpath) == files_to_delete.end()) { + files_to_delete.insert(normalized_fpath); + } + } + } + if (!s.ok()) { + return s; + } + + if (largest_file_number >= next_file_number) { + versions_->next_file_number_.store(largest_file_number + 1); + } + + VersionEdit edit; + edit.SetNextFile(versions_->next_file_number_.load()); + assert(versions_->GetColumnFamilySet()); + ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault(); + assert(default_cfd); + s = versions_->LogAndApply( + default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_, + directories_.GetDbDir(), /*new_descriptor_log*/ false); + if (!s.ok()) { + return s; + } + + mutex_.Unlock(); + for (const auto& fname : files_to_delete) { + s = env_->DeleteFile(fname); + if (!s.ok()) { + break; + } + } + mutex_.Lock(); + return s; +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,45 +6,41 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl/db_impl.h" - #include #include "db/builder.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" +#include "db/periodic_work_scheduler.h" #include "env/composite_env_wrapper.h" +#include "file/filename.h" #include "file/read_write_util.h" #include "file/sst_file_manager_impl.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" +#include "rocksdb/table.h" #include "rocksdb/wal_filter.h" -#include "table/block_based/block_based_table_factory.h" #include "test_util/sync_point.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Options SanitizeOptions(const std::string& dbname, const Options& src) { - auto db_options = SanitizeOptions(dbname, DBOptions(src)); +Options SanitizeOptions(const std::string& dbname, const Options& src, + bool read_only) { + auto db_options = SanitizeOptions(dbname, DBOptions(src), read_only); ImmutableDBOptions immutable_db_options(db_options); auto cf_options = SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src)); return Options(db_options, cf_options); } -DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { +DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, + bool read_only) { DBOptions result(src); - if (result.file_system == nullptr) { - if (result.env == Env::Default()) { - result.file_system = FileSystem::Default(); - } else { - result.file_system.reset(new LegacyFileSystemWrapper(result.env)); - } - } else { - if (result.env == nullptr) { - result.env = Env::Default(); - } + if (result.env == nullptr) { + result.env = Env::Default(); } // result.max_open_files means an "infinite" open files. 
@@ -58,7 +54,7 @@ &result.max_open_files); } - if (result.info_log == nullptr) { + if (result.info_log == nullptr && !read_only) { Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); if (!s.ok()) { // No place suitable for logging @@ -98,25 +94,47 @@ } if (result.recycle_log_file_num && - (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || + (result.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords || + result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { - // kPointInTimeRecovery is inconsistent with recycle log file feature since - // we define the "end" of the log as the first corrupt record we encounter. - // kAbsoluteConsistency doesn't make sense because even a clean - // shutdown leaves old junk at the end of the log file. + // - kTolerateCorruptedTailRecords is inconsistent with recycle log file + // feature. WAL recycling expects recovery success upon encountering a + // corrupt record at the point where new data ends and recycled data + // remains at the tail. However, `kTolerateCorruptedTailRecords` must fail + // upon encountering any such corrupt record, as it cannot differentiate + // between this and a real corruption, which would cause committed updates + // to be truncated -- a violation of the recovery guarantee. + // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with + // recycle log file feature temporarily due to a bug found introducing a + // hole in the recovered data + // (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236). + // Besides this bug, we believe the features are fundamentally compatible. 
result.recycle_log_file_num = 0; } - if (result.wal_dir.empty()) { + if (result.db_paths.size() == 0) { + result.db_paths.emplace_back(dbname, std::numeric_limits::max()); + } else if (result.wal_dir.empty()) { // Use dbname as default result.wal_dir = dbname; } - if (result.wal_dir.back() == '/') { - result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); + if (!result.wal_dir.empty()) { + // If there is a wal_dir already set, check to see if the wal_dir is the + // same as the dbname AND the same as the db_path[0] (which must exist from + // a few lines ago). If the wal_dir matches both of these values, then clear + // the wal_dir value, which will make wal_dir == dbname. Most likely this + // condition was the result of reading an old options file where we forced + // wal_dir to be set (to dbname). + auto npath = NormalizePath(dbname + "/"); + if (npath == NormalizePath(result.wal_dir + "/") && + npath == NormalizePath(result.db_paths[0].path + "/")) { + result.wal_dir.clear(); + } } - if (result.db_paths.size() == 0) { - result.db_paths.emplace_back(dbname, std::numeric_limits::max()); + if (!result.wal_dir.empty() && result.wal_dir.back() == '/') { + result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); } if (result.use_direct_reads && result.compaction_readahead_size == 0) { @@ -137,7 +155,7 @@ #ifndef ROCKSDB_LITE ImmutableDBOptions immutable_db_options(result); - if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + if (!immutable_db_options.IsWalDirSameAsDBPath()) { // Either the WAL dir and db_paths[0]/db_name are not the same, or we // cannot tell for sure. 
In either case, assume they're different and // explicitly cleanup the trash log files (bypass DeleteScheduler) @@ -145,13 +163,15 @@ // DeleteScheduler::CleanupDirectory on the same dir later, it will be // safe std::vector filenames; - result.env->GetChildren(result.wal_dir, &filenames); + auto wal_dir = immutable_db_options.GetWalDir(); + Status s = result.env->GetChildren(wal_dir, &filenames); + s.PermitUncheckedError(); //**TODO: What to do on error? for (std::string& filename : filenames) { if (filename.find(".log.trash", filename.length() - std::string(".log.trash").length()) != std::string::npos) { - std::string trash_file = result.wal_dir + "/" + filename; - result.env->DeleteFile(trash_file); + std::string trash_file = wal_dir + "/" + filename; + result.env->DeleteFile(trash_file).PermitUncheckedError(); } } } @@ -161,7 +181,8 @@ // was not used) auto sfm = static_cast(result.sst_file_manager.get()); for (size_t i = 0; i < result.db_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); + DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path) + .PermitUncheckedError(); } // Create a default SstFileManager for purposes of tracking compaction size @@ -171,7 +192,7 @@ NewSstFileManager(result.env, result.info_log)); result.sst_file_manager = sst_file_manager; } -#endif +#endif // !ROCKSDB_LITE if (!result.paranoid_checks) { result.skip_checking_sst_file_sizes_on_db_open = true; @@ -179,16 +200,23 @@ "file size check will be skipped during open."); } + if (result.preserve_deletes) { + ROCKS_LOG_WARN( + result.info_log, + "preserve_deletes is deprecated, will be removed in a future release. 
" + "Please try using user-defined timestamp instead."); + } + return result; } namespace { -Status SanitizeOptionsByTable( +Status ValidateOptionsByTable( const DBOptions& db_opts, const std::vector& column_families) { Status s; for (auto cf : column_families) { - s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); + s = ValidateOptions(db_opts, cf.options); if (!s.ok()) { return s; } @@ -252,10 +280,16 @@ "atomic_flush is incompatible with enable_pipelined_write"); } + // TODO remove this restriction + if (db_options.atomic_flush && db_options.best_efforts_recovery) { + return Status::InvalidArgument( + "atomic_flush is currently incompatible with best-efforts recovery"); + } + return Status::OK(); } -Status DBImpl::NewDB() { +Status DBImpl::NewDB(std::vector* new_filenames) { VersionEdit new_db; Status s = SetIdentityFile(env_, dbname_); if (!s.ok()) { @@ -273,36 +307,47 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); { + if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) { + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); + } std::unique_ptr file; FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); s = NewWritableFile(fs_.get(), manifest, &file, file_options); if (!s.ok()) { return s; } + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; file->SetPreallocationBlockSize( immutable_db_options_.manifest_preallocation_size); std::unique_ptr file_writer(new WritableFileWriter( - std::move(file), manifest, file_options, env_, nullptr /* stats */, - immutable_db_options_.listeners)); + std::move(file), manifest, file_options, immutable_db_options_.clock, + io_tracer_, nullptr /* stats */, immutable_db_options_.listeners, + nullptr, tmp_set.Contains(FileType::kDescriptorFile), + tmp_set.Contains(FileType::kDescriptorFile))); log::Writer log(std::move(file_writer), 0, false); std::string 
record; new_db.EncodeTo(&record); s = log.AddRecord(record); if (s.ok()) { - s = SyncManifest(env_, &immutable_db_options_, log.file()); + s = SyncManifest(&immutable_db_options_, log.file()); } } if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir()); + s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir()); + if (new_filenames) { + new_filenames->emplace_back( + manifest.substr(manifest.find_last_of("/\\") + 1)); + } } else { - fs_->DeleteFile(manifest, IOOptions(), nullptr); + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); } return s; } -Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, - std::unique_ptr* directory) { +IOStatus DBImpl::CreateAndNewDirectory( + FileSystem* fs, const std::string& dirname, + std::unique_ptr* directory) { // We call CreateDirIfMissing() as the directory may already exist (if we // are reopening a DB), when this happens we don't want creating the // directory to cause an error. However, we need to check if creating the @@ -310,24 +355,24 @@ // file not existing. One real-world example of this occurring is if // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. // when dbname_ is "dir/db" but when "dir" doesn't exist. 
- Status s = env->CreateDirIfMissing(dirname); - if (!s.ok()) { - return s; + IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr); + if (!io_s.ok()) { + return io_s; } - return env->NewDirectory(dirname, directory); + return fs->NewDirectory(dirname, IOOptions(), directory, nullptr); } -Status Directories::SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths) { - Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); - if (!s.ok()) { - return s; +IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths) { + IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_); + if (!io_s.ok()) { + return io_s; } if (!wal_dir.empty() && dbname != wal_dir) { - s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_); - if (!s.ok()) { - return s; + io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_); + if (!io_s.ok()) { + return io_s; } } @@ -337,28 +382,29 @@ if (db_path == dbname) { data_dirs_.emplace_back(nullptr); } else { - std::unique_ptr path_directory; - s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory); - if (!s.ok()) { - return s; + std::unique_ptr path_directory; + io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory); + if (!io_s.ok()) { + return io_s; } data_dirs_.emplace_back(path_directory.release()); } } assert(data_dirs_.size() == data_paths.size()); - return Status::OK(); + return IOStatus::OK(); } Status DBImpl::Recover( const std::vector& column_families, bool read_only, - bool error_if_log_file_exist, bool error_if_data_exists_in_logs, + bool error_if_wal_file_exists, bool error_if_data_exists_in_wals, uint64_t* recovered_seq) { mutex_.AssertHeld(); bool is_new_db = false; assert(db_lock_ == nullptr); + std::vector files_in_dbname; if (!read_only) { - Status s = directories_.SetDirectories(env_, dbname_, + Status s = 
directories_.SetDirectories(fs_.get(), dbname_, immutable_db_options_.wal_dir, immutable_db_options_.db_paths); if (!s.ok()) { @@ -371,10 +417,35 @@ } std::string current_fname = CurrentFileName(dbname_); - s = env_->FileExists(current_fname); + // Path to any MANIFEST file in the db dir. It does not matter which one. + // Since best-efforts recovery ignores CURRENT file, existence of a + // MANIFEST indicates the recovery to recover existing db. If no MANIFEST + // can be found, a new db will be created. + std::string manifest_path; + if (!immutable_db_options_.best_efforts_recovery) { + s = env_->FileExists(current_fname); + } else { + s = Status::NotFound(); + Status io_s = env_->GetChildren(dbname_, &files_in_dbname); + if (!io_s.ok()) { + s = io_s; + files_in_dbname.clear(); + } + for (const std::string& file : files_in_dbname) { + uint64_t number = 0; + FileType type = kWalFile; // initialize + if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { + // Found MANIFEST (descriptor log), thus best-efforts recovery does + // not have to treat the db as empty. + s = Status::OK(); + manifest_path = dbname_ + "/" + file; + break; + } + } + } if (s.IsNotFound()) { if (immutable_db_options_.create_if_missing) { - s = NewDB(); + s = NewDB(&files_in_dbname); is_new_db = true; if (!s.ok()) { return s; @@ -399,14 +470,14 @@ FileOptions customized_fs(file_options_); customized_fs.use_direct_reads |= immutable_db_options_.use_direct_io_for_flush_and_compaction; - s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, - nullptr); + const std::string& fname = + manifest_path.empty() ? 
current_fname : manifest_path; + s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr); if (!s.ok()) { std::string error_str = s.ToString(); // Check if unsupported Direct I/O is the root cause customized_fs.use_direct_reads = false; - s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, - nullptr); + s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr); if (s.ok()) { return Status::InvalidArgument( "Direct I/O is not supported by the specified DB."); @@ -416,49 +487,45 @@ } } } + } else if (immutable_db_options_.best_efforts_recovery) { + assert(files_in_dbname.empty()); + Status s = env_->GetChildren(dbname_, &files_in_dbname); + if (s.IsNotFound()) { + return Status::InvalidArgument(dbname_, + "does not exist (open for read only)"); + } else if (s.IsIOError()) { + return s; + } + assert(s.ok()); } assert(db_id_.empty()); - Status s = versions_->Recover(column_families, read_only, &db_id_); + Status s; + bool missing_table_file = false; + if (!immutable_db_options_.best_efforts_recovery) { + s = versions_->Recover(column_families, read_only, &db_id_); + } else { + assert(!files_in_dbname.empty()); + s = versions_->TryRecover(column_families, read_only, files_in_dbname, + &db_id_, &missing_table_file); + if (s.ok()) { + // TryRecover may delete previous column_family_set_. + column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); + } + } if (!s.ok()) { return s; } - // Happens when immutable_db_options_.write_dbid_to_manifest is set to true - // the very first time. - if (db_id_.empty()) { - // Check for the IDENTITY file and create it if not there. - s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); - // Typically Identity file is created in NewDB() and for some reason if - // it is no longer available then at this point DB ID is not in Identity - // file or Manifest. 
- if (s.IsNotFound()) { - s = SetIdentityFile(env_, dbname_); - if (!s.ok()) { - return s; - } - } else if (!s.ok()) { - assert(s.IsIOError()); - return s; - } - s = GetDbIdentityFromIdentityFile(&db_id_); - if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { - VersionEdit edit; - edit.SetDBId(db_id_); - Options options; - MutableCFOptions mutable_cf_options(options); - versions_->db_id_ = db_id_; - s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &edit, &mutex_, nullptr, - false); - } - } else { - s = SetIdentityFile(env_, dbname_, db_id_); + s = SetDBId(read_only); + if (s.ok() && !read_only) { + s = DeleteUnreferencedSstFiles(); } if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } if (s.ok() && !read_only) { - std::map> created_dirs; + std::map> created_dirs; for (auto cfd : *versions_->GetColumnFamilySet()) { s = cfd->AddDirectories(&created_dirs); if (!s.ok()) { @@ -471,8 +538,9 @@ s = InitPersistStatsColumnFamily(); } + std::vector files_in_wal_dir; if (s.ok()) { - // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // Initial max_total_in_memory_state_ before recovery wals. Log recovery // may check this value to decide whether to flush. max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -497,59 +565,86 @@ // Note that prev_log_number() is no longer used, but we pay // attention to it in case we are recovering a database // produced by an older version of rocksdb. 
- std::vector filenames; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + auto wal_dir = immutable_db_options_.GetWalDir(); + if (!immutable_db_options_.best_efforts_recovery) { + s = env_->GetChildren(wal_dir, &files_in_wal_dir); + } if (s.IsNotFound()) { - return Status::InvalidArgument("wal_dir not found", - immutable_db_options_.wal_dir); + return Status::InvalidArgument("wal_dir not found", wal_dir); } else if (!s.ok()) { return s; } - std::vector logs; - for (size_t i = 0; i < filenames.size(); i++) { + std::unordered_map wal_files; + for (const auto& file : files_in_wal_dir) { uint64_t number; FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { if (is_new_db) { return Status::Corruption( "While creating a new Db, wal_dir contains " "existing log file: ", - filenames[i]); + file); } else { - logs.push_back(number); + wal_files[number] = LogFileName(wal_dir, number); } } } - if (logs.size() > 0) { - if (error_if_log_file_exist) { + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + if (!immutable_db_options_.best_efforts_recovery) { + // Verify WALs in MANIFEST. + s = versions_->GetWalSet().CheckWals(env_, wal_files); + } // else since best effort recovery does not recover from WALs, no need + // to check WALs. + } else if (!versions_->GetWalSet().GetWals().empty()) { + // Tracking is disabled, clear previously tracked WALs from MANIFEST, + // otherwise, in the future, if WAL tracking is enabled again, + // since the WALs deleted when WAL tracking is disabled are not persisted + // into MANIFEST, WAL check may fail. 
+ VersionEdit edit; + WalNumber max_wal_number = + versions_->GetWalSet().GetWals().rbegin()->first; + edit.DeleteWalsBefore(max_wal_number + 1); + s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_); + } + if (!s.ok()) { + return s; + } + + if (!wal_files.empty()) { + if (error_if_wal_file_exists) { return Status::Corruption( - "The db was opened in readonly mode with error_if_log_file_exist" - "flag but a log file already exists"); - } else if (error_if_data_exists_in_logs) { - for (auto& log : logs) { - std::string fname = LogFileName(immutable_db_options_.wal_dir, log); + "The db was opened in readonly mode with error_if_wal_file_exists" + "flag but a WAL file already exists"); + } else if (error_if_data_exists_in_wals) { + for (auto& wal_file : wal_files) { uint64_t bytes; - s = env_->GetFileSize(fname, &bytes); + s = env_->GetFileSize(wal_file.second, &bytes); if (s.ok()) { if (bytes > 0) { return Status::Corruption( - "error_if_data_exists_in_logs is set but there are data " - " in log files."); + "error_if_data_exists_in_wals is set but there are data " + " in WAL files."); } } } } } - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - bool corrupted_log_found = false; - s = RecoverLogFiles(logs, &next_sequence, read_only, - &corrupted_log_found); - if (corrupted_log_found && recovered_seq != nullptr) { + if (!wal_files.empty()) { + // Recover in the order in which the wals were generated + std::vector wals; + wals.reserve(wal_files.size()); + for (const auto& wal_file : wal_files) { + wals.push_back(wal_file.first); + } + std::sort(wals.begin(), wals.end()); + + bool corrupted_wal_found = false; + s = RecoverLogFiles(wals, &next_sequence, read_only, + &corrupted_wal_found); + if (corrupted_wal_found && recovered_seq != nullptr) { *recovered_seq = next_sequence; } if (!s.ok()) { @@ -567,23 +662,37 @@ // to reflect the most recent OPTIONS file. 
It does not matter for regular // read-write db instance because options_file_number_ will later be // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile. - std::vector file_names; + std::vector filenames; if (s.ok()) { - s = env_->GetChildren(GetName(), &file_names); + const std::string normalized_dbname = NormalizePath(dbname_); + const std::string normalized_wal_dir = + NormalizePath(immutable_db_options_.GetWalDir()); + if (immutable_db_options_.best_efforts_recovery) { + filenames = std::move(files_in_dbname); + } else if (normalized_dbname == normalized_wal_dir) { + filenames = std::move(files_in_wal_dir); + } else { + s = env_->GetChildren(GetName(), &filenames); + } } if (s.ok()) { uint64_t number = 0; uint64_t options_file_number = 0; FileType type; - for (const auto& fname : file_names) { + for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && type == kOptionsFile) { options_file_number = std::max(number, options_file_number); } } versions_->options_file_number_ = options_file_number; + uint64_t options_file_size = 0; + if (options_file_number > 0) { + s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number), + &options_file_size); + } + versions_->options_file_size_ = options_file_size; } } - return s; } @@ -612,41 +721,56 @@ (kStatsCFCurrentFormatVersion < format_version_recovered && kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { if (!s_format.ok() || !s_compatible.ok()) { - ROCKS_LOG_INFO( + ROCKS_LOG_WARN( immutable_db_options_.info_log, - "Reading persistent stats version key failed. Format key: %s, " - "compatible key: %s", + "Recreating persistent stats column family since reading " + "persistent stats version key failed. 
Format key: %s, compatible " + "key: %s", s_format.ToString().c_str(), s_compatible.ToString().c_str()); } else { - ROCKS_LOG_INFO( + ROCKS_LOG_WARN( immutable_db_options_.info_log, - "Disable persistent stats due to corrupted or incompatible format " - "version\n"); + "Recreating persistent stats column family due to corrupted or " + "incompatible format version. Recovered format: %" PRIu64 + "; recovered format compatible since: %" PRIu64 "\n", + format_version_recovered, compatible_version_recovered); + } + s = DropColumnFamily(persist_stats_cf_handle_); + if (s.ok()) { + s = DestroyColumnFamilyHandle(persist_stats_cf_handle_); } - DropColumnFamily(persist_stats_cf_handle_); - DestroyColumnFamilyHandle(persist_stats_cf_handle_); ColumnFamilyHandle* handle = nullptr; - ColumnFamilyOptions cfo; - OptimizeForPersistentStats(&cfo); - s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); - persist_stats_cf_handle_ = static_cast(handle); - // should also persist version here because old stats CF is discarded - should_persist_format_version = true; + if (s.ok()) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + } + if (s.ok()) { + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } } } - if (s.ok() && should_persist_format_version) { + if (should_persist_format_version) { // Persistent stats CF being created for the first time, need to write // format version key WriteBatch batch; - batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, - ToString(kStatsCFCurrentFormatVersion)); - batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, - ToString(kStatsCFCompatibleFormatVersion)); - WriteOptions wo; - wo.low_pri = true; - wo.no_slowdown = true; - wo.sync = false; - s = Write(wo, &batch); + if (s.ok()) { + s = 
batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + } + if (s.ok()) { + s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + } + if (s.ok()) { + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } } mutex_.Lock(); return s; @@ -679,10 +803,10 @@ return s; } -// REQUIRES: log_numbers are sorted in ascending order -Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, +// REQUIRES: wal_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, SequenceNumber* next_sequence, bool read_only, - bool* corrupted_log_found) { + bool* corrupted_wal_found) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -690,10 +814,10 @@ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false void Corruption(size_t bytes, const Status& s) override { ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", - (this->status == nullptr ? "(ignoring error) " : ""), - fname, static_cast(bytes), s.ToString().c_str()); - if (this->status != nullptr && this->status->ok()) { - *this->status = s; + (status == nullptr ? 
"(ignoring error) " : ""), fname, + static_cast(bytes), s.ToString().c_str()); + if (status != nullptr && status->ok()) { + *status = s; } } }; @@ -712,10 +836,10 @@ auto stream = event_logger_.Log(); stream << "job" << job_id << "event" << "recovery_started"; - stream << "log_files"; + stream << "wal_files"; stream.StartArray(); - for (auto log_number : log_numbers) { - stream << log_number; + for (auto wal_number : wal_numbers) { + stream << wal_number; } stream.EndArray(); } @@ -738,25 +862,31 @@ bool stop_replay_by_wal_filter = false; bool stop_replay_for_corruption = false; bool flushed = false; - uint64_t corrupted_log_number = kMaxSequenceNumber; - uint64_t min_log_number = MinLogNumberToKeep(); - for (auto log_number : log_numbers) { - if (log_number < min_log_number) { + uint64_t corrupted_wal_number = kMaxSequenceNumber; + uint64_t min_wal_number = MinLogNumberToKeep(); + if (!allow_2pc()) { + // In non-2pc mode, we skip WALs that do not back unflushed data. + min_wal_number = + std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData()); + } + for (auto wal_number : wal_numbers) { + if (wal_number < min_wal_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Skipping log #%" PRIu64 " since it is older than min log to keep #%" PRIu64, - log_number, min_log_number); + wal_number, min_wal_number); continue; } // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(log_number); + versions_->MarkFileNumberUsed(wal_number); // Open the log file - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + std::string fname = + LogFileName(immutable_db_options_.GetWalDir(), wal_number); ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Recovering log #%" PRIu64 " mode %d", log_number, + "Recovering log #%" PRIu64 " mode %d", wal_number, static_cast(immutable_db_options_.wal_recovery_mode)); auto logFileDropped = [this, &fname]() { uint64_t bytes; @@ -788,7 +918,8 @@ } } file_reader.reset(new SequentialFileReader( - std::move(file), fname, immutable_db_options_.log_readahead_size)); + std::move(file), fname, immutable_db_options_.log_readahead_size, + io_tracer_)); } // Create the log reader. @@ -808,7 +939,7 @@ // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, log_number); + &reporter, true /*checksum*/, wal_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable @@ -816,6 +947,8 @@ Slice record; WriteBatch batch; + TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal", + /*arg=*/nullptr); while (!stop_replay_by_wal_filter && reader.ReadRecord(&record, &scratch, immutable_db_options_.wal_recovery_mode) && @@ -825,7 +958,11 @@ Status::Corruption("log record too small")); continue; } - WriteBatchInternal::SetContents(&batch, record); + + status = WriteBatchInternal::SetContents(&batch, record); + if (!status.ok()) { + return status; + } SequenceNumber sequence = WriteBatchInternal::Sequence(&batch); if (immutable_db_options_.wal_recovery_mode == @@ -850,7 +987,7 @@ WalFilter::WalProcessingOption wal_processing_option = immutable_db_options_.wal_filter->LogRecordFound( - log_number, fname, batch, &new_batch, &batch_changed); + wal_number, fname, 
batch, &new_batch, &batch_changed); switch (wal_processing_option) { case WalFilter::WalProcessingOption::kContinueProcessing: @@ -902,7 +1039,7 @@ " mode %d log filter %s returned " "more records (%d) than original (%d) which is not allowed. " "Aborting recovery.", - log_number, + wal_number, static_cast(immutable_db_options_.wal_recovery_mode), immutable_db_options_.wal_filter->Name(), new_count, original_count); @@ -929,7 +1066,7 @@ bool has_valid_writes = false; status = WriteBatchInternal::InsertInto( &batch, column_family_memtables_.get(), &flush_scheduler_, - &trim_history_scheduler_, true, log_number, this, + &trim_history_scheduler_, true, wal_number, this, false /* concurrent_memtable_writes */, next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); MaybeIgnoreError(&status); @@ -949,7 +1086,7 @@ cfd->UnrefAndTryDelete(); // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families - assert(cfd->GetLogNumber() <= log_number); + assert(cfd->GetLogNumber() <= wal_number); auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; @@ -980,17 +1117,27 @@ status = Status::OK(); } else if (immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + if (status.IsIOError()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "IOError during point-in-time reading log #%" PRIu64 + " seq #%" PRIu64 + ". %s. 
This likely mean loss of synced WAL, " + "thus recovery fails.", + wal_number, *next_sequence, + status.ToString().c_str()); + return status; + } // We should ignore the error but not continue replaying status = Status::OK(); stop_replay_for_corruption = true; - corrupted_log_number = log_number; - if (corrupted_log_found != nullptr) { - *corrupted_log_found = true; + corrupted_wal_number = wal_number; + if (corrupted_wal_found != nullptr) { + *corrupted_wal_found = true; } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, - log_number, *next_sequence); + wal_number, *next_sequence); } else { assert(immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords || @@ -1016,7 +1163,7 @@ // corruption. This could during PIT recovery when the WAL is corrupted and // some (but not all) CFs are flushed // Exclude the PIT case where no log is dropped after the corruption point. - // This is to cover the case for empty logs after corrupted log, in which we + // This is to cover the case for empty wals after corrupted log, in which we // don't reset stop_replay_for_corruption. if (stop_replay_for_corruption == true && (immutable_db_options_.wal_recovery_mode == @@ -1024,11 +1171,29 @@ immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords)) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->GetLogNumber() > corrupted_log_number) { + // One special case cause cfd->GetLogNumber() > corrupted_wal_number but + // the CF is still consistent: If a new column family is created during + // the flush and the WAL sync fails at the same time, the new CF points to + // the new WAL but the old WAL is curropted. Since the new CF is empty, it + // is still consistent. We add the check of CF sst file size to avoid the + // false positive alert. 
+ + // Note that, the check of (cfd->GetLiveSstFilesSize() > 0) may leads to + // the ignorance of a very rare inconsistency case caused in data + // canclation. One CF is empty due to KV deletion. But those operations + // are in the WAL. If the WAL is corrupted, the status of this CF might + // not be consistent with others. However, the consistency check will be + // bypassed due to empty CF. + // TODO: a better and complete implementation is needed to ensure strict + // consistency check in WAL recovery including hanlding the tailing + // issues. + if (cfd->GetLogNumber() > corrupted_wal_number && + cfd->GetLiveSstFilesSize() > 0) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Column family inconsistency: SST file contains data" " beyond the point of corruption."); - return Status::Corruption("SST file is ahead of WALs"); + return Status::Corruption("SST file is ahead of WALs in CF " + + cfd->GetName()); } } } @@ -1039,16 +1204,16 @@ if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate - auto max_log_number = log_numbers.back(); + const WalNumber max_wal_number = wal_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > max_log_number) { + if (cfd->GetLogNumber() > max_wal_number) { // Column family cfd has already flushed the data - // from all logs. Memtable has to be empty because - // we filter the updates based on log_number + // from all wals. Memtable has to be empty because + // we filter the updates based on wal_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); assert(edit->NumEntries() == 0); @@ -1080,13 +1245,13 @@ // Update the log number info in the version edit corresponding to this // column family. 
Note that the version edits will be written to MANIFEST // together later. - // writing log_number in the manifest means that any log file - // with number strongly less than (log_number + 1) is already + // writing wal_number in the manifest means that any log file + // with number strongly less than (wal_number + 1) is already // recovered and should be ignored on next reincarnation. - // Since we already recovered max_log_number, we want all logs - // with numbers `<= max_log_number` (includes this one) to be ignored + // Since we already recovered max_wal_number, we want all wals + // with numbers `<= max_wal_number` (includes this one) to be ignored if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { - edit->SetLogNumber(max_log_number + 1); + edit->SetLogNumber(max_wal_number + 1); } } if (status.ok()) { @@ -1094,7 +1259,7 @@ // not actually used. that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(max_log_number + 1); + versions_->MarkFileNumberUsed(max_wal_number + 1); autovector cfds; autovector cf_opts; @@ -1106,6 +1271,21 @@ assert(iter != version_edits.end()); edit_lists.push_back({&iter->second}); } + + std::unique_ptr wal_deletion; + if (flushed) { + wal_deletion = std::unique_ptr(new VersionEdit()); + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + wal_deletion->DeleteWalsBefore(max_wal_number + 1); + } + if (!allow_2pc()) { + // In non-2pc mode, flushing the memtables of the column families + // means we can advance min_log_number_to_keep. 
+ wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1); + } + edit_lists.back().push_back(wal_deletion.get()); + } + // write MANIFEST with update status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, directories_.GetDbDir(), @@ -1113,8 +1293,17 @@ } } - if (status.ok() && data_seen && !flushed) { - status = RestoreAliveLogFiles(log_numbers); + if (status.ok()) { + if (data_seen && !flushed) { + status = RestoreAliveLogFiles(wal_numbers); + } else { + // If there's no data in the WAL, or we flushed all the data, still + // truncate the log file. If the process goes into a crash loop before + // the file is deleted, the preallocated space will never get freed. + const bool truncate = !read_only; + GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr) + .PermitUncheckedError(); + } } event_logger_.Log() << "job" << job_id << "event" @@ -1123,8 +1312,43 @@ return status; } -Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { - if (log_numbers.empty()) { +Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log_ptr) { + LogFileNumberSize log(wal_number); + std::string fname = + LogFileName(immutable_db_options_.GetWalDir(), wal_number); + Status s; + // This gets the appear size of the wals, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (s.ok() && truncate) { + std::unique_ptr last_log; + Status truncate_status = fs_->ReopenWritableFile( + fname, + fs_->OptimizeForLogWrite( + file_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_)), + &last_log, nullptr); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(IOOptions(), nullptr); + } + // Not a critical error if fail to truncate. 
+ if (!truncate_status.ok() && !truncate_status.IsNotSupported()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", wal_number, + truncate_status.ToString().c_str()); + } + } + if (log_ptr) { + *log_ptr = log; + } + return s; +} + +Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { + if (wal_numbers.empty()) { return Status::OK(); } Status s; @@ -1137,41 +1361,27 @@ // FindObsoleteFiles() total_log_size_ = 0; log_empty_ = false; - for (auto log_number : log_numbers) { - LogFileNumberSize log(log_number); - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); - // This gets the appear size of the logs, not including preallocated space. - s = env_->GetFileSize(fname, &log.size); + uint64_t min_wal_with_unflushed_data = + versions_->MinLogNumberWithUnflushedData(); + for (auto wal_number : wal_numbers) { + if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) { + // In non-2pc mode, the WAL files not backing unflushed data are not + // alive, thus should not be added to the alive_log_files_. + continue; + } + // We preallocate space for wals, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + LogFileNumberSize log; + s = GetLogSizeAndMaybeTruncate( + wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); if (!s.ok()) { break; } total_log_size_ += log.size; alive_log_files_.push_back(log); - // We preallocate space for logs, but then after a crash and restart, those - // preallocated space are not needed anymore. It is likely only the last - // log has such preallocated space, so we only truncate for the last log. 
- if (log_number == log_numbers.back()) { - std::unique_ptr last_log; - Status truncate_status = fs_->ReopenWritableFile( - fname, - fs_->OptimizeForLogWrite( - file_options_, - BuildDBOptions(immutable_db_options_, mutable_db_options_)), - &last_log, nullptr); - if (truncate_status.ok()) { - truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); - } - if (truncate_status.ok()) { - truncate_status = last_log->Close(IOOptions(), nullptr); - } - // Not a critical error if fail to truncate. - if (!truncate_status.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Failed to truncate log #%" PRIu64 ": %s", log_number, - truncate_status.ToString().c_str()); - } - } } + alive_log_files_tail_ = alive_log_files_.rbegin(); if (two_write_queues_) { log_write_mutex_.Unlock(); } @@ -1181,8 +1391,17 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit) { mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); + assert(cfd); + assert(cfd->imm()); + // The immutable memtable list must be empty. 
+ assert(std::numeric_limits::max() == + cfd->imm()->GetEarliestMemTableID()); + + const uint64_t start_micros = immutable_db_options_.clock->NowMicros(); + FileMetaData meta; + std::vector blob_file_additions; + std::unique_ptr::iterator> pending_outputs_inserted_elem( new std::list::iterator( CaptureCurrentFileNumberInPendingOutputs())); @@ -1206,7 +1425,8 @@ cfd->GetLatestMutableCFOptions()->paranoid_file_checks; int64_t _current_time = 0; - env_->GetCurrentTime(&_current_time); // ignore error + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); meta.oldest_ancester_time = current_time; @@ -1228,18 +1448,26 @@ if (range_del_iter != nullptr) { range_del_iters.emplace_back(range_del_iter); } + + IOStatus io_s; + TableBuilderOptions tboptions( + *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), + mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_, + db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber()); s = BuildTable( - dbname_, env_, fs_.get(), *cfd->ioptions(), mutable_cf_options, + dbname_, versions_.get(), immutable_db_options_, tboptions, file_options_for_compaction_, cfd->table_cache(), iter.get(), - std::move(range_del_iters), &meta, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), + std::move(range_del_iters), &meta, &blob_file_additions, snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, - GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), - mutable_cf_options.sample_for_compression, - cfd->ioptions()->compression_opts, paranoid_file_checks, - cfd->internal_stats(), 
TableFileCreationReason::kRecovery, - &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level */, current_time, write_hint); + paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kRecovery, &event_logger_, job_id, + Env::IO_HIGH, nullptr /* table_properties */, write_hint, + nullptr /*full_history_ts_low*/, &blob_callback_); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" @@ -1247,29 +1475,54 @@ cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); mutex_.Lock(); + + // TODO(AR) is this ok? + if (!io_s.ok() && s.ok()) { + s = io_s; + } } } ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. - int level = 0; - if (s.ok() && meta.fd.GetFileSize() > 0) { - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction, meta.oldest_blob_file_number, - meta.oldest_ancester_time, meta.file_creation_time, - meta.file_checksum, meta.file_checksum_func_name); + const bool has_output = meta.fd.GetFileSize() > 0; + + constexpr int level = 0; + + if (s.ok() && has_output) { + edit->AddFile( + level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.file_checksum, + meta.file_checksum_func_name, meta.min_timestamp, meta.max_timestamp); + + for (const auto& blob : blob_file_additions) { + edit->AddBlobFile(blob); + } } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = env_->NowMicros() - 
start_micros; - stats.bytes_written = meta.fd.GetFileSize(); - stats.num_output_files = 1; + stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; + + if (has_output) { + stats.bytes_written = meta.fd.GetFileSize(); + stats.num_output_files = 1; + } + + const auto& blobs = edit->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); + } + + stats.num_output_files_blob = static_cast(blobs.size()); + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); - cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - meta.fd.GetFileSize()); + cfd->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; } @@ -1311,52 +1564,55 @@ !kSeqPerBatch, kBatchPerTxn); } -Status DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log) { - Status s; +IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, + log::Writer** new_log) { + IOStatus io_s; std::unique_ptr lfile; DBOptions db_options = BuildDBOptions(immutable_db_options_, mutable_db_options_); FileOptions opt_file_options = fs_->OptimizeForLogWrite(file_options_, db_options); - std::string log_fname = - LogFileName(immutable_db_options_.wal_dir, log_file_num); + std::string wal_dir = immutable_db_options_.GetWalDir(); + std::string log_fname = LogFileName(wal_dir, log_file_num); if (recycle_log_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); - std::string old_log_fname = - LogFileName(immutable_db_options_.wal_dir, recycle_log_number); + std::string old_log_fname = LogFileName(wal_dir, recycle_log_number); TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1"); 
TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2"); - s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, - &lfile, /*dbg=*/nullptr); + io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, + &lfile, /*dbg=*/nullptr); } else { - s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); + io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); } - if (s.ok()) { + if (io_s.ok()) { lfile->SetWriteLifeTimeHint(CalculateWALWriteHint()); lfile->SetPreallocationBlockSize(preallocate_block_size); const auto& listeners = immutable_db_options_.listeners; - std::unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), log_fname, opt_file_options, - env_, nullptr /* stats */, listeners)); + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; + std::unique_ptr file_writer(new WritableFileWriter( + std::move(lfile), log_fname, opt_file_options, + immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners, + nullptr, tmp_set.Contains(FileType::kWalFile), + tmp_set.Contains(FileType::kWalFile))); *new_log = new log::Writer(std::move(file_writer), log_file_num, immutable_db_options_.recycle_log_file_num > 0, immutable_db_options_.manual_wal_flush); } - return s; + return io_s; } Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn) { - Status s = SanitizeOptionsByTable(db_options, column_families); + Status s = ValidateOptionsByTable(db_options, column_families); if (!s.ok()) { return s; } @@ -1376,7 +1632,7 @@ } DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); - s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir()); if (s.ok()) { std::vector paths; for (auto& db_path : 
impl->immutable_db_options_.db_paths) { @@ -1400,19 +1656,15 @@ impl->error_handler_.EnableAutoRecovery(); } } - - if (!s.ok()) { - delete impl; - return s; + if (s.ok()) { + s = impl->CreateArchivalDirectory(); } - - s = impl->CreateArchivalDirectory(); if (!s.ok()) { delete impl; return s; } - impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); impl->mutex_.Lock(); // Handles create_if_missing, error_if_exists @@ -1429,6 +1681,7 @@ InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; assert(new_log != nullptr); + assert(impl->logs_.empty()); impl->logs_.emplace_back(new_log_number, new_log); } @@ -1454,7 +1707,7 @@ break; } } else { - s = Status::InvalidArgument("Column family not found: ", cf.name); + s = Status::InvalidArgument("Column family not found", cf.name); break; } } @@ -1472,18 +1725,16 @@ } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); + impl->alive_log_files_tail_ = impl->alive_log_files_.rbegin(); if (impl->two_write_queues_) { impl->log_write_mutex_.Unlock(); } - - impl->DeleteObsoleteFiles(); - s = impl->directories_.GetDbDir()->Fsync(); } if (s.ok()) { // In WritePrepared there could be gap in sequence numbers. This breaks // the trick we use in kPointInTimeRecovery which assumes the first seq in // the log right after the corrupted log is one larger than the last seq - // we read from the logs. To let this trick keep working, we add a dummy + // we read from the wals. To let this trick keep working, we add a dummy // entry with the expected sequence to the first log right after recovery. 
// In non-WritePrepared case also the new log after recovery could be // empty, and thus missing the consecutive seq hint to distinguish @@ -1495,7 +1746,8 @@ WriteOptions write_options; uint64_t log_used, log_size; log::Writer* log_writer = impl->logs_.back().writer; - s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size); + s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size, + /*with_db_mutex==*/true); if (s.ok()) { // Need to fsync, otherwise it might get lost after a power reset. s = impl->FlushWAL(false); @@ -1507,7 +1759,7 @@ } } if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { - // try to read format version but no need to fail Open() even if it fails + // try to read format version s = impl->PersistentStatsProcessFormatVersion(); } @@ -1550,7 +1802,11 @@ *dbptr = impl; impl->opened_successfully_ = true; + impl->DeleteObsoleteFiles(); + TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles"); impl->MaybeScheduleFlushOrCompaction(); + } else { + persist_options_status.PermitUncheckedError(); } impl->mutex_.Unlock(); @@ -1558,6 +1814,12 @@ auto sfm = static_cast( impl->immutable_db_options_.sst_file_manager.get()); if (s.ok() && sfm) { + // Set Statistics ptr for SstFileManager to dump the stats of + // DeleteScheduler. + sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics); + ROCKS_LOG_INFO(impl->immutable_db_options_.info_log, + "SstFileManager instance %p", sfm); + // Notify SstFileManager about all sst files that already exist in // db_paths[0] and cf_paths[0] when the DB is opened. @@ -1568,6 +1830,8 @@ std::vector metadata; + // TODO: Once GetLiveFilesMetaData supports blob files, update the logic + // below to get known_file_sizes for blob files. 
impl->mutex_.Lock(); impl->versions_->GetLiveFilesMetaData(&metadata); impl->mutex_.Unlock(); @@ -1593,20 +1857,22 @@ paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); for (auto& path : paths) { std::vector existing_files; - impl->immutable_db_options_.env->GetChildren(path, &existing_files); + impl->immutable_db_options_.env->GetChildren(path, &existing_files) + .PermitUncheckedError(); //**TODO: What do to on error? for (auto& file_name : existing_files) { uint64_t file_number; FileType file_type; std::string file_path = path + "/" + file_name; if (ParseFileName(file_name, &file_number, &file_type) && - file_type == kTableFile) { + (file_type == kTableFile || file_type == kBlobFile)) { + // TODO: Check for errors from OnAddFile? if (known_file_sizes.count(file_name)) { // We're assuming that each sst file name exists in at most one of // the paths. - sfm->OnAddFile(file_path, known_file_sizes.at(file_name), - /* compaction */ false); + sfm->OnAddFile(file_path, known_file_sizes.at(file_name)) + .PermitUncheckedError(); } else { - sfm->OnAddFile(file_path); + sfm->OnAddFile(file_path).PermitUncheckedError(); } } } @@ -1620,6 +1886,7 @@ sfm->ReserveDiskBuffer(max_write_buffer_size, impl->immutable_db_options_.db_paths[0].path); } + #endif // !ROCKSDB_LITE if (s.ok()) { @@ -1634,11 +1901,14 @@ "DB::Open() failed --- Unable to persist Options file", persist_options_status.ToString()); } + } else { + ROCKS_LOG_WARN(impl->immutable_db_options_.info_log, + "Persisting Option File error: %s", + persist_options_status.ToString().c_str()); } if (s.ok()) { - impl->StartTimedTasks(); - } - if (!s.ok()) { + impl->StartPeriodicWorkScheduler(); + } else { for (auto* h : *handles) { delete h; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,13 +4,15 @@ // (found in the LICENSE.Apache file in the root directory). #include "db/db_impl/db_impl_readonly.h" -#include "db/arena_wrapped_db_iter.h" -#include "db/compacted_db_impl.h" +#include "db/arena_wrapped_db_iter.h" +#include "db/db_impl/compacted_db_impl.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -18,7 +20,8 @@ DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) - : DBImpl(db_options, dbname) { + : DBImpl(db_options, dbname, /*seq_per_batch*/ false, + /*batch_per_txn*/ true, /*read_only*/ true) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in read only mode"); LogFlush(immutable_db_options_.info_log); @@ -35,7 +38,7 @@ PERF_TIMER_GUARD(get_snapshot_time); Status s; SequenceNumber snapshot = versions_->LastSequence(); - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); @@ -48,14 +51,17 @@ SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, read_options)) { pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(read_options, lkey, pinnable_val, &s, - &merge_context, &max_covering_tombstone_seq); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, pinnable_val, + /*timestamp=*/nullptr, &s, 
&merge_context, + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } RecordTick(stats_, NUMBER_KEYS_READ); @@ -68,7 +74,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -80,12 +86,13 @@ ReadCallback* read_callback = nullptr; // No read callback provided. auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - read_seq, + super_version->current, read_seq, super_version->mutable_cf_options.max_sequential_skip_in_iterations, super_version->version_number, read_callback); - auto internal_iter = - NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), read_seq); + auto internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } @@ -108,15 +115,17 @@ : latest_snapshot; for (auto cfh : column_families) { - auto* cfd = reinterpret_cast(cfh)->cfd(); + auto* cfd = static_cast_with_check(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); auto* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + sv->current, read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, sv->version_number, read_callback); - auto* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), read_seq); + auto* internal_iter = NewInternalIterator( + 
db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -124,12 +133,37 @@ return Status::OK(); } +namespace { +// Return OK if dbname exists in the file system or create it if +// create_if_missing +Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, + const std::string& dbname) { + Status s; + if (!db_options.create_if_missing) { + // Attempt to read "CURRENT" file + const std::shared_ptr& fs = db_options.env->GetFileSystem(); + std::string manifest_path; + uint64_t manifest_file_number; + s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path, + &manifest_file_number); + } else { + // Historic behavior that doesn't necessarily make sense + s = db_options.env->CreateDirIfMissing(dbname); + } + return s; +} +} // namespace + Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, - DB** dbptr, bool /*error_if_log_file_exist*/) { + DB** dbptr, bool /*error_if_wal_file_exists*/) { + Status s = OpenForReadOnlyCheckExistence(options, dbname); + if (!s.ok()) { + return s; + } + *dbptr = nullptr; // Try to first open DB as fully compacted DB - Status s; s = CompactedDBImpl::Open(options, dbname, dbptr); if (s.ok()) { return s; @@ -142,7 +176,8 @@ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); std::vector handles; - s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + s = DBImplReadOnly::OpenForReadOnlyWithoutCheck( + db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); // i can delete the handle since DBImpl is always holding a @@ -156,7 +191,23 @@ const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, - bool error_if_log_file_exist) { + bool error_if_wal_file_exists) { + // If dbname does not exist 
in the file system, should not do anything + Status s = OpenForReadOnlyCheckExistence(db_options, dbname); + if (!s.ok()) { + return s; + } + + return DBImplReadOnly::OpenForReadOnlyWithoutCheck( + db_options, dbname, column_families, handles, dbptr, + error_if_wal_file_exists); +} + +Status DBImplReadOnly::OpenForReadOnlyWithoutCheck( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists) { *dbptr = nullptr; handles->clear(); @@ -164,14 +215,14 @@ DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); impl->mutex_.Lock(); Status s = impl->Recover(column_families, true /* read only */, - error_if_log_file_exist); + error_if_wal_file_exists); if (s.ok()) { // set column family handles for (auto cf : column_families) { auto cfd = impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); if (cfd == nullptr) { - s = Status::InvalidArgument("Column family not found: ", cf.name); + s = Status::InvalidArgument("Column family not found", cf.name); break; } handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); @@ -189,7 +240,7 @@ *dbptr = impl; for (auto* h : *handles) { impl->NewThreadStatusCfInfo( - reinterpret_cast(h)->cfd()); + static_cast_with_check(h)->cfd()); } } else { for (auto h : *handles) { @@ -205,7 +256,7 @@ Status DB::OpenForReadOnly(const Options& /*options*/, const std::string& /*dbname*/, DB** /*dbptr*/, - bool /*error_if_log_file_exist*/) { + bool /*error_if_wal_file_exists*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); } @@ -213,7 +264,7 @@ const DBOptions& /*db_options*/, const std::string& /*dbname*/, const std::vector& /*column_families*/, std::vector* /*handles*/, DB** /*dbptr*/, - bool /*error_if_log_file_exist*/) { + bool /*error_if_wal_file_exists*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); } #endif // !ROCKSDB_LITE diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h 2025-05-19 16:14:27.000000000 +0000 @@ -130,6 +130,15 @@ } private: + // A "helper" function for DB::OpenForReadOnly without column families + // to reduce unnecessary I/O + // It has the same functionality as DB::OpenForReadOnly with column families + // but does not check the existence of dbname in the file system + static Status OpenForReadOnlyWithoutCheck( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists = false); friend class DB; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,15 +10,19 @@ #include "db/arena_wrapped_db_iter.h" #include "db/merge_context.h" #include "logging/auto_roll_logger.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/configurable.h" #include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE DBImplSecondary::DBImplSecondary(const DBOptions& db_options, - const std::string& dbname) - : DBImpl(db_options, dbname) { + const std::string& dbname, + std::string secondary_path) + : DBImpl(db_options, dbname, false, true, true), + secondary_path_(std::move(secondary_path)) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in secondary mode"); 
LogFlush(immutable_db_options_.info_log); @@ -28,8 +32,8 @@ Status DBImplSecondary::Recover( const std::vector& column_families, - bool /*readonly*/, bool /*error_if_log_file_exist*/, - bool /*error_if_data_exists_in_logs*/, uint64_t*) { + bool /*readonly*/, bool /*error_if_wal_file_exists*/, + bool /*error_if_data_exists_in_wals*/, uint64_t*) { mutex_.AssertHeld(); JobContext job_context(0); @@ -38,6 +42,9 @@ ->Recover(column_families, &manifest_reader_, &manifest_reporter_, &manifest_reader_status_); if (!s.ok()) { + if (manifest_reader_status_) { + manifest_reader_status_->PermitUncheckedError(); + } return s; } if (immutable_db_options_.paranoid_checks && s.ok()) { @@ -94,10 +101,10 @@ assert(logs != nullptr); std::vector filenames; Status s; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + s = env_->GetChildren(immutable_db_options_.GetWalDir(), &filenames); if (s.IsNotFound()) { return Status::InvalidArgument("Failed to open wal_dir", - immutable_db_options_.wal_dir); + immutable_db_options_.GetWalDir()); } else if (!s.ok()) { return s; } @@ -112,7 +119,7 @@ for (size_t i = 0; i < filenames.size(); i++) { uint64_t number; FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + if (ParseFileName(filenames[i], &number, &type) && type == kWalFile && number >= log_number_min) { logs->push_back(number); } @@ -137,7 +144,8 @@ // initialize log reader from log_number // TODO: min_log_number_to_keep_2pc check needed? 
// Open the log file - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + std::string fname = + LogFileName(immutable_db_options_.GetWalDir(), log_number); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Recovering log #%" PRIu64 " mode %d", log_number, static_cast(immutable_db_options_.wal_recovery_mode)); @@ -153,7 +161,8 @@ return status; } file_reader.reset(new SequentialFileReader( - std::move(file), fname, immutable_db_options_.log_readahead_size)); + std::move(file), fname, immutable_db_options_.log_readahead_size, + io_tracer_)); } // Create the log reader. @@ -191,6 +200,8 @@ auto it = log_readers_.find(log_number); assert(it != log_readers_.end()); log::FragmentBufferedReader* reader = it->second->reader_; + Status* wal_read_status = it->second->status_; + assert(wal_read_status); // Manually update the file number allocation counter in VersionSet. versions_->MarkFileNumberUsed(log_number); @@ -202,13 +213,16 @@ while (reader->ReadRecord(&record, &scratch, immutable_db_options_.wal_recovery_mode) && - status.ok()) { + wal_read_status->ok() && status.ok()) { if (record.size() < WriteBatchInternal::kHeader) { reader->GetReporter()->Corruption( record.size(), Status::Corruption("log record too small")); continue; } - WriteBatchInternal::SetContents(&batch, record); + status = WriteBatchInternal::SetContents(&batch, record); + if (!status.ok()) { + break; + } SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); std::vector column_family_ids; status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); @@ -294,6 +308,9 @@ reader->GetReporter()->Corruption(record.size(), status); } } + if (status.ok() && !wal_read_status->ok()) { + status = *wal_read_status; + } if (!status.ok()) { return status; } @@ -318,8 +335,8 @@ ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { assert(pinnable_val != nullptr); - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, 
DB_GET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); auto cfh = static_cast(column_family); @@ -340,15 +357,16 @@ PERF_TIMER_STOP(get_snapshot_time); bool done = false; - if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, read_options)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && super_version->imm->Get( - lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options)) { + lkey, pinnable_val->GetSelf(), /*timestamp=*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, read_options)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); @@ -359,8 +377,10 @@ } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(read_options, lkey, pinnable_val, &s, - &merge_context, &max_covering_tombstone_seq); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, pinnable_val, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } { @@ -386,7 +406,7 @@ "ReadTier::kPersistedData is not yet supported in iterators.")); } Iterator* result = nullptr; - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); ReadCallback* read_callback = nullptr; // No read callback provided. 
if (read_options.tailing) { @@ -397,7 +417,7 @@ return NewErrorIterator( Status::NotSupported("snapshot not supported in secondary mode")); } else { - auto snapshot = versions_->LastSequence(); + SequenceNumber snapshot(kMaxSequenceNumber); result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); } return result; @@ -405,17 +425,23 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( const ReadOptions& read_options, ColumnFamilyData* cfd, - SequenceNumber snapshot, ReadCallback* read_callback) { + SequenceNumber snapshot, ReadCallback* read_callback, + bool expose_blob_index, bool allow_refresh) { assert(nullptr != cfd); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + assert(snapshot == kMaxSequenceNumber); + snapshot = versions_->LastSequence(); + assert(snapshot != kMaxSequenceNumber); auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - snapshot, + super_version->current, snapshot, super_version->mutable_cf_options.max_sequential_skip_in_iterations, - super_version->version_number, read_callback); - auto internal_iter = - NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), snapshot); + super_version->version_number, read_callback, this, cfd, + expose_blob_index, read_options.snapshot ? 
false : allow_refresh); + auto internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } @@ -507,7 +533,8 @@ { InstrumentedMutexLock lock_guard(&mutex_); s = static_cast_with_check(versions_.get()) - ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + ->ReadAndApply(&mutex_, &manifest_reader_, + manifest_reader_status_.get(), &cfds_changed); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, static_cast(versions_->LastSequence())); @@ -604,14 +631,14 @@ } handles->clear(); - DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path); impl->versions_.reset(new ReactiveVersionSet( dbname, &impl->immutable_db_options_, impl->file_options_, impl->table_cache_.get(), impl->write_buffer_manager_, - &impl->write_controller_)); + &impl->write_controller_, impl->io_tracer_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); - impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); impl->mutex_.Lock(); s = impl->Recover(column_families, true, false, false); @@ -620,7 +647,7 @@ auto cfd = impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); if (nullptr == cfd) { - s = Status::InvalidArgument("Column family not found: ", cf.name); + s = Status::InvalidArgument("Column family not found", cf.name); break; } handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); @@ -639,7 +666,7 @@ *dbptr = impl; for (auto h : *handles) { impl->NewThreadStatusCfInfo( - reinterpret_cast(h)->cfd()); + static_cast_with_check(h)->cfd()); } } else { for (auto h : *handles) { @@ -650,6 +677,160 @@ } return s; 
} + +Status DBImplSecondary::CompactWithoutInstallation( + ColumnFamilyHandle* cfh, const CompactionServiceInput& input, + CompactionServiceResult* result) { + InstrumentedMutexLock l(&mutex_); + auto cfd = static_cast_with_check(cfh)->cfd(); + if (!cfd) { + return Status::InvalidArgument("Cannot find column family" + + cfh->GetName()); + } + + std::unordered_set input_set; + for (const auto& file_name : input.input_files) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + auto* version = cfd->current(); + + ColumnFamilyMetaData cf_meta; + version->GetColumnFamilyMetaData(&cf_meta); + + const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions(); + VersionStorageInfo* vstorage = version->storage_info(); + + // Use comp_options to reuse some CompactFiles functions + CompactionOptions comp_options; + comp_options.compression = kDisableCompressionOption; + comp_options.output_file_size_limit = MaxFileSizeForLevel( + *mutable_cf_options, input.output_level, cf_options.compaction_style, + vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes); + + std::vector input_files; + Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage, comp_options); + if (!s.ok()) { + return s; + } + + std::unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->CompactFiles( + comp_options, input_files, input.output_level, vstorage, + *mutable_cf_options, mutable_db_options_, 0)); + assert(c != nullptr); + + c->SetInputVersion(version); + + // Create output directory if it's not existed yet + std::unique_ptr output_dir; + s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir); + if (!s.ok()) { + return s; + } + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + const int job_id = next_job_id_.fetch_add(1); + + CompactionServiceCompactionJob 
compaction_job( + job_id, c.get(), immutable_db_options_, mutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_, + input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_, + db_id_, db_session_id_, secondary_path_, input, result); + + mutex_.Unlock(); + s = compaction_job.Run(); + mutex_.Lock(); + + // clean up + compaction_job.io_status().PermitUncheckedError(); + compaction_job.CleanupCompaction(); + c->ReleaseCompactionFiles(s); + c.reset(); + + TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End", + &s); + result->status = s; + return s; +} + +Status DB::OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* result, + const CompactionServiceOptionsOverride& override_options) { + CompactionServiceInput compaction_input; + Status s = CompactionServiceInput::Read(input, &compaction_input); + if (!s.ok()) { + return s; + } + + compaction_input.db_options.max_open_files = -1; + compaction_input.db_options.compaction_service = nullptr; + if (compaction_input.db_options.statistics) { + compaction_input.db_options.statistics.reset(); + } + compaction_input.db_options.env = override_options.env; + compaction_input.db_options.file_checksum_gen_factory = + override_options.file_checksum_gen_factory; + compaction_input.db_options.statistics = override_options.statistics; + compaction_input.column_family.options.comparator = + override_options.comparator; + compaction_input.column_family.options.merge_operator = + override_options.merge_operator; + compaction_input.column_family.options.compaction_filter = + override_options.compaction_filter; + compaction_input.column_family.options.compaction_filter_factory = + override_options.compaction_filter_factory; + compaction_input.column_family.options.prefix_extractor = + override_options.prefix_extractor; + 
compaction_input.column_family.options.table_factory = + override_options.table_factory; + compaction_input.column_family.options.sst_partitioner_factory = + override_options.sst_partitioner_factory; + + std::vector column_families; + column_families.push_back(compaction_input.column_family); + // TODO: we have to open default CF, because of an implementation limitation, + // currently we just use the same CF option from input, which is not collect + // and open may fail. + if (compaction_input.column_family.name != kDefaultColumnFamilyName) { + column_families.emplace_back(kDefaultColumnFamilyName, + compaction_input.column_family.options); + } + + DB* db; + std::vector handles; + + s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory, + column_families, &handles, &db); + if (!s.ok()) { + return s; + } + + CompactionServiceResult compaction_result; + DBImplSecondary* db_secondary = static_cast_with_check(db); + assert(handles.size() > 0); + s = db_secondary->CompactWithoutInstallation(handles[0], compaction_input, + &compaction_result); + + Status serialization_status = compaction_result.Write(result); + + for (auto& handle : handles) { + delete handle; + } + delete db; + if (s.ok()) { + return serialization_status; + } + return s; +} + #else // !ROCKSDB_LITE Status DB::OpenAsSecondary(const Options& /*options*/, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,9 @@ #include #include + #include "db/db_impl/db_impl.h" +#include "logging/logging.h" namespace ROCKSDB_NAMESPACE { @@ -71,14 +73,15 @@ // effort attempts to catch up with the primary. 
class DBImplSecondary : public DBImpl { public: - DBImplSecondary(const DBOptions& options, const std::string& dbname); + DBImplSecondary(const DBOptions& options, const std::string& dbname, + std::string secondary_path); ~DBImplSecondary() override; // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ // and log_readers_ to facilitate future operations. Status Recover(const std::vector& column_families, - bool read_only, bool error_if_log_file_exist, - bool error_if_data_exists_in_logs, + bool read_only, bool error_if_wal_file_exists, + bool error_if_data_exists_in_wals, uint64_t* = nullptr) override; // Implementations of the DB interface @@ -96,7 +99,9 @@ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, ColumnFamilyData* cfd, SequenceNumber snapshot, - ReadCallback* read_callback); + ReadCallback* read_callback, + bool expose_blob_index = false, + bool allow_refresh = true); Status NewIterators(const ReadOptions& options, const std::vector& column_families, @@ -222,6 +227,14 @@ // not flag the missing file as inconsistency. 
Status CheckConsistency() override; +#ifndef NDEBUG + Status TEST_CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result) { + return CompactWithoutInstallation(cfh, input, result); + } +#endif // NDEBUG + protected: // ColumnFamilyCollector is a write batch handler which does nothing // except recording unique column family IDs @@ -269,6 +282,20 @@ return AddColumnFamilyId(column_family_id); } + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkNoop(bool) override { return Status::OK(); } + const std::unordered_set& column_families() const { return column_family_ids_; } @@ -316,6 +343,13 @@ std::unordered_set* cfds_changed, JobContext* job_context); + // Run compaction without installation, the output files will be placed in the + // secondary DB path. The LSM tree won't be changed, the secondary DB is still + // in read-only mode. + Status CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result); + std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; @@ -326,6 +360,8 @@ // Current WAL number replayed for each column family. 
std::unordered_map cfd_to_current_log_; + + const std::string secondary_path_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,14 +6,16 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl/db_impl.h" - #include + +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" #include "test_util/sync_point.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { // Convenience methods @@ -24,7 +26,7 @@ Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); if (!cfh->cfd()->ioptions()->merge_operator) { return Status::NotSupported("Provide a merge_operator when opening DB"); } else { @@ -73,10 +75,16 @@ if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); - if (tracer_) { - tracer_->Write(my_batch); + if (tracer_ && !tracer_->IsWriteOrderPreserved()) { + // We don't have to preserve write order so can trace anywhere. It's more + // efficient to trace here than to add latency to a phase of the log/apply + // pipeline. 
+ // TODO: maybe handle the tracing status? + tracer_->Write(my_batch).PermitUncheckedError(); } } if (write_options.sync && write_options.disableWAL) { @@ -100,11 +108,10 @@ assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || disable_memtable); - Status status; if (write_options.low_pri) { - status = ThrottleLowPriWritesIfNeeded(write_options, my_batch); - if (!status.ok()) { - return status; + Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch); + if (!s.ok()) { + return s; } } @@ -124,13 +131,13 @@ ? batch_cnt // every key is a sub-batch consuming a seq : WriteBatchInternal::Count(my_batch); - uint64_t seq; + uint64_t seq = 0; // Use a write thread to i) optimize for WAL write, ii) publish last // sequence in in increasing order, iii) call pre_release_callback serially - status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, - log_used, log_ref, &seq, sub_batch_cnt, - pre_release_callback, kDoAssignOrder, - kDoPublishLastSeq, disable_memtable); + Status status = WriteImplWALOnly( + &write_thread_, write_options, my_batch, callback, log_used, log_ref, + &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL"); if (!status.ok()) { return status; @@ -154,12 +161,7 @@ PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, batch_cnt, pre_release_callback); - - if (!write_options.disableWAL) { - RecordTick(stats_, WRITE_WITH_WAL); - } - - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); write_thread_.JoinBatchGroup(&w); if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { @@ -191,8 +193,6 @@ } assert(w.state == WriteThread::STATE_COMPLETED); // STATE_COMPLETED conditional below handles exit - - status = w.FinalStatus(); } if (w.state 
== WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -206,7 +206,7 @@ } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); - + Status status; // Once reaches this point, the current writer "w" will try to do its write // job. It may also pick up some of the remaining writers in the "writers_" // when it finds suitable, and finish them in the same write batch. @@ -220,7 +220,8 @@ bool need_log_sync = write_options.sync; bool need_log_dir_sync = need_log_sync && !log_dir_synced_; - if (!two_write_queues_ || !disable_memtable) { + assert(!two_write_queues_ || !disable_memtable); + { // With concurrent writes we do preprocess only in the write thread that // also does write to memtable to avoid sync issue on shared data structure // with the other thread @@ -250,7 +251,20 @@ last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + IOStatus io_s; + Status pre_release_cb_status; if (status.ok()) { + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } // Rules for when we can update the memtable concurrently // 1. supported by memtable // 2. 
Puts are not okay if inplace_update_support @@ -322,21 +336,22 @@ if (!two_write_queues_) { if (status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); - status = WriteToWAL(write_group, log_writer, log_used, need_log_sync, - need_log_dir_sync, last_sequence + 1); + io_s = WriteToWAL(write_group, log_writer, log_used, need_log_sync, + need_log_dir_sync, last_sequence + 1); } } else { if (status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL - status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, - seq_inc); + io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, + seq_inc); } else { // Otherwise we inc seq number for memtable writes last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); } } + status = io_s; assert(last_sequence != kMaxSequenceNumber); const SequenceNumber current_sequence = last_sequence + 1; last_sequence += seq_inc; @@ -359,7 +374,7 @@ writer->sequence, disable_memtable, writer->log_used, index++, pre_release_callback_cnt); if (!ws.ok()) { - status = ws; + status = pre_release_cb_status = ws; break; } } @@ -411,12 +426,23 @@ PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - WriteStatusCheck(status); + if (!io_s.ok()) { + assert(pre_release_cb_status.ok()); + IOStatusCheck(io_s); + } else { + WriteStatusCheck(pre_release_cb_status); + } + } else { + assert(io_s.ok() && pre_release_cb_status.ok()); } if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); // Requesting sync with two_write_queues_ is expected to be very rare. We // hence provide a simple implementation that is not necessarily efficient. 
@@ -456,13 +482,14 @@ uint64_t* log_used, uint64_t log_ref, bool disable_memtable, uint64_t* seq_used) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); WriteContext write_context; WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable); write_thread_.JoinBatchGroup(&w); + TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"); if (w.state == WriteThread::STATE_GROUP_LEADER) { WriteThread::WriteGroup wal_write_group; if (w.callback && !w.callback->AllowWriteBatching()) { @@ -487,6 +514,17 @@ size_t total_byte_size = 0; if (w.status.ok()) { + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : wal_write_group) { + // TODO: maybe handle the tracing status? 
+ tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } SequenceNumber next_sequence = current_sequence; for (auto writer : wal_write_group) { if (writer->CheckCallback(this)) { @@ -515,6 +553,9 @@ PERF_TIMER_STOP(write_pre_and_post_process_time); + IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized + if (w.status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); @@ -524,24 +565,38 @@ wal_write_group.size - 1); RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); } - w.status = WriteToWAL(wal_write_group, log_writer, log_used, - need_log_sync, need_log_dir_sync, current_sequence); + io_s = WriteToWAL(wal_write_group, log_writer, log_used, need_log_sync, + need_log_dir_sync, current_sequence); + w.status = io_s; } if (!w.CallbackFailed()) { - WriteStatusCheck(w.status); + if (!io_s.ok()) { + IOStatusCheck(io_s); + } else { + WriteStatusCheck(w.status); + } } if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status); + if (w.status.ok()) { + w.status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } + // NOTE: the memtable_write_group is declared before the following + // `if` statement because its lifetime needs to be longer + // that the inner context of the `if` as a reference to it + // may be used further below within the outer _write_thread WriteThread::WriteGroup memtable_write_group; + if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { PERF_TIMER_GUARD(write_memtable_time); assert(w.ShouldWriteToMemtable()); @@ -558,6 +613,10 @@ versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } + } else { + // NOTE: the memtable_write_group is never really used, + // so we 
need to set its status to pass ASSERT_STATUS_CHECKED + memtable_write_group.status.PermitUncheckedError(); } if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { @@ -590,7 +649,7 @@ SequenceNumber seq, const size_t sub_batch_cnt) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); WriteThread::Writer w(write_options, my_batch, callback, log_ref, false /*disable_memtable*/); @@ -610,8 +669,6 @@ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/, write_options.memtable_insert_hint_per_batch); - - WriteStatusCheck(w.status); if (write_options.disableWAL) { has_unpersisted_data_.store(true, std::memory_order_relaxed); } @@ -626,6 +683,7 @@ std::lock_guard lck(switch_mutex_); switch_cv_.notify_all(); } + WriteStatusCheck(w.status); if (!w.FinalStatus().ok()) { return w.FinalStatus(); @@ -642,12 +700,10 @@ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, const PublishLastSeq publish_last_seq, const bool disable_memtable) { - Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, sub_batch_cnt, pre_release_callback); - RecordTick(stats_, WRITE_WITH_WAL); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); @@ -664,6 +720,8 @@ assert(w.state == WriteThread::STATE_GROUP_LEADER); if (publish_last_seq == kDoPublishLastSeq) { + Status status; + // Currently we only use kDoPublishLastSeq in unordered_write assert(immutable_db_options_.unordered_write); WriteContext write_context; @@ 
-676,7 +734,7 @@ InstrumentedMutexLock l(&mutex_); bool need_log_sync = false; status = PreprocessWrite(write_options, &need_log_sync, &write_context); - WriteStatusCheck(status); + WriteStatusCheckOnLocked(status); } if (!status.ok()) { WriteThread::WriteGroup write_group; @@ -691,6 +749,17 @@ write_thread->EnterAsBatchGroupLeader(&w, &write_group); // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } size_t pre_release_callback_cnt = 0; size_t total_byte_size = 0; @@ -740,9 +809,12 @@ } seq_inc = total_batch_cnt; } + Status status; + IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized if (!write_options.disableWAL) { - status = - ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + status = io_s; } else { // Otherwise we inc seq number to do solely the seq allocation last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); @@ -777,7 +849,11 @@ PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - WriteStatusCheck(status); + if (!io_s.ok()) { + IOStatusCheck(io_s); + } else { + WriteStatusCheck(status); + } } if (status.ok()) { size_t index = 0; @@ -812,17 +888,45 @@ return status; } +void DBImpl::WriteStatusCheckOnLocked(const Status& status) { + // Is setting bg_error_ enough here? This will at least stop + // compaction and fail any further writes. + // Caller must hold mutex_. 
+ assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok()); + mutex_.AssertHeld(); + if (immutable_db_options_.paranoid_checks && !status.ok() && + !status.IsBusy() && !status.IsIncomplete()) { + // Maybe change the return status to void? + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); + } +} + void DBImpl::WriteStatusCheck(const Status& status) { // Is setting bg_error_ enough here? This will at least stop // compaction and fail any further writes. + assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok()); if (immutable_db_options_.paranoid_checks && !status.ok() && !status.IsBusy() && !status.IsIncomplete()) { mutex_.Lock(); + // Maybe change the return status to void? error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } +void DBImpl::IOStatusCheck(const IOStatus& io_status) { + // Is setting bg_error_ enough here? This will at least stop + // compaction and fail any further writes. + if ((immutable_db_options_.paranoid_checks && !io_status.ok() && + !io_status.IsBusy() && !io_status.IsIncomplete()) || + io_status.IsIOFenced()) { + mutex_.Lock(); + // Maybe change the return status to void? + error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback); + mutex_.Unlock(); + } +} + void DBImpl::MemTableInsertStatusCheck(const Status& status) { // A non-OK status here indicates that the state implied by the // WAL has diverged from the in-memory state. This could be @@ -832,7 +936,9 @@ if (!status.ok()) { mutex_.Lock(); assert(!error_handler_.IsBGWorkStopped()); - error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable); + // Maybe change the return status to void? + error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable) + .PermitUncheckedError(); mutex_.Unlock(); } } @@ -865,7 +971,7 @@ // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. 
WaitForPendingWrites(); - status = HandleWriteBufferFull(write_context); + status = HandleWriteBufferManagerFlush(write_context); } if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { @@ -892,6 +998,20 @@ PERF_TIMER_START(write_pre_and_post_process_time); } + // If memory usage exceeded beyond a certain threshold, + // write_buffer_manager_->ShouldStall() returns true to all threads writing to + // all DBs and writers will be stalled. + // It does soft checking because WriteBufferManager::buffer_limit_ has already + // exceeded at this point so no new write (including current one) will go + // through until memory usage is decreased. + if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) { + if (write_options.no_slowdown) { + status = Status::Incomplete("Write stall"); + } else { + WriteBufferManagerStallWrites(); + } + } + if (status.ok() && *need_log_sync) { // Wait until the parallel syncs are finished. Any sync process has to sync // the front log too so it is enough to check the status of front() @@ -946,8 +1066,10 @@ merged_batch = tmp_batch; for (auto writer : write_group) { if (!writer->CallbackFailed()) { - WriteBatchInternal::Append(merged_batch, writer->batch, - /*WAL_only*/ true); + Status s = WriteBatchInternal::Append(merged_batch, writer->batch, + /*WAL_only*/ true); + // Always returns Status::OK. + assert(s.ok()); if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) { // We only need to cache the last of such write batch *to_be_cached_state = writer->batch; @@ -961,10 +1083,20 @@ // When two_write_queues_ is disabled, this function is called from the only // write thread. Otherwise this must be called holding log_write_mutex_. 
-Status DBImpl::WriteToWAL(const WriteBatch& merged_batch, - log::Writer* log_writer, uint64_t* log_used, - uint64_t* log_size) { +IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, + log::Writer* log_writer, uint64_t* log_used, + uint64_t* log_size, + bool with_db_mutex, bool with_log_mutex) { assert(log_size != nullptr); + + // Assert mutex explicitly. + if (with_db_mutex) { + mutex_.AssertHeld(); + } else if (two_write_queues_) { + log_write_mutex_.AssertHeld(); + assert(with_log_mutex); + } + Slice log_entry = WriteBatchInternal::Contents(&merged_batch); *log_size = log_entry.size(); // When two_write_queues_ WriteToWAL has to be protected from concurretn calls @@ -978,7 +1110,8 @@ if (UNLIKELY(needs_locking)) { log_write_mutex_.Lock(); } - Status status = log_writer->AddRecord(log_entry); + IOStatus io_s = log_writer->AddRecord(log_entry); + if (UNLIKELY(needs_locking)) { log_write_mutex_.Unlock(); } @@ -986,19 +1119,22 @@ *log_used = logfile_number_; } total_log_size_ += log_entry.size(); - // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here - // since alive_log_files_ might be modified concurrently - alive_log_files_.back().AddSize(log_entry.size()); + if (with_db_mutex || with_log_mutex) { + assert(alive_log_files_tail_ == alive_log_files_.rbegin()); + assert(alive_log_files_tail_ != alive_log_files_.rend()); + } + LogFileNumberSize& last_alive_log = *alive_log_files_tail_; + last_alive_log.AddSize(*log_size); log_empty_ = false; - return status; + return io_s; } -Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, - log::Writer* log_writer, uint64_t* log_used, - bool need_log_sync, bool need_log_dir_sync, - SequenceNumber sequence) { - Status status; - +IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* log_used, + bool need_log_sync, bool need_log_dir_sync, + SequenceNumber sequence) { + IOStatus io_s; + assert(!two_write_queues_); 
assert(!write_group.leader->disable_wal); // Same holds for all in the batch group size_t write_with_wal = 0; @@ -1016,14 +1152,14 @@ WriteBatchInternal::SetSequence(merged_batch, sequence); uint64_t log_size; - status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; } - if (status.ok() && need_log_sync) { - StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + if (io_s.ok() && need_log_sync) { + StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS); // It's safe to access logs_ with unlocked mutex_ here because: // - we've set getting_synced=true for all logs, // so other threads won't pop from logs_ while we're here, @@ -1031,24 +1167,43 @@ // writer thread, so no one will push to logs_, // - as long as other threads don't modify it, it's safe to read // from std::deque from multiple threads concurrently. + // + // Sync operation should work with locked log_write_mutex_, because: + // when DBOptions.manual_wal_flush_ is set, + // FlushWAL function will be invoked by another thread. + // if without locked log_write_mutex_, the log file may get data + // corruption + + const bool needs_locking = manual_wal_flush_ && !two_write_queues_; + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Lock(); + } + for (auto& log : logs_) { - status = log.writer->file()->Sync(immutable_db_options_.use_fsync); - if (!status.ok()) { + io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync); + if (!io_s.ok()) { break; } } - if (status.ok() && need_log_dir_sync) { + + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Unlock(); + } + + if (io_s.ok() && need_log_dir_sync) { // We only sync WAL directory the first time WAL syncing is // requested, so that in case users never turn on WAL sync, // we can avoid the disk I/O in the write code path. 
- status = directories_.GetWalDir()->Fsync(); + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } } if (merged_batch == &tmp_batch_) { tmp_batch_.Clear(); } - if (status.ok()) { + if (io_s.ok()) { auto stats = default_cf_internal_stats_; if (need_log_sync) { stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); @@ -1059,15 +1214,15 @@ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } - return status; + return io_s; } -Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, - uint64_t* log_used, - SequenceNumber* last_sequence, - size_t seq_inc) { - Status status; +IOStatus DBImpl::ConcurrentWriteToWAL( + const WriteThread::WriteGroup& write_group, uint64_t* log_used, + SequenceNumber* last_sequence, size_t seq_inc) { + IOStatus io_s; + assert(two_write_queues_ || immutable_db_options_.unordered_write); assert(!write_group.leader->disable_wal); // Same holds for all in the batch group WriteBatch tmp_batch; @@ -1092,14 +1247,15 @@ log::Writer* log_writer = logs_.back().writer; uint64_t log_size; - status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, + /*with_db_mutex=*/false, /*with_log_mutex=*/true); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; } log_write_mutex_.Unlock(); - if (status.ok()) { + if (io_s.ok()) { const bool concurrent = true; auto stats = default_cf_internal_stats_; stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, @@ -1109,7 +1265,7 @@ concurrent); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } - return status; + return io_s; } Status DBImpl::WriteRecoverableState() { @@ -1271,16 +1427,23 @@ } for (auto cfd : cfds) { cfd->imm()->FlushRequested(); + if 
(!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); MaybeScheduleFlushOrCompaction(); } return status; } -Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { +Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { mutex_.AssertHeld(); assert(write_context != nullptr); Status status; @@ -1292,7 +1455,7 @@ // suboptimal but still correct. ROCKS_LOG_INFO( immutable_db_options_.info_log, - "Flushing column family with oldest memtable entry. Write buffer is " + "Flushing column family with oldest memtable entry. Write buffers are " "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".", write_buffer_manager_->memory_usage(), write_buffer_manager_->buffer_size()); @@ -1350,10 +1513,17 @@ } for (const auto cfd : cfds) { cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1373,8 +1543,10 @@ uint64_t time_delayed = 0; bool delayed = false; { - StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); - uint64_t delay = write_controller_.GetDelay(env_, num_bytes); + 
StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, + &time_delayed); + uint64_t delay = + write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); if (delay > 0) { if (write_options.no_slowdown) { return Status::Incomplete("Write stall"); @@ -1386,19 +1558,21 @@ write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); - // We will delay the write until we have slept for delay ms or - // we don't need a delay anymore - const uint64_t kDelayInterval = 1000; + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore. We check for cancellation every 1ms + // (slightly longer because WriteController minimum delay is 1ms, in + // case of sleep imprecision, rounding, etc.) + const uint64_t kDelayInterval = 1001; uint64_t stall_end = sw.start_time() + delay; while (write_controller_.NeedsDelay()) { - if (env_->NowMicros() >= stall_end) { + if (immutable_db_options_.clock->NowMicros() >= stall_end) { // We already delayed this write `delay` microseconds break; } delayed = true; // Sleep for 0.001 seconds - env_->SleepForMicroseconds(kDelayInterval); + immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); write_thread_.EndWriteStall(); @@ -1444,6 +1618,29 @@ return s; } +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +void DBImpl::WriteBufferManagerStallWrites() { + mutex_.AssertHeld(); + // First block future writer threads who want to add themselves to the queue + // of WriteThread. + write_thread_.BeginWriteStall(); + mutex_.Unlock(); + + // Change the state to State::Blocked. + static_cast(wbm_stall_.get()) + ->SetState(WBMStallInterface::State::BLOCKED); + // Then WriteBufferManager will add DB instance to its queue + // and block this thread by calling WBMStallInterface::Block(). 
+ write_buffer_manager_->BeginWriteStall(wbm_stall_.get()); + wbm_stall_->Block(); + + mutex_.Lock(); + // Stall has ended. Signal writer threads so that they can add + // themselves to the WriteThread queue for writes. + write_thread_.EndWriteStall(); +} + Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch) { assert(write_options.low_pri); @@ -1517,11 +1714,9 @@ } for (auto& cfd : cfds) { autovector to_delete; - cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage()); - if (!to_delete.empty()) { - for (auto m : to_delete) { - delete m; - } + bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_, + cfd->mem()->MemoryAllocatedBytes()); + if (trimmed) { context->superversion_context.NewSuperVersion(); assert(context->superversion_context.new_superversion.get() != nullptr); cfd->InstallSuperVersion(&context->superversion_context, &mutex_); @@ -1574,10 +1769,16 @@ if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } else { + for (auto* cfd : cfds) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1605,10 +1806,9 @@ // two_write_queues_ is true (This is to simplify the reasoning.) Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); - WriteThread::Writer nonmem_w; - std::unique_ptr lfile; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; + IOStatus io_s; // Recoverable state is persisted in WAL. 
After memtable switch, WAL might // be deleted, so we write the state to memtable to be persisted as well. @@ -1654,8 +1854,11 @@ if (creating_new_log) { // TODO: Write buffer size passed in should be max of all CF's instead // of mutable_cf_options.write_buffer_size. - s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, - &new_log); + io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, + &new_log); + if (s.ok()) { + s = io_s; + } } if (s.ok()) { SequenceNumber seq = versions_->LastSequence(); @@ -1681,7 +1884,10 @@ if (!logs_.empty()) { // Alway flush the buffer of the last log before switching to a new one log::Writer* cur_log_writer = logs_.back().writer; - s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(); + if (s.ok()) { + s = io_s; + } if (!s.ok()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64 @@ -1696,6 +1902,7 @@ log_dir_synced_ = false; logs_.emplace_back(logfile_number_, new_log); alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); + alive_log_files_tail_ = alive_log_files_.rbegin(); } log_write_mutex_.Unlock(); } @@ -1703,45 +1910,92 @@ if (!s.ok()) { // how do we fail if we're not creating new log? 
assert(creating_new_log); - if (new_mem) { - delete new_mem; - } - if (new_log) { - delete new_log; - } - SuperVersion* new_superversion = - context->superversion_context.new_superversion.release(); - if (new_superversion != nullptr) { - delete new_superversion; - } + delete new_mem; + delete new_log; + context->superversion_context.new_superversion.reset(); // We may have lost data from the WritableFileBuffer in-memory buffer for // the current log, so treat it as a fatal error and set bg_error - error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); + if (!io_s.ok()) { + error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable); + } else { + error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); + } // Read back bg_error in order to get the right severity s = error_handler_.GetBGError(); return s; } - for (auto loop_cfd : *versions_->GetColumnFamilySet()) { - // all this is just optimization to delete logs that - // are no longer needed -- if CF is empty, that means it - // doesn't need that particular log to stay alive, so we just - // advance the log number. no need to persist this in the manifest - if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && - loop_cfd->imm()->NumNotFlushed() == 0) { - if (creating_new_log) { - loop_cfd->SetLogNumber(logfile_number_); + bool empty_cf_updated = false; + if (immutable_db_options_.track_and_verify_wals_in_manifest && + !immutable_db_options_.allow_2pc && creating_new_log) { + // In non-2pc mode, WALs become obsolete if they do not contain unflushed + // data. Updating the empty CF's log number might cause some WALs to become + // obsolete. So we should track the WAL obsoletion event before actually + // updating the empty CF's log number. + uint64_t min_wal_number_to_keep = + versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_); + if (min_wal_number_to_keep > + versions_->GetWalSet().GetMinWalNumberToKeep()) { + // Get a snapshot of the empty column families. 
+ // LogAndApply may release and reacquire db + // mutex, during that period, column family may become empty (e.g. its + // flush succeeds), then it affects the computed min_log_number_to_keep, + // so we take a snapshot for consistency of column family data + // status. If a column family becomes non-empty afterwards, its active log + // should still be the created new log, so the min_log_number_to_keep is + // not affected. + autovector empty_cfs; + for (auto cf : *versions_->GetColumnFamilySet()) { + if (cf->IsEmpty()) { + empty_cfs.push_back(cf); + } + } + + VersionEdit wal_deletion; + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); + s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + if (!s.ok()) { + return s; + } + + for (auto cf : empty_cfs) { + if (cf->IsEmpty()) { + cf->SetLogNumber(logfile_number_); + // MEMPURGE: No need to change this, because new adds + // should still receive new sequence numbers. + cf->mem()->SetCreationSeq(versions_->LastSequence()); + } // cf may become non-empty. + } + empty_cf_updated = true; + } + } + if (!empty_cf_updated) { + for (auto cf : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. 
no need to persist this in the manifest + if (cf->IsEmpty()) { + if (creating_new_log) { + cf->SetLogNumber(logfile_number_); + } + cf->mem()->SetCreationSeq(versions_->LastSequence()); } - loop_cfd->mem()->SetCreationSeq(versions_->LastSequence()); } } cfd->mem()->SetNextLogNumber(logfile_number_); + assert(new_mem != nullptr); cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_); new_mem->Ref(); cfd->SetMemtable(new_mem); InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, mutable_cf_options); + #ifndef ROCKSDB_LITE mutex_.Unlock(); // Notify client that memtable is sealed, now that we have successfully @@ -1749,6 +2003,10 @@ NotifyOnMemTableSealed(cfd, memtable_info); mutex_.Lock(); #endif // ROCKSDB_LITE + // It is possible that we got here without checking the value of i_os, but + // that is okay. If we did, it most likely means that s was already an error. + // In any case, ignore any unchecked error for i_os here. + io_s.PermitUncheckedError(); return s; } @@ -1792,13 +2050,20 @@ const Slice* ts = opt.timestamp; assert(nullptr != ts); size_t ts_sz = ts->size(); - WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0, - ts_sz); - Status s = batch.Put(column_family, key, value); - if (!s.ok()) { - return s; + assert(column_family->GetComparator()); + assert(ts_sz == column_family->GetComparator()->timestamp_size()); + WriteBatch batch; + Status s; + if (key.data() + key.size() == ts->data()) { + Slice key_with_ts = Slice(key.data(), key.size() + ts_sz); + s = batch.Put(column_family, key_with_ts, value); + } else { + std::array key_with_ts_slices{{key, *ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{value}}; + SliceParts values(value_slices.data(), 1); + s = batch.Put(column_family, key_with_ts, values); } - s = batch.AssignTimestamp(*ts); if (!s.ok()) { return s; } @@ -1807,23 +2072,77 @@ Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, const 
Slice& key) { + if (nullptr == opt.timestamp) { + WriteBatch batch; + Status s = batch.Delete(column_family, key); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); + } + const Slice* ts = opt.timestamp; + assert(ts != nullptr); + size_t ts_sz = ts->size(); + assert(column_family->GetComparator()); + assert(ts_sz == column_family->GetComparator()->timestamp_size()); WriteBatch batch; - batch.Delete(column_family, key); + Status s; + if (key.data() + key.size() == ts->data()) { + Slice key_with_ts = Slice(key.data(), key.size() + ts_sz); + s = batch.Delete(column_family, key_with_ts); + } else { + std::array key_with_ts_slices{{key, *ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + s = batch.Delete(column_family, key_with_ts); + } + if (!s.ok()) { + return s; + } return Write(opt, &batch); } Status DB::SingleDelete(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key) { + Status s; + if (opt.timestamp == nullptr) { + WriteBatch batch; + s = batch.SingleDelete(column_family, key); + if (!s.ok()) { + return s; + } + s = Write(opt, &batch); + return s; + } + + const Slice* ts = opt.timestamp; + assert(ts != nullptr); + size_t ts_sz = ts->size(); + assert(column_family->GetComparator()); + assert(ts_sz == column_family->GetComparator()->timestamp_size()); WriteBatch batch; - batch.SingleDelete(column_family, key); - return Write(opt, &batch); + if (key.data() + key.size() == ts->data()) { + Slice key_with_ts = Slice(key.data(), key.size() + ts_sz); + s = batch.SingleDelete(column_family, key_with_ts); + } else { + std::array key_with_ts_slices{{key, *ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + s = batch.SingleDelete(column_family, key_with_ts); + } + if (!s.ok()) { + return s; + } + s = Write(opt, &batch); + return s; } Status DB::DeleteRange(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& begin_key, const Slice& end_key) { WriteBatch batch; - batch.DeleteRange(column_family, 
begin_key, end_key); + Status s = batch.DeleteRange(column_family, begin_key, end_key); + if (!s.ok()) { + return s; + } return Write(opt, &batch); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,869 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "db/db_impl/db_impl_secondary.h" -#include "db/db_test_util.h" -#include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" -#include "test_util/sync_point.h" - -namespace ROCKSDB_NAMESPACE { - -#ifndef ROCKSDB_LITE -class DBSecondaryTest : public DBTestBase { - public: - DBSecondaryTest() - : DBTestBase("/db_secondary_test"), - secondary_path_(), - handles_secondary_(), - db_secondary_(nullptr) { - secondary_path_ = - test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); - } - - ~DBSecondaryTest() override { - CloseSecondary(); - if (getenv("KEEP_DB") != nullptr) { - fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); - } else { - Options options; - options.env = env_; - EXPECT_OK(DestroyDB(secondary_path_, options)); - } - } - - protected: - Status ReopenAsSecondary(const Options& options) { - return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); - } - - void OpenSecondary(const Options& options); - - void OpenSecondaryWithColumnFamilies( - const std::vector& column_families, const Options& options); - - void CloseSecondary() { - for (auto h : handles_secondary_) { - db_secondary_->DestroyColumnFamilyHandle(h); - } - handles_secondary_.clear(); - delete db_secondary_; - db_secondary_ = nullptr; - } - - DBImplSecondary* db_secondary_full() { - return static_cast(db_secondary_); - } - - void CheckFileTypeCounts(const std::string& dir, int expected_log, - int expected_sst, int expected_manifest) const; - - std::string secondary_path_; - std::vector handles_secondary_; - DB* db_secondary_; -}; - -void DBSecondaryTest::OpenSecondary(const Options& options) { - Status s = - DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_); - ASSERT_OK(s); -} - -void DBSecondaryTest::OpenSecondaryWithColumnFamilies( - const std::vector& column_families, const Options& options) { - std::vector cf_descs; - cf_descs.emplace_back(kDefaultColumnFamilyName, options); - for (const auto& cf_name : 
column_families) { - cf_descs.emplace_back(cf_name, options); - } - Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, - &handles_secondary_, &db_secondary_); - ASSERT_OK(s); -} - -void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, - int expected_log, int expected_sst, - int expected_manifest) const { - std::vector filenames; - env_->GetChildren(dir, &filenames); - - int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; - for (auto file : filenames) { - uint64_t number; - FileType type; - if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); - sst_cnt += (type == kTableFile); - manifest_cnt += (type == kDescriptorFile); - } - } - ASSERT_EQ(expected_log, log_cnt); - ASSERT_EQ(expected_sst, sst_cnt); - ASSERT_EQ(expected_manifest, manifest_cnt); -} - -TEST_F(DBSecondaryTest, ReopenAsSecondary) { - Options options; - options.env = env_; - Reopen(options); - ASSERT_OK(Put("foo", "foo_value")); - ASSERT_OK(Put("bar", "bar_value")); - ASSERT_OK(dbfull()->Flush(FlushOptions())); - Close(); - - ASSERT_OK(ReopenAsSecondary(options)); - ASSERT_EQ("foo_value", Get("foo")); - ASSERT_EQ("bar_value", Get("bar")); - ReadOptions ropts; - ropts.verify_checksums = true; - auto db1 = static_cast(db_); - ASSERT_NE(nullptr, db1); - Iterator* iter = db1->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - if (0 == count) { - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ("bar_value", iter->value().ToString()); - } else if (1 == count) { - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("foo_value", iter->value().ToString()); - } - ++count; - } - delete iter; - ASSERT_EQ(2, count); -} - -TEST_F(DBSecondaryTest, OpenAsSecondary) { - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - for (int i = 0; i < 3; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - 
ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - ASSERT_OK(Flush()); - } - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - ReadOptions ropts; - ropts.verify_checksums = true; - const auto verify_db_func = [&](const std::string& foo_val, - const std::string& bar_val) { - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ(foo_val, value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ(bar_val, value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ(foo_val, iter->value().ToString()); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ(bar_val, iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; - }; - - verify_db_func("foo_value2", "bar_value2"); - - ASSERT_OK(Put("foo", "new_foo_value")); - ASSERT_OK(Put("bar", "new_bar_value")); - ASSERT_OK(Flush()); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db_func("new_foo_value", "new_bar_value"); -} - -namespace { -class TraceFileEnv : public EnvWrapper { - public: - explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {} - Status NewRandomAccessFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& env_options) override { - class TracedRandomAccessFile : public RandomAccessFile { - public: - TracedRandomAccessFile(std::unique_ptr&& target, - std::atomic& counter) - : target_(std::move(target)), files_closed_(counter) {} - ~TracedRandomAccessFile() override { - files_closed_.fetch_add(1, std::memory_order_relaxed); - } - Status Read(uint64_t offset, 
size_t n, Slice* result, - char* scratch) const override { - return target_->Read(offset, n, result, scratch); - } - - private: - std::unique_ptr target_; - std::atomic& files_closed_; - }; - Status s = target()->NewRandomAccessFile(f, r, env_options); - if (s.ok()) { - r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_)); - } - return s; - } - - int files_closed() const { - return files_closed_.load(std::memory_order_relaxed); - } - - private: - std::atomic files_closed_{0}; -}; -} // namespace - -TEST_F(DBSecondaryTest, SecondaryCloseFiles) { - Options options; - options.env = env_; - options.max_open_files = 1; - options.disable_auto_compactions = true; - Reopen(options); - Options options1; - std::unique_ptr traced_env(new TraceFileEnv(env_)); - options1.env = traced_env.get(); - OpenSecondary(options1); - - static const auto verify_db = [&]() { - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); - std::unique_ptr iter2(db_secondary_->NewIterator(ReadOptions())); - for (iter1->SeekToFirst(), iter2->SeekToFirst(); - iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { - ASSERT_EQ(iter1->key(), iter2->key()); - ASSERT_EQ(iter1->value(), iter2->value()); - } - ASSERT_FALSE(iter1->Valid()); - ASSERT_FALSE(iter2->Valid()); - }; - - ASSERT_OK(Put("a", "value")); - ASSERT_OK(Put("c", "value")); - ASSERT_OK(Flush()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(); - - ASSERT_OK(Put("b", "value")); - ASSERT_OK(Put("d", "value")); - ASSERT_OK(Flush()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(); - - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ASSERT_EQ(2, static_cast(traced_env.get())->files_closed()); - - Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}}); - ASSERT_TRUE(s.IsNotSupported()); - CloseSecondary(); -} - -TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { - Options options; - 
options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - for (int i = 0; i < 3; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - } - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - ReadOptions ropts; - ropts.verify_checksums = true; - const auto verify_db_func = [&](const std::string& foo_val, - const std::string& bar_val) { - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ(foo_val, value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ(bar_val, value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ(foo_val, iter->value().ToString()); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ(bar_val, iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; - }; - - verify_db_func("foo_value2", "bar_value2"); - - ASSERT_OK(Put("foo", "new_foo_value")); - ASSERT_OK(Put("bar", "new_bar_value")); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db_func("new_foo_value", "new_bar_value"); - - ASSERT_OK(Flush()); - ASSERT_OK(Put("foo", "new_foo_value_1")); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db_func("new_foo_value_1", "new_bar_value"); -} - -TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { - Options options; - options.env = env_; - CreateAndReopenWithCF({"pikachu"}, options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - std::vector cf_descs; - cf_descs.emplace_back(kDefaultColumnFamilyName, options1); - cf_descs.emplace_back("pikachu", options1); - cf_descs.emplace_back("eevee", 
options1); - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs, - &handles_secondary_, &db_secondary_); - ASSERT_NOK(s); -} - -TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { - Options options; - options.env = env_; - CreateAndReopenWithCF({"pikachu"}, options); - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - ASSERT_EQ(0, handles_secondary_.size()); - ASSERT_NE(nullptr, db_secondary_); - - ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); - ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); - ASSERT_OK(Flush(0 /*cf*/)); - ASSERT_OK(Flush(1 /*cf*/)); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ("foo_value", value); -} - -TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { - Options options; - options.env = env_; - Reopen(options); - Close(); - - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->LoadDependency( - {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", - "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, - {"VersionSet::ProcessManifestWrites:AfterNewManifest", - "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" - "1"}}); - SyncPoint::GetInstance()->EnableProcessing(); - - // Make sure db calls RecoverLogFiles so as to trigger a manifest write, - // which causes the db to switch to a new MANIFEST upon start. 
- port::Thread ro_db_thread([&]() { - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - CloseSecondary(); - }); - Reopen(options); - ro_db_thread.join(); -} - -TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - ASSERT_OK(dbfull()->Flush(FlushOptions())); - } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; -} - -TEST_F(DBSecondaryTest, MissingTableFile) { - int table_files_not_exist = 0; - SyncPoint::GetInstance()->DisableProcessing(); - 
SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", - [&](void* arg) { - Status s = *reinterpret_cast(arg); - if (s.IsPathNotFound()) { - ++table_files_not_exist; - } else if (!s.ok()) { - assert(false); // Should not reach here - } - }); - SyncPoint::GetInstance()->EnableProcessing(); - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - ASSERT_OK(dbfull()->Flush(FlushOptions())); - } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - ASSERT_NE(nullptr, db_secondary_full()); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - 
ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; -} - -TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { - Options options; - options.env = env_; - const std::string kCfName1 = "pikachu"; - CreateAndReopenWithCF({kCfName1}, options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondaryWithColumnFamilies({kCfName1}, options1); - ASSERT_EQ(2, handles_secondary_.size()); - - ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); - ASSERT_OK(Flush(1 /*cf*/)); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); - ASSERT_EQ("foo_val_1", value); - - ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - Close(); - CheckFileTypeCounts(dbname_, 1, 0, 1); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - value.clear(); - ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); - ASSERT_EQ("foo_val_1", value); -} - -TEST_F(DBSecondaryTest, SwitchManifest) { - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - const int kNumFiles = options.level0_file_num_compaction_trigger - 1; - // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1, - // ..., 9. 
- const int kNumKeys = 10; - // Create two sst - for (int i = 0; i != kNumFiles; ++i) { - for (int j = 0; j != kNumKeys; ++j) { - ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); - } - ASSERT_OK(Flush()); - } - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - const auto& range_scan_db = [&]() { - ReadOptions tmp_ropts; - tmp_ropts.total_order_seek = true; - tmp_ropts.verify_checksums = true; - std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); - int cnt = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { - ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); - ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), - iter->value().ToString()); - } - }; - - range_scan_db(); - - // While secondary instance still keeps old MANIFEST open, we close primary, - // restart primary, performs full compaction, close again, restart again so - // that next time secondary tries to catch up with primary, the secondary - // will skip the MANIFEST in middle. - Reopen(options); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - Reopen(options); - ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - range_scan_db(); -} - -// Here, "Snapshot" refers to the version edits written by -// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after -// switching from the old one. 
-TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { - Options options; - options.env = env_; - options.disable_auto_compactions = true; - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - ASSERT_OK(Put("0", "value0")); - ASSERT_OK(Flush()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - std::string value; - ReadOptions ropts; - ropts.verify_checksums = true; - ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); - ASSERT_EQ("value0", value); - - Reopen(options); - ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); -} - -TEST_F(DBSecondaryTest, SwitchWAL) { - const int kNumKeysPerMemtable = 1; - Options options; - options.env = env_; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 2; - options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - const auto& verify_db = [](DB* db1, DB* db2) { - ASSERT_NE(nullptr, db1); - ASSERT_NE(nullptr, db2); - ReadOptions read_opts; - read_opts.verify_checksums = true; - std::unique_ptr it1(db1->NewIterator(read_opts)); - std::unique_ptr it2(db2->NewIterator(read_opts)); - it1->SeekToFirst(); - it2->SeekToFirst(); - for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { - ASSERT_EQ(it1->key(), it2->key()); - ASSERT_EQ(it1->value(), it2->value()); - } - ASSERT_FALSE(it1->Valid()); - ASSERT_FALSE(it2->Valid()); - - for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { - std::string value; - ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); - ASSERT_EQ(it1->value(), value); - } - for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { - std::string value; - ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); - ASSERT_EQ(it2->value(), value); - } - }; - for (int k = 0; k != 16; 
++k) { - ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(dbfull(), db_secondary_); - } -} - -TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { - const int kNumKeysPerMemtable = 1; - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", - "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); - SyncPoint::GetInstance()->EnableProcessing(); - const std::string kCFName1 = "pikachu"; - Options options; - options.env = env_; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 2; - options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); - CreateAndReopenWithCF({kCFName1}, options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondaryWithColumnFamilies({kCFName1}, options1); - ASSERT_EQ(2, handles_secondary_.size()); - - const auto& verify_db = [](DB* db1, - const std::vector& handles1, - DB* db2, - const std::vector& handles2) { - ASSERT_NE(nullptr, db1); - ASSERT_NE(nullptr, db2); - ReadOptions read_opts; - read_opts.verify_checksums = true; - ASSERT_EQ(handles1.size(), handles2.size()); - for (size_t i = 0; i != handles1.size(); ++i) { - std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); - std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); - it1->SeekToFirst(); - it2->SeekToFirst(); - for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { - ASSERT_EQ(it1->key(), it2->key()); - ASSERT_EQ(it1->value(), it2->value()); - } - ASSERT_FALSE(it1->Valid()); - ASSERT_FALSE(it2->Valid()); - - for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { - std::string value; - ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); - ASSERT_EQ(it1->value(), value); - } - for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { - std::string value; - 
ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); - ASSERT_EQ(it2->value(), value); - } - } - }; - for (int k = 0; k != 8; ++k) { - ASSERT_OK( - Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); - ASSERT_OK( - Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); - TEST_SYNC_POINT( - "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); - SyncPoint::GetInstance()->ClearTrace(); - } -} - -TEST_F(DBSecondaryTest, CatchUpAfterFlush) { - const int kNumKeysPerMemtable = 16; - Options options; - options.env = env_; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 2; - options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - WriteOptions write_opts; - WriteBatch wb; - wb.Put("key0", "value0"); - wb.Put("key1", "value1"); - ASSERT_OK(dbfull()->Write(write_opts, &wb)); - ReadOptions read_opts; - std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); - iter1->Seek("key0"); - ASSERT_FALSE(iter1->Valid()); - iter1->Seek("key1"); - ASSERT_FALSE(iter1->Valid()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - iter1->Seek("key0"); - ASSERT_FALSE(iter1->Valid()); - iter1->Seek("key1"); - ASSERT_FALSE(iter1->Valid()); - std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); - iter2->Seek("key0"); - ASSERT_TRUE(iter2->Valid()); - ASSERT_EQ("value0", iter2->value()); - iter2->Seek("key1"); - ASSERT_TRUE(iter2->Valid()); - ASSERT_EQ("value1", iter2->value()); - - { - WriteBatch wb1; - wb1.Put("key0", "value01"); - wb1.Put("key1", "value11"); - ASSERT_OK(dbfull()->Write(write_opts, &wb1)); - } - - { - WriteBatch wb2; - wb2.Put("key0", "new_value0"); - wb2.Delete("key1"); - 
ASSERT_OK(dbfull()->Write(write_opts, &wb2)); - } - - ASSERT_OK(Flush()); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); - // iter3 should not see value01 and value11 at all. - iter3->Seek("key0"); - ASSERT_TRUE(iter3->Valid()); - ASSERT_EQ("new_value0", iter3->value()); - iter3->Seek("key1"); - ASSERT_FALSE(iter3->Valid()); -} - -TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { - bool called = false; - Options options; - options.env = env_; - options.disable_auto_compactions = true; - Reopen(options); - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack( - "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { - ASSERT_NE(nullptr, arg); - called = true; - auto* s = reinterpret_cast(arg); - ASSERT_NOK(*s); - }); - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", - "BackgroundCallCompaction:0"}, - {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", - "DBImpl::CheckConsistency:BeforeGetFileSize"}}); - SyncPoint::GetInstance()->EnableProcessing(); - - ASSERT_OK(Put("a", "value0")); - ASSERT_OK(Put("c", "value0")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("b", "value1")); - ASSERT_OK(Put("d", "value1")); - ASSERT_OK(Flush()); - port::Thread thread([this]() { - Options opts; - opts.env = env_; - opts.max_open_files = -1; - OpenSecondary(opts); - }); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - thread.join(); - ASSERT_TRUE(called); -} -#endif //! 
ROCKSDB_LITE - -} // namespace ROCKSDB_NAMESPACE - -int main(int argc, char** argv) { - ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,8 @@ namespace ROCKSDB_NAMESPACE { void DumpDBFileSummary(const ImmutableDBOptions& options, - const std::string& dbname) { + const std::string& dbname, + const std::string& session_id) { if (options.info_log == nullptr) { return; } @@ -32,6 +33,8 @@ std::string file_info, wal_info; Header(options.info_log, "DB SUMMARY\n"); + Header(options.info_log, "DB Session ID: %s\n", session_id.c_str()); + // Get files in dbname dir if (!env->GetChildren(dbname, &files).ok()) { Error(options.info_log, @@ -50,16 +53,25 @@ Header(options.info_log, "IDENTITY file: %s\n", file.c_str()); break; case kDescriptorFile: - env->GetFileSize(dbname + "/" + file, &file_size); - Header(options.info_log, "MANIFEST file: %s size: %" PRIu64 " Bytes\n", - file.c_str(), file_size); - break; - case kLogFile: - env->GetFileSize(dbname + "/" + file, &file_size); - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: "). 
- append(str).append(" ; "); + if (env->GetFileSize(dbname + "/" + file, &file_size).ok()) { + Header(options.info_log, + "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(), + file_size); + } else { + Error(options.info_log, "Error when reading MANIFEST file: %s/%s\n", + dbname.c_str(), file.c_str()); + } + break; + case kWalFile: + if (env->GetFileSize(dbname + "/" + file, &file_size).ok()) { + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); + } else { + Error(options.info_log, "Error when reading LOG file: %s/%s\n", + dbname.c_str(), file.c_str()); + } break; case kTableFile: if (++file_num < 10) { @@ -97,27 +109,30 @@ } // Get wal file in wal_dir - if (dbname.compare(options.wal_dir) != 0) { - if (!env->GetChildren(options.wal_dir, &files).ok()) { - Error(options.info_log, - "Error when reading %s dir\n", - options.wal_dir.c_str()); + const auto& wal_dir = options.GetWalDir(dbname); + if (!options.IsWalDirSameAsDBPath(dbname)) { + if (!env->GetChildren(wal_dir, &files).ok()) { + Error(options.info_log, "Error when reading %s dir\n", wal_dir.c_str()); return; } wal_info.clear(); for (const std::string& file : files) { if (ParseFileName(file, &number, &type)) { - if (type == kLogFile) { - env->GetFileSize(options.wal_dir + "/" + file, &file_size); - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: "). 
- append(str).append(" ; "); + if (type == kWalFile) { + if (env->GetFileSize(wal_dir + "/" + file, &file_size).ok()) { + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); + } else { + Error(options.info_log, "Error when reading LOG file %s/%s\n", + wal_dir.c_str(), file.c_str()); + } } } } } - Header(options.info_log, "Write Ahead Log file in %s: %s\n", - options.wal_dir.c_str(), wal_info.c_str()); + Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(), + wal_info.c_str()); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,5 +10,6 @@ namespace ROCKSDB_NAMESPACE { void DumpDBFileSummary(const ImmutableDBOptions& options, - const std::string& dbname); + const std::string& dbname, + const std::string& session_id = ""); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,8 @@ class DBTestInPlaceUpdate : public DBTestBase { public: - DBTestInPlaceUpdate() : DBTestBase("/db_inplace_update_test") {} + DBTestInPlaceUpdate() + : DBTestBase("db_inplace_update_test", /*env_do_fsync=*/true) {} }; TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) { @@ -168,6 +169,36 @@ ASSERT_EQ(Get(1, "key"), "NOT_FOUND"); } while (ChangeCompactOptions()); } + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) { + do { + Options options = 
CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size, and + // run GetSnapshot and ReleaseSnapshot + int numValues = 2; + for (int i = numValues; i > 0; i--) { + const Snapshot* s = db_->GetSnapshot(); + ASSERT_EQ(nullptr, s); + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); + // release s (nullptr) + db_->ReleaseSnapshot(s); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_io_failure_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_io_failure_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_io_failure_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_io_failure_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,14 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBIOFailureTest : public DBTestBase { public: - DBIOFailureTest() : DBTestBase("/db_io_failure_test") {} + DBIOFailureTest() : DBTestBase("db_io_failure_test", /*env_do_fsync=*/true) {} }; #ifndef ROCKSDB_LITE @@ -33,7 +35,7 @@ // Force out-of-space errors env_->drop_writes_.store(true, std::memory_order_release); env_->sleep_counter_.Reset(); - env_->no_slowdown_ = true; + env_->SetMockSleep(); for (int i = 0; i < 5; i++) { if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { @@ -41,11 +43,15 @@ if (level > 0 && level == dbfull()->NumberLevels() - 1) { break; } - 
dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + Status s = + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } else { - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } @@ -54,7 +60,8 @@ ASSERT_EQ("5", property_value); env_->drop_writes_.store(false, std::memory_order_release); - ASSERT_LT(CountFiles(), num_files + 3); + const size_t count = CountFiles(); + ASSERT_LT(count, num_files + 3); // Check that compaction attempts slept after errors // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler @@ -80,7 +87,8 @@ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("0", property_value); - dbfull()->TEST_FlushMemTable(true); + // ASSERT file is too short + ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption()); ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("1", property_value); @@ -164,7 +172,7 @@ ASSERT_EQ("bar", Get("foo")); // Memtable compaction (will succeed) - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo")); const int last = 2; MoveFilesToLevel(2); @@ -172,7 +180,8 @@ // Merging compaction (will fail) error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_NOK( + dbfull()->TEST_CompactRange(last, nullptr, nullptr)); // Should fail ASSERT_EQ("bar", Get("foo")); error_type->store(false, std::memory_order_release); @@ -190,7 +199,13 @@ // Merging compaction (will fail) error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + Status s = + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + if (iter == 0) { 
+ ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsIOError()); + } ASSERT_EQ("bar", Get("foo")); // Recovery: should not lose data @@ -218,18 +233,15 @@ options.paranoid_checks = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should fail, too - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo3", "bar3")); // but we're still able to read ASSERT_EQ("bar", Get(1, "foo")); @@ -242,12 +254,10 @@ ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should NOT fail - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(1, "foo3", "bar3")); } #if !(defined NDEBUG) || !defined(OS_WIN) TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { @@ -260,29 +270,29 @@ options.writable_file_max_buffer_size = 128 * 1024; options.bytes_per_sync = 128 * 1024; options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(10)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(10)); BlockBasedTableOptions table_options; table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( 
"SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Random rnd(301); std::string rnd_str = - RandomString(&rnd, static_cast(options.bytes_per_sync / 2)); - std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024); + rnd.RandomString(static_cast(options.bytes_per_sync / 2)); + std::string rnd_str_512kb = rnd.RandomString(512 * 1024); ASSERT_OK(Put(1, "foo", "bar")); // First 1MB doesn't get range synced @@ -296,7 +306,9 @@ ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -326,12 +338,11 @@ options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; Random rnd(301); std::string rnd_str = - RandomString(&rnd, static_cast(options.bytes_per_sync / 2)); - std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024); + rnd.RandomString(static_cast(options.bytes_per_sync / 2)); + std::string rnd_str_512kb = rnd.RandomString(512 * 1024); ASSERT_OK(Put(1, "foo", "bar")); // First 1MB doesn't get range synced @@ -340,21 +351,22 @@ ASSERT_OK(Put(1, "foo1_1", rnd_str)); ASSERT_OK(Put(1, "foo1_2", rnd_str)); ASSERT_OK(Put(1, "foo1_3", rnd_str)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo3_1", rnd_str)); ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - Flush(1); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -363,7 +375,9 @@ { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -383,17 +397,18 @@ options.error_if_exists = false; options.paranoid_checks = true; options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(2)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -402,7 +417,9 @@ ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -427,25 +444,25 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -454,7 +471,9 @@ { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -474,17 +493,18 @@ options.paranoid_checks = true; options.use_fsync = false; options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(2)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("sync dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -493,7 +513,9 @@ ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -519,25 +541,25 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -546,7 +568,9 @@ { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -564,5 +588,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,9 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_iter.h" -#include + #include #include +#include #include "db/dbformat.h" #include "db/merge_context.h" @@ -24,6 +25,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "trace_replay/trace_replay.h" @@ -34,21 +36,26 @@ namespace ROCKSDB_NAMESPACE { DBIter::DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, - const Comparator* cmp, InternalIterator* iter, SequenceNumber s, - bool arena_mode, uint64_t max_sequential_skip_in_iterations, + const Comparator* cmp, InternalIterator* iter, + const Version* version, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) + ColumnFamilyData* cfd, bool expose_blob_index) : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), env_(_env), - logger_(cf_options.info_log), + clock_(ioptions.clock), + logger_(ioptions.logger), user_comparator_(cmp), - merge_operator_(cf_options.merge_operator), + merge_operator_(ioptions.merge_operator.get()), iter_(iter), + version_(version), read_callback_(read_callback), sequence_(s), - statistics_(cf_options.statistics), + statistics_(ioptions.stats), + max_skip_(max_sequential_skip_in_iterations), + max_skippable_internal_keys_(read_options.max_skippable_internal_keys), num_internal_keys_skipped_(0), iterate_lower_bound_(read_options.iterate_lower_bound), iterate_upper_bound_(read_options.iterate_upper_bound), @@ -63,22 +70,26 @@ expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || read_options.auto_prefix_mode), - allow_blob_(allow_blob), + read_tier_(read_options.read_tier), + verify_checksums_(read_options.verify_checksums), + 
expose_blob_index_(expose_blob_index), is_blob_(false), arena_mode_(arena_mode), - range_del_agg_(&cf_options.internal_comparator, s), + range_del_agg_(&ioptions.internal_comparator, s), db_impl_(db_impl), cfd_(cfd), - start_seqnum_(read_options.iter_start_seqnum) { + start_seqnum_(read_options.iter_start_seqnum), + timestamp_ub_(read_options.timestamp), + timestamp_lb_(read_options.iter_start_ts), + timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) { RecordTick(statistics_, NO_ITERATOR_CREATED); - max_skip_ = max_sequential_skip_in_iterations; - max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; if (pin_thru_lifetime_) { pinned_iters_mgr_.StartPinning(); } if (iter_.iter()) { iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); } + assert(timestamp_size_ == user_comparator_.timestamp_size()); } Status DBIter::GetProperty(std::string prop_name, std::string* prop) { @@ -103,11 +114,11 @@ } bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_.key(), ikey)) { - status_ = Status::Corruption("corrupted internal key in DBIter"); + Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */); + if (!s.ok()) { + status_ = Status::Corruption("In DBIter: ", s.getState()); valid_ = false; - ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s", - iter_.key().ToString(true).c_str()); + ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState()); return false; } else { return true; @@ -118,7 +129,7 @@ assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); local_stats_.skip_count_ += num_internal_keys_skipped_; @@ -143,13 +154,13 @@ local_stats_.next_count_++; if (ok && iter_.Valid()) { - Slice prefix; if (prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix = prefix_.GetUserKey(); + const Slice prefix = 
prefix_.GetUserKey(); + FindNextUserEntry(true /* skipping the current user key */, &prefix); + } else { + FindNextUserEntry(true /* skipping the current user key */, nullptr); } - FindNextUserEntry(true /* skipping the current user key */, - prefix_same_as_start_ ? &prefix : nullptr); } else { is_key_seqnum_zero_ = false; valid_ = false; @@ -160,6 +171,43 @@ } } +bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, + const Slice& blob_index) { + assert(!is_blob_); + + if (expose_blob_index_) { // Stacked BlobDB implementation + is_blob_ = true; + return true; + } + + if (!version_) { + status_ = Status::Corruption("Encountered unexpected blob index."); + valid_ = false; + return false; + } + + // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to + // avoid having to copy options back and forth. + ReadOptions read_options; + read_options.read_tier = read_tier_; + read_options.verify_checksums = verify_checksums_; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr uint64_t* bytes_read = nullptr; + + const Status s = version_->GetBlob(read_options, user_key, blob_index, + prefetch_buffer, &blob_value_, bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + is_blob_ = true; + return true; +} + // PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge @@ -216,19 +264,28 @@ is_key_seqnum_zero_ = false; return false; } + Slice user_key_without_ts = + StripTimestampFromUserKey(ikey_.user_key, timestamp_size_); is_key_seqnum_zero_ = (ikey_.sequence == 0); - assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || - user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); - if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && - user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { + assert(iterate_upper_bound_ == 
nullptr || + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound || + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) < 0); + if (iterate_upper_bound_ != nullptr && + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound && + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { break; } assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) != + 0) { assert(prefix_same_as_start_); break; } @@ -237,24 +294,37 @@ return false; } - if (IsVisible(ikey_.sequence)) { + assert(ikey_.user_key.size() >= timestamp_size_); + Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_) + : Slice(); + bool more_recent = false; + if (IsVisible(ikey_.sequence, ts, &more_recent)) { // If the previous entry is of seqnum 0, the current entry will not // possibly be skipped. This condition can potentially be relaxed to // prev_key.seq <= ikey_.sequence. We are cautious because it will be more // prone to bugs causing the same user key with the same sequence number. - if (!is_prev_key_seqnum_zero && skipping_saved_key && - user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= - 0) { + // Note that with current timestamp implementation, the same user key can + // have different timestamps and zero sequence number on the bottommost + // level. This may change in the future. 
+ if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) && + skipping_saved_key && + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { assert(!skipping_saved_key || - user_comparator_.Compare(ikey_.user_key, - saved_key_.GetUserKey()) > 0); + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0); + if (!iter_.PrepareValue()) { + assert(!iter_.status().ok()); + valid_ = false; + return false; + } num_skipped = 0; reseek_done = false; switch (ikey_.type) { case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. @@ -263,7 +333,20 @@ // 2) return ikey only if ikey.seqnum >= start_seqnum_ // note that if deletion seqnum is < start_seqnum_ we // just skip it like in normal iterator. - if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_) { + if (start_seqnum_ > 0) { + if (ikey_.sequence >= start_seqnum_) { + saved_key_.SetInternalKey(ikey_); + valid_ = true; + return true; + } else { + saved_key_.SetUserKey( + ikey_.user_key, + !pin_thru_lifetime_ || + !iter_.iter()->IsKeyPinned() /* copy */); + skipping_saved_key = true; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } + } else if (timestamp_lb_) { saved_key_.SetInternalKey(ikey_); valid_ = true; return true; @@ -278,11 +361,15 @@ case kTypeValue: case kTypeBlobIndex: if (start_seqnum_ > 0) { - // we are taking incremental snapshot here - // incremental snapshots aren't supported on DB with range deletes - assert(ikey_.type != kTypeBlobIndex); if (ikey_.sequence >= start_seqnum_) { saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + valid_ = true; return true; } else { @@ -294,6 +381,17 @@ !iter_.iter()->IsKeyPinned() /* copy */); skipping_saved_key = true; } + } 
else if (timestamp_lb_) { + saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + + valid_ = true; + return true; } else { saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || @@ -306,20 +404,13 @@ num_skipped = 0; reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else if (ikey_.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; - return false; + } else { + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } } - is_blob_ = true; - valid_ = true; - return true; - } else { valid_ = true; return true; } @@ -346,18 +437,23 @@ } break; default: - assert(false); - break; + valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(ikey_.type))); + return false; } } } else { - PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + if (more_recent) { + PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + } - // This key was inserted after our snapshot was taken. - // If this happens too many times in a row for the same user key, we want - // to seek to the target sequence number. - int cmp = - user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()); + // This key was inserted after our snapshot was taken or skipped by + // timestamp range. If this happens too many times in a row for the same + // user key, we want to seek to the target sequence number. 
+ int cmp = user_comparator_.CompareWithoutTimestamp( + ikey_.user_key, saved_key_.GetUserKey()); if (cmp == 0 || (skipping_saved_key && cmp < 0)) { num_skipped++; } else { @@ -388,8 +484,17 @@ // We're looking for the next user-key but all we see are the same // user-key with decreasing sequence numbers. Fast forward to // sequence number 0 and type deletion (the smallest type). - AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), - 0, kTypeDeletion)); + if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); + } else { + const std::string kTsMin(timestamp_size_, '\0'); + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion), + kTsMin); + } // Don't set skipping_saved_key = false because we may still see more // user-keys equal to saved_key_. } else { @@ -398,9 +503,17 @@ // Note that this only covers a case when a higher key was overwritten // many times since our snapshot was taken, not the case when a lot of // different keys were inserted after our snapshot was taken. - AppendInternalKey(&last_key, - ParsedInternalKey(saved_key_.GetUserKey(), sequence_, - kValueTypeForSeek)); + if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } } iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); @@ -417,6 +530,7 @@ // Scan from the newer entries to older entries. 
// PRE: iter_.key() points to the first merge type entry // saved_key_ stores the user key +// iter_.PrepareValue() has been called // POST: saved_value_ has the merged value for the user key // iter_ points to the next entry (or invalid) bool DBIter::MergeValuesNewToOld() { @@ -436,7 +550,6 @@ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); ParsedInternalKey ikey; - Status s; for (iter_.Next(); iter_.Valid(); iter_.Next()) { TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); if (!ParseKey(&ikey)) { @@ -446,23 +559,26 @@ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { // hit the next user key, stop right here break; - } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || + } + if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || range_del_agg_.ShouldDelete( ikey, RangeDelPositioningMode::kForwardTraversal)) { // hit a delete with the same user key, stop right here // iter_ is positioned after delete iter_.Next(); break; - } else if (kTypeValue == ikey.type) { + } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (kTypeValue == ikey.type) { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! const Slice val = iter_.value(); - s = MergeHelper::TimedFullMerge( - merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, &pinned_value_, true); + Status s = Merge(&val, ikey.user_key); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } // iter_ is positioned after put @@ -479,19 +595,37 @@ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (kTypeBlobIndex == ikey.type) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; + } + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, ikey.user_key); + if (!s.ok()) { + return false; + } + is_blob_ = false; + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + return true; + } else { valid_ = false; + status_ = Status::Corruption( + "Unrecognized value type: " + + std::to_string(static_cast(ikey.type))); return false; - } else { - assert(false); } } @@ -504,16 +638,10 @@ // a deletion marker. // feed null as the existing value to the merge operator, such that // client can differentiate this scenario and do things accordingly. - s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(), - nullptr, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - assert(status_.ok()); return true; } @@ -522,7 +650,7 @@ assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); bool ok = true; @@ -557,9 +685,16 @@ // If that's the case, seek iter_ to current key. 
if (!expect_total_order_inner_iter() || !iter_.Valid()) { IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); iter_.Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kForward; @@ -610,6 +745,7 @@ iter_.SeekToLast(); } } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kReverse; @@ -624,7 +760,9 @@ assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(saved_key_.GetUserKey()) + prefix_extractor_ + ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(), + timestamp_size_)) .compare(*prefix) != 0) { assert(prefix_same_as_start_); // Current key does not have the same prefix as start @@ -633,11 +771,13 @@ } assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) >= 0); + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, + *iterate_lower_bound_, /*b_has_ts=*/false) >= 0); if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) < 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { // We've iterated earlier than the user-specified lower bound. 
valid_ = false; return; @@ -682,8 +822,8 @@ assert(iter_.Valid()); merge_context_.Clear(); current_entry_is_merged_ = false; - // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or - // kTypeValue) + // last entry before merge (could be kTypeDeletion, + // kTypeDeletionWithTimestamp, kTypeSingleDeletion or kTypeValue) ValueType last_not_merge_type = kTypeDeletion; ValueType last_key_entry_type = kTypeDeletion; @@ -697,10 +837,20 @@ return false; } - if (!IsVisible(ikey.sequence) || - !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + if (!IsVisible(ikey.sequence, ts) || + !user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { break; } + if (!ts.empty()) { + saved_timestamp_.assign(ts.data(), ts.size()); + } if (TooManyInternalKeysSkipped()) { return false; } @@ -712,6 +862,11 @@ return FindValueForCurrentKeyUsingSeek(); } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + last_key_entry_type = ikey.type; switch (last_key_entry_type) { case kTypeValue: @@ -720,14 +875,22 @@ ikey, RangeDelPositioningMode::kBackwardTraversal)) { last_key_entry_type = kTypeRangeDeletion; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else { - assert(iter_.iter()->IsValuePinned()); + } else if (iter_.iter()->IsValuePinned()) { pinned_value_ = iter_.value(); + } else { + valid_ = false; + status_ = Status::NotSupported( + "Backward iteration not supported if underlying iterator's value " + "cannot be pinned."); } merge_context_.Clear(); last_not_merge_type = last_key_entry_type; + if (!status_.ok()) { + return false; + } break; case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: merge_context_.Clear(); last_not_merge_type = last_key_entry_type; @@ -749,7 +912,11 @@ } break; 
default: - assert(false); + valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(last_key_entry_type))); + return false; } PERF_COUNTER_ADD(internal_key_skipped_count, 1); @@ -763,9 +930,11 @@ } Status s; + s.PermitUncheckedError(); is_blob_ = false; switch (last_key_entry_type) { case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: valid_ = false; @@ -775,47 +944,52 @@ if (last_not_merge_type == kTypeDeletion || last_not_merge_type == kTypeSingleDeletion || last_not_merge_type == kTypeRangeDeletion) { - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(nullptr, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } else if (last_not_merge_type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; } else { assert(last_not_merge_type == kTypeValue); - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &pinned_value_, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(&pinned_value_, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } break; case kTypeValue: // do nothing - we've already has value in pinned_value_ break; case kTypeBlobIndex: - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { return false; } - is_blob_ = true; break; default: - assert(false); - break; + valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(last_key_entry_type))); + return false; } if (!s.ok()) { valid_ = false; @@ -835,8 +1009,17 @@ // FindValueForCurrentKeyUsingSeek() assert(pinned_iters_mgr_.PinningEnabled()); std::string last_key; - AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), - sequence_, kValueTypeForSeek)); + if (0 == timestamp_size_) { + AppendInternalKey(&last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); @@ -853,7 +1036,15 @@ if (!ParseKey(&ikey)) { return false; } - if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { // No visible values for this key, even though FindValueForCurrentKey() // has seen some. This is possible if we're using a tailing iterator, and // the entries were discarded in a compaction. 
@@ -861,7 +1052,7 @@ return true; } - if (IsVisible(ikey.sequence)) { + if (IsVisible(ikey.sequence, ts)) { break; } @@ -870,22 +1061,28 @@ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || range_del_agg_.ShouldDelete( - ikey, RangeDelPositioningMode::kBackwardTraversal)) { + ikey, RangeDelPositioningMode::kBackwardTraversal) || + kTypeDeletionWithTimestamp == ikey.type) { valid_ = false; return true; } - if (ikey.type == kTypeBlobIndex && !allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + if (!iter_.PrepareValue()) { valid_ = false; return false; } + if (timestamp_size_ > 0) { + Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_); + saved_timestamp_.assign(ts.data(), ts.size()); + } if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_.iter()->IsValuePinned()); pinned_value_ = iter_.value(); - is_blob_ = (ikey.type == kTypeBlobIndex); + if (ikey.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { + return false; + } + } + valid_ = true; return true; } @@ -913,52 +1110,56 @@ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { break; } - if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || range_del_agg_.ShouldDelete( ikey, RangeDelPositioningMode::kForwardTraversal)) { break; - } else if (ikey.type == kTypeValue) { + } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (ikey.type == kTypeValue) { const Slice val = iter_.value(); - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &val, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + Status s = Merge(&val, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - valid_ = 
true; return true; } else if (ikey.type == kTypeMerge) { merge_context_.PushOperand( iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (ikey.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; + } + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; + } else { valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(ikey.type))); return false; - } else { - assert(false); } } - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } @@ -981,6 +1182,19 @@ return true; } +Status DBIter::Merge(const Slice* val, const Slice& user_key) { + Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key, val, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, clock_, &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return s; + } + valid_ = true; + return s; +} + // Move backwards until the key smaller than saved_key_. // Changes valid_ only if return value is false. 
bool DBIter::FindUserKeyBeforeSavedKey() { @@ -992,7 +1206,8 @@ return false; } - if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) { + if (user_comparator_.CompareWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey()) < 0) { return true; } @@ -1001,7 +1216,13 @@ } assert(ikey.sequence != kMaxSequenceNumber); - if (!IsVisible(ikey.sequence)) { + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + if (!IsVisible(ikey.sequence, ts)) { PERF_COUNTER_ADD(internal_recent_skipped_count, 1); } else { PERF_COUNTER_ADD(internal_key_skipped_count, 1); @@ -1010,8 +1231,14 @@ if (num_skipped >= max_skip_) { num_skipped = 0; IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); // It would be more efficient to use SeekForPrev() here, but some // iterators may not support it. iter_.Seek(last_key.GetInternalKey()); @@ -1046,26 +1273,40 @@ return false; } -bool DBIter::IsVisible(SequenceNumber sequence) { - if (read_callback_ == nullptr) { - return sequence <= sequence_; - } else { - return read_callback_->IsVisible(sequence); +bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts, + bool* more_recent) { + // Remember that comparator orders preceding timestamp as larger. + // TODO(yanqin): support timestamp in read_callback_. + bool visible_by_seq = (read_callback_ == nullptr) + ? 
sequence <= sequence_ + : read_callback_->IsVisible(sequence); + + bool visible_by_ts = + (timestamp_ub_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) && + (timestamp_lb_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0); + + if (more_recent) { + *more_recent = !visible_by_seq; } + return visible_by_seq && visible_by_ts; } void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { is_key_seqnum_zero_ = false; SequenceNumber seq = sequence_; saved_key_.Clear(); - saved_key_.SetInternalKey(target, seq); + saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_); if (iterate_lower_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < - 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { // Seek key is smaller than the lower bound. saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_lower_bound_, seq); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek, + timestamp_ub_); } } @@ -1074,23 +1315,50 @@ saved_key_.Clear(); // now saved_key is used to store internal key. 
saved_key_.SetInternalKey(target, 0 /* sequence_number */, - kValueTypeForSeekForPrev); + kValueTypeForSeekForPrev, timestamp_ub_); + + if (timestamp_size_ > 0) { + const std::string kTsMin(timestamp_size_, '\0'); + Slice ts = kTsMin; + saved_key_.UpdateInternalKey(/*seq=*/0, kValueTypeForSeekForPrev, &ts); + } if (iterate_upper_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_upper_bound_) >= 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber, + kValueTypeForSeekForPrev, timestamp_ub_); + if (timestamp_size_ > 0) { + const std::string kTsMax(timestamp_size_, '\xff'); + Slice ts = kTsMax; + saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev, + &ts); + } } } void DBIter::Seek(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeek(cfd_->GetID(), target); + // TODO: What do we do if this returns an error? + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -1118,7 +1386,7 @@ // we need to find out the next key that is visible to the user. 
ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1146,12 +1414,27 @@ } void DBIter::SeekForPrev(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + // TODO: What do we do if this returns an error? + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_ + ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound, + upper_bound) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -1178,7 +1461,7 @@ // backward direction. ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1205,7 +1488,7 @@ Seek(*iterate_lower_bound_); return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. 
if (!expect_total_order_inner_iter()) { @@ -1243,7 +1526,8 @@ } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } @@ -1251,14 +1535,16 @@ if (iterate_upper_bound_ != nullptr) { // Seek to last key strictly less than ReadOptions.iterate_upper_bound. SeekForPrev(*iterate_upper_bound_); - if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { + if (Valid() && 0 == user_comparator_.CompareWithoutTimestamp( + *iterate_upper_bound_, /*a_has_ts=*/false, key(), + /*b_has_ts=*/false)) { ReleaseTempPinnedData(); PrevInternal(nullptr); } return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. if (!expect_total_order_inner_iter()) { @@ -1287,23 +1573,25 @@ } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, - InternalIterator* internal_iter, + InternalIterator* internal_iter, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) { - DBIter* db_iter = new DBIter( - env, read_options, cf_options, mutable_cf_options, user_key_comparator, - internal_iter, sequence, false, max_sequential_skip_in_iterations, - 
read_callback, db_impl, cfd, allow_blob); + ColumnFamilyData* cfd, bool expose_blob_index) { + DBIter* db_iter = + new DBIter(env, read_options, ioptions, mutable_cf_options, + user_key_comparator, internal_iter, version, sequence, false, + max_sequential_skip_in_iterations, read_callback, db_impl, cfd, + expose_blob_index); return db_iter; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,10 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include +#include #include + #include "db/db_impl/db_impl.h" -#include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" #include "options/cf_options.h" @@ -21,6 +21,7 @@ #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +class Version; // This file declares the factory functions of DBIter, in its original form // or a wrapped form with class ArenaWrappedDBIter, which is defined here. @@ -66,7 +67,7 @@ // this->key(). // (2) When moving backwards, the internal iterator is positioned // just before all entries whose user key == this->key(). 
- enum Direction { kForward, kReverse }; + enum Direction : uint8_t { kForward, kReverse }; // LocalStatistics contain Statistics counters that will be aggregated per // each iterator instance and then will be sent to the global statistics when @@ -112,12 +113,12 @@ }; DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, + InternalIterator* iter, const Version* version, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob); + bool expose_blob_index); // No copying allowed DBIter(const DBIter&) = delete; @@ -140,18 +141,29 @@ } ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; } - bool Valid() const override { return valid_; } + bool Valid() const override { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (valid_) { + status_.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return valid_; + } Slice key() const override { assert(valid_); - if (start_seqnum_ > 0) { + if (start_seqnum_ > 0 || timestamp_lb_) { return saved_key_.GetInternalKey(); } else { - return saved_key_.GetUserKey(); + const Slice ukey_and_ts = saved_key_.GetUserKey(); + return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_); } } Slice value() const override { assert(valid_); - if (current_entry_is_merged_) { + + if (!expose_blob_index_ && is_blob_) { + return blob_value_; + } else if (current_entry_is_merged_) { // If pinned_value_ is set then the result of merge operator is one of // the merge operands and we should return it. return pinned_value_.data() ? 
pinned_value_ : saved_value_; @@ -169,8 +181,18 @@ return status_; } } + Slice timestamp() const override { + assert(valid_); + assert(timestamp_size_ > 0); + if (direction_ == kReverse) { + return saved_timestamp_; + } + const Slice ukey_and_ts = saved_key_.GetUserKey(); + assert(timestamp_size_ < ukey_and_ts.size()); + return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_); + } bool IsBlob() const { - assert(valid_ && (allow_blob_ || !is_blob_)); + assert(valid_); return is_blob_; } @@ -178,6 +200,8 @@ void Next() final override; void Prev() final override; + // 'target' does not contain timestamp, even if user timestamp feature is + // enabled. void Seek(const Slice& target) final override; void SeekForPrev(const Slice& target) final override; void SeekToFirst() final override; @@ -210,7 +234,7 @@ // If `skipping_saved_key` is true, the function will keep iterating until it // finds a user key that is larger than `saved_key_`. // If `prefix` is not null, the iterator needs to stop when all keys for the - // prefix are exhausted and the interator is set to invalid. + // prefix are exhausted and the iterator is set to invalid. bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); // Internal implementation of FindNextUserEntry(). bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); @@ -221,7 +245,8 @@ // entry can be found within the prefix. void PrevInternal(const Slice* prefix); bool TooManyInternalKeysSkipped(bool increment = true); - bool IsVisible(SequenceNumber sequence); + bool IsVisible(SequenceNumber sequence, const Slice& ts, + bool* more_recent = nullptr); // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() // is called @@ -260,12 +285,29 @@ return expect_total_order_inner_iter_; } + // If lower bound of timestamp is given by ReadOptions.iter_start_ts, we need + // to return versions of the same key. 
We cannot just skip if the key value + // is the same but timestamps are different but fall in timestamp range. + inline int CompareKeyForSkip(const Slice& a, const Slice& b) { + return timestamp_lb_ != nullptr + ? user_comparator_.Compare(a, b) + : user_comparator_.CompareWithoutTimestamp(a, b); + } + + // Retrieves the blob value for the specified user key using the given blob + // index when using the integrated BlobDB implementation. + bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index); + + Status Merge(const Slice* val, const Slice& user_key); + const SliceTransform* prefix_extractor_; Env* const env_; + SystemClock* clock_; Logger* logger_; UserComparatorWrapper user_comparator_; const MergeOperator* const merge_operator_; IteratorWrapper iter_; + const Version* version_; ReadCallback* read_callback_; // Max visible sequence number. It is normally the snapshot seq unless we have // uncommitted data in db as in WriteUnCommitted. @@ -279,6 +321,7 @@ std::string saved_value_; Slice pinned_value_; // for prefix seek mode to support prev() + PinnableSlice blob_value_; Statistics* statistics_; uint64_t max_skip_; uint64_t max_skippable_internal_keys_; @@ -308,7 +351,11 @@ // Expect the inner iterator to maintain a total order. // prefix_extractor_ must be non-NULL if the value is false. const bool expect_total_order_inner_iter_; - bool allow_blob_; + ReadTier read_tier_; + bool verify_checksums_; + // Whether the iterator is allowed to expose blob references. Set to true when + // the stacked BlobDB implementation is used, false otherwise. + bool expose_blob_index_; bool is_blob_; bool arena_mode_; // List of operands for merge operator. 
@@ -327,18 +374,22 @@ // for diff snapshots we want the lower bound on the seqnum; // if this value > 0 iterator will return internal keys SequenceNumber start_seqnum_; + const Slice* const timestamp_ub_; + const Slice* const timestamp_lb_; + const size_t timestamp_size_; + std::string saved_timestamp_; }; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. extern Iterator* NewDBIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, InternalIterator* internal_iter, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false); + const Version* version, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -97,7 +97,8 @@ bool MaybeFail() { if (rnd->Next() >= - std::numeric_limits::max() * error_probability) { + static_cast(std::numeric_limits::max()) * + error_probability) { return false; } if (rnd->Next() % 2) { @@ -114,7 +115,8 @@ void MaybeMutate() { if (rnd->Next() >= - std::numeric_limits::max() * mutation_probability) { + static_cast(std::numeric_limits::max()) * + 
mutation_probability) { return; } do { @@ -126,8 +128,9 @@ if (data->hidden.empty()) { hide_probability = 1; } - bool do_hide = - rnd->Next() < std::numeric_limits::max() * hide_probability; + bool do_hide = rnd->Next() < + static_cast(std::numeric_limits::max()) * + hide_probability; if (do_hide) { // Hide a random entry. size_t idx = rnd->Next() % data->entries.size(); @@ -508,9 +511,9 @@ target_hidden_fraction; internal_iter->trace = trace; db_iter.reset(NewDBIterator( - env_, ropt, ImmutableCFOptions(options), + env_, ropt, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), - internal_iter, sequence, + internal_iter, nullptr /* version */, sequence, options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -99,9 +99,11 @@ } for (auto it = data_.begin(); it != data_.end(); ++it) { ParsedInternalKey ikey; - bool ok __attribute__((__unused__)) = ParseInternalKey(it->first, &ikey); - assert(ok); - if (ikey.user_key != _key) { + Status pik_status = + ParseInternalKey(it->first, &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + if (!pik_status.ok() || ikey.user_key != _key) { continue; } if (valid_ && data_.begin() + iter_ > it) { @@ -235,7 +237,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -250,9 +252,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, 
cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -283,9 +286,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -310,9 +314,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -343,9 +348,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -379,12 +385,14 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test case to check SeekToLast with iterate_upper_bound set // (same key put may times - SeekToLast should start with the @@ -409,9 +417,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -447,9 +456,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -473,12 +483,14 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* 
sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test to check the SeekToLast() with the iterate_upper_bound set // (Deletion cases) @@ -496,9 +508,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -532,9 +545,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -562,9 +576,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -583,6 +598,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -605,9 +621,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, 
cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -637,9 +654,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -659,7 +677,7 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -668,11 +686,13 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -680,11 +700,13 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, 
options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -703,9 +725,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -726,6 +749,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u); } @@ -733,7 +757,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -748,9 +772,10 @@ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ 
-769,6 +794,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -784,9 +810,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -800,6 +827,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -813,9 +841,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 202, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 202 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -834,6 +863,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -846,14 +876,17 @@ internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TestIterator* 
internal_iter = new TestIterator(BytewiseComparator()); @@ -863,9 +896,10 @@ internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 200, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 200 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -873,6 +907,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -881,6 +916,7 @@ db_iter->Next(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -898,9 +934,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -919,6 +956,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -933,9 +971,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); @@ -958,13 +997,14 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } } TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -983,9 +1023,10 @@ ro.max_skippable_internal_keys = 0; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1013,7 +1054,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); - ASSERT_TRUE(db_iter->status().ok()); + ASSERT_OK(db_iter->status()); } // Test to make sure that the request will *not* fail as incomplete if @@ -1030,9 +1071,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1075,9 +1117,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, 
nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1114,9 +1157,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1150,9 +1194,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1181,9 +1226,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1219,9 +1265,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, 
mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1257,9 +1304,10 @@ ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1311,9 +1359,10 @@ options.max_sequential_skip_in_iterations = 1000; ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1350,9 +1399,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1378,9 +1428,10 @@ internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 0 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1403,9 +1454,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1428,9 +1480,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1447,7 +1500,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions 
mutable_cf_options = MutableCFOptions(options); { @@ -1462,9 +1515,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1485,9 +1539,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1508,9 +1563,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1531,9 +1587,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + 
internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1554,9 +1611,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1577,9 +1635,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1600,9 +1659,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1621,9 +1681,10 @@ internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, 
cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1637,7 +1698,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1652,9 +1713,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1675,9 +1737,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1698,9 +1761,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, 
BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1721,9 +1785,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1740,9 +1805,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1763,9 +1829,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), 
"a"); @@ -1786,9 +1853,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1802,7 +1870,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1829,9 +1897,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1864,9 +1933,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1905,9 +1975,10 @@ internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1946,9 +2017,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1992,9 +2064,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2039,9 +2112,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2080,9 +2154,10 @@ internal_iter->Finish(); 
std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 9, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 9 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2127,9 +2202,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 13, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 13 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2175,9 +2251,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 14, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 14 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2206,9 +2283,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2237,9 +2315,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2304,9 +2383,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -2344,9 +2424,9 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0 /* force seek */, - nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2373,9 +2453,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, 
ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2400,8 +2481,9 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -2437,8 +2519,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, 3 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), key); @@ -2465,8 +2549,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, 1 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2492,19 +2578,21 @@ internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; std::string user_keys[4] = {"1","2","3","4"}; std::string values[4] = {"1c", "2c", "3c", "4b"}; int i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { - FullKey fkey; - ParseFullKey(db_iter->key(), &fkey); + ParsedInternalKey fkey; + ASSERT_OK( + ParseInternalKey(db_iter->key(), &fkey, true /* log_err_key */)); ASSERT_EQ(user_keys[i], fkey.user_key.ToString()); - ASSERT_EQ(EntryType::kEntryPut, fkey.type); + ASSERT_EQ(kTypeValue, fkey.type); ASSERT_EQ(seqnums[i], fkey.sequence); ASSERT_EQ(values[i], db_iter->value().ToString()); i++; @@ -2527,19 +2615,21 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; - EntryType key_types[4] = {EntryType::kEntryDelete,EntryType::kEntryDelete, - EntryType::kEntryDelete,EntryType::kEntryPut}; + ValueType key_types[4] = {kTypeDeletion, kTypeDeletion, kTypeDeletion, + kTypeValue}; std::string user_keys[4] = {"1","2","3","4"}; std::string values[4] = {"", "", "", "4b"}; int i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); 
db_iter->Next()) { - FullKey fkey; - ParseFullKey(db_iter->key(), &fkey); + ParsedInternalKey fkey; + ASSERT_OK( + ParseInternalKey(db_iter->key(), &fkey, true /* log_err_key */)); ASSERT_EQ(user_keys[i], fkey.user_key.ToString()); ASSERT_EQ(key_types[i], fkey.type); ASSERT_EQ(seqnums[i], fkey.sequence); @@ -2577,10 +2667,10 @@ NewMergingIterator(&icomp_, &child_iters[0], 2u); db_iter_.reset(NewDBIterator( - env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_), - BytewiseComparator(), merge_iter, + env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_), + BytewiseComparator(), merge_iter, nullptr /* version */, 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */, nullptr /*read_callback*/)); + 3 /* max iterators before reseek */, nullptr /* read_callback */)); } Env* env_; @@ -3017,9 +3107,10 @@ ro.prefix_same_as_start = true; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); int skipped_keys = 0; @@ -3053,15 +3144,16 @@ ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); db_iter->SeekToFirst(); if (i == kNumKeys + 1) { // lower bound was beyond the last key ASSERT_FALSE(db_iter->Valid()); + 
ASSERT_OK(db_iter->status()); } else { ASSERT_TRUE(db_iter->Valid()); int expected; @@ -3092,9 +3184,10 @@ ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); for (int i = kNumKeys; i >= kLowerBound; --i) { @@ -3120,9 +3213,10 @@ ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); auto before_lower_bound_str = std::to_string(kLowerBound - 1); Slice before_lower_bound(lower_bound_str); @@ -3145,9 +3239,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ReadOptions(), ImmutableCFOptions(options), - MutableCFOptions(options), BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekForPrev("a"); ASSERT_TRUE(db_iter->Valid()); diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iterator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iterator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,8 @@ #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" #include "table/block_based/flush_block_policy.h" +#include "util/random.h" +#include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { @@ -33,14 +35,15 @@ class DBIteratorTest : public DBTestBase, public testing::WithParamInterface { public: - DBIteratorTest() : DBTestBase("/db_iterator_test") {} + DBIteratorTest() : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {} Iterator* NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family = nullptr) { if (column_family == nullptr) { column_family = db_->DefaultColumnFamily(); } - auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* cfd = + static_cast_with_check(column_family)->cfd(); SequenceNumber seq = read_options.snapshot != nullptr ? read_options.snapshot->GetSequenceNumber() : db_->GetLatestSequenceNumber(); @@ -65,8 +68,8 @@ // The test needs to be changed if kPersistedTier is supported in iterator. 
Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "1", "2"); - Delete(1, "2"); + ASSERT_OK(Put(1, "1", "2")); + ASSERT_OK(Delete(1, "2")); ReadOptions ropt; ropt.pin_data = false; { @@ -170,10 +173,10 @@ TEST_P(DBIteratorTest, IterSeekBeforePrev) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->Seek(Slice("c")); @@ -193,11 +196,11 @@ options.compression = kNoCompression; Reopen(options); - ASSERT_OK(Put("a", RandomString(&rnd, 400))); - ASSERT_OK(Put("aabb", RandomString(&rnd, 400))); - ASSERT_OK(Put("aaef", RandomString(&rnd, 400))); - ASSERT_OK(Put("b", RandomString(&rnd, 400))); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", rnd.RandomString(400))); + ASSERT_OK(Put("aabb", rnd.RandomString(400))); + ASSERT_OK(Put("aaef", rnd.RandomString(400))); + ASSERT_OK(Put("b", rnd.RandomString(400))); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ReadOptions opts; Slice ub = Slice("aa"); opts.iterate_upper_bound = &ub; @@ -213,10 +216,10 @@ TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->SeekForPrev(Slice("0")); @@ -236,7 +239,7 @@ ASSERT_OK(Put(MakeLongKey(20, 0), "0")); ASSERT_OK(Put(MakeLongKey(32, 2), "2")); ASSERT_OK(Put("a", "b")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put(MakeLongKey(50, 1), "1")); ASSERT_OK(Put(MakeLongKey(127, 3), "3")); 
ASSERT_OK(Put(MakeLongKey(64, 4), "4")); @@ -274,7 +277,7 @@ TEST_P(DBIteratorTest, IterNextWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -300,7 +303,7 @@ TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -331,7 +334,7 @@ TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("e", "f")); @@ -375,6 +378,8 @@ iter->SeekForPrev("foo"); ASSERT_EQ(IterStatus(iter), "(invalid)"); + ASSERT_OK(iter->status()); + delete iter; } while (ChangeCompactOptions()); } @@ -615,6 +620,40 @@ delete iter; } +TEST_F(DBIteratorTest, ReseekUponDirectionChange) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.merge_operator.reset( + new StringAppendTESTOperator(/*delim_char=*/' ')); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToLast(); + it->Prev(); + it->Next(); + } + ASSERT_EQ(1, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + const std::string merge_key("good"); + ASSERT_OK(Put(merge_key, "orig")); + ASSERT_OK(Merge(merge_key, "suffix")); + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek(merge_key); + ASSERT_TRUE(it->Valid()); + const uint64_t prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Prev(); + 
ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION)); + } +} + TEST_P(DBIteratorTest, IterSmallAndLargeMix) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -781,18 +820,18 @@ TEST_P(DBIteratorTest, IteratorPinsRef) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "hello"); + ASSERT_OK(Put(1, "foo", "hello")); // Get iterator that will yield the current contents of the DB. Iterator* iter = NewIterator(ReadOptions(), handles_[1]); // Write to force compactions - Put(1, "foo", "newvalue1"); + ASSERT_OK(Put(1, "foo", "newvalue1")); for (int i = 0; i < 100; i++) { // 100K values ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); } - Put(1, "foo", "newvalue2"); + ASSERT_OK(Put(1, "foo", "newvalue2")); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -807,8 +846,8 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "delete-cf-then-delete-iter"); - Put(1, "hello", "value2"); + ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter")); + ASSERT_OK(Put(1, "hello", "value2")); ColumnFamilyHandle* cf = handles_[1]; ReadOptions ro; @@ -818,7 +857,7 @@ ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter"); // delete CF handle - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is deleted @@ -830,7 +869,7 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "drop-cf-then-delete-iter"); + ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter")); ReadOptions ro; ColumnFamilyHandle* cf = handles_[1]; @@ -840,8 +879,8 @@ ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter"); // drop and delete CF - db_->DropColumnFamily(cf); - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DropColumnFamily(cf)); + 
EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is dropped @@ -1167,32 +1206,62 @@ ropt.tailing = tailing; std::unique_ptr iter(NewIterator(ropt)); + ropt.read_tier = ReadTier::kBlockCacheTier; + std::unique_ptr nonblocking_iter(NewIterator(ropt)); + iter->Seek("b10"); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("b2", iter->key().ToString()); EXPECT_EQ("y2", iter->value().ToString()); EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + // The cache-only iterator should succeed too, using the blocks pulled into + // the cache by the previous iterator. + nonblocking_iter->Seek("b10"); + ASSERT_TRUE(nonblocking_iter->Valid()); + EXPECT_EQ("b2", nonblocking_iter->key().ToString()); + EXPECT_EQ("y2", nonblocking_iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // ... but it shouldn't be able to step forward since the next block is + // not in cache yet. + nonblocking_iter->Next(); + ASSERT_FALSE(nonblocking_iter->Valid()); + ASSERT_TRUE(nonblocking_iter->status().IsIncomplete()); + + // ... nor should a seek to the next key succeed. + nonblocking_iter->Seek("b20"); + ASSERT_FALSE(nonblocking_iter->Valid()); + ASSERT_TRUE(nonblocking_iter->status().IsIncomplete()); + iter->Next(); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("b3", iter->key().ToString()); EXPECT_EQ("y3", iter->value().ToString()); - EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); - EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // After the blocking iterator loaded the next block, the nonblocking + // iterator's seek should succeed. 
+ nonblocking_iter->Seek("b20"); + ASSERT_TRUE(nonblocking_iter->Valid()); + EXPECT_EQ("b3", nonblocking_iter->key().ToString()); + EXPECT_EQ("y3", nonblocking_iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); iter->Seek("c0"); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("c0", iter->key().ToString()); EXPECT_EQ("z1,z2", iter->value().ToString()); - EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(6, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); iter->Next(); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("c3", iter->key().ToString()); EXPECT_EQ("z3", iter->value().ToString()); - EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); iter.reset(); @@ -1207,13 +1276,13 @@ ASSERT_TRUE(iter->Valid()); EXPECT_EQ("b2", iter->key().ToString()); EXPECT_EQ("y2", iter->value().ToString()); - EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); iter->Next(); ASSERT_FALSE(iter->Valid()); - EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); } } @@ -1275,9 +1344,9 @@ // write three entries with different keys using Merge() WriteOptions wopts; - db_->Merge(wopts, "1", "data1"); - db_->Merge(wopts, "2", "data2"); - db_->Merge(wopts, "3", "data3"); + ASSERT_OK(db_->Merge(wopts, "1", "data1")); + ASSERT_OK(db_->Merge(wopts, "2", "data2")); + 
ASSERT_OK(db_->Merge(wopts, "3", "data3")); std::unique_ptr it(NewIterator(ReadOptions())); @@ -1329,7 +1398,7 @@ std::vector generated_keys(key_pool); for (int i = 0; i < key_pool; i++) { - generated_keys[i] = RandomString(&rnd, key_size); + generated_keys[i] = rnd.RandomString(key_size); } std::map true_data; @@ -1337,7 +1406,7 @@ std::vector deleted_keys; for (int i = 0; i < puts; i++) { auto& k = generated_keys[rnd.Next() % key_pool]; - auto v = RandomString(&rnd, val_size); + auto v = rnd.RandomString(val_size); // Insert data to true_data map and to DB true_data[k] = v; @@ -1361,7 +1430,7 @@ if (run_config == TestConfig::FLUSH_EVERY_1000) { if (i && i % 1000 == 0) { - Flush(); + ASSERT_OK(Flush()); } } } @@ -1370,7 +1439,7 @@ Close(); Reopen(options); } else if (run_config == TestConfig::COMPACT_BEFORE_READ) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } ReadOptions ro; @@ -1467,9 +1536,11 @@ } }; +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) { PinnedDataIteratorRandomized(TestConfig::NORMAL); } +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) { PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN); @@ -1484,6 +1555,10 @@ PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000); } +INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance, + DBIteratorTestForPinnedData, + testing::Values(true, false)); + #ifndef ROCKSDB_LITE TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) { Options options = CurrentOptions(); @@ -1500,7 +1575,7 @@ Random rnd(301); for (int i = 1; i <= 1000; i++) { std::string k = Key(i * 3); - std::string v = RandomString(&rnd, 100); + std::string v = rnd.RandomString(100); ASSERT_OK(Put(k, v)); true_data[k] = v; if (i % 250 == 0) { 
@@ -1514,7 +1589,7 @@ // Generate 4 sst files in L0 for (int i = 1; i <= 1000; i++) { std::string k = Key(i * 2); - std::string v = RandomString(&rnd, 100); + std::string v = rnd.RandomString(100); ASSERT_OK(Put(k, v)); true_data[k] = v; if (i % 250 == 0) { @@ -1526,7 +1601,7 @@ // Add some keys/values in memtables for (int i = 1; i <= 1000; i++) { std::string k = Key(i); - std::string v = RandomString(&rnd, 100); + std::string v = rnd.RandomString(100); ASSERT_OK(Put(k, v)); true_data[k] = v; } @@ -1628,8 +1703,8 @@ std::map true_data; for (int i = 0; i < 1000; i++) { - std::string k = RandomString(&rnd, 10); - std::string v = RandomString(&rnd, 1000); + std::string k = rnd.RandomString(10); + std::string v = rnd.RandomString(1000); ASSERT_OK(Put(k, v)); true_data[k] = v; } @@ -1643,7 +1718,7 @@ if (rnd.OneIn(2)) { ASSERT_OK(Delete(kv.first)); } else { - std::string new_val = RandomString(&rnd, 1000); + std::string new_val = rnd.RandomString(1000); ASSERT_OK(Put(kv.first, new_val)); } } @@ -1736,6 +1811,7 @@ Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1791,6 +1867,7 @@ Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1873,7 +1950,7 @@ DestroyAndReopen(options); const int kNumKeys = 500; - // Small number of merge operands to make sure that DBIter::Prev() dont + // Small number of merge operands to make sure that DBIter::Prev() don't // fall back to Seek() const int kNumMergeOperands = 3; // Use value size that will make sure that every block contain 1 key @@ -1900,7 +1977,7 @@ for (int i = 0; i < kNumKeys; i++) { gen_key = Key(i); - gen_val = RandomString(&rnd, kValSize); + gen_val = rnd.RandomString(kValSize); ASSERT_OK(Put(gen_key, gen_val)); true_data[gen_key] = gen_val; @@ -1908,7 +1985,7 @@ ASSERT_OK(Flush()); // Separate values and merge operands in different file so that we - 
// make sure that we dont merge them while flushing but actually + // make sure that we don't merge them while flushing but actually // merge them in the read path for (int i = 0; i < kNumKeys; i++) { if (rnd.PercentTrue(kNoMergeOpPercentage)) { @@ -1918,7 +1995,7 @@ for (int j = 0; j < kNumMergeOperands; j++) { gen_key = Key(i); - gen_val = RandomString(&rnd, kValSize); + gen_val = rnd.RandomString(kValSize); ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val)); true_data[gen_key] += "," + gen_val; @@ -2018,7 +2095,7 @@ Random rnd(301); for (int i = 0; i < 1000; i++) { // Key 10 bytes / Value 10 bytes - ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } std::atomic total_next(0); @@ -2114,24 +2191,24 @@ BlockBasedTableOptions table_options; table_options.block_size = 1024; table_options.no_block_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string value(1024, 'a'); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(2); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(1); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); #ifndef ROCKSDB_LITE @@ -2238,6 +2315,7 @@ ASSERT_OK(Put("x", "y")); std::unique_ptr iter(NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key().compare(Slice("x")), 0); @@ -2252,7 +2330,8 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2263,7 +2342,7 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - 
dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("m", "n")); @@ -2276,7 +2355,8 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2299,6 +2379,7 @@ ReadOptions options; options.snapshot = snapshot; Iterator* iter = NewIterator(options); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2314,8 +2395,8 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - Status s; - s = iter->Refresh(); + ASSERT_OK(iter->status()); + Status s = iter->Refresh(); ASSERT_TRUE(s.IsNotSupported()); db_->ReleaseSnapshot(snapshot); delete iter; @@ -2373,14 +2454,14 @@ TEST_P(DBIteratorTest, TableFilter) { ASSERT_OK(Put("a", "1")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("b", "2")); ASSERT_OK(Put("c", "3")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("d", "4")); ASSERT_OK(Put("e", "5")); ASSERT_OK(Put("f", "6")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); // Ensure the table_filter callback is called once for each table. { @@ -2565,13 +2646,13 @@ ReadOptions ropts; ropts.max_skippable_internal_keys = 2; - Put("1", "val_1"); + ASSERT_OK(Put("1", "val_1")); // Add more tombstones than max_skippable_internal_keys so that Next() fails. - Delete("2"); - Delete("3"); - Delete("4"); - Delete("5"); - Put("6", "val_6"); + ASSERT_OK(Delete("2")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Delete("4")); + ASSERT_OK(Delete("5")); + ASSERT_OK(Put("6", "val_6")); std::unique_ptr iter(NewIterator(ropts)); iter->SeekToFirst(); @@ -2613,9 +2694,9 @@ DestroyAndReopen(options); // Two records in sst file, each in its own block. 
- Put("b", ""); - Put("d", ""); - Flush(); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Put("d", "")); + ASSERT_OK(Flush()); // Create a nonblocking iterator before writing to memtable. ReadOptions ropt; @@ -2625,7 +2706,7 @@ // Overwrite a key in memtable many times to hit // max_sequential_skip_in_iterations (which is 8 by default). for (int i = 0; i < 20; ++i) { - Put("c", ""); + ASSERT_OK(Put("c", "")); } // Load the second block in sst file into the block cache. @@ -2642,9 +2723,9 @@ } TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { - Put("a", ""); - Put("b", ""); - Flush(); + ASSERT_OK(Put("a", "")); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Flush()); ReadOptions ropt; Slice ub = "b"; @@ -2674,7 +2755,7 @@ Reopen(options); Random rnd(301); - std::string random_str = RandomString(&rnd, 180); + std::string random_str = rnd.RandomString(180); ASSERT_OK(Put("1", random_str)); ASSERT_OK(Put("2", random_str)); @@ -2851,6 +2932,127 @@ ASSERT_OK(iter->status()); } +TEST_P(DBIteratorTest, Blob) { + Options options = CurrentOptions(); + options.enable_blob_files = true; + options.max_sequential_skip_in_iterations = 2; + options.statistics = CreateDBStatistics(); + + Reopen(options); + + // Note: we have 4 KVs (3 of which are hidden) for key "b" and + // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a reseek + // anytime we move from "b" to "c" or vice versa. 
+ ASSERT_OK(Put("a", "va")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb3")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Flush()); + + std::unique_ptr iter_guard(NewIterator(ReadOptions())); + Iterator* const iter = iter_guard.get(); + + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, 
NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->SeekForPrev("d"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->Seek("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Seek("z"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->SeekForPrev(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + // Switch from forward to reverse + 
iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8); + ASSERT_EQ(IterStatus(iter), "b->vb3"); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); @@ -2881,7 +3083,7 @@ SequenceNumber seq2 = db_->GetLatestSequenceNumber(); auto* cfd = - reinterpret_cast(db_->DefaultColumnFamily()) + static_cast_with_check(db_->DefaultColumnFamily()) ->cfd(); // The iterator are suppose to see data before seq1. Iterator* iter = @@ -2989,6 +3191,44 @@ delete iter; } +TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = false; + options.env = env_; + DestroyAndReopen(options); + constexpr int kNumKeys = 10; + + // Write kNumKeys to WAL. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + int count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ++count; + } + ASSERT_EQ(kNumKeys, count); + } + + // Reopen and rebuild the memtable from WAL. + options.create_if_missing = false; + options.avoid_flush_during_recovery = true; + options.inplace_update_support = true; + options.allow_concurrent_memtable_write = false; + Reopen(options); + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + // Backward iteration not supported due to inplace_update_support = true. 
+ ASSERT_TRUE(iter->status().IsNotSupported()); + ASSERT_FALSE(iter->Valid()); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,197 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +enum class WriteBatchOpType { + kPut = 0, + kDelete, + kSingleDelete, + kDeleteRange, + kMerge, + kBlobIndex, + kNum, +}; + +// Integer addition is needed for `::testing::Range()` to take the enum type. 
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) { + using T = std::underlying_type::type; + return static_cast(static_cast(lhs) + rhs); +} + +class DbKvChecksumTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DbKvChecksumTest() + : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) { + op_type_ = std::get<0>(GetParam()); + corrupt_byte_addend_ = std::get<1>(GetParam()); + } + + std::pair GetWriteBatch(ColumnFamilyHandle* cf_handle) { + Status s; + WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */, + 8 /* protection_bytes_per_entry */); + switch (op_type_) { + case WriteBatchOpType::kPut: + s = wb.Put(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kDelete: + s = wb.Delete(cf_handle, "key"); + break; + case WriteBatchOpType::kSingleDelete: + s = wb.SingleDelete(cf_handle, "key"); + break; + case WriteBatchOpType::kDeleteRange: + s = wb.DeleteRange(cf_handle, "begin", "end"); + break; + case WriteBatchOpType::kMerge: + s = wb.Merge(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kBlobIndex: + // TODO(ajkr): use public API once available. 
+ uint32_t cf_id; + if (cf_handle == nullptr) { + cf_id = 0; + } else { + cf_id = cf_handle->GetID(); + } + s = WriteBatchInternal::PutBlobIndex(&wb, cf_id, "key", "val"); + break; + case WriteBatchOpType::kNum: + assert(false); + } + return {std::move(wb), std::move(s)}; + } + + void CorruptNextByteCallBack(void* arg) { + Slice encoded = *static_cast(arg); + if (entry_len_ == port::kMaxSizet) { + // We learn the entry size on the first attempt + entry_len_ = encoded.size(); + } + // All entries should be the same size + assert(entry_len_ == encoded.size()); + char* buf = const_cast(encoded.data()); + buf[corrupt_byte_offset_] += corrupt_byte_addend_; + ++corrupt_byte_offset_; + } + + bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; } + + protected: + WriteBatchOpType op_type_; + char corrupt_byte_addend_; + size_t corrupt_byte_offset_ = 0; + size_t entry_len_ = port::kMaxSizet; +}; + +std::string GetTestNameSuffix( + ::testing::TestParamInfo> info) { + std::ostringstream oss; + switch (std::get<0>(info.param)) { + case WriteBatchOpType::kPut: + oss << "Put"; + break; + case WriteBatchOpType::kDelete: + oss << "Delete"; + break; + case WriteBatchOpType::kSingleDelete: + oss << "SingleDelete"; + break; + case WriteBatchOpType::kDeleteRange: + oss << "DeleteRange"; + break; + case WriteBatchOpType::kMerge: + oss << "Merge"; + break; + case WriteBatchOpType::kBlobIndex: + oss << "BlobIndex"; + break; + case WriteBatchOpType::kNum: + assert(false); + } + oss << "Add" + << static_cast(static_cast(std::get<1>(info.param))); + return oss.str(); +} + +INSTANTIATE_TEST_CASE_P( + DbKvChecksumTest, DbKvChecksumTest, + ::testing::Combine(::testing::Range(static_cast(0), + WriteBatchOpType::kNum), + ::testing::Values(2, 103, 251)), + GetTestNameSuffix); + +TEST_P(DbKvChecksumTest, MemTableAddCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_`. 
Each attempt has one byte corrupted in its + // memtable entry by adding `corrupt_byte_addend_` to its original value. The + // test repeats until an attempt has been made on each byte in the encoded + // memtable entry. All attempts are expected to fail with `Status::Corruption` + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. + Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + Reopen(options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = GetWriteBatch(nullptr /* cf_handle */); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_` to a non-default column family. Each attempt has + // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_` + // to its original value. The test repeats until an attempt has been made on + // each byte in the encoded memtable entry. All attempts are expected to fail + // with `Status::Corruption`. 
+ Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = GetWriteBatch(handles_[1]); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_log_iter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_log_iter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_log_iter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_log_iter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,13 +13,15 @@ #if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/stack_trace.h" namespace ROCKSDB_NAMESPACE { class DBTestXactLogIterator : public DBTestBase { public: - DBTestXactLogIterator() : DBTestBase("/db_log_iter_test") {} + DBTestXactLogIterator() + : DBTestBase("db_log_iter_test", /*env_do_fsync=*/true) {} std::unique_ptr OpenTransactionLogIter( const SequenceNumber seq) { @@ -32,9 +34,8 @@ }; namespace { -SequenceNumber ReadRecords( - std::unique_ptr& iter, - int& count) { +SequenceNumber ReadRecords(std::unique_ptr& iter, + int& count, bool expect_ok = true) { count = 0; 
SequenceNumber lastSequence = 0; BatchResult res; @@ -46,6 +47,11 @@ EXPECT_OK(iter->status()); iter->Next(); } + if (expect_ok) { + EXPECT_OK(iter->status()); + } else { + EXPECT_NOK(iter->status()); + } return res.sequence; } @@ -63,9 +69,9 @@ Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Put(0, "key1", DummyString(1024)); - Put(1, "key2", DummyString(1024)); - Put(1, "key2", DummyString(1024)); + ASSERT_OK(Put(0, "key1", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U); { auto iter = OpenTransactionLogIter(0); @@ -74,9 +80,9 @@ ReopenWithColumnFamilies({"default", "pikachu"}, options); env_->SleepForMicroseconds(2 * 1000 * 1000); { - Put(0, "key4", DummyString(1024)); - Put(1, "key5", DummyString(1024)); - Put(0, "key6", DummyString(1024)); + ASSERT_OK(Put(0, "key4", DummyString(1024))); + ASSERT_OK(Put(1, "key5", DummyString(1024))); + ASSERT_OK(Put(0, "key6", DummyString(1024))); } { auto iter = OpenTransactionLogIter(0); @@ -108,15 +114,15 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key2", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key3", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key4", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key2", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key3", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key4", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); { auto iter = 
OpenTransactionLogIter(0); @@ -129,11 +135,11 @@ // condition FlushOptions flush_options; flush_options.wait = false; - dbfull()->Flush(flush_options); + ASSERT_OK(dbfull()->Flush(flush_options)); // "key5" would be written in a new memtable and log - Put("key5", DummyString(1024)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key5", DummyString(1024))); + ASSERT_OK(dbfull()->FlushWAL(false)); { // this iter would miss "key4" if not fixed auto iter = OpenTransactionLogIter(0); @@ -148,14 +154,14 @@ do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); auto iter = OpenTransactionLogIter(0); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); iter->Next(); ASSERT_TRUE(!iter->Valid()); ASSERT_OK(iter->status()); - Put("key2", DummyString(1024)); + ASSERT_OK(Put("key2", DummyString(1024))); iter->Next(); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); @@ -166,9 +172,9 @@ do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - Put("key2", DummyString(1023)); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(Put("key2", DummyString(1023))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); Reopen(options); auto iter = OpenTransactionLogIter(0); ExpectRecords(2, iter); @@ -179,31 +185,38 @@ do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); + for (int i = 0; i < 1024; i++) { - Put("key"+ToString(i), DummyString(10)); + ASSERT_OK(Put("key" + ToString(i), DummyString(10))); } - dbfull()->Flush(FlushOptions()); - dbfull()->FlushWAL(false); + + ASSERT_OK(Flush()); + ASSERT_OK(db_->FlushWAL(false)); + // Corrupt this log to create a gap - ROCKSDB_NAMESPACE::VectorLogPtr wal_files; - ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_OK(db_->DisableFileDeletions()); + + VectorLogPtr wal_files; + ASSERT_OK(db_->GetSortedWalFiles(wal_files)); + 
ASSERT_FALSE(wal_files.empty()); + const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName(); - if (mem_env_) { - mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2); - } else { - ASSERT_EQ(0, truncate(logfile_path.c_str(), - wal_files.front()->SizeFileBytes() / 2)); - } + ASSERT_OK(test::TruncateFile(env_, logfile_path, + wal_files.front()->SizeFileBytes() / 2)); + + ASSERT_OK(db_->EnableFileDeletions()); // Insert a new entry to a new log file - Put("key1025", DummyString(10)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key1025", DummyString(10))); + ASSERT_OK(db_->FlushWAL(false)); + // Try to read from the beginning. Should stop before the gap and read less // than 1025 entries auto iter = OpenTransactionLogIter(0); - int count; - SequenceNumber last_sequence_read = ReadRecords(iter, count); + int count = 0; + SequenceNumber last_sequence_read = ReadRecords(iter, count, false); ASSERT_LT(last_sequence_read, 1025U); + // Try to read past the gap, should be able to seek to key1025 auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); ExpectRecords(1, iter2); @@ -216,15 +229,15 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); - Flush(1); - Flush(0); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - Put(1, "key4", DummyString(1024)); + ASSERT_OK(Put(1, "key4", DummyString(1024))); auto iter = 
OpenTransactionLogIter(3); ExpectRecords(2, iter); } while (ChangeCompactOptions()); @@ -236,13 +249,13 @@ CreateAndReopenWithCF({"pikachu"}, options); { WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.PutLogData(Slice("blob1")); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.PutLogData(Slice("blob2")); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); ReopenWithColumnFamilies({"default", "pikachu"}, options); } @@ -267,7 +280,7 @@ return Status::OK(); } } handler; - res.writeBatchPtr->Iterate(&handler); + ASSERT_OK(res.writeBatchPtr->Iterate(&handler)); ASSERT_EQ( "Put(1, key1, 1024)" "Put(0, key2, 1024)" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,513 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "test_util/testharness.h" + +#ifdef OS_LINUX +#include "env/io_posix.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { +class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper { + public: + EnvWithCustomLogicalBlockSizeCache(Env* env, LogicalBlockSizeCache* cache) + : EnvWrapper(env), cache_(cache) {} + + Status RegisterDbPaths(const std::vector& paths) override { + return cache_->RefAndCacheLogicalBlockSize(paths); + } + + Status UnregisterDbPaths(const std::vector& paths) override { + cache_->UnrefAndTryRemoveCachedLogicalBlockSize(paths); + return Status::OK(); + } + + private: + LogicalBlockSizeCache* cache_; +}; + +class DBLogicalBlockSizeCacheTest : public testing::Test { + public: + DBLogicalBlockSizeCacheTest() + : dbname_(test::PerThreadDBPath("logical_block_size_cache_test")), + data_path_0_(dbname_ + "/data_path_0"), + data_path_1_(dbname_ + "/data_path_1"), + cf_path_0_(dbname_ + "/cf_path_0"), + cf_path_1_(dbname_ + "/cf_path_1") { + auto get_fd_block_size = [&](int fd) { return fd; }; + auto get_dir_block_size = [&](const std::string& /*dir*/, size_t* size) { + *size = 1024; + return Status::OK(); + }; + cache_.reset( + new LogicalBlockSizeCache(get_fd_block_size, get_dir_block_size)); + env_.reset( + new EnvWithCustomLogicalBlockSizeCache(Env::Default(), cache_.get())); + } + + protected: + std::string dbname_; + std::string data_path_0_; + std::string data_path_1_; + std::string cf_path_0_; + std::string cf_path_1_; + std::unique_ptr cache_; + std::unique_ptr env_; +}; + +TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) { + // Tests that Open will cache the logical block size for data paths, + // and Close will remove the cached sizes. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}}; + + for (int i = 0; i < 2; i++) { + DB* db; + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open(options, dbname_, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db)); +#endif + } + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + ASSERT_OK(db->Close()); + ASSERT_EQ(0, cache_->Size()); + delete db; + } + ASSERT_OK(DestroyDB(dbname_, options, {})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) { + // Tests that Open will cache the logical block size for data paths, + // and delete the db pointer will remove the cached sizes. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + for (int i = 0; i < 2; i++) { + DB* db; + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open(options, dbname_, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db)); +#endif + } + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + delete db; + ASSERT_EQ(0, cache_->Size()); + } + ASSERT_OK(DestroyDB(dbname_, options, {})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) { + // Tests that CreateColumnFamily will cache the cf_paths, + // drop the column family handle won't drop the cache, + // drop and then delete the column family handle will drop the cache. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}}; + + DB* db; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + ColumnFamilyHandle* cf = nullptr; + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf)); + ASSERT_EQ(3, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + + // Drop column family does not drop cache. + ASSERT_OK(db->DropColumnFamily(cf)); + ASSERT_EQ(3, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + + // Delete handle will drop cache. + ASSERT_OK(db->DestroyColumnFamilyHandle(cf)); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { + // Tests that CreateColumnFamilies will cache the cf_paths, + // drop the column family handle won't drop the cache, + // drop and then delete the column family handle will drop the cache. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + DB* db; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + std::vector cfs; + ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + // Drop column family does not drop cache. + for (ColumnFamilyHandle* cf : cfs) { + ASSERT_OK(db->DropColumnFamily(cf)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + } + + // Delete one handle will not drop cache because another handle is still + // referencing cf_path_0_. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Delete the last handle will drop cache. 
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(dbname_, options, + {{"cf1", cf_options}, {"cf2", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) { + // Tests that Open two column families with the same cf_path will cache the + // cf_path and have 2 references to the cached size, + // drop the column family handle won't drop the cache, + // drop and then delete the column family handle will drop the cache. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + for (int i = 0; i < 2; i++) { + DB* db; + ColumnFamilyHandle* cf1 = nullptr; + ColumnFamilyHandle* cf2 = nullptr; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1)); + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2)); + ASSERT_OK(db->DestroyColumnFamilyHandle(cf1)); + ASSERT_OK(db->DestroyColumnFamilyHandle(cf2)); + delete db; + ASSERT_EQ(0, cache_->Size()); + + std::vector cfs; + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open(options, dbname_, + {{"cf1", cf_options}, + {"cf2", cf_options}, + {"default", ColumnFamilyOptions()}}, + &cfs, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, + {{"cf1", cf_options}, + {"cf2", cf_options}, + {"default", ColumnFamilyOptions()}}, + &cfs, &db)); +#endif + } + + // Logical block sizes of dbname_ and cf_path_0_ are cached during Open. 
+ ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + // Drop handles won't drop the cache. + ASSERT_OK(db->DropColumnFamily(cfs[0])); + ASSERT_OK(db->DropColumnFamily(cfs[1])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + // Delete 1st handle won't drop the cache for cf_path_0_. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Delete 2nd handle will drop the cache for cf_path_0_. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + // Delete the default handle won't affect the cache because db still refers + // to the default CF. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[2])); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + } + ASSERT_OK(DestroyDB(dbname_, options, + {{"cf1", cf_options}, {"cf2", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) { + // Tests that destroy column family without dropping won't drop the cache, + // because compaction and flush might still need to get logical block size + // when opening new files. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + DB* db; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ColumnFamilyHandle* cf = nullptr; + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Delete handle won't drop cache. + ASSERT_OK(db->DestroyColumnFamilyHandle(cf)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + + // Open with column families. + std::vector cfs; + for (int i = 0; i < 2; i++) { + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open( + options, dbname_, + {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly( + options, dbname_, + {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db)); +#endif + } + // cf_path_0_ and dbname_ are cached. + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Deleting handle won't drop cache. 
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + } + ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { + // Tests the cache behavior when there are multiple DBs sharing the same env + // with different db_paths and cf_paths. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + + DB* db0; + ASSERT_OK(DB::Open(options, data_path_0_, &db0)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + + ColumnFamilyOptions cf_options0; + cf_options0.cf_paths = {{cf_path_0_, 1024}}; + ColumnFamilyHandle* cf0; + ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + DB* db1; + ASSERT_OK(DB::Open(options, data_path_1_, &db1)); + ASSERT_EQ(3, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + + ColumnFamilyOptions cf_options1; + cf_options1.cf_paths = {{cf_path_1_, 1024}}; + ColumnFamilyHandle* cf1; + ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1)); + ASSERT_EQ(4, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + 
ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); + delete db0; + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}})); + + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); + delete db1; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { + // Tests the cache behavior when there are multiple DBs sharing the same env + // with the same db_paths and cf_paths. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + options.db_paths = {{data_path_0_, 1024}}; + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + + DB* db0; + ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + + ColumnFamilyHandle* cf0; + ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + DB* db1; + ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); + 
ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + ColumnFamilyHandle* cf1; + ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); + delete db0; + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}})); + + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); + delete db1; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}})); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // OS_LINUX + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_memtable_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_memtable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_memtable_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_memtable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ class DBMemTableTest : public DBTestBase { public: - DBMemTableTest() : DBTestBase("/db_memtable_test") {} + DBMemTableTest() : DBTestBase("db_memtable_test", /*env_do_fsync=*/true) {} }; class MockMemTableRep : public MemTableRep { @@ -129,7 +129,6 @@ TEST_F(DBMemTableTest, DuplicateSeq) { SequenceNumber seq = 123; std::string value; - Status s; MergeContext merge_context; Options options; InternalKeyComparator ikey_cmp(options.comparator); @@ -140,28 +139,31 @@ InternalKeyComparator cmp(BytewiseComparator()); auto 
factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Write some keys and make sure it returns false on duplicates - bool res; - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the type should still cause the duplicatae key - res = mem->Add(seq, kTypeMerge, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the seq number will make the key fresh - res = mem->Add(seq + 1, kTypeMerge, "key", "value2"); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2", + nullptr /* kv_prot_info */)); // Test with different types for duplicate keys - res = mem->Add(seq, kTypeDeletion, "key", ""); - ASSERT_FALSE(res); - res = mem->Add(seq, kTypeSingleDeletion, "key", ""); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); + ASSERT_TRUE( + mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); // Test the duplicate keys under stress for (int i = 0; i < 10000; i++) { @@ -169,11 +171,12 @@ if (!insert_dup) { seq++; } - res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq)); + Status s = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq), + nullptr /* kv_prot_info */); if (insert_dup) { - ASSERT_FALSE(res); + ASSERT_TRUE(s.IsTryAgain()); } else { - ASSERT_TRUE(res); + 
ASSERT_OK(s); } } delete mem; @@ -181,26 +184,28 @@ // Test with InsertWithHint options.memtable_insert_with_hint_prefix_extractor.reset( new TestPrefixExtractor()); // which uses _ to extract the prefix - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Insert a duplicate key with _ in it - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */) + .IsTryAgain()); delete mem; // Test when InsertConcurrently will be invoked options.allow_concurrent_memtable_write = true; - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); MemTablePostProcessInfo post_process_info; - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_FALSE(res); + ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info)); + ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info) + .IsTryAgain()); delete mem; } @@ -208,7 +213,6 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { int num_ops = 1000; std::string value; - Status s; MergeContext merge_context; Options options; // A merge operator that is not sensitive to concurrent writes since in this @@ -220,15 +224,14 @@ auto factory = std::make_shared(); options.memtable_factory = factory; options.allow_concurrent_memtable_write = true; - ImmutableCFOptions ioptions(options); 
+ ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Put 0 as the base PutFixed64(&value, static_cast(0)); - bool res = mem->Add(0, kTypeValue, "key", value); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */)); value.clear(); // Write Merge concurrently @@ -237,9 +240,8 @@ std::string v1; for (int seq = 1; seq < num_ops / 2; seq++) { PutFixed64(&v1, seq); - bool res1 = - mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1); - ASSERT_TRUE(res1); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */, + true, &post_process_info1)); v1.clear(); } }); @@ -248,9 +250,8 @@ std::string v2; for (int seq = num_ops / 2; seq < num_ops; seq++) { PutFixed64(&v2, seq); - bool res2 = - mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2); - ASSERT_TRUE(res2); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */, + true, &post_process_info2)); v2.clear(); } }); @@ -261,8 +262,9 @@ ReadOptions roptions; SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey("key", kMaxSequenceNumber); - res = mem->Get(lkey, &value, &status, &merge_context, - &max_covering_tombstone_seq, roptions); + bool res = mem->Get(lkey, &value, /*timestamp=*/nullptr, &status, + &merge_context, &max_covering_tombstone_seq, roptions); + ASSERT_OK(status); ASSERT_TRUE(res); uint64_t ivalue = DecodeFixed64(Slice(value).data()); uint64_t sum = 0; @@ -303,19 +305,20 @@ ASSERT_EQ(hint_bar, rep->last_hint_in()); ASSERT_EQ(hint_bar, rep->last_hint_out()); ASSERT_EQ(5, rep->num_insert_with_hint()); - ASSERT_OK(Put("whitelisted", "vvv")); + ASSERT_OK(Put("NotInPrefixDomain", "vvv")); ASSERT_EQ(5, rep->num_insert_with_hint()); ASSERT_EQ("foo_v1", Get("foo_k1")); ASSERT_EQ("foo_v2", Get("foo_k2")); ASSERT_EQ("foo_v3", Get("foo_k3")); 
ASSERT_EQ("bar_v1", Get("bar_k1")); ASSERT_EQ("bar_v2", Get("bar_k2")); - ASSERT_EQ("vvv", Get("whitelisted")); + ASSERT_EQ("vvv", Get("NotInPrefixDomain")); } TEST_F(DBMemTableTest, ColumnFamilyId) { // Verifies MemTableRepFactory is told the right column family id. Options options; + options.env = CurrentOptions().env; options.allow_concurrent_memtable_write = false; options.create_if_missing = true; options.memtable_factory.reset(new MockMemTableRepFactory()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,43 +8,85 @@ #include "rocksdb/perf_context.h" #include "rocksdb/utilities/debug.h" #include "table/block_based/block_builder.h" -#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" #endif #include "rocksdb/merge_operator.h" +#include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/sortlist.h" #include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { +namespace { +class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; +}; +} // namespace + class DBMergeOperandTest : public DBTestBase { public: - DBMergeOperandTest() : 
DBTestBase("/db_merge_operand_test") {} + DBMergeOperandTest() + : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {} }; -TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { - class LimitedStringAppendMergeOp : public StringAppendTESTOperator { - public: - LimitedStringAppendMergeOp(int limit, char delim) - : StringAppendTESTOperator(delim), limit_(limit) {} +TEST_F(DBMergeOperandTest, MergeOperandReadAfterFreeBug) { + // There was a bug of reading merge operands after they are mistakely freed + // in DB::GetMergeOperands, which is surfaced by cache full. + // See PR#9507 for more. + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.env = env_; + BlockBasedTableOptions table_options; - const char* Name() const override { - return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; - } + // Small cache to simulate cache full + table_options.block_cache = NewLRUCache(1); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - bool ShouldMerge(const std::vector& operands) const override { - if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { - return true; - } - return false; - } + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; - private: - size_t limit_ = 0; - }; + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v4")); + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(number_of_operands, 4); + ASSERT_EQ(values[0].ToString(), "v1"); + ASSERT_EQ(values[1].ToString(), "v2"); + ASSERT_EQ(values[2].ToString(), "v3"); + 
ASSERT_EQ(values[3].ToString(), "v4"); +} +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { Options options; options.create_if_missing = true; // Use only the latest two merge operands. @@ -58,29 +100,29 @@ merge_operands_info.expected_max_number_of_operands = num_records; // k0 value in memtable - Put("k0", "PutARock"); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(Put("k0", "PutARock")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "PutARock"); // k0.1 value in SST - Put("k0.1", "RockInSST"); + ASSERT_OK(Put("k0.1", "RockInSST")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "RockInSST"); // All k1 values are in memtable. ASSERT_OK(Merge("k1", "a")); - Put("k1", "x"); + ASSERT_OK(Put("k1", "x")); ASSERT_OK(Merge("k1", "b")); ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "x"); ASSERT_EQ(values[1], "b"); ASSERT_EQ(values[2], "c"); @@ -97,13 +139,13 @@ // All k1.1 values are in memtable. 
ASSERT_OK(Merge("k1.1", "r")); - Delete("k1.1"); + ASSERT_OK(Delete("k1.1")); ASSERT_OK(Merge("k1.1", "c")); ASSERT_OK(Merge("k1.1", "k")); ASSERT_OK(Merge("k1.1", "s")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "c"); ASSERT_EQ(values[1], "k"); ASSERT_EQ(values[2], "s"); @@ -114,9 +156,9 @@ ASSERT_OK(Merge("k2", "e")); ASSERT_OK(Merge("k2", "r")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "q"); ASSERT_EQ(values[1], "w"); ASSERT_EQ(values[2], "e"); @@ -124,30 +166,30 @@ // All k2.1 values are flushed to L0 into a single file. ASSERT_OK(Merge("k2.1", "m")); - Put("k2.1", "l"); + ASSERT_OK(Put("k2.1", "l")); ASSERT_OK(Merge("k2.1", "n")); ASSERT_OK(Merge("k2.1", "o")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "l,n,o"); // All k2.2 values are flushed to L0 into a single file. 
ASSERT_OK(Merge("k2.2", "g")); - Delete("k2.2"); + ASSERT_OK(Delete("k2.2")); ASSERT_OK(Merge("k2.2", "o")); ASSERT_OK(Merge("k2.2", "t")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "o,t"); // Do some compaction that will make the following tests more predictable // Slice start("PutARock"); // Slice end("t"); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All k3 values are flushed and are in different files. ASSERT_OK(Merge("k3", "ab")); @@ -157,9 +199,9 @@ ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ab"); ASSERT_EQ(values[1], "bc"); ASSERT_EQ(values[2], "cd"); @@ -168,14 +210,14 @@ // All k3.1 values are flushed and are in different files. 
ASSERT_OK(Merge("k3.1", "ab")); ASSERT_OK(Flush()); - Put("k3.1", "bc"); + ASSERT_OK(Put("k3.1", "bc")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "bc"); ASSERT_EQ(values[1], "cd"); ASSERT_EQ(values[2], "de"); @@ -183,14 +225,14 @@ // All k3.2 values are flushed and are in different files. ASSERT_OK(Merge("k3.2", "ab")); ASSERT_OK(Flush()); - Delete("k3.2"); + ASSERT_OK(Delete("k3.2")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "cd"); ASSERT_EQ(values[1], "de"); @@ -205,32 +247,120 @@ ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "ed")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ba"); ASSERT_EQ(values[1], "cb"); ASSERT_EQ(values[2], "dc"); ASSERT_EQ(values[3], "ed"); - // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + // First 3 k5 values are in SST and next 4 k5 values are in Immutable + // Memtable ASSERT_OK(Merge("k5", "who")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Flush()); - Put("k5", "remember"); + 
ASSERT_OK(Put("k5", "remember")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "rocks")); - dbfull()->TEST_SwitchMemtable(); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k5", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "remember"); ASSERT_EQ(values[1], "i"); ASSERT_EQ(values[2], "am"); } +TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) { + Options options; + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + // Use only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1 values are in memtable. + ASSERT_OK(Put("k1", "x")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. 
+ merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Put("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "q,w,e,r"); + + // Do some compaction that will make the following tests more predictable + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // All k3 values are flushed and are in different files. + ASSERT_OK(Put("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All K4 values are in different levels + ASSERT_OK(Put("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); +} + } // namespace 
ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #include "db/forward_iterator.h" #include "port/stack_trace.h" #include "rocksdb/merge_operator.h" +#include "util/random.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend2.h" @@ -35,7 +36,8 @@ // Test merge operator functionality. class DBMergeOperatorTest : public DBTestBase { public: - DBMergeOperatorTest() : DBTestBase("/db_merge_operator_test") {} + DBMergeOperatorTest() + : DBTestBase("db_merge_operator_test", /*env_do_fsync=*/false) {} std::string GetWithReadCallback(SnapshotChecker* snapshot_checker, const Slice& key, @@ -92,7 +94,7 @@ ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); std::string value; - ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k1", &value)); // Make sure that only the latest two merge operands are used. If this was // not the case the value would be "a,b,c,d". ASSERT_EQ(value, "c,d"); @@ -103,7 +105,7 @@ ASSERT_OK(Merge("k2", "c")); ASSERT_OK(Merge("k2", "d")); ASSERT_OK(Flush()); - ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k2", &value)); ASSERT_EQ(value, "c,d"); // All K3 values are flushed and are in different files. 
@@ -114,7 +116,7 @@ ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k3", &value)); ASSERT_EQ(value, "cd,de"); // All K4 values are in different levels @@ -128,7 +130,7 @@ ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k4", &value)); ASSERT_EQ(value, "cd,de"); } @@ -242,7 +244,7 @@ std::string key = Key(key_id % 35); key_id++; for (int k = 0; k < kOperandsPerKeyPerFile; k++) { - std::string val = RandomString(&rnd, kOperandSize); + std::string val = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Merge(WriteOptions(), key, val)); if (true_data[key].size() == 0) { true_data[key] = val; @@ -327,7 +329,7 @@ for (int i = 0; i < kNumOperands; i++) { for (int j = 0; j < kNumKeys; j++) { std::string k = Key(j); - std::string v = RandomString(&rnd, kOperandSize); + std::string v = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Merge(WriteOptions(), k, v)); true_data[k] = std::max(true_data[k], v); @@ -342,8 +344,9 @@ // Code executed before merge operation merge_hook->before_merge_ = [&]() { // Evict all tables from cache before every merge operation + auto* table_cache = dbfull()->TEST_table_cache(); for (uint64_t num : file_numbers) { - TableCache::Evict(dbfull()->TEST_table_cache(), num); + TableCache::Evict(table_cache, num); } // Decrease cache capacity to force all unrefed blocks to be evicted if (bbto.block_cache) { @@ -364,7 +367,7 @@ VerifyDBFromMap(true_data, &total_reads); ASSERT_EQ(merge_cnt, total_reads); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &total_reads); } @@ -383,7 +386,7 @@ std::function writer_func = [&]() { int k = 0; for (int i = 0; i < kNumWrites; i++) { - db_->Merge(WriteOptions(), 
Key(k), Key(k)); + ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k))); if (i && i % kNumOperands == 0) { k++; @@ -401,7 +404,7 @@ ReadOptions ro; ro.tailing = true; Iterator* iter = db_->NewIterator(ro); - + ASSERT_OK(iter->status()); iter->SeekToFirst(); for (int i = 0; i < (kNumWrites / kNumOperands); i++) { while (!iter->Valid()) { @@ -414,6 +417,7 @@ iter->Next(); } + ASSERT_OK(iter->status()); delete iter; }; @@ -447,12 +451,13 @@ // ForwardIterator to not pin it in some circumstances. This test // reproduces it. - db_->Merge(WriteOptions(), "key", "sst"); - db_->Flush(FlushOptions()); // Switch to SuperVersion A - db_->Merge(WriteOptions(), "key", "memtable"); + ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst")); + ASSERT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion A + ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable")); // Pin SuperVersion A std::unique_ptr someone_else(db_->NewIterator(ReadOptions())); + ASSERT_OK(someone_else->status()); bool pushed_first_operand = false; bool stepped_to_next_operand = false; @@ -460,7 +465,7 @@ "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) { EXPECT_FALSE(pushed_first_operand); pushed_first_operand = true; - db_->Flush(FlushOptions()); // Switch to SuperVersion B + EXPECT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion B }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) { @@ -475,7 +480,7 @@ std::unique_ptr iter(db_->NewIterator(ro)); iter->Seek("key"); - ASSERT_TRUE(iter->status().ok()); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString()); EXPECT_TRUE(pushed_first_operand); @@ -620,7 +625,7 @@ // kNumPutBefore keys will have base values for (int i = 0; i < kNumPutBefore; i++) { std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); + std::string value = rnd.RandomString(kOperandSize); 
ASSERT_OK(db_->Put(WriteOptions(), key, value)); true_data[key] = value; @@ -629,7 +634,7 @@ // Do kTotalMerges merges for (int i = 0; i < kTotalMerges; i++) { std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); + std::string value = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Merge(WriteOptions(), key, value)); if (true_data[key] < value) { @@ -640,7 +645,7 @@ // Overwrite random kNumPutAfter keys for (int i = 0; i < kNumPutAfter; i++) { std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); + std::string value = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Put(WriteOptions(), key, value)); true_data[key] = value; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_options_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_options_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_options_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_options_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -27,38 +27,33 @@ class DBOptionsTest : public DBTestBase { public: - DBOptionsTest() : DBTestBase("/db_options_test") {} + DBOptionsTest() : DBTestBase("db_options_test", /*env_do_fsync=*/true) {} #ifndef ROCKSDB_LITE std::unordered_map GetMutableDBOptionsMap( const DBOptions& options) { std::string options_str; - GetStringFromDBOptions(&options_str, options); - std::unordered_map options_map; - StringToMap(options_str, &options_map); std::unordered_map mutable_map; - for (const auto opt : db_options_type_info) { - if (opt.second.is_mutable && - opt.second.verification != OptionVerificationType::kDeprecated) { - mutable_map[opt.first] = options_map[opt.first]; - } - } + ConfigOptions config_options(options); + config_options.delimiter = "; "; + + EXPECT_OK(GetStringFromMutableDBOptions( + config_options, MutableDBOptions(options), &options_str)); + EXPECT_OK(StringToMap(options_str, &mutable_map)); + return 
mutable_map; } std::unordered_map GetMutableCFOptionsMap( const ColumnFamilyOptions& options) { std::string options_str; - GetStringFromColumnFamilyOptions(&options_str, options); - std::unordered_map options_map; - StringToMap(options_str, &options_map); + ConfigOptions config_options; + config_options.delimiter = "; "; + std::unordered_map mutable_map; - for (const auto opt : cf_options_type_info) { - if (opt.second.is_mutable && - opt.second.verification != OptionVerificationType::kDeprecated) { - mutable_map[opt.first] = options_map[opt.first]; - } - } + EXPECT_OK(GetStringFromMutableCFOptions( + config_options, MutableCFOptions(options), &options_str)); + EXPECT_OK(StringToMap(options_str, &mutable_map)); return mutable_map; } @@ -84,9 +79,85 @@ #endif // ROCKSDB_LITE }; +TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) { + Options options; + options.env = env_; + options.track_and_verify_wals_in_manifest = true; + + ImmutableDBOptions db_options(options); + ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest); + + Reopen(options); + ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest); + + Status s = + dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}}); + ASSERT_FALSE(s.ok()); +} + // RocksDB lite don't support dynamic options. 
#ifndef ROCKSDB_LITE +TEST_F(DBOptionsTest, AvoidUpdatingOptions) { + Options options; + options.env = env_; + options.max_background_jobs = 4; + options.delayed_write_rate = 1024; + + Reopen(options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + bool is_changed_stats = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::WriteOptionsFile:PersistOptions", [&](void* /*arg*/) { + ASSERT_FALSE(is_changed_stats); // should only save options file once + is_changed_stats = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // helper function to check the status and reset after each check + auto is_changed = [&] { + bool ret = is_changed_stats; + is_changed_stats = false; + return ret; + }; + + // without changing the value, but it's sanitized to a different value + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "0"}})); + ASSERT_TRUE(is_changed()); + + // without changing the value + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_jobs", "4"}})); + ASSERT_FALSE(is_changed()); + + // changing the value + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}})); + ASSERT_TRUE(is_changed()); + + // update again + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}})); + ASSERT_FALSE(is_changed()); + + // without changing a default value + ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "false"}})); + ASSERT_FALSE(is_changed()); + + // now change + ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "true"}})); + ASSERT_TRUE(is_changed()); + + // multiple values without change + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_total_wal_size", "0"}, {"stats_dump_period_sec", "600"}})); + ASSERT_FALSE(is_changed()); + + // multiple values with change + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_open_files", "100"}, {"stats_dump_period_sec", "600"}})); + ASSERT_TRUE(is_changed()); +} + TEST_F(DBOptionsTest, GetLatestDBOptions) { // GetOptions should be able to get 
latest option changed by SetOptions. Options options; @@ -118,6 +189,127 @@ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1]))); } +TEST_F(DBOptionsTest, SetMutableTableOptions) { + Options options; + options.create_if_missing = true; + options.env = env_; + options.blob_file_size = 16384; + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; + bbto.block_size = 8192; + bbto.block_restart_interval = 7; + + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + Options c_opts = dbfull()->GetOptions(cfh); + const auto* c_bbto = + c_opts.table_factory->GetOptions(); + ASSERT_NE(c_bbto, nullptr); + ASSERT_EQ(c_opts.blob_file_size, 16384); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 8192); + ASSERT_EQ(c_bbto->block_restart_interval, 7); + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"table_factory.block_restart_interval", "11"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Now set an option that is not mutable - options should not change + ASSERT_NOK( + dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that are not - options should not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.no_block_cache", "false"}, + {"table_factory.block_size", "8192"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that do not exist - options should not + // change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "8192"}, + {"table_factory.does_not_exist", 
"true"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Trying to change the table factory fails + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory", TableFactory::kPlainTableName()}})); + + // Set some on the table and some on the Column Family + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"blob_file_size", "32768"}, + {"table_factory.block_restart_interval", "13"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_opts.blob_file_size, 32768); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); + // Set some on the table and a bad one on the ColumnFamily - options should + // not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "1024"}, + {"no_such_option", "32768"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); +} + +TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) { + class DummySkipListFactory : public SkipListFactory { + public: + static const char* kClassName() { return "DummySkipListFactory"; } + const char* Name() const override { return kClassName(); } + explicit DummySkipListFactory() : SkipListFactory(2) {} + }; + { + // Verify the DummySkipList cannot be created + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + std::unique_ptr factory; + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, DummySkipListFactory::kClassName(), &factory)); + } + Options options; + options.create_if_missing = true; + // Try with fail_if_options_file_error=false/true to update the options + for (bool on_error : {false, true}) { + options.fail_if_options_file_error = on_error; + options.env = env_; + options.disable_auto_compactions = false; + + 
options.memtable_factory.reset(new DummySkipListFactory()); + Reopen(options); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + ASSERT_OK( + dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}})); + ColumnFamilyDescriptor cfd; + ASSERT_OK(cfh->GetDescriptor(&cfd)); + ASSERT_STREQ(cfd.options.memtable_factory->Name(), + DummySkipListFactory::kClassName()); + ColumnFamilyHandle* test = nullptr; + ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test)); + ASSERT_OK(test->GetDescriptor(&cfd)); + ASSERT_STREQ(cfd.options.memtable_factory->Name(), + DummySkipListFactory::kClassName()); + + ASSERT_OK(dbfull()->DropColumnFamily(test)); + delete test; + } +} + TEST_F(DBOptionsTest, SetBytesPerSync) { const size_t kValueSize = 1024 * 1024; // 1MB Options options; @@ -140,7 +332,7 @@ WriteOptions write_opts; // should sync approximately 40MB/1MB ~= 40 times. for (i = 0; i < 40; i++) { - Put(Key(i), kValue, write_opts); + ASSERT_OK(Put(Key(i), kValue, write_opts)); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -156,7 +348,7 @@ // should sync approximately 40MB*2/8MB ~= 10 times. // data will be 40*2MB because of previous Puts too. 
for (i = 0; i < 40; i++) { - Put(Key(i), kValue, write_opts); + ASSERT_OK(Put(Key(i), kValue, write_opts)); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -179,15 +371,16 @@ options.env = env_; Reopen(options); ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync); - int counter = 0; + std::atomic_int counter{0}; int low_bytes_per_sync = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; }); + "WritableFileWriter::RangeSync:0", + [&](void* /*arg*/) { counter.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); const std::string kValue(kValueSize, 'v'); int i = 0; for (; i < 10; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } // Do not flush. If we flush here, SwitchWAL will reuse old WAL file since its // empty and will not get the new wal_bytes_per_sync value. 
@@ -198,7 +391,7 @@ counter = 0; i = 0; for (; i < 10; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } ASSERT_GT(counter, 0); ASSERT_GT(low_bytes_per_sync, 0); @@ -233,9 +426,9 @@ for (; i < 3; i++) { ASSERT_OK(Put("foo", ToString(i))); ASSERT_OK(Put("bar", ToString(i))); - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(unmatch_cnt, 0); ASSERT_GE(match_cnt, 11); @@ -251,9 +444,9 @@ for (; i < 3; i++) { ASSERT_OK(Put("foo", ToString(i))); ASSERT_OK(Put("bar", ToString(i))); - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(unmatch_cnt, 0); ASSERT_GE(match_cnt, 11); } @@ -289,14 +482,14 @@ DestroyAndReopen(options); int i = 0; for (; i < 1024; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); + ASSERT_OK(Flush()); for (; i < 1024 * 2; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(2, NumTableFilesAtLevel(0)); uint64_t l0_size = SizeAtLevel(0); @@ -318,7 +511,7 @@ break; } Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); @@ -365,7 +558,7 @@ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3"); // Background compaction executed. - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); } @@ -382,12 +575,12 @@ // Need to insert two keys to avoid trivial move. 
ASSERT_OK(Put("foo", ToString(i))); ASSERT_OK(Put("bar", ToString(i))); - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("3", FilesPerLevel()); ASSERT_OK( dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel()); } @@ -404,6 +597,20 @@ ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed()); } +TEST_F(DBOptionsTest, SetBackgroundFlushThreads) { + Options options; + options.create_if_missing = true; + options.max_background_flushes = 1; + options.env = env_; + Reopen(options); + ASSERT_EQ(1, dbfull()->TEST_BGFlushesAllowed()); + ASSERT_EQ(1, env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_flushes", "3"}})); + ASSERT_EQ(3, env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_EQ(3, dbfull()->TEST_BGFlushesAllowed()); +} + + TEST_F(DBOptionsTest, SetBackgroundJobs) { Options options; options.create_if_missing = true; @@ -476,8 +683,7 @@ TEST_F(DBOptionsTest, MaxTotalWalSizeChange) { Random rnd(1044); const auto value_size = size_t(1024); - std::string value; - test::RandomString(&rnd, value_size, &value); + std::string value = rnd.RandomString(value_size); Options options; options.create_if_missing = true; @@ -496,7 +702,7 @@ ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}})); for (size_t cf = 0; cf < handles_.size(); ++cf) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ("1", FilesPerLevel(static_cast(cf))); } } @@ -547,10 +753,9 @@ } TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) { - SpecialEnv env(env_); - env.time_elapse_only_sleep_ = true; Options options; - options.env = &env; + options.env = env_; + SetTimeElapseOnlySleepOnReopen(&options); options.create_if_missing = true; ASSERT_OK(TryReopen(options)); @@ -569,10 +774,10 @@ assert_candidate_files_empty(dbfull(), true); - 
env.addon_time_.store(20); + env_->MockSleepForMicroseconds(20); assert_candidate_files_empty(dbfull(), true); - env.addon_time_.store(21); + env_->MockSleepForMicroseconds(1); assert_candidate_files_empty(dbfull(), false); Close(); @@ -599,6 +804,7 @@ TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { Options options; + options.env = CurrentOptions().env; options.delayed_write_rate = 0; Reopen(options); ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); @@ -610,6 +816,7 @@ TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleUniversal; options.ttl = 0; @@ -639,6 +846,7 @@ TEST_F(DBOptionsTest, SanitizeTtlDefault) { Options options; + options.env = CurrentOptions().env; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -655,6 +863,7 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Options options; options.compaction_style = kCompactionStyleFIFO; + options.env = CurrentOptions().env; options.ttl = 0; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -680,17 +889,19 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.arena_block_size = 4096; options.compression = kNoCompression; options.create_if_missing = true; options.compaction_options_fifo.allow_compaction = false; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; + // NOTE: Presumed unnecessary and removed: resetting mock time in env + // Test dynamically changing ttl. - env_->addon_time_.store(0); options.ttl = 1 * 60 * 60; // 1 hour ASSERT_OK(TryReopen(options)); @@ -698,30 +909,30 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. 
for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); - // Add 61 seconds to the time. - env_->addon_time_.fetch_add(61); + env_->MockSleepForSeconds(61); // No files should be compacted as ttl is set to 1 hour. ASSERT_EQ(dbfull()->GetOptions().ttl, 3600); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Set ttl to 1 minute. So all files should get deleted. ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}})); ASSERT_EQ(dbfull()->GetOptions().ttl, 60); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + // Test dynamically changing compaction_options_fifo.max_table_files_size - env_->addon_time_.store(0); options.compaction_options_fifo.max_table_files_size = 500 << 10; // 00KB options.ttl = 0; DestroyAndReopen(options); @@ -729,9 +940,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -739,7 +950,7 @@ // No files should be compacted as max_table_files_size is set to 500 KB. 
ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, 500 << 10); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Set max_table_files_size to 12 KB. So only 1 file should remain now. @@ -747,7 +958,7 @@ {{"compaction_options_fifo", "{max_table_files_size=12288;}"}})); ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, 12 << 10); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 1); @@ -761,9 +972,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -772,7 +983,7 @@ // allow_compaction is false ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, false); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Set allow_compaction to true. So number of files should be between 1 and 5. 
@@ -780,7 +991,7 @@ {{"compaction_options_fifo", "{allow_compaction=true;}"}})); ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GE(NumTableFilesAtLevel(0), 1); ASSERT_LE(NumTableFilesAtLevel(0), 5); @@ -801,14 +1012,14 @@ ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}})); ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size); for (int i = 0; i < 1024; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); + ASSERT_OK(Flush()); for (int i = 0; i < 1024 * 2; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(256, env_->compaction_readahead_size_); Close(); } @@ -818,6 +1029,7 @@ options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.create_if_missing = true; + options.env = CurrentOptions().env; ASSERT_OK(TryReopen(options)); @@ -825,9 +1037,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. 
for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -859,8 +1071,129 @@ ASSERT_EQ(dbfull()->GetOptions().ttl, 191); } +TEST_F(DBOptionsTest, ChangeCompression) { + if (!Snappy_Supported() || !LZ4_Supported()) { + return; + } + Options options; + options.write_buffer_size = 10 << 10; // 10KB + options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + options.compression = CompressionType::kLZ4Compression; + options.bottommost_compression = CompressionType::kNoCompression; + options.bottommost_compression_opts.level = 2; + options.bottommost_compression_opts.parallel_threads = 1; + options.env = CurrentOptions().env; + + ASSERT_OK(TryReopen(options)); + + CompressionType compression_used = CompressionType::kLZ4Compression; + CompressionOptions compression_opt_used; + bool compacted = false; + SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* c = reinterpret_cast(arg); + compression_used = c->output_compression(); + compression_opt_used = c->output_compression_opts(); + compacted = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kNoCompression, compression_used); + ASSERT_EQ(options.compression_opts.level, compression_opt_used.level); + ASSERT_EQ(options.compression_opts.parallel_threads, + compression_opt_used.parallel_threads); + + compression_used = CompressionType::kLZ4Compression; + compacted = false; + ASSERT_OK(dbfull()->SetOptions( 
+ {{"bottommost_compression", "kSnappyCompression"}, + {"bottommost_compression_opts", "0:6:0:0:4:true"}})); + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kSnappyCompression, compression_used); + ASSERT_EQ(6, compression_opt_used.level); + // Right now parallel_level is not yet allowed to be changed. + + SyncPoint::GetInstance()->DisableProcessing(); +} + #endif // ROCKSDB_LITE +TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) { + // Verify the bottommost compression options still take effect even when the + // bottommost compression type is left at its default value. Verify for both + // automatic and manual compaction. + if (!Snappy_Supported() || !LZ4_Supported()) { + return; + } + + constexpr int kUpperCompressionLevel = 1; + constexpr int kBottommostCompressionLevel = 2; + constexpr int kNumL0Files = 2; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.compression = CompressionType::kLZ4Compression; + options.compression_opts.level = kUpperCompressionLevel; + options.bottommost_compression_opts.level = kBottommostCompressionLevel; + options.bottommost_compression_opts.enabled = true; + Reopen(options); + + CompressionType compression_used = CompressionType::kDisableCompressionOption; + CompressionOptions compression_opt_used; + bool compacted = false; + SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) { + Compaction* c = static_cast(arg); + compression_used = c->output_compression(); + compression_opt_used = c->output_compression_opts(); + compacted = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // First, verify for automatic compaction. 
+ for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); + + // Second, verify for manual compaction. + compacted = false; + compression_used = CompressionType::kDisableCompressionOption; + compression_opt_used = CompressionOptions(); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_properties_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_properties_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_properties_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_properties_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,7 @@ #include "rocksdb/perf_context.h" #include "rocksdb/perf_level.h" #include "rocksdb/table.h" +#include "test_util/mock_time_env.h" #include "util/random.h" #include "util/string_util.h" @@ -26,7 +27,27 @@ class DBPropertiesTest : public DBTestBase { public: - DBPropertiesTest() : DBTestBase("/db_properties_test") {} + DBPropertiesTest() + : DBTestBase("db_properties_test", /*env_do_fsync=*/false) {} + + void AssertDbStats(const std::map& db_stats, + double expected_uptime, int expected_user_bytes_written, + int expected_wal_bytes_written, + int 
expected_user_writes_by_self, + int expected_user_writes_with_wal) { + ASSERT_EQ(std::to_string(expected_uptime), db_stats.at("db.uptime")); + ASSERT_EQ(std::to_string(expected_wal_bytes_written), + db_stats.at("db.wal_bytes_written")); + ASSERT_EQ("0", db_stats.at("db.wal_syncs")); + ASSERT_EQ(std::to_string(expected_user_bytes_written), + db_stats.at("db.user_bytes_written")); + ASSERT_EQ("0", db_stats.at("db.user_writes_by_other")); + ASSERT_EQ(std::to_string(expected_user_writes_by_self), + db_stats.at("db.user_writes_by_self")); + ASSERT_EQ(std::to_string(expected_user_writes_with_wal), + db_stats.at("db.user_writes_with_wal")); + ASSERT_EQ("0", db_stats.at("db.user_write_stall_micros")); + } }; #ifndef ROCKSDB_LITE @@ -52,12 +73,12 @@ // Block sync calls env_->delay_sstable_sync_.store(true, std::memory_order_release); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable ASSERT_TRUE(dbfull()->GetProperty( handles_[1], "rocksdb.num-entries-active-mem-table", &num)); ASSERT_EQ("2", num); - Put(1, "k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger compaction ASSERT_TRUE(dbfull()->GetProperty( handles_[1], "rocksdb.num-entries-active-mem-table", &num)); ASSERT_EQ("1", num); @@ -97,10 +118,10 @@ uint64_t v1, v2, v3; ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1)); - Put("12345678", ""); + ASSERT_OK(Put("12345678", "")); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3)); @@ -126,8 +147,8 @@ Random rnd(301); for (auto* handle : handles_) { for (int i = 0; i < kKeyNum; ++i) { - db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Put(WriteOptions(), handle, 
rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } } @@ -153,7 +174,7 @@ DB::Properties::kEstimateTableReadersMem, &before_flush_trm)); // Issue flush and expect larger memory usage of table readers. - db_->Flush(FlushOptions(), handle); + ASSERT_OK(db_->Flush(FlushOptions(), handle)); ASSERT_TRUE(db_->GetAggregatedIntProperty( DB::Properties::kEstimateTableReadersMem, &after_flush_trm)); @@ -212,7 +233,7 @@ void VerifyTableProperties( const TableProperties& base_tp, const TableProperties& new_tp, - double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.15 : 0.1, + double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.18 : 0.1, double index_size_bias = 0.1, double data_size_bias = 0.1, double num_data_blocks_bias = 0.05) { VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); @@ -299,9 +320,9 @@ for (int i = 0; i < files; i++) { int rows = files / 10; for (int j = 0; j < rows; j++) { - db_->Put(WriteOptions(), std::to_string(++key), "foo"); + ASSERT_OK(db_->Put(WriteOptions(), std::to_string(++key), "foo")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } } std::string num; @@ -335,7 +356,7 @@ table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -346,23 +367,24 @@ Random rnd(5632); for (int table = 1; table <= kTableCount; ++table) { for (int i = 0; i < kPutsPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kDeletionsPerTable; i++) { - db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize)); + ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize))); } for (int i = 0; i < 
kMergeOperandsPerTable; i++) { - db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kRangeDeletionsPerTable; i++) { - std::string start = RandomString(&rnd, kKeySize); + std::string start = rnd.RandomString(kKeySize); std::string end = start; end.resize(kValueSize); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + start, end)); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } std::string property; db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); @@ -409,11 +431,11 @@ int key_index = 0; Random rnd(301); for (int num = 0; num < 8; num++) { - Put("foo", "bar"); + ASSERT_OK(Put("foo", "bar")); GenerateNewFile(&rnd, &key_index); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); std::string prop; ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); @@ -429,7 +451,7 @@ // Reopen and issue Get(). 
See thee latency tracked ReopenWithColumnFamilies({"default", "pikachu"}, options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int key = 0; key < key_index; key++) { Get(Key(key)); } @@ -457,6 +479,7 @@ std::unique_ptr iter(db_->NewIterator(ReadOptions())); for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { } + ASSERT_OK(iter->status()); } ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop)); ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); @@ -470,9 +493,9 @@ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); // put something and read it back , CF 1 should show histogram. - Put(1, "foo", "bar"); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("bar", Get(1, "foo")); ASSERT_TRUE( @@ -498,7 +521,7 @@ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); // Clear internal stats - dbfull()->ResetStats(); + ASSERT_OK(dbfull()->ResetStats()); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop)); ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram")); ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); @@ -533,7 +556,7 @@ table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -546,24 +569,25 @@ TableProperties tp, sum_tp, expected_tp; for (int table = 1; table <= kTableCount; ++table) { for (int i = 0; i < kPutsPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + 
ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kDeletionsPerTable; i++) { - db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize)); + ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize))); } for (int i = 0; i < kMergeOperandsPerTable; i++) { - db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kRangeDeletionsPerTable; i++) { - std::string start = RandomString(&rnd, kKeySize); + std::string start = rnd.RandomString(kKeySize); std::string end = start; end.resize(kValueSize); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + start, end)); } - db_->Flush(FlushOptions()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ResetTableProperties(&sum_tp); for (int level = 0; level < kMaxLevel; ++level) { db_->GetProperty( @@ -603,7 +627,8 @@ value_is_delta_encoded); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. - VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); + VerifyTableProperties(expected_tp, tp, CACHE_LINE_SIZE >= 256 ? 0.6 : 0.5, + 0.4, 0.4, 0.25); } } } @@ -828,7 +853,7 @@ // Wait for compaction to be done. This is important because otherwise RocksDB // might schedule a compaction when reopening the database, failing assertion // (A) as a result. 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); options.max_open_files = 10; Reopen(options); // After reopening, no table reader is loaded, so no memory for table readers @@ -856,7 +881,7 @@ std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); ASSERT_EQ(int_num, 2U); @@ -865,7 +890,7 @@ std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); ASSERT_EQ(int_num, 3U); @@ -920,11 +945,12 @@ for (int r = 0; r < kNumRounds; ++r) { for (int f = 0; f < kFlushesPerRound; ++f) { for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + ASSERT_OK( + Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize))); } } // Make sure that there is no flush between getting the two properties. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); // in no iterator case, these two number should be the same. @@ -938,12 +964,13 @@ iters.push_back(db_->NewIterator(ReadOptions())); for (int f = 0; f < kFlushesPerRound; ++f) { for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + ASSERT_OK( + Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize))); } } // Force flush to prevent flush from happening between getting the // properties or after getting the properties and before the new round. - Flush(); + ASSERT_OK(Flush()); // In the second round, add iterators. 
dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); @@ -958,6 +985,7 @@ // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks // whenever we release an iterator. for (auto* iter : iters) { + ASSERT_OK(iter->status()); delete iter; dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); // Expect the size shrinking @@ -1007,19 +1035,19 @@ uint64_t int_num; ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_EQ(int_num, 0U); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_GT(int_num, 0U); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_GT(int_num, 0U); @@ -1027,7 +1055,7 @@ sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_EQ(int_num, 0U); @@ -1057,7 +1085,7 @@ std::string key = ToString(i) + ToString(j) + "key"; ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal)); } - Flush(); + ASSERT_OK(Flush()); } // no compression at L0, so ratio is less than one @@ -1065,7 +1093,7 @@ ASSERT_GT(CompressionRatioAtLevel(0), 0.0); ASSERT_EQ(CompressionRatioAtLevel(1), -1.0); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_EQ(CompressionRatioAtLevel(0), -1.0); // Data at L1 should be highly compressed thanks to Snappy and redundant data @@ -1168,6 +1196,61 @@ } }; +class BlockCountingTablePropertiesCollector : public TablePropertiesCollector { + public: + 
static const std::string kNumSampledBlocksPropertyName; + + const char* Name() const override { + return "BlockCountingTablePropertiesCollector"; + } + + Status Finish(UserCollectedProperties* properties) override { + (*properties)[kNumSampledBlocksPropertyName] = + ToString(num_sampled_blocks_); + return Status::OK(); + } + + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override { + if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) { + num_sampled_blocks_++; + } + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{ + {kNumSampledBlocksPropertyName, ToString(num_sampled_blocks_)}, + }; + } + + private: + uint32_t num_sampled_blocks_ = 0; +}; + +const std::string + BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName = + "NumSampledBlocks"; + +class BlockCountingTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + const char* Name() const override { + return "BlockCountingTablePropertiesCollectorFactory"; + } + + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new BlockCountingTablePropertiesCollector(); + } +}; + #ifndef ROCKSDB_LITE TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { Options options = CurrentOptions(); @@ -1180,9 +1263,9 @@ // Create 4 tables for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), ToString(table * 100 + i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } TablePropertiesCollection props; @@ -1204,7 
+1287,7 @@ ASSERT_GT(collector_factory->num_created_, 0U); collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_GT(collector_factory->num_created_, 0U); } #endif // ROCKSDB_LITE @@ -1220,9 +1303,9 @@ // Create 2 files for (int table = 0; table < 2; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(1, ToString(table * 100 + i), "val"); + ASSERT_OK(Put(1, ToString(table * 100 + i), "val")); } - Flush(1); + ASSERT_OK(Flush(1)); } ASSERT_GT(collector_factory->num_created_, 0U); @@ -1230,15 +1313,15 @@ // Trigger automatic compactions. for (int table = 0; table < 3; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(1, ToString(table * 100 + i), "val"); + ASSERT_OK(Put(1, ToString(table * 100 + i), "val")); } - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_GT(collector_factory->num_created_, 0U); collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_GT(collector_factory->num_created_, 0U); // Come back to write to default column family @@ -1247,9 +1330,9 @@ // Create 4 tables in default column family for (int table = 0; table < 2; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_GT(collector_factory->num_created_, 0U); @@ -1257,15 +1340,15 @@ // Trigger automatic compactions. 
for (int table = 0; table < 3; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_GT(collector_factory->num_created_, 0U); collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_GT(collector_factory->num_created_, 0U); } @@ -1296,18 +1379,18 @@ const int kMaxKey = 1000; for (int i = 0; i < kMaxKey; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(i), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 1) { // Clear Level 0 so that when later flush a file with deletions, // we don't trigger an organic compaction. 
ASSERT_OK(Put(Key(0), "")); ASSERT_OK(Put(Key(kMaxKey * 2), "")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -1319,17 +1402,18 @@ iter->Next(); ++c; } + ASSERT_OK(iter->status()); ASSERT_EQ(c, 200); } - Delete(Key(0)); + ASSERT_OK(Delete(Key(0))); for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } - Delete(Key(kMaxKey * 2)); + ASSERT_OK(Delete(Key(kMaxKey * 2))); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); { SetPerfLevel(kEnableCount); @@ -1340,6 +1424,7 @@ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(c, 0); ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u); ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u); @@ -1370,14 +1455,14 @@ for (int i = 0; i < kMaxKey; i++) { ASSERT_OK(Put(Key(i), "")); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); for (int i = 1; i < kMaxKey - 1; i++) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0), 2); // Restart the DB. Although number of files didn't reach @@ -1385,7 +1470,7 @@ // still be triggered because of the need-compaction hint. 
options.disable_auto_compactions = false; Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); { SetPerfLevel(kEnableCount); @@ -1395,6 +1480,7 @@ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { c++; } + ASSERT_OK(iter->status()); ASSERT_EQ(c, 2); ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0); // We iterate every key twice. Is it a bug? @@ -1403,25 +1489,149 @@ } } +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { + // Sampled compression requires at least one of the following four types. + if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() && + !ZSTD_Supported()) { + return; + } + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.table_properties_collector_factories.emplace_back( + std::make_shared()); + + for (bool sample_for_compression : {false, true}) { + // For simplicity/determinism, sample 100% when enabled, or 0% when disabled + options.sample_for_compression = sample_for_compression ? 1 : 0; + + DestroyAndReopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + // A `BlockAdd()` should have been seen for files generated by flush or + // compaction when `sample_for_compression` is enabled. 
+ TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + auto& user_props = file_and_props.second->user_collected_properties; + ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName) != + user_props.end()); + ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName), + ToString(sample_for_compression ? 1 : 0)); + } + } +} + +class CompressionSamplingDBPropertiesTest + : public DBPropertiesTest, + public ::testing::WithParamInterface { + public: + CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {} + + protected: + const bool fast_; +}; + +INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest, + CompressionSamplingDBPropertiesTest, ::testing::Bool()); + +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_P(CompressionSamplingDBPropertiesTest, + EstimateDataSizeWithCompressionSampling) { + Options options = CurrentOptions(); + if (fast_) { + // One of the following light compression libraries must be present. + if (LZ4_Supported()) { + options.compression = kLZ4Compression; + } else if (Snappy_Supported()) { + options.compression = kSnappyCompression; + } else { + return; + } + } else { + // One of the following heavy compression libraries must be present. + if (ZSTD_Supported()) { + options.compression = kZSTD; + } else if (Zlib_Supported()) { + options.compression = kZlibCompression; + } else { + return; + } + } + options.disable_auto_compactions = true; + // For simplicity/determinism, sample 100%. + options.sample_for_compression = 1; + Reopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. 
The value consists of compressible data so the + // data block should be stored compressed. + std::string val(1024, 'a'); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", val)); + ASSERT_OK(Put("b", val)); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + ASSERT_GT(file_and_props.second->data_size, 0); + if (fast_) { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->fast_compression_estimated_data_size); + } else { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->slow_compression_estimated_data_size); + } + } +} + TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) { - Options options; + Options options = CurrentOptions(); Reopen(options); - Put("foo", "bar"); - Delete("foo"); - Delete("foo"); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Delete("foo")); + ASSERT_OK(Delete("foo")); uint64_t num_keys = 0; ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys)); ASSERT_EQ(0, num_keys); } TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { - std::unique_ptr mock_env(new MockTimeEnv(Env::Default())); uint64_t oldest_key_time = 0; - Options options; - options.env = mock_env.get(); + Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); // "rocksdb.estimate-oldest-key-time" only available to fifo compaction. 
- mock_env->set_current_time(100); for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal, kCompactionStyleNone}) { options.compaction_style = compaction; @@ -1432,60 +1642,61 @@ DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); } + int64_t mock_start_time; + ASSERT_OK(env_->GetCurrentTime(&mock_start_time)); + options.compaction_style = kCompactionStyleFIFO; options.ttl = 300; + options.max_open_files = -1; options.compaction_options_fifo.allow_compaction = false; DestroyAndReopen(options); - mock_env->set_current_time(100); + env_->MockSleepForSeconds(100); ASSERT_OK(Put("k1", "v1")); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); ASSERT_OK(Flush()); ASSERT_EQ("1", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); - mock_env->set_current_time(200); + env_->MockSleepForSeconds(100); // -> 200 ASSERT_OK(Put("k2", "v2")); ASSERT_OK(Flush()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); - mock_env->set_current_time(300); + env_->MockSleepForSeconds(100); // -> 300 ASSERT_OK(Put("k3", "v3")); ASSERT_OK(Flush()); ASSERT_EQ("3", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); - mock_env->set_current_time(450); + env_->MockSleepForSeconds(150); // -> 450 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("2", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, 
&oldest_key_time)); - ASSERT_EQ(200, oldest_key_time); + ASSERT_EQ(200, oldest_key_time - mock_start_time); - mock_env->set_current_time(550); + env_->MockSleepForSeconds(100); // -> 550 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("1", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(300, oldest_key_time); + ASSERT_EQ(300, oldest_key_time - mock_start_time); - mock_env->set_current_time(650); + env_->MockSleepForSeconds(100); // -> 650 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel()); ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - - // Close before mock_env destructs. - Close(); } TEST_F(DBPropertiesTest, SstFilesSize) { @@ -1516,6 +1727,7 @@ std::shared_ptr listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.disable_auto_compactions = true; options.listeners.push_back(listener); Reopen(options); @@ -1588,11 +1800,11 @@ for (int i = 0; i < kNumL0Files; ++i) { // Make sure they overlap in keyspace to prevent trivial move - Put("key1", "val"); - Put("key2", "val"); - Flush(); + ASSERT_OK(Put("key1", "val")); + ASSERT_OK(Put("key2", "val")); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(listener->Validated()); } @@ -1600,6 +1812,8 @@ Options options; uint64_t value; + options.env = CurrentOptions().env; + // Block cache properties are not available for tables other than // block-based table. options.table_factory.reset(NewPlainTableFactory()); @@ -1650,7 +1864,8 @@ // Insert unpinned item to the cache and check size. 
constexpr size_t kSize1 = 50; - block_cache->Insert("item1", nullptr /*value*/, kSize1, nullptr /*deleter*/); + ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, kSize1, + nullptr /*deleter*/)); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value)); @@ -1662,8 +1877,8 @@ // Insert pinned item to the cache and check size. constexpr size_t kSize2 = 30; Cache::Handle* item2 = nullptr; - block_cache->Insert("item2", nullptr /*value*/, kSize2, nullptr /*deleter*/, - &item2); + ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, kSize2, + nullptr /*deleter*/, &item2)); ASSERT_NE(nullptr, item2); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -1676,8 +1891,8 @@ // Insert another pinned item to make the cache over-sized. constexpr size_t kSize3 = 80; Cache::Handle* item3 = nullptr; - block_cache->Insert("item3", nullptr /*value*/, kSize3, nullptr /*deleter*/, - &item3); + ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, kSize3, + nullptr /*deleter*/, &item3)); ASSERT_NE(nullptr, item2); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -1701,7 +1916,80 @@ ASSERT_EQ(0, value); } +TEST_F(DBPropertiesTest, GetMapPropertyDbStats) { + auto mock_clock = std::make_shared(env_->GetSystemClock()); + CompositeEnvWrapper env(env_, mock_clock); + + Options opts = CurrentOptions(); + opts.env = &env; + Reopen(opts); + + { + std::map db_stats; + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 0.0 /* expected_uptime */, + 0 /* expected_user_bytes_written */, + 0 /* expected_wal_bytes_written */, + 0 /* expected_user_writes_by_self */, + 0 /* expected_user_writes_with_wal */); + } + + { + mock_clock->SleepForMicroseconds(1500000); + + std::map db_stats; + 
ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 1.5 /* expected_uptime */, + 0 /* expected_user_bytes_written */, + 0 /* expected_wal_bytes_written */, + 0 /* expected_user_writes_by_self */, + 0 /* expected_user_writes_with_wal */); + } + + int expected_user_bytes_written = 0; + { + // Write with WAL disabled. + WriteOptions write_opts; + write_opts.disableWAL = true; + + WriteBatch batch; + ASSERT_OK(batch.Put("key", "val")); + expected_user_bytes_written += static_cast(batch.GetDataSize()); + + ASSERT_OK(db_->Write(write_opts, &batch)); + + std::map db_stats; + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 1.5 /* expected_uptime */, + expected_user_bytes_written, + 0 /* expected_wal_bytes_written */, + 1 /* expected_user_writes_by_self */, + 0 /* expected_user_writes_with_wal */); + } + + int expected_wal_bytes_written = 0; + { + // Write with WAL enabled. + WriteBatch batch; + ASSERT_OK(batch.Delete("key")); + expected_user_bytes_written += static_cast(batch.GetDataSize()); + expected_wal_bytes_written += static_cast(batch.GetDataSize()); + + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::map db_stats; + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 1.5 /* expected_uptime */, + expected_user_bytes_written, expected_wal_bytes_written, + 2 /* expected_user_writes_by_self */, + 1 /* expected_user_writes_with_wal */); + } + + Close(); +} + #endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_range_del_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_range_del_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_range_del_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_range_del_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,13 +7,14 @@ #include 
"port/stack_trace.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "test_util/testutil.h" +#include "util/random.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { class DBRangeDelTest : public DBTestBase { public: - DBRangeDelTest() : DBTestBase("/db_range_del_test") {} + DBRangeDelTest() : DBTestBase("db_range_del_test", /*env_do_fsync=*/false) {} std::string GetNumericStr(int key) { uint64_t uint64_key = static_cast(key); @@ -47,6 +48,21 @@ ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported()); } +TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) { + ASSERT_OK(db_->Put(WriteOptions(), "b", "val")); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "b")); + ASSERT_EQ("val", Get("b")); +} + +TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) { + ASSERT_OK(db_->Put(WriteOptions(), "b", "val")); + ASSERT_TRUE( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a") + .IsInvalidArgument()); + ASSERT_EQ("val", Get("b")); +} + TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) { do { DestroyAndReopen(CurrentOptions()); @@ -57,6 +73,15 @@ } while (ChangeOptions(kRangeDelSkipConfigs)); } +TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) { + Options opts = CurrentOptions(); + opts.compression_opts.max_dict_bytes = 16384; + Reopen(opts); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr2")); + ASSERT_OK(db_->Flush(FlushOptions())); +} + TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) { do { Options opts = CurrentOptions(); @@ -66,13 +91,14 @@ // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"); - db_->Flush(FlushOptions()); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(1, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -92,7 +118,7 @@ Options options = CurrentOptions(); options.disable_auto_compactions = true; options.level0_file_num_compaction_trigger = kNumFiles; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); options.num_levels = 2; options.target_file_size_base = kFileBytes; BlockBasedTableOptions table_options; @@ -102,28 +128,29 @@ // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(1))); Random rnd(301); for (int i = 0; i < kNumFiles; ++i) { std::vector values; // Write 12K (4 values, each 3K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, 3 << 10)); + values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); if (j == 0 && i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); db_->ReleaseSnapshot(snapshot); @@ -139,42 +166,61 @@ opts.disable_auto_compactions = true; opts.level0_file_num_compaction_trigger = kNumFiles; opts.max_compaction_bytes = kNumPerFile * kBytesPerVal; - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); // Want max_compaction_bytes to trigger the end of compaction output file, not // target_file_size_base, so make the latter much bigger - opts.target_file_size_base = 100 * opts.max_compaction_bytes; - Reopen(opts); + // opts.target_file_size_base = 100 * opts.max_compaction_bytes; + opts.target_file_size_base = 1; + DestroyAndReopen(opts); // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); + Random rnd(301); + + ASSERT_OK(Put(GetNumericStr(0), rnd.RandomString(kBytesPerVal))); + ASSERT_OK( + Put(GetNumericStr(kNumPerFile - 1), rnd.RandomString(kBytesPerVal))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(GetNumericStr(kNumPerFile), rnd.RandomString(kBytesPerVal))); + ASSERT_OK( + Put(GetNumericStr(kNumPerFile * 2 - 1), rnd.RandomString(kBytesPerVal))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(NumTableFilesAtLevel(2), 2); + + ASSERT_OK(db_->SetOptions( + db_->DefaultColumnFamily(), + {{"target_file_size_base", ToString(100 * opts.max_compaction_bytes)}})); + // It spans the whole key-range, thus will be included in all output files ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), GetNumericStr(0), GetNumericStr(kNumFiles * kNumPerFile - 1))); - Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { std::vector values; // Write 1MB (256 values, each 4K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, kBytesPerVal)); + values.push_back(rnd.RandomString(kBytesPerVal)); ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j])); } // extra entry to trigger SpecialSkipListFactory's flush ASSERT_OK(Put(GetNumericStr(kNumPerFile), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, + /*column_family=*/nullptr, + /*disallow_trivial_move=*/true)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GE(NumTableFilesAtLevel(1), 2); - std::vector> files; dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); - for (size_t i = 0; i < files[1].size() - 1; ++i) { + for (size_t i = 0; i + 1 < files[1].size(); ++i) { 
ASSERT_TRUE(InternalKeyComparator(opts.comparator) .Compare(files[1][i].largest, files[1][i + 1].smallest) < 0); @@ -205,10 +251,10 @@ } TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); - db_->Put(WriteOptions(), "b2", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b2", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b")); // first iteration verifies query correctness in memtable, second verifies @@ -225,8 +271,9 @@ } TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "unused", "val"); // prevents empty after compaction - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "unused", + "val")); // prevents empty after compaction + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); @@ -238,8 +285,8 @@ for (int i = 0; i < 2; ++i) { if (i > 0) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } @@ -253,7 +300,7 @@ const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - Reopen(opts); + DestroyAndReopen(opts); // Write a third before snapshot, a third between snapshot and tombstone, and // a third after the tombstone. 
Keys older than snapshot or newer than the @@ -263,12 +310,13 @@ if (i == kNum / 3) { snapshot = db_->GetSnapshot(); } else if (i == 2 * kNum / 3) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); for (int i = 0; i < kNum; ++i) { ReadOptions read_opts; @@ -290,32 +338,35 @@ Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.disable_auto_compactions = true; - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); opts.num_levels = 2; opts.statistics = CreateDBStatistics(); - Reopen(opts); + DestroyAndReopen(opts); for (int i = 0; i < kNumFiles; ++i) { if (i > 0) { // range tombstone covers first half of the previous file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr((i - 1) * kNumPerFile), - GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)); + ASSERT_OK(db_->DeleteRange( + WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr((i - 1) * kNumPerFile), + GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2))); } // Make sure a given key appears in each file so compaction won't be able to // use trivial move, which would happen if the ranges were non-overlapping. // Also, we need an extra element since flush is only triggered when the // number of keys is one greater than SpecialSkipListFactory's limit. // We choose a key outside the key-range used by the test to avoid conflict. 
- db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), + "val")); for (int j = 0; j < kNumPerFile; ++j) { - db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"); + ASSERT_OK( + db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2, @@ -345,7 +396,7 @@ options.level0_file_num_compaction_trigger = kNumFiles; options.max_bytes_for_level_base = 2 * kFileBytes; options.max_subcompactions = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); options.num_levels = 3; options.target_file_size_base = kFileBytes; options.target_file_size_multiplier = 1; @@ -357,18 +408,18 @@ if (i > 0) { // delete [95,105) in two files, [295,305) in next two int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) for (int k = 0; k < kNumPerFile; k++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k])); } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kNumFiles - 1) { // background compaction may happen early for kNumFiles'th file 
ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); @@ -384,7 +435,7 @@ // oversized L0 (relative to base_level) causes the compaction to run // earlier. ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"disable_auto_compactions", "true"}})); ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -404,7 +455,7 @@ options.compaction_style = kCompactionStyleUniversal; options.level0_file_num_compaction_trigger = kFilesPerLevel; options.max_subcompactions = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); options.num_levels = kNumLevels; options.target_file_size_base = kNumPerFile << 10; options.target_file_size_multiplier = 1; @@ -417,24 +468,24 @@ // insert range deletions [95,105) in two files, [295,305) in next two // to prepare L1 for later manual compaction. 
int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) for (int k = 0; k < kNumPerFile; k++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k])); } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kFilesPerLevel - 1) { // background compaction may happen early for kFilesPerLevel'th file ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1); } @@ -444,7 +495,7 @@ // probably means universal compaction + subcompaction + range deletion are // compatible. 
ASSERT_OK(dbfull()->RunManualCompaction( - reinterpret_cast(db_->DefaultColumnFamily()) + static_cast_with_check(db_->DefaultColumnFamily()) ->cfd(), 1 /* input_level */, 2 /* output_level */, CompactRangeOptions(), nullptr /* begin */, nullptr /* end */, true /* exclusive */, @@ -457,7 +508,7 @@ const int kNumPerFile = 3, kNumFiles = 3; Options opts = CurrentOptions(); opts.disable_auto_compactions = true; - opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(2 * kNumPerFile)); opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); opts.num_levels = 2; Reopen(opts); @@ -467,17 +518,17 @@ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) { if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) { // Delete merge operands from all but the last file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); // we need to prevent trivial move using Puts so compaction will actually // process the merge operands. 
- db_->Put(WriteOptions(), "prevent_trivial_move", ""); + ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", "")); if (i > 0 && i % kNumPerFile == 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } @@ -488,7 +539,7 @@ PutFixed64(&expected, 45); // 1+2+...+9 ASSERT_EQ(expected, actual); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); expected.clear(); ASSERT_OK(db_->Get(read_opts, "key", &actual)); @@ -534,19 +585,19 @@ opts.statistics = CreateDBStatistics(); Reopen(opts); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", - "dr10"); // obsolete after compaction - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr10")); // obsolete after compaction + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", - "dr20"); // protected by snapshot - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", + "dr20")); // protected by snapshot + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(2, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -581,34 +632,36 @@ opts.comparator = test::Uint64Comparator(); opts.level0_file_num_compaction_trigger = 4; opts.level0_stop_writes_trigger = 4; - 
opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); opts.num_levels = 2; BlockBasedTableOptions bbto; bbto.cache_index_and_filter_blocks = true; bbto.block_cache = NewLRUCache(8 << 20); opts.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(opts); + DestroyAndReopen(opts); // Hold a snapshot so range deletions can't become obsolete during compaction // to bottommost level (i.e., L1). const Snapshot* snapshot = db_->GetSnapshot(); for (int i = 0; i < kNum; ++i) { - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); if (i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } } // Must be > 1 so the first L1 file can be closed before scan finishes - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(NumTableFilesAtLevel(1), 1); std::vector file_numbers = ListTableFiles(env_, dbname_); ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = kRangeEnd; iter->SeekToFirst(); for (auto file_number : file_numbers) { @@ -626,12 +679,22 @@ ASSERT_EQ(kNum, expected); delete iter; db_->ReleaseSnapshot(snapshot); + + // Also test proper cache handling in GetRangeTombstoneIterator, + // via TablesRangeTombstoneSummary. (This once triggered memory leak + // report with ASAN.) 
+ opts.max_open_files = 1; + Reopen(opts); + + std::string str; + ASSERT_OK(dbfull()->TablesRangeTombstoneSummary(db_->DefaultColumnFamily(), + 100, &str)); } TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -650,13 +713,13 @@ // memtable can hold. It switches the active memtable to immutable (flush is // prevented by the above options) upon inserting an element that would // overflow the memtable. - opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); DestroyAndReopen(opts); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "blah", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "blah", "val")); ReadOptions read_opts; std::string value; @@ -667,7 +730,7 @@ TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -690,11 +753,11 @@ for (int i = 0; i < kNumMergeOps; ++i) { std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); if (i == kNumMergeOps / 2) { // deletes [0, 5] - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } } @@ -715,19 +778,19 @@ Options opts = CurrentOptions(); opts.max_write_buffer_number = 4; opts.min_write_buffer_number_to_merge = 3; - 
opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -744,21 +807,23 @@ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); + DestroyAndReopen(opts); // Write half of the keys before the tombstone and half after the tombstone. // Only covered keys (i.e., within the range and older than the tombstone) // should be deleted. 
for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -777,8 +842,8 @@ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); + DestroyAndReopen(opts); const Snapshot* snapshot = nullptr; // Put a snapshot before the range tombstone, verify an iterator using that @@ -786,14 +851,16 @@ for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -809,25 +876,26 @@ Options opts = CurrentOptions(); opts.max_write_buffer_number = 4; opts.min_write_buffer_number_to_merge = 3; - opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + 
opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ReadOptions read_opts; read_opts.ignore_range_deletions = true; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int i = 0; std::string expected[] = {"imm_key", "mem_key", "sst_key"}; for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) { @@ -841,7 +909,7 @@ #ifndef ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -857,6 +925,7 @@ iter->SeekToFirst(); } ASSERT_TRUE(iter->status().IsNotSupported()); + delete iter; if (i == 0) { ASSERT_OK(db_->Flush(FlushOptions())); @@ -866,7 +935,6 @@ } db_->ReleaseSnapshot(snapshot); } - #endif // !ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { @@ -910,8 +978,8 @@ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); - db_->EnableAutoCompaction({db_->DefaultColumnFamily()}); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->ReleaseSnapshot(snapshot); } @@ -933,7 +1001,7 @@ for (int i = 0; i < kNumKeys; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(kNumKeys))); for (int i = 0; i < kNumKeys; ++i) { @@ -956,7 +1024,7 @@ options.compression = kNoCompression; options.level0_file_num_compaction_trigger = kNumFilesPerLevel; options.memtable_factory.reset( - new SpecialSkipListFactory(2 /* num_entries_flush */)); + test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); options.target_file_size_base = kValueBytes; // i == 0: CompactFiles // i == 1: CompactRange @@ -971,24 +1039,24 @@ // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), - Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(2 * kNumFilesPerLevel))); Random rnd(301); - std::string value = RandomString(&rnd, kValueBytes); + std::string value = rnd.RandomString(kValueBytes); for (int j = 0; j < kNumFilesPerLevel; ++j) { // give files overlapping key-ranges to prevent trivial move ASSERT_OK(Put(Key(j), value)); ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); if (j > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(j, NumTableFilesAtLevel(0)); } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1)); @@ -1006,7 +1074,7 @@ } else if (i == 2) { ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"max_bytes_for_level_base", 
"10000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } ASSERT_GT(NumTableFilesAtLevel(2), 0); @@ -1024,7 +1092,7 @@ options.compression = kNoCompression; options.level0_file_num_compaction_trigger = kNumFilesPerLevel; options.memtable_factory.reset( - new SpecialSkipListFactory(2 /* num_entries_flush */)); + test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); options.target_file_size_base = kValueBytes; options.disable_auto_compactions = true; @@ -1040,15 +1108,15 @@ // A snapshot protects the range tombstone from dropping due to // becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(0), Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(2 * kNumFilesPerLevel))); // Create 2 additional sstables in L0. Note that the first sstable // contains the range tombstone. // [key000000#3,1, key000004#72057594037927935,15] // [key000001#5,1, key000002#6,1] Random rnd(301); - std::string value = RandomString(&rnd, kValueBytes); + std::string value = rnd.RandomString(kValueBytes); for (int j = 0; j < kNumFilesPerLevel; ++j) { // Give files overlapping key-ranges to prevent a trivial move when we // compact from L0 to L1. 
@@ -1080,7 +1148,7 @@ ASSERT_EQ(value, Get(Key(2))); auto begin_str = Key(3); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(2, NumTableFilesAtLevel(2)); ASSERT_EQ(value, Get(Key(2))); @@ -1099,7 +1167,7 @@ // [key000002#6,1, key000004#72057594037927935,15] auto begin_str = Key(0); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, &begin); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_EQ(3, NumTableFilesAtLevel(2)); } @@ -1183,7 +1251,7 @@ const Snapshot* snapshot = nullptr; for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kFileBytes / kValueBytes; ++j) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); ASSERT_OK(db_->Merge(WriteOptions(), "key", value)); } if (i == kNumFiles - 1) { @@ -1200,9 +1268,9 @@ std::string value; ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); - dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Now we have multiple files at L1 all containing a single user key, thus // guaranteeing overlap in the file endpoints. @@ -1213,9 +1281,9 @@ // Compact and verify again. It's worthwhile because now the files have // tighter endpoints, so we can verify that doesn't mess anything up. 
- dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 1 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_GT(NumTableFilesAtLevel(2), 1); ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); @@ -1267,7 +1335,7 @@ const Snapshot* snapshots[] = {nullptr, nullptr}; for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kFileBytes / kValueBytes; ++j) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); std::string key; if (i < kNumFiles / 2) { key = Key(0); @@ -1291,6 +1359,7 @@ auto get_key_count = [this]() -> int { auto* iter = db_->NewIterator(ReadOptions()); + assert(iter->status().ok()); iter->SeekToFirst(); int keys_found = 0; for (; iter->Valid(); iter->Next()) { @@ -1313,7 +1382,7 @@ // Now overwrite a few keys that are in L1 files that definitely don't have // overlapping boundary keys. 
for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value)); } ASSERT_OK(db_->Flush(FlushOptions())); @@ -1360,7 +1429,7 @@ const Snapshot* snapshot = nullptr; for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kFileBytes / kValueBytes; ++j) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value)); if (i == 0 && j == kNumKeys) { // Take snapshot to prevent covered merge operands from being dropped or @@ -1393,6 +1462,7 @@ ASSERT_GT(NumTableFilesAtLevel(1), 1); auto* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); iter->SeekToLast(); int keys_found = 0; for (; iter->Valid(); iter->Prev()) { @@ -1419,11 +1489,12 @@ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(10))); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -1466,6 +1537,7 @@ ReadOptions read_opts; read_opts.snapshot = snapshot.get(); std::unique_ptr iter(db_->NewIterator(read_opts)); + ASSERT_OK(iter->status()); TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator"); @@ -1500,10 +1572,10 @@ for (int i = 0; i < kFileBytes / kValueBytes; ++i) { std::string key(1, first_char); key.append(Key(i)); - std::string value = RandomString(&rnd, kValueBytes); + std::string value = rnd.RandomString(kValueBytes); ASSERT_OK(Put(key, value)); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); MoveFilesToLevel(2); } ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -1522,7 +1594,7 @@ // TODO(ajkr): remove this `Put` after file cutting accounts for range // tombstones (#3977). 
ASSERT_OK(Put("c" + Key(1), "value")); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone // and the range tombstone is only placed in the second SST. @@ -1530,9 +1602,9 @@ Slice begin_key(begin_key_storage); std::string end_key_storage("d"); Slice end_key(end_key_storage); - dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */, - &end_key /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, &begin_key /* begin */, &end_key /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); std::vector all_metadata; @@ -1575,6 +1647,7 @@ const int kNumPerFile = 4, kNumFiles = 2; Options options = CurrentOptions(); options.disable_auto_compactions = true; + options.target_file_size_base = 9 * 1024; options.max_compaction_bytes = 9 * 1024; DestroyAndReopen(options); Random rnd(301); @@ -1582,7 +1655,7 @@ std::vector values; // Write 12K (4 values, each 3K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, 3 << 10)); + values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); } } @@ -1597,15 +1670,15 @@ ASSERT_EQ(1, NumTableFilesAtLevel(0)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); // The tombstone range is not broken up into multiple SSTs which may incur a // large compaction with L2. 
ASSERT_EQ(1, NumTableFilesAtLevel(1)); std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(1, NumTableFilesAtLevel(2)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); } @@ -1614,6 +1687,7 @@ const int kNumPerFile = 4, kNumFiles = 2; Options options = CurrentOptions(); options.disable_auto_compactions = true; + options.target_file_size_base = 9 * 1024; options.max_compaction_bytes = 9 * 1024; DestroyAndReopen(options); Random rnd(301); @@ -1621,7 +1695,7 @@ std::vector values; // Write 12K (4 values, each 3K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, 3 << 10)); + values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); } } @@ -1638,17 +1712,46 @@ // The key range is broken up into three SSTs to avoid a future big compaction // with the grandparent - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(3, NumTableFilesAtLevel(1)); - std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); - ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + ASSERT_EQ( + 3, NumTableFilesAtLevel( + 2)); // L1->L2 compaction size is limited to max_compaction_bytes ASSERT_EQ(0, NumTableFilesAtLevel(1)); } +TEST_F(DBRangeDelTest, IteratorRefresh) { + // Refreshing an iterator after a range tombstone is added should cause the + // deleted range of keys to disappear. 
+ for (bool sv_changed : {false, true}) { + ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1")); + ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2")); + + auto* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key2", "key3")); + + if (sv_changed) { + ASSERT_OK(db_->Flush(FlushOptions())); + } + + ASSERT_OK(iter->Refresh()); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + ASSERT_EQ("key1", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + delete iter; + } +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_secondary_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_secondary_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_secondary_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_secondary_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1260 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class DBSecondaryTest : public DBTestBase { + public: + DBSecondaryTest() + : DBTestBase("db_secondary_test", /*env_do_fsync=*/true), + secondary_path_(), + handles_secondary_(), + db_secondary_(nullptr) { + secondary_path_ = + test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); + } + + ~DBSecondaryTest() override { + CloseSecondary(); + if (getenv("KEEP_DB") != nullptr) { + fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(secondary_path_, options)); + } + } + + protected: + Status ReopenAsSecondary(const Options& options) { + return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); + } + + void OpenSecondary(const Options& options); + + Status TryOpenSecondary(const Options& options); + + void OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options); + + void CloseSecondary() { + for (auto h : handles_secondary_) { + ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h)); + } + handles_secondary_.clear(); + delete db_secondary_; + db_secondary_ = nullptr; + } + + DBImplSecondary* db_secondary_full() { + return static_cast(db_secondary_); + } + + void CheckFileTypeCounts(const std::string& dir, int expected_log, + int expected_sst, int expected_manifest) const; + + std::string secondary_path_; + std::vector handles_secondary_; + DB* db_secondary_; +}; + +void DBSecondaryTest::OpenSecondary(const Options& options) { + ASSERT_OK(TryOpenSecondary(options)); +} + +Status DBSecondaryTest::TryOpenSecondary(const Options& options) { + Status s = + DB::OpenAsSecondary(options, dbname_, 
secondary_path_, &db_secondary_); + return s; +} + +void DBSecondaryTest::OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options) { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + for (const auto& cf_name : column_families) { + cf_descs.emplace_back(cf_name, options); + } + Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, + int expected_log, int expected_sst, + int expected_manifest) const { + std::vector filenames; + ASSERT_OK(env_->GetChildren(dir, &filenames)); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kWalFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(expected_log, log_cnt); + ASSERT_EQ(expected_sst, sst_cnt); + ASSERT_EQ(expected_manifest, manifest_cnt); +} + +TEST_F(DBSecondaryTest, NonExistingDb) { + Destroy(last_options_); + + Options options = GetDefaultOptions(); + options.env = env_; + options.max_open_files = -1; + const std::string dbname = "/doesnt/exist"; + Status s = + DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_); + ASSERT_TRUE(s.IsIOError()); +} + +TEST_F(DBSecondaryTest, ReopenAsSecondary) { + Options options; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Put("bar", "bar_value")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + Close(); + + ASSERT_OK(ReopenAsSecondary(options)); + ASSERT_EQ("foo_value", Get("foo")); + ASSERT_EQ("bar_value", Get("bar")); + ReadOptions ropts; + ropts.verify_checksums = true; + auto db1 = static_cast(db_); + ASSERT_NE(nullptr, db1); + Iterator* iter = db1->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + size_t count = 
0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + if (0 == count) { + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value", iter->value().ToString()); + } else if (1 == count) { + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value", iter->value().ToString()); + } + ++count; + } + delete iter; + ASSERT_EQ(2, count); +} + +TEST_F(DBSecondaryTest, SimpleInternalCompaction) { + Options options; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + + ASSERT_EQ(result.output_files.size(), 1); + InternalKey smallest, largest; + smallest.DecodeFrom(result.output_files[0].smallest_internal_key); + largest.DecodeFrom(result.output_files[0].largest_internal_key); + ASSERT_EQ(smallest.user_key().ToString(), "bar"); + ASSERT_EQ(largest.user_key().ToString(), "foo"); + ASSERT_EQ(result.output_level, 1); + ASSERT_EQ(result.output_path, this->secondary_path_); + ASSERT_EQ(result.num_output_records, 2); + ASSERT_GT(result.bytes_written, 0); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + const int kRangeL2 = 10; + const int kRangeL1 = 30; + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i * 
kRangeL2), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(Key(i * kRangeL1), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put(Key(i * 30), "value" + ToString(i))); + ASSERT_OK(Put(Key(i * 30 + 50), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + // pick 2 files on level 0 for compaction, which has 3 overlap files on L1 + CompactionServiceInput input1; + input1.input_files.push_back(meta.levels[0].files[2].name); + input1.input_files.push_back(meta.levels[0].files[3].name); + input1.input_files.push_back(meta.levels[1].files[0].name); + input1.input_files.push_back(meta.levels[1].files[1].name); + input1.input_files.push_back(meta.levels[1].files[2].name); + + input1.output_level = 1; + + options.max_open_files = -1; + Close(); + + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input1, + &result)); + ASSERT_OK(result.status); + + // pick 2 files on level 1 for compaction, which has 6 overlap files on L2 + CompactionServiceInput input2; + input2.input_files.push_back(meta.levels[1].files[1].name); + input2.input_files.push_back(meta.levels[1].files[2].name); + for (int i = 3; i < 9; i++) { + input2.input_files.push_back(meta.levels[2].files[i].name); + } + + input2.output_level = 2; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result)); + ASSERT_OK(result.status); + + CloseSecondary(); + + // delete all l2 files, without update manifest + for (auto& file : meta.levels[2].files) { + ASSERT_OK(env_->DeleteFile(dbname_ + file.name)); + } + 
OpenSecondary(options); + cfh = db_secondary_->DefaultColumnFamily(); + Status s = db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + // TODO: L0 -> L1 compaction should success, currently version is not built + // if files is missing. + // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, + // input1, &result)); +} + +TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + // trigger compaction to delete the files for secondary instance compaction + ASSERT_OK(Put("foo", "foo_value" + std::to_string(3))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(3))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + 
ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + Close(); + + ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0])); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + input.input_files.erase(input.input_files.begin()); + + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, OpenAsSecondary) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, 
iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); +} + +namespace { +class TraceFileEnv : public EnvWrapper { + public: + explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {} + static const char* kClassName() { return "TraceFileEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& env_options) override { + class TracedRandomAccessFile : public RandomAccessFile { + public: + TracedRandomAccessFile(std::unique_ptr&& target, + std::atomic& counter) + : target_(std::move(target)), files_closed_(counter) {} + ~TracedRandomAccessFile() override { + files_closed_.fetch_add(1, std::memory_order_relaxed); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + + private: + std::unique_ptr target_; + std::atomic& files_closed_; + }; + Status s = target()->NewRandomAccessFile(f, r, env_options); + if (s.ok()) { + r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_)); + } + return s; + } + + int files_closed() const { + return files_closed_.load(std::memory_order_relaxed); + } + + private: + std::atomic files_closed_{0}; +}; +} // namespace + +TEST_F(DBSecondaryTest, SecondaryCloseFiles) { + Options options; + options.env = env_; + options.max_open_files = 1; + options.disable_auto_compactions = true; + Reopen(options); + Options options1; + 
std::unique_ptr traced_env(new TraceFileEnv(env_)); + options1.env = traced_env.get(); + OpenSecondary(options1); + + static const auto verify_db = [&]() { + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + std::unique_ptr iter2(db_secondary_->NewIterator(ReadOptions())); + for (iter1->SeekToFirst(), iter2->SeekToFirst(); + iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { + ASSERT_EQ(iter1->key(), iter2->key()); + ASSERT_EQ(iter1->value(), iter2->value()); + } + ASSERT_FALSE(iter1->Valid()); + ASSERT_FALSE(iter2->Valid()); + }; + + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Put("c", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Put("d", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(2, static_cast(traced_env.get())->files_closed()); + + Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}}); + ASSERT_TRUE(s.IsNotSupported()); + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + } + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = 
db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); +} + +TEST_F(DBSecondaryTest, SecondaryTailingBug_ISSUE_8467) { + Options options; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + } + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto verify_db = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ReadOptions ropts; + Status s = db_secondary_->Get(ropts, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(foo_val, value); + + s = db_secondary_->Get(ropts, "bar", &value); + ASSERT_OK(s); + ASSERT_EQ(bar_val, value); + }; + + for (int i = 0; i < 2; ++i) { + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db("foo_value2", "bar_value2"); + } +} + +TEST_F(DBSecondaryTest, RefreshIterator) { + Options options; + options.env = env_; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + std::unique_ptr 
it(db_secondary_->NewIterator(ReadOptions())); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + if (0 == i) { + it->Seek("foo"); + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + + ASSERT_OK(it->Refresh()); + + it->Seek("foo"); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("foo", it->key()); + ASSERT_EQ("foo_value0", it->value()); + } else { + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("foo", it->key()); + ASSERT_EQ("foo_value" + std::to_string(i - 1), it->value()); + ASSERT_OK(it->status()); + + ASSERT_OK(it->Refresh()); + + it->Seek("foo"); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("foo", it->key()); + ASSERT_EQ("foo_value" + std::to_string(i), it->value()); + } + } +} + +TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options1); + cf_descs.emplace_back("pikachu", options1); + cf_descs.emplace_back("eevee", options1); + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_NOK(s); +} + +TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_EQ(0, handles_secondary_.size()); + ASSERT_NE(nullptr, db_secondary_); + + ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Flush(0 /*cf*/)); + ASSERT_OK(Flush(1 /*cf*/)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string 
value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value", value); +} + +TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { + Options options; + options.env = env_; + Reopen(options); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", + "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, + {"DBImpl::Open:AfterDeleteFiles", + "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" + "1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread ro_db_thread([&]() { + Options options1; + options1.env = env_; + options1.max_open_files = -1; + Status s = TryOpenSecondary(options1); + ASSERT_TRUE(s.IsTryAgain()); + + // Try again + OpenSecondary(options1); + CloseSecondary(); + }); + Reopen(options); + ro_db_thread.join(); +} + +TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = 
db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, MissingTableFile) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_NE(nullptr, db_secondary_full()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", 
iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { + Options options; + options.env = env_; + const std::string kCfName1 = "pikachu"; + CreateAndReopenWithCF({kCfName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCfName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); + ASSERT_OK(Flush(1 /*cf*/)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); + + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + Close(); + CheckFileTypeCounts(dbname_, 1, 0, 1); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + value.clear(); + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); +} + +TEST_F(DBSecondaryTest, SwitchManifest) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + const std::string cf1_name("test_cf"); + CreateAndReopenWithCF({cf1_name}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, + options1); + + const int kNumFiles = options.level0_file_num_compaction_trigger - 1; + // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted 
as 0, 1, + // ..., 9. + const int kNumKeys = 10; + // Create two sst + for (int i = 0; i != kNumFiles; ++i) { + for (int j = 0; j != kNumKeys; ++j) { + ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + const auto& range_scan_db = [&]() { + ReadOptions tmp_ropts; + tmp_ropts.total_order_seek = true; + tmp_ropts.verify_checksums = true; + std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); + int cnt = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { + ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); + ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), + iter->value().ToString()); + } + }; + + range_scan_db(); + + // While secondary instance still keeps old MANIFEST open, we close primary, + // restart primary, performs full compaction, close again, restart again so + // that next time secondary tries to catch up with primary, the secondary + // will skip the MANIFEST in middle. 
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + range_scan_db(); +} + +TEST_F(DBSecondaryTest, SwitchManifestTwice) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + const std::string cf1_name("test_cf"); + CreateAndReopenWithCF({cf1_name}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, + options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(Put("0", "value1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value1", value); +} + +TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + 
ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for (int k = 0; k != 16; ++k) { + ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) { + const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + 
ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + } + TEST_SYNC_POINT( + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + ASSERT_OK(wb.Put("key0", "value0")); + ASSERT_OK(wb.Put("key1", "value1")); + 
ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(iter1->status()); + std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); + iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter2->status()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + ASSERT_OK(wb1.Put("key0", "value01")); + ASSERT_OK(wb1.Put("key1", "value11")); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + ASSERT_OK(wb2.Put("key0", "new_value0")); + ASSERT_OK(wb2.Delete("key1")); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. 
+ iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); + ASSERT_OK(iter3->status()); +} + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} + +TEST_F(DBSecondaryTest, StartFromInconsistent) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + Options options1; 
+ options1.env = env_; + Status s = TryOpenSecondary(options1); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + + Options options1; + options1.env = env_; + OpenSecondary(options1); + + { + std::string value; + ASSERT_OK(db_secondary_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("value", value); + } + + ASSERT_OK(Put("bar", "value1")); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = db_secondary_->TryCatchUpWithPrimary(); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBSecondaryTest, OpenWithTransactionDB) { + Options options = CurrentOptions(); + options.create_if_missing = true; + + // Destroy the DB to recreate as a TransactionDB. + Close(); + Destroy(options, true); + + // Create a TransactionDB. + TransactionDB* txn_db = nullptr; + TransactionDBOptions txn_db_opts; + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); + ASSERT_NE(txn_db, nullptr); + db_ = txn_db; + + std::vector cfs = {"new_CF"}; + CreateColumnFamilies(cfs, options); + ASSERT_EQ(handles_.size(), 1); + + WriteOptions wopts; + TransactionOptions txn_opts; + Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + ASSERT_NE(txn1, nullptr); + ASSERT_OK(txn1->Put(handles_[0], "k1", "v1")); + ASSERT_OK(txn1->Commit()); + delete txn1; + + options = CurrentOptions(); + options.max_open_files = -1; + ASSERT_OK(TryOpenSecondary(options)); +} + +#endif //! 
ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_sst_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_sst_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_sst_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_sst_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,12 +12,13 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_manager.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBSSTTest : public DBTestBase { public: - DBSSTTest() : DBTestBase("/db_sst_test") {} + DBSSTTest() : DBTestBase("db_sst_test", /*env_do_fsync=*/true) {} }; #ifndef ROCKSDB_LITE @@ -97,7 +98,7 @@ for (int i = 0; i < 10; ++i) { GenerateNewFile(&rnd, &key_id, false); } - Flush(); + ASSERT_OK(Flush()); Close(); int const num_files = GetSstFileCount(dbname_); ASSERT_GT(num_files, 0); @@ -140,6 +141,7 @@ // Just open the DB with the option set to true and check that we don't crash. 
Options options; + options.env = env_; options.skip_checking_sst_file_sizes_on_db_open = true; Reopen(options); @@ -163,12 +165,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); // If the moved file is actually deleted (the move-safeguard in @@ -211,12 +213,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); test::SleepingBackgroundTask blocking_thread; @@ -242,7 +244,7 @@ // write_buffer_size. The flush will be blocked with block_first_time // pending_file is protecting all the files created after for (int j = 0; j < 256; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(10 * 1024))); } blocking_thread.WaitUntilSleeping(); @@ -262,9 +264,9 @@ // finish the flush! blocking_thread.WakeUp(); blocking_thread.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // File just flushed is too big for L0 and L1 so gets moved to L2. 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0)); metadata.clear(); @@ -300,14 +302,18 @@ for (int i = 0; i < 25; i++) { GenerateNewRandomFile(&rnd); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that we are tracking all sst files in dbname_ - ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles()); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); } ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - auto files_in_db = GetAllSSTFiles(); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); // Verify that we are tracking all sst files in dbname_ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); // Verify the total files size @@ -341,7 +347,272 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBSSTTest, RateLimitedDelete) { +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + DestroyAndReopen(options); + Random rnd(301); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + std::vector blob_files = GetBlobFileNumbers(); + ASSERT_EQ(files_added, blob_files.size()); + // No blob file is obsoleted. + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + // No files were moved. 
+ ASSERT_EQ(files_moved, 0); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + + // Verify that we are tracking all sst and blob files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + Close(); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened. + Close(); + + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Destroy DB and it will remove all the blob files from sst file manager and + // blob files deletion will go through ScheduleFileDeletion. 
+ ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_EQ(files_deleted, blob_files.size()); + ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + 
+ DestroyAndReopen(options); + Random rnd(301); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + constexpr char fifth_key[] = "fifth_key"; + constexpr char fifth_value[] = "fifth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Put(fifth_key, fifth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 5); + ASSERT_EQ(files_added, 5); + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + ASSERT_EQ(files_moved, 0); + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + size_t expected_number_of_files = original_blob_files.size(); + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. 
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + expected_number_of_files -= cutoff_index; + files_added = 0; + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + ASSERT_EQ(Get(fifth_key), fifth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + // No new file is added. + ASSERT_EQ(files_added, 0); + ASSERT_EQ(files_deleted, cutoff_index); + ASSERT_EQ(files_scheduled_to_delete, cutoff_index); + ASSERT_EQ(files_moved, 0); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + sfm->WaitForEmptyTrash(); + ASSERT_EQ(files_deleted, 5); + ASSERT_EQ(files_scheduled_to_delete, 5); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class DBSSTTestRateLimit : public DBSSTTest, + public ::testing::WithParamInterface { + public: + DBSSTTestRateLimit() : DBSSTTest() {} + ~DBSSTTestRateLimit() override {} +}; + +TEST_P(DBSSTTestRateLimit, RateLimitedDelete) { Destroy(last_options_); 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"DBSSTTest::RateLimitedDelete:1", @@ -356,38 +627,38 @@ "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { // Turn timed wait into a simulated sleep uint64_t* abs_time_us = static_cast(arg); - int64_t cur_time = 0; - env_->GetCurrentTime(&cur_time); - if (*abs_time_us > static_cast(cur_time)) { - env_->addon_time_.fetch_add(*abs_time_us - - static_cast(cur_time)); + uint64_t cur_time = env_->NowMicros(); + if (*abs_time_us > cur_time) { + env_->MockSleepForMicroseconds(*abs_time_us - cur_time); } - // Randomly sleep shortly - env_->addon_time_.fetch_add( - static_cast(Random::GetTLSInstance()->Uniform(10))); - - // Set wait until time to before current to force not to sleep. - int64_t real_cur_time = 0; - Env::Default()->GetCurrentTime(&real_cur_time); - *abs_time_us = static_cast(real_cur_time); + // Plus an additional short, random amount + env_->MockSleepForMicroseconds(Random::GetTLSInstance()->Uniform(10)); + + // Set wait until time to before (actual) current time to force not + // to sleep + *abs_time_us = Env::Default()->NowMicros(); + }); + + // Disable PeriodicWorkScheduler as it also has TimedWait, which could update + // the simulated sleep time + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", [&](void* arg) { + bool* disable_scheduler = static_cast(arg); + *disable_scheduler = true; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - env_->no_slowdown_ = true; - env_->time_elapse_only_sleep_ = true; + bool different_wal_dir = GetParam(); Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); options.disable_auto_compactions = true; - // Need to disable stats dumping and persisting which also use - // RepeatableThread, one of whose member variables is of type - // InstrumentedCondVar. 
The callback for - // InstrumentedCondVar::TimedWaitInternal can be triggered by stats dumping - // and persisting threads and cause time_spent_deleting measurement to become - // incorrect. - options.stats_dump_period_sec = 0; - options.stats_persist_period_sec = 0; options.env = env_; + options.statistics = CreateDBStatistics(); + if (different_wal_dir) { + options.wal_dir = alternative_wal_dir_; + } int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec Status s; @@ -399,8 +670,10 @@ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1); WriteOptions wo; - wo.disableWAL = true; - ASSERT_OK(TryReopen(options)); + if (!different_wal_dir) { + wo.disableWAL = true; + } + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v <= 'd'; v++) { ASSERT_OK(Put("Key2", DummyString(1024, v), wo)); @@ -437,10 +710,16 @@ } ASSERT_GT(time_spent_deleting, expected_penlty * 0.9); ASSERT_LT(time_spent_deleting, expected_penlty * 1.1); + ASSERT_EQ(4, options.statistics->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ( + 0, options.statistics->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +INSTANTIATE_TEST_CASE_P(RateLimitedDelete, DBSSTTestRateLimit, + ::testing::Bool()); + TEST_F(DBSSTTest, RateLimitedWALDelete) { Destroy(last_options_); @@ -449,8 +728,6 @@ "DeleteScheduler::BackgroundEmptyTrash:Wait", [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); - env_->no_slowdown_ = true; - env_->time_elapse_only_sleep_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = true; options.compression = kNoCompression; @@ -464,6 +741,7 @@ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); auto sfm = static_cast(options.sst_file_manager.get()); sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + SetTimeElapseOnlySleepOnReopen(&options); ASSERT_OK(TryReopen(options)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -494,10 
+772,11 @@ } class DBWALTestWithParam - : public DBSSTTest, + : public DBTestBase, public testing::WithParamInterface> { public: - DBWALTestWithParam() { + explicit DBWALTestWithParam() + : DBTestBase("db_wal_test_with_params", /*env_do_fsync=*/true) { wal_dir_ = std::get<0>(GetParam()); wal_dir_same_as_dbname_ = std::get<1>(GetParam()); } @@ -510,8 +789,8 @@ class MyEnv : public EnvWrapper { public: MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {} - - Status DeleteFile(const std::string& fname) { + const char* Name() const override { return "MyEnv"; } + Status DeleteFile(const std::string& fname) override { if (fname.find(".log.trash") != std::string::npos && fake_log_delete) { return Status::OK(); } @@ -525,7 +804,7 @@ bool fake_log_delete; }; - std::unique_ptr env(new MyEnv(Env::Default())); + std::unique_ptr env(new MyEnv(env_)); Destroy(last_options_); env->set_fake_log_delete(true); @@ -545,10 +824,17 @@ auto sfm = static_cast(options.sst_file_manager.get()); sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); - ASSERT_OK(TryReopen(options)); + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v <= 'd'; v++) { + if (v == 'c') { + // Maximize the change that the last log file will be preserved in trash + // before restarting the DB. + // We have to set this on the 2nd to last file for it to delay deletion + // on the last file. 
(Quirk of DeleteScheduler::BackgroundEmptyTrash()) + options.sst_file_manager->SetDeleteRateBytesPerSecond(1); + } ASSERT_OK(Put("Key2", DummyString(1024, v))); ASSERT_OK(Put("Key3", DummyString(1024, v))); ASSERT_OK(Put("Key4", DummyString(1024, v))); @@ -567,11 +853,11 @@ if (!wal_dir_same_as_dbname_) { // Forcibly create some trash log files std::unique_ptr result; - env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, - EnvOptions()); + ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions())); result.reset(); } - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -580,11 +866,11 @@ ASSERT_GE(trash_log_count, 1); env->set_fake_log_delete(false); - ASSERT_OK(TryReopen(options)); + Reopen(options); filenames.clear(); trash_log_count = 0; - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -608,13 +894,13 @@ Destroy(last_options_); // Add some trash files to the db directory so the DB can clean them up - env_->CreateDirIfMissing(dbname_); + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); // Reopen the DB and verify that it deletes existing trash files - ASSERT_OK(TryReopen(options)); + Reopen(options); sfm->WaitForEmptyTrash(); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash")); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash")); @@ -733,7 +1019,7 @@ int num_sst_files = 0; int num_wal_files = 0; std::vector db_files; - 
env_->GetChildren(dbname_, &db_files); + ASSERT_OK(env_->GetChildren(dbname_, &db_files)); for (std::string f : db_files) { if (f.substr(f.find_last_of(".") + 1) == "sst") { num_sst_files++; @@ -747,7 +1033,9 @@ auto sfm = static_cast(options.sst_file_manager.get()); sfm->SetDeleteRateBytesPerSecond(1024 * 1024); - sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1); + // Set an extra high trash ratio to prevent immediate/non-rate limited + // deletions + sfm->delete_scheduler()->SetMaxTrashDBRatio(1000.0); ASSERT_OK(DestroyDB(dbname_, options)); sfm->WaitForEmptyTrash(); ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files); @@ -766,12 +1054,13 @@ // Generate a file containing 100 keys. for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); uint64_t first_file_size = 0; - auto files_in_db = GetAllSSTFiles(&first_file_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size)); ASSERT_EQ(sfm->GetTotalSize(), first_file_size); // Set the maximum allowed space usage to the current total size @@ -782,6 +1071,68 @@ ASSERT_NOK(Flush()); } +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing keys. + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + + uint64_t files_size = 0; + uint64_t total_files_size = 0; + std::unordered_map files_in_db; + + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size)); + // Make sure blob files are considered by SSTFileManage in size limits. 
+ ASSERT_GT(files_size, 0); + total_files_size = files_size; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size)); + total_files_size += files_size; + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Set the maximum allowed space usage to the current total size. + sfm->SetMaxAllowedSpaceUsage(total_files_size + 1); + + bool max_allowed_space_reached = false; + bool delete_blob_file = false; + // Sync point called after blob file is closed and max allowed space is + // checked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached", + [&](void* /*arg*/) { max_allowed_space_reached = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BuildTable::AfterDeleteFile", + [&](void* /*arg*/) { delete_blob_file = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + { + "BuildTable::AfterDeleteFile", + "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1", + }, + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key1", "val1")); + // This flush will fail + ASSERT_NOK(Flush()); + ASSERT_TRUE(max_allowed_space_reached); + + TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1"); + ASSERT_TRUE(delete_blob_file); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBSSTTest, CancellingCompactionsWorks) { std::shared_ptr sst_file_manager(NewSstFileManager(env_)); auto sfm = static_cast(sst_file_manager.get()); @@ -807,20 +1158,21 @@ // Generate a file containing 10 keys. 
for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); uint64_t total_file_size = 0; - auto files_in_db = GetAllSSTFiles(&total_file_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); // Generate another file to trigger compaction. for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // Because we set a callback in CancelledCompaction, we actually // let the compaction run @@ -828,6 +1180,12 @@ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); // Make sure the stat is bumped ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(COMPACTION_CANCELLED), 0); + ASSERT_EQ(0, + dbfull()->immutable_db_options().statistics.get()->getTickerCount( + FILES_MARKED_TRASH)); + ASSERT_EQ(4, + dbfull()->immutable_db_options().statistics.get()->getTickerCount( + FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -848,25 +1206,28 @@ // Generate a file containing 10 keys. for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); uint64_t total_file_size = 0; - auto files_in_db = GetAllSSTFiles(&total_file_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); // Generate another file to trigger compaction. 
for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); // OK, now trigger a manual compaction - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); // Make sure the stat is bumped @@ -876,10 +1237,13 @@ // Now make sure CompactFiles also gets cancelled auto l0_files = collector->GetFlushedFiles(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); + ASSERT_TRUE( + dbfull() + ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount( COMPACTION_CANCELLED), @@ -894,8 +1258,9 @@ "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), + l0_files, 0)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); ASSERT_GT(completed_compactions, 0); @@ -955,14 +1320,15 @@ // It is easy to detect if the test is stuck in a loop. No need for // complex termination logic. 
while (true) { - auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); + auto s = Put(rnd.RandomString(10), rnd.RandomString(50)); if (!s.ok()) { break; } } ASSERT_TRUE(bg_error_set); uint64_t total_sst_files_size = 0; - GetAllSSTFiles(&total_sst_files_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size)); ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -998,7 +1364,7 @@ CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 2; - db_->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); // Create 12 Files in L0 for (int i = 0; i < 12; i++) { @@ -1033,13 +1399,16 @@ // we encode table properties as varint64. Force time to be 0 to work around // it. Should remove the workaround after we propagate the property on // compaction. - std::unique_ptr mock_env(new MockTimeEnv(Env::Default())); - mock_env->set_current_time(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:oldest_ancester_time", [&](void* arg) { + uint64_t* current_time = static_cast(arg); + *current_time = 0; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); options.disable_auto_compactions = true; options.compression = kNoCompression; - options.env = mock_env.get(); DestroyAndReopen(options); // Generate 5 files in L0 for (int i = 0; i < 5; i++) { @@ -1047,7 +1416,7 @@ std::string val = "val_file_" + ToString(i); ASSERT_OK(Put(Key(j), val)); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1071,6 +1440,7 @@ // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compact 5 files into 1 file in L0 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, 
nullptr)); @@ -1094,12 +1464,13 @@ // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 10; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1113,6 +1484,7 @@ // Total SST files = 6 (5 original files + compacted file) ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1120,6 +1492,7 @@ // Total SST files = 1 (compacted file) ASSERT_EQ(total_sst_files_size, 1 * single_file_size); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1127,8 +1500,7 @@ // Total SST files = 0 ASSERT_EQ(total_sst_files_size, 0); - // Close db before mock_env destruct. 
- Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { @@ -1139,7 +1511,7 @@ // Generate 5 files in L0 for (int i = 0; i < 5; i++) { ASSERT_OK(Put(Key(i), "val")); - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1164,6 +1536,7 @@ // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compaction will do trivial move from L0 to L1 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1187,12 +1560,13 @@ // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 5; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1206,7 +1580,9 @@ // Total SST files = 5 (used in 2 version) ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", @@ -1216,6 +1592,103 @@ ASSERT_EQ(total_sst_files_size, 0); } +// This test if blob files are recorded by SST File Manager when Compaction job +// creates/delete them and in case of AtomicFlush. 
+TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.min_blob_size = 0; + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + options.atomic_flush = true; + + int files_added = 0; + int files_deleted = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + Random rnd(301); + + ASSERT_OK(Put("key_1", "value_1")); + ASSERT_OK(Put("key_2", "value_2")); + ASSERT_OK(Put("key_3", "value_3")); + ASSERT_OK(Put("key_4", "value_4")); + ASSERT_OK(Flush()); + + // Overwrite will create the garbage data. 
+ ASSERT_OK(Put("key_3", "new_value_3")); + ASSERT_OK(Put("key_4", "new_value_4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + ASSERT_EQ(files_added, 3); + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + files_added = 0; + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + // Compaction job will create a new file and delete the older files. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(files_added, 1); + ASSERT_EQ(files_scheduled_to_delete, 1); + + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(files_deleted, 1); + + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + + ASSERT_EQ(files_scheduled_to_delete, 4); + + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(files_deleted, 4); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE @@ -1223,5 +1696,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_statistics_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_statistics_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_statistics_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_statistics_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,14 @@ #include "monitoring/thread_status_util.h" #include "port/stack_trace.h" #include "rocksdb/statistics.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBStatisticsTest : public DBTestBase { public: - DBStatisticsTest() : DBTestBase("/db_statistics_test") {} + DBStatisticsTest() + : 
DBTestBase("db_statistics_test", /*env_do_fsync=*/true) {} }; TEST_F(DBStatisticsTest, CompressionStatsTest) { @@ -55,7 +57,7 @@ Random rnd(301); for (int i = 0; i < kNumKeysWritten; ++i) { // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); } ASSERT_OK(Flush()); ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0); @@ -75,7 +77,7 @@ // Check that compressions do not occur when turned off for (int i = 0; i < kNumKeysWritten; ++i) { // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); } ASSERT_OK(Flush()); ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) @@ -135,11 +137,73 @@ ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN)); options.statistics->histogramData(DB_WRITE, &histogram_data); ASSERT_GT(histogram_data.max, 0.0); - options.statistics->Reset(); + ASSERT_OK(options.statistics->Reset()); } } } +TEST_F(DBStatisticsTest, ExcludeTickers) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + options.statistics->set_stats_level(StatsLevel::kExceptTickers); + ASSERT_OK(Put("foo", "value")); + ASSERT_EQ(0, options.statistics->getTickerCount(BYTES_WRITTEN)); + options.statistics->set_stats_level(StatsLevel::kExceptHistogramOrTimers); + Reopen(options); + ASSERT_EQ("value", Get("foo")); + ASSERT_GT(options.statistics->getTickerCount(BYTES_READ), 0); +} + +#ifndef ROCKSDB_LITE + +TEST_F(DBStatisticsTest, VerifyChecksumReadStat) { + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Reopen(options); + + // Expected to be populated regardless of `PerfLevel` in user thread + 
SetPerfLevel(kDisable); + + { + // Scenario 0: only WAL data. Not verified so require ticker to be zero. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_EQ(0, + options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES)); + } + + // Create one SST. + ASSERT_OK(Flush()); + std::unordered_map table_files; + uint64_t table_files_size = 0; + GetAllDataFiles(kTableFile, &table_files, &table_files_size); + + { + // Scenario 1: Table verified in `VerifyFileChecksums()`. This should read + // the whole file so we require the ticker stat exactly matches the file + // size. + ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + ASSERT_EQ(table_files_size, + options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES)); + } + + { + // Scenario 2: Table verified in `VerifyChecksum()`. This opens a + // `TableReader` to verify each block. It can involve duplicate reads of the + // same data so we set a lower-bound only. 
+ ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_GE(options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES), + table_files_size); + } +} + +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_table_properties_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_table_properties_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_table_properties_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_table_properties_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,11 +11,16 @@ #include #include "db/db_test_util.h" +#include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/db.h" +#include "rocksdb/types.h" #include "rocksdb/utilities/table_properties_collectors.h" +#include "table/format.h" +#include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" #ifndef ROCKSDB_LITE @@ -42,12 +47,16 @@ ASSERT_EQ(props.size(), unique_entries.size()); ASSERT_EQ(expected_entries_size, sum); + + VerifySstUniqueIds(props); } } // namespace -class DBTablePropertiesTest : public DBTestBase { +class DBTablePropertiesTest : public DBTestBase, + public testing::WithParamInterface { public: - DBTablePropertiesTest() : DBTestBase("/db_table_properties_test") {} + DBTablePropertiesTest() + : DBTestBase("db_table_properties_test", /*env_do_fsync=*/false) {} TablePropertiesCollection TestGetPropertiesOfTablesInRange( std::vector ranges, std::size_t* num_properties = nullptr, std::size_t* num_files = nullptr); @@ -56,21 +65,49 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = 8; + // Part of strategy to prevent pinning table files + options.max_open_files = 42; Reopen(options); + // Create 4 tables for (int table = 0; table < 4; 
++table) { + // Use old meta name for table properties for one file + if (table == 3) { + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) { + *reinterpret_cast(meta) = + &kPropertiesBlockOldName; + }); + SyncPoint::GetInstance()->EnableProcessing(); + } + // Build file for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), ToString(table * 100 + i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } + SyncPoint::GetInstance()->DisableProcessing(); + std::string original_session_id; + ASSERT_OK(db_->GetDbSessionId(original_session_id)); + + // Part of strategy to prevent pinning table files + SyncPoint::GetInstance()->SetCallBack( + "VersionEditHandler::LoadTables:skip_load_table_files", + [&](void* skip_load) { *reinterpret_cast(skip_load) = true; }); + SyncPoint::GetInstance()->EnableProcessing(); // 1. Read table properties directly from file Reopen(options); + // Clear out auto-opened files + dbfull()->TEST_table_cache()->EraseUnRefEntries(); + ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); VerifyTableProperties(db_, 10 + 11 + 12 + 13); // 2. Put two tables to table cache and Reopen(options); + // Clear out auto-opened files + dbfull()->TEST_table_cache()->EraseUnRefEntries(); + ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. for (int i = 0; i < 2; ++i) { @@ -81,12 +118,113 @@ // 3. Put all tables to table cache Reopen(options); - // fetch key from 1st and 2nd table, which will internally place that table to - // the table cache. + // fetch key from all tables, which will place them in table cache. for (int i = 0; i < 4; ++i) { Get(ToString(i * 100 + 0)); } VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 4. 
Try to read CORRUPT properties (a) directly from file, and (b) + // through reader on Get + + // It's not practical to prevent table file read on Open, so we + // corrupt after open and after purging table cache. + for (bool direct : {true, false}) { + Reopen(options); + // Clear out auto-opened files + dbfull()->TEST_table_cache()->EraseUnRefEntries(); + ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + std::string sst_file = props.begin()->first; + + // Corrupt the file's TableProperties using session id + std::string contents; + ASSERT_OK( + ReadFileToString(env_->GetFileSystem().get(), sst_file, &contents)); + size_t pos = contents.find(original_session_id); + ASSERT_NE(pos, std::string::npos); + ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast(pos), 1, + /*verify checksum fails*/ false)); + + // Try to read CORRUPT properties + if (direct) { + ASSERT_TRUE(db_->GetPropertiesOfAllTables(&props).IsCorruption()); + } else { + bool found_corruption = false; + for (int i = 0; i < 4; ++i) { + std::string result = Get(ToString(i * 100 + 0)); + if (result.find_first_of("Corruption: block checksum mismatch") != + std::string::npos) { + found_corruption = true; + } + } + ASSERT_TRUE(found_corruption); + } + + // UN-corrupt file for next iteration + ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast(pos), 1, + /*verify checksum fails*/ false)); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTablePropertiesTest, InvalidIgnored) { + // RocksDB versions 2.5 - 2.7 generate some properties that Block considers + // invalid in some way. This approximates that. 
+ + // Inject properties block data that Block considers invalid + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", + [&](void* block_data) { + *reinterpret_cast(block_data) = Slice("X"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Build file + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), ToString(i), "val")); + } + ASSERT_OK(db_->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + + // Not crashing is good enough + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); +} + +TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) { + ConfigOptions options; + options.ignore_unsupported_options = false; + + std::shared_ptr factory; + std::string id = CompactOnDeletionCollectorFactory::kClassName(); + ASSERT_OK( + TablePropertiesCollectorFactory::CreateFromString(options, id, &factory)); + auto del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(0U, del_factory->GetWindowSize()); + ASSERT_EQ(0U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.0, del_factory->GetDeletionRatio()); + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + options, "window_size=100; deletion_trigger=90; id=" + id, &factory)); + del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(100U, del_factory->GetWindowSize()); + ASSERT_EQ(90U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.0, del_factory->GetDeletionRatio()); + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + options, + "window_size=100; deletion_trigger=90; deletion_ratio=0.5; id=" + id, + &factory)); + del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(100U, del_factory->GetWindowSize()); + ASSERT_EQ(90U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.5, del_factory->GetDeletionRatio()); } TablePropertiesCollection @@ -154,16 +292,16 @@ // build a decent 
LSM for (int i = 0; i < 10000; i++) { - ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102))); + ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { - ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102))); - Flush(); + ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + ASSERT_OK(Flush()); } - db_->PauseBackgroundWork(); + ASSERT_OK(db_->PauseBackgroundWork()); // Ensure that we have at least L0, L1 and L2 ASSERT_GT(NumTableFilesAtLevel(0), 0); @@ -231,8 +369,8 @@ // Create one table per CF, then verify it was created with the column family // name property. for (uint32_t cf = 0; cf < 2; ++cf) { - Put(cf, "key", "val"); - Flush(cf); + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Flush(cf)); TablePropertiesCollection fname_to_props; ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); @@ -251,7 +389,89 @@ } } -TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { +TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) { + CreateAndReopenWithCF({"goku"}, CurrentOptions()); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + std::string id, sid; + ASSERT_OK(db_->GetDbIdentity(id)); + ASSERT_OK(db_->GetDbSessionId(sid)); + ASSERT_EQ(id, fname_to_props.begin()->second->db_id); + ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id); + } +} + +class DBTableHostnamePropertyTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DBTableHostnamePropertyTest() + : DBTestBase("db_table_hostname_property_test", + /*env_do_fsync=*/false) {} +}; + 
+TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) { + option_config_ = std::get<0>(GetParam()); + Options opts = CurrentOptions(); + std::string expected_host_id = std::get<1>(GetParam()); + ; + if (expected_host_id == kHostnameForDbHostId) { + ASSERT_OK(env_->GetHostNameString(&expected_host_id)); + } else { + opts.db_host_id = expected_host_id; + } + CreateAndReopenWithCF({"goku"}, opts); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id); + } +} + +INSTANTIATE_TEST_CASE_P( + DBTableHostnamePropertyTest, DBTableHostnamePropertyTest, + ::testing::Values( + // OptionConfig, override db_host_location + std::make_tuple(DBTestBase::OptionConfig::kDefault, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kDefault, ""), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + ""))); + +class DeletionTriggeredCompactionTestListener : public EventListener { + public: + void OnCompactionBegin(DB* , const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.compaction_reason, + CompactionReason::kFilesMarkedForCompaction); + } + + void OnCompactionCompleted(DB* , const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.compaction_reason, + CompactionReason::kFilesMarkedForCompaction); + } +}; + +TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { int kNumKeys = 1000; int kWindowSize = 100; int kNumDelsTrigger = 90; @@ -259,28 +479,37 @@ 
NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger); Options opts = CurrentOptions(); + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); opts.table_properties_collector_factories.emplace_back(compact_on_del); + + if(GetParam() == "kCompactionStyleUniversal") { + opts.compaction_style = kCompactionStyleUniversal; + } Reopen(opts); // add an L1 file to prevent tombstones from dropping due to obsolescence // during flush - Put(Key(0), "val"); - Flush(); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); MoveFilesToLevel(1); + DeletionTriggeredCompactionTestListener *listener = + new DeletionTriggeredCompactionTestListener(); + opts.listeners.emplace_back(listener); + Reopen(opts); + for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_GT(NumTableFilesAtLevel(1), 0); // Change the window size and deletion trigger and ensure new values take // effect @@ -293,16 +522,15 @@ for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_GT(NumTableFilesAtLevel(1), 0); // Change the window size to disable delete triggered compaction kWindowSize = 0; @@ -313,18 +541,75 @@ for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + 
ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED)); + ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED)); +} +TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) { + constexpr int kNumKeys = 1000; + constexpr int kWindowSize = 0; + constexpr int kNumDelsTrigger = 0; + constexpr double kDeletionRatio = 0.1; + std::shared_ptr compact_on_del = + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger, + kDeletionRatio); + + Options opts = CurrentOptions(); + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + opts.table_properties_collector_factories.emplace_back(compact_on_del); + + Reopen(opts); + + // Add an L2 file to prevent tombstones from dropping due to obsolescence + // during flush + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + auto* listener = new DeletionTriggeredCompactionTestListener(); + opts.listeners.emplace_back(listener); + Reopen(opts); + + // Generate one L0 with kNumKeys Put. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "not important")); + } + ASSERT_OK(Flush()); + + // Generate another L0 with kNumKeys Delete. + // This file, due to deletion ratio, will trigger compaction: 2@0 files to L1. + // The resulting L1 file has only one tombstone for user key 'Key(0)'. + // Again, due to deletion ratio, a compaction will be triggered: 1@1 + 1@2 + // files to L2. However, the resulting file is empty because the tombstone + // and value are both dropped. 
+ for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int i = 0; i < 3; ++i) { + ASSERT_EQ(0, NumTableFilesAtLevel(i)); + } } +INSTANTIATE_TEST_CASE_P( + DBTablePropertiesTest, + DBTablePropertiesTest, + ::testing::Values( + "kCompactionStyleLevel", + "kCompactionStyleUniversal" + )); + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -20,7 +20,8 @@ class DBTestTailingIterator : public DBTestBase { public: - DBTestTailingIterator() : DBTestBase("/db_tailing_iterator_test") {} + DBTestTailingIterator() + : DBTestBase("db_tailing_iterator_test", /*env_do_fsync=*/true) {} }; TEST_F(DBTestTailingIterator, TailingIteratorSingle) { @@ -30,6 +31,7 @@ std::unique_ptr iter(db_->NewIterator(read_options)); iter->SeekToFirst(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); // add a record and check that iter can see it ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); @@ -47,6 +49,7 @@ read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::string value(1024, 'a'); const int num_records = 10000; @@ -69,7 +72,9 @@ read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::string value(1024, 'a'); const int num_records = 1000; @@ -137,8 +142,11 @@ Slice keyu(bufe, 20); read_options.iterate_upper_bound = &keyu; std::unique_ptr iter(db_->NewIterator(read_options, 
handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::unique_ptr iterh(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iterh->status()); std::string value(1024, 'a'); bool file_iters_deleted = false; bool file_iters_renewed_null = false; @@ -178,7 +186,7 @@ if (i % 100 == 99) { ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (i == 299) { file_iters_deleted = true; } @@ -224,6 +232,7 @@ ReopenWithColumnFamilies({"default", "pikachu"}, options); read_options.read_tier = kBlockCacheTier; std::unique_ptr iteri(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iteri->status()); char buf5[32]; snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2); Slice target1(buf5, 20); @@ -235,6 +244,7 @@ options.table_factory.reset(NewBlockBasedTableFactory()); ReopenWithColumnFamilies({"default", "pikachu"}, options); iter.reset(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); for (int i = 2 * num_records; i > 0; --i) { char buf1[32]; char buf2[32]; @@ -261,6 +271,7 @@ read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); // write a single record, read it using the iterator, then delete it ASSERT_OK(Put(1, "0test", "test")); @@ -308,6 +319,7 @@ CreateAndReopenWithCF({"pikachu"}, options); std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); ASSERT_OK(Put(1, "0101", "test")); ASSERT_OK(Flush(1)); @@ -338,6 +350,7 @@ ASSERT_OK(db_->Put(WriteOptions(), key, value)); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); // we either see the entry or it's not in cache ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); @@ -368,6 +381,7 @@ } std::unique_ptr iter(db_->NewIterator(read_options)); + 
ASSERT_OK(iter->status()); // Seek to 00001. We expect to find 00002. std::string start_key = "00001"; iter->Seek(start_key); @@ -403,6 +417,7 @@ ASSERT_OK(Put(1, "21", "21")); std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(it->status()); it->Seek("12"); ASSERT_TRUE(it->Valid()); ASSERT_EQ("12", it->key().ToString()); @@ -410,7 +425,7 @@ it->Next(); // Not valid since "21" is over the upper bound. ASSERT_FALSE(it->Valid()); - + ASSERT_OK(it->status()); // This keeps track of the number of times NeedToSeekImmutable() was true. int immutable_seeks = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -423,6 +438,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); ASSERT_EQ(0, immutable_seeks); } @@ -477,6 +493,8 @@ it->Next(); ASSERT_TRUE(it->Valid()); ASSERT_EQ("40", it->key().ToString()); + + ASSERT_OK(it->status()); } TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) { @@ -495,6 +513,7 @@ ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->Seek("aa"); ASSERT_TRUE(iter->Valid()); @@ -517,6 +536,7 @@ ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,11 +11,13 @@ // in Release build. 
// which is a pity, it is a good test #include + #include #include #include #include #include + #ifndef OS_WIN #include #endif @@ -24,7 +26,8 @@ #endif #include "cache/lru_cache.h" -#include "db/blob_index.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" @@ -33,7 +36,6 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" -#include "memtable/hash_linklist_rep.h" #include "monitoring/thread_status_util.h" #include "port/port.h" #include "port/stack_trace.h" @@ -52,27 +54,30 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/thread_status.h" +#include "rocksdb/types.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/compression.h" #include "util/mutexlock.h" +#include "util/random.h" #include "util/rate_limiter.h" #include "util/string_util.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { +// Note that whole DBTest and its child classes disable fsync on files +// and directories for speed. +// If fsync needs to be covered in a test, put it in other places. 
class DBTest : public DBTestBase { public: - DBTest() : DBTestBase("/db_test") {} + DBTest() : DBTestBase("db_test", /*env_do_fsync=*/false) {} }; class DBTestWithParam @@ -93,7 +98,7 @@ }; TEST_F(DBTest, MockEnvTest) { - std::unique_ptr env{new MockEnv(Env::Default())}; + std::unique_ptr env{MockEnv::Create(Env::Default())}; Options options; options.create_if_missing = true; options.env = env.get(); @@ -126,7 +131,7 @@ // TEST_FlushMemTable() is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE - DBImpl* dbi = reinterpret_cast(db); + DBImpl* dbi = static_cast_with_check(db); ASSERT_OK(dbi->TEST_FlushMemTable()); for (size_t i = 0; i < 3; ++i) { @@ -174,7 +179,7 @@ ASSERT_TRUE(!iterator->Valid()); delete iterator; - DBImpl* dbi = reinterpret_cast(db); + DBImpl* dbi = static_cast_with_check(db); ASSERT_OK(dbi->TEST_FlushMemTable()); for (size_t i = 0; i < 3; ++i) { @@ -245,17 +250,21 @@ wo.sync = sync; wo.disableWAL = disableWAL; wo.no_slowdown = true; - dbfull()->Put(wo, "foo", "bar"); + // Large enough to exceed allowance for one time interval + std::string large_value(1024, 'x'); + // Perhaps ideally this first write would fail because of delay, but + // the current implementation does not guarantee that. + dbfull()->Put(wo, "foo", large_value).PermitUncheckedError(); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. 
- ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2")); + ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value)); ASSERT_GE(sleep_count.load(), 0); ASSERT_GE(wait_count.load(), 0); token.reset(); - token = dbfull()->TEST_write_controler().GetDelayToken(1000000000); + token = dbfull()->TEST_write_controler().GetDelayToken(1000000); wo.no_slowdown = false; - ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3")); + ASSERT_OK(dbfull()->Put(wo, "foo3", large_value)); ASSERT_GE(sleep_count.load(), 1); token.reset(); } @@ -308,7 +317,7 @@ wo.sync = false; wo.disableWAL = false; wo.no_slowdown = false; - dbfull()->Put(wo, "foo", "bar"); + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); @@ -366,7 +375,7 @@ wo.sync = false; wo.disableWAL = false; wo.no_slowdown = false; - dbfull()->Put(wo, "foo", "bar"); + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); @@ -435,7 +444,7 @@ wo.sync = false; wo.disableWAL = false; wo.no_slowdown = false; - dbfull()->Put(wo, "foo", "bar"); + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); @@ -616,24 +625,24 @@ // Put values on second level (so that they will not be in the same // compaction as the other operations. 
- Put(1, "foo", "first"); - Put(1, "bar", "one"); + ASSERT_OK(Put(1, "foo", "first")); + ASSERT_OK(Put(1, "bar", "one")); ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); // (Single) delete hidden by a put - SingleDelete(1, "foo"); - Put(1, "foo", "second"); - Delete(1, "bar"); - Put(1, "bar", "two"); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_OK(Put(1, "foo", "second")); + ASSERT_OK(Delete(1, "bar")); + ASSERT_OK(Put(1, "bar", "two")); ASSERT_OK(Flush(1)); - SingleDelete(1, "foo"); - Delete(1, "bar"); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_OK(Delete(1, "bar")); ASSERT_OK(Flush(1)); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get(1, "bar")); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); @@ -654,9 +663,9 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", Slice()); - Put(1, "a", Slice()); - SingleDelete(1, "a"); + ASSERT_OK(Put(1, "foo", Slice())); + ASSERT_OK(Put(1, "a", Slice())); + ASSERT_OK(SingleDelete(1, "a")); ASSERT_OK(Flush(1)); ASSERT_EQ("[ ]", AllEntriesFor("a", 1)); @@ -764,8 +773,8 @@ // Block sync calls env_->delay_sstable_sync_.store(true, std::memory_order_release); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(0, "foo")); // Release sync calls @@ -843,19 +852,19 @@ // occurring at level 1 (instead of the correct level 0). 
// Step 1: First place sstables in levels 0 and 2 - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); // Step 2: clear level 1 if necessary. - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1); @@ -866,7 +875,7 @@ } // Step 4: Wait for compaction to finish - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); @@ -902,6 +911,9 @@ static_cast(options.write_buffer_size); options.max_write_buffer_number = 2; options.write_buffer_size = 120 * 1024; + auto flush_listener = std::make_shared(); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull; + options.listeners.push_back(flush_listener); CreateAndReopenWithCF({"pikachu"}, options); std::vector threads; @@ -914,7 +926,7 @@ WriteOptions wo; // this should fill up 2 memtables for (int k = 0; k < 5000; ++k) { - ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + ASSERT_OK(db_->Put(wo, handles_[a & 1], rnd.RandomString(13), "")); } }; @@ -973,7 +985,7 @@ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, std::string* /*new_value*/, 
bool* /*value_changed*/) const override { - db_test->env_->addon_time_.fetch_add(1000); + db_test->env_->MockSleepForMicroseconds(1000); return true; } @@ -1018,10 +1030,10 @@ } void CheckColumnFamilyMeta( - const ColumnFamilyMetaData& cf_meta, + const ColumnFamilyMetaData& cf_meta, const std::string& cf_name, const std::vector>& files_by_level, uint64_t start_time, uint64_t end_time) { - ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName); + ASSERT_EQ(cf_meta.name, cf_name); ASSERT_EQ(cf_meta.levels.size(), files_by_level.size()); uint64_t cf_size = 0; @@ -1115,6 +1127,53 @@ } #ifndef ROCKSDB_LITE +void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number, + uint64_t total_blob_count, uint64_t total_blob_bytes, + const std::string& checksum_method, + const std::string& checksum_value, + uint64_t garbage_blob_count = 0, + uint64_t garbage_blob_bytes = 0) { + ColumnFamilyData* cfd = + (static_cast(cfh))->cfd(); + assert(cfd); + + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + // Add a live blob file. 
+ + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + auto meta = BlobFileMetaData::Create(std::move(shared_meta), + BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); +} + +static void CheckBlobMetaData( + const BlobMetaData& bmd, uint64_t blob_file_number, + uint64_t total_blob_count, uint64_t total_blob_bytes, + const std::string& checksum_method, const std::string& checksum_value, + uint64_t garbage_blob_count = 0, uint64_t garbage_blob_bytes = 0) { + ASSERT_EQ(bmd.blob_file_number, blob_file_number); + ASSERT_EQ(bmd.blob_file_name, BlobFileName("", blob_file_number)); + ASSERT_EQ(bmd.blob_file_size, + total_blob_bytes + BlobLogHeader::kSize + BlobLogFooter::kSize); + + ASSERT_EQ(bmd.total_blob_count, total_blob_count); + ASSERT_EQ(bmd.total_blob_bytes, total_blob_bytes); + ASSERT_EQ(bmd.garbage_blob_count, garbage_blob_count); + ASSERT_EQ(bmd.garbage_blob_bytes, garbage_blob_bytes); + ASSERT_EQ(bmd.checksum_method, checksum_method); + ASSERT_EQ(bmd.checksum_value, checksum_value); +} + TEST_F(DBTest, MetaDataTest) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -1144,7 +1203,7 @@ // Fill up the rest of the file with random values. 
GenerateNewFile(&rnd, &key_index, /* nowait */ true); - Flush(); + ASSERT_OK(Flush()); } std::vector> files_by_level; @@ -1155,13 +1214,71 @@ ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); - CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time); - + CheckColumnFamilyMeta(cf_meta, kDefaultColumnFamilyName, files_by_level, + start_time, end_time); std::vector live_file_meta; db_->GetLiveFilesMetaData(&live_file_meta); CheckLiveFilesMeta(live_file_meta, files_by_level); } +TEST_F(DBTest, AllMetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time).PermitUncheckedError(); + uint64_t start_time = static_cast(temp_time); + + Random rnd(301); + dbfull()->TEST_LockMutex(); + for (int cf = 0; cf < 2; cf++) { + AddBlobFile(handles_[cf], blob_file_number * (cf + 1), + total_blob_count * (cf + 1), total_blob_bytes * (cf + 1), + checksum_method, checksum_value); + } + dbfull()->TEST_UnlockMutex(); + + std::vector all_meta; + db_->GetAllColumnFamilyMetaData(&all_meta); + + std::vector> default_files_by_level; + std::vector> pikachu_files_by_level; + dbfull()->TEST_GetFilesMetaData(handles_[0], &default_files_by_level); + dbfull()->TEST_GetFilesMetaData(handles_[1], &pikachu_files_by_level); + + options.env->GetCurrentTime(&temp_time).PermitUncheckedError(); + uint64_t end_time = static_cast(temp_time); + + ASSERT_EQ(all_meta.size(), 2); + for (int cf = 0; cf < 2; cf++) { + const auto& cfmd = all_meta[cf]; + if (cf == 0) { + CheckColumnFamilyMeta(cfmd, "default", default_files_by_level, 
start_time, + end_time); + } else { + CheckColumnFamilyMeta(cfmd, "pikachu", pikachu_files_by_level, start_time, + end_time); + } + ASSERT_EQ(cfmd.blob_files.size(), 1U); + const auto& bmd = cfmd.blob_files[0]; + ASSERT_EQ(cfmd.blob_file_count, 1U); + ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size); + ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_)); + CheckBlobMetaData(bmd, blob_file_number * (cf + 1), + total_blob_count * (cf + 1), total_blob_bytes * (cf + 1), + checksum_method, checksum_value); + } +} + namespace { void MinLevelHelper(DBTest* self, Options& options) { Random rnd(301); @@ -1171,20 +1288,20 @@ std::vector values; // Write 120KB (12 values, each 10K) for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); + values.push_back(rnd.RandomString(10000)); ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); } - self->dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(self->dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); } // generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); + values.push_back(rnd.RandomString(10000)); ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); } - self->dbfull()->TEST_WaitForCompact(); + ASSERT_OK(self->dbfull()->TEST_WaitForCompact()); ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); @@ -1294,7 +1411,7 @@ Random rnd(301); std::string value = - RandomString(&rnd, static_cast(2 * options.write_buffer_size)); + rnd.RandomString(static_cast(2 * options.write_buffer_size)); for (int i = 0; i < 5 * kMaxFiles; i++) { ASSERT_OK(Put(1, "key", value)); ASSERT_LE(TotalTableFiles(1), kMaxFiles); @@ -1303,51 +1420,6 @@ } #endif // ROCKSDB_LITE -TEST_F(DBTest, SparseMerge) { - do { - Options options = CurrentOptions(); - options.compression = kNoCompression; - 
CreateAndReopenWithCF({"pikachu"}, options); - - FillLevels("A", "Z", 1); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. - const std::string value(1000, 'x'); - Put(1, "A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); - } - Put(1, "C", "vc"); - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - - // Make sparse update - Put(1, "A", "va2"); - Put(1, "B100", "bvalue2"); - Put(1, "C", "vc2"); - ASSERT_OK(Flush(1)); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - } while (ChangeCompactOptions()); -} - #ifndef ROCKSDB_LITE static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); @@ -1370,7 +1442,7 @@ const int N = 128; Random rnd(301); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); } uint64_t size; @@ -1380,33 +1452,37 @@ SizeApproximationOptions size_approx_options; size_approx_options.include_memtabtles = true; size_approx_options.include_files = true; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, 
default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table - db_->GetApproximateSizes(&r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); ASSERT_EQ(size, 0); start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); } start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); options.max_write_buffer_number = 8; @@ -1421,58 +1497,64 @@ keys[i * 3 + 1] = i * 5 + 1; keys[i * 3 + 2] = i * 5 + 2; } - std::random_shuffle(std::begin(keys), std::end(keys)); + // MemTable entry counting is estimated and can vary greatly depending on + // layout. Thus, using deterministic seed for test stability. 
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); for (int i = 0; i < N * 3; i++) { - ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(keys[i] + 1000), rnd.RandomString(1024))); } start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); ASSERT_GT(size_with_mt, 6000); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_EQ(size_without_mt, 0); - Flush(); + ASSERT_OK(Flush()); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i + 1000), rnd.RandomString(1024))); } start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_GT(size_with_mt, size_without_mt); 
ASSERT_GT(size_without_mt, 6000); // Check that include_memtabtles flag works as expected size_approx_options.include_memtabtles = false; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, size_without_mt); // Check that files_size_error_margin works as expected, when the heuristic @@ -1481,63 +1563,92 @@ end = Key(1000 + N - 2); r = Range(start, end); size_approx_options.files_size_error_margin = -1.0; // disabled - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); uint64_t size2; size_approx_options.files_size_error_margin = 0.5; // enabled, but not used - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2)); ASSERT_EQ(size, size2); } TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + // Roughly 4 keys per data block, 1000 keys per file, + // with filter substantially larger than a data block + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(16)); + table_options.block_size = 100; Options options = CurrentOptions(); - options.write_buffer_size = 1024 * 1024; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.write_buffer_size = 24 * 1024; options.compression = kNoCompression; options.create_if_missing = true; - options.target_file_size_base = 1024 * 1024; + options.target_file_size_base = 24 * 1024; DestroyAndReopen(options); const auto default_cf = db_->DefaultColumnFamily(); const int N = 64000; Random rnd(301); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(24))); } // Flush everything to files - Flush(); + ASSERT_OK(Flush()); // Compact the entire key space 
into the next level - db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr); + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr)); // Write more keys for (int i = N; i < (N + N / 4); i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(24))); } // Flush everything to files again - Flush(); + ASSERT_OK(Flush()); // Wait for compaction to finish ASSERT_OK(dbfull()->TEST_WaitForCompact()); - const std::string start = Key(0); - const std::string end = Key(2 * N); - const Range r(start, end); - - SizeApproximationOptions size_approx_options; - size_approx_options.include_memtabtles = false; - size_approx_options.include_files = true; - size_approx_options.files_size_error_margin = -1.0; // disabled - - // Get the precise size without any approximation heuristic - uint64_t size; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); - ASSERT_NE(size, 0); + { + const std::string start = Key(0); + const std::string end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size)); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size2)); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); + } - // Get the size with an approximation heuristic - uint64_t size2; - const double error_margin = 0.2; - size_approx_options.files_size_error_margin = 
error_margin; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); - ASSERT_LT(size2, size * (1 + error_margin)); - ASSERT_GT(size2, size * (1 - error_margin)); + { + // Ensure that metadata is not falsely attributed only to the last data in + // the file. (In some applications, filters can be large portion of data + // size.) + // Perform many queries over small range, enough to ensure crossing file + // boundary, and make sure we never see a spike for large filter. + for (int i = 0; i < 3000; i += 10) { + const std::string start = Key(i); + const std::string end = Key(i + 11); // overlap by 1 key + const Range r(start, end); + uint64_t size; + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_LE(size, 11 * 100); + } + } } TEST_F(DBTest, GetApproximateMemTableStats) { @@ -1550,7 +1661,7 @@ const int N = 128; Random rnd(301); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); } uint64_t count; @@ -1572,7 +1683,7 @@ ASSERT_EQ(count, 0); ASSERT_EQ(size, 0); - Flush(); + ASSERT_OK(Flush()); start = Key(50); end = Key(60); @@ -1582,7 +1693,7 @@ ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); } start = Key(100); @@ -1602,9 +1713,12 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + uint64_t size; + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Write 8MB (80 values, each 100K) ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -1613,11 +1727,12 @@ static const int S2 = 105000; // Allow some expansion from metadata Random rnd(301); for (int i = 0; i < N; i++) { - 
ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1))); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(S1))); } // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { @@ -1625,20 +1740,23 @@ for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); - ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), - S2 * (i + 1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); + ASSERT_OK(Size("", Key(i), 1, &size)); + ASSERT_TRUE(Between(size, S1 * i, S2 * i)); + ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1))); + ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 10, S2 * 10)); } - ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); - ASSERT_TRUE( - Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); std::string cstart_str = Key(compact_start); std::string cend_str = Key(compact_start + 9); Slice cstart = cstart_str; Slice cend = cend_str; - dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1])); } ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -1656,33 +1774,45 @@ CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); + std::string big1 = rnd.RandomString(100000); + 
ASSERT_OK(Put(1, Key(0), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(1), rnd.RandomString(10000))); ASSERT_OK(Put(1, Key(2), big1)); - ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(3), rnd.RandomString(10000))); ASSERT_OK(Put(1, Key(4), big1)); - ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(5), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(6), rnd.RandomString(300000))); + ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000))); // Check sizes across recovery by reopening a few times + uint64_t size; for (int run = 0; run < 3; run++) { ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); + ASSERT_OK(Size("", Key(0), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + ASSERT_OK(Size("", Key(1), 1, &size)); + ASSERT_TRUE(Between(size, 10000, 11000)); + ASSERT_OK(Size("", Key(2), 1, &size)); + ASSERT_TRUE(Between(size, 20000, 21000)); + ASSERT_OK(Size("", Key(3), 1, &size)); + ASSERT_TRUE(Between(size, 120000, 121000)); + ASSERT_OK(Size("", Key(4), 1, &size)); + ASSERT_TRUE(Between(size, 130000, 131000)); + ASSERT_OK(Size("", Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 230000, 232000)); + ASSERT_OK(Size("", Key(6), 1, &size)); + ASSERT_TRUE(Between(size, 240000, 242000)); + // Ensure some overhead is accounted for, even without including all + 
ASSERT_OK(Size("", Key(7), 1, &size)); + ASSERT_TRUE(Between(size, 540500, 545000)); + ASSERT_OK(Size("", Key(8), 1, &size)); + ASSERT_TRUE(Between(size, 550500, 555000)); - ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); + ASSERT_OK(Size(Key(3), Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 110100, 111000)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); } // ApproximateOffsetOf() is not yet implemented in plain table format. } while (ChangeOptions(kSkipPlainTable)); @@ -1691,29 +1821,30 @@ #ifndef ROCKSDB_LITE TEST_F(DBTest, Snapshot) { + env_->SetMockSleep(); anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - Put(0, "foo", "0v1"); - Put(1, "foo", "1v1"); + ASSERT_OK(Put(0, "foo", "0v1")); + ASSERT_OK(Put(1, "foo", "1v1")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_EQ(1U, GetNumSnapshots()); uint64_t time_snap1 = GetTimeOldestSnapshots(); ASSERT_GT(time_snap1, 0U); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v2"); - Put(1, "foo", "1v2"); + ASSERT_OK(Put(0, "foo", "0v2")); + ASSERT_OK(Put(1, "foo", "1v2")); - env_->addon_time_.fetch_add(1); + env_->MockSleepForSeconds(1); const Snapshot* s2 = db_->GetSnapshot(); ASSERT_EQ(2U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v3"); - Put(1, "foo", "1v3"); + ASSERT_OK(Put(0, "foo", "0v3")); + ASSERT_OK(Put(1, "foo", "1v3")); { ManagedSnapshot s3(db_); @@ -1721,8 +1852,8 @@ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v4"); - Put(1, "foo", "1v4"); + ASSERT_OK(Put(0, "foo", "0v4")); + ASSERT_OK(Put(1, "foo", "1v4")); ASSERT_EQ("0v1", Get(0, "foo", s1)); 
ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); @@ -1763,35 +1894,38 @@ TEST_F(DBTest, HiddenValuesAreRemoved) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; + uint64_t size; do { Options options = CurrentOptions(options_override); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); FillLevels("a", "z", 1); - std::string big = RandomString(&rnd, 50000); - Put(1, "foo", big); - Put(1, "pastfoo", "v"); + std::string big = rnd.RandomString(50000); + ASSERT_OK(Put(1, "foo", big)); + ASSERT_OK(Put(1, "pastfoo", "v")); const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "tiny"); - Put(1, "pastfoo2", "v2"); // Advance sequence number one more + ASSERT_OK(Put(1, "foo", "tiny")); + ASSERT_OK(Put(1, "pastfoo2", "v2")); // Advance sequence number one more ASSERT_OK(Flush(1)); ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(big, Get(1, "foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 50000, 60000)); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); Slice x("x"); - dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1])); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); - dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1])); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 0, 1000)); // ApproximateOffsetOf() is not yet implemented in plain table format, // which is used by Size(). 
} while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | @@ -1817,26 +1951,26 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "first"); + ASSERT_OK(Put(1, "foo", "first")); const Snapshot* snapshot = db_->GetSnapshot(); - SingleDelete(1, "foo"); - Put(1, "foo", "second"); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_OK(Put(1, "foo", "second")); ASSERT_OK(Flush(1)); ASSERT_EQ("first", Get(1, "foo", snapshot)); ASSERT_EQ("second", Get(1, "foo")); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); - SingleDelete(1, "foo"); + ASSERT_OK(SingleDelete(1, "foo")); ASSERT_EQ("first", Get(1, "foo", snapshot)); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ("first", Get(1, "foo", snapshot)); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); @@ -1852,7 +1986,7 @@ TEST_F(DBTest, DeletionMarkers1) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); + ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); const int last = 2; MoveFilesToLevel(last, 1); @@ -1860,24 +1994,25 @@ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); MoveFilesToLevel(last - 1, 1); ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - Delete(1, "foo"); - Put(1, "foo", "v2"); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, 
DEL, v1 ]"); ASSERT_OK(Flush(1)); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); Slice z("z"); - dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1])); // DEL eliminated, but v1 remains because we aren't compacting that level // (DEL can be eliminated because v2 hides v1). ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1])); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); @@ -1886,7 +2021,7 @@ TEST_F(DBTest, DeletionMarkers2) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); + ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); const int last = 2; MoveFilesToLevel(last, 1); @@ -1894,21 +2029,23 @@ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); MoveFilesToLevel(last - 1, 1); ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - Delete(1, "foo"); + ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); ASSERT_OK(Flush(1)); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1])); // DEL kept: "last" file overlaps ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 1, nullptr, 
nullptr, handles_[1])); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); @@ -1923,11 +2060,11 @@ // 0. ASSERT_OK(Put(1, "100", "v100")); ASSERT_OK(Put(1, "999", "v999")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Delete(1, "100")); ASSERT_OK(Delete(1, "999")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(1, 1); ASSERT_EQ("0,1,1", FilesPerLevel(1)); @@ -1937,23 +2074,30 @@ // Note that files are sorted by smallest key. ASSERT_OK(Put(1, "300", "v300")); ASSERT_OK(Put(1, "500", "v500")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "200", "v200")); ASSERT_OK(Put(1, "600", "v600")); ASSERT_OK(Put(1, "900", "v900")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ("2,1,1", FilesPerLevel(1)); + // BEGIN addition to existing test + // Take this opportunity to verify SST unique ids (including Plain table) + TablePropertiesCollection tbc; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc)); + VerifySstUniqueIds(tbc); + // END addition to existing test + // Compact away the placeholder files we created initially - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1])); ASSERT_EQ("2", FilesPerLevel(1)); // Do a memtable compaction. Before bug-fix, the compaction would // not detect the overlap with level-0 files and would incorrectly place // the deletion in a deeper level. 
ASSERT_OK(Delete(1, "600")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ("3", FilesPerLevel(1)); ASSERT_EQ("NOT_FOUND", Get(1, "600")); } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); @@ -2099,7 +2243,7 @@ ASSERT_OK(Put(1, "a", "123")); ASSERT_OK(Put(1, "b", "234")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(3, 1); Close(); @@ -2159,7 +2303,7 @@ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); + values.push_back(rnd.RandomString(100000)); ASSERT_OK(Put((i < 40), Key(i), values[i])); } @@ -2170,8 +2314,8 @@ uint64_t manifest_number = 0; uint64_t manifest_size = 0; std::vector files; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(files, &manifest_size); + ASSERT_OK(dbfull()->DisableFileDeletions()); + ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size)); // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF) ASSERT_EQ(files.size(), 5U); @@ -2181,7 +2325,10 @@ // copy these files to a new snapshot directory std::string snapdir = dbname_ + ".snapdir/"; - ASSERT_OK(env_->CreateDirIfMissing(snapdir)); + if (env_->FileExists(snapdir).ok()) { + ASSERT_OK(DestroyDir(env_, snapdir)); + } + ASSERT_OK(env_->CreateDir(snapdir)); for (size_t i = 0; i < files.size(); i++) { // our clients require that GetLiveFiles returns @@ -2197,22 +2344,21 @@ // latest manifest file if (ParseFileName(files[i].substr(1), &number, &type)) { if (type == kDescriptorFile) { - if (number > manifest_number) { - manifest_number = number; - ASSERT_GE(size, manifest_size); - size = manifest_size; // copy only valid MANIFEST data - } + ASSERT_EQ(manifest_number, 0); + manifest_number = number; + ASSERT_GE(size, manifest_size); + size = manifest_size; // copy only valid MANIFEST data } } CopyFile(src, dest, size); } // release file snapshot - dbfull()->DisableFileDeletions(); + ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false)); // overwrite one 
key, this key should not appear in the snapshot std::vector extras; for (unsigned int i = 0; i < 1; i++) { - extras.push_back(RandomString(&rnd, 100000)); + extras.push_back(rnd.RandomString(100000)); ASSERT_OK(Put(0, Key(i), extras[i])); } @@ -2232,7 +2378,7 @@ ReadOptions roptions; std::string val; for (unsigned int i = 0; i < 80; i++) { - stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); + ASSERT_OK(snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val)); ASSERT_EQ(values[i].compare(val), 0); } for (auto cfh : cf_handles) { @@ -2245,8 +2391,8 @@ uint64_t new_manifest_number = 0; uint64_t new_manifest_size = 0; std::vector newfiles; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(newfiles, &new_manifest_size); + ASSERT_OK(dbfull()->DisableFileDeletions()); + ASSERT_OK(dbfull()->GetLiveFiles(newfiles, &new_manifest_size)); // find the new manifest file. assert that this manifest file is // the same one as in the previous snapshot. But its size should be @@ -2258,20 +2404,41 @@ // latest manifest file if (ParseFileName(newfiles[i].substr(1), &number, &type)) { if (type == kDescriptorFile) { - if (number > new_manifest_number) { - uint64_t size; - new_manifest_number = number; - ASSERT_OK(env_->GetFileSize(src, &size)); - ASSERT_GE(size, new_manifest_size); - } + ASSERT_EQ(new_manifest_number, 0); + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); } } } ASSERT_EQ(manifest_number, new_manifest_number); ASSERT_GT(new_manifest_size, manifest_size); - // release file snapshot - dbfull()->DisableFileDeletions(); + // Also test GetLiveFilesStorageInfo + std::vector new_infos; + ASSERT_OK(dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), + &new_infos)); + + // Close DB (while deletions disabled) + Close(); + + // Validate + for (auto& info : new_infos) { + std::string path = info.directory + "/" + info.relative_filename; + uint64_t size; + 
ASSERT_OK(env_->GetFileSize(path, &size)); + if (info.trim_to_size) { + ASSERT_LE(info.size, size); + } else if (!info.replacement_contents.empty()) { + ASSERT_EQ(info.size, info.replacement_contents.size()); + } else { + ASSERT_EQ(info.size, size); + } + if (info.file_type == kDescriptorFile) { + ASSERT_EQ(info.file_number, manifest_number); + } + } } while (ChangeCompactOptions()); } @@ -2292,7 +2459,7 @@ uint64_t manifest_size = 0; std::vector files; - dbfull()->GetLiveFiles(files, &manifest_size); + ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size)); for (const std::string& f : files) { uint64_t number = 0; @@ -2300,7 +2467,7 @@ if (ParseFileName(f.substr(1), &number, &type)) { if (type == kDescriptorFile) { uint64_t size_on_disk; - env_->GetFileSize(dbname_ + "/" + f, &size_on_disk); + ASSERT_OK(env_->GetFileSize(dbname_ + "/" + f, &size_on_disk)); ASSERT_EQ(manifest_size, size_on_disk); break; } @@ -2309,16 +2476,58 @@ Close(); } while (ChangeCompactOptions()); } + +TEST_F(DBTest, GetLiveBlobFiles) { + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) below and the periodic stat dumping thread. + Options options = CurrentOptions(); + options.stats_dump_period_sec = 0; + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + Reopen(options); + + AddBlobFile(db_->DefaultColumnFamily(), blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, checksum_value, + garbage_blob_count, garbage_blob_bytes); + // Make sure it appears in the results returned by GetLiveFiles. 
+ uint64_t manifest_size = 0; + std::vector files; + ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size)); + + ASSERT_FALSE(files.empty()); + ASSERT_EQ(files[0], BlobFileName("", blob_file_number)); + + ColumnFamilyMetaData cfmd; + + db_->GetColumnFamilyMetaData(&cfmd); + ASSERT_EQ(cfmd.blob_files.size(), 1); + const BlobMetaData& bmd = cfmd.blob_files[0]; + + CheckBlobMetaData(bmd, blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value, garbage_blob_count, + garbage_blob_bytes); + ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_)); + ASSERT_EQ(cfmd.blob_file_count, 1U); + ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size); +} #endif TEST_F(DBTest, PurgeInfoLogs) { Options options = CurrentOptions(); options.keep_log_file_num = 5; options.create_if_missing = true; + options.env = env_; for (int mode = 0; mode <= 1; mode++) { if (mode == 1) { options.db_log_dir = dbname_ + "_logs"; - env_->CreateDirIfMissing(options.db_log_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.db_log_dir)); } else { options.db_log_dir = ""; } @@ -2327,8 +2536,8 @@ } std::vector files; - env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir, - &files); + ASSERT_OK(env_->GetChildren( + options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files)); int info_log_count = 0; for (std::string file : files) { if (file.find("LOG") != std::string::npos) { @@ -2340,19 +2549,18 @@ Destroy(options); // For mode (1), test DestroyDB() to delete all the logs under DB dir. // For mode (2), no info log file should have been put under DB dir. 
+ // Since dbname_ has no children, there is no need to loop db_files std::vector db_files; - env_->GetChildren(dbname_, &db_files); - for (std::string file : db_files) { - ASSERT_TRUE(file.find("LOG") == std::string::npos); - } + ASSERT_TRUE(env_->GetChildren(dbname_, &db_files).IsNotFound()); + ASSERT_TRUE(db_files.empty()); if (mode == 1) { // Cleaning up - env_->GetChildren(options.db_log_dir, &files); + ASSERT_OK(env_->GetChildren(options.db_log_dir, &files)); for (std::string file : files) { - env_->DeleteFile(options.db_log_dir + "/" + file); + ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file)); } - env_->DeleteDir(options.db_log_dir); + ASSERT_OK(env_->DeleteDir(options.db_log_dir)); } } } @@ -2368,9 +2576,7 @@ struct MTState { DBTest* test; - std::atomic stop; std::atomic counter[kNumThreads]; - std::atomic thread_done[kNumThreads]; }; struct MTThread { @@ -2384,10 +2590,13 @@ int id = t->id; DB* db = t->state->test->db_; int counter = 0; + std::shared_ptr clock = SystemClock::Default(); + auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U; + fprintf(stderr, "... 
starting thread %d\n", id); Random rnd(1000 + id); char valbuf[1500]; - while (t->state->stop.load(std::memory_order_acquire) == false) { + while (clock->NowMicros() < end_micros) { t->state->counter[id].store(counter, std::memory_order_release); int key = rnd.Uniform(kNumKeys); @@ -2407,7 +2616,8 @@ for (int cf = 0; cf < kColumnFamilies; ++cf) { snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf), + Slice(valbuf))); } ASSERT_OK(db->Write(WriteOptions(), &batch)); } else { @@ -2415,7 +2625,8 @@ for (int cf = 0; cf < kColumnFamilies; ++cf) { snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf), + Slice(valbuf))); } ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); } @@ -2482,7 +2693,6 @@ } counter++; } - t->state->thread_done[id].store(true, std::memory_order_release); fprintf(stderr, "... 
stopping thread %d after %d ops\n", id, int(counter)); } @@ -2521,10 +2731,8 @@ // Initialize state MTState mt; mt.test = this; - mt.stop.store(false, std::memory_order_release); for (int id = 0; id < kNumThreads; id++) { mt.counter[id].store(0, std::memory_order_release); - mt.thread_done[id].store(false, std::memory_order_release); } // Start threads @@ -2536,16 +2744,7 @@ env_->StartThread(MTThreadBody, &thread[id]); } - // Let them run for a while - env_->SleepForMicroseconds(kTestSeconds * 1000000); - - // Stop the threads and wait for them to finish - mt.stop.store(true, std::memory_order_release); - for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].load(std::memory_order_acquire) == false) { - env_->SleepForMicroseconds(100000); - } - } + env_->WaitForJoin(); } INSTANTIATE_TEST_CASE_P( @@ -2636,7 +2835,7 @@ #endif // TRAVIS namespace { -typedef std::map KVMap; +using KVMap = std::map; } class ModelDB : public DB { @@ -2657,7 +2856,10 @@ Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, const Slice& v) override { WriteBatch batch; - batch.Put(cf, k, v); + Status s = batch.Put(cf, k, v); + if (!s.ok()) { + return s; + } return Write(o, &batch); } using DB::Close; @@ -2666,21 +2868,30 @@ Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& key) override { WriteBatch batch; - batch.Delete(cf, key); + Status s = batch.Delete(cf, key); + if (!s.ok()) { + return s; + } return Write(o, &batch); } using DB::SingleDelete; Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& key) override { WriteBatch batch; - batch.SingleDelete(cf, key); + Status s = batch.SingleDelete(cf, key); + if (!s.ok()) { + return s; + } return Write(o, &batch); } using DB::Merge; Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, const Slice& v) override { WriteBatch batch; - batch.Merge(cf, k, v); + Status s = batch.Merge(cf, k, v); + if (!s.ok()) { + return s; + } return 
Write(o, &batch); } using DB::Get; @@ -2929,15 +3140,27 @@ Status SyncWAL() override { return Status::OK(); } -#ifndef ROCKSDB_LITE Status DisableFileDeletions() override { return Status::OK(); } Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } +#ifndef ROCKSDB_LITE + Status GetLiveFiles(std::vector&, uint64_t* /*size*/, bool /*flush_memtable*/ = true) override { return Status::OK(); } + Status GetLiveFilesChecksumInfo( + FileChecksumList* /*checksum_list*/) override { + return Status::OK(); + } + + Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& /*opts*/, + std::vector* /*files*/) override { + return Status::OK(); + } + Status GetSortedWalFiles(VectorLogPtr& /*files*/) override { return Status::OK(); } @@ -2970,12 +3193,26 @@ return Status::OK(); } + Status GetDbSessionId(std::string& /*session_id*/) const override { + return Status::OK(); + } + SequenceNumber GetLatestSequenceNumber() const override { return 0; } bool SetPreserveDeletesSequenceNumber(SequenceNumber /*seqnum*/) override { return true; } + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* /*cf*/, + std::string /*ts_low*/) override { + return Status::OK(); + } + + Status GetFullHistoryTsLow(ColumnFamilyHandle* /*cf*/, + std::string* /*ts_low*/) override { + return Status::OK(); + } + ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; } private: @@ -3025,7 +3262,7 @@ std::string name_ = ""; }; -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) static std::string RandomKey(Random* rnd, int minimum = 0) { int len; do { @@ -3061,7 +3298,7 @@ fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. 
'%s'\n", step, EscapeString(miter->key()).c_str(), EscapeString(miter->value()).c_str(), - EscapeString(miter->value()).c_str()); + EscapeString(dbiter->value()).c_str()); ok = false; } } @@ -3125,8 +3362,8 @@ } if (p < 45) { // Put k = RandomKey(&rnd, minimum); - v = RandomString(&rnd, - rnd.OneIn(20) ? 100 + rnd.Uniform(100) : rnd.Uniform(8)); + v = rnd.RandomString(rnd.OneIn(20) ? 100 + rnd.Uniform(100) + : rnd.Uniform(8)); ASSERT_OK(model.Put(WriteOptions(), k, v)); ASSERT_OK(db_->Put(WriteOptions(), k, v)); } else if (p < 90) { // Delete @@ -3144,10 +3381,10 @@ // we have multiple entries in the write batch for the same key } if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); + v = rnd.RandomString(rnd.Uniform(10)); + ASSERT_OK(b.Put(k, v)); } else { - b.Delete(k); + ASSERT_OK(b.Delete(k)); } } ASSERT_OK(model.Write(WriteOptions(), &b)); @@ -3180,7 +3417,7 @@ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { // create a DB with block prefix index @@ -3192,7 +3429,7 @@ Reopen(options); ASSERT_OK(Put("k1", "v1")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("k2", "v2")); // Reopen it without prefix extractor, make sure everything still works. 
@@ -3205,6 +3442,27 @@ ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); } +TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewCappedPrefixTransform(2)); + + Reopen(options); + ASSERT_OK(Put("kk1", "v1")); + ASSERT_OK(Put("kk2", "v2")); + ASSERT_OK(Put("kk", "v3")); + ASSERT_OK(Put("k", "v4")); + Flush(); + + ASSERT_EQ("v1", Get("kk1")); + ASSERT_EQ("v2", Get("kk2")); + + ASSERT_EQ("v3", Get("kk")); + ASSERT_EQ("v4", Get("k")); +} TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) { // create a DB with block prefix index @@ -3225,7 +3483,7 @@ Reopen(options); ASSERT_OK(Put("k1", "v1")); - Flush(); + ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; @@ -3314,7 +3572,7 @@ Random rnd(301); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 110; ++j) { - ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 100 + j), rnd.RandomString(980))); } // flush should happen here ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); @@ -3352,9 +3610,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // It should be compacted to 10 files. @@ -3363,9 +3621,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. 
for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j + 2000), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } @@ -3393,10 +3651,10 @@ Random rnd(301); for (int i = 0; i < 3; i++) { // Each file contains a different key which will be dropped later. - ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500))); + ASSERT_OK(Put("a" + ToString(i), rnd.RandomString(500))); ASSERT_OK(Put("key" + ToString(i), "")); - ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500))); - Flush(); + ASSERT_OK(Put("z" + ToString(i), rnd.RandomString(500))); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 1); @@ -3405,10 +3663,10 @@ } for (int i = 0; i < 3; i++) { // Each file contains a different key which will be dropped later. - ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500))); + ASSERT_OK(Put("a" + ToString(i), rnd.RandomString(500))); ASSERT_OK(Delete("key" + ToString(i))); - ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500))); - Flush(); + ASSERT_OK(Put("z" + ToString(i), rnd.RandomString(500))); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 2); @@ -3418,17 +3676,21 @@ } // Check that FIFO-with-TTL is not supported with max_open_files != -1. +// Github issue #8014 TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleFIFO; options.create_if_missing = true; options.ttl = 600; // seconds - // TTL is now supported with max_open_files != -1. + // TTL is not supported with max_open_files != -1. 
+ options.max_open_files = 0; + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + options.max_open_files = 100; - options = CurrentOptions(options); - ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + // TTL is supported with unlimited max_open_files options.max_open_files = -1; ASSERT_OK(TryReopen(options)); } @@ -3460,13 +3722,14 @@ options.arena_block_size = 4096; options.compression = kNoCompression; options.create_if_missing = true; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; // Test to make sure that all files with expired ttl are deleted on next // manual compaction. { - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; options.ttl = 1 * 60 * 60 ; // 1 hour @@ -3477,25 +3740,22 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Sleep for 2 hours -- which is much greater than TTL. - // Note: Couldn't use SleepForMicroseconds because it takes an int instead - // of uint64_t. Hence used addon_time_ directly. - // env_->SleepForMicroseconds(2 * 60 * 60 * 1000 * 1000); - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Since no flushes and compactions have run, the db should still be in // the same state even after considerable time has passed. 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); } @@ -3512,15 +3772,15 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Sleep for 2 hours -- which is much greater than TTL. - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Just to make sure that we are in the same state even after sleeping. ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -3528,9 +3788,9 @@ // Create 1 more file to trigger TTL compaction. The old files are dropped. for (int i = 0; i < 1; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -3554,24 +3814,24 @@ for (int i = 0; i < 3; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 3); // Sleep for 2 hours -- which is much greater than TTL. - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Just to make sure that we are in the same state even after sleeping. 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 3); for (int i = 0; i < 5; i++) { for (int j = 0; j < 140; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // Size limit is still guaranteed. @@ -3592,9 +3852,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1 @@ -3603,7 +3863,7 @@ ASSERT_EQ(NumTableFilesAtLevel(0), 5); // Sleep for 2 hours -- which is much greater than TTL. - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Just to make sure that we are in the same state even after sleeping. ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 5); @@ -3611,9 +3871,9 @@ // Create 10 more files. The old 5 files are dropped as their ttl expired. for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 5); @@ -3636,9 +3896,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // It should be compacted to 10 files. 
@@ -3647,9 +3907,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j + 2000), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } @@ -3690,8 +3950,7 @@ uint64_t start = env_->NowMicros(); // Write ~96M data for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK( - Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo)); } uint64_t elapsed = env_->NowMicros() - start; double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed; @@ -3709,8 +3968,7 @@ start = env_->NowMicros(); // Write ~96M data for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK( - Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo)); } rate_limiter_drains = TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) - @@ -3735,8 +3993,7 @@ start = env_->NowMicros(); // Write ~96M data for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK( - Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo)); } elapsed = env_->NowMicros() - start; rate_limiter_drains = @@ -3753,13 +4010,66 @@ ASSERT_LT(ratio, 0.6); } +// This is a mocked customed rate limiter without implementing optional APIs +// (e.g, RateLimiter::GetTotalPendingRequests()) +class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter { + public: + MockedRateLimiterWithNoOptionalAPIImpl() {} + + ~MockedRateLimiterWithNoOptionalAPIImpl() override {} + + const char* Name() const override { + return "MockedRateLimiterWithNoOptionalAPI"; + } + void SetBytesPerSecond(int64_t bytes_per_second) override { + (void)bytes_per_second; + } + + using 
RateLimiter::Request; + void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats) override { + (void)bytes; + (void)pri; + (void)stats; + } + + int64_t GetSingleBurstBytes() const override { return 200; } + + int64_t GetTotalBytesThrough( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + (void)pri; + return 0; + } + + int64_t GetTotalRequests( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + (void)pri; + return 0; + } + + int64_t GetBytesPerSecond() const override { return 0; } +}; + +// To test that customed rate limiter not implementing optional APIs (e.g, +// RateLimiter::GetTotalPendingRequests()) works fine with RocksDB basic +// operations (e.g, Put, Get, Flush) +TEST_F(DBTest, CustomedRateLimiterWithNoOptionalAPIImplTest) { + Options options = CurrentOptions(); + options.rate_limiter.reset(new MockedRateLimiterWithNoOptionalAPIImpl()); + DestroyAndReopen(options); + ASSERT_OK(Put("abc", "def")); + ASSERT_EQ(Get("abc"), "def"); + ASSERT_OK(Flush()); + ASSERT_EQ(Get("abc"), "def"); +} + TEST_F(DBTest, TableOptionsSanitizeTest) { Options options = CurrentOptions(); options.create_if_missing = true; DestroyAndReopen(options); ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); Destroy(options); ASSERT_TRUE(!TryReopen(options).IsNotSupported()); @@ -3840,7 +4150,7 @@ ASSERT_OK(Put("abc", "def")); ASSERT_EQ("def", Get("abc")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("def", Get("abc")); } } @@ -3849,9 +4159,9 @@ std::vector threads; dbfull()->TEST_LockMutex(); auto w = dbfull()->TEST_BeginWrite(); - threads.emplace_back([&] { Put("a", "b"); }); + threads.emplace_back([&] { ASSERT_OK(Put("a", "b")); }); env_->SleepForMicroseconds(10000); - threads.emplace_back([&] { Flush(); }); + threads.emplace_back([&] { ASSERT_OK(Flush()); }); 
env_->SleepForMicroseconds(10000); dbfull()->TEST_UnlockMutex(); dbfull()->TEST_LockMutex(); @@ -3866,6 +4176,7 @@ TEST_F(DBTest, ConcurrentFlushWAL) { const size_t cnt = 100; Options options; + options.env = env_; WriteOptions wopt; ReadOptions ropt; for (bool two_write_queues : {false, true}) { @@ -3878,7 +4189,8 @@ threads.emplace_back([&] { for (size_t i = 0; i < cnt; i++) { auto istr = ToString(i); - db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, "b" + istr); + ASSERT_OK(db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, + "b" + istr)); } }); if (two_write_queues) { @@ -3886,14 +4198,15 @@ for (size_t i = cnt; i < 2 * cnt; i++) { auto istr = ToString(i); WriteBatch batch; - batch.Put("a" + istr, "b" + istr); - dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true); + ASSERT_OK(batch.Put("a" + istr, "b" + istr)); + ASSERT_OK( + dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true)); } }); } threads.emplace_back([&] { for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(false)); } }); for (auto& t : threads) { @@ -3913,6 +4226,39 @@ } } +// This test failure will be caught with a probability +TEST_F(DBTest, ManualFlushWalAndWriteRace) { + Options options; + options.env = env_; + options.manual_wal_flush = true; + options.create_if_missing = true; + + DestroyAndReopen(options); + + WriteOptions wopts; + wopts.sync = true; + + port::Thread writeThread([&]() { + for (int i = 0; i < 100; i++) { + auto istr = ToString(i); + ASSERT_OK(dbfull()->Put(wopts, "key_" + istr, "value_" + istr)); + } + }); + port::Thread flushThread([&]() { + for (int i = 0; i < 100; i++) { + ASSERT_OK(dbfull()->FlushWAL(false)); + } + }); + + writeThread.join(); + flushThread.join(); + ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1")); + ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2")); + Reopen(options); + ASSERT_EQ("value1", Get("foo1")); + ASSERT_EQ("value2", Get("foo2")); +} + #ifndef 
ROCKSDB_LITE TEST_F(DBTest, DynamicMemtableOptions) { const uint64_t k64KB = 1 << 16; @@ -3936,7 +4282,7 @@ const int kNumPutsBeforeWaitForFlush = 64; Random rnd(301); for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); // The following condition prevents a race condition between flush jobs // acquiring work and this thread filling up multiple memtables. Without @@ -3944,10 +4290,10 @@ // multiple memtables are flushed into a single L0 file. This race // condition affects assertion (A). if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); }; // Test write_buffer_size @@ -3957,7 +4303,7 @@ ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2); // Clean up L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); // Increase buffer size @@ -4010,7 +4356,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); while (!sleeping_task_low.WokenUp() && count < 256) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions())); count++; } ASSERT_GT(static_cast(count), 128 * 0.8); @@ -4023,14 +4369,14 @@ {"max_write_buffer_number", "8"}, })); // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); sleeping_task_low.Reset(); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); count = 0; while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + 
ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions())); count++; } // Windows fails this test. Will tune in the future and figure out @@ -4046,7 +4392,7 @@ {"max_write_buffer_number", "4"}, })); // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); sleeping_task_low.Reset(); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, @@ -4054,7 +4400,7 @@ count = 0; while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions())); count++; } // Windows fails this test. Will tune in the future and figure out @@ -4146,7 +4492,7 @@ true); } } - db_->DropColumnFamily(handles_[2]); + ASSERT_OK(db_->DropColumnFamily(handles_[2])); delete handles_[2]; handles_.erase(handles_.begin() + 2); env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, @@ -4188,17 +4534,19 @@ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); uint64_t num_running_flushes = 0; - db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes, + &num_running_flushes)); ASSERT_EQ(num_running_flushes, 0); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush // The first sync point is to make sure there's one flush job // running when we perform VerifyOperationCount(). 
TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1"); VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1); - db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes, + &num_running_flushes)); ASSERT_EQ(num_running_flushes, 1); // This second sync point is to ensure the flush job will not // be completed until we already perform VerifyOperationCount(). @@ -4241,15 +4589,15 @@ for (int file = 0; file < kNumL0Files; ++file) { for (int key = 0; key < kEntriesPerBuffer; ++key) { ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer), - RandomString(&rnd, kTestValueSize))); + rnd.RandomString(kTestValueSize))); } - Flush(); + ASSERT_OK(Flush()); } // This makes sure a compaction won't be scheduled until // we have done with the above Put Phase. uint64_t num_running_compactions = 0; - db_->GetIntProperty(DB::Properties::kNumRunningCompactions, - &num_running_compactions); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions)); ASSERT_EQ(num_running_compactions, 0); TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0"); ASSERT_GE(NumTableFilesAtLevel(0), @@ -4265,8 +4613,8 @@ // If thread tracking is not enabled, compaction count should be 0. VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0); } - db_->GetIntProperty(DB::Properties::kNumRunningCompactions, - &num_running_compactions); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions)); ASSERT_EQ(num_running_compactions, 1); // TODO(yhchiang): adding assert to verify each compaction stage. 
TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2"); @@ -4297,7 +4645,7 @@ ASSERT_EQ("1,1,1", FilesPerLevel(1)); // Compaction range overlaps files - Compact(1, "p1", "p9"); + Compact(1, "p", "q"); ASSERT_EQ("0,0,1", FilesPerLevel(1)); // Populate a different range @@ -4312,7 +4660,9 @@ MakeTables(1, "a", "z", 1); ASSERT_EQ("1,0,2", FilesPerLevel(1)); CancelAllBackgroundWork(db_); - db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr) + .IsShutdownInProgress()); ASSERT_EQ("1,0,2", FilesPerLevel(1)); if (iter == 0) { @@ -4389,10 +4739,10 @@ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; for (int file = 0; file < 16 * kNumL0Files; ++file) { for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(ToString(key++), rnd.RandomString(kTestValueSize))); } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4412,12 +4762,12 @@ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); CancelAllBackgroundWork(db_); TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Record the number of compactions at a time. 
for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { operation_count[i] = 0; } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4476,10 +4826,10 @@ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; for (int file = 0; file < 16 * kNumL0Files; ++file) { for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(ToString(key++), rnd.RandomString(kTestValueSize))); } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4499,12 +4849,12 @@ CancelAllBackgroundWork(db_); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Record the number of compactions at a time. 
for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { operation_count[i] = 0; } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4530,10 +4880,11 @@ for (int i = 0; i < kNKeys; i++) { keys[i] = i; } - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys)); Random rnd(301); Options options; + options.env = env_; options.create_if_missing = true; options.db_write_buffer_size = 20480; options.write_buffer_size = 20480; @@ -4563,8 +4914,8 @@ for (int i = 0; i < 20; i++) { ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 0); @@ -4576,10 +4927,11 @@ for (int i = 21; i < 120; i++) { ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U + 50U * 24); // Make sure data in files in L3 is not compacted by removing all files @@ -4613,7 +4965,7 @@ for (int i = 0; i < kNKeys; i++) { keys[i] = i; } - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys)); Random rnd(301); Options options; @@ -4626,7 +4978,7 @@ options.level0_stop_writes_trigger = 2; options.soft_pending_compaction_bytes_limit = 1024 * 1024; options.target_file_size_base = 20; - + options.env = env_; options.level_compaction_dynamic_level_bytes = true; options.max_bytes_for_level_base = 200; options.max_bytes_for_level_multiplier = 8; @@ -4662,17 +5014,17 @@ 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); for (int i = 0; i < 100; i++) { - std::string value = RandomString(&rnd, 200); + std::string value = rnd.RandomString(200); ASSERT_OK(Put(Key(keys[i]), value)); if (i % 25 == 24) { - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -4707,11 +5059,11 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); for (int i = 101; i < 500; i++) { - std::string value = RandomString(&rnd, 200); + std::string value = rnd.RandomString(200); ASSERT_OK(Put(Key(keys[i]), value)); if (i % 100 == 99) { - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -4759,9 +5111,9 @@ auto gen_l0_kb = [this](int start, int size, int stride) { Random rnd(301); for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(start + stride * i), rnd.RandomString(1024))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); }; // Write 3 files that have the same key range. 
@@ -4772,7 +5124,7 @@ gen_l0_kb(0, 64, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 2); gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -4791,7 +5143,7 @@ gen_l0_kb(0, 64, 1); ASSERT_EQ("1,1", FilesPerLevel()); gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,2", FilesPerLevel()); metadata.clear(); db_->GetLiveFilesMetaData(&metadata); @@ -4813,7 +5165,7 @@ for (int i = 0; i < 96; ++i) { gen_l0_kb(i, 64, 96); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(SizeAtLevel(1), k1MB / 2); ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); @@ -4834,7 +5186,7 @@ for (int i = 0; i < 20; ++i) { gen_l0_kb(i, 64, 32); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); ASSERT_TRUE(total_size < k128KB * 7 * 1.5); @@ -4842,8 +5194,8 @@ // Clean up memtable and L0. Block compaction threads. If continue to write // and flush memtables. 
We should see put stop after 8 memtable flushes // since level0_stop_writes_trigger = 8 - dbfull()->TEST_FlushMemTable(true, true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Block compaction test::SleepingBackgroundTask sleeping_task_low; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, @@ -4854,8 +5206,8 @@ Random rnd(301); WriteOptions wo; while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo)); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4871,8 +5223,8 @@ // Block compaction thread again. Perform the put and memtable flushes // until we see the stop after 6 memtable flushes. ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}})); - dbfull()->TEST_FlushMemTable(true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); // Block compaction again @@ -4882,8 +5234,8 @@ sleeping_task_low.WaitUntilSleeping(); count = 0; while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo)); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4900,29 +5252,29 @@ // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of // L0 files do not change after the call. 
ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}})); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 4); // Enable auto compaction and perform the same test, # of L0 files should be // reduced after compaction. ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_LT(NumTableFilesAtLevel(0), 4); } @@ -4936,6 +5288,7 @@ Options options; options.ttl = 0; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -4997,6 +5350,7 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { Options options; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -5075,12 +5429,13 @@ DestroyAndReopen(options); Random rnd(301); - const int kCDTKeysPerBuffer = 4; - const int kTestSize = kCDTKeysPerBuffer * 4096; - const int kTotalIteration = 100; + constexpr int kCDTKeysPerBuffer = 4; + constexpr int 
kTestSize = kCDTKeysPerBuffer * 4096; + constexpr int kTotalIteration = 20; // the second half of the test involves in random failure // of file creation. - const int kRandomFailureTest = kTotalIteration / 2; + constexpr int kRandomFailureTest = kTotalIteration / 2; + std::vector values; for (int i = 0; i < kTestSize; ++i) { values.push_back("NOT_FOUND"); @@ -5091,7 +5446,7 @@ } for (int k = 0; k < kTestSize; ++k) { // here we expect some of the Put fails. - std::string value = RandomString(&rnd, 100); + std::string value = rnd.RandomString(100); Status s = Put(Key(k), Slice(value)); if (s.ok()) { // update the latest successful put @@ -5105,8 +5460,8 @@ } // If rocksdb does not do the correct job, internal assert will fail here. - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(dbfull()->TEST_WaitForFlushMemTable().IsIOError()); + ASSERT_TRUE(dbfull()->TEST_WaitForCompact().IsIOError()); // verify we have the latest successful update for (int k = 0; k < kTestSize; ++k) { @@ -5140,11 +5495,11 @@ int key1 = key_start + 1; int key2 = key_start + 2; Random rnd(301); - ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); + ASSERT_OK(Put(Key(key0), rnd.RandomString(8))); for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + ASSERT_OK(Put(Key(key1), rnd.RandomString(8))); } - ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); + ASSERT_OK(Put(Key(key2), rnd.RandomString(8))); std::unique_ptr iter(db_->NewIterator(ReadOptions())); iter->Seek(Key(key1)); ASSERT_TRUE(iter->Valid()); @@ -5160,14 +5515,14 @@ ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}})); // Clear memtable and make new option effective - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // Trigger reseek assert_reseek_count(200, 1); ASSERT_OK( dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}})); // Clear memtable and make new option effective - 
dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // No reseek assert_reseek_count(300, 1); @@ -5210,45 +5565,56 @@ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], &mutable_cf_options)); ASSERT_TRUE(mutable_cf_options.report_bg_io_stats); + ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order); + + ASSERT_OK(dbfull()->SetOptions( + handles_[1], {{"check_flush_compaction_key_order", "false"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order); } #endif // ROCKSDB_LITE TEST_F(DBTest, L0L1L2AndUpHitCounter) { + const int kNumLevels = 3; + const int kNumKeysPerLevel = 10000; + const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel; + Options options = CurrentOptions(); - options.write_buffer_size = 32 * 1024; - options.target_file_size_base = 32 * 1024; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 64 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); + Reopen(options); - int numkeys = 20000; - for (int i = 0; i < numkeys; i++) { - ASSERT_OK(Put(1, Key(i), "val")); + // After the below loop there will be one file on each of L0, L1, and L2. + int key = 0; + for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) { + for (int i = 0; i < kNumKeysPerLevel; ++i) { + ASSERT_OK(Put(Key(key), "val")); + key++; + } + ASSERT_OK(Flush()); + for (int input_level = 0; input_level < output_level; ++input_level) { + // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to + // `input_level + 1`. 
+ ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr)); + } } + assert(key == kNumKeysPerDb); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - - for (int i = 0; i < numkeys; i++) { - ASSERT_EQ(Get(1, Key(i)), "val"); + for (int i = 0; i < kNumKeysPerDb; i++) { + ASSERT_EQ(Get(Key(i)), "val"); } - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + - TestGetTickerCount(options, GET_HIT_L1) + - TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); } TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { @@ -5284,7 +5650,7 @@ Random rnd(301); for (int i = 0; i < kNumKeysWritten; ++i) { // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); } table_options.format_version = first_table_version == 1 ? 2 : 1; @@ -5319,12 +5685,20 @@ &sleeping_task_high, Env::Priority::HIGH); std::vector filenames; - env_->GetChildren(dbname_, &filenames); + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + // In Windows, LOCK file cannot be deleted because it is locked by db_test + // After closing db_test, the LOCK file is unlocked and can be deleted // Delete archival files. 
+ bool deleteDir = true; for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(dbname_ + "/" + filenames[i]); + Status s = env_->DeleteFile(dbname_ + "/" + filenames[i]); + if (!s.ok()) { + deleteDir = false; + } + } + if (deleteDir) { + ASSERT_OK(env_->DeleteDir(dbname_)); } - env_->DeleteDir(dbname_); DestroyAndReopen(options); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -5360,9 +5734,10 @@ public: explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} - bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + bool FullMergeV2(const MergeOperationInput& merge_in, MergeOperationOutput* merge_out) const override { - db_test_->env_->addon_time_.fetch_add(1000); + db_test_->env_->MockSleepForMicroseconds(1000 * + merge_in.operand_list.size()); merge_out->new_value = ""; return true; } @@ -5378,16 +5753,16 @@ // Enable time profiling SetPerfLevel(kEnableTime); - this->env_->addon_time_.store(0); - this->env_->time_elapse_only_sleep_ = true; - this->env_->no_slowdown_ = true; Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.merge_operator.reset(new DelayedMergeOperator(this)); + SetTimeElapseOnlySleepOnReopen(&options); DestroyAndReopen(options); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); - db_->Put(WriteOptions(), "foo", one); + ASSERT_OK(db_->Put(WriteOptions(), "foo", one)); ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "foo", two)); ASSERT_OK(Flush()); @@ -5398,9 +5773,9 @@ opt.verify_checksums = true; opt.snapshot = nullptr; std::string result; - db_->Get(opt, "foo", &result); + ASSERT_OK(db_->Get(opt, "foo", &result)); - ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); + ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); ReadOptions read_options; std::unique_ptr iter(db_->NewIterator(read_options)); 
@@ -5411,11 +5786,10 @@ } ASSERT_EQ(1, count); - ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); + ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); #ifdef ROCKSDB_USING_THREAD_STATUS ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0); #endif // ROCKSDB_USING_THREAD_STATUS - this->env_->time_elapse_only_sleep_ = false; } #ifndef ROCKSDB_LITE @@ -5425,18 +5799,24 @@ options.compaction_filter_factory = std::make_shared(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.merge_operator.reset(new DelayedMergeOperator(this)); - options.compaction_style = kCompactionStyleUniversal; + options.disable_auto_compactions = true; options.max_subcompactions = max_subcompactions_; + SetTimeElapseOnlySleepOnReopen(&options); DestroyAndReopen(options); - for (int i = 0; i < 1000; i++) { + constexpr unsigned n = 1000; + for (unsigned i = 0; i < n; i++) { ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST")); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); + ASSERT_EQ(uint64_t{n} * 1000000U, + TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); } TEST_P(DBTestWithParam, FilterCompactionTimeTest) { @@ -5448,14 +5828,17 @@ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.statistics->set_stats_level(kExceptTimeForMutex); options.max_subcompactions = max_subcompactions_; + SetTimeElapseOnlySleepOnReopen(&options); DestroyAndReopen(options); + unsigned n = 0; // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); + ++n; } - 
Flush(); + ASSERT_OK(Flush()); } CompactRangeOptions cro; @@ -5467,7 +5850,9 @@ Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); - ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); + ASSERT_OK(itr->status()); + ASSERT_EQ(uint64_t{n} * 1000000U, + TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME)); delete itr; } #endif // ROCKSDB_LITE @@ -5480,7 +5865,7 @@ Reopen(options); for (int i = 0; i < 100000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); // only 2 memtables will be alive, so logs_to_free needs to always be below // 2 ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); @@ -5500,7 +5885,7 @@ #endif // ROCKSDB_LITE #ifndef ROCKSDB_LITE -TEST_F(DBTest, SuggestCompactRangeTest) { +TEST_F(DBTest, DISABLED_SuggestCompactRangeTest) { class CompactionFilterFactoryGetContext : public CompactionFilterFactory { public: std::unique_ptr CreateCompactionFilter( @@ -5521,8 +5906,8 @@ }; Options options = CurrentOptions(); - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); options.compaction_style = kCompactionStyleLevel; options.compaction_filter_factory.reset( new CompactionFilterFactoryGetContext()); @@ -5578,7 +5963,7 @@ // compact it three times for (int i = 0; i < 3; ++i) { ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // All files are compacted @@ -5591,7 +5976,7 @@ // nonoverlapping with the file on level 0 Slice start("a"), end("b"); ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // should not compact the level 0 file ASSERT_EQ(1, NumTableFilesAtLevel(0)); @@ -5599,7 +5984,7 @@ start = Slice("j"); end = Slice("m"); 
ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual( options.compaction_filter_factory.get())); @@ -5608,6 +5993,7 @@ ASSERT_EQ(1, NumTableFilesAtLevel(1)); } + TEST_F(DBTest, PromoteL0) { Options options = CurrentOptions(); options.disable_auto_compactions = true; @@ -5624,7 +6010,7 @@ std::map values; for (const auto& range : ranges) { for (int32_t j = range.first; j < range.second; j++) { - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } ASSERT_OK(Flush()); @@ -5685,8 +6071,8 @@ Random rnd(301); for (int i = 0; i < kNumL0Files; ++i) { - ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); - Flush(); + ASSERT_OK(Put(Key(0), rnd.RandomString(1024))); + ASSERT_OK(Flush()); } ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files); ASSERT_EQ(NumTableFilesAtLevel(1), 0); @@ -5724,7 +6110,7 @@ for (int i = 0; i < 2; ++i) { // put two keys to ensure no trivial move for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5738,7 +6124,7 @@ for (int i = 0; i < kNumL0Files; ++i) { // put two keys to ensure no trivial move for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5750,7 +6136,7 @@ } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } #ifndef ROCKSDB_LITE @@ -5767,7 +6153,7 @@ for (int i = 0; i < 2; ++i) { // put two keys to ensure no trivial move for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5790,6 
+6176,7 @@ port::Thread manual_compaction_thread([&]() { auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(), input_files, 0); + ASSERT_OK(s); }); TEST_SYNC_POINT( @@ -5797,7 +6184,7 @@ // generate enough files to trigger compaction for (int i = 0; i < 20; ++i) { for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5808,7 +6195,7 @@ "DBTest::CompactFilesShouldTriggerAutoCompaction:End"); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data); ASSERT_LE(cf_meta_data.levels[0].files.size(), @@ -5833,7 +6220,7 @@ for (;;) { std::string data(3000, j++ % 127 + 20); data += ToString(j); - batch.Put(handles_[0], Slice(data), Slice(data)); + ASSERT_OK(batch.Put(handles_[0], Slice(data), Slice(data))); if (batch.GetDataSize() > write_size) { break; } @@ -5918,7 +6305,6 @@ Options options = CurrentOptions(); env_->SetBackgroundThreads(1, Env::LOW); options.env = env_; - env_->no_slowdown_ = true; options.write_buffer_size = 100000000; options.max_write_buffer_number = 256; options.max_background_compactions = 1; @@ -5927,8 +6313,9 @@ options.level0_stop_writes_trigger = 999999; options.delayed_write_rate = 20000000; // Start with 200MB/s options.memtable_factory.reset( - new SpecialSkipListFactory(kEntriesPerMemTable)); + test::NewSpecialSkipListFactory(kEntriesPerMemTable)); + SetTimeElapseOnlySleepOnReopen(&options); CreateAndReopenWithCF({"pikachu"}, options); // Block compactions @@ -5937,14 +6324,14 @@ Env::Priority::LOW); for (int i = 0; i < 3; i++) { - Put(Key(i), std::string(10000, 'x')); - Flush(); + ASSERT_OK(Put(Key(i), std::string(10000, 'x'))); + ASSERT_OK(Flush()); } // These writes will be slowed down to 1KB/s uint64_t estimated_sleep_time = 0; Random rnd(301); - Put("", ""); + ASSERT_OK(Put("", "")); 
uint64_t cur_rate = options.delayed_write_rate; for (int i = 0; i < kTotalFlushes; i++) { uint64_t size_memtable = 0; @@ -5953,26 +6340,23 @@ // Spread the size range to more. size_t entry_size = rand_num * rand_num * rand_num; WriteOptions wo; - Put(Key(i), std::string(entry_size, 'x'), wo); + ASSERT_OK(Put(Key(i), std::string(entry_size, 'x'), wo)); size_memtable += entry_size + 18; // Occasionally sleep a while if (rnd.Uniform(20) == 6) { env_->SleepForMicroseconds(2666); } } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); estimated_sleep_time += size_memtable * 1000000u / cur_rate; // Slow down twice. One for memtable switch and one for flush finishes. cur_rate = static_cast(static_cast(cur_rate) * kIncSlowdownRatio * kIncSlowdownRatio); } // Estimate the total sleep time fall into the rough range. - ASSERT_GT(env_->addon_time_.load(), - static_cast(estimated_sleep_time / 2)); - ASSERT_LT(env_->addon_time_.load(), - static_cast(estimated_sleep_time * 2)); + ASSERT_GT(env_->NowMicros(), estimated_sleep_time / 2); + ASSERT_LT(env_->NowMicros(), estimated_sleep_time * 2); - env_->no_slowdown_ = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); @@ -5992,7 +6376,7 @@ options.max_bytes_for_level_base = 10000000000u; options.max_background_compactions = 1; options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); env_->SetBackgroundThreads(1, Env::LOW); test::SleepingBackgroundTask sleeping_task_low; @@ -6013,14 +6397,14 @@ int key_idx = 0; for (int num = 0; num < 5; num++) { GenerateNewFile(&rnd, &key_idx, true); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } ASSERT_EQ(0, callback_count.load()); for (int num = 0; num < 5; num++) { GenerateNewFile(&rnd, &key_idx, true); - 
dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } ASSERT_GE(callback_count.load(), 1); @@ -6102,25 +6486,25 @@ // Generating 360KB in Level 3 for (int i = 0; i < 72; i++) { - Put(Key(i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(i), std::string(5000, 'x'))); if (i % 10 == 0) { - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); // Generating 360KB in Level 2 for (int i = 0; i < 72; i++) { - Put(Key(i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(i), std::string(5000, 'x'))); if (i % 10 == 0) { - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(2); - Put(Key(0), ""); + ASSERT_OK(Put(Key(0), "")); test::SleepingBackgroundTask sleeping_task_low; // Block compactions @@ -6130,11 +6514,11 @@ // Create 3 L0 files, making score of L0 to be 3. for (int i = 0; i < 3; i++) { - Put(Key(i), std::string(5000, 'x')); - Put(Key(100 - i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(i), std::string(5000, 'x'))); + ASSERT_OK(Put(Key(100 - i), std::string(5000, 'x'))); // Flush the file. File size is around 30KB. InstallFlushCallback(); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); WaitForFlush(); } ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); @@ -6143,7 +6527,7 @@ sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); sleeping_task_low.Reset(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Now there is one L1 file but doesn't trigger soft_rate_limit // The L1 file size is around 30KB. 
@@ -6165,11 +6549,11 @@ sleeping_task_low.WaitUntilSleeping(); // Create 3 L0 files, making score of L0 to be 3 for (int i = 0; i < 3; i++) { - Put(Key(10 + i), std::string(5000, 'x')); - Put(Key(90 - i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x'))); + ASSERT_OK(Put(Key(90 - i), std::string(5000, 'x'))); // Flush the file. File size is around 30KB. InstallFlushCallback(); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); WaitForFlush(); } @@ -6188,11 +6572,11 @@ // Create 3 L0 files, making score of L0 to be 3, higher than L0. for (int i = 0; i < 3; i++) { - Put(Key(20 + i), std::string(5000, 'x')); - Put(Key(80 - i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(20 + i), std::string(5000, 'x'))); + ASSERT_OK(Put(Key(80 - i), std::string(5000, 'x'))); // Flush the file. File size is around 30KB. InstallFlushCallback(); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); WaitForFlush(); } // Wake up sleep task to enable compaction to run and waits @@ -6220,8 +6604,8 @@ {"max_bytes_for_level_base", "5000"}, })); - Put("", ""); - Flush(); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); @@ -6241,7 +6625,7 @@ options.disable_auto_compactions = true; int kNumKeysPerMemtable = 3; options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); Reopen(options); test::SleepingBackgroundTask sleeping_task; @@ -6254,12 +6638,12 @@ for (int i = 0; i < 3; i++) { // Fill one mem table for (int j = 0; j < kNumKeysPerMemtable; j++) { - Put(Key(j), ""); + ASSERT_OK(Put(Key(j), "")); } ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); } // Inserting a new entry would create a new mem table, triggering slow down. 
- Put(Key(0), ""); + ASSERT_OK(Put(Key(0), "")); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); sleeping_task.WakeUp(); @@ -6404,18 +6788,18 @@ std::vector threads; std::atomic done(false); - db_->PauseBackgroundWork(); + ASSERT_OK(db_->PauseBackgroundWork()); threads.emplace_back([&]() { Random rnd(301); for (int i = 0; i < 10000; ++i) { - Put(RandomString(&rnd, 10), RandomString(&rnd, 10)); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } done.store(true); }); env_->SleepForMicroseconds(200000); // make sure the thread is not done ASSERT_FALSE(done.load()); - db_->ContinueBackgroundWork(); + ASSERT_OK(db_->ContinueBackgroundWork()); for (auto& t : threads) { t.join(); } @@ -6450,6 +6834,7 @@ { port::Thread tmp_thread([&] { auto it = db_->NewIterator(ReadOptions()); + ASSERT_OK(it->status()); delete it; }); tmp_thread.join(); @@ -6486,10 +6871,11 @@ Options options = CurrentOptions(); options.max_open_files = -1; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); bool set_file_creation_time_to_zero = true; @@ -6500,7 +6886,7 @@ const uint64_t uint_time_1 = static_cast(time_1); // Add 50 hours - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); int64_t time_2 = 0; env_->GetCurrentTime(&time_2); @@ -6538,9 +6924,9 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } // At this point there should be 2 files, one with file_creation_time = 0 and @@ -6554,18 +6940,18 @@ set_file_creation_time_to_zero = false; options = CurrentOptions(); options.max_open_files = -1; - env_->time_elapse_only_sleep_ = false; options.env = env_; - 
env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } // At this point there should be 2 files with non-zero file creation time. @@ -6585,18 +6971,50 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) { + Options options = CurrentOptions(); + options.max_write_buffer_size_to_maintain = 10000; + options.write_buffer_size = 160000; + Reopen(options); + Random rnd(301); + bool memory_limit_exceeded = false; + + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + + for (int i = 0; i < 1000; i++) { + std::string value = rnd.RandomString(1000); + ASSERT_OK(Put("keykey_" + std::to_string(i), value)); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage(); + const uint64_t size_all_mem_table = + cur_active_mem + cfd->imm()->ApproximateMemoryUsage(); + + // Errors out if memory usage keeps on increasing beyond the limit. + // Once memory limit exceeds, memory_limit_exceeded is set and if + // size_all_mem_table doesn't drop out in the next write then it errors out + // (not expected behaviour). If memory usage drops then + // memory_limit_exceeded is set to false. 
+ if ((size_all_mem_table > cur_active_mem) && + (cur_active_mem >= + static_cast(options.max_write_buffer_size_to_maintain)) && + (size_all_mem_table > + static_cast(options.max_write_buffer_size_to_maintain) + + options.write_buffer_size)) { + ASSERT_FALSE(memory_limit_exceeded); + memory_limit_exceeded = true; + } else { + memory_limit_exceeded = false; + } + } +} + #endif } // namespace ROCKSDB_NAMESPACE -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); -} -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS - int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test2.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test2.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test2.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test2.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,31 +6,212 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ #include #include #include +#include #include "db/db_test_util.h" #include "db/read_callback.h" +#include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" #include "rocksdb/persistent_cache.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/trace_record_result.h" +#include "rocksdb/utilities/replayer.h" #include "rocksdb/wal_filter.h" -#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { class DBTest2 : public DBTestBase { public: - DBTest2() : DBTestBase("/db_test2") {} + DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {} + + protected: +#ifndef ROCKSDB_LITE + uint64_t GetSstSizeHelper(Temperature temperature) { + std::string prop; + EXPECT_TRUE( + dbfull()->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature + + ToString(static_cast(temperature)), + &prop)); + return static_cast(std::atoi(prop.c_str())); + } +#endif // ROCKSDB_LITE +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, OpenForReadOnly) { + DB* db_ptr = nullptr; + std::string dbname = test::PerThreadDBPath("db_readonly"); + Options options = CurrentOptions(); + options.create_if_missing = true; + // OpenForReadOnly should fail but will create in the file system + ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr)); + // Since is created, we should be able to delete the dir + // We first get the list files under + // There should not be any subdirectories -- this is not checked here + std::vector files; + ASSERT_OK(env_->GetChildren(dbname, &files)); + for (auto& f : files) { + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); + } + // should be empty now and we should be able to delete it + ASSERT_OK(env_->DeleteDir(dbname)); + options.create_if_missing = false; + // OpenForReadOnly should fail since was successfully deleted + ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr)); + 
// With create_if_missing false, there should not be a dir in the file system + ASSERT_NOK(env_->FileExists(dbname)); +} + +TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { + DB* db_ptr = nullptr; + std::string dbname = test::PerThreadDBPath("db_readonly"); + Options options = CurrentOptions(); + options.create_if_missing = true; + + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.push_back(ColumnFamilyDescriptor("goku", cf_options)); + std::vector handles; + // OpenForReadOnly should fail but will create in the file system + ASSERT_NOK( + DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr)); + // Since is created, we should be able to delete the dir + // We first get the list files under + // There should not be any subdirectories -- this is not checked here + std::vector files; + ASSERT_OK(env_->GetChildren(dbname, &files)); + for (auto& f : files) { + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); + } + // should be empty now and we should be able to delete it + ASSERT_OK(env_->DeleteDir(dbname)); + options.create_if_missing = false; + // OpenForReadOnly should fail since was successfully deleted + ASSERT_NOK( + DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr)); + // With create_if_missing false, there should not be a dir in the file system + ASSERT_NOK(env_->FileExists(dbname)); +} + +class TestReadOnlyWithCompressedCache + : public DBTestBase, + public testing::WithParamInterface> { + public: + TestReadOnlyWithCompressedCache() + : DBTestBase("test_readonly_with_compressed_cache", + /*env_do_fsync=*/true) { + max_open_files_ = std::get<0>(GetParam()); + use_mmap_ = std::get<1>(GetParam()); + } + int max_open_files_; + bool use_mmap_; +}; + +TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) { + if (use_mmap_ && !IsMemoryMappedAccessSupported()) { + 
ROCKSDB_GTEST_SKIP("Test requires MMAP support"); + return; + } + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar")); + ASSERT_OK(Flush()); + + DB* db_ptr = nullptr; + Options options = CurrentOptions(); + options.allow_mmap_reads = use_mmap_; + options.max_open_files = max_open_files_; + options.compression = kSnappyCompression; + BlockBasedTableOptions table_options; + table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + table_options.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr)); + + std::string v; + ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("bar", v); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); + ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("bar", v); + if (Snappy_Supported()) { + if (use_mmap_) { + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); + } else { + ASSERT_EQ(1, + options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); + } + } + + delete db_ptr; +} + +INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache, + TestReadOnlyWithCompressedCache, + ::testing::Combine(::testing::Values(-1, 100), + ::testing::Bool())); + +class PartitionedIndexTestListener : public EventListener { + public: + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + ASSERT_GT(info.table_properties.index_partitions, 1); + ASSERT_EQ(info.table_properties.index_key_is_user_key, 0); + } }; +TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { + const int kValueSize = 10500; + const int kNumEntriesPerFile = 1000; + const int kNumFiles = 3; + const int kNumDistinctKeys = 30; + + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + table_options.index_type = 
BlockBasedTableOptions::kTwoLevelIndexSearch; + PartitionedIndexTestListener* listener = new PartitionedIndexTestListener(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.listeners.emplace_back(listener); + std::vector snapshots; + Reopen(options); + Random rnd(301); + + for (int i = 0; i < kNumFiles; i++) { + for (int j = 0; j < kNumEntriesPerFile; j++) { + int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys; + std::string value = rnd.RandomString(kValueSize); + ASSERT_OK(Put("keykey_" + std::to_string(key_id), value)); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Flush()); + } + + for (auto s : snapshots) { + db_->ReleaseSnapshot(s); + } +} + +#endif // ROCKSDB_LITE + class PrefixFullBloomWithReverseComparator : public DBTestBase, public ::testing::WithParamInterface { public: PrefixFullBloomWithReverseComparator() - : DBTestBase("/prefix_bloom_reverse") {} + : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {} void SetUp() override { if_cache_filter_ = GetParam(); } bool if_cache_filter_; }; @@ -56,7 +237,7 @@ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2")); ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); if (bbto.block_cache) { bbto.block_cache->EraseUnRefEntries(); @@ -88,18 +269,20 @@ PrefixFullBloomWithReverseComparator, testing::Bool()); TEST_F(DBTest2, IteratorPropertyVersionNumber) { - Put("", ""); + ASSERT_OK(Put("", "")); Iterator* iter1 = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter1->status()); std::string prop_value; ASSERT_OK( iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number1 = static_cast(std::atoi(prop_value.c_str())); - Put("", ""); - Flush(); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); Iterator* iter2 = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter2->status()); ASSERT_OK( 
iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number2 = @@ -107,9 +290,10 @@ ASSERT_GT(version_number2, version_number1); - Put("", ""); + ASSERT_OK(Put("", "")); Iterator* iter3 = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter3->status()); ASSERT_OK( iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number3 = @@ -136,11 +320,11 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); TryReopenWithColumnFamilies({"default", "pikachu"}, options); @@ -156,10 +340,10 @@ options.merge_operator = MergeOperators::CreatePutOperator(); options.disable_auto_compactions = true; DestroyAndReopen(options); - Put("poi", "Finch"); - db_->Merge(WriteOptions(), "poi", "Reese"); - db_->Merge(WriteOptions(), "poi", "Shaw"); - db_->Merge(WriteOptions(), "poi", "Root"); + ASSERT_OK(Put("poi", "Finch")); + ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese")); + ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw")); + ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root")); options.max_successive_merges = 2; Reopen(options); } @@ -170,7 +354,7 @@ public testing::WithParamInterface> { public: DBTestSharedWriteBufferAcrossCFs() - : DBTestBase("/db_test_shared_write_buffer") {} + : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {} void SetUp() override { use_old_interface_ = std::get<0>(GetParam()); cost_cache_ = std::get<1>(GetParam()); @@ -182,6 +366,10 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { Options options = CurrentOptions(); 
options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. + options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 @@ -217,14 +405,15 @@ wo.disableWAL = true; std::function wait_flush = [&]() { - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); }; // Create some data and flush "default" and "nikitich" so that they // are newer CFs created. + flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); Flush(3); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); @@ -235,6 +424,7 @@ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); @@ -359,6 +549,10 @@ std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2"); Options options = CurrentOptions(); options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. 
+ options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -389,13 +583,14 @@ wo.disableWAL = true; std::function wait_flush = [&]() { - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - static_cast(db2)->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); }; // Trigger a flush on cf2 + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(2, Key(1), DummyString(70000), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); @@ -407,7 +602,7 @@ ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); - static_cast(db2)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") + GetNumberOfSstFilesForColumnFamily(db_, "cf1") + @@ -438,7 +633,7 @@ wait_flush(); ASSERT_OK(db2->Put(wo, Key(1), DummyString(1))); wait_flush(); - static_cast(db2)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); @@ -562,9 +757,9 @@ for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { - batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } WalFilter::WalProcessingOption wal_processing_option = @@ -583,14 +778,14 @@ 
TryReopenWithColumnFamilies({ "default", "pikachu" }, options); if (wal_processing_option == WalFilter::WalProcessingOption::kCorruptedRecord) { - assert(!status.ok()); + ASSERT_NOK(status); // In case of corruption we can turn off paranoid_checks to reopen // databse options.paranoid_checks = false; ReopenWithColumnFamilies({ "default", "pikachu" }, options); } else { - assert(status.ok()); + ASSERT_OK(status); } // Compute which keys we expect to be found @@ -647,7 +842,7 @@ break; } default: - assert(false); // unhandled case + FAIL(); // unhandled case } bool checked_after_reopen = false; @@ -690,7 +885,7 @@ num_keys_added_(0) {} void Put(const Slice& key, const Slice& value) override { if (num_keys_added_ < num_keys_to_add_in_new_batch_) { - new_write_batch_->Put(key, value); + ASSERT_OK(new_write_batch_->Put(key, value)); ++num_keys_added_; } } @@ -717,8 +912,12 @@ bool* batch_changed) const override { if (current_record_index_ >= change_records_from_index_) { ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_); - batch.Iterate(&handler); - *batch_changed = true; + Status s = batch.Iterate(&handler); + if (s.ok()) { + *batch_changed = true; + } else { + assert(false); + } } // Filter is passed as a const object for RocksDB to not modify the @@ -750,9 +949,9 @@ for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { - batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // Create a test filter that would apply wal_processing_option at the first @@ -811,8 +1010,12 @@ WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) const override { *new_batch = batch; - new_batch->Put("key_extra", "value_extra"); - *batch_changed = true; + Status s = 
new_batch->Put("key_extra", "value_extra"); + if (s.ok()) { + *batch_changed = true; + } else { + assert(false); + } return WalProcessingOption::kContinueProcessing; } @@ -838,9 +1041,9 @@ for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { - batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // Create a test filter that would add extra keys @@ -923,7 +1126,11 @@ } } handler(log_number, cf_log_number_map_, cf_wal_keys_); - batch.Iterate(&handler); + Status s = batch.Iterate(&handler); + if (!s.ok()) { + // TODO(AR) is this ok? + return WalProcessingOption::kCorruptedRecord; + } return WalProcessingOption::kContinueProcessing; } @@ -958,14 +1165,16 @@ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { - batch.Put(handles_[0], batch_keys_pre_flush[i][j], DummyString(1024)); - batch.Put(handles_[1], batch_keys_pre_flush[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j], + DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j], + DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } //Flush default column-family - db_->Flush(FlushOptions(), handles_[0]); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[0])); // Do some more writes std::vector> batch_keys_post_flush(3); @@ -981,10 +1190,12 @@ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { - batch.Put(handles_[0], batch_keys_post_flush[i][j], DummyString(1024)); - batch.Put(handles_[1], batch_keys_post_flush[i][j], DummyString(1024)); + 
ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j], + DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j], + DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // On Recovery we should only find the second batch applicable to default CF @@ -1011,10 +1222,10 @@ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_post_flush[i][j]); - ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } - ASSERT_TRUE(index == keys_cf.size()); + ASSERT_EQ(index, keys_cf.size()); index = 0; keys_cf = cf_wal_keys[name_id_map["pikachu"]]; @@ -1023,7 +1234,7 @@ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_pre_flush[i][j]); - ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } @@ -1031,10 +1242,10 @@ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_post_flush[i][j]); - ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } - ASSERT_TRUE(index == keys_cf.size()); + ASSERT_EQ(index, keys_cf.size()); } TEST_F(DBTest2, PresetCompressionDict) { @@ -1054,7 +1265,7 @@ options.disable_auto_compactions = true; options.level0_file_num_compaction_trigger = kNumL0Files; options.memtable_factory.reset( - new SpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes)); + test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes)); options.num_levels = 2; options.target_file_size_base = kL0FileBytes; options.target_file_size_multiplier = 2; @@ -1121,7 +1332,7 @@ std::string seq_datas[10]; for (int j = 0; j < 10; ++j) { seq_datas[j] = - RandomString(&rnd, kBlockSizeBytes - 
kApproxPerBlockOverheadBytes); + rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes); } ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); @@ -1131,11 +1342,11 @@ ASSERT_OK(Put(1, Key(static_cast(key_num)), seq_datas[(key_num / 10) % 10])); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1)); } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); @@ -1189,14 +1400,14 @@ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); Random rnd(301); for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kNumEntriesPerFile; ++j) { ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j), - RandomString(&rnd, kNumBytesPerEntry))); + rnd.RandomString(kNumBytesPerEntry))); } ASSERT_OK(Flush()); MoveFilesToLevel(1); @@ -1234,6 +1445,236 @@ } } +class PresetCompressionDictTest + : public DBTestBase, + public testing::WithParamInterface> { + public: + PresetCompressionDictTest() + : DBTestBase("db_test2", false /* env_do_fsync */), + compression_type_(std::get<0>(GetParam())), + bottommost_(std::get<1>(GetParam())) {} + + protected: + const CompressionType compression_type_; + const bool bottommost_; +}; + +INSTANTIATE_TEST_CASE_P( + DBTest2, PresetCompressionDictTest, + ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()), + ::testing::Bool())); + +TEST_P(PresetCompressionDictTest, Flush) { + // Verifies that dictionary is generated and written during 
flush only when + // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the + // size of the dictionary is within expectations according to the limit on + // buffering set by `CompressionOptions::max_dict_buffer_bytes`. + const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile)); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t i = 0; i <= kKeysPerFile; ++i) { + ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the flush finishes. + if (bottommost_) { + // Flush is never considered bottommost. This should change in the future + // since flushed files may have nothing underneath them, like the one in + // this test case. 
+ ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactNonBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when `ColumnFamilyOptions::compression` enables + // dictionary. Also verifies the size of the dictionary is within expectations + // according to the limit on buffering set by + // `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + // This L0->L1 compaction merges the two L0 files into L1. The produced L1 + // file is not bottommost due to the existing L2 file covering the same key- + // range. + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the compaction finishes. 
+ if (bottommost_) { + ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when either `ColumnFamilyOptions::compression` or + // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also + // verifies the size of the dictionary is within expectations according to the + // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in ZSTD's + // digested dictionary format. 
+ if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit after + // each block is built. + ASSERT_LE( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } +} + class CompactionCompressionListener : public EventListener { public: explicit CompactionCompressionListener(Options* db_options) @@ -1244,9 +1685,9 @@ int bottommost_level = 0; for (int level = 0; level < db->NumberLevels(); level++) { std::string files_at_level; - ASSERT_TRUE( - db->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), - &files_at_level)); + ASSERT_TRUE(db->GetProperty( + "rocksdb.num-files-at-level" + ROCKSDB_NAMESPACE::ToString(level), + &files_at_level)); if (files_at_level != "0") { bottommost_level = level; } @@ -1268,6 +1709,151 @@ const Options* db_options_; }; +enum CompressionFailureType { + kTestCompressionFail, + kTestDecompressionFail, + kTestDecompressionCorruption +}; + +class CompressionFailuresTest + : public DBTest2, + public testing::WithParamInterface> { + public: + CompressionFailuresTest() { + std::tie(compression_failure_type_, compression_type_, + compression_max_dict_bytes_, compression_parallel_threads_) = + GetParam(); + } + + CompressionFailureType compression_failure_type_ = kTestCompressionFail; + CompressionType compression_type_ = kNoCompression; + uint32_t compression_max_dict_bytes_ = 0; + uint32_t compression_parallel_threads_ = 0; +}; + +INSTANTIATE_TEST_CASE_P( + DBTest2, CompressionFailuresTest, + ::testing::Combine(::testing::Values(kTestCompressionFail, + kTestDecompressionFail, + kTestDecompressionCorruption), + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(0, 10), ::testing::Values(1, 4))); + +TEST_P(CompressionFailuresTest, CompressionFailures) { + if 
(compression_type_ == kNoCompression) { + return; + } + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.max_bytes_for_level_base = 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 7; + options.max_background_compactions = 1; + options.target_file_size_base = 512; + + BlockBasedTableOptions table_options; + table_options.block_size = 512; + table_options.verify_compression = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + options.compression = compression_type_; + options.compression_opts.parallel_threads = compression_parallel_threads_; + options.compression_opts.max_dict_bytes = compression_max_dict_bytes_; + options.bottommost_compression_opts.parallel_threads = + compression_parallel_threads_; + options.bottommost_compression_opts.max_dict_bytes = + compression_max_dict_bytes_; + + if (compression_failure_type_ == kTestCompressionFail) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompressData:TamperWithReturnValue", [](void* arg) { + bool* ret = static_cast(arg); + *ret = false; + }); + } else if (compression_failure_type_ == kTestDecompressionFail) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UncompressBlockContentsForCompressionType:TamperWithReturnValue", + [](void* arg) { + Status* ret = static_cast(arg); + ASSERT_OK(*ret); + *ret = Status::Corruption("kTestDecompressionFail"); + }); + } else if (compression_failure_type_ == kTestDecompressionCorruption) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UncompressBlockContentsForCompressionType:" + "TamperWithDecompressionOutput", + [](void* arg) { + BlockContents* contents = static_cast(arg); + // Ensure uncompressed data != original data + const size_t len = contents->data.size() + 1; + std::unique_ptr fake_data(new char[len]()); + *contents = BlockContents(std::move(fake_data), len); + }); + } + + std::map key_value_written; + + const int 
kKeySize = 5; + const int kValUnitSize = 16; + const int kValSize = 256; + Random rnd(405); + + Status s = Status::OK(); + + DestroyAndReopen(options); + // Write 10 random files + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 5; j++) { + std::string key = rnd.RandomString(kKeySize); + // Ensure good compression ratio + std::string valueUnit = rnd.RandomString(kValUnitSize); + std::string value; + for (int k = 0; k < kValSize; k += kValUnitSize) { + value += valueUnit; + } + s = Put(key, value); + if (compression_failure_type_ == kTestCompressionFail) { + key_value_written[key] = value; + ASSERT_OK(s); + } + } + s = Flush(); + if (compression_failure_type_ == kTestCompressionFail) { + ASSERT_OK(s); + } + s = dbfull()->TEST_WaitForCompact(); + if (compression_failure_type_ == kTestCompressionFail) { + ASSERT_OK(s); + } + if (i == 4) { + // Make compression fail at the mid of table building + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + if (compression_failure_type_ == kTestCompressionFail) { + // Should be kNoCompression, check content consistency + std::unique_ptr db_iter(db_->NewIterator(ReadOptions())); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::string key = db_iter->key().ToString(); + std::string value = db_iter->value().ToString(); + ASSERT_NE(key_value_written.find(key), key_value_written.end()); + ASSERT_EQ(key_value_written[key], value); + key_value_written.erase(key); + } + ASSERT_EQ(0, key_value_written.size()); + } else if (compression_failure_type_ == kTestDecompressionFail) { + ASSERT_EQ(std::string(s.getState()), + "Could not decompress: kTestDecompressionFail"); + } else if (compression_failure_type_ == kTestDecompressionCorruption) { + ASSERT_EQ(std::string(s.getState()), + "Decompressed block did not match raw block"); + } +} + TEST_F(DBTest2, CompressionOptions) { if (!Zlib_Supported() || !Snappy_Supported()) { 
return; @@ -1288,6 +1874,10 @@ const int kValSize = 20; Random rnd(301); + std::vector compression_parallel_threads = {1, 4}; + + std::map key_value_written; + for (int iter = 0; iter <= 2; iter++) { listener->max_level_checked = 0; @@ -1312,19 +1902,38 @@ options.bottommost_compression = kDisableCompressionOption; } - DestroyAndReopen(options); - // Write 10 random files - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 5; j++) { - ASSERT_OK( - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValSize))); + for (auto num_threads : compression_parallel_threads) { + options.compression_opts.parallel_threads = num_threads; + options.bottommost_compression_opts.parallel_threads = num_threads; + + DestroyAndReopen(options); + // Write 10 random files + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 5; j++) { + std::string key = rnd.RandomString(kKeySize); + std::string value = rnd.RandomString(kValSize); + key_value_written[key] = value; + ASSERT_OK(Put(key, value)); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); - } - // Make sure that we wrote enough to check all 7 levels - ASSERT_EQ(listener->max_level_checked, 6); + // Make sure that we wrote enough to check all 7 levels + ASSERT_EQ(listener->max_level_checked, 6); + + // Make sure database content is the same as key_value_written + std::unique_ptr db_iter(db_->NewIterator(ReadOptions())); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::string key = db_iter->key().ToString(); + std::string value = db_iter->value().ToString(); + ASSERT_NE(key_value_written.find(key), key_value_written.end()); + ASSERT_EQ(key_value_written[key], value); + key_value_written.erase(key); + } + ASSERT_OK(db_iter->status()); + ASSERT_EQ(0, key_value_written.size()); + } } } @@ -1375,7 +1984,7 @@ // 4 Files in L0 for (int i = 0; i < 4; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(RandomString(&rnd, 10), 
RandomString(&rnd, 10))); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } ASSERT_OK(Flush()); } @@ -1390,7 +1999,7 @@ // Another 6 L0 files to trigger compaction again for (int i = 0; i < 6; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } ASSERT_OK(Flush()); } @@ -1404,7 +2013,7 @@ // Hold NotifyOnCompactionCompleted in the unlock mutex section TEST_SYNC_POINT("DBTest2::CompactionStall:3"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_LT(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); ASSERT_GT(listener->compacted_files_cnt_.load(), @@ -1425,8 +2034,8 @@ // This snapshot will have sequence number 0 what is expected behaviour. const Snapshot* s1 = db_->GetSnapshot(); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush db_->ReleaseSnapshot(s1); } @@ -1436,20 +2045,20 @@ Options options; options = CurrentOptions(options); std::vector snapshots; - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); SequenceNumber oldest_ww_snap, first_ww_snap; - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(db_->GetSnapshot()); snapshots.push_back(db_->GetSnapshot()); - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(db_->GetSnapshot()); snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); first_ww_snap = snapshots.back()->GetSequenceNumber(); - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); snapshots.push_back(db_->GetSnapshot()); - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // 
inc seq snapshots.push_back(db_->GetSnapshot()); { @@ -1469,7 +2078,8 @@ : public DBTestBase, public testing::WithParamInterface> { public: - PinL0IndexAndFilterBlocksTest() : DBTestBase("/db_pin_l0_index_bloom_test") {} + PinL0IndexAndFilterBlocksTest() + : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {} void SetUp() override { infinite_max_files_ = std::get<0>(GetParam()); disallow_preload_ = std::get<1>(GetParam()); @@ -1485,22 +2095,22 @@ table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options->table_factory.reset(new BlockBasedTableFactory(table_options)); + options->table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, *options); - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); // move this table to L1 - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); // reset block cache table_options.block_cache = NewLRUCache(64 * 1024); options->table_factory.reset(NewBlockBasedTableFactory(table_options)); TryReopenWithColumnFamilies({"default", "pikachu"}, *options); // create new table at L0 - Put(1, "a2", "begin2"); - Put(1, "z2", "end2"); + ASSERT_OK(Put(1, "a2", "begin2")); + ASSERT_OK(Put(1, "z2", "end2")); ASSERT_OK(Flush(1)); if (close_afterwards) { @@ -1525,7 +2135,7 @@ table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); @@ -1544,7 +2154,7 @@ std::string 
value; // Miss and hit count should remain the same, they're all pinned. - db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); @@ -1672,18 +2282,18 @@ // cache read for both of index and filter. If prefetch doesn't explicitly // happen, it will happen when verifying the file. Compact(1, "a", "zzzzz"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (!disallow_preload_) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } // Bloom and index hit will happen when a Get() happens. 
@@ -1692,12 +2302,12 @@ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } } @@ -1710,8 +2320,8 @@ #ifndef ROCKSDB_LITE TEST_F(DBTest2, MaxCompactionBytesTest) { Options options = CurrentOptions(); - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 200 << 10; options.arena_block_size = 4 << 10; @@ -1743,10 +2353,10 @@ GenerateNewRandomFile(&rnd); // Add three more small files that overlap with the previous file for (int i = 0; i < 3; i++) { - Put("a", "z"); + ASSERT_OK(Put("a", "z")); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Output files to L1 are cut to three pieces, according to // options.max_compaction_bytes @@ -1779,6 +2389,10 @@ return PersistentCache::StatsType(); } + uint64_t NewId() override { + return last_id_.fetch_add(1, std::memory_order_relaxed); + } + Status Insert(const Slice& page_key, const char* data, const size_t size) override { MutexLock _(&lock_); @@ -1819,6 +2433,7 @@ const bool is_compressed_ = true; size_t size_ = 0; const size_t max_size_ = 10 * 1024; // 
10KiB + std::atomic last_id_{1}; }; #ifdef OS_LINUX @@ -1831,6 +2446,9 @@ ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); env_->now_cpu_count_.store(0); + env_->SetMockSleep(); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env // CPU timing is not enabled with kEnableTimeExceptForMutex SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); @@ -1838,19 +2456,20 @@ ASSERT_EQ(0, get_perf_context()->get_cpu_nanos); ASSERT_EQ(0, env_->now_cpu_count_.load()); - uint64_t kDummyAddonTime = uint64_t{1000000000000}; + constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000}; + constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds; // Add time to NowNanos() reading. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TableCache::FindTable:0", - [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); ASSERT_EQ("bar", Get("foo")); ASSERT_GT(env_->now_cpu_count_.load(), 2); - ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime); - ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); SetPerfLevel(PerfLevel::kDisable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -1873,6 +2492,9 @@ std::string last_key = "k" + ToString(kNumEntries - 1); std::string last_value = "v" + ToString(kNumEntries - 1); env_->now_cpu_count_.store(0); + env_->SetMockSleep(); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env // CPU timing is not enabled with kEnableTimeExceptForMutex SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); @@ -1895,17 +2517,19 @@ ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos); iter->Prev(); 
ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos); ASSERT_EQ(0, env_->now_cpu_count_.load()); delete iter; - uint64_t kDummyAddonTime = uint64_t{1000000000000}; + constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000}; + constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds; // Add time to NowNanos() reading. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TableCache::FindTable:0", - [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); @@ -1922,19 +2546,20 @@ ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0); - ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos); iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v1", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0); - ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos); iter->Prev(); ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0); - ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos); ASSERT_GE(env_->now_cpu_count_.load(), 12); - ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); SetPerfLevel(PerfLevel::kDisable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -1942,10 +2567,7 @@ } #endif // 
OS_LINUX -// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache -// breaks when that function is not implemented and no regular block cache is -// provided. -#if !defined(OS_SOLARIS) && !defined(OS_WIN) +#if !defined OS_SOLARIS TEST_F(DBTest2, PersistentCache) { int num_iter = 80; @@ -1988,7 +2610,7 @@ std::string str; for (int i = 0; i < num_iter; i++) { if (i % 4 == 0) { // high compression ratio - str = RandomString(&rnd, 1000); + str = rnd.RandomString(1000); } values.push_back(str); ASSERT_OK(Put(1, Key(i), values[i])); @@ -2009,7 +2631,7 @@ } } } -#endif // !defined(OS_SOLARIS) && !defined(OS_WIN) +#endif // !defined OS_SOLARIS namespace { void CountSyncPoint() { @@ -2086,7 +2708,7 @@ Random rnd(301); for (size_t i = 0; i < kNumEntries; i++) { - ASSERT_OK(Put(Key(static_cast(i)), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(100))); } ASSERT_OK(Flush()); @@ -2130,6 +2752,7 @@ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString())); } + ASSERT_OK(iter->status()); delete iter; // Read amp is on average 100% since we read all what we loaded in memory @@ -2152,26 +2775,30 @@ { const int kIdBufLen = 100; char id_buf[kIdBufLen]; + Status s = Status::NotSupported(); #ifndef OS_WIN // You can't open a directory on windows using random access file std::unique_ptr file; - ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions())); - if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; - } -#else - std::unique_ptr dir; - ASSERT_OK(env_->NewDirectory(dbname_, &dir)); - if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this 
test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; + s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions()); + if (s.ok()) { + if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } } #endif + if (!s.ok()) { + std::unique_ptr dir; + ASSERT_OK(env_->NewDirectory(dbname_, &dir)); + if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } + } } uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { @@ -2193,14 +2820,13 @@ Random rnd(301); for (int i = 0; i < kNumEntries; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); Close(); Reopen(options); - uint64_t total_useful_bytes = 0; std::set read_keys; std::string value; // Iter1: Read half the DB, Read even keys @@ -2211,8 +2837,6 @@ if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); - total_useful_bytes += - GetEncodedEntrySize(internal_key.size(), value.size()); read_keys.insert(i); } } @@ -2239,8 +2863,6 @@ if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); - total_useful_bytes += - GetEncodedEntrySize(internal_key.size(), value.size()); read_keys.insert(i); } } @@ -2416,22 +3038,22 @@ Random rnd(301); // Generate a file containing 10 keys. 
for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); // Generate another file containing same keys for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); int manual_compactions_paused = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) { - auto paused = reinterpret_cast*>(arg); - ASSERT_FALSE(paused->load(std::memory_order_acquire)); - paused->store(true, std::memory_order_release); + auto paused = static_cast*>(arg); + ASSERT_EQ(0, paused->load(std::memory_order_acquire)); + paused->fetch_add(1, std::memory_order_release); manual_compactions_paused += 1; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -2445,10 +3067,12 @@ } // OK, now trigger a manual compaction - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsManualCompactionPaused()); // Wait for compactions to get scheduled and stopped - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // Get file names after compaction is stopped files_meta.clear(); @@ -2463,10 +3087,12 @@ manual_compactions_paused = 0; // Now make sure CompactFiles also not run - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), - files_before_compact, 0); + ASSERT_TRUE(dbfull() + ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), + files_before_compact, 0) + .IsManualCompactionPaused()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); files_meta.clear(); files_after_compact.clear(); @@ -2495,7 +3121,7 @@ for (int i = 0; i < 2; i++) { // Generate a file containing 10 keys. 
for (int j = 0; j < 100; j++) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(j), rnd.RandomString(50))); } ASSERT_OK(Flush()); } @@ -2517,9 +3143,9 @@ for (int i = 0; i < options.num_levels; i++) { for (int j = 0; j < options.num_levels - i + 1; j++) { for (int k = 0; k < 1000; k++) { - ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); } - Flush(); + ASSERT_OK(Flush()); } for (int l = 1; l < options.num_levels - i; l++) { @@ -2540,8 +3166,10 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); dbfull()->DisableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // As manual compaction disabled, not even reach sync point ASSERT_EQ(run_manual_compactions, 0); #ifndef ROCKSDB_LITE @@ -2551,8 +3179,8 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:1"); dbfull()->EnableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE @@ -2571,9 +3199,9 @@ for (int i = 0; i < options.num_levels; i++) { for (int j = 0; j < options.num_levels - i + 1; j++) { for (int k = 0; k < 1000; k++) { - ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); } - Flush(); + ASSERT_OK(Flush()); } for (int l = 1; l < options.num_levels - i; l++) { @@ -2590,16 +3218,17 @@ int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( 
"CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) { - auto paused = reinterpret_cast*>(arg); - ASSERT_FALSE(paused->load(std::memory_order_acquire)); - paused->store(true, std::memory_order_release); + auto paused = static_cast*>(arg); + ASSERT_EQ(0, paused->load(std::memory_order_acquire)); + paused->fetch_add(1, std::memory_order_release); run_manual_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->EnableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(run_manual_compactions, 1); #ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); @@ -2608,8 +3237,8 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:2"); dbfull()->EnableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE @@ -2617,6 +3246,360 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, CancelManualCompaction1) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + 
ASSERT_OK(Flush()); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + int run_manual_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", + [&](void* /*arg*/) { run_manual_compactions++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Setup a callback to disable compactions after a couple of levels are + // compacted + int compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", + [&](void* /*arg*/) { ++compactions_run; }); + + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + // Since compactions are disabled, we shouldn't start compacting. + // E.g. we should call the compaction function exactly one time. 
+ ASSERT_EQ(compactions_run, 0); + ASSERT_EQ(run_manual_compactions, 0); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + if (compactions_run == 3) { + compact_options.canceled->store(true, std::memory_order_release); + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_EQ(compactions_run, 3); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. 
+ compact_options.canceled->store(false, std::memory_order_relaxed); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, CancelManualCompaction2) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + compact_options.max_subcompactions = 1; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + int compactions_run = 0; + std::atomic kv_compactions{0}; + int compactions_stopped_at = 0; + int kv_compactions_stopped_at = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator:ProcessKV", [&](void* /*arg*/) { + int kv_compactions_run = + kv_compactions.fetch_add(1, std::memory_order_release); + if (kv_compactions_run == 5) { + compact_options.canceled->store(true, std::memory_order_release); + kv_compactions_stopped_at = kv_compactions_run; + compactions_stopped_at = 
compactions_run; + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + // NOTE: as we set compact_options.max_subcompacitons = 1, and store true to + // the canceled variable from the single compacting thread (via callback), + // this value is deterministically kv_compactions_stopped_at + 1. + ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1); + ASSERT_EQ(compactions_run, compactions_stopped_at); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionIterator::ProcessKV"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. + compact_options.canceled->store(false, std::memory_order_relaxed); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +class CancelCompactionListener : public EventListener { + public: + CancelCompactionListener() + : num_compaction_started_(0), num_compaction_ended_(0) {} + + void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.cf_name, "default"); + ASSERT_EQ(ci.base_input_level, 0); + num_compaction_started_++; + } + + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.cf_name, "default"); + ASSERT_EQ(ci.base_input_level, 0); + ASSERT_EQ(ci.status.code(), code_); + ASSERT_EQ(ci.status.subcode(), subcode_); + num_compaction_ended_++; + } + + std::atomic num_compaction_started_; + std::atomic 
num_compaction_ended_; + Status::Code code_; + Status::SubCode subcode_; +}; + +TEST_F(DBTest2, CancelManualCompactionWithListener) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + compact_options.max_subcompactions = 1; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + CancelCompactionListener* listener = new CancelCompactionListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator:ProcessKV", [&](void* /*arg*/) { + compact_options.canceled->store(true, std::memory_order_release); + }); + + int running_compaction = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::FinishCompactionOutputFile1", + [&](void* /*arg*/) { running_compaction++; }); + + // Case I: 1 Notify begin compaction, 2 DisableManualCompaction, 3 Compaction + // not run, 4 Notify compaction end. 
+ listener->code_ = Status::kIncomplete; + listener->subcode_ = Status::SubCode::kManualCompactionPaused; + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_GT(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + ASSERT_EQ(running_compaction, 0); + + listener->num_compaction_started_ = 0; + listener->num_compaction_ended_ = 0; + + // Case II: 1 DisableManualCompaction, 2 Notify begin compaction (return + // without notifying), 3 Notify compaction end (return without notifying). + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_EQ(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + ASSERT_EQ(running_compaction, 0); + + // Case III: 1 Notify begin compaction, 2 Compaction in between + // 3. DisableManualCompaction, , 4 Notify compaction end. 
+ // compact_options.canceled->store(false, std::memory_order_release); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionIterator:ProcessKV"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) { + compact_options.canceled->store(true, std::memory_order_release); + }); + + listener->code_ = Status::kOk; + listener->subcode_ = Status::SubCode::kNone; + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_GT(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + + // Compaction job will succeed. + ASSERT_GT(running_compaction, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) { + int num_levels = 3; + const int kNumFilesTrigger = 4; + + Options options = CurrentOptions(); + env_->SetBackgroundThreads(0, Env::Priority::HIGH); + env_->SetBackgroundThreads(0, Env::Priority::LOW); + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + options.env = env_; + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + + CancelCompactionListener* listener = new CancelCompactionListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + + int num_bottom_thread_compaction_scheduled = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; }); + + int num_compaction_jobs = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():End", + [&](void* /*arg*/) { num_compaction_jobs++; }); + + listener->code_ = Status::kOk; + listener->subcode_ = Status::SubCode::kNone; + + Random rnd(301); + for (int i = 0; i < 1; ++i) { + for (int num = 0; num < kNumFilesTrigger; num++) { + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx, true /* no_wait */); + // use no_wait above because that one waits for flush and compaction. We + // don't want to wait for compaction because the full compaction is + // intentionally blocked while more files are flushed. + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_GT(num_bottom_thread_compaction_scheduled, 0); + ASSERT_EQ(num_compaction_jobs, 1); + ASSERT_GT(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, OptimizeForPointLookup) { Options options = CurrentOptions(); Close(); @@ -2625,7 +3608,7 @@ ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("v1", Get("foo")); } @@ -2635,11 +3618,11 @@ options.OptimizeForSmallDb(); // Find the cache object - ASSERT_EQ(std::string(BlockBasedTableFactory::kName), - std::string(options.table_factory->Name())); - BlockBasedTableOptions* table_options = - reinterpret_cast( - options.table_factory->GetOptions()); + ASSERT_TRUE(options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())); + auto table_options = + options.table_factory->GetOptions(); + ASSERT_TRUE(table_options != 
nullptr); std::shared_ptr cache = table_options->block_cache; @@ -2651,7 +3634,7 @@ ASSERT_NE(0, cache->GetUsage()); ASSERT_EQ("v1", Get("foo")); - Flush(); + ASSERT_OK(Flush()); size_t prev_size = cache->GetUsage(); // Remember block cache size, so that we can find that @@ -2666,6 +3649,101 @@ #endif // ROCKSDB_LITE +TEST_F(DBTest2, IterRaceFlush1) { + ASSERT_OK(Put("foo", "v1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"}, + {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread t1([&] { + TEST_SYNC_POINT("DBTest2::IterRaceFlush:1"); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT("DBTest2::IterRaceFlush:2"); + }); + + // iterator is created after the first Put(), and its snapshot sequence is + // assigned after second Put(), so it must see v2. + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ("foo", it->key().ToString()); + ASSERT_EQ("v2", it->value().ToString()); + } + + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, IterRaceFlush2) { + ASSERT_OK(Put("foo", "v1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"}, + {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread t1([&] { + TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1"); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2"); + }); + + // iterator is created after the first Put(), and its snapshot sequence is + // assigned before second Put(), thus it must see v1. 
+ { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ("foo", it->key().ToString()); + ASSERT_EQ("v1", it->value().ToString()); + } + + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, IterRefreshRaceFlush) { + ASSERT_OK(Put("foo", "v1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"}, + {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread t1([&] { + TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1"); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2"); + }); + + // iterator is refreshed after the first Put(), and its sequence number is + // assigned after second Put(), thus it must see v2. + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + ASSERT_OK(it->status()); + ASSERT_OK(it->Refresh()); + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ("foo", it->key().ToString()); + ASSERT_EQ("v2", it->value().ToString()); + } + + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, GetRaceFlush1) { ASSERT_OK(Put("foo", "v1")); @@ -2678,7 +3756,7 @@ ROCKSDB_NAMESPACE::port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::GetRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); - Flush(); + ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::GetRaceFlush:2"); }); @@ -2701,7 +3779,7 @@ port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::GetRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); - Flush(); + ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::GetRaceFlush:2"); }); @@ -2774,6 +3852,7 @@ ASSERT_EQ("second", value); // nothing should be returned using memtable-only iterator after flushing. 
it = db_->NewIterator(ropt, handles_[1]); + ASSERT_OK(it->status()); count = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_TRUE(it->Valid()); @@ -2781,11 +3860,13 @@ } ASSERT_TRUE(!it->Valid()); ASSERT_EQ(0, count); + ASSERT_OK(it->status()); delete it; // Add a key to memtable ASSERT_OK(Put(1, "foobar", "third")); it = db_->NewIterator(ropt, handles_[1]); + ASSERT_OK(it->status()); count = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_TRUE(it->Valid()); @@ -2795,6 +3876,7 @@ } ASSERT_TRUE(!it->Valid()); ASSERT_EQ(1, count); + ASSERT_OK(it->status()); delete it; } @@ -2823,28 +3905,28 @@ WriteOptions wo; for (int i = 0; i < 6; i++) { wo.low_pri = false; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); wo.low_pri = true; - Put("", "", wo); - Flush(); + ASSERT_OK(Put("", "", wo)); + ASSERT_OK(Flush()); } ASSERT_EQ(0, rate_limit_count.load()); wo.low_pri = true; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); wo.low_pri = false; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); TEST_SYNC_POINT("DBTest.LowPriWrite:0"); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); wo.low_pri = true; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); wo.low_pri = false; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); } @@ -2862,7 +3944,8 @@ Options options = CurrentOptions(); options.compression = kNoCompression; options.level0_file_num_compaction_trigger = kNumL0Files; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); options.new_table_reader_for_compaction_inputs = true; // takes roughly one second, split into 100 x 10ms intervals. 
Each interval // permits 5.12KB, which is smaller than the block size, so this test @@ -2877,17 +3960,19 @@ BlockBasedTableOptions bbto; bbto.block_size = 16384; bbto.no_block_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); for (int i = 0; i < kNumL0Files; ++i) { for (int j = 0; j <= kNumKeysPerFile; ++j) { ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey))); } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + if (i + 1 < kNumL0Files) { + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(0, options.rate_limiter->GetTotalBytesThrough(Env::IO_HIGH)); @@ -2906,6 +3991,7 @@ direct_io_extra)); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey)); } @@ -2922,11 +4008,12 @@ // is on levels higher than the new num_levels. 
TEST_F(DBTest2, ReduceLevel) { Options options; + options.env = env_; options.disable_auto_compactions = true; options.num_levels = 7; Reopen(options); - Put("foo", "bar"); - Flush(); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); MoveFilesToLevel(6); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); @@ -2934,7 +4021,7 @@ CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 1; - dbfull()->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel()); #endif // !ROCKSDB_LITE @@ -2950,6 +4037,7 @@ Options options; options.disable_auto_compactions = true; options.num_levels = 7; + options.env = env_; Reopen(options); std::vector snapshots; // Try to create a db with multiple layers and a memtable @@ -2962,35 +4050,35 @@ // the DB instead of assuming what seq the DB used. int i = 1; for (; i < 10; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); // Take a snapshot to avoid the value being removed during compaction auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } - Flush(); + ASSERT_OK(Flush()); for (; i < 20; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); // Take a snapshot to avoid the value being removed during compaction auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(6); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE for (; i < 30; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } - Flush(); + ASSERT_OK(Flush()); #ifndef ROCKSDB_LITE ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE // And also add some 
values to the memtable for (; i < 40; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } @@ -3063,40 +4151,46 @@ [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("key", "val"); + ASSERT_OK(Put("key", "val")); FlushOptions flush_opts; flush_opts.wait = false; db_->Flush(flush_opts); TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"); - db_->DisableFileDeletions(); + ASSERT_OK(db_->DisableFileDeletions()); VectorLogPtr log_files; - db_->GetSortedWalFiles(log_files); + ASSERT_OK(db_->GetSortedWalFiles(log_files)); TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"); for (const auto& log_file : log_files) { ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber()))); } - db_->EnableFileDeletions(); + ASSERT_OK(db_->EnableFileDeletions()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, TestNumPread) { Options options = CurrentOptions(); + bool prefetch_supported = + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); // disable block cache BlockBasedTableOptions table_options; table_options.no_block_cache = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); env_->count_random_reads_ = true; - env_->random_file_open_counter_.store(0); ASSERT_OK(Put("bar", "foo")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. 
+ ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3112,19 +4206,30 @@ ASSERT_OK(Put("bar2", "foo2")); ASSERT_OK(Put("foo2", "bar2")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); - // Compaction needs two input blocks, which requires 2 preads, and - // generate a new SST file which needs 4 preads (footer, meta block, - // property block and index block). In total 6. env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ(6, env_->random_read_counter_.Read()); - // All compactin input files should have already been opened. + if (prefetch_supported) { + // Compaction needs two input blocks, which requires 2 preads, and + // generate a new SST file which needs 4 preads (footer, meta block, + // property block and index block). In total 6. + ASSERT_EQ(6, env_->random_read_counter_.Read()); + } else { + // With prefetch off, compaction needs two input blocks, + // followed by a single buffered read. In total 3. + ASSERT_EQ(3, env_->random_read_counter_.Read()); + } + // All compaction input files should have already been opened. 
ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3136,6 +4241,118 @@ ASSERT_EQ(0, env_->random_file_open_counter_.load()); } +class TraceExecutionResultHandler : public TraceRecordResult::Handler { + public: + TraceExecutionResultHandler() {} + ~TraceExecutionResultHandler() override {} + + virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid timestamps."); + } + result.GetStatus().PermitUncheckedError(); + switch (result.GetTraceType()) { + case kTraceWrite: { + total_latency_ += result.GetLatency(); + cnt_++; + writes_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + virtual Status Handle( + const SingleValueTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid timestamps."); + } + result.GetStatus().PermitUncheckedError(); + switch (result.GetTraceType()) { + case kTraceGet: { + total_latency_ += result.GetLatency(); + cnt_++; + gets_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + virtual Status Handle( + const MultiValuesTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid timestamps."); + } + for (const Status& s : result.GetMultiStatus()) { + s.PermitUncheckedError(); + } + switch (result.GetTraceType()) { + case kTraceMultiGet: { + total_latency_ += result.GetLatency(); + cnt_++; + multigets_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + virtual Status Handle(const IteratorTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid 
timestamps."); + } + result.GetStatus().PermitUncheckedError(); + switch (result.GetTraceType()) { + case kTraceIteratorSeek: + case kTraceIteratorSeekForPrev: { + total_latency_ += result.GetLatency(); + cnt_++; + seeks_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + void Reset() { + total_latency_ = 0; + cnt_ = 0; + writes_ = 0; + gets_ = 0; + seeks_ = 0; + multigets_ = 0; + } + + double GetAvgLatency() const { + return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_; + } + + int GetNumWrites() const { return writes_; } + + int GetNumGets() const { return gets_; } + + int GetNumIterSeeks() const { return seeks_; } + + int GetNumMultiGets() const { return multigets_; } + + private: + std::atomic total_latency_{0}; + std::atomic cnt_{0}; + std::atomic writes_{0}; + std::atomic gets_{0}; + std::atomic seeks_{0}; + std::atomic multigets_{0}; +}; + TEST_F(DBTest2, TraceAndReplay) { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreatePutOperator(); @@ -3154,6 +4371,170 @@ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + // 5 Writes + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Merge(0, "b", "2")); + ASSERT_OK(Delete(0, "c")); + ASSERT_OK(SingleDelete(0, "d")); + ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); + + // 6th Write + WriteBatch batch; + ASSERT_OK(batch.Put("f", "11")); + ASSERT_OK(batch.Merge("g", "12")); + ASSERT_OK(batch.Delete("h")); + ASSERT_OK(batch.SingleDelete("i")); + ASSERT_OK(batch.DeleteRange("j", "k")); + ASSERT_OK(db_->Write(wo, &batch)); + + // 2 Seek(ForPrev)s + single_iter = db_->NewIterator(ro); + single_iter->Seek("f"); // Seek 1 + single_iter->SeekForPrev("g"); + ASSERT_OK(single_iter->status()); + delete single_iter; + + // 2 Gets + ASSERT_EQ("1", Get(0, "a")); + ASSERT_EQ("12", Get(0, "g")); + + // 7th and 8th Write, 3rd Get + 
ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "rocksdb", "rocks")); + ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + + // Total Write x 8, Get x 3, Seek x 2. + ASSERT_OK(db_->EndTrace()); + // These should not get into the trace file as it is after EndTrace. + ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay"); + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). + DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + + TraceExecutionResultHandler res_handler; + std::function &&)> res_cb = + [&res_handler](Status exec_s, std::unique_ptr&& res) { + ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported()); + if (res != nullptr) 
{ + ASSERT_OK(res->Accept(&res_handler)); + res.reset(); + } + }; + + // Unprepared replay should fail with Status::Incomplete() + ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete()); + ASSERT_OK(replayer->Prepare()); + // Ok to repeatedly Prepare(). + ASSERT_OK(replayer->Prepare()); + // Replay using 1 thread, 1x speed. + ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb)); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 8); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); + ASSERT_EQ("1", value); + ASSERT_OK(db2->Get(ro, handles[0], "g", &value)); + ASSERT_EQ("12", value); + ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); + + ASSERT_OK(db2->Get(ro, handles[1], "foo", &value)); + ASSERT_EQ("bar", value); + ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); + ASSERT_EQ("rocks", value); + + // Re-replay should fail with Status::Incomplete() if Prepare() was not + // called. Currently we don't distinguish between unprepared and trace end. + ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete()); + + // Re-replay using 2 threads, 2x speed. + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb)); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 8); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // Re-replay using 2 threads, 1/2 speed. 
+ ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb)); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 8); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + replayer.reset(); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + +TEST_F(DBTest2, TraceAndManualReplay) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreatePutOperator(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + Iterator* single_iter = nullptr; + + ASSERT_TRUE(db_->EndTrace().IsIOError()); + + std::string trace_filename = dbname_ + "/rocksdb.trace"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + ASSERT_OK(Put(0, "a", "1")); ASSERT_OK(Merge(0, "b", "2")); ASSERT_OK(Delete(0, "c")); @@ -3171,6 +4552,37 @@ single_iter = db_->NewIterator(ro); single_iter->Seek("f"); single_iter->SeekForPrev("g"); + ASSERT_OK(single_iter->status()); + delete single_iter; + + // Write some sequenced keys for testing lower/upper bounds of iterator. 
+ batch.Clear(); + ASSERT_OK(batch.Put("iter-0", "iter-0")); + ASSERT_OK(batch.Put("iter-1", "iter-1")); + ASSERT_OK(batch.Put("iter-2", "iter-2")); + ASSERT_OK(batch.Put("iter-3", "iter-3")); + ASSERT_OK(batch.Put("iter-4", "iter-4")); + ASSERT_OK(db_->Write(wo, &batch)); + + ReadOptions bounded_ro = ro; + Slice lower_bound("iter-1"); + Slice upper_bound("iter-3"); + bounded_ro.iterate_lower_bound = &lower_bound; + bounded_ro.iterate_upper_bound = &upper_bound; + single_iter = db_->NewIterator(bounded_ro); + single_iter->Seek("iter-0"); + ASSERT_EQ(single_iter->key().ToString(), "iter-1"); + single_iter->Seek("iter-2"); + ASSERT_EQ(single_iter->key().ToString(), "iter-2"); + single_iter->Seek("iter-4"); + ASSERT_FALSE(single_iter->Valid()); + single_iter->SeekForPrev("iter-0"); + ASSERT_FALSE(single_iter->Valid()); + single_iter->SeekForPrev("iter-2"); + ASSERT_EQ(single_iter->key().ToString(), "iter-2"); + single_iter->SeekForPrev("iter-4"); + ASSERT_EQ(single_iter->key().ToString(), "iter-2"); + ASSERT_OK(single_iter->status()); delete single_iter; ASSERT_EQ("1", Get(0, "a")); @@ -3180,14 +4592,18 @@ ASSERT_OK(Put(1, "rocksdb", "rocks")); ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2. + // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6 + // Seek(ForPrev)s. + // Total Write x 9, Get x 3, Seek x 8 ASSERT_OK(db_->EndTrace()); // These should not get into the trace file as it is after EndTrace. 
- Put("hello", "world"); - Merge("foo", "bar"); + ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); // Open another db, replay, and verify the data std::string value; - std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime @@ -3209,7 +4625,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3218,8 +4636,76 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + + TraceExecutionResultHandler res_handler; + + // Manual replay for 2 times. The 2nd checks if the replay can restart. + std::unique_ptr record; + std::unique_ptr result; + for (int i = 0; i < 2; i++) { + // Next should fail if unprepared. + ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); + ASSERT_OK(replayer->Prepare()); + Status s = Status::OK(); + // Looping until trace end. + while (s.ok()) { + s = replayer->Next(&record); + // Skip unsupported operations. 
+ if (s.IsNotSupported()) { + continue; + } + if (s.ok()) { + ASSERT_OK(replayer->Execute(record, &result)); + if (result != nullptr) { + ASSERT_OK(result->Accept(&res_handler)); + if (record->GetTraceType() == kTraceIteratorSeek || + record->GetTraceType() == kTraceIteratorSeekForPrev) { + IteratorSeekQueryTraceRecord* iter_rec = + dynamic_cast(record.get()); + IteratorTraceExecutionResult* iter_res = + dynamic_cast(result.get()); + // Check if lower/upper bounds are correctly saved and decoded. + std::string lower_str = iter_rec->GetLowerBound().ToString(); + std::string upper_str = iter_rec->GetUpperBound().ToString(); + std::string iter_key = iter_res->GetKey().ToString(); + std::string iter_value = iter_res->GetValue().ToString(); + if (!lower_str.empty() && !upper_str.empty()) { + ASSERT_EQ(lower_str, "iter-1"); + ASSERT_EQ(upper_str, "iter-3"); + if (iter_res->GetValid()) { + // If iterator is valid, then lower_bound <= key < upper_bound. + ASSERT_GE(iter_key, lower_str); + ASSERT_LT(iter_key, upper_str); + } else { + // If iterator is invalid, then + // key < lower_bound or key >= upper_bound. + ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str); + } + } + // If iterator is invalid, the key and value should be empty. + if (!iter_res->GetValid()) { + ASSERT_TRUE(iter_key.empty()); + ASSERT_TRUE(iter_value.empty()); + } + } + result.reset(); + } + } + } + // Status::Incomplete() will be returned when manually reading the trace + // end, or Prepare() was not called. 
+ ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 9); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 8); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + } ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); ASSERT_EQ("1", value); @@ -3233,6 +4719,138 @@ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); ASSERT_EQ("rocks", value); + // Test execution of artificially created TraceRecords. + uint64_t fake_ts = 1U; + // Write + batch.Clear(); + ASSERT_OK(batch.Put("trace-record-write1", "write1")); + ASSERT_OK(batch.Put("trace-record-write2", "write2")); + record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Write x 1 + ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value)); + ASSERT_EQ("write1", value); + ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value)); + ASSERT_EQ("write2", value); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 1); + ASSERT_EQ(res_handler.GetNumGets(), 0); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // Get related + // Get an existing key. + record.reset(new GetQueryTraceRecord(handles[0]->GetID(), + "trace-record-write1", fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Get x 1 + // Get an non-existing key, should still return Status::OK(). 
+ record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get", + fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Get x 2 + // Get from an invalid (non-existing) cf_id. + uint32_t invalid_cf_id = handles[1]->GetID() + 1; + record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); + ASSERT_TRUE(result == nullptr); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 0); + ASSERT_EQ(res_handler.GetNumGets(), 2); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // Iteration related + for (IteratorSeekQueryTraceRecord::SeekType seekType : + {IteratorSeekQueryTraceRecord::kSeek, + IteratorSeekQueryTraceRecord::kSeekForPrev}) { + // Seek to an existing key. + record.reset(new IteratorSeekQueryTraceRecord( + seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Seek x 1 in one iteration + // Seek to an non-existing key, should still return Status::OK(). + record.reset(new IteratorSeekQueryTraceRecord( + seekType, handles[0]->GetID(), "trace-record-get", fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Seek x 2 in one iteration + // Seek from an invalid cf_id. 
+ record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id, + "whatever", fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); + ASSERT_TRUE(result == nullptr); + } + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 0); + ASSERT_EQ(res_handler.GetNumGets(), 0); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 4); // Seek x 2 in two iterations + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // MultiGet related + // Get existing keys. + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"a", "foo"}), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 1 + // Get all non-existing keys, should still return Status::OK(). + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"no1", "no2"}), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 2 + // Get mixed of existing and non-existing keys, should still return + // Status::OK(). + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"a", "no2"}), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + MultiValuesTraceExecutionResult* mvr = + dynamic_cast(result.get()); + ASSERT_TRUE(mvr != nullptr); + ASSERT_OK(mvr->GetMultiStatus()[0]); + ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound()); + ASSERT_EQ(mvr->GetValues()[0], "1"); + ASSERT_EQ(mvr->GetValues()[1], ""); + ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 3 + // Get from an invalid (non-existing) cf_id. 
+ record.reset(new MultiGetQueryTraceRecord( + std::vector( + {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}), + std::vector({"a", "foo", "whatever"}), fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); + ASSERT_TRUE(result == nullptr); + // Empty MultiGet + record.reset(new MultiGetQueryTraceRecord( + std::vector(), std::vector(), fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument()); + ASSERT_TRUE(result == nullptr); + // MultiGet size mismatch + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"a"}), fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument()); + ASSERT_TRUE(result == nullptr); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 0); + ASSERT_EQ(res_handler.GetNumGets(), 0); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); + ASSERT_EQ(res_handler.GetNumMultiGets(), 3); + res_handler.Reset(); + + replayer.reset(); + for (auto handle : handles) { delete handle; } @@ -3261,7 +4879,7 @@ ASSERT_OK(Put(0, "c", "1")); ASSERT_OK(db_->EndTrace()); - std::string dbname2 = test::TmpDir(env_) + "/db_replay2"; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2"); std::string value; ASSERT_OK(DestroyDB(dbname2, options)); @@ -3284,7 +4902,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3294,8 +4914,12 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr 
replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); + replayer.reset(); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); @@ -3330,7 +4954,7 @@ ASSERT_OK(Put(0, "e", "5")); ASSERT_OK(db_->EndTrace()); - std::string dbname2 = test::TmpDir(env_) + "/db_replay_sampling"; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling"); std::string value; ASSERT_OK(DestroyDB(dbname2, options)); @@ -3352,7 +4976,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); @@ -3363,8 +4989,12 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); + replayer.reset(); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); @@ -3425,12 +5055,12 @@ ASSERT_OK(db_->EndTrace()); // These should not get into the trace file as it is after EndTrace. 
- Put("hello", "world"); - Merge("foo", "bar"); + ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); // Open another db, replay, and verify the data std::string value; - std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + std::string dbname2 = test::PerThreadDBPath(env_, "db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime @@ -3452,7 +5082,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3461,8 +5093,12 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); + replayer.reset(); // All the key-values should not present since we filter out the WRITE ops. ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); @@ -3479,7 +5115,7 @@ ASSERT_OK(DestroyDB(dbname2, options)); // Set up a new db. 
- std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read"; + std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read"); ASSERT_OK(DestroyDB(dbname3, options)); DB* db3_init = nullptr; @@ -3498,7 +5134,7 @@ handles.clear(); DB* db3 = nullptr; - ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3)); + ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3554,6 +5190,11 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { Options options = CurrentOptions(); + options.env = env_; + if (!IsMemoryMappedAccessSupported()) { + ROCKSDB_GTEST_SKIP("Test requires default environment"); + return; + } options.allow_mmap_reads = true; options.max_open_files = 100; options.compression = kNoCompression; @@ -3568,9 +5209,9 @@ ASSERT_FALSE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); - dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); // Ensure pinned_value doesn't rely on memory munmap'd by the above // compaction. It crashes if it does. @@ -3606,18 +5247,18 @@ bbto.cache_index_and_filter_blocks = false; bbto.block_cache = NewLRUCache(100000); bbto.block_size = 400; // small block size - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); - std::string v = RandomString(&rnd, 400); + std::string v = rnd.RandomString(400); // Since v is the size of a block, each key should take a block // of 400+ bytes. 
- Put("1", v); - Put("3", v); - Put("5", v); - Put("7", v); + ASSERT_OK(Put("1", v)); + ASSERT_OK(Put("3", v)); + ASSERT_OK(Put("5", v)); + ASSERT_OK(Put("7", v)); ASSERT_OK(Flush()); ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); @@ -3646,16 +5287,18 @@ iter->Seek("3"); ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); } ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); // Test compaction case - Put("2", v); - Put("5", v); - Put("6", v); - Put("8", v); + ASSERT_OK(Put("2", v)); + ASSERT_OK(Put("5", v)); + ASSERT_OK(Put("6", v)); + ASSERT_OK(Put("8", v)); ASSERT_OK(Flush()); // Clear existing data in block cache @@ -3714,20 +5357,20 @@ }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("1", "1"); - Put("9", "1"); - Flush(); + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); expected_lower_bound = 0; expected_higher_bound = 8 * 1024; - Put("1", "1"); - Put("9", "1"); - Flush(); - - Put("1", "1"); - Put("9", "1"); - Flush(); + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); // Full compaction to make sure there is no L0 file after the open. 
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -3760,13 +5403,13 @@ options.max_open_files = -1; Reopen(options); - Put("1", "1"); - Put("9", "1"); - Flush(); - - Put("1", "1"); - Put("9", "1"); - Flush(); + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); ASSERT_TRUE(called.load()); called = false; @@ -3797,7 +5440,7 @@ CreateColumnFamilies({"test1", "test2"}, Options()); ASSERT_EQ(handles_.size(), 2); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); port::Thread user_thread1([&]() { auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID()); ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); @@ -3832,6 +5475,7 @@ SyncPoint::GetInstance()->EnableProcessing(); Options options; + options.env = env_; options.num_levels = 2; options.disable_auto_compactions = true; Reopen(options); @@ -3866,31 +5510,36 @@ GetSstFiles(env_, dbname_, &files); ASSERT_EQ(files.size(), 2); - port::Thread user_thread1( - [&]() { db_->CompactFiles(CompactionOptions(), handle, files, 1); }); + Status user_thread1_status; + port::Thread user_thread1([&]() { + user_thread1_status = + db_->CompactFiles(CompactionOptions(), handle, files, 1); + }); + Status user_thread2_status; port::Thread user_thread2([&]() { - ASSERT_OK(db_->IngestExternalFile(handle, {external_file2}, - IngestExternalFileOptions())); + user_thread2_status = db_->IngestExternalFile(handle, {external_file2}, + IngestExternalFileOptions()); TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1"); }); user_thread1.join(); user_thread2.join(); + ASSERT_OK(user_thread1_status); + ASSERT_OK(user_thread2_status); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } #endif // ROCKSDB_LITE -// TODO: figure out why this test fails in appveyor -#ifndef OS_WIN TEST_F(DBTest2, 
MultiDBParallelOpenTest) { const int kNumDbs = 2; Options options = CurrentOptions(); std::vector dbnames; for (int i = 0; i < kNumDbs; ++i) { - dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i)); + dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + ToString(i))); ASSERT_OK(DestroyDB(dbnames.back(), options)); } @@ -3915,7 +5564,6 @@ } // Verify non-empty DBs can be recovered in parallel - dbs.clear(); open_threads.clear(); for (int i = 0; i < kNumDbs; ++i) { open_threads.emplace_back( @@ -3932,11 +5580,11 @@ ASSERT_OK(DestroyDB(dbnames[i], options)); } } -#endif // OS_WIN namespace { class DummyOldStats : public Statistics { public: + const char* Name() const override { return "DummyOldStats"; } uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; } void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override { num_rt++; @@ -3956,8 +5604,8 @@ } bool HistEnabledForType(uint32_t /*type*/) const override { return false; } std::string ToString() const override { return ""; } - int num_rt = 0; - int num_mt = 0; + std::atomic num_rt{0}; + std::atomic num_mt{0}; }; } // namespace @@ -3969,7 +5617,7 @@ options.statistics = stats; Reopen(options); - Put("foo", "bar"); + ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo")); @@ -4017,6 +5665,7 @@ ASSERT_OK(Put("bbb1", "")); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); // Seeking into f1, the iterator will check bloom filter which returns the // file iterator ot be invalidate, and the cursor will put into f2, with @@ -4055,6 +5704,7 @@ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); // Bloom filter is filterd out by f1. // This is just one of several valid position following the contract. @@ -4062,6 +5712,7 @@ // the behavior of the current implementation. 
If underlying implementation // changes, the test might fail here. iter->Seek("bbb1"); + ASSERT_OK(iter->status()); ASSERT_FALSE(iter->Valid()); delete iter; @@ -4149,7 +5800,7 @@ for (const auto& f : filenames) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) { + if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) { std::string fname = dbname_ + "/" + f; std::string file_content; ASSERT_OK(ReadFileToString(env_, fname, &file_content)); @@ -4208,6 +5859,7 @@ ReadOptions ro; ro.total_order_seek = true; std::unique_ptr iter(db_->NewIterator(ro)); + ASSERT_OK(iter->status()); iter->Seek("e"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("x", iter->key().ToString()); @@ -4225,6 +5877,7 @@ ASSERT_OK(Put("a", "a")); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); ASSERT_OK(Flush()); size_t value = options.write_buffer_manager->memory_usage(); ASSERT_GT(value, base_value); @@ -4283,7 +5936,7 @@ ASSERT_OK(Put("key", "2")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "3")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "4")); - Flush(); + ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; @@ -4291,14 +5944,14 @@ nullptr)); ASSERT_OK(db_->Merge(WriteOptions(), "key", "5")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "6")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "7")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "8")); - Flush(); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,4,1", FilesPerLevel()); #endif // ROCKSDB_LITE @@ -4306,6 +5959,24 @@ ASSERT_EQ("2,3,4,5,6,7,8", Get("key")); } +TEST_F(DBTest2, FileConsistencyCheckInOpen) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + 
"VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + Status* ret_s = static_cast(arg); + *ret_s = Status::Corruption("fcc"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.force_consistency_checks = true; + ASSERT_NOK(TryReopen(options)); + + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) { // create a DB with block prefix index BlockBasedTableOptions table_options; @@ -4320,15 +5991,16 @@ Reopen(options); Random rnd(301); - std::string large_value = RandomString(&rnd, 500); + std::string large_value = rnd.RandomString(500); ASSERT_OK(Put("a1", large_value)); ASSERT_OK(Put("x1", large_value)); ASSERT_OK(Put("y1", large_value)); - Flush(); + ASSERT_OK(Flush()); { std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + ASSERT_OK(iterator->status()); iterator->SeekForPrev("x3"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); @@ -4367,6 +6039,46 @@ } } +TEST_F(DBTest2, PartitionedIndexPrefetchFailure) { + Options options = last_options_; + options.env = env_; + options.max_open_files = 20; + BlockBasedTableOptions bbto; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.metadata_block_size = 128; + bbto.block_size = 128; + bbto.block_cache = NewLRUCache(16777216); + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + // Force no table cache so every read will preload the SST file. + dbfull()->TEST_table_cache()->SetCapacity(0); + bbto.block_cache->SetCapacity(0); + + Random rnd(301); + for (int i = 0; i < 4096; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(32))); + } + ASSERT_OK(Flush()); + + // Try different random failures in table open for 300 times. 
+ for (int i = 0; i < 300; i++) { + env_->num_reads_fails_ = 0; + env_->rand_reads_fail_odd_ = 8; + + std::string value; + Status s = dbfull()->Get(ReadOptions(), Key(1), &value); + if (env_->num_reads_fails_ > 0) { + ASSERT_NOK(s); + } else { + ASSERT_OK(s); + } + } + + env_->rand_reads_fail_odd_ = 0; +} + TEST_F(DBTest2, ChangePrefixExtractor) { for (bool use_partitioned_filter : {true, false}) { // create a DB with block prefix index @@ -4400,7 +6112,7 @@ ASSERT_OK(Put("xx1", "")); ASSERT_OK(Put("xz1", "")); ASSERT_OK(Put("zz", "")); - Flush(); + ASSERT_OK(Flush()); // After reopening DB with prefix size 2 => 1, prefix extractor // won't take effective unless it won't change results based @@ -4410,6 +6122,7 @@ { std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + ASSERT_OK(iterator->status()); iterator->Seek("xa"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); @@ -4434,6 +6147,7 @@ { std::unique_ptr iterator(db_->NewIterator(ro)); + ASSERT_OK(iterator->status()); // SeekForPrev() never uses prefix bloom if it is changed. 
iterator->SeekForPrev("xg0"); @@ -4448,6 +6162,7 @@ ub = Slice(ub_str); { std::unique_ptr iterator(db_->NewIterator(ro)); + ASSERT_OK(iterator->status()); iterator->Seek("x"); ASSERT_TRUE(iterator->Valid()); @@ -4494,6 +6209,8 @@ if (expect_filter_check) { ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); } + + ASSERT_OK(iterator->status()); } { std::unique_ptr iterator(db_->NewIterator(ro)); @@ -4511,6 +6228,8 @@ if (expect_filter_check) { ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); } + + ASSERT_OK(iterator->status()); } ub_str = "xg9"; @@ -4523,6 +6242,7 @@ if (expect_filter_check) { ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); } + ASSERT_OK(iterator->status()); } } } @@ -4542,29 +6262,29 @@ Reopen(options); ASSERT_OK(Put("b1", "ok")); - Flush(); + ASSERT_OK(Flush()); // Flushing several files so that the chance that hash bucket // is empty fo "b" in at least one of the files is high. ASSERT_OK(Put("a1", "")); ASSERT_OK(Put("c1", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a2", "")); ASSERT_OK(Put("c2", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a3", "")); ASSERT_OK(Put("c3", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a4", "")); ASSERT_OK(Put("c4", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a5", "")); ASSERT_OK(Put("c5", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("ok", Get("b1")); } @@ -4582,12 +6302,12 @@ Reopen(options); Random rnd(301); - std::string large_value = RandomString(&rnd, 500); + std::string large_value = rnd.RandomString(500); ASSERT_OK(Put("a1", large_value)); ASSERT_OK(Put("x1", large_value)); ASSERT_OK(Put("y1", large_value)); - Flush(); + ASSERT_OK(Flush()); ReadOptions ro; ro.total_order_seek = false; @@ -4598,6 +6318,7 @@ ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } std::string 
ub_str = "b9"; @@ -4609,6 +6330,7 @@ iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } ub_str = "z"; @@ -4619,6 +6341,7 @@ ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } ub_str = "c"; @@ -4628,6 +6351,7 @@ iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } // The same queries without recreating iterator @@ -4640,6 +6364,7 @@ iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); ub_str = "z"; ub = Slice(ub_str); @@ -4676,16 +6401,469 @@ ASSERT_EQ("a1", iterator->key().ToString()); } } + +class RenameCurrentTest : public DBTestBase, + public testing::WithParamInterface { + public: + RenameCurrentTest() + : DBTestBase("rename_current_test", /*env_do_fsync=*/true), + sync_point_(GetParam()) {} + + ~RenameCurrentTest() override {} + + void SetUp() override { + env_->no_file_overwrite_.store(true, std::memory_order_release); + } + + void TearDown() override { + env_->no_file_overwrite_.store(false, std::memory_order_release); + } + + void SetupSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) { + Status* s = reinterpret_cast(arg); + assert(s); + *s = Status::IOError("Injected IO error."); + }); + } + + const std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest, + ::testing::Values("SetCurrentFile:BeforeRename", + "SetCurrentFile:AfterRename")); + +TEST_P(RenameCurrentTest, Open) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + 
SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = TryReopen(options); + ASSERT_NOK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); +} + +TEST_P(RenameCurrentTest, Flush) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("key", "value")); + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(Flush()); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST_P(RenameCurrentTest, Compaction) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("a", "a_value")); + ASSERT_OK(Put("c", "c_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b", "b_value")); + ASSERT_OK(Put("d", "d_value")); + ASSERT_OK(Flush()); + + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ("d_value", Get("d")); +} + +TEST_F(DBTest2, BottommostTemperature) { + Options options = CurrentOptions(); + options.bottommost_temperature = Temperature::kWarm; + options.level0_file_num_compaction_trigger = 2; + options.statistics = CreateDBStatistics(); + Reopen(options); + + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + 
ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + // non-bottommost file still has unknown temperature + 
ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("bar")); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // reopen and check the information is persisted + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // check other non-exist temperatures + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty( + 
DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); + + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); +} + +TEST_F(DBTest2, BottommostTemperatureUniversal) { + const int kTriggerNum = 3; + const int kNumLevels = 5; + const int kBottommostLevel = kNumLevels - 1; + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = kTriggerNum; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + DestroyAndReopen(options); + + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + + for (int i = 0; i < kTriggerNum; i++) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, + metadata.levels[kBottommostLevel].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); 
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + + // Update bottommost temperature + options.bottommost_temperature = Temperature::kWarm; + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + // Should not impact existing ones + ASSERT_EQ(Temperature::kUnknown, + metadata.levels[kBottommostLevel].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + 
ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + + // new generated file should have the new settings + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, + metadata.levels[kBottommostLevel].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + // non-bottommost file still has unknown temperature + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // check other non-exist temperatures + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty( + DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); +} #endif // ROCKSDB_LITE -} // namespace ROCKSDB_NAMESPACE -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** 
argv); +// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery. +TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value0")); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + bool should_inject_error = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RecoverLogFiles:BeforeReadWal", + [&](void* /*arg*/) { should_inject_error = true; }); + SyncPoint::GetInstance()->SetCallBack( + "LogReader::ReadMore:AfterReadFile", [&](void* arg) { + if (should_inject_error) { + ASSERT_NE(nullptr, arg); + *reinterpret_cast(arg) = Status::IOError("Injected IOError"); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + options.avoid_flush_during_recovery = true; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsIOError()); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:Start:1", + "PointInTimeRecoveryWithSyncFailureInCFCreation:1"}, + {"PointInTimeRecoveryWithSyncFailureInCFCreation:2", + "DBImpl::BackgroundCallFlush:Start:2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CreateColumnFamilies({"test1"}, Options()); + ASSERT_OK(Put("foo", "bar")); + + // Creating a CF when a flush is going on, log is synced but the + // closed log file is not synced and corrupted. 
+ port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); }); + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1"); + CreateColumnFamilies({"test2"}, Options()); + env_->corrupt_in_sync_ = true; + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2"); + flush_thread.join(); + env_->corrupt_in_sync_ = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Reopening the DB should not corrupt anything + Options options = CurrentOptions(); + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + ReopenWithColumnFamilies({"default", "test1", "test2"}, options); +} + +TEST_F(DBTest2, RenameDirectory) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value0")); + Close(); + auto old_dbname = dbname_; + auto new_dbname = dbname_ + "_2"; + EXPECT_OK(env_->RenameFile(dbname_, new_dbname)); + options.create_if_missing = false; + dbname_ = new_dbname; + ASSERT_OK(TryReopen(options)); + ASSERT_EQ("value0", Get("foo")); + Destroy(options); + dbname_ = old_dbname; +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, GetLatestSeqAndTsForKey) { + Destroy(last_options_); + + Options options = CurrentOptions(); + options.max_write_buffer_size_to_maintain = 64 << 10; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.comparator = test::ComparatorWithU64Ts(); + options.statistics = CreateDBStatistics(); + + Reopen(options); + + constexpr uint64_t kTsU64Value = 12; + + for (uint64_t key = 0; key < 100; ++key) { + std::string ts_str; + PutFixed64(&ts_str, kTsU64Value); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + + std::string key_str; + PutFixed64(&key_str, key); + std::reverse(key_str.begin(), key_str.end()); + ASSERT_OK(Put(key_str, "value", write_opts)); + } + + ASSERT_OK(Flush()); + + constexpr bool cache_only = true; + constexpr SequenceNumber lower_bound_seq = 0; + auto* cfhi = static_cast_with_check( + 
dbfull()->DefaultColumnFamily()); + assert(cfhi); + assert(cfhi->cfd()); + SuperVersion* sv = cfhi->cfd()->GetSuperVersion(); + for (uint64_t key = 0; key < 100; ++key) { + std::string key_str; + PutFixed64(&key_str, key); + std::reverse(key_str.begin(), key_str.end()); + std::string ts; + SequenceNumber seq = kMaxSequenceNumber; + bool found_record_for_key = false; + bool is_blob_index = false; + + const Status s = dbfull()->GetLatestSequenceForKey( + sv, key_str, cache_only, lower_bound_seq, &seq, &ts, + &found_record_for_key, &is_blob_index); + ASSERT_OK(s); + std::string expected_ts; + PutFixed64(&expected_ts, kTsU64Value); + ASSERT_EQ(expected_ts, ts); + ASSERT_TRUE(found_record_for_key); + ASSERT_FALSE(is_blob_index); + } + + // Verify that no read to SST files. + ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0)); +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,21 +8,35 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" + #include "db/forward_iterator.h" +#include "env/mock_env.h" +#include "rocksdb/convenience.h" #include "rocksdb/env_encryption.h" +#include "rocksdb/unique_id.h" #include "rocksdb/utilities/object_registry.h" +#include "table/format.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { +namespace { +int64_t MaybeCurrentTime(Env* env) { + int64_t time = 1337346000; // arbitrary fallback default + env->GetCurrentTime(&time).PermitUncheckedError(); + return time; +} +} // namespace + // Special Env used to delay background operations -SpecialEnv::SpecialEnv(Env* base) +SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep) : EnvWrapper(base), + maybe_starting_time_(MaybeCurrentTime(base)), rnd_(301), sleep_counter_(this), - addon_time_(0), - time_elapse_only_sleep_(false), - no_slowdown_(false) { + time_elapse_only_sleep_(time_elapse_only_sleep), + no_slowdown_(time_elapse_only_sleep) { delay_sstable_sync_.store(false, std::memory_order_release); drop_writes_.store(false, std::memory_order_release); no_space_.store(false, std::memory_order_release); @@ -32,6 +46,7 @@ manifest_sync_error_.store(false, std::memory_order_release); manifest_write_error_.store(false, std::memory_order_release); log_write_error_.store(false, std::memory_order_release); + no_file_overwrite_.store(false, std::memory_order_release); random_file_open_counter_.store(0, std::memory_order_relaxed); delete_count_.store(0, std::memory_order_relaxed); num_open_wal_file_.store(0); @@ -43,37 +58,33 @@ non_writable_count_ = 0; table_write_callback_ = nullptr; } -#ifndef ROCKSDB_LITE -ROT13BlockCipher rot13Cipher_(16); -#endif // ROCKSDB_LITE - -DBTestBase::DBTestBase(const std::string path) +DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - 
Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + ConfigOptions config_options; + EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); if (getenv("MEM_ENV")) { - mem_env_ = new MockEnv(base_env); + mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock()); } #ifndef ROCKSDB_LITE if (getenv("ENCRYPTED_ENV")) { - encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, - new CTREncryptionProvider(rot13Cipher_)); + std::shared_ptr provider; + std::string provider_id = getenv("ENCRYPTED_ENV"); + if (provider_id.find("=") == std::string::npos && + !EndsWith(provider_id, "://test")) { + provider_id = provider_id + "://test"; + } + EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id, + &provider)); + encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider); } #endif // !ROCKSDB_LITE env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_ : (mem_env_ ? 
mem_env_ : base_env)); env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); + env_->skip_fsync_ = !env_do_fsync; dbname_ = test::PerThreadDBPath(env_, path); alternative_wal_dir_ = dbname_ + "/wal"; alternative_db_log_dir_ = dbname_ + "/db_log_dir"; @@ -189,28 +200,28 @@ Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompaction) { option_config_ = kUniversalCompactionMultiLevel; Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompactionMultiLevel) { option_config_ = kLevelSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kLevelSubcompactions) { option_config_ = kUniversalSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -225,7 +236,7 @@ auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kDBLogDir) { option_config_ = kWalDirAndMmapReads; @@ -233,14 +244,14 @@ auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kWalDirAndMmapReads) { option_config_ = kRecycleLogFiles; Destroy(last_options_); auto options = CurrentOptions(); Destroy(options); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -320,7 +331,7 @@ return GetOptions(option_config_, default_options, options_override); } -Options DBTestBase::GetDefaultOptions() { +Options 
DBTestBase::GetDefaultOptions() const { Options options; options.write_buffer_size = 4090 * 4096; options.target_file_size_base = 2 * 1024 * 1024; @@ -328,6 +339,10 @@ options.max_open_files = 5000; options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; options.compaction_pri = CompactionPri::kByCompensatedSize; + options.env = env_; + if (!env_->skip_fsync_) { + options.track_and_verify_wals_in_manifest = true; + } return options; } @@ -356,28 +371,28 @@ options.unordered_write = false; break; case kPlainTableFirstBytePrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefixNonMmap: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = false; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableAllBytesPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; @@ -399,20 +414,7 @@ options.use_direct_reads = true; options.use_direct_io_for_flush_and_compaction = true; options.compaction_readahead_size = 2 * 1024 * 
1024; - #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ - !defined(OS_AIX) && !defined(OS_OPENBSD) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "NewWritableFile:O_DIRECT", [&](void* arg) { - int* val = static_cast(arg); - *val &= ~O_DIRECT; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "NewRandomAccessFile:O_DIRECT", [&](void* arg) { - int* val = static_cast(arg); - *val &= ~O_DIRECT; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); -#endif + SetupSyncPointsToMockDirectIO(); break; } #endif // ROCKSDB_LITE @@ -474,16 +476,15 @@ case kInfiniteMaxOpenFiles: options.max_open_files = -1; break; - case kxxHashChecksum: { - table_options.checksum = kxxHash; - break; - } - case kxxHash64Checksum: { - table_options.checksum = kxxHash64; + case kXXH3Checksum: { + table_options.checksum = kXXH3; + // Thrown in here for basic coverage: + options.DisableExtraChecks(); break; } case kFIFOCompaction: { options.compaction_style = kCompactionStyleFIFO; + options.max_open_files = -1; break; } case kBlockBasedTableWithPrefixHashIndex: { @@ -497,6 +498,7 @@ break; } case kBlockBasedTableWithPartitionedIndex: { + table_options.format_version = 3; table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.prefix_extractor.reset(NewNoopTransform()); break; @@ -517,6 +519,11 @@ table_options.index_block_restart_interval = 8; break; } + case kBlockBasedTableWithLatestFormat: { + // In case different from default + table_options.format_version = kLatestFormatVersion; + break; + } case kOptimizeFiltersForHits: { options.optimize_filters_for_hits = true; set_block_based_table_factory = true; @@ -608,6 +615,39 @@ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } +void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) { + time_elapse_only_sleep_on_reopen_ = true; + + // Need to disable stats dumping and persisting which also use + // RepeatableThread, which uses 
InstrumentedCondVar::TimedWaitInternal. + // With time_elapse_only_sleep_, this can hang on some platforms (MacOS) + // because (a) on some platforms, pthread_cond_timedwait does not appear + // to release the lock for other threads to operate if the deadline time + // is already passed, and (b) TimedWait calls are currently a bad abstraction + // because the deadline parameter is usually computed from Env time, + // but is interpreted in real clock time. + options->stats_dump_period_sec = 0; + options->stats_persist_period_sec = 0; +} + +void DBTestBase::MaybeInstallTimeElapseOnlySleep(const DBOptions& options) { + if (time_elapse_only_sleep_on_reopen_) { + assert(options.env == env_ || + static_cast_with_check(options.env) + ->env_target() == env_); + assert(options.stats_dump_period_sec == 0); + assert(options.stats_persist_period_sec == 0); + // We cannot set these before destroying the last DB because they might + // cause a deadlock or similar without the appropriate options set in + // the DB. + env_->time_elapse_only_sleep_ = true; + env_->no_slowdown_ = true; + } else { + // Going back in same test run is not yet supported, so no + // reset in this case. 
+ } +} + Status DBTestBase::TryReopenWithColumnFamilies( const std::vector& cfs, const std::vector& options) { Close(); @@ -618,6 +658,7 @@ } DBOptions db_opts = DBOptions(options[0]); last_options_ = options[0]; + MaybeInstallTimeElapseOnlySleep(db_opts); return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); } @@ -634,7 +675,7 @@ void DBTestBase::Close() { for (auto h : handles_) { - db_->DestroyColumnFamilyHandle(h); + EXPECT_OK(db_->DestroyColumnFamilyHandle(h)); } handles_.clear(); delete db_; @@ -644,7 +685,7 @@ void DBTestBase::DestroyAndReopen(const Options& options) { // Destroy using last options Destroy(last_options_); - ASSERT_OK(TryReopen(options)); + Reopen(options); } void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) { @@ -652,7 +693,8 @@ if (delete_cf_paths) { for (size_t i = 0; i < handles_.size(); ++i) { ColumnFamilyDescriptor cfdescriptor; - handles_[i]->GetDescriptor(&cfdescriptor); + // GetDescriptor is not implemented for ROCKSDB_LITE + handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError(); column_families.push_back(cfdescriptor); } } @@ -661,6 +703,7 @@ } Status DBTestBase::ReadOnlyReopen(const Options& options) { + MaybeInstallTimeElapseOnlySleep(options); return DB::OpenForReadOnly(options, dbname_, &db_); } @@ -670,11 +713,12 @@ // Note: operator= is an unsafe approach here since it destructs // std::shared_ptr in the same order of their creation, in contrast to // destructors which destructs them in the opposite order of creation. One - // particular problme is that the cache destructor might invoke callback + // particular problem is that the cache destructor might invoke callback // functions that use Option members such as statistics. To work around this - // problem, we manually call destructor of table_facotry which eventually + // problem, we manually call destructor of table_factory which eventually // clears the block cache. 
last_options_ = options; + MaybeInstallTimeElapseOnlySleep(options); return DB::Open(options, dbname_, &db_); } @@ -909,12 +953,13 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); + ReadOptions read_options; ScopedArenaIterator iter; if (cf == 0) { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber)); } else { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); @@ -927,7 +972,8 @@ bool first = true; while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - if (!ParseInternalKey(iter->key(), &ikey)) { + if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) != + Status::OK()) { result += "CORRUPTED"; } else { if (!last_options_.comparator->Equal(ikey.user_key, user_key)) { @@ -1029,12 +1075,12 @@ std::string property; if (cf == 0) { // default cfd - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + EXPECT_TRUE(db_->GetProperty("rocksdb.num-files-at-level" + ToString(level), + &property)); } else { - EXPECT_TRUE(db_->GetProperty( - handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), - &property)); + EXPECT_TRUE(db_->GetProperty(handles_[cf], + "rocksdb.num-files-at-level" + ToString(level), + &property)); } return atoi(property.c_str()); } @@ -1044,12 +1090,10 @@ if (cf == 0) { // default cfd EXPECT_TRUE(db_->GetProperty( - "rocksdb.compression-ratio-at-level" + NumberToString(level), - &property)); + "rocksdb.compression-ratio-at-level" + ToString(level), &property)); } else { EXPECT_TRUE(db_->GetProperty( - handles_[cf], - "rocksdb.compression-ratio-at-level" + NumberToString(level), + 
handles_[cf], "rocksdb.compression-ratio-at-level" + ToString(level), &property)); } return std::stod(property); @@ -1084,29 +1128,77 @@ result.resize(last_non_zero_offset); return result; } + #endif // !ROCKSDB_LITE +std::vector DBTestBase::GetBlobFileNumbers() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + std::vector result; + result.reserve(blob_files.size()); + + for (const auto& blob_file : blob_files) { + result.emplace_back(blob_file.first); + } + + return result; +} + size_t DBTestBase::CountFiles() { + size_t count = 0; std::vector files; - env_->GetChildren(dbname_, &files); + if (env_->GetChildren(dbname_, &files).ok()) { + count += files.size(); + } - std::vector logfiles; if (dbname_ != last_options_.wal_dir) { - env_->GetChildren(last_options_.wal_dir, &logfiles); + if (env_->GetChildren(last_options_.wal_dir, &files).ok()) { + count += files.size(); + } } - return files.size() + logfiles.size(); + return count; +}; + +Status DBTestBase::CountFiles(size_t* count) { + std::vector files; + Status s = env_->GetChildren(dbname_, &files); + if (!s.ok()) { + return s; + } + size_t files_count = files.size(); + + if (dbname_ != last_options_.wal_dir) { + s = env_->GetChildren(last_options_.wal_dir, &files); + if (!s.ok()) { + return s; + } + *count = files_count + files.size(); + } + + return Status::OK(); } -uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) { +Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf, + uint64_t* size) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 
1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit, @@ -1147,9 +1239,9 @@ void DBTestBase::MoveFilesToLevel(int level, int cf) { for (int l = 0; l < level; ++l) { if (cf > 0) { - dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]); + EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf])); } else { - dbfull()->TEST_CompactRange(l, nullptr, nullptr); + EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr)); } } } @@ -1176,7 +1268,7 @@ void DBTestBase::GetSstFiles(Env* env, std::string path, std::vector* files) { - env->GetChildren(path, files); + EXPECT_OK(env->GetChildren(path, files)); files->erase( std::remove_if(files->begin(), files->end(), [](std::string name) { @@ -1196,24 +1288,24 @@ void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx, bool nowait) { for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { - ASSERT_OK(Put(cf, Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990))); + ASSERT_OK(Put(cf, Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990))); (*key_idx)++; } if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } // this will generate non-overlapping files since it keeps increasing key_idx void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) { for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { - ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990))); + ASSERT_OK(Put(Key(*key_idx), rnd->RandomString((i == 99) ? 
1 : 990))); (*key_idx)++; } if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -1221,12 +1313,12 @@ void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) { for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) { - ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 2000))); + ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(2000))); } - ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 200))); + ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200))); if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -1324,21 +1416,22 @@ kMaxSequenceNumber /* upper_bound */); // This should be defined after range_del_agg so that it destructs the // assigned iterator before it range_del_agg is already destructed. 
+ ReadOptions read_options; ScopedArenaIterator iter; if (cf != 0) { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[cf])); } else { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber)); } iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); + ASSERT_OK(iter->status()); int seq = numValues; while (iter->Valid()) { ParsedInternalKey ikey; ikey.clear(); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); // checks sequence number for updates ASSERT_EQ(ikey.sequence, (unsigned)seq--); @@ -1371,36 +1464,40 @@ ASSERT_OK(destfile->Close()); } -std::unordered_map DBTestBase::GetAllSSTFiles( - uint64_t* total_size) { - std::unordered_map res; - +Status DBTestBase::GetAllDataFiles( + const FileType file_type, std::unordered_map* files, + uint64_t* total_size /* = nullptr */) { if (total_size) { *total_size = 0; } - std::vector files; - env_->GetChildren(dbname_, &files); - for (auto& file_name : files) { - uint64_t number; - FileType type; - std::string file_path = dbname_ + "/" + file_name; - if (ParseFileName(file_name, &number, &type) && type == kTableFile) { - uint64_t file_size = 0; - env_->GetFileSize(file_path, &file_size); - res[file_path] = file_size; - if (total_size) { - *total_size += file_size; + std::vector children; + Status s = env_->GetChildren(dbname_, &children); + if (s.ok()) { + for (auto& file_name : children) { + uint64_t number; + FileType type; + if (ParseFileName(file_name, &number, &type) && type == file_type) { + std::string file_path = dbname_ + "/" + file_name; + uint64_t file_size = 0; + s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + break; + } + (*files)[file_path] = file_size; + if (total_size) { + 
*total_size += file_size; + } } } } - return res; + return s; } std::vector DBTestBase::ListTableFiles(Env* env, const std::string& path) { std::vector files; std::vector file_numbers; - env->GetChildren(path, &files); + EXPECT_OK(env->GetChildren(path, &files)); uint64_t number; FileType type; for (size_t i = 0; i < files.size(); ++i) { @@ -1532,13 +1629,14 @@ InternalKeyComparator icmp(last_options_.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); - auto iter = - dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber); + ReadOptions read_options; + auto iter = dbfull()->NewInternalIterator(read_options, &arena, + &range_del_agg, kMaxSequenceNumber); iter->SeekToFirst(); for (auto p : true_data) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey ikey; - ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_EQ(p.first, ikey.user_key); ASSERT_EQ(p.second, iter->value()); iter->Next(); @@ -1561,4 +1659,14 @@ } #endif // ROCKSDB_LITE +void VerifySstUniqueIds(const TablePropertiesCollection& props) { + ASSERT_FALSE(props.empty()); // suspicious test if empty + std::unordered_set seen; + for (auto& pair : props) { + std::string id; + ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id)); + ASSERT_TRUE(seen.insert(id).second); + } +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,9 @@ #pragma once #include -#include #include +#include #include #include #include @@ -22,10 +22,7 @@ #include #include "db/db_impl/db_impl.h" -#include "db/dbformat.h" -#include "env/mock_env.h" #include "file/filename.h" 
-#include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" @@ -38,21 +35,18 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" -#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" -#include "test_util/mock_time_env.h" -#include "util/compression.h" -#include "util/mutexlock.h" - #include "test_util/sync_point.h" #include "test_util/testharness.h" -#include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/compression.h" +#include "util/mutexlock.h" #include "util/string_util.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { +class MockEnv; namespace anon { class AtomicCounter { @@ -116,98 +110,13 @@ enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 }; -// A hacky skip list mem table that triggers flush after number of entries. -class SpecialMemTableRep : public MemTableRep { - public: - explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable, - int num_entries_flush) - : MemTableRep(allocator), - memtable_(memtable), - num_entries_flush_(num_entries_flush), - num_entries_(0) {} - - virtual KeyHandle Allocate(const size_t len, char** buf) override { - return memtable_->Allocate(len, buf); - } - - // Insert key into the list. - // REQUIRES: nothing that compares equal to key is currently in the list. - virtual void Insert(KeyHandle handle) override { - num_entries_++; - memtable_->Insert(handle); - } - - void InsertConcurrently(KeyHandle handle) override { - num_entries_++; - memtable_->Insert(handle); - } - - // Returns true iff an entry that compares equal to key is in the list. 
- virtual bool Contains(const char* key) const override { - return memtable_->Contains(key); - } - - virtual size_t ApproximateMemoryUsage() override { - // Return a high memory usage when number of entries exceeds the threshold - // to trigger a flush. - return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024; - } - - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override { - memtable_->Get(k, callback_args, callback_func); - } - - uint64_t ApproximateNumEntries(const Slice& start_ikey, - const Slice& end_ikey) override { - return memtable_->ApproximateNumEntries(start_ikey, end_ikey); - } - - virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { - return memtable_->GetIterator(arena); - } - - virtual ~SpecialMemTableRep() override {} - - private: - std::unique_ptr memtable_; - int num_entries_flush_; - int num_entries_; -}; - -// The factory for the hacky skip list mem table that triggers flush after -// number of entries exceeds a threshold. -class SpecialSkipListFactory : public MemTableRepFactory { - public: - // After number of inserts exceeds `num_entries_flush` in a mem table, trigger - // flush. 
- explicit SpecialSkipListFactory(int num_entries_flush) - : num_entries_flush_(num_entries_flush) {} - - using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* /*logger*/) override { - return new SpecialMemTableRep( - allocator, factory_.CreateMemTableRep(compare, allocator, transform, 0), - num_entries_flush_); - } - virtual const char* Name() const override { return "SkipListFactory"; } - - bool IsInsertConcurrentlySupported() const override { - return factory_.IsInsertConcurrentlySupported(); - } - - private: - SkipListFactory factory_; - int num_entries_flush_; -}; - // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: - explicit SpecialEnv(Env* base); + explicit SpecialEnv(Env* base, bool time_elapse_only_sleep = false); + + static const char* kClassName() { return "SpecialEnv"; } + const char* Name() const override { return kClassName(); } Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& soptions) override { @@ -233,6 +142,11 @@ return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { if (env_->table_write_callback_) { (*env_->table_write_callback_)(); @@ -247,6 +161,11 @@ return base_->PositionedAppend(data, offset); } } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& /* verification_info */) override { + return PositionedAppend(data, offset); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status RangeSync(uint64_t offset, uint64_t nbytes) override { Status s = base_->RangeSync(offset, nbytes); @@ -276,7 +195,10 @@ while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) { 
env_->SleepForMicroseconds(100000); } - Status s = base_->Sync(); + Status s; + if (!env_->skip_fsync_) { + s = base_->Sync(); + } #if !(defined NDEBUG) || !defined(OS_WIN) TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s); #endif // !(defined NDEBUG) || !defined(OS_WIN) @@ -294,6 +216,9 @@ Status Allocate(uint64_t offset, uint64_t len) override { return base_->Allocate(offset, len); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return base_->GetUniqueId(id, max_size); + } }; class ManifestFile : public WritableFile { public: @@ -306,6 +231,12 @@ return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status Close() override { return base_->Close(); } Status Flush() override { return base_->Flush(); } @@ -314,7 +245,11 @@ if (env_->manifest_sync_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated sync error"); } else { - return base_->Sync(); + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } } } uint64_t GetFileSize() override { return base_->GetFileSize(); } @@ -353,15 +288,26 @@ #endif return s; } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } + void PrepareWrite(size_t offset, size_t len) override { + base_->PrepareWrite(offset, len); + } + void SetPreallocationBlockSize(size_t size) override { + base_->SetPreallocationBlockSize(size); + } Status Close() override { // SyncPoint is not supported in Released Windows Mode. #if !(defined NDEBUG) || !defined(OS_WIN) // Check preallocation size - // preallocation size is never passed to base file. 
- size_t preallocation_size = preallocation_block_size(); + size_t block_size, last_allocated_block; + base_->GetPreallocationStatus(&block_size, &last_allocated_block); TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus", - &preallocation_size); + &block_size); #endif // !(defined NDEBUG) || !defined(OS_WIN) return base_->Close(); @@ -369,7 +315,15 @@ Status Flush() override { return base_->Flush(); } Status Sync() override { ++env_->sync_counter_; - return base_->Sync(); + if (env_->corrupt_in_sync_) { + EXPECT_OK(Append(std::string(33000, ' '))); + return Status::IOError("Ingested Sync Failure"); + } + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } } bool IsSyncThreadSafe() const override { return env_->is_wal_sync_thread_safe_.load(); @@ -382,6 +336,40 @@ SpecialEnv* env_; std::unique_ptr base_; }; + class OtherFile : public WritableFile { + public: + OtherFile(SpecialEnv* env, std::unique_ptr&& b) + : env_(env), base_(std::move(b)) {} + Status Append(const Slice& data) override { return base_->Append(data); } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status Close() override { return base_->Close(); } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } + } + uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + + private: + SpecialEnv* env_; + std::unique_ptr base_; + }; + + if (no_file_overwrite_.load(std::memory_order_acquire) && + target()->FileExists(f).ok()) { + return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true."); + } if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { uint32_t 
random_number; @@ -416,6 +404,8 @@ r->reset(new ManifestFile(this, std::move(*r))); } else if (strstr(f.c_str(), "log") != nullptr) { r->reset(new WalFile(this, std::move(*r))); + } else { + r->reset(new OtherFile(this, std::move(*r))); } } return s; @@ -452,12 +442,44 @@ std::atomic* bytes_read_; }; + class RandomFailureFile : public RandomAccessFile { + public: + RandomFailureFile(std::unique_ptr&& target, + std::atomic* failure_cnt, uint32_t fail_odd) + : target_(std::move(target)), + fail_cnt_(failure_cnt), + fail_odd_(fail_odd) {} + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + if (Random::GetTLSInstance()->OneIn(fail_odd_)) { + fail_cnt_->fetch_add(1); + return Status::IOError("random error"); + } + return target_->Read(offset, n, result, scratch); + } + + virtual Status Prefetch(uint64_t offset, size_t n) override { + return target_->Prefetch(offset, n); + } + + private: + std::unique_ptr target_; + std::atomic* fail_cnt_; + uint32_t fail_odd_; + }; + Status s = target()->NewRandomAccessFile(f, r, soptions); random_file_open_counter_++; - if (s.ok() && count_random_reads_) { - r->reset(new CountingFile(std::move(*r), &random_read_counter_, - &random_read_bytes_counter_)); + if (s.ok()) { + if (count_random_reads_) { + r->reset(new CountingFile(std::move(*r), &random_read_counter_, + &random_read_bytes_counter_)); + } else if (rand_reads_fail_odd_ > 0) { + r->reset(new RandomFailureFile(std::move(*r), &num_reads_fails_, + rand_reads_fail_odd_)); + } } + if (s.ok() && soptions.compaction_readahead_size > 0) { compaction_readahead_size_ = soptions.compaction_readahead_size; } @@ -493,20 +515,35 @@ virtual void SleepForMicroseconds(int micros) override { sleep_counter_.Increment(); if (no_slowdown_ || time_elapse_only_sleep_) { - addon_time_.fetch_add(micros); + addon_microseconds_.fetch_add(micros); } if (!no_slowdown_) { target()->SleepForMicroseconds(micros); } } + void MockSleepForMicroseconds(int64_t 
micros) { + sleep_counter_.Increment(); + assert(no_slowdown_); + addon_microseconds_.fetch_add(micros); + } + + void MockSleepForSeconds(int64_t seconds) { + sleep_counter_.Increment(); + assert(no_slowdown_); + addon_microseconds_.fetch_add(seconds * 1000000); + } + virtual Status GetCurrentTime(int64_t* unix_time) override { Status s; - if (!time_elapse_only_sleep_) { + if (time_elapse_only_sleep_) { + *unix_time = maybe_starting_time_; + } else { s = target()->GetCurrentTime(unix_time); } if (s.ok()) { - *unix_time += addon_time_.load(); + // mock microseconds elapsed to seconds of time + *unix_time += addon_microseconds_.load() / 1000000; } return s; } @@ -518,12 +555,12 @@ virtual uint64_t NowNanos() override { return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) + - addon_time_.load() * 1000; + addon_microseconds_.load() * 1000; } virtual uint64_t NowMicros() override { return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) + - addon_time_.load(); + addon_microseconds_.load(); } virtual Status DeleteFile(const std::string& fname) override { @@ -531,6 +568,37 @@ return target()->DeleteFile(fname); } + void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; } + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override { + if (!skip_fsync_) { + return target()->NewDirectory(name, result); + } else { + class NoopDirectory : public Directory { + public: + NoopDirectory() {} + ~NoopDirectory() {} + + Status Fsync() override { return Status::OK(); } + }; + + result->reset(new NoopDirectory()); + return Status::OK(); + } + } + + Status RenameFile(const std::string& src, const std::string& dest) override { + rename_count_.fetch_add(1); + if (rename_error_.load(std::memory_order_acquire)) { + return Status::NotSupported("Simulated `RenameFile()` error."); + } + return target()->RenameFile(src, dest); + } + + // Something to return when mocking current time + const int64_t maybe_starting_time_; + Random rnd_; port::Mutex 
rnd_mutex_; // Lock to pretect rnd_ @@ -555,13 +623,21 @@ // Force write to log files to fail while this pointer is non-nullptr std::atomic log_write_error_; + // Force `RenameFile()` to fail while this pointer is non-nullptr + std::atomic rename_error_{false}; + // Slow down every log write, in micro-seconds. std::atomic log_write_slowdown_; + // If true, returns Status::NotSupported for file overwrite. + std::atomic no_file_overwrite_; + // Number of WAL files that are still open for write. std::atomic num_open_wal_file_; bool count_random_reads_; + uint32_t rand_reads_fail_odd_ = 0; + std::atomic num_reads_fails_; anon::AtomicCounter random_read_counter_; std::atomic random_read_bytes_counter_; std::atomic random_file_open_counter_; @@ -575,6 +651,12 @@ std::atomic sync_counter_; + // If true, all fsync to files and directories are skipped. + bool skip_fsync_ = false; + + // If true, ingest the corruption to file during sync. + bool corrupt_in_sync_ = false; + std::atomic non_writeable_rate_; std::atomic new_writable_count_; @@ -583,25 +665,33 @@ std::function* table_write_callback_; - std::atomic addon_time_; - std::atomic now_cpu_count_; std::atomic delete_count_; - std::atomic time_elapse_only_sleep_; - - bool no_slowdown_; + std::atomic rename_count_{0}; std::atomic is_wal_sync_thread_safe_{true}; std::atomic compaction_readahead_size_{}; + + private: // accessing these directly is prone to error + friend class DBTestBase; + + std::atomic addon_microseconds_{0}; + + // Do not modify in the env of a running DB (could cause deadlock) + std::atomic time_elapse_only_sleep_; + + bool no_slowdown_; }; #ifndef ROCKSDB_LITE class OnFileDeletionListener : public EventListener { public: OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "OnFileDeletionListener"; } void SetExpectedFileName(const std::string file_name) { expected_file_name_ = 
file_name; @@ -623,6 +713,19 @@ size_t matched_count_; std::string expected_file_name_; }; + +class FlushCounterListener : public EventListener { + public: + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "FlushCounterListener"; } + std::atomic count{0}; + std::atomic expected_flush_reason{FlushReason::kOthers}; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason); + } +}; #endif // A test merge operator mimics put but also fails if one of merge operands is @@ -647,6 +750,86 @@ virtual const char* Name() const override { return "TestPutOperator"; } }; +// A wrapper around Cache that can easily be extended with instrumentation, +// etc. +class CacheWrapper : public Cache { + public: + explicit CacheWrapper(std::shared_ptr target) + : target_(std::move(target)) {} + + const char* Name() const override { return target_->Name(); } + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { + return target_->Lookup(key, stats); + } + + bool Ref(Handle* handle) override { return target_->Ref(handle); } + + using Cache::Release; + bool Release(Handle* handle, bool force_erase = false) override { + return target_->Release(handle, force_erase); + } + + void* Value(Handle* handle) override { return target_->Value(handle); } + + void Erase(const Slice& key) override { target_->Erase(key); } + uint64_t NewId() override { return target_->NewId(); } + + void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) 
override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + size_t GetCapacity() const override { return target_->GetCapacity(); } + + size_t GetUsage() const override { return target_->GetUsage(); } + + size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } + + size_t GetCharge(Handle* handle) const override { + return target_->GetCharge(handle); + } + + DeleterFn GetDeleter(Handle* handle) const override { + return target_->GetDeleter(handle); + } + + void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + + void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } + + protected: + std::shared_ptr target_; +}; + class DBTestBase : public testing::Test { public: // Sequence of option configurations to try @@ -674,7 +857,7 @@ kUniversalCompactionMultiLevel = 20, kCompressedBlockCache = 21, kInfiniteMaxOpenFiles = 22, - kxxHashChecksum = 23, + kXXH3Checksum = 23, kFIFOCompaction = 24, kOptimizeFiltersForHits = 25, kRowCache = 26, @@ -687,9 +870,9 @@ kBlockBasedTableWithIndexRestartInterval, kBlockBasedTableWithPartitionedIndex, kBlockBasedTableWithPartitionedIndexFormat4, + kBlockBasedTableWithLatestFormat, kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, - kxxHash64Checksum, kUnorderedWrite, // This must be the last line kEnd, @@ -730,16 +913,13 @@ // requires. kSkipMmapReads; - explicit DBTestBase(const std::string path); + // `env_do_fsync` decides whether the special Env would do real + // fsync for files and directories. 
Skipping fsync can speed up + // tests, but won't cover the exact fsync logic. + DBTestBase(const std::string path, bool env_do_fsync); ~DBTestBase(); - static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; - } - static std::string Key(int i) { char buf[100]; snprintf(buf, sizeof(buf), "key%06d", i); @@ -773,14 +953,17 @@ const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; - static Options GetDefaultOptions(); + Options GetDefaultOptions() const; + + Options GetOptions(int option_config) const { + return GetOptions(option_config, GetDefaultOptions()); + } - Options GetOptions(int option_config, - const Options& default_options = GetDefaultOptions(), + Options GetOptions(int option_config, const Options& default_options, const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; - DBImpl* dbfull() { return reinterpret_cast(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_); } void CreateColumnFamilies(const std::vector& cfs, const Options& options); @@ -886,12 +1069,20 @@ int TotalTableFiles(int cf = 0, int levels = -1); #endif // ROCKSDB_LITE + std::vector GetBlobFileNumbers(); + // Return spread of files per level std::string FilesPerLevel(int cf = 0); size_t CountFiles(); - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0); + Status CountFiles(size_t* count); + + Status Size(const Slice& start, const Slice& limit, uint64_t* size) { + return Size(start, limit, 0, size); + } + + Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size); void Compact(int cf, const Slice& start, const Slice& limit, uint32_t target_path_id); @@ -969,8 +1160,9 @@ void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0); - std::unordered_map GetAllSSTFiles( - uint64_t* total_size = nullptr); + Status GetAllDataFiles(const FileType file_type, + std::unordered_map* sst_files, + uint64_t* 
total_size = nullptr); std::vector ListTableFiles(Env* env, const std::string& path); @@ -995,6 +1187,19 @@ Tickers ticker_type) { return options.statistics->getAndResetTickerCount(ticker_type); } + + // Note: reverting this setting within the same test run is not yet + // supported + void SetTimeElapseOnlySleepOnReopen(DBOptions* options); + + private: // Prone to error on direct use + void MaybeInstallTimeElapseOnlySleep(const DBOptions& options); + + bool time_elapse_only_sleep_on_reopen_ = false; }; +// For verifying that all files generated by current version have SST +// unique ids. +void VerifySstUniqueIds(const TablePropertiesCollection& props); + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,8 @@ #if !defined(ROCKSDB_LITE) #include "rocksdb/utilities/table_properties_collectors.h" #include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -25,8 +27,8 @@ : public DBTestBase, public ::testing::WithParamInterface> { public: - explicit DBTestUniversalCompactionBase( - const std::string& path) : DBTestBase(path) {} + explicit DBTestUniversalCompactionBase(const std::string& path) + : DBTestBase(path, /*env_do_fsync=*/false) {} void SetUp() override { num_levels_ = std::get<0>(GetParam()); exclusive_manual_compaction_ = std::get<1>(GetParam()); @@ -43,7 +45,8 @@ class DBTestUniversalCompaction2 : public DBTestBase { public: - DBTestUniversalCompaction2() : DBTestBase("/db_universal_compaction_test2") {} + DBTestUniversalCompaction2() + : DBTestBase("db_universal_compaction_test2", /*env_do_fsync=*/false) {} }; 
namespace { @@ -90,36 +93,6 @@ std::atomic_bool expect_full_compaction_; std::atomic_bool expect_manual_compaction_; }; - -class DelayFilter : public CompactionFilter { - public: - explicit DelayFilter(DBTestBase* d) : db_test(d) {} - bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, - std::string* /*new_value*/, - bool* /*value_changed*/) const override { - db_test->env_->addon_time_.fetch_add(1000); - return true; - } - - const char* Name() const override { return "DelayFilter"; } - - private: - DBTestBase* db_test; -}; - -class DelayFilterFactory : public CompactionFilterFactory { - public: - explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& /*context*/) override { - return std::unique_ptr(new DelayFilter(db_test)); - } - - const char* Name() const override { return "DelayFilterFactory"; } - - private: - DBTestBase* db_test; -}; } // namespace // Make sure we don't trigger a problem if the trigger condtion is given @@ -154,11 +127,11 @@ for (int num = 0; num < 16; num++) { // Write 100KB file. And immediately it should be compacted to one file. 
GenerateNewFile(&rnd, &key_idx); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumSortedRuns(0), 1); } ASSERT_OK(Put(Key(key_idx), "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumSortedRuns(0), 1); } @@ -179,7 +152,7 @@ options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.optimize_filters_for_hits = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.memtable_factory.reset(new SpecialSkipListFactory(3)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(3)); DestroyAndReopen(options); @@ -190,15 +163,15 @@ Env::Priority::LOW); for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { - Put(Key(num * 10), "val"); + ASSERT_OK(Put(Key(num * 10), "val")); if (num) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - Put(Key(30 + num * 10), "val"); - Put(Key(60 + num * 10), "val"); + ASSERT_OK(Put(Key(30 + num * 10), "val")); + ASSERT_OK(Put(Key(60 + num * 10), "val")); } - Put("", ""); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // Query set of non existing keys for (int i = 5; i < 90; i += 10) { @@ -218,7 +191,7 @@ // Unblock compaction and wait it for happening. sleeping_task_low.WakeUp(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // The same queries will not trigger bloom filter for (int i = 5; i < 90; i += 10) { @@ -322,7 +295,7 @@ // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a // new file of size 1. GenerateNewFile(1, &rnd, &key_idx); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Level-0 compaction is triggered, but no file will be picked up. ASSERT_EQ(NumSortedRuns(1), 4); @@ -331,7 +304,7 @@ // a new file of size 1. 
filter->expect_full_compaction_.store(true); GenerateNewFile(1, &rnd, &key_idx); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All files at level 0 will be compacted into a single one. ASSERT_EQ(NumSortedRuns(1), 1); @@ -361,10 +334,10 @@ num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumSortedRuns(1), num + 1); } ASSERT_EQ(NumSortedRuns(1), 2); @@ -374,7 +347,7 @@ // but will instead trigger size amplification. ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that size amplification did occur ASSERT_EQ(NumSortedRuns(1), 1); @@ -419,10 +392,10 @@ num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumSortedRuns(1), num + 1); } ASSERT_EQ(NumSortedRuns(1), 2); @@ -432,7 +405,7 @@ // but could instead trigger size amplification if it's set // to 110. 
ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify compaction did not happen ASSERT_EQ(NumSortedRuns(1), 3); @@ -453,7 +426,7 @@ ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal .max_size_amplification_percent); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that size amplification did happen ASSERT_EQ(NumSortedRuns(1), 1); ASSERT_EQ(total_picked_compactions, 1); @@ -498,10 +471,10 @@ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumSortedRuns(1), num + 1); } ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger); @@ -509,7 +482,7 @@ // Flush whatever is remaining in memtable. This is typically small, about // 30KB. ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify compaction did not happen ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1); ASSERT_EQ(total_picked_compactions, 0); @@ -538,7 +511,7 @@ ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width, 2u); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Files in L0 are approx: 0.3 (30KB), 1, 1, 1. 
// On compaction: the files are below the size amp threshold, so we @@ -576,10 +549,10 @@ ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); Random rnd(301); for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(1, ToString(key), rnd.RandomString(kTestValueSize))); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); std::vector compaction_input_file_names; @@ -639,17 +612,17 @@ // Generate 3 overlapping files Random rnd(301); for (int i = 0; i < 210; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); for (int i = 200; i < 300; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); for (int i = 250; i < 260; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); @@ -659,11 +632,11 @@ compact_options.change_level = true; compact_options.target_level = 4; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); } -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) class DBTestUniversalCompactionMultiLevels : public DBTestUniversalCompactionBase { public: @@ -693,7 +666,7 @@ ASSERT_OK(Put(1, Key(i % num_keys), Key(i))); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = num_keys; i < num_keys * 2; i++) { ASSERT_EQ(Get(1, Key(i % 
num_keys)), Key(i)); @@ -740,7 +713,7 @@ std::vector values; ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(trivial_move, 0); ASSERT_GT(non_trivial_move, 0); @@ -764,6 +737,7 @@ Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.num_levels = num_levels_; + options.env = env_; options.write_buffer_size = 1 << 10; // 1KB options.level0_file_num_compaction_trigger = 3; options.max_background_compactions = 3; @@ -803,7 +777,7 @@ for (int i = 0; i < num_keys * 2; i++) { ASSERT_OK(Put(1, Key(i % num_keys), Key(i))); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(num_compactions_running.load(), 0); @@ -881,7 +855,7 @@ // Hold the 1st compaction from finishing TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // There should only be one picked compaction as the score drops below one // after the first one is picked. 
@@ -929,7 +903,7 @@ // Hold the 1st and 2nd compaction from finishing TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // This time we will trigger a compaction because of size ratio and // another compaction because of number of files that are not compacted @@ -940,7 +914,7 @@ INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel, ::testing::Combine(::testing::Values(1, 10), ::testing::Values(false))); -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { Options options = CurrentOptions(); @@ -960,17 +934,17 @@ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { // Write 100KB (100 values, each 1K) for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); if (num < options.level0_file_num_compaction_trigger - 1) { ASSERT_EQ(NumSortedRuns(1), num + 1); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumSortedRuns(1), 1); } @@ -998,20 +972,20 @@ num++) { // Write 100KB (100 values, each 1K) for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumSortedRuns(), num + 1); } // Generate one more file at level-0, which should trigger level-0 // compaction. 
for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Suppose each file flushed from mem table has size 1. Now we compact // (level0_file_num_compaction_trigger+1)=4 files and should have a big // file of size 4. @@ -1024,25 +998,25 @@ // First, clean up memtable before inserting new data. This will generate // a level-0 file, with size around 0.4 (according to previously written // data amount). - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumSortedRuns(), num + 3); } // Generate one more file at level-0, which should trigger level-0 // compaction. for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. // After compaction, we should have 3 files, with size 4, 0.4, 2. ASSERT_EQ(NumSortedRuns(), 3); @@ -1050,10 +1024,10 @@ // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one // more file at level-0, which should trigger level-0 compaction. 
for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Level-0 compaction is triggered, but no file will be picked up. ASSERT_EQ(NumSortedRuns(), 4); } @@ -1082,8 +1056,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 110000U * 2 * 0.9); @@ -1094,8 +1068,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 110000 * 4 * 0.9); @@ -1107,8 +1081,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 110000 * 6 * 0.9); @@ -1120,8 +1094,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2); } @@ -1150,13 +1124,13 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2); } -#ifndef ROCKSDB_VALGRIND_RUN +#if 
!defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) // Test that checks trivial move in universal compaction TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) { int32_t trivial_move = 0; @@ -1197,7 +1171,7 @@ std::vector values; ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(trivial_move, 0); ASSERT_GT(non_trivial_move, 0); @@ -1243,13 +1217,13 @@ std::vector values; ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(trivial_move, 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) { Options options = CurrentOptions(); @@ -1258,7 +1232,7 @@ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleUniversal; options.compaction_options_universal.size_ratio = 5; options.write_buffer_size = 111 << 10; // 114KB @@ -1267,12 +1241,14 @@ options.num_levels = 1; std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) { + // Delete archival files. 
+ for (size_t i = 0; i < filenames.size(); ++i) { + ASSERT_OK( + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i])); + } + ASSERT_OK(env_->DeleteDir(options.db_paths[1].path)); } - env_->DeleteDir(options.db_paths[1].path); Reopen(options); Random rnd(301); @@ -1360,7 +1336,7 @@ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleUniversal; options.compaction_options_universal.size_ratio = 10; options.write_buffer_size = 111 << 10; // 114KB @@ -1524,18 +1500,19 @@ options.num_levels = 1; options.write_buffer_size = 200 << 10; // 200KB options.level0_file_num_compaction_trigger = 3; - options.memtable_factory.reset(new SpecialSkipListFactory(KNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysPerFile)); options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i <= max_key1; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Stage 2: reopen with universal compaction, num_levels=4 options.compaction_style = kCompactionStyleUniversal; @@ -1548,12 +1525,12 @@ // Insert more keys for (int i = max_key1 + 1; i <= max_key2; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, Key(i), 
rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(max_key2); // Compaction to non-L0 has happened. @@ -1568,7 +1545,8 @@ compact_options.change_level = true; compact_options.target_level = 0; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); // Need to restart it once to remove higher level records in manifest. ReopenWithColumnFamilies({"default", "pikachu"}, options); // Final reopen @@ -1580,12 +1558,12 @@ // Insert more keys for (int i = max_key2 + 1; i <= max_key3; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(max_key3); } @@ -1604,15 +1582,17 @@ options.level0_file_num_compaction_trigger = 2; options.num_levels = 1; options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) { + // Delete archival files. 
+ for (size_t i = 0; i < filenames.size(); ++i) { + ASSERT_OK( + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i])); + } + ASSERT_OK(env_->DeleteDir(options.db_paths[1].path)); } - env_->DeleteDir(options.db_paths[1].path); Reopen(options); Random rnd(301); @@ -1700,6 +1680,7 @@ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; + options.max_background_compactions = 2; options.num_levels = num_levels_; options.write_buffer_size = 100 << 10; // 100KB options.target_file_size_base = 32 << 10; // 32KB @@ -1708,6 +1689,10 @@ options.compaction_options_universal.max_size_amplification_percent = 110; DestroyAndReopen(options); + // Need to get a token to enable compaction parallelism up to + // `max_background_compactions` jobs. + auto pressure_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {// wait for the full compaction to be picked before adding files intended // for the second one. @@ -1727,14 +1712,14 @@ // use no_wait above because that one waits for flush and compaction. We // don't want to wait for compaction because the full compaction is // intentionally blocked while more files are flushed. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } if (i == 0) { TEST_SYNC_POINT( "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // First compaction should output to bottom level. Second should output to L0 // since older L0 files pending compaction prevent it from being placed lower. 
@@ -1752,7 +1737,7 @@ const int kNumFilesTrigger = 8; Options options = CurrentOptions(); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2; options.compaction_options_universal.max_size_amplification_percent = static_cast(-1); @@ -1773,7 +1758,7 @@ int key_idx = 0; GenerateNewFile(&rnd, &key_idx); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Compacting the first four files was enough to bring the score below one so // there's no need to schedule any more compactions. ASSERT_EQ(1, num_compactions_attempted); @@ -1803,9 +1788,9 @@ auto stop_token = dbfull()->TEST_write_controler().GetCompactionPressureToken(); - Put("key", "val"); - Flush(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1); ColumnFamilyMetaData cf_meta; ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily(); @@ -1829,10 +1814,10 @@ TEST_SYNC_POINT( "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"); for (int i = 0; i < 2; ++i) { - Put("key", "val"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); compact_files_thread.join(); } @@ -1863,7 +1848,7 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); MakeTables(3, "p", "q", 1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(2, TotalLiveFiles(1)); ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); @@ -1872,7 +1857,7 @@ CompactRangeOptions compact_options; compact_options.target_path_id 
= 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -1895,7 +1880,7 @@ // Full compaction to DB path 0 compact_options.target_path_id = 0; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); @@ -1932,27 +1917,28 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(6), 0); } +#if defined(ENABLE_SINGLE_LEVEL_DTC) TEST_F(DBTestUniversalCompaction2, SingleLevel) { const int kNumKeys = 3000; const int kWindowSize = 100; @@ -1974,23 +1960,24 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - 
kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); } +#endif // ENABLE_SINGLE_LEVEL_DTC TEST_F(DBTestUniversalCompaction2, MultipleLevels) { const int kWindowSize = 100; @@ -2011,50 +1998,50 @@ // during flush int i; for (i = 0; i < 500; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 500; i < 1000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1000; i < 1500; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1500; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(6), 0); for (i = 1999; i < 2333; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2333; i < 2666; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2666; i < 2999; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_EQ(0, NumTableFilesAtLevel(2)); @@ -2083,
+2070,23 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2000; i < 3000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 3500; i < 4000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2900; i < 3100; ++i) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(2, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(6), 0); } @@ -2125,23 +2112,23 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(6)); ASSERT_GT(NumTableFilesAtLevel(5), 0); @@ -2150,7 +2137,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { Options options; options.compaction_style = kCompactionStyleUniversal; - + options.env = env_; KeepFilterFactory* filter = new KeepFilterFactory(true); options.compaction_filter_factory.reset(filter); Reopen(options); @@ -2182,9 +2169,11 @@ opts.compaction_options_universal.max_size_amplification_percent = 200; opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days opts.num_levels = 5; - env_->addon_time_.store(0); + 
env_->SetMockSleep(); Reopen(opts); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + int periodic_compactions = 0; int start_level = -1; int output_level = -1; @@ -2203,16 +2192,16 @@ // Case 1: Oldest flushed file excceeds periodic compaction threshold. ASSERT_OK(Put("foo", "bar")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(0, periodic_compactions); // Move clock forward so that the flushed file would qualify periodic // compaction. - env_->addon_time_.store(48 * 60 * 60 + 100); + env_->MockSleepForSeconds(48 * 60 * 60 + 100); // Another flush would trigger compaction the oldest file. ASSERT_OK(Put("foo", "bar2")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); ASSERT_EQ(0, start_level); @@ -2222,16 +2211,16 @@ periodic_compactions = 0; // A flush doesn't trigger a periodic compaction when threshold not hit ASSERT_OK(Put("foo", "bar2")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, periodic_compactions); // After periodic compaction threshold hits, a flush will trigger // a compaction ASSERT_OK(Put("foo", "bar2")); - env_->addon_time_.fetch_add(48 * 60 * 60 + 100); - Flush(); - dbfull()->TEST_WaitForCompact(); + env_->MockSleepForSeconds(48 * 60 * 60 + 100); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); ASSERT_EQ(0, start_level); ASSERT_EQ(4, output_level); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_wal_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_wal_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_wal_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_wal_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,26 +8,58 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" -#include "env/composite_env_wrapper.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { -class DBWALTest : public DBTestBase { - public: - DBWALTest() : DBTestBase("/db_wal_test") {} +class DBWALTestBase : public DBTestBase { + protected: + explicit DBWALTestBase(const std::string& dir_name) + : DBTestBase(dir_name, /*env_do_fsync=*/true) {} #if defined(ROCKSDB_PLATFORM_POSIX) + public: +#if defined(ROCKSDB_FALLOCATE_PRESENT) + bool IsFallocateSupported() { + // Test fallocate support of running file system. + // Skip this test if fallocate is not supported. + std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + assert(fd > 0); + int alloc_status = fallocate(fd, 0, 0, 1); + int err_number = errno; + close(fd); + assert(env_->DeleteFile(fname_test_fallocate) == Status::OK()); + if (err_number == ENOSYS || err_number == EOPNOTSUPP) { + fprintf(stderr, "Skipped preallocated space check: %s\n", + errnoStr(err_number).c_str()); + return false; + } + assert(alloc_status == 0); + return true; + } +#endif // ROCKSDB_FALLOCATE_PRESENT + uint64_t GetAllocatedFileSize(std::string file_name) { struct stat sbuf; int err = stat(file_name.c_str(), &sbuf); assert(err == 0); return sbuf.st_blocks * 512; } -#endif +#endif // ROCKSDB_PLATFORM_POSIX +}; + +class DBWALTest : public DBWALTestBase { + public: + DBWALTest() : DBWALTestBase("/db_wal_test") {} }; // A SpecialEnv enriched to give more insight about deleted files @@ -40,8 +72,8 @@ InstrumentedMutexLock l(&env_mutex_); if (f == skipped_wal) { deleted_wal_reopened = true; - if 
(IsWAL(f) && largetest_deleted_wal.size() != 0 && - f.compare(largetest_deleted_wal) <= 0) { + if (IsWAL(f) && largest_deleted_wal.size() != 0 && + f.compare(largest_deleted_wal) <= 0) { gap_in_wals = true; } } @@ -55,9 +87,9 @@ // remember its name partly because the application might attempt to // delete the file again. if (skipped_wal.size() != 0 && skipped_wal != fname) { - if (largetest_deleted_wal.size() == 0 || - largetest_deleted_wal.compare(fname) < 0) { - largetest_deleted_wal = fname; + if (largest_deleted_wal.size() == 0 || + largest_deleted_wal.compare(fname) < 0) { + largest_deleted_wal = fname; } } else { skipped_wal = fname; @@ -75,7 +107,7 @@ // the wal whose actual delete was skipped by the env std::string skipped_wal = ""; // the largest WAL that was requested to be deleted - std::string largetest_deleted_wal = ""; + std::string largest_deleted_wal = ""; // number of WALs that were successfully deleted std::atomic deleted_wal_cnt = {0}; // the WAL whose delete from fs was skipped is reopened during recovery @@ -86,7 +118,8 @@ class DBWALTestWithEnrichedEnv : public DBTestBase { public: - DBWALTestWithEnrichedEnv() : DBTestBase("/db_wal_test") { + DBWALTestWithEnrichedEnv() + : DBTestBase("db_wal_test", /*env_do_fsync=*/true) { enriched_env_ = new EnrichedSpecialEnv(env_->target()); auto options = CurrentOptions(); options.env = enriched_env_; @@ -330,18 +363,319 @@ } while (ChangeWalOptions()); } +TEST_F(DBWALTest, RecoverWithBlob) { + // Write a value that's below the prospective size limit for blobs and another + // one that's above. Note that blob files are not actually enabled at this + // point. 
+ constexpr uint64_t min_blob_size = 10; + + constexpr char short_value[] = "short"; + static_assert(sizeof(short_value) - 1 < min_blob_size, + "short_value too long"); + + constexpr char long_value[] = "long_value"; + static_assert(sizeof(long_value) - 1 >= min_blob_size, + "long_value too short"); + + ASSERT_OK(Put("key1", short_value)); + ASSERT_OK(Put("key2", long_value)); + + // There should be no files just yet since we haven't flushed. + { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled. A new table file/blob file + // pair should be written during recovery. 
+ Options options; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + ASSERT_EQ(Get("key1"), short_value); + ASSERT_EQ(Get("key2"), long_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_EQ(l0_files.size(), 1); + + const FileMetaData* const table_file = l0_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), "key1"); + ASSERT_EQ(table_file->largest.user_key(), "key2"); + ASSERT_EQ(table_file->fd.smallest_seqno, 1); + ASSERT_EQ(table_file->fd.largest_seqno, 2); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 1); + +#ifndef ROCKSDB_LITE + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_FALSE(compaction_stats.empty()); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); + + const uint64_t* const cf_stats_value = 
internal_stats->TEST_GetCFStatsValue(); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); +#endif // ROCKSDB_LITE +} + +TEST_F(DBWALTest, RecoverWithBlobMultiSST) { + // Write several large (4 KB) values without flushing. Note that blob files + // are not actually enabled at this point. + std::string large_value(1 << 12, 'a'); + + constexpr int num_keys = 64; + + for (int i = 0; i < num_keys; ++i) { + ASSERT_OK(Put(Key(i), large_value)); + } + + // There should be no files just yet since we haven't flushed. + { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled and write buffer size set to a + // smaller value. Multiple table files+blob files should be written and added + // to the Version during recovery. 
+ Options options; + options.write_buffer_size = 1 << 16; // 64 KB + options.enable_blob_files = true; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + for (int i = 0; i < num_keys; ++i) { + ASSERT_EQ(Get(Key(i)), large_value); + } + + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_GT(l0_files.size(), 1); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_GT(blob_files.size(), 1); + + ASSERT_EQ(l0_files.size(), blob_files.size()); +} + +TEST_F(DBWALTest, WALWithChecksumHandoff) { +#ifndef ROCKSDB_ASSERT_STATUS_CHECKED + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + do { + Options options = CurrentOptions(); + + options.checksum_handoff_file_types.Add(FileType::kWalFile); + options.env = fault_fs_env.get(); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + + CreateAndReopenWithCF({"pikachu"}, options); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + 
writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Both value's should be present. + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); + + writeOpt.disableWAL = true; + // This put, data is persisted by Flush + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + writeOpt.disableWAL = false; + // Data is persisted in the WAL + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3")); + // The hash does not match, write fails + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Due to the write failure, Get should not find + ASSERT_NE("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "zoo")); + ASSERT_EQ("v3", Get(1, "bar")); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + // Each write will be similated as corrupted. + fault_fs->IngestDataCorruptionBeforeWrite(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4")); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_NE("v4", Get(1, "foo")); + ASSERT_NE("v4", Get(1, "bar")); + fault_fs->NoDataCorruptionBeforeWrite(); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + // The file system does not provide checksum method and verification. 
+ writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v5", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "bar")); + + Destroy(options); + } while (ChangeWalOptions()); +#endif // ROCKSDB_ASSERT_STATUS_CHECKED +} + +class DBRecoveryTestBlobError + : public DBWALTest, + public testing::WithParamInterface { + public: + DBRecoveryTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) { + // Write a value. Note that blob files are not actually enabled at this point. + ASSERT_OK(Put("key", "blob")); + + // Reopen with blob files enabled but make blob file writing fail during + // recovery. + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options; + options.enable_blob_files = true; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + ASSERT_NOK(TryReopen(options)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Make sure the files generated by the failed recovery have been deleted. 
+ std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kTableFile; + + if (!ParseFileName(file, &number, &type)) { + continue; + } + + ASSERT_NE(type, kTableFile); + ASSERT_NE(type, kBlobFile); + } +} + TEST_F(DBWALTest, IgnoreRecoveredLog) { std::string backup_logs = dbname_ + "/backup_logs"; do { // delete old files in backup_logs directory - env_->CreateDirIfMissing(backup_logs); + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); std::vector old_files; - env_->GetChildren(backup_logs, &old_files); + ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." && file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } Options options = CurrentOptions(); options.create_if_missing = true; @@ -359,11 +693,9 @@ // copy the logs to backup std::vector logs; - env_->GetChildren(options.wal_dir, &logs); + ASSERT_OK(env_->GetChildren(options.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -374,9 +706,7 @@ // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // this should ignore the log files, recovery should not happen again // if the recovery happens, the same merge operator would be called twice, @@ -390,11 +720,9 @@ Close(); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // assert that we successfully recovered only from logs, even though we // destroyed the DB @@ -405,16 +733,14 @@ // Recovery will fail if DB directory doesn't exist. Destroy(options); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - // we won't be needing this file no more - env_->DeleteFile(backup_logs + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log)); } Status s = TryReopen(options); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(s); Destroy(options); } while (ChangeWalOptions()); } @@ -452,9 +778,9 @@ called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -471,9 +797,9 @@ called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -491,9 +817,9 @@ called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -512,9 +838,9 @@ 
called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -522,7 +848,10 @@ #endif // !(defined NDEBUG) || !defined(OS_WIN) #ifndef ROCKSDB_LITE -TEST_F(DBWALTest, FullPurgePreservesRecycledLog) { +TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) { + // TODO(ajkr): Disabled until WAL recycling is fixed for + // `kPointInTimeRecovery`. + // For github issue #1303 for (int i = 0; i < 2; ++i) { Options options = CurrentOptions(); @@ -558,7 +887,10 @@ } } -TEST_F(DBWALTest, FullPurgePreservesLogPendingReuse) { +TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) { + // TODO(ajkr): Disabled until WAL recycling is fixed for + // `kPointInTimeRecovery`. + // Ensures full purge cannot delete a WAL while it's in the process of being // recycled. In particular, we force the full purge after a file has been // chosen for reuse, but before it has been renamed. 
@@ -734,7 +1066,7 @@ // Make 'dobrynia' to be flushed and new WAL file to be created ASSERT_OK(Put(2, Key(10), DummyString(7500000))); ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); { auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), static_cast(1)); @@ -788,7 +1120,7 @@ // Make 'nikitich' memtable to be flushed ASSERT_OK(Put(3, Key(10), DummyString(1002400))); ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // 4 memtable are not flushed, 1 sst file { auto tables = ListTableFiles(env_, dbname_); @@ -808,7 +1140,7 @@ ASSERT_OK(Put(3, Key(10), DummyString(1002400))); // make it flush ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // There are still 4 memtable not flushed, and 2 sst tables ASSERT_OK(Put(0, Key(1), DummyString(1))); ASSERT_OK(Put(1, Key(1), DummyString(1))); @@ -856,10 +1188,10 @@ for (uint64_t b = 0; b < kNumBatches; b++) { batch.Clear(); for (int i = 0; i < kBatchSize; i++) { - batch.Put(Key(i), DummyString(128)); + ASSERT_OK(batch.Put(Key(i), DummyString(128))); } - dbfull()->Write(wo, &batch); + ASSERT_OK(dbfull()->Write(wo, &batch)); } ASSERT_OK(dbfull()->SyncWAL()); @@ -887,7 +1219,7 @@ ASSERT_OK(Flush(0)); ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5 ASSERT_EQ("v5", Get(0, "key")); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); // Simulate a crash. 
fault_env->SetFilesystemActive(false); Close(); @@ -905,16 +1237,16 @@ class RecoveryTestHelper { public: // Number of WAL files to generate - static const int kWALFilesCount = 10; + static constexpr int kWALFilesCount = 10; // Starting number for the WAL file name like 00010.log - static const int kWALFileOffset = 10; + static constexpr int kWALFileOffset = 10; // Keys to be written per WAL file - static const int kKeysPerWALFile = 133; + static constexpr int kKeysPerWALFile = 133; // Size of the value - static const int kValueSize = 96; + static constexpr int kValueSize = 96; // Create WAL files with values filled in - static void FillData(DBWALTest* test, const Options& options, + static void FillData(DBWALTestBase* test, const Options& options, const size_t wal_count, size_t* count) { // Calling internal functions requires sanitized options. Options sanitized_options = SanitizeOptions(test->dbname_, options); @@ -923,29 +1255,31 @@ *count = 0; std::shared_ptr table_cache = NewLRUCache(50, 0); - EnvOptions env_options; + FileOptions file_options; WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); std::unique_ptr versions; std::unique_ptr wal_manager; WriteController write_controller; - versions.reset(new VersionSet(test->dbname_, &db_options, env_options, + versions.reset(new VersionSet(test->dbname_, &db_options, file_options, table_cache.get(), &write_buffer_manager, &write_controller, - /*block_cache_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")); - wal_manager.reset(new WalManager(db_options, env_options)); + wal_manager.reset( + new WalManager(db_options, file_options, /*io_tracer=*/nullptr)); std::unique_ptr current_log_writer; for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { uint64_t current_log_number = j; std::string fname = LogFileName(test->dbname_, current_log_number); - std::unique_ptr file; - ASSERT_OK(db_options.env->NewWritableFile(fname, &file, 
env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(), + fname, file_options, &file_writer, + nullptr)); current_log_writer.reset( new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0)); @@ -954,12 +1288,13 @@ for (int i = 0; i < kKeysPerWALFile; i++) { std::string key = "key" + ToString((*count)++); std::string value = test->DummyString(kValueSize); - assert(current_log_writer.get() != nullptr); + ASSERT_NE(current_log_writer.get(), nullptr); uint64_t seq = versions->LastSequence() + 1; batch.Clear(); - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(current_log_writer->AddRecord( + WriteBatchInternal::Contents(&batch))); versions->SetLastAllocatedSequence(seq); versions->SetLastPublishedSequence(seq); versions->SetLastSequence(seq); @@ -968,7 +1303,7 @@ } // Recreate and fill the store with some data - static size_t FillData(DBWALTest* test, Options* options) { + static size_t FillData(DBWALTestBase* test, Options* options) { options->create_if_missing = true; test->DestroyAndReopen(*options); test->Close(); @@ -979,7 +1314,7 @@ } // Read back all the keys we wrote and return the number of keys found - static size_t GetData(DBWALTest* test) { + static size_t GetData(DBWALTestBase* test) { size_t count = 0; for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) { if (test->Get("key" + ToString(i)) != "NOT_FOUND") { @@ -990,7 +1325,7 @@ } // Manuall corrupt the specified WAL - static void CorruptWAL(DBWALTest* test, const Options& options, + static void CorruptWAL(DBWALTestBase* test, const Options& options, const double off, const double len, const int wal_file_id, const bool trunc = false) { 
Env* env = options.env; @@ -1007,104 +1342,110 @@ test->Close(); #endif if (trunc) { - ASSERT_EQ(0, truncate(fname.c_str(), static_cast(size * off))); + ASSERT_OK( + test::TruncateFile(env, fname, static_cast(size * off))); } else { - InduceCorruption(fname, static_cast(size * off + 8), - static_cast(size * len)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(size * off + 8), + static_cast(size * len), false)); } } +}; - // Overwrite data with 'a' from offset for length len - static void InduceCorruption(const std::string& filename, size_t offset, - size_t len) { - ASSERT_GT(len, 0U); - - int fd = open(filename.c_str(), O_RDWR); - - // On windows long is 32-bit - ASSERT_LE(offset, std::numeric_limits::max()); - - ASSERT_GT(fd, 0); - ASSERT_EQ(offset, lseek(fd, static_cast(offset), SEEK_SET)); - - void* buf = alloca(len); - memset(buf, 'b', len); - ASSERT_EQ(len, write(fd, buf, static_cast(len))); +class DBWALTestWithParams + : public DBWALTestBase, + public ::testing::WithParamInterface> { + public: + DBWALTestWithParams() : DBWALTestBase("/db_wal_test_with_params") {} +}; - close(fd); - } +INSTANTIATE_TEST_CASE_P( + Wal, DBWALTestWithParams, + ::testing::Combine(::testing::Bool(), ::testing::Range(0, 4, 1), + ::testing::Range(RecoveryTestHelper::kWALFileOffset, + RecoveryTestHelper::kWALFileOffset + + RecoveryTestHelper::kWALFilesCount, + 1))); + +class DBWALTestWithParamsVaryingRecoveryMode + : public DBWALTestBase, + public ::testing::WithParamInterface< + std::tuple> { + public: + DBWALTestWithParamsVaryingRecoveryMode() + : DBWALTestBase("/db_wal_test_with_params_mode") {} }; +INSTANTIATE_TEST_CASE_P( + Wal, DBWALTestWithParamsVaryingRecoveryMode, + ::testing::Combine( + ::testing::Bool(), ::testing::Range(0, 4, 1), + ::testing::Range(RecoveryTestHelper::kWALFileOffset, + RecoveryTestHelper::kWALFileOffset + + RecoveryTestHelper::kWALFilesCount, + 1), + ::testing::Values(WALRecoveryMode::kTolerateCorruptedTailRecords, + 
WALRecoveryMode::kAbsoluteConsistency, + WALRecoveryMode::kPointInTimeRecovery, + WALRecoveryMode::kSkipAnyCorruptedRecords))); + // Test scope: // - We expect to open the data store when there is incomplete trailing writes // at the end of any of the logs // - We do not expect to open the data store for corruption -TEST_F(DBWALTest, kTolerateCorruptedTailRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 3; i++) { /* Corruption offset position */ - for (int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, &options); - // test checksum failure or parsing - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, /*wal=*/j, trunc); - - if (trunc) { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - const size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_TRUE(i == 0 || recovered_row_count > 0); - ASSERT_LT(recovered_row_count, row_count); - } else { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - ASSERT_NOK(TryReopen(options)); - } - } - } +TEST_P(DBWALTestWithParams, kTolerateCorruptedTailRecords) { + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file + + // Fill data for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + // test checksum failure or parsing + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + + options.wal_recovery_mode = 
WALRecoveryMode::kTolerateCorruptedTailRecords; + if (trunc) { + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + const size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_TRUE(corrupt_offset == 0 || recovered_row_count > 0); + ASSERT_LT(recovered_row_count, row_count); + } else { + ASSERT_NOK(TryReopen(options)); } } // Test scope: // We don't expect the data store to be opened if there is any corruption // (leading, middle or trailing -- incomplete writes or corruption) -TEST_F(DBWALTest, kAbsoluteConsistency) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - +TEST_P(DBWALTestWithParams, kAbsoluteConsistency) { // Verify clean slate behavior Options options = CurrentOptions(); const size_t row_count = RecoveryTestHelper::FillData(this, &options); - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; options.create_if_missing = false; ASSERT_OK(TryReopen(options)); ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count); - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset position */ - if (trunc && i == 0) { - continue; - } - - for (int j = jstart; j < jend; j++) { /* wal files */ - // fill with new date - RecoveryTestHelper::FillData(this, &options); - // corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - // verify - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; - options.create_if_missing = false; - ASSERT_NOK(TryReopen(options)); - } - } + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file + + if (trunc && corrupt_offset == 0) { + return; } + + // fill with new date + RecoveryTestHelper::FillData(this, &options); + // corrupt the wal + 
RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + // verify + options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; + options.create_if_missing = false; + ASSERT_NOK(TryReopen(options)); } // Test scope: @@ -1129,100 +1470,186 @@ ASSERT_OK(Put(1, "key3", "val3")); // Corrupt WAL at location of key3 - RecoveryTestHelper::InduceCorruption( - fname, static_cast(offset_to_corrupt), static_cast(4)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(offset_to_corrupt), + 4, false)); ASSERT_OK(Put(2, "key4", "val4")); ASSERT_OK(Put(1, "key5", "val5")); - Flush(2); + ASSERT_OK(Flush(2)); // PIT recovery & verify options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options)); } +TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { + Options options = CurrentOptions(); + options.env = env_; + options.track_and_verify_wals_in_manifest = true; + // The following make sure there are two bg flush threads. 
+ options.max_background_jobs = 8; + + const std::string cf1_name("cf1"); + CreateAndReopenWithCF({cf1_name}, options); + assert(handles_.size() == 2); + + { + dbfull()->TEST_LockMutex(); + ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes); + dbfull()->TEST_UnlockMutex(); + } + + ASSERT_OK(dbfull()->PauseBackgroundWork()); + + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value")); + + ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[1])); + + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[0])); + + bool called = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + // This callback will be called when the first bg flush thread reaches the + // point before entering the MANIFEST write queue after flushing the SST + // file. + // The purpose of the sync points here is to ensure both bg flush threads + // finish computing `min_wal_number_to_keep` before any of them updates the + // `log_number` for the column family that's being flushed. + SyncPoint::GetInstance()->SetCallBack( + "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep", + [&](void* /*arg*/) { + dbfull()->mutex()->AssertHeld(); + if (!called) { + // We are the first bg flush thread in the MANIFEST write queue. + // We set up the dependency between sync points for two threads that + // will be executing the same code. + // For the interleaving of events, see + // https://github.com/facebook/rocksdb/pull/9715. + // bg flush thread1 will release the db mutex while in the MANIFEST + // write queue. In the meantime, bg flush thread2 locks db mutex and + // computes the min_wal_number_to_keep (before thread1 writes to + // MANIFEST thus before cf1->log_number is updated). Bg thread2 joins + // the MANIFEST write queue afterwards and bg flush thread1 proceeds + // with writing to MANIFEST. 
+ called = true; + SyncPoint::GetInstance()->LoadDependency({ + {"VersionSet::LogAndApply:WriteManifestStart", + "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"}, + {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2", + "VersionSet::LogAndApply:WriteManifest"}, + }); + } else { + // The other bg flush thread has already been in the MANIFEST write + // queue, and we are after. + TEST_SYNC_POINT( + "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + + ASSERT_TRUE(called); + + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + DB* db1 = nullptr; + Status s = DB::OpenForReadOnly(options, dbname_, &db1); + ASSERT_OK(s); + assert(db1); + delete db1; +} + // Test scope: // - We expect to open data store under all circumstances // - We expect only data upto the point where the first error was encountered -TEST_F(DBWALTest, kPointInTimeRecovery) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; +TEST_P(DBWALTestWithParams, kPointInTimeRecovery) { const int maxkeys = RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile; - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Offset of corruption */ - for (int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, &options); - - // Corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify - options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; - options.create_if_missing = 
false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); - - bool expect_data = true; - for (size_t k = 0; k < maxkeys; ++k) { - bool found = Get("key" + ToString(i)) != "NOT_FOUND"; - if (expect_data && !found) { - expect_data = false; - } - ASSERT_EQ(found, expect_data); - } + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file - const size_t min = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset); - ASSERT_GE(recovered_row_count, min); - if (!trunc && i != 0) { - const size_t max = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset + 1); - ASSERT_LE(recovered_row_count, max); - } + // Fill data for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + + // Corrupt the wal + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + + // Verify + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + + // Probe data for invariants + size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_LT(recovered_row_count, row_count); + + // Verify a prefix of keys were recovered. But not in the case of full WAL + // truncation, because we have no way to know there was a corruption when + // truncation happened on record boundaries (preventing recovery holes in + // that case requires using `track_and_verify_wals_in_manifest`). 
+ if (!trunc || corrupt_offset != 0) { + bool expect_data = true; + for (size_t k = 0; k < maxkeys; ++k) { + bool found = Get("key" + ToString(k)) != "NOT_FOUND"; + if (expect_data && !found) { + expect_data = false; } + ASSERT_EQ(found, expect_data); } } + + const size_t min = RecoveryTestHelper::kKeysPerWALFile * + (wal_file_id - RecoveryTestHelper::kWALFileOffset); + ASSERT_GE(recovered_row_count, min); + if (!trunc && corrupt_offset != 0) { + const size_t max = RecoveryTestHelper::kKeysPerWALFile * + (wal_file_id - RecoveryTestHelper::kWALFileOffset + 1); + ASSERT_LE(recovered_row_count, max); + } } // Test scope: // - We expect to open the data store under all scenarios // - We expect to have recovered records past the corruption zone -TEST_F(DBWALTest, kSkipAnyCorruptedRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset */ - for (int j = jstart; j < jend; j++) { /* wal files */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, &options); - - // Corrupt the WAL - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify behavior - options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); +TEST_P(DBWALTestWithParams, kSkipAnyCorruptedRecords) { + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file - if (!trunc) { - ASSERT_TRUE(i != 0 || recovered_row_count > 0); - } - } - } + // Fill data 
for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + + // Corrupt the WAL + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + + // Verify behavior + options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + + // Probe data for invariants + size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_LT(recovered_row_count, row_count); + + if (!trunc) { + ASSERT_TRUE(corrupt_offset != 0 || recovered_row_count > 0); } } @@ -1288,7 +1715,7 @@ for (int i = 0; i < 2; ++i) { if (i > 0) { // Flush() triggers deletion of obsolete tracked files - Flush(); + ASSERT_OK(Flush()); } VectorLogPtr log_files; ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); @@ -1330,7 +1757,7 @@ ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); // manual flush and insert again - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); ASSERT_OK(Put("foo", "foo_v3")); @@ -1351,7 +1778,9 @@ auto countWalFiles = [this]() { VectorLogPtr log_files; - dbfull()->GetSortedWalFiles(log_files); + if (!dbfull()->GetSortedWalFiles(log_files).ok()) { + return size_t{0}; + } return log_files.size(); }; @@ -1359,11 +1788,11 @@ CreateAndReopenWithCF({"one", "two"}, options); ASSERT_OK(Put(0, "key1", kSmallValue)); ASSERT_OK(Put(1, "key2", kLargeValue)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, countWalFiles()); ASSERT_OK(Put(0, "key3", kSmallValue)); ASSERT_OK(Put(2, "key4", kLargeValue)); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(2, countWalFiles()); // Reopen, insert and flush. 
@@ -1377,9 +1806,9 @@ ASSERT_OK(Put(0, "key5", kLargeValue)); ASSERT_OK(Put(1, "key6", kLargeValue)); ASSERT_EQ(3, countWalFiles()); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(2, "key7", kLargeValue)); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); ASSERT_EQ(4, countWalFiles()); // Reopen twice and validate. @@ -1401,9 +1830,8 @@ // 2. Open with avoid_flush_during_recovery = true; // 3. Append more data without flushing, which creates new WAL log. // 4. Open again. See if it can correctly handle previous corruption. -TEST_F(DBWALTest, RecoverFromCorruptedWALWithoutFlush) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; +TEST_P(DBWALTestWithParamsVaryingRecoveryMode, + RecoverFromCorruptedWALWithoutFlush) { const int kAppendKeys = 100; Options options = CurrentOptions(); options.avoid_flush_during_recovery = true; @@ -1422,60 +1850,47 @@ delete iter; return data; }; - for (auto& mode : wal_recovery_mode_string_map) { - options.wal_recovery_mode = mode.second; - for (auto trunc : {true, false}) { - for (int i = 0; i < 4; i++) { - for (int j = jstart; j < jend; j++) { - // Create corrupted WAL - RecoveryTestHelper::FillData(this, &options); - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, /*wal=*/j, trunc); - // Skip the test if DB won't open. - if (!TryReopen(options).ok()) { - ASSERT_TRUE(options.wal_recovery_mode == - WALRecoveryMode::kAbsoluteConsistency || - (!trunc && - options.wal_recovery_mode == - WALRecoveryMode::kTolerateCorruptedTailRecords)); - continue; - } - ASSERT_OK(TryReopen(options)); - // Append some more data. - for (int k = 0; k < kAppendKeys; k++) { - std::string key = "extra_key" + ToString(k); - std::string value = DummyString(RecoveryTestHelper::kValueSize); - ASSERT_OK(Put(key, value)); - } - // Save data for comparison. - auto data = getAll(); - // Reopen. Verify data. 
- ASSERT_OK(TryReopen(options)); - auto actual_data = getAll(); - ASSERT_EQ(data, actual_data); - } - } - } + + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file + WALRecoveryMode recovery_mode = std::get<3>(GetParam()); + + options.wal_recovery_mode = recovery_mode; + // Create corrupted WAL + RecoveryTestHelper::FillData(this, &options); + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + // Skip the test if DB won't open. + if (!TryReopen(options).ok()) { + ASSERT_TRUE(options.wal_recovery_mode == + WALRecoveryMode::kAbsoluteConsistency || + (!trunc && options.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords)); + return; } + ASSERT_OK(TryReopen(options)); + // Append some more data. + for (int k = 0; k < kAppendKeys; k++) { + std::string key = "extra_key" + ToString(k); + std::string value = DummyString(RecoveryTestHelper::kValueSize); + ASSERT_OK(Put(key, value)); + } + // Save data for comparison. + auto data = getAll(); + // Reopen. Verify data. + ASSERT_OK(TryReopen(options)); + auto actual_data = getAll(); + ASSERT_EQ(data, actual_data); } // Tests that total log size is recovered if we set // avoid_flush_during_recovery=true. // Flush should trigger if max_total_wal_size is reached. 
TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { - class TestFlushListener : public EventListener { - public: - std::atomic count{0}; - - TestFlushListener() = default; - - void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { - count++; - assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason); - } - }; - std::shared_ptr test_listener = - std::make_shared(); + auto test_listener = std::make_shared(); + test_listener->expected_flush_reason = FlushReason::kWalFull; constexpr size_t kKB = 1024; constexpr size_t kMB = 1024 * 1024; @@ -1515,7 +1930,9 @@ 1 * kMB); // Write one more key to trigger flush. ASSERT_OK(Put(0, "foo", "v2")); - dbfull()->TEST_WaitForFlushMemTable(); + for (auto* h : handles_) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h)); + } // Flushed two column families. ASSERT_EQ(2, test_listener->count.load()); } @@ -1527,7 +1944,16 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { constexpr size_t kKB = 1024; Options options = CurrentOptions(); + options.env = env_; options.avoid_flush_during_recovery = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + DestroyAndReopen(options); size_t preallocated_size = dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); @@ -1549,6 +1975,175 @@ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), preallocated_size); } +// Tests that we will truncate the preallocated space of the last log from +// previous. 
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + options.avoid_flush_during_shutdown = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + // The log file has preallocated space. + Close(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // After the flush during Open, the log file should get deleted. However, + // if the process is in a crash loop, the log file may not get + // deleted and thte preallocated space will keep accumulating. So we need + // to ensure it gets trtuncated. 
+ EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + Close(); + std::vector filenames; + std::string last_log; + uint64_t last_log_num = 0; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + for (auto fname : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(fname, &number, &type, nullptr)) { + if (type == kWalFile && number > last_log_num) { + last_log = fname; + } + } + } + ASSERT_NE(last_log, ""); + last_log = dbname_ + '/' + last_log; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", + [](void* arg) { *(reinterpret_cast(arg)) = 0; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Preallocate space for the empty log file. This could happen if WAL data + // was buffered in memory and the process crashed. 
+ std::unique_ptr log_file; + ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions())); + log_file->SetPreallocationBlockSize(preallocated_size); + log_file->PrepareWrite(0, 4096); + log_file.reset(); + + ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size); + + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // The preallocated space should be truncated. + EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + // create DB and close with file truncate disabled + std::atomic_bool enable_truncate{false}; + + SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", [&](void* arg) { + if (!enable_truncate) { + *(reinterpret_cast(arg)) = 0; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. 
+ auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName()); + ASSERT_GE(db_size, preallocated_size); + Close(); + + // enable truncate and open DB as readonly, the file should not be truncated + // and DB size is not changed. + enable_truncate = true; + ASSERT_OK(ReadOnlyReopen(options)); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName()); + // The preallocated space should NOT be truncated. + // the DB size is almost the same. + ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size, + db_size / 100); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif // ROCKSDB_FALLOCATE_PRESENT #endif // ROCKSDB_PLATFORM_POSIX @@ -1566,9 +2161,9 @@ wo.disableWAL = false; WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); batch.MarkWalTerminationPoint(); - batch.Put("foo2", "bar2"); + ASSERT_OK(batch.Put("foo2", "bar2")); ASSERT_OK(dbfull()->Write(wo, &batch)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,3217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "test_util/testutil.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { +class DBBasicTestWithTimestampBase : public DBTestBase { + public: + explicit DBBasicTestWithTimestampBase(const std::string& dbname) + : DBTestBase(dbname, /*env_do_fsync=*/true) {} + + protected: + static std::string Key1(uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return ret; + } + + static std::string KeyWithPrefix(std::string prefix, uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return prefix + ret; + } + + static std::vector ConvertStrToSlice( + std::vector& strings) { + std::vector ret; + for (const auto& s : strings) { + ret.emplace_back(s); + } + return ret; + } + + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return -CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - 
timestamp_size(), timestamp_size())); + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + if (a_has_ts) { + assert(a.size() >= timestamp_size()); + } + if (b_has_ts) { + assert(b.size() >= timestamp_size()); + } + Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a; + Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b; + return cmp_without_ts_->Compare(lhs, rhs); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + const size_t kSize = ts1.size(); + std::unique_ptr ts1_buf(new char[kSize]); + memcpy(ts1_buf.get(), ts1.data(), ts1.size()); + std::unique_ptr ts2_buf(new char[kSize]); + memcpy(ts2_buf.get(), ts2.data(), ts2.size()); + Slice ts1_copy = Slice(ts1_buf.get(), kSize); + Slice ts2_copy = Slice(ts2_buf.get(), kSize); + auto* ptr1 = const_cast(&ts1_copy); + auto* ptr2 = const_cast(&ts2_copy); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return -1; + } else if (high1 > high2) { + return 1; + } + if (low1 < low2) { + return -1; + } else if (low1 > low2) { + return 1; + } + return 0; + } + }; + + std::string Timestamp(uint64_t low, uint64_t high) { + std::string ts; + PutFixed64(&ts, low); + PutFixed64(&ts, high); + return ts; + } + + void CheckIterUserEntry(const Iterator* it, const Slice& expected_key, + ValueType expected_value_type, + const Slice& expected_value, + const Slice& expected_ts) const { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + 
ASSERT_EQ(expected_key, it->key()); + if (kTypeValue == expected_value_type) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); + } + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + SequenceNumber expected_seq, ValueType expected_val_type, + const Slice& expected_value, const Slice& expected_ts) { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + std::string ukey_and_ts; + ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); + ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + ParsedInternalKey parsed_ikey; + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); + ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key); + ASSERT_EQ(expected_val_type, parsed_ikey.type); + ASSERT_EQ(expected_seq, parsed_ikey.sequence); + if (expected_val_type == kTypeValue) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); + } + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + ValueType expected_val_type, const Slice& expected_value, + const Slice& expected_ts) { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + std::string ukey_and_ts; + ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); + ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + + ParsedInternalKey parsed_ikey; + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); + ASSERT_EQ(expected_val_type, parsed_ikey.type); + ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key); + if (expected_val_type == kTypeValue) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); + } +}; + +class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { + public: + DBBasicTestWithTimestamp() + : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {} +}; + +TEST_F(DBBasicTestWithTimestamp, MixedCfs) { + Options options = CurrentOptions(); + options.env = env_; + 
options.create_if_missing = true; + options.avoid_flush_during_shutdown = true; + DestroyAndReopen(options); + + Options options1 = CurrentOptions(); + options1.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options1.comparator = &test_cmp; + ColumnFamilyHandle* handle = nullptr; + Status s = db_->CreateColumnFamily(options1, "data", &handle); + ASSERT_OK(s); + + WriteBatch wb; + ASSERT_OK(wb.Put("a", "value")); + { + std::string key("a"); + std::string ts(kTimestampSize, '\0'); + std::array key_with_ts_slices{{key, ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::string value_str("value"); + Slice value_slice(value_str.data(), value_str.size()); + SliceParts value(&value_slice, 1); + ASSERT_OK(wb.Put(handle, key_with_ts, value)); + } + { + std::string ts = Timestamp(1, 0); + std::vector ts_list({Slice(), ts}); + ASSERT_OK(wb.AssignTimestamps(ts_list)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + + const auto verify_db = [this](ColumnFamilyHandle* h) { + ASSERT_EQ("value", Get("a")); + std::string ts = Timestamp(1, 0); + Slice read_ts_slice(ts); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + std::string value; + ASSERT_OK(db_->Get(read_opts, h, "a", &value)); + ASSERT_EQ("value", value); + }; + + verify_db(handle); + + delete handle; + Close(); + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back("data", options1); + options.create_if_missing = false; + s = DB::Open(options, dbname_, cf_descs, &handles_, &db_); + ASSERT_OK(s); + + verify_db(handles_[1]); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + 
WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + std::string start_str = "foo"; + std::string end_str = "foo2"; + Slice start(start_str), end(end_str); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, GcPreserveLatestVersionBelowFullHistoryLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + std::string ts_str = Timestamp(1, 0); + WriteOptions wopts; + Slice ts = ts_str; + wopts.timestamp = &ts; + ASSERT_OK(db_->Put(wopts, "k1", "v1")); + ASSERT_OK(db_->Put(wopts, "k2", "v2")); + ASSERT_OK(db_->Put(wopts, "k3", "v3")); + + ts_str = Timestamp(2, 0); + ts = ts_str; + wopts.timestamp = &ts; + ASSERT_OK(db_->Delete(wopts, "k3")); + + ts_str = Timestamp(4, 0); + ts = ts_str; + wopts.timestamp = &ts; + ASSERT_OK(db_->Put(wopts, "k1", "v5")); + + ts_str = Timestamp(3, 0); + ts = ts_str; + CompactRangeOptions cro; + cro.full_history_ts_low = &ts; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_OK(Flush()); + + ReadOptions ropts; + ropts.timestamp = &ts; + std::string value; + Status s = db_->Get(ropts, "k1", &value); + ASSERT_OK(s); + ASSERT_EQ("v1", value); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + const std::string kKey = "test kKey"; + + // Test set ts_low first 
and flush() + int current_ts_low = 5; + std::string ts_low_str = Timestamp(current_ts_low, 0); + Slice ts_low = ts_low_str; + CompactRangeOptions comp_opts; + comp_opts.full_history_ts_low = &ts_low; + comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + + auto* cfd = + static_cast_with_check(db_->DefaultColumnFamily()) + ->cfd(); + auto result_ts_low = cfd->GetFullHistoryTsLow(); + + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + for (int i = 0; i < 10; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + ASSERT_OK(Flush()); + + // TODO return a non-ok for read ts < current_ts_low and test it. + for (int i = 0; i < 10; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + if (i < current_ts_low - 1) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + } + + // Test set ts_low and then trigger compaction + for (int i = 10; i < 20; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + + ASSERT_OK(Flush()); + + current_ts_low = 15; + ts_low_str = Timestamp(current_ts_low, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + result_ts_low = cfd->GetFullHistoryTsLow(); + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + // TODO return a non-ok for read ts < current_ts_low and test it. 
+ for (int i = current_ts_low; i < 20; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + + // Test invalid compaction with range + Slice start(kKey), end(kKey); + Status s = db_->CompactRange(comp_opts, &start, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, &start, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, nullptr, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Test invalid compaction with the decreasing ts_low + ts_low_str = Timestamp(current_ts_low - 1, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + s = db_->CompactRange(comp_opts, nullptr, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + std::string ts_low_str = Timestamp(9, 0); + ASSERT_OK( + db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str)); + std::string result_ts_low; + ASSERT_OK(db_->GetFullHistoryTsLow(nullptr, &result_ts_low)); + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low_str, result_ts_low) == 0); + // test increase full_history_low backward + std::string ts_low_str_back = Timestamp(8, 0); + auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + ts_low_str_back); + ASSERT_EQ(s, Status::InvalidArgument()); + // test IncreaseFullHistoryTsLow with a timestamp whose length is longger + // than the cf's timestamp size + std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a'); + s = 
db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + ts_low_str_long); + ASSERT_EQ(s, Status::InvalidArgument()); + // test IncreaseFullHistoryTsLow with a timestamp which is null + std::string ts_low_str_null = ""; + s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + ts_low_str_null); + ASSERT_EQ(s, Status::InvalidArgument()); + // test IncreaseFullHistoryTsLow for a column family that does not enable + // timestamp + options.comparator = BytewiseComparator(); + DestroyAndReopen(options); + ts_low_str = Timestamp(10, 0); + s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str); + ASSERT_EQ(s, Status::InvalidArgument()); + // test GetFullHistoryTsLow for a column family that does not enable + // timestamp + std::string current_ts_low; + s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), ¤t_ts_low); + ASSERT_EQ(s, Status::InvalidArgument()); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(db_->Put(write_opts, Key(i), rnd.RandomString(1024))); + } + + uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 
204800); + + // test multiple ranges + std::vector ranges; + std::string start_tmp = Key(10); + std::string end_tmp = Key(20); + ranges.emplace_back(Range(start_tmp, end_tmp)); + ranges.emplace_back(Range(start, end)); + uint64_t range_sizes[2]; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, + ranges.data(), 2, range_sizes)); + + ASSERT_EQ(range_sizes[1], size); + + // Zero if not including mem table + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(500); + end = Key(600); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + // Test range boundaries + ASSERT_OK(db_->Put(write_opts, Key(1000), rnd.RandomString(1024))); + // Should include start key + start = Key(1000); + end = Key(1100); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 0); + + // Should exclude end key + start = Key(900); + end = Key(1000); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleIterate) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector start_keys = {1, 0}; + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + Slice write_ts = write_timestamps[i]; + 
write_opts.timestamp = &write_ts; + for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + // Forward iterate. + for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid(); + it->Next(), ++count, ++key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + size_t expected_count = kMaxKey - start_keys[i] + 1; + ASSERT_EQ(expected_count, count); + + // Backward iterate. + count = 0; + for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid(); + it->Prev(), ++count, --key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); + + // SeekToFirst()/SeekToLast() with lower/upper bounds. + // Then iter with lower and upper bounds. 
+ uint64_t l = 0; + uint64_t r = kMaxKey + 1; + while (l < r) { + std::string lb_str = Key1(l); + Slice lb = lb_str; + std::string ub_str = Key1(r); + Slice ub = ub_str; + read_opts.iterate_lower_bound = &lb; + read_opts.iterate_upper_bound = &ub; + it.reset(db_->NewIterator(read_opts)); + for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0; + it->Valid(); it->Next(), ++key, ++count) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(r - std::max(l, start_keys[i]), count); + + for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; + it->Valid(); it->Prev(), --key, ++count) { + CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + l += (kMaxKey / 100); + r -= (kMaxKey / 100); + } + } + Close(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Create 2 tables + for (int table = 0; table < 2; ++table) { + for (int i = 0; i < 10; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "key", Key(i))); + } + ASSERT_OK(Flush()); + } + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(2U, props.size()); + for (const auto& item : props) { + auto& user_collected = item.second->user_collected_properties; + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") != + user_collected.end()); + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") != + user_collected.end()); + ASSERT_EQ(user_collected.at("rocksdb.timestamp_min"), Timestamp(0, 0)); + ASSERT_EQ(user_collected.at("rocksdb.timestamp_max"), Timestamp(9, 0)); + } 
+ Close(); +} +#endif // !ROCKSDB_LITE + +class DBBasicTestWithTimestampTableOptions + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + explicit DBBasicTestWithTimestampTableOptions() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_table_options") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampTableOptions, + testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)); + +TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.index_type = GetParam(); + bbto.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator cmp(kTimestampSize); + options.comparator = &cmp; + DestroyAndReopen(options); + constexpr uint64_t kNumKeys = 1024; + for (uint64_t k = 0; k < kNumKeys; ++k) { + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, Key1(k), "value" + std::to_string(k))); + } + ASSERT_OK(Flush()); + { + ReadOptions read_opts; + read_opts.total_order_seek = true; + std::string ts_str = Timestamp(2, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + // verify Get() + for (it->SeekToFirst(); it->Valid(); it->Next()) { + std::string value_from_get; + std::string key_str(it->key().data(), it->key().size()); + std::string timestamp; + ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, ×tamp)); + ASSERT_EQ(it->value(), value_from_get); + 
ASSERT_EQ(Timestamp(1, 0), timestamp); + } + + // verify MultiGet() + constexpr uint64_t step = 2; + static_assert(0 == (kNumKeys % step), + "kNumKeys must be a multiple of step"); + for (uint64_t k = 0; k < kNumKeys; k += 2) { + std::vector key_strs; + std::vector keys; + for (size_t i = 0; i < step; ++i) { + key_strs.push_back(Key1(k + i)); + } + for (size_t i = 0; i < step; ++i) { + keys.emplace_back(key_strs[i]); + } + std::vector values; + std::vector timestamps; + std::vector statuses = + db_->MultiGet(read_opts, keys, &values, ×tamps); + ASSERT_EQ(step, statuses.size()); + ASSERT_EQ(step, values.size()); + ASSERT_EQ(step, timestamps.size()); + for (uint64_t i = 0; i < step; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("value" + std::to_string(k + i), values[i]); + ASSERT_EQ(Timestamp(1, 0), timestamps[i]); + } + } + } + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + 
ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(1, 0); + ts = read_ts; + read_opts.timestamp = &ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + iter->Seek("bbb"); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + // All of the keys or this test must be longer than 3 characters + constexpr int kMinKeyLen = 3; + options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure 
the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar1")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar2")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (int i = 3; i < 9; ++i) { + ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), + "bar" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + std::string up_bound = "foo5"; // exclusive + Slice up_bound_slice = up_bound; + std::string lo_bound = "foo2"; // inclusive + Slice lo_bound_slice = lo_bound; + read_opts.iterate_upper_bound = &up_bound_slice; + read_opts.iterate_lower_bound = &lo_bound_slice; + read_opts.auto_prefix_mode = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + 
// may return invalid result. + iter->Seek("foo"); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekForPrev("g"); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + } + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), + Timestamp(4, 3)}; + const std::vector> kvs = { + std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")}; + for (const auto& ts : timestamps) { + WriteBatch wb; + for (const auto& kv : kvs) { + const std::string& key = std::get<0>(kv); + const std::string& value = std::get<1>(kv); + std::array key_with_ts_slices{{Slice(key), Slice(ts)}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{Slice(value)}}; + SliceParts values(value_slices.data(), 1); + ASSERT_OK(wb.Put(key_with_ts, values)); + } + + ASSERT_OK(wb.AssignTimestamp(ts)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + std::string read_ts_str = Timestamp(5, 3); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + + it->SeekToFirst(); + ASSERT_TRUE(it->Valid()); + it->Prev(); + ASSERT_FALSE(it->Valid()); + + it->SeekToLast(); + ASSERT_TRUE(it->Valid()); + uint64_t prev_reseek_count = + 
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(0, prev_reseek_count); + it->Next(); + ASSERT_FALSE(it->Valid()); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->Seek(std::get<0>(kvs[0])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(1, prev_reseek_count); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->SeekForPrev(std::get<0>(kvs[1])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it.reset(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { + constexpr int kNumKeysPerFile = 128; + constexpr uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + 
options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + const std::vector read_timestamps_lb = {Timestamp(1, 0), + Timestamp(1, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + Slice write_ts = write_timestamps[i]; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + Slice read_ts_lb = read_timestamps_lb[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + if (i > 0) { + it->Next(); + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i - 1), + write_timestamps[i - 1]); + } + } + size_t expected_count = kMaxKey + 1; + ASSERT_EQ(expected_count, count); + } + // Delete all keys@ts=5 and check iteration result with start ts set + { + std::string write_timestamp = Timestamp(5, 0); + WriteOptions write_opts; + Slice write_ts = write_timestamp; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key < kMaxKey + 1; ++key) { + Status s = db_->Delete(write_opts, Key1(key)); + ASSERT_OK(s); + } + + std::string read_timestamp = Timestamp(6, 0); + ReadOptions read_opts; + Slice read_ts = read_timestamp; + read_opts.timestamp = &read_ts; + std::string read_timestamp_lb = Timestamp(2, 0); + Slice read_ts_lb = read_timestamp_lb; + read_opts.iter_start_ts = 
&read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), + write_ts); + // Skip key@ts=3 and land on tombstone key@ts=5 + it->Next(); + } + ASSERT_EQ(kMaxKey + 1, count); + } + Close(); +} + +class DBBasicDeletionTestWithTimestamp + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + DBBasicDeletionTestWithTimestamp() + : DBBasicTestWithTimestampBase("db_basic_deletion_test_with_timestamp") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicDeletionTestWithTimestamp, + ::testing::Values(ValueType::kTypeSingleDeletion, + ValueType::kTypeDeletionWithTimestamp)); + +TEST_P(DBBasicDeletionTestWithTimestamp, ForwardIterateStartSeqnum) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 0xffffffffffffffff; + const uint64_t kMinKey = kMaxKey - 1023; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + ValueType op_type = GetParam(); + // Need to disable compaction to bottommost level when sequence number will be + // zeroed out, causing the verification of sequence number to fail in this + // test. 
+ options.disable_auto_compactions = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + std::vector start_seqs; + + const int kNumTimestamps = 4; + std::vector write_ts_list; + for (int t = 0; t != kNumTimestamps; ++t) { + write_ts_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + WriteOptions write_opts; + for (size_t i = 0; i != write_ts_list.size(); ++i) { + Slice write_ts = write_ts_list[i]; + write_opts.timestamp = &write_ts; + for (uint64_t k = kMaxKey; k >= kMinKey; --k) { + Status s; + if (k % 2) { + s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i)); + } else { + if (op_type == ValueType::kTypeDeletionWithTimestamp) { + s = db_->Delete(write_opts, Key1(k)); + } else if (op_type == ValueType::kTypeSingleDeletion) { + s = db_->SingleDelete(write_opts, Key1(k)); + } + } + ASSERT_OK(s); + } + start_seqs.push_back(db_->GetLatestSequenceNumber()); + } + std::vector read_ts_list; + for (int t = 0; t != kNumTimestamps - 1; ++t) { + read_ts_list.push_back(Timestamp(2 * t + 3, /*do not care*/ 17)); + } + + ReadOptions read_opts; + // Scan with only read_opts.iter_start_seqnum set. + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr iter(db_->NewIterator(read_opts)); + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + uint64_t key = kMinKey; + for (iter->Seek(Key1(kMinKey)); iter->Valid(); iter->Next()) { + CheckIterEntry( + iter.get(), Key1(key), expected_seq, (key % 2) ? kTypeValue : op_type, + (key % 2) ? 
"value" + std::to_string(i + 1) : std::string(), + write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + // Scan with both read_opts.iter_start_seqnum and read_opts.iter_start_ts set. + std::vector read_ts_lb_list; + for (int t = 0; t < kNumTimestamps - 1; ++t) { + read_ts_lb_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + for (size_t i = 0; i < read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + Slice read_ts_lb = read_ts_lb_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr it(db_->NewIterator(read_opts)); + uint64_t key = kMinKey; + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + for (it->Seek(Key1(kMinKey)); it->Valid(); it->Next()) { + CheckIterEntry(it.get(), Key1(key), expected_seq, + (key % 2) ? kTypeValue : op_type, + "value" + std::to_string(i + 1), write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Insert kNumKeys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "foo", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + 
CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + ts_str = Timestamp(kNumKeys, 0); + ts = ts_str; + read_opts.timestamp = &ts; + iter.reset(db_->NewIterator(read_opts)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, + "value" + std::to_string(kNumKeys - 1), ts_str); + ASSERT_EQ( + 2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Write kNumKeys + 1 keys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + WriteBatch batch; + const std::string dummy_ts(kTimestampSize, '\0'); + { + std::array key_with_ts_slices{{"a", dummy_ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{"new_value"}}; + SliceParts values(value_slices.data(), 1); + ASSERT_OK(batch.Put(key_with_ts, values)); + } + { + std::string key_with_ts("b"); + key_with_ts.append(dummy_ts); + ASSERT_OK(batch.Put(key_with_ts, "new_value")); + } + s = batch.AssignTimestamp(ts_str); + ASSERT_OK(s); + s = db_->Write(write_opts, &batch); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + 
Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("a"); + iter->Next(); + CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) { + Options options = GetDefaultOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + Status s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value")); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + iter->Prev(); + CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + 
options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = 
db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t 
kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + // random data + for (int i = 0; i < 3; i++) { + auto key = ToString(i * 10); + auto value = ToString(i * 10); + Slice key_slice = key; + Slice value_slice = value; + ASSERT_OK(db_->Put(write_opts, key_slice, value_slice)); + ASSERT_OK(Flush()); + } + + // Make num_levels to 2 to do key range filtering of sst files + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + 
Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector timestamps(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + std::vector cfhs(keys.size(), cfh); + std::vector statuses = + db_->MultiGet(read_opts, cfhs, keys, &values, ×tamps); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + constexpr size_t max_skippable_internal_keys = 2; + const size_t kNumKeys = max_skippable_internal_keys + 2; + WriteOptions write_opts; + Status s; + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value")); + } + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + read_opts.max_skippable_internal_keys = max_skippable_internal_keys; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + iter->Next(); + ASSERT_TRUE(iter->status().IsIncomplete()); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) { + Options options = GetDefaultOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t 
kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + constexpr size_t max_skippable_internal_keys = 2; + const size_t kNumKeys = max_skippable_internal_keys + 2; + WriteOptions write_opts; + Status s; + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "b", "value")); + } + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + read_opts.max_skippable_internal_keys = max_skippable_internal_keys; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + iter->Prev(); + ASSERT_TRUE(iter->status().IsIncomplete()); + } + Close(); +} + +// Create two L0, and compact them to a new L1. In this test, L1 is L_bottom. +// Two L0s: +// f1 f2 +// ... +// Since f2.smallest < f1.largest < f2.largest +// f1 and f2 will be the inputs of a real compaction instead of trivial move. 
+TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.num_levels = 2; + options.level0_file_num_compaction_trigger = 2; + DestroyAndReopen(options); + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value0")); + ASSERT_OK(Flush()); + + ts_str = Timestamp(2, 0); + ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "b", "value0")); + ts_str = Timestamp(3, 0); + ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Delete(write_opts, "a")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReadOptions read_opts; + ts_str = Timestamp(1, 0); + ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status s = db_->Get(read_opts, "a", &value); + ASSERT_OK(s); + ASSERT_EQ("value0", value); + + ts_str = Timestamp(3, 0); + ts = ts_str; + read_opts.timestamp = &ts; + s = db_->Get(read_opts, "a", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Time-travel to the past before deletion + ts_str = Timestamp(2, 0); + ts = ts_str; + read_opts.timestamp = &ts; + s = db_->Get(read_opts, "a", &value); + ASSERT_OK(s); + ASSERT_EQ("value0", value); + Close(); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class DBBasicTestWithTimestampFilterPrefixSettings + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, bool, bool, + std::shared_ptr, bool, double, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTimestampFilterPrefixSettings() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_filter_prefix") {} +}; + +TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, 
GetAndMultiGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = std::get<1>(GetParam()); + bbto.cache_index_and_filter_blocks = std::get<2>(GetParam()); + bbto.index_type = std::get<6>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor = std::get<3>(GetParam()); + options.memtable_whole_key_filtering = std::get<4>(GetParam()); + options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam()); + + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + const int kMaxKey = 1000; + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + int idx = 0; + for (; idx < kMaxKey / 4; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (; idx < kMaxKey / 2; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + + for (; idx < kMaxKey; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + + ReadOptions read_opts_total_order; + read_opts_total_order.timestamp = &ts; + read_opts_total_order.total_order_seek = true; + + for (idx = 0; idx < kMaxKey; idx++) { + size_t batch_size = 4; + std::vector keys_str(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + ColumnFamilyHandle* cfh = 
db_->DefaultColumnFamily(); + + keys_str[0] = Key1(idx); + keys_str[1] = KeyWithPrefix("foo", idx); + keys_str[2] = Key1(kMaxKey + idx); + keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx); + + auto keys = ConvertStrToSlice(keys_str); + + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + for (int i = 0; i < 2; i++) { + ASSERT_OK(statuses[i]); + } + for (int i = 2; i < 4; i++) { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + + for (int i = 0; i < 2; i++) { + std::string value; + ASSERT_OK(db_->Get(read_opts, keys[i], &value)); + std::unique_ptr it1(db_->NewIterator(read_opts)); + ASSERT_NE(nullptr, it1); + ASSERT_OK(it1->status()); + // TODO(zjay) Fix seek with prefix + // it1->Seek(keys[i]); + // ASSERT_TRUE(it1->Valid()); + } + + for (int i = 2; i < 4; i++) { + std::string value; + Status s = db_->Get(read_opts, keys[i], &value); + ASSERT_TRUE(s.IsNotFound()); + } + } + Close(); +} + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampFilterPrefixSettings, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(nullptr), + std::shared_ptr(NewBloomFilterPolicy(10, true)), + std::shared_ptr(NewBloomFilterPolicy(10, + false))), + ::testing::Bool(), ::testing::Bool(), + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), + std::shared_ptr(NewFixedPrefixTransform(4)), + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Bool(), ::testing::Values(0, 0.1), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +class DataVisibilityTest : public DBBasicTestWithTimestampBase { + public: + DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") { + // 
Initialize test data + for (int i = 0; i < kTestDataSize; i++) { + test_data_[i].key = "key" + ToString(i); + test_data_[i].value = "value" + ToString(i); + test_data_[i].timestamp = Timestamp(i, 0); + test_data_[i].ts = i; + test_data_[i].seq_num = kMaxSequenceNumber; + } + } + + protected: + struct TestData { + std::string key; + std::string value; + int ts; + std::string timestamp; + SequenceNumber seq_num; + }; + + constexpr static int kTestDataSize = 3; + TestData test_data_[kTestDataSize]; + + void PutTestData(int index, ColumnFamilyHandle* cfh = nullptr) { + ASSERT_LE(index, kTestDataSize); + WriteOptions write_opts; + Slice ts_slice = test_data_[index].timestamp; + write_opts.timestamp = &ts_slice; + + if (cfh == nullptr) { + ASSERT_OK( + db_->Put(write_opts, test_data_[index].key, test_data_[index].value)); + const Snapshot* snap = db_->GetSnapshot(); + test_data_[index].seq_num = snap->GetSequenceNumber(); + if (index > 0) { + ASSERT_GT(test_data_[index].seq_num, test_data_[index - 1].seq_num); + } + db_->ReleaseSnapshot(snap); + } else { + ASSERT_OK(db_->Put(write_opts, cfh, test_data_[index].key, + test_data_[index].value)); + } + } + + void AssertVisibility(int ts, SequenceNumber seq, + std::vector statuses) { + ASSERT_EQ(kTestDataSize, statuses.size()); + for (int i = 0; i < kTestDataSize; i++) { + if (test_data_[i].seq_num <= seq && test_data_[i].ts <= ts) { + ASSERT_OK(statuses[i]); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + } + } + + std::vector GetKeys() { + std::vector ret(kTestDataSize); + for (int i = 0; i < kTestDataSize; i++) { + ret[i] = test_data_[i].key; + } + return ret; + } + + void VerifyDefaultCF(int ts, const Snapshot* snap = nullptr) { + ReadOptions read_opts; + std::string read_ts = Timestamp(ts, 0); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + read_opts.snapshot = snap; + + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + std::vector cfs(kTestDataSize, cfh); + SequenceNumber seq = 
+ snap ? snap->GetSequenceNumber() : kMaxSequenceNumber - 1; + + // There're several MultiGet interfaces with not exactly the same + // implementations, query data with all of them. + auto keys = GetKeys(); + std::vector values; + auto s1 = db_->MultiGet(read_opts, cfs, keys, &values); + AssertVisibility(ts, seq, s1); + + auto s2 = db_->MultiGet(read_opts, keys, &values); + AssertVisibility(ts, seq, s2); + + std::vector timestamps; + auto s3 = db_->MultiGet(read_opts, cfs, keys, &values, ×tamps); + AssertVisibility(ts, seq, s3); + + auto s4 = db_->MultiGet(read_opts, keys, &values, ×tamps); + AssertVisibility(ts, seq, s4); + + std::vector values_ps5(kTestDataSize); + std::vector s5(kTestDataSize); + db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps5.data(), + s5.data()); + AssertVisibility(ts, seq, s5); + + std::vector values_ps6(kTestDataSize); + std::vector s6(kTestDataSize); + std::vector timestamps_array(kTestDataSize); + db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps6.data(), + timestamps_array.data(), s6.data()); + AssertVisibility(ts, seq, s6); + + std::vector values_ps7(kTestDataSize); + std::vector s7(kTestDataSize); + db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(), + values_ps7.data(), s7.data()); + AssertVisibility(ts, seq, s7); + + std::vector values_ps8(kTestDataSize); + std::vector s8(kTestDataSize); + db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(), + values_ps8.data(), timestamps_array.data(), s8.data()); + AssertVisibility(ts, seq, s8); + } + + void VerifyDefaultCF(const Snapshot* snap = nullptr) { + for (int i = 0; i <= kTestDataSize; i++) { + VerifyDefaultCF(i, snap); + } + } +}; +constexpr int DataVisibilityTest::kTestDataSize; + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=90 +// ts=100 +// seq=10 +// seq'=11 +// write finishes +// GetImpl(ts,seq) +// It is OK to return if ts>=t1 AND seq>=s1. 
If ts>=1t1 but seqDisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::GetImpl:3", + "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"}, + {"DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut", + "DBImpl::GetImpl:4"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value"); + ASSERT_OK(s); + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut"); + }); + ReadOptions read_opts; + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + + writer_thread.join(); + ASSERT_TRUE(s.IsNotFound()); + Close(); +} + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=90 +// ts=100 +// seq=10 +// seq'=11 +// write finishes +// Flush +// GetImpl(ts,seq) +// It is OK to return if ts>=t1 AND seq>=s1. 
If ts>=t1 but seqDisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::GetImpl:3", + "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"}, + {"DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut", + "DBImpl::GetImpl:4"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value"); + ASSERT_OK(s); + ASSERT_OK(Flush()); + + write_ts_str = Timestamp(2, 0); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + s = db_->Put(write_opts, "bar", "value"); + ASSERT_OK(s); + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut"); + }); + ReadOptions read_opts; + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + writer_thread.join(); + ASSERT_TRUE(s.IsNotFound()); + Close(); +} + +// Application specifies both timestamp and snapshot. +// reader writer +// seq=10 +// ts'=90 +// ts=100 +// seq'=11 +// write finishes +// GetImpl(ts,seq) +// Since application specifies both timestamp and snapshot, application expects +// to see data that visible in BOTH timestamp and sequence number. Therefore, +// can be returned only if t1<=ts AND s1<=seq. 
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot1) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap", + "DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"}, + {"DataVisibilityTest::PointLookupWithSnapshot1:AfterPut", + "DBImpl::GetImpl:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value"); + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:AfterPut"); + ASSERT_OK(s); + }); + ReadOptions read_opts; + const Snapshot* snap = db_->GetSnapshot(); + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap"); + read_opts.snapshot = snap; + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + writer_thread.join(); + + ASSERT_TRUE(s.IsNotFound()); + + db_->ReleaseSnapshot(snap); + Close(); +} + +// Application specifies both timestamp and snapshot. +// reader writer +// seq=10 +// ts'=90 +// ts=100 +// seq'=11 +// write finishes +// Flush +// GetImpl(ts,seq) +// Since application specifies both timestamp and snapshot, application expects +// to see data that visible in BOTH timestamp and sequence number. Therefore, +// can be returned only if t1<=ts AND s1<=seq. 
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot2) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap", + "DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value1"); + ASSERT_OK(s); + ASSERT_OK(Flush()); + + write_ts_str = Timestamp(2, 0); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + s = db_->Put(write_opts, "bar", "value2"); + ASSERT_OK(s); + }); + const Snapshot* snap = db_->GetSnapshot(); + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap"); + writer_thread.join(); + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.snapshot = snap; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + db_->ReleaseSnapshot(snap); + Close(); +} + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=90 +// ts=100 +// seq=10 +// seq'=11 +// write finishes +// scan(ts,seq) +// can be seen in scan as long as ts>=t1 AND seq>=s1. 
If ts>=t1 but +// seqDisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::NewIterator:3", + "DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + WriteOptions write_opts; + TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"); + for (int i = 0; i < 3; ++i) { + std::string write_ts_str = Timestamp(i + 1, 0); + Slice write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + Status s = db_->Put(write_opts, "key" + std::to_string(i), + "value" + std::to_string(i)); + ASSERT_OK(s); + } + }); + std::string read_ts_str = Timestamp(10, 0); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.total_order_seek = true; + read_opts.timestamp = &read_ts; + Iterator* it = db_->NewIterator(read_opts); + ASSERT_NE(nullptr, it); + writer_thread.join(); + it->SeekToFirst(); + ASSERT_FALSE(it->Valid()); + delete it; + Close(); +} + +// Application specifies both timestamp and snapshot. +// reader writer +// seq=10 +// ts'=90 +// ts=100 seq'=11 +// write finishes +// scan(ts,seq) +// can be seen by the scan only if t1<=ts AND s1<=seq. If t1<=ts +// but s1>seq, then the key should not be returned. 
+TEST_F(DataVisibilityTest, RangeScanWithSnapshot) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot", + "DataVisibilityTest::RangeScanWithSnapshot:BeforePut"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + WriteOptions write_opts; + TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithSnapshot:BeforePut"); + for (int i = 0; i < 3; ++i) { + std::string write_ts_str = Timestamp(i + 1, 0); + Slice write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + Status s = db_->Put(write_opts, "key" + std::to_string(i), + "value" + std::to_string(i)); + ASSERT_OK(s); + } + }); + const Snapshot* snap = db_->GetSnapshot(); + TEST_SYNC_POINT( + "DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot"); + + writer_thread.join(); + + std::string read_ts_str = Timestamp(10, 0); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.snapshot = snap; + read_opts.total_order_seek = true; + read_opts.timestamp = &read_ts; + Iterator* it = db_->NewIterator(read_opts); + ASSERT_NE(nullptr, it); + it->Seek("key0"); + ASSERT_FALSE(it->Valid()); + + delete it; + db_->ReleaseSnapshot(snap); + Close(); +} + +// Application specifies both timestamp and snapshot. +// Query each combination and make sure for MultiGet key , only +// return keys that ts>=t1 AND seq>=s1. 
+TEST_F(DataVisibilityTest, MultiGetWithTimestamp) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + const Snapshot* snap0 = db_->GetSnapshot(); + PutTestData(0); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + + const Snapshot* snap1 = db_->GetSnapshot(); + PutTestData(1); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + + ASSERT_OK(Flush()); + + const Snapshot* snap2 = db_->GetSnapshot(); + PutTestData(2); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + VerifyDefaultCF(snap2); + + db_->ReleaseSnapshot(snap0); + db_->ReleaseSnapshot(snap1); + db_->ReleaseSnapshot(snap2); + + Close(); +} + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=0, 1 +// ts=3 +// seq=10 +// seq'=11, 12 +// write finishes +// MultiGet(ts,seq) +// For MultiGet , only return keys that ts>=t1 AND seq>=s1. 
+TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::MultiGet:AfterGetSeqNum1", + "DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"}, + {"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut", + "DBImpl::MultiGet:AfterGetSeqNum2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"); + PutTestData(0); + PutTestData(1); + TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut"); + }); + + ReadOptions read_opts; + std::string read_ts = Timestamp(kTestDataSize, 0); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + auto keys = GetKeys(); + std::vector values; + auto ss = db_->MultiGet(read_opts, keys, &values); + + writer_thread.join(); + for (auto s : ss) { + ASSERT_TRUE(s.IsNotFound()); + } + VerifyDefaultCF(); + Close(); +} + +TEST_F(DataVisibilityTest, MultiGetCrossCF) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + CreateAndReopenWithCF({"second"}, options); + ColumnFamilyHandle* second_cf = handles_[1]; + + const Snapshot* snap0 = db_->GetSnapshot(); + PutTestData(0); + PutTestData(0, second_cf); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + + const Snapshot* snap1 = db_->GetSnapshot(); + PutTestData(1); + PutTestData(1, second_cf); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + + ASSERT_OK(Flush()); + + const Snapshot* snap2 = db_->GetSnapshot(); + PutTestData(2); + PutTestData(2, second_cf); + VerifyDefaultCF(); 
+ VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + VerifyDefaultCF(snap2); + + ReadOptions read_opts; + std::string read_ts = Timestamp(kTestDataSize, 0); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + read_opts.snapshot = snap1; + auto keys = GetKeys(); + auto keys2 = GetKeys(); + keys.insert(keys.end(), keys2.begin(), keys2.end()); + std::vector cfs(kTestDataSize, + db_->DefaultColumnFamily()); + std::vector cfs2(kTestDataSize, second_cf); + cfs.insert(cfs.end(), cfs2.begin(), cfs2.end()); + + std::vector values; + auto ss = db_->MultiGet(read_opts, cfs, keys, &values); + for (int i = 0; i < 2 * kTestDataSize; i++) { + if (i % 3 == 0) { + // only the first key for each column family should be returned + ASSERT_OK(ss[i]); + } else { + ASSERT_TRUE(ss[i].IsNotFound()); + } + } + + db_->ReleaseSnapshot(snap0); + db_->ReleaseSnapshot(snap1); + db_->ReleaseSnapshot(snap2); + Close(); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class DBBasicTestWithTimestampCompressionSettings + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, CompressionType, + uint32_t, uint32_t>> { + public: + DBBasicTestWithTimestampCompressionSettings() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_compression") {} +}; + +TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGet) { + const int kNumKeysPerFile = 1024; + const size_t kNumTimestamps = 4; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + size_t ts_sz = Timestamp(0, 0).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const CompressionType comp_type = 
std::get<1>(GetParam()); +#if LZ4_VERSION_NUMBER < 10400 // r124+ + if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) { + return; + } +#endif // LZ4_VERSION_NUMBER >= 10400 + if (!ZSTD_Supported() && comp_type == kZSTD) { + return; + } + if (!Zlib_Supported() && comp_type == kZlibCompression) { + return; + } + + options.compression = comp_type; + options.compression_opts.max_dict_bytes = std::get<2>(GetParam()); + if (comp_type == kZSTD) { + options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam()); + } + options.compression_opts.parallel_threads = std::get<3>(GetParam()); + options.target_file_size_base = 1 << 26; // 64MB + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.push_back(Timestamp(i * 2, 0)); + read_ts_list.push_back(Timestamp(1 + i * 2, 0)); + const Slice write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put(cf, Key1(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); + Close(); +} + +TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) { + Options options = 
CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + const int kNumKeysPerFile = 1024; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const CompressionType comp_type = std::get<1>(GetParam()); +#if LZ4_VERSION_NUMBER < 10400 // r124+ + if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) { + return; + } +#endif // LZ4_VERSION_NUMBER >= 10400 + if (!ZSTD_Supported() && comp_type == kZSTD) { + return; + } + if (!Zlib_Supported() && comp_type == kZlibCompression) { + return; + } + + options.compression = comp_type; + options.compression_opts.max_dict_bytes = std::get<2>(GetParam()); + if (comp_type == kZSTD) { + options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam()); + } + options.compression_opts.parallel_threads = std::get<3>(GetParam()); + options.target_file_size_base = 1 << 26; // 64MB + + DestroyAndReopen(options); + + const size_t kNumL0Files = + static_cast(Options().level0_file_num_compaction_trigger); + { + // Half of the keys will go through Deletion and remaining half with + // SingleDeletion. 
Generate enough L0 files with ts=1 to trigger compaction + // to L1 + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + WriteOptions wopts; + wopts.timestamp = &ts; + for (size_t i = 0; i < kNumL0Files; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(db_->Put(wopts, Key1(j), "value" + std::to_string(i))); + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Generate another L0 at ts=3 + ts_str = Timestamp(3, 0); + ts = ts_str; + wopts.timestamp = &ts; + for (int i = 0; i < kNumKeysPerFile; ++i) { + std::string key_str = Key1(i); + Slice key(key_str); + if ((i % 3) == 0) { + if (i < kNumKeysPerFile / 2) { + ASSERT_OK(db_->Delete(wopts, key)); + } else { + ASSERT_OK(db_->SingleDelete(wopts, key)); + } + } else { + ASSERT_OK(db_->Put(wopts, key, "new_value")); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + // Populate memtable at ts=5 + ts_str = Timestamp(5, 0); + ts = ts_str; + wopts.timestamp = &ts; + for (int i = 0; i != kNumKeysPerFile; ++i) { + std::string key_str = Key1(i); + Slice key(key_str); + if ((i % 3) == 1) { + if (i < kNumKeysPerFile / 2) { + ASSERT_OK(db_->Delete(wopts, key)); + } else { + ASSERT_OK(db_->SingleDelete(wopts, key)); + } + } else if ((i % 3) == 2) { + ASSERT_OK(db_->Put(wopts, key, "new_value_2")); + } + } + } + { + std::string ts_str = Timestamp(6, 0); + Slice ts = ts_str; + ReadOptions ropts; + ropts.timestamp = &ts; + for (uint64_t i = 0; i != static_cast(kNumKeysPerFile); ++i) { + std::string value; + Status s = db_->Get(ropts, Key1(i), &value); + if ((i % 3) == 2) { + ASSERT_OK(s); + ASSERT_EQ("new_value_2", value); + } else { + ASSERT_TRUE(s.IsNotFound()); + } + } + } +} + +#ifndef ROCKSDB_LITE +// A class which remembers the name of each flushed file. 
+class FlushedFileCollector : public EventListener { + public: + FlushedFileCollector() {} + ~FlushedFileCollector() override {} + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + InstrumentedMutexLock lock(&mutex_); + flushed_files_.push_back(info.file_path); + } + + std::vector GetFlushedFiles() { + std::vector result; + { + InstrumentedMutexLock lock(&mutex_); + result = flushed_files_; + } + return result; + } + + void ClearFlushedFiles() { + InstrumentedMutexLock lock(&mutex_); + flushed_files_.clear(); + } + + private: + std::vector flushed_files_; + InstrumentedMutex mutex_; +}; + +TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGetWithCompaction) { + const int kNumKeysPerFile = 1024; + const size_t kNumTimestamps = 2; + const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps; + const size_t kSplitPosBase = kNumKeysPerTimestamp / 2; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + size_t ts_sz = Timestamp(0, 0).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const CompressionType comp_type = std::get<1>(GetParam()); +#if LZ4_VERSION_NUMBER < 10400 // r124+ + if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) { + return; + } +#endif // LZ4_VERSION_NUMBER >= 10400 + if (!ZSTD_Supported() && comp_type == kZSTD) { + return; + } + if (!Zlib_Supported() && comp_type == kZlibCompression) { + return; + } + + options.compression = comp_type; + options.compression_opts.max_dict_bytes = std::get<2>(GetParam()); + if (comp_type == kZSTD) { + 
options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam()); + } + options.compression_opts.parallel_threads = std::get<3>(GetParam()); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_list; + std::vector read_ts_list; + + const auto& verify_records_func = [&](size_t i, size_t begin, size_t end, + ColumnFamilyHandle* cfh) { + std::string value; + std::string timestamp; + + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + std::string expected_timestamp = + std::string(write_ts_list[i].data(), write_ts_list[i].size()); + + for (size_t j = begin; j <= end; ++j) { + ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value, ×tamp)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), value); + ASSERT_EQ(expected_timestamp, timestamp); + } + }; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.push_back(Timestamp(i * 2, 0)); + read_ts_list.push_back(Timestamp(1 + i * 2, 0)); + const Slice write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + size_t memtable_get_start = 0; + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + ASSERT_OK(Put(cf, Key1(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) { + verify_records_func(i, memtable_get_start, j, handles_[cf]); + memtable_get_start = j + 1; + + // flush all keys with the same timestamp to two sst files, split at + // incremental positions such that lowerlevel[1].smallest.userkey == + // higherlevel[0].largest.userkey + ASSERT_OK(Flush(cf)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // wait for flush (which + // is also a compaction) + + // compact files (2 at each level) to a lower level such that all + // keys with the same timestamp is at one level, with 
newer versions + // at higher levels. + CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + ASSERT_OK(db_->CompactFiles(compact_opt, handles_[cf], + collector->GetFlushedFiles(), + static_cast(kNumTimestamps - i))); + collector->ClearFlushedFiles(); + } + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + std::string expected_timestamp(write_ts_list[i].data(), + write_ts_list[i].size()); + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + verify_records_func(i, 0, kNumKeysPerTimestamp - 1, cfh); + } + } + }; + verify_db_func(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 2; + const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.memtable_whole_key_filtering = true; + + size_t ts_sz = Timestamp(0, 0).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_list; + std::vector read_ts_list; + + const auto& verify_records_func = [&](size_t i, ColumnFamilyHandle* cfh) { + std::vector keys; + std::vector key_vals; + std::vector values; + std::vector timestamps; + + for (size_t j = 0; j != 
kNumKeysPerTimestamp; ++j) { + key_vals.push_back(Key1(j)); + } + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + keys.push_back(key_vals[j]); + } + + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + std::string expected_timestamp(write_ts_list[i].data(), + write_ts_list[i].size()); + + std::vector cfhs(keys.size(), cfh); + std::vector statuses = + db_->MultiGet(ropts, cfhs, keys, &values, ×tamps); + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + ASSERT_OK(statuses[j]); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + values[j]); + ASSERT_EQ(expected_timestamp, timestamps[j]); + } + }; + + const std::string dummy_ts(ts_sz, '\0'); + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.push_back(Timestamp(i * 2, 0)); + read_ts_list.push_back(Timestamp(1 + i * 2, 0)); + const Slice& write_ts = write_ts_list.back(); + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + WriteOptions wopts; + WriteBatch batch; + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + const std::string key = Key1(j); + const std::string value = + "value_" + std::to_string(j) + "_" + std::to_string(i); + std::array key_with_ts_slices{{key, dummy_ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{value}}; + SliceParts values(value_slices.data(), 1); + ASSERT_OK(batch.Put(handles_[cf], key_with_ts, values)); + } + ASSERT_OK(batch.AssignTimestamp(write_ts)); + ASSERT_OK(db_->Write(wopts, &batch)); + + verify_records_func(i, handles_[cf]); + + ASSERT_OK(Flush(cf)); + } + } + + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + verify_records_func(i, cfh); + } + } + }; + verify_db_func(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, 
MultiGetNoReturnTs) { + Options options = CurrentOptions(); + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "foo", "value")); + ASSERT_OK(db_->Put(write_opts, "bar", "value")); + ASSERT_OK(db_->Put(write_opts, "fooxxxxxxxxxxxxxxxx", "value")); + ASSERT_OK(db_->Put(write_opts, "barxxxxxxxxxxxxxxxx", "value")); + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + { + ColumnFamilyHandle* column_families[] = {cfh, cfh}; + Slice keys[] = {"foo", "bar"}; + PinnableSlice values[] = {PinnableSlice(), PinnableSlice()}; + Status statuses[] = {Status::OK(), Status::OK()}; + dbfull()->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0], + &values[0], &statuses[0], /*sorted_input=*/false); + for (const auto& s : statuses) { + ASSERT_OK(s); + } + } + { + ColumnFamilyHandle* column_families[] = {cfh, cfh, cfh, cfh}; + // Make user keys longer than configured timestamp size (16 bytes) to + // verify RocksDB does not use the trailing bytes 'x' as timestamp. 
+ Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo", "bar"}; + PinnableSlice values[] = {PinnableSlice(), PinnableSlice(), PinnableSlice(), + PinnableSlice()}; + Status statuses[] = {Status::OK(), Status::OK(), Status::OK(), + Status::OK()}; + dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0], &keys[0], + &values[0], &statuses[0], /*sorted_input=*/false); + for (const auto& s : statuses) { + ASSERT_OK(s); + } + } + Close(); +} + +#endif // !ROCKSDB_LITE + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampCompressionSettings, + ::testing::Combine( + ::testing::Values(std::shared_ptr(nullptr), + std::shared_ptr( + NewBloomFilterPolicy(10, false))), + ::testing::Values(kNoCompression, kZlibCompression, kLZ4Compression, + kLZ4HCCompression, kZSTD), + ::testing::Values(0, 1 << 14), ::testing::Values(1, 4))); + +class DBBasicTestWithTimestampPrefixSeek + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, + std::shared_ptr, bool, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTimestampPrefixSeek() + : DBBasicTestWithTimestampBase( + "/db_basic_test_with_timestamp_prefix_seek") {} +}; + +TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { + const size_t kNumKeysPerFile = 128; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor = std::get<0>(GetParam()); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const uint64_t kMaxKey = 0xffffffffffffffff; + const uint64_t kMinKey = 0xfffffffffffff000; + const 
std::vector write_ts_list = {Timestamp(3, 0xffffffff), + Timestamp(6, 0xffffffff)}; + WriteOptions write_opts; + { + for (size_t i = 0; i != write_ts_list.size(); ++i) { + Slice write_ts = write_ts_list[i]; + write_opts.timestamp = &write_ts; + for (uint64_t key = kMaxKey; key >= kMinKey; --key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + } + const std::vector read_ts_list = {Timestamp(5, 0xffffffff), + Timestamp(9, 0xffffffff)}; + { + ReadOptions read_opts; + read_opts.total_order_seek = false; + read_opts.prefix_same_as_start = std::get<2>(GetParam()); + fprintf(stdout, "%s %s %d\n", options.prefix_extractor->Name(), + bbto.filter_policy ? bbto.filter_policy->Name() : "null", + static_cast(read_opts.prefix_same_as_start)); + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + + // Seek to kMaxKey + iter->Seek(Key1(kMaxKey)); + CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + // Seek to kMinKey + iter->Seek(Key1(kMinKey)); + CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + } + const std::vector targets = {kMinKey, kMinKey + 0x10, + kMinKey + 0x100, kMaxKey}; + const SliceTransform* const pe = options.prefix_extractor.get(); + ASSERT_NE(nullptr, pe); + const size_t kPrefixShift = + 8 * (Key1(0).size() - pe->Transform(Key1(0)).size()); + const uint64_t kPrefixMask = + ~((static_cast(1) << kPrefixShift) - 1); + const uint64_t kNumKeysWithinPrefix = + (static_cast(1) << kPrefixShift); + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + // Forward and 
backward iterate. + for (size_t j = 0; j != targets.size(); ++j) { + std::string start_key = Key1(targets[j]); + uint64_t expected_ub = + (targets[j] & kPrefixMask) - 1 + kNumKeysWithinPrefix; + uint64_t expected_key = targets[j]; + size_t count = 0; + it->Seek(Key1(targets[j])); + while (it->Valid()) { + std::string saved_prev_key; + saved_prev_key.assign(it->key().data(), it->key().size()); + + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(saved_prev_key) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + ++expected_key; + it->Next(); + } + ASSERT_EQ(expected_ub - targets[j] + 1, count); + + count = 0; + expected_key = targets[j]; + it->SeekForPrev(start_key); + uint64_t expected_lb = (targets[j] & kPrefixMask); + while (it->Valid()) { + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(it->key()) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + --expected_key; + it->Prev(); + } + ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count); + } + } + } + Close(); +} + +// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g. +// NoopTransform. 
+INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampPrefixSeek, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), + std::shared_ptr(NewFixedPrefixTransform(4)), + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Values(std::shared_ptr(nullptr), + std::shared_ptr( + NewBloomFilterPolicy(10 /*bits_per_key*/, false)), + std::shared_ptr( + NewBloomFilterPolicy(20 /*bits_per_key*/, + false))), + ::testing::Bool(), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); + +class DBBasicTestWithTsIterTombstones + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, + std::shared_ptr, int, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTsIterTombstones() + : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {} +}; + +TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) { + constexpr size_t kNumKeysPerFile = 128; + Options options = CurrentOptions(); + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor = std::get<0>(GetParam()); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.num_levels = std::get<2>(GetParam()); + DestroyAndReopen(options); + std::vector write_ts_strs = {Timestamp(2, 0), Timestamp(4, 0)}; + constexpr uint64_t kMaxKey = 0xffffffffffffffff; + constexpr uint64_t kMinKey = 0xfffffffffffff000; + // Insert kMinKey...kMaxKey + uint64_t 
key = kMinKey; + WriteOptions write_opts; + Slice ts = write_ts_strs[0]; + write_opts.timestamp = &ts; + do { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key)); + ASSERT_OK(s); + if (kMaxKey == key) { + break; + } + ++key; + } while (true); + + ts = write_ts_strs[1]; + write_opts.timestamp = &ts; + for (key = kMaxKey; key >= kMinKey; --key) { + Status s; + if (0 != (key % 2)) { + s = db_->Put(write_opts, Key1(key), "value1" + std::to_string(key)); + } else { + s = db_->Delete(write_opts, Key1(key)); + } + ASSERT_OK(s); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + { + std::string read_ts = Timestamp(4, 0); + ts = read_ts; + ReadOptions read_opts; + read_opts.total_order_seek = true; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + size_t count = 0; + key = kMinKey + 1; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++count, key += 2) { + ASSERT_EQ(Key1(key), iter->key()); + ASSERT_EQ("value1" + std::to_string(key), iter->value()); + } + ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); + + for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid(); + key -= 2, ++count, iter->Prev()) { + ASSERT_EQ(Key1(key), iter->key()); + ASSERT_EQ("value1" + std::to_string(key), iter->value()); + } + ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); + } + Close(); +} + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTsIterTombstones, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Values(std::shared_ptr(nullptr), + std::shared_ptr( + NewBloomFilterPolicy(10, false)), + std::shared_ptr( + NewBloomFilterPolicy(20, false))), + ::testing::Values(2, 6), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); 
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,121 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction/compaction.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +std::string Key1(uint64_t key) { + std::string ret; + PutFixed64(&ret, key); + std::reverse(ret.begin(), ret.end()); + return ret; +} + +std::string Timestamp(uint64_t ts) { + std::string ret; + PutFixed64(&ret, ts); + return ret; +} +} // anonymous namespace + +class TimestampCompatibleCompactionTest : public DBTestBase { + public: + TimestampCompatibleCompactionTest() + : DBTestBase("ts_compatible_compaction_test", /*env_do_fsync=*/true) {} + + std::string Get(const std::string& key, uint64_t ts) { + ReadOptions read_opts; + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + read_opts.timestamp = &ts_slice; + std::string value; + Status s = db_->Get(read_opts, key, &value); + if (s.IsNotFound()) { + value.assign("NOT_FOUND"); + } else if (!s.ok()) { + value.assign(s.ToString()); + } + return value; + } +}; + +TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) { + Options options = CurrentOptions(); + options.env = env_; + options.compaction_style = kCompactionStyleLevel; + options.comparator = test::ComparatorWithU64Ts(); + options.level0_file_num_compaction_trigger = 3; + constexpr size_t kNumKeysPerFile = 101; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + const auto* compaction = reinterpret_cast(arg); + ASSERT_NE(nullptr, compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1, compaction->num_input_levels()); + // Check that all 3 L0 ssts are picked for level compaction. 
+ ASSERT_EQ(3, compaction->num_input_files(0)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + // Write a L0 with keys 0, 1, ..., 99 with ts from 100 to 199. + uint64_t ts = 100; + uint64_t key = 0; + WriteOptions write_opts; + for (; key < kNumKeysPerFile - 1; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + write_opts.timestamp = &ts_slice; + ASSERT_OK(db_->Put(write_opts, Key1(key), "foo_" + std::to_string(key))); + } + // Write another L0 with keys 99 with newer ts. + ASSERT_OK(Flush()); + uint64_t saved_read_ts1 = ts++; + key = 99; + for (int i = 0; i < 4; ++i, ++ts) { + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + write_opts.timestamp = &ts_slice; + ASSERT_OK(db_->Put(write_opts, Key1(key), "bar_" + std::to_string(key))); + } + ASSERT_OK(Flush()); + uint64_t saved_read_ts2 = ts++; + // Write another L0 with keys 99, 100, 101, ..., 150 + for (; key <= 150; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + write_opts.timestamp = &ts_slice; + ASSERT_OK(db_->Put(write_opts, Key1(key), "foo1_" + std::to_string(key))); + } + ASSERT_OK(Flush()); + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + uint64_t read_ts = ts; + ASSERT_EQ("foo_99", Get(Key1(99), saved_read_ts1)); + ASSERT_EQ("bar_99", Get(Key1(99), saved_read_ts2)); + ASSERT_EQ("foo1_99", Get(Key1(99), read_ts)); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,793 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "db/write_thread.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBWriteBufferManagerTest : public DBTestBase, + public testing::WithParamInterface { + public: + DBWriteBufferManagerTest() + : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {} + bool cost_cache_; +}; + +TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". 
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + // This make sures write will go through and if stall was in effect, it will + // end. + ASSERT_OK(Put(0, Key(2), DummyString(1), wo)); +} + +// Test Single DB with multiple writer threads get blocked when +// WriteBufferManager execeeds buffer_size_ and flush is waiting to be +// finished. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. 
+ + std::unordered_set w_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_set.insert(w); + // Allow the flush to continue if all writer threads are blocked. + if (w_set.size() == (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // and they will be blocked. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(writer, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. 
+ { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_writers; i++) { + threads.emplace_back(writer, i % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_writers); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush +// is waiting to be finished but DBs tries to write meanwhile. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager Limit exceeded. 
+ std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Since this is the last DB, signal Flush to continue. + if (wait_count_db == num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + // Write to DB. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // db_ will write and will be blocked (as Flush will on hold and will create + // stall in effect). + // | + // multiple dbs writers will be created to write to that db and they will be + // blocked. + // | + // | + // Last writer will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(write_db, db_); + // Wait untill first DB is blocked and then create the multiple writers for + // different DBs which will be blocked from getting added to the queue because + // stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_dbs; i++) { + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple DBs and multiple columns get +// blocked when stall by WriteBufferManager is in effect. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. 
+ std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_set; + std::vector writer_threads; + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + thread_num.fetch_add(1); + cv.Signal(); + // Allow the flush to continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + { + InstrumentedMutexLock lock(&mutex); + w_set.insert(w); + thread_num.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + // Write to multiple columns of db_. + std::function write_cf = [&](int cf) { + Status tmp = Put(cf, Key(3), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + // Write to multiple DBs. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s2 = s2 && tmp.ok(); + }; + + // Flow: + // thread will write to db_ will be blocked (as Flush will on hold, + // buffer_size_ has exceeded and will create stall in effect). 
+ // | + // | + // multiple writers threads writing to different DBs and to db_ across + // multiple columns will be created and they will be blocked due to stall. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + threads.emplace_back(write_db, db_); + // Wait untill first thread is blocked and then create the multiple writer + // threads. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i++) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_cf, i % 3); + // Write to different dbs. + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + + // Number of DBs blocked. + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_dbs); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ by passing +// different values to WriteOption.no_slown_down. 
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // db_ is completed. 
+ + std::unordered_set w_slowdown_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load( + std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + + std::function write_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + + (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // with different values of WriteOptions.no_slowdown. Some of them will + // be blocked and some of them will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_writers; i += 2) { + threads.emplace_back(write_no_slow_down, (i) % 4); + threads.emplace_back(write_slow_down, (i + 1) % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_slowdown_set.size(), num_writers / 2); + // Number of Writer threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ and different +// dbs by passing different values to WriteOption.no_slown_down. 
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 4; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_slowdown_set; + std::vector writer_threads; + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + InstrumentedMutexLock lock(&mutex); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + std::function write_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // first thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). 
+ // | + // | + // multiple writer threads will be created to write across multiple columns + // of db_ and different DBs with different values of + // WriteOptions.no_slowdown. Some of them will be blocked and some of them + // will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, db_); + // Wait untill first thread writing to DB is blocked and then + // create the multiple writers. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i += 2) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_slow_down, db_); + writer_threads.emplace_back(write_no_slow_down, db_); + // Write to different DBs. + threads.emplace_back(write_slow_down, dbs[i]); + threads.emplace_back(write_no_slow_down, dbs[i + 1]); + } + + for (auto& t : threads) { + t.join(); + } + + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ((num_dbs / 2) + 1, wait_count_db); + // Number of writer threads writing to db_ blocked from getting added to the + // queue. + ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2); + // Number of threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs); + + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, + testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,25 +4,27 @@ // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include #include -#include + #include "db/db_test_util.h" #include "db/write_batch_internal.h" #include "db/write_thread.h" #include "port/port.h" #include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" +#include "util/random.h" #include "util/string_util.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { // Test variations of WriteImpl. 
class DBWriteTest : public DBTestBase, public testing::WithParamInterface { public: - DBWriteTest() : DBTestBase("/db_write_test") {} + DBWriteTest() : DBTestBase("db_write_test", /*env_do_fsync=*/true) {} Options GetOptions() { return DBTestBase::GetOptions(GetParam()); } @@ -40,6 +42,126 @@ ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument()); } +TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { + Options options = GetOptions(); + options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = + 4; + std::vector threads; + std::atomic thread_num(0); + port::Mutex mutex; + port::CondVar cv(&mutex); + // Guarded by mutex + int writers = 0; + + Reopen(options); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); + }; + std::function unblock_main_thread_func = [&](void*) { + mutex.Lock(); + ++writers; + cv.SignalAll(); + mutex.Unlock(); + }; + + // Create 3 L0 files and schedule 4th without waiting + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1", + 
"DBImpl::BackgroundCallFlush:start"}, + {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2", + "DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"}, + // Make compaction start wait for the write stall to be detected and + // implemented by a write group leader + {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Schedule creation of 4th L0 file without waiting. This will seal the + // memtable and then wait for a sync point before writing the file. We need + // to do it this way because SwitchMemtable() needs to enter the + // write_thread + FlushOptions fopt; + fopt.wait = false; + ASSERT_OK(dbfull()->Flush(fopt)); + + // Create a mix of slowdown/no_slowdown write threads + mutex.Lock(); + // First leader + threads.emplace_back(write_slowdown_func); + while (writers != 1) { + cv.Wait(); + } + + // Second leader. Will stall writes + // Build a writers list with no slowdown in the middle: + // +-------------+ + // | slowdown +<----+ newest + // +--+----------+ + // | + // v + // +--+----------+ + // | no slowdown | + // +--+----------+ + // | + // v + // +--+----------+ + // | slowdown + + // +-------------+ + threads.emplace_back(write_slowdown_func); + while (writers != 2) { + cv.Wait(); + } + threads.emplace_back(write_no_slowdown_func); + while (writers != 3) { + cv.Wait(); + } + threads.emplace_back(write_slowdown_func); + while (writers != 4) { + cv.Wait(); + } + + mutex.Unlock(); + + TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1"); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); + // This would have triggered a write stall. Unblock the write group leader + TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2"); + // The leader is going to create missing newer links. 
When the leader + // finishes, the next leader is going to delay writes and fail writers with + // no_slowdown + + TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3"); + for (auto& t : threads) { + t.join(); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { Options options = GetOptions(); options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4; @@ -47,6 +169,8 @@ std::atomic thread_num(0); port::Mutex mutex; port::CondVar cv(&mutex); + // Guarded by mutex + int writers = 0; Reopen(options); @@ -55,29 +179,31 @@ std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; - dbfull()->Put(wo, key, "bar"); + ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; - dbfull()->Put(wo, key, "bar"); + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void *) { mutex.Lock(); + ++writers; cv.SignalAll(); mutex.Unlock(); }; // Create 3 L0 files and schedule 4th without waiting - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); @@ -98,28 +224,28 @@ // write_thread FlushOptions fopt; fopt.wait = false; - dbfull()->Flush(fopt); + ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); // First leader threads.emplace_back(write_slowdown_func); - cv.Wait(); + while (writers != 1) { + cv.Wait(); + } // Second leader. Will stall writes threads.emplace_back(write_slowdown_func); - cv.Wait(); threads.emplace_back(write_no_slowdown_func); - cv.Wait(); threads.emplace_back(write_slowdown_func); - cv.Wait(); threads.emplace_back(write_no_slowdown_func); - cv.Wait(); threads.emplace_back(write_slowdown_func); - cv.Wait(); + while (writers != 6) { + cv.Wait(); + } mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1"); - dbfull()->TEST_WaitForFlushMemTable(nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); // The leader is going to create missing newer links. When the leader finishes, @@ -129,12 +255,14 @@ for (auto& t : threads) { t.join(); } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { constexpr int kNumThreads = 5; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -181,6 +309,11 @@ threads[i].join(); } ASSERT_EQ(1, leader_count); + + // The Failed PUT operations can cause a BG error to be set. + // Mark it as Checked for the ASSERT_STATUS_CHECKED + dbfull()->Resume().PermitUncheckedError(); + // Close before mock_env destruct. 
Close(); } @@ -194,7 +327,7 @@ ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_TRUE(Put("key" + ToString(0), "value").ok()); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); @@ -203,7 +336,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -225,7 +358,9 @@ } */ if (!options.manual_wal_flush) { - ASSERT_FALSE(res.ok()); + ASSERT_NOK(res); + } else { + ASSERT_OK(res); } } // Close before mock_env destruct. @@ -235,7 +370,7 @@ TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) { Random rnd(301); std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); options.writable_file_max_buffer_size = 4 * 1024 * 1024; @@ -246,7 +381,7 @@ mock_env->SetFilesystemActive(false, Status::IOError("Not active")); Status s; for (int i = 0; i < 4 * 512; ++i) { - s = Put(Key(i), RandomString(&rnd, 1024)); + s = Put(Key(i), rnd.RandomString(1024)); if (!s.ok()) { break; } @@ -269,7 +404,7 @@ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false)); ASSERT_OK(dbfull()->UnlockWAL()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_OK(Put("key" + ToString(0), "value")); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_OK(dbfull()->LockWAL()); @@ -297,13 +432,14 @@ ROCKSDB_NAMESPACE::WriteOptions write_option_default; std::string no_wal_key = no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i); - this->Put(no_wal_key, no_wal_value, 
write_option_disable); + ASSERT_OK( + this->Put(no_wal_key, no_wal_value, write_option_disable)); std::string wal_key = wal_key_prefix + std::to_string(i) + "_" + std::to_string(i); - this->Put(wal_key, wal_value, write_option_default); - dbfull()->SyncWAL(); + ASSERT_OK(this->Put(wal_key, wal_value, write_option_default)); + ASSERT_OK(dbfull()->SyncWAL()); } - return 0; + return; }); } for (auto& t: threads) { @@ -325,5 +461,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,10 @@ #include "db/dbformat.h" #include + #include + +#include "db/lookup_key.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/coding.h" @@ -23,14 +26,9 @@ // and the value type is embedded as the low 8 bits in the sequence // number in internal keys, we need to use the highest-numbered // ValueType, not the lowest). 
-const ValueType kValueTypeForSeek = kTypeBlobIndex; +const ValueType kValueTypeForSeek = kTypeDeletionWithTimestamp; const ValueType kValueTypeForSeekForPrev = kTypeDeletion; - -uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { - assert(seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(t)); - return (seq << 8) | t; -} +const std::string kDisableUserTimestamp(""); EntryType GetEntryType(ValueType value_type) { switch (value_type) { @@ -38,6 +36,8 @@ return kEntryPut; case kTypeDeletion: return kEntryDelete; + case kTypeDeletionWithTimestamp: + return kEntryDeleteWithTimestamp; case kTypeSingleDeletion: return kEntrySingleDelete; case kTypeMerge: @@ -51,41 +51,53 @@ } } -bool ParseFullKey(const Slice& internal_key, FullKey* fkey) { - ParsedInternalKey ikey; - if (!ParseInternalKey(internal_key, &ikey)) { - return false; - } - fkey->user_key = ikey.user_key; - fkey->sequence = ikey.sequence; - fkey->type = GetEntryType(ikey.type); - return true; -} - -void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) { - *seq = packed >> 8; - *t = static_cast(packed & 0xff); - - assert(*seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(*t)); -} - void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { result->append(key.user_key.data(), key.user_key.size()); PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); } +void AppendInternalKeyWithDifferentTimestamp(std::string* result, + const ParsedInternalKey& key, + const Slice& ts) { + assert(key.user_key.size() >= ts.size()); + result->append(key.user_key.data(), key.user_key.size() - ts.size()); + result->append(ts.data(), ts.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + void AppendInternalKeyFooter(std::string* result, SequenceNumber s, ValueType t) { PutFixed64(result, PackSequenceAndType(s, t)); } -std::string ParsedInternalKey::DebugString(bool hex) const { +void AppendKeyWithMinTimestamp(std::string* result, const Slice& 
key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMin(ts_sz, static_cast(0)); + result->append(key.data(), key.size()); + result->append(kTsMin.data(), ts_sz); +} + +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMax(ts_sz, static_cast(0xff)); + result->append(key.data(), key.size()); + result->append(kTsMax.data(), ts_sz); +} + +std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { + std::string result = "'"; + if (log_err_key) { + result += user_key.ToString(hex); + } else { + result += ""; + } + char buf[50]; snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence, static_cast(type)); - std::string result = "'"; - result += user_key.ToString(hex); + result += buf; return result; } @@ -93,8 +105,8 @@ std::string InternalKey::DebugString(bool hex) const { std::string result; ParsedInternalKey parsed; - if (ParseInternalKey(rep_, &parsed)) { - result = parsed.DebugString(hex); + if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) { + result = parsed.DebugString(true /* log_err_key */, hex); // TODO } else { result = "(bad)"; result.append(EscapeString(rep_)); @@ -102,7 +114,12 @@ return result; } -const char* InternalKeyComparator::Name() const { return name_.c_str(); } +const char* InternalKeyComparator::Name() const { + if (name_.empty()) { + return "rocksdb.anonymous.InternalKeyComparator"; + } + return name_.c_str(); +} int InternalKeyComparator::Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,14 @@ #pragma once #include + #include #include #include -#include 
"db/lookup_key.h" -#include "db/merge_context.h" -#include "logging/logging.h" -#include "monitoring/perf_context_imp.h" + #include "rocksdb/comparator.h" -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" #include "util/user_comparator_wrapper.h" @@ -69,7 +64,9 @@ // generated by WriteUnprepared write policy is not mistakenly read by // another. kTypeBeginUnprepareXID = 0x13, // WAL only. - kMaxValue = 0x7F // Not used for storing records. + kTypeDeletionWithTimestamp = 0x14, + kTypeCommitXIDAndTimestamp = 0x15, // WAL only + kMaxValue = 0x7F // Not used for storing records. }; // Defined in dbformat.cc @@ -79,7 +76,8 @@ // Checks whether a type is an inline value type // (i.e. a type used in memtable skiplist and sst file datablock). inline bool IsValueType(ValueType t) { - return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex; + return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex || + kTypeDeletionWithTimestamp == t; } // Checks whether a type is from user operation @@ -94,6 +92,11 @@ static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; +constexpr uint64_t kNumInternalBytes = 8; + +// Defined in dbformat.cc +extern const std::string kDisableUserTimestamp; + // The data structure that represents an internal key in the way that user_key, // sequence number and type are stored in separated forms. struct ParsedInternalKey { @@ -102,59 +105,95 @@ ValueType type; ParsedInternalKey() - : sequence(kMaxSequenceNumber) // Make code analyzer happy - {} // Intentionally left uninitialized (for speed) + : sequence(kMaxSequenceNumber), + type(kTypeDeletion) // Make code analyzer happy + {} // Intentionally left uninitialized (for speed) + // u contains timestamp if user timestamp feature is enabled. 
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) : user_key(u), sequence(seq), type(t) {} - std::string DebugString(bool hex = false) const; + std::string DebugString(bool log_err_key, bool hex) const; void clear() { user_key.clear(); sequence = 0; type = kTypeDeletion; } + + void SetTimestamp(const Slice& ts) { + assert(ts.size() <= user_key.size()); + const char* addr = user_key.data() + user_key.size() - ts.size(); + memcpy(const_cast(addr), ts.data(), ts.size()); + } }; // Return the length of the encoding of "key". inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { - return key.user_key.size() + 8; + return key.user_key.size() + kNumInternalBytes; } // Pack a sequence number and a ValueType into a uint64_t -extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t); +inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(IsExtendedValueType(t)); + return (seq << 8) | t; +} // Given the result of PackSequenceAndType, store the sequence number in *seq // and the ValueType in *t. -extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t); +inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, + ValueType* t) { + *seq = packed >> 8; + *t = static_cast(packed & 0xff); + + // Commented the following two assertions in order to test key-value checksum + // on corrupted keys without crashing ("DbKvChecksumTest"). + // assert(*seq <= kMaxSequenceNumber); + // assert(IsExtendedValueType(*t)); +} EntryType GetEntryType(ValueType value_type); // Append the serialization of "key" to *result. extern void AppendInternalKey(std::string* result, const ParsedInternalKey& key); + +// Append the serialization of "key" to *result, replacing the original +// timestamp with argument ts. 
+extern void AppendInternalKeyWithDifferentTimestamp( + std::string* result, const ParsedInternalKey& key, const Slice& ts); + // Serialized internal key consists of user key followed by footer. // This function appends the footer to *result, assuming that *result already // contains the user key at the end. extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s, ValueType t); +// Append the key and a minimal timestamp to *result +extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// Append the key and a maximal timestamp to *result +extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + // Attempt to parse an internal key from "internal_key". On success, // stores the parsed data in "*result", and returns true. // // On error, returns false, leaves "*result" in an undefined state. -extern bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); +extern Status ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result, bool log_err_key); // Returns the user key portion of an internal key. 
inline Slice ExtractUserKey(const Slice& internal_key) { - assert(internal_key.size() >= 8); - return Slice(internal_key.data(), internal_key.size() - 8); + assert(internal_key.size() >= kNumInternalBytes); + return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes); } inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, size_t ts_sz) { - assert(internal_key.size() >= 8 + ts_sz); - return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz); + assert(internal_key.size() >= kNumInternalBytes + ts_sz); + return Slice(internal_key.data(), + internal_key.size() - kNumInternalBytes - ts_sz); } inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { @@ -162,10 +201,15 @@ return Slice(user_key.data(), user_key.size() - ts_sz); } +inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { + assert(user_key.size() >= ts_sz); + return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz); +} + inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { - assert(internal_key.size() >= 8); + assert(internal_key.size() >= kNumInternalBytes); const size_t n = internal_key.size(); - return DecodeFixed64(internal_key.data() + n - 8); + return DecodeFixed64(internal_key.data() + n - kNumInternalBytes); } inline ValueType ExtractValueType(const Slice& internal_key) { @@ -186,10 +230,22 @@ std::string name_; public: - explicit InternalKeyComparator(const Comparator* c) - : user_comparator_(c), - name_("rocksdb.InternalKeyComparator:" + - std::string(user_comparator_.Name())) {} + // `InternalKeyComparator`s constructed with the default constructor are not + // usable and will segfault on any attempt to use them for comparisons. + InternalKeyComparator() = default; + + // @param named If true, assign a name to this comparator based on the + // underlying comparator's name. This involves an allocation and copy in + // this constructor to precompute the result of `Name()`. 
To avoid this + // overhead, set `named` to false. In that case, `Name()` will return a + // generic name that is non-specific to the underlying comparator. + explicit InternalKeyComparator(const Comparator* c, bool named = true) + : Comparator(c->timestamp_size()), user_comparator_(c) { + if (named) { + name_ = "rocksdb.InternalKeyComparator:" + + std::string(user_comparator_.Name()); + } + } virtual ~InternalKeyComparator() {} virtual const char* Name() const override; @@ -206,6 +262,12 @@ int Compare(const InternalKey& a, const InternalKey& b) const; int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; + // In this `Compare()` overload, the sequence numbers provided in + // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a` + // and `b`, respectively. To disable sequence number override(s), provide the + // value `kDisableGlobalSequenceNumber`. + int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b, + SequenceNumber b_global_seqno) const; virtual const Comparator* GetRootComparator() const override { return user_comparator_.GetRootComparator(); } @@ -238,7 +300,8 @@ bool Valid() const { ParsedInternalKey parsed; - return ParseInternalKey(Slice(rep_), &parsed); + return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */) + .ok()); // TODO } void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } @@ -271,7 +334,7 @@ AppendInternalKeyFooter(&rep_, s, t); } - std::string DebugString(bool hex = false) const; + std::string DebugString(bool hex) const; }; inline int InternalKeyComparator::Compare(const InternalKey& a, @@ -279,36 +342,47 @@ return Compare(a.Encode(), b.Encode()); } -inline bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { +inline Status ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result, bool log_err_key) { const size_t n = internal_key.size(); - if (n < 8) return false; - uint64_t num = 
DecodeFixed64(internal_key.data() + n - 8); + + if (n < kNumInternalBytes) { + return Status::Corruption("Corrupted Key: Internal Key too small. Size=" + + std::to_string(n) + ". "); + } + + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); assert(result->type <= ValueType::kMaxValue); - result->user_key = Slice(internal_key.data(), n - 8); - return IsExtendedValueType(result->type); + result->user_key = Slice(internal_key.data(), n - kNumInternalBytes); + + if (IsExtendedValueType(result->type)) { + return Status::OK(); + } else { + return Status::Corruption("Corrupted Key", + result->DebugString(log_err_key, true)); + } } // Update the sequence number in the internal key. // Guarantees not to invalidate ikey.data(). inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) { size_t ikey_sz = ikey->size(); - assert(ikey_sz >= 8); + assert(ikey_sz >= kNumInternalBytes); uint64_t newval = (seq << 8) | t; // Note: Since C++11, strings are guaranteed to be stored contiguously and // string::operator[]() is guaranteed not to change ikey.data(). - EncodeFixed64(&(*ikey)[ikey_sz - 8], newval); + EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval); } // Get the sequence number from the internal key inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { const size_t n = internal_key.size(); - assert(n >= 8); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + assert(n >= kNumInternalBytes); + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); return num >> 8; } @@ -347,8 +421,8 @@ if (IsUserKey()) { return Slice(key_, key_size_); } else { - assert(key_size_ >= 8); - return Slice(key_, key_size_ - 8); + assert(key_size_ >= kNumInternalBytes); + return Slice(key_, key_size_ - kNumInternalBytes); } } @@ -406,9 +480,9 @@ // and returns a Slice referencing the new copy. 
Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) { size_t key_n = key.size(); - assert(key_n >= 8); + assert(key_n >= kNumInternalBytes); SetInternalKey(key); - ikey->user_key = Slice(key_, key_n - 8); + ikey->user_key = Slice(key_, key_n - kNumInternalBytes); return Slice(key_, key_n); } @@ -423,35 +497,48 @@ // Update the sequence number in the internal key. Guarantees not to // invalidate slices to the key (and the user key). - void UpdateInternalKey(uint64_t seq, ValueType t) { + void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) { assert(!IsKeyPinned()); - assert(key_size_ >= 8); + assert(key_size_ >= kNumInternalBytes); + if (ts) { + assert(key_size_ >= kNumInternalBytes + ts->size()); + memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(), + ts->size()); + } uint64_t newval = (seq << 8) | t; - EncodeFixed64(&buf_[key_size_ - 8], newval); + EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval); } bool IsKeyPinned() const { return (key_ != buf_); } + // user_key does not have timestamp. void SetInternalKey(const Slice& key_prefix, const Slice& user_key, SequenceNumber s, - ValueType value_type = kValueTypeForSeek) { + ValueType value_type = kValueTypeForSeek, + const Slice* ts = nullptr) { size_t psize = key_prefix.size(); size_t usize = user_key.size(); - EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t)); + size_t ts_sz = (ts != nullptr ? 
ts->size() : 0); + EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz); if (psize > 0) { memcpy(buf_, key_prefix.data(), psize); } memcpy(buf_ + psize, user_key.data(), usize); - EncodeFixed64(buf_ + usize + psize, PackSequenceAndType(s, value_type)); + if (ts) { + memcpy(buf_ + psize + usize, ts->data(), ts_sz); + } + EncodeFixed64(buf_ + usize + psize + ts_sz, + PackSequenceAndType(s, value_type)); key_ = buf_; - key_size_ = psize + usize + sizeof(uint64_t); + key_size_ = psize + usize + sizeof(uint64_t) + ts_sz; is_user_key_ = false; } void SetInternalKey(const Slice& user_key, SequenceNumber s, - ValueType value_type = kValueTypeForSeek) { - SetInternalKey(Slice(), user_key, s, value_type); + ValueType value_type = kValueTypeForSeek, + const Slice* ts = nullptr) { + SetInternalKey(Slice(), user_key, s, value_type, ts); } void Reserve(size_t size) { @@ -528,7 +615,7 @@ void EnlargeBuffer(size_t key_size); }; -// Convert from a SliceTranform of user keys, to a SliceTransform of +// Convert from a SliceTransform of user keys, to a SliceTransform of // user keys. class InternalKeySliceTransform : public SliceTransform { public: @@ -568,7 +655,7 @@ // Read record from a write batch piece from input. // tag, column_family, key, value and blob are return values. Callers own the -// Slice they point to. +// slice they point to. // Tag is defined as ValueType. // input will be advanced to after the record. 
extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, @@ -625,8 +712,10 @@ // decreasing type (though sequence# should be enough to disambiguate) int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + const uint64_t anum = + DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes); + const uint64_t bnum = + DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes); if (anum > bnum) { r = -1; } else if (anum < bnum) { @@ -644,14 +733,42 @@ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); if (r == 0) { // Shift the number to exclude the last byte which contains the value type - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8; - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8; + const uint64_t anum = + DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8; + const uint64_t bnum = + DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8; if (anum > bnum) { r = -1; } else if (anum < bnum) { r = +1; } } + return r; +} + +inline int InternalKeyComparator::Compare(const Slice& a, + SequenceNumber a_global_seqno, + const Slice& b, + SequenceNumber b_global_seqno) const { + int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b)); + if (r == 0) { + uint64_t a_footer, b_footer; + if (a_global_seqno == kDisableGlobalSequenceNumber) { + a_footer = ExtractInternalKeyFooter(a); + } else { + a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a)); + } + if (b_global_seqno == kDisableGlobalSequenceNumber) { + b_footer = ExtractInternalKeyFooter(b); + } else { + b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b)); + } + if (a_footer > b_footer) { + r = -1; + } else if (a_footer < b_footer) { + r = +1; + } + } return r; } diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,8 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#include "logging/logging.h" + #include "test_util/testharness.h" +#include "test_util/testutil.h" namespace ROCKSDB_NAMESPACE { @@ -41,12 +42,12 @@ Slice in(encoded); ParsedInternalKey decoded("", 0, kTypeValue); - ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(key, decoded.user_key.ToString()); ASSERT_EQ(seq, decoded.sequence); ASSERT_EQ(vt, decoded.type); - ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); + ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */)); } class FormatTest : public testing::Test {}; @@ -186,7 +187,7 @@ Slice in(ikey); ParsedInternalKey decoded; - ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(user_key, decoded.user_key.ToString()); ASSERT_EQ(new_seq, decoded.sequence); ASSERT_EQ(new_val_type, decoded.type); @@ -203,5 +204,6 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/deletefile_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/deletefile_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/deletefile_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/deletefile_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -35,12 +35,12 @@ const std::string wal_dir_; DeleteFileTest() - : DBTestBase("/deletefile_test"), + : 
DBTestBase("deletefile_test", /*env_do_fsync=*/true), numlevels_(7), wal_dir_(dbname_ + "/wal_files") {} void SetOptions(Options* options) { - assert(options); + ASSERT_NE(options, nullptr); options->delete_obsolete_files_period_micros = 0; // always do full purge options->enable_thread_tracking = true; options->write_buffer_size = 1024 * 1024 * 1000; @@ -105,21 +105,27 @@ void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; for (auto file : filenames) { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } } - ASSERT_EQ(required_log, log_cnt); - ASSERT_EQ(required_sst, sst_cnt); - ASSERT_EQ(required_manifest, manifest_cnt); + if (required_log >= 0) { + ASSERT_EQ(required_log, log_cnt); + } + if (required_sst >= 0) { + ASSERT_EQ(required_sst, sst_cnt); + } + if (required_manifest >= 0) { + ASSERT_EQ(required_manifest, manifest_cnt); + } } static void DoSleep(void* arg) { @@ -180,7 +186,8 @@ ASSERT_TRUE(status.IsInvalidArgument()); // Lowest level file deletion should succeed. 
- ASSERT_OK(db_->DeleteFile(level2file)); + status = db_->DeleteFile(level2file); + ASSERT_OK(status); } TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { @@ -201,7 +208,7 @@ compact_options.change_level = true; compact_options.target_level = 2; Slice first_slice(first), last_slice(last); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 1 sst after compaction CheckFileTypeCounts(dbname_, 0, 1, 1); @@ -210,7 +217,9 @@ Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); + ASSERT_OK(itr->status()); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -237,7 +246,8 @@ ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); test::SleepingBackgroundTask sleeping_task_before; @@ -260,6 +270,41 @@ CheckFileTypeCounts(dbname_, 0, 1, 1); } +TEST_F(DeleteFileTest, PurgeDuringOpen) { + Options options = CurrentOptions(); + CheckFileTypeCounts(dbname_, -1, 0, -1); + Close(); + std::unique_ptr file; + ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file, + EnvOptions())); + ASSERT_OK(file->Close()); + CheckFileTypeCounts(dbname_, -1, 1, -1); + options.avoid_unnecessary_blocking_io = false; + options.create_if_missing = false; + Reopen(options); + CheckFileTypeCounts(dbname_, -1, 0, -1); + Close(); + + // test background purge + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + 
ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file, + EnvOptions())); + ASSERT_OK(file->Close()); + CheckFileTypeCounts(dbname_, -1, 1, -1); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + // the obsolete file is not deleted until the background purge job is ran + CheckFileTypeCounts(dbname_, -1, 1, -1); + TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1"); + ASSERT_OK(dbfull()->TEST_WaitForPurge()); + CheckFileTypeCounts(dbname_, -1, 0, -1); +} + TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { Options options = CurrentOptions(); SetOptions(&options); @@ -306,6 +351,11 @@ do_test(false); } + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + Reopen(options); + ASSERT_OK(dbfull()->TEST_WaitForPurge()); + SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->LoadDependency( @@ -313,9 +363,6 @@ "DBImpl::BGWorkPurge:start"}}); SyncPoint::GetInstance()->EnableProcessing(); - options.avoid_unnecessary_blocking_io = true; - options.create_if_missing = false; - Reopen(options); { SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); do_test(true); @@ -344,11 +391,12 @@ ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); + ASSERT_OK(itr->status()); // ReadOptions is deleted, but iterator cleanup function should not be // affected } - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -382,9 +430,11 @@ ReadOptions read_options; 
read_options.background_purge_on_iterator_cleanup = true; Iterator* itr1 = db_->NewIterator(read_options); + ASSERT_OK(itr1->status()); CreateTwoLevels(); Iterator* itr2 = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr2->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 5 sst files after 2 compactions with 2 live iterators CheckFileTypeCounts(dbname_, 0, 5, 1); @@ -417,6 +467,7 @@ CreateTwoLevels(); ReadOptions read_options; Iterator* it = db_->NewIterator(read_options); + ASSERT_OK(it->status()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -432,7 +483,7 @@ Status status = db_->DeleteFile(level2file); fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(), status.ToString().c_str()); - ASSERT_TRUE(status.ok()); + ASSERT_OK(status); it->SeekToFirst(); int numKeysIterated = 0; while(it->Valid()) { @@ -452,7 +503,7 @@ AddKeys(10, 0); VectorLogPtr logfiles; - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); ASSERT_GT(logfiles.size(), 0UL); // Take the last log file which is expected to be alive and try to delete it // Should not succeed because live logs are not allowed to be deleted @@ -461,7 +512,7 @@ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); fprintf(stdout, "Deleting alive log file %s\n", alive_log->PathName().c_str()); - ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); + ASSERT_NOK(db_->DeleteFile(alive_log->PathName())); ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); logfiles.clear(); @@ -469,10 +520,10 @@ // Call Flush again to flush out memtable and move alive log to archived log // and try to delete the archived log file FlushOptions fopts; - db_->Flush(fopts); + ASSERT_OK(db_->Flush(fopts)); AddKeys(10, 0); - db_->Flush(fopts); - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->Flush(fopts)); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); 
ASSERT_GT(logfiles.size(), 0UL); std::unique_ptr archived_log = std::move(logfiles.front()); ASSERT_EQ(archived_log->Type(), kArchivedLogFile); @@ -480,8 +531,8 @@ fprintf(stdout, "Deleting archived log file %s\n", archived_log->PathName().c_str()); ASSERT_OK(db_->DeleteFile(archived_log->PathName())); - ASSERT_EQ(Status::NotFound(), - env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); + ASSERT_TRUE( + env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound()); } TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { @@ -520,6 +571,7 @@ { std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); + ASSERT_OK(itr->status()); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); @@ -544,14 +596,6 @@ } // namespace ROCKSDB_NAMESPACE -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); -} -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS - int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,9 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). 
// #include "db/error_handler.h" + #include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" namespace ROCKSDB_NAMESPACE { @@ -32,6 +34,14 @@ Status::Code::kIOError, Status::SubCode::kSpaceLimit, true), Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kNoSpace, true), @@ -42,6 +52,12 @@ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kSpaceLimit, true), Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kIOFenced, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kIOFenced, false), + Status::Severity::kFatalError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, Status::SubCode::kNoSpace, @@ -51,9 +67,74 @@ Status::Code::kIOError, Status::SubCode::kNoSpace, false), Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during MANIFEST write + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + 
Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during BG flush with WAL disabled + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kSpaceLimit, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during MANIFEST write when WAL is disabled + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + }; -std::map, Status::Severity> +std::map, + Status::Severity> DefaultErrorSeverityMap = { // 
Errors during BG compaction {std::make_tuple(BackgroundErrorReason::kCompaction, @@ -75,11 +156,11 @@ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kCorruption, false), Status::Severity::kNoError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, true), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + true), Status::Severity::kFatalError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, false), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + false), Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, @@ -94,30 +175,55 @@ {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, false), Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, + // Errors during BG flush with WAL disabled + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kCorruption, true), + Status::Severity::kUnrecoverableError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kCorruption, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, }; std::map, Status::Severity> DefaultReasonMap = { // Errors during BG compaction 
{std::make_tuple(BackgroundErrorReason::kCompaction, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kCompaction, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kFlush, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kWriteCallback, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, // Errors during Memtable update {std::make_tuple(BackgroundErrorReason::kMemTable, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kMemTable, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, }; void ErrorHandler::CancelErrorRecovery() { @@ -138,6 +244,10 @@ recovery_in_prog_ = false; } } + + // If auto recovery is also runing to resume from the retryable error, + // we should wait and end the auto recovery. + EndAutoRecovery(); #endif } @@ -159,16 +269,23 @@ // This can also get called as part of a recovery operation. 
In that case, we // also track the error separately in recovery_error_ so we can tell in the // end whether recovery succeeded or not -Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { +const Status& ErrorHandler::SetBGError(const Status& bg_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); - if (bg_err.ok()) { - return Status::OK(); + return bg_err; } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set regular background error\n"); + bool paranoid = db_options_.paranoid_checks; Status::Severity sev = Status::Severity::kFatalError; Status new_bg_err; + DBRecoverContext context; bool found = false; { @@ -210,7 +327,8 @@ } // Allow some error specific overrides - if (new_bg_err == Status::NoSpace()) { + if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || + new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery); } @@ -227,18 +345,146 @@ } } + recover_context_ = context; if (auto_recovery) { recovery_in_prog_ = true; // Kick-off error specific recovery - if (bg_error_ == Status::NoSpace()) { + if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || + new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { RecoverFromNoSpace(); } } return bg_error_; } -Status ErrorHandler::OverrideNoSpaceError(Status bg_error, +// This is the main function for looking at IO related error during the +// background operations. The main logic is: +// 1) File scope IO error is treated as retryable IO error in the write +// path. In RocksDB, If a file has write IO error and it is at file scope, +// RocksDB never write to the same file again. RocksDB will create a new +// file and rewrite the whole content. Thus, it is retryable. +// 1) if the error is caused by data loss, the error is mapped to +// unrecoverable error. 
Application/user must take action to handle +// this situation (File scope case is excluded). +// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error, +// or its retryable flag is set and not a data loss error), auto resume +// will be called and the auto resume can be controlled by resume count +// and resume interval options. There are three sub-cases: +// a) if the error happens during compaction, it is mapped to a soft error. +// the compaction thread will reschedule a new compaction. +// b) if the error happens during flush and also WAL is empty, it is mapped +// to a soft error. Note that, it includes the case that IO error happens +// in SST or manifest write during flush. +// c) all other errors are mapped to hard error. +// 3) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason +// reason) will be called to handle other error cases. +const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason) { + db_mutex_->AssertHeld(); + if (bg_io_err.ok()) { + return bg_io_err; + } + ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", + bg_io_err.ToString().c_str()); + + if (recovery_in_prog_ && recovery_io_error_.ok()) { + recovery_io_error_ = bg_io_err; + } + if (BackgroundErrorReason::kManifestWrite == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { + // Always returns ok + ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions"); + db_->DisableFileDeletionsWithLock().PermitUncheckedError(); + } + + Status new_bg_io_err = bg_io_err; + DBRecoverContext context; + if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && + bg_io_err.GetDataLoss()) { + // First, data loss (non file scope) is treated as unrecoverable error. So + // it can directly overwrite any existing bg_error_. 
+ bool auto_recovery = false; + Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); + CheckAndSetRecoveryAndBGError(bg_err); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Set background IO error as unrecoverable error\n"); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &bg_err, db_mutex_, &auto_recovery); + recover_context_ = context; + return bg_error_; + } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace && + (bg_io_err.GetScope() == + IOStatus::IOErrorScope::kIOErrorScopeFile || + bg_io_err.GetRetryable())) { + // Second, check if the error is a retryable IO error (file scope IO error + // is also treated as retryable IO error in RocksDB write path). if it is + // retryable error and its severity is higher than bg_error_, overwrite the + // bg_error_ with new error. In current stage, for retryable IO error of + // compaction, treat it as soft error. In other cases, treat the retryable + // IO error as hard error. Note that, all the NoSpace error should be + // handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter + // it is retryable or file scope, this logic will be bypassed. + bool auto_recovery = false; + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &new_bg_io_err, db_mutex_, + &auto_recovery); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set background retryable IO error\n"); + if (BackgroundErrorReason::kCompaction == reason) { + // We map the retryable IO error during compaction to soft error. 
Since + // compaction can reschedule by itself. We will not set the BG error in + // this case + // TODO: a better way to set or clean the retryable IO error which + // happens during compaction SST file write. + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Compaction will schedule by itself to resume\n"); + return bg_error_; + } else if (BackgroundErrorReason::kFlushNoWAL == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { + // When the BG Retryable IO error reason is flush without WAL, + // We map it to a soft error. At the same time, all the background work + // should be stopped except the BG work from recovery. Therefore, we + // set the soft_error_no_bg_work_ to true. At the same time, since DB + // continues to receive writes when BG error is soft error, to avoid + // to many small memtable being generated during auto resume, the flush + // reason is set to kErrorRecoveryRetryFlush. 
+ Status bg_err(new_bg_io_err, Status::Severity::kSoftError); + CheckAndSetRecoveryAndBGError(bg_err); + soft_error_no_bg_work_ = true; + context.flush_reason = FlushReason::kErrorRecoveryRetryFlush; + recover_context_ = context; + return StartRecoverFromRetryableBGIOError(bg_io_err); + } else { + Status bg_err(new_bg_io_err, Status::Severity::kHardError); + CheckAndSetRecoveryAndBGError(bg_err); + recover_context_ = context; + return StartRecoverFromRetryableBGIOError(bg_io_err); + } + } else { + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + return SetBGError(new_bg_io_err, reason); + } +} + +Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery) { #ifndef ROCKSDB_LITE if (bg_error.severity() >= Status::Severity::kFatalError) { @@ -294,10 +540,17 @@ // Signal that recovery succeeded if (recovery_error_.ok()) { Status old_bg_error = bg_error_; + // old_bg_error is only for notifying listeners, so may not be checked + old_bg_error.PermitUncheckedError(); + // Clear and check the recovery IO and BG error bg_error_ = Status::OK(); + recovery_io_error_ = IOStatus::OK(); + bg_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); recovery_in_prog_ = false; - EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, - old_bg_error, db_mutex_); + soft_error_no_bg_work_ = false; + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error, + bg_error_, db_mutex_); } return recovery_error_; #else @@ -308,6 +561,7 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { #ifndef ROCKSDB_LITE InstrumentedMutexLock l(db_mutex_); + bool no_bg_work_original_flag = soft_error_no_bg_work_; if (is_manual) { // If its a manual recovery and there's a background recovery in progress // return busy status @@ -315,9 +569,24 @@ return Status::Busy(); } recovery_in_prog_ = true; + + // In manual resume, we allow the bg work to run. 
If it is a auto resume, + // the bg work should follow this tag. + soft_error_no_bg_work_ = false; + + // In manual resume, if the bg error is a soft error and also requires + // no bg work, the error must be recovered by call the flush with + // flush reason: kErrorRecoveryRetryFlush. In other case, the flush + // reason is set to kErrorRecovery. + if (no_bg_work_original_flag) { + recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush; + } else { + recover_context_.flush_reason = FlushReason::kErrorRecovery; + } } - if (bg_error_.severity() == Status::Severity::kSoftError) { + if (bg_error_.severity() == Status::Severity::kSoftError && + recover_context_.flush_reason == FlushReason::kErrorRecovery) { // Simply clear the background error and return recovery_error_ = Status::OK(); return ClearBGError(); @@ -327,7 +596,14 @@ // during the recovery process. While recovering, the only operations that // can generate background errors should be the flush operations recovery_error_ = Status::OK(); - Status s = db_->ResumeImpl(); + recovery_error_.PermitUncheckedError(); + Status s = db_->ResumeImpl(recover_context_); + if (s.ok()) { + soft_error_no_bg_work_ = false; + } else { + soft_error_no_bg_work_ = no_bg_work_original_flag; + } + // For manual recover, shutdown, and fatal error cases, set // recovery_in_prog_ to false. For automatic background recovery, leave it // as is regardless of success or failure as it will be retried @@ -341,4 +617,186 @@ return bg_error_; #endif } + +const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( + const IOStatus& io_error) { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + if (bg_error_.ok()) { + return bg_error_; + } else if (io_error.ok()) { + return io_error; + } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { + // Auto resume BG error is not enabled, directly return bg_error_. 
+ return bg_error_; + } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); + if (recovery_thread_) { + // In this case, if recovery_in_prog_ is false, current thread should + // wait the previous recover thread to finish and create a new thread + // to recover from the bg error. + db_mutex_->Unlock(); + recovery_thread_->join(); + db_mutex_->Lock(); + } + + recovery_in_prog_ = true; + recovery_thread_.reset( + new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); + + if (recovery_io_error_.ok() && recovery_error_.ok()) { + return recovery_error_; + } else { + return bg_error_; + } +#else + (void)io_error; + return bg_error_; +#endif +} + +// Automatic recover from Retryable BG IO error. Must be called after db +// mutex is released. +void ErrorHandler::RecoverFromRetryableBGIOError() { +#ifndef ROCKSDB_LITE + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart"); + InstrumentedMutexLock l(db_mutex_); + if (end_recovery_) { + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, + Status::ShutdownInProgress(), + db_mutex_); + return; + } + DBRecoverContext context = recover_context_; + int resume_count = db_options_.max_bgerror_resume_count; + uint64_t wait_interval = db_options_.bgerror_resume_retry_interval; + uint64_t retry_count = 0; + // Recover from the retryable error. Create a separate thread to do it. 
+ while (resume_count > 0) { + if (end_recovery_) { + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, + Status::ShutdownInProgress(), + db_mutex_); + return; + } + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0"); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1"); + recovery_io_error_ = IOStatus::OK(); + recovery_error_ = Status::OK(); + retry_count++; + Status s = db_->ResumeImpl(context); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT); + } + if (s.IsShutdownInProgress() || + bg_error_.severity() >= Status::Severity::kFatalError) { + // If DB shutdown in progress or the error severity is higher than + // Hard Error, stop auto resume and returns. + recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, + bg_error_, db_mutex_); + return; + } + if (!recovery_io_error_.ok() && + recovery_error_.severity() <= Status::Severity::kHardError && + recovery_io_error_.GetRetryable()) { + // If new BG IO error happens during auto recovery and it is retryable + // and its severity is Hard Error or lower, the auto resmue sleep for + // a period of time and redo auto resume if it is allowed. + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0"); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1"); + int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; + cv_.TimedWait(wait_until); + } else { + // There are three possibility: 1) recover_io_error is set during resume + // and the error is not retryable, 2) recover is successful, 3) other + // error happens during resume and cannot be resumed here. + if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) { + // recover from the retryable IO error and no other BG errors. 
Clean + // the bg_error and notify user. + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); + Status old_bg_error = bg_error_; + bg_error_ = Status::OK(); + bg_error_.PermitUncheckedError(); + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, old_bg_error, bg_error_, db_mutex_); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + recovery_in_prog_ = false; + if (soft_error_no_bg_work_) { + soft_error_no_bg_work_ = false; + } + return; + } else { + // In this case: 1) recovery_io_error is more serious or not retryable + // 2) other Non IO recovery_error happens. The auto recovery stops. + recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, bg_error_, + !recovery_io_error_.ok() + ? recovery_io_error_ + : (!recovery_error_.ok() ? 
recovery_error_ : s), + db_mutex_); + return; + } + } + resume_count--; + } + recovery_in_prog_ = false; + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, bg_error_, + Status::Aborted("Exceeded resume retry count"), db_mutex_); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + return; +#else + return; +#endif +} + +void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { + if (recovery_in_prog_ && recovery_error_.ok()) { + recovery_error_ = bg_err; + } + if (bg_err.severity() > bg_error_.severity()) { + bg_error_ = bg_err; + } + return; +} + +void ErrorHandler::EndAutoRecovery() { + db_mutex_->AssertHeld(); + if (!end_recovery_) { + end_recovery_ = true; + } + cv_.SignalAll(); + db_mutex_->Unlock(); + if (recovery_thread_) { + recovery_thread_->join(); + } + db_mutex_->Lock(); + return; +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" +#include "rocksdb/io_status.h" #include "rocksdb/listener.h" #include "rocksdb/status.h" @@ -13,18 +14,36 @@ class DBImpl; +// This structure is used to store the DB recovery context. The context is +// the information that related to the recover actions. For example, it contains +// FlushReason, which tells the flush job why this flush is called. 
+struct DBRecoverContext { + FlushReason flush_reason; + + DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {} + + DBRecoverContext(FlushReason reason) : flush_reason(reason) {} +}; + class ErrorHandler { public: ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options, InstrumentedMutex* db_mutex) : db_(db), db_options_(db_options), - bg_error_(Status::OK()), - recovery_error_(Status::OK()), + cv_(db_mutex), + end_recovery_(false), + recovery_thread_(nullptr), db_mutex_(db_mutex), auto_recovery_(false), - recovery_in_prog_(false) {} - ~ErrorHandler() {} + recovery_in_prog_(false), + soft_error_no_bg_work_(false), + bg_error_stats_(db_options.statistics) { + // Clear the checked flag for uninitialized errors + bg_error_.PermitUncheckedError(); + recovery_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); + } void EnableAutoRecovery() { auto_recovery_ = true; } @@ -32,11 +51,14 @@ Status::Code code, Status::SubCode subcode); - Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason); + + const Status& SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason); - Status GetBGError() { return bg_error_; } + Status GetBGError() const { return bg_error_; } - Status GetRecoveryError() { return recovery_error_; } + Status GetRecoveryError() const { return recovery_error_; } Status ClearBGError(); @@ -48,14 +70,18 @@ bool IsBGWorkStopped() { return !bg_error_.ok() && (bg_error_.severity() >= Status::Severity::kHardError || - !auto_recovery_); + !auto_recovery_ || soft_error_no_bg_work_); } + bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; } + bool IsRecoveryInProgress() { return recovery_in_prog_; } Status RecoverFromBGError(bool is_manual = false); void CancelErrorRecovery(); + void EndAutoRecovery(); + private: DBImpl* db_; const ImmutableDBOptions& db_options_; @@ -63,13 +89,37 @@ // A separate Status variable 
used to record any errors during the // recovery process from hard errors Status recovery_error_; + // A separate IO Status variable used to record any IO errors during + // the recovery process. At the same time, recovery_error_ is also set. + IOStatus recovery_io_error_; + // The condition variable used with db_mutex during auto resume for time + // wait. + InstrumentedCondVar cv_; + bool end_recovery_; + std::unique_ptr recovery_thread_; + InstrumentedMutex* db_mutex_; // A flag indicating whether automatic recovery from errors is enabled bool auto_recovery_; bool recovery_in_prog_; + // A flag to indicate that for the soft error, we should not allow any + // background work except the work is from recovery. + bool soft_error_no_bg_work_; + + // Used to store the context for recover, such as flush reason. + DBRecoverContext recover_context_; + + // The pointer of DB statistics. + std::shared_ptr bg_error_stats_; - Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); void RecoverFromNoSpace(); + const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); + void RecoverFromRetryableBGIOError(); + // First, if it is in recovery and the recovery_error is ok. Set the + // recovery_error_ to bg_err. Second, if the severity is higher than the + // current bg_error_, overwrite it. + void CheckAndSetRecoveryAndBGError(const Status& bg_err); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,2663 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" +#include "port/stack_trace.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { + +class DBErrorHandlingFSTest : public DBTestBase { + public: + DBErrorHandlingFSTest() + : DBTestBase("db_error_handling_fs_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::string GetManifestNameFromLiveFiles() { + std::vector live_files; + uint64_t manifest_size; + + Status s = dbfull()->GetLiveFiles(live_files, &manifest_size, false); + if (!s.ok()) { + return ""; + } + for (auto& file : live_files) { + uint64_t num = 0; + FileType type; + if (ParseFileName(file, &num, &type) && type == kDescriptorFile) { + return file; + } + } + return ""; + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +class ErrorHandlerFSListener : public EventListener { + public: + ErrorHandlerFSListener() + : mutex_(), + cv_(&mutex_), + no_auto_recovery_(false), + recovery_complete_(false), + file_creation_started_(false), + override_bg_error_(false), + file_count_(0), + fault_fs_(nullptr) {} + ~ErrorHandlerFSListener() { + file_creation_error_.PermitUncheckedError(); + bg_error_.PermitUncheckedError(); + 
new_bg_error_.PermitUncheckedError(); + } + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*ti*/) override { + InstrumentedMutexLock l(&mutex_); + file_creation_started_ = true; + if (file_count_ > 0) { + if (--file_count_ == 0) { + fault_fs_->SetFilesystemActive(false, file_creation_error_); + file_creation_error_ = IOStatus::OK(); + } + } + cv_.SignalAll(); + } + + void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, Status bg_error, + bool* auto_recovery) override { + bg_error.PermitUncheckedError(); + if (*auto_recovery && no_auto_recovery_) { + *auto_recovery = false; + } + } + + void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override { + InstrumentedMutexLock l(&mutex_); + recovery_complete_ = true; + cv_.SignalAll(); + new_bg_error_ = info.new_bg_error; + } + + bool WaitForRecovery(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!recovery_complete_) { + cv_.Wait(/*abs_time_us*/); + } + if (recovery_complete_) { + recovery_complete_ = false; + return true; + } + return false; + } + + void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!file_creation_started_) { + cv_.Wait(/*abs_time_us*/); + } + file_creation_started_ = false; + } + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (override_bg_error_) { + *bg_error = bg_error_; + override_bg_error_ = false; + } + } + + void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } + + void OverrideBGError(Status bg_err) { + bg_error_ = bg_err; + override_bg_error_ = true; + } + + void InjectFileCreationError(FaultInjectionTestFS* fs, int file_count, + IOStatus io_s) { + fault_fs_ = fs; + file_count_ = file_count; + file_creation_error_ = io_s; + } + + Status new_bg_error() { return new_bg_error_; } + + private: + InstrumentedMutex mutex_; + InstrumentedCondVar cv_; + bool no_auto_recovery_; + bool recovery_complete_; + 
bool file_creation_started_; + bool override_bg_error_; + int file_count_; + IOStatus file_creation_error_; + Status bg_error_; + Status new_bg_error_; + FaultInjectionTestFS* fault_fs_; +}; + +TEST_F(DBErrorHandlingFSTest, FLushWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + Destroy(options); +} + +// All the NoSpace IOError will be handled as the regular BG Error no matter the +// retryable flag is set of not. So the auto resume for retryable IO Error will +// not be triggered. Also, it is mapped as hard error. 
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + 
options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, 
error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + 
+ ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + // not file scope, but retyrable set + error_msg.SetDataLoss(false); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + 
wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.atomic_flush = true; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +// The flush error is injected before we finish the 
table build +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Destroy(options); +} + +// The retryable 
IO error is injected before we sync table +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +// The retryable IO error is injected before we close the table file +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + + 
ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + 
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + 
ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + 
SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + fault_fs_->SetFilesystemActive(true); + + // This Resume() will attempt to create a new manifest file and fail again + s = dbfull()->Resume(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // A successful Resume() will create a new manifest file + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock 
environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + s = Flush(); + ASSERT_OK(s); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteError:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, + // Wait for DB instance to clear bg_error before calling + // TEST_WaitForCompact + {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}}); + // trigger manifest write failure in compaction thread + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + // This Flush will trigger a compaction, which will fail when appending to + // the manifest + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteError:0"); + // Clear all errors so when the compaction is retried, it will 
succeed + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("CompactionManifestWriteError:1"); + TEST_SYNC_POINT("CompactionManifestWriteError:2"); + + s = dbfull()->TEST_WaitForCompact(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteError:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}}); + // trigger manifest write failure in compaction thread + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteError:0"); + TEST_SYNC_POINT("CompactionManifestWriteError:1"); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError( + Status(Status::NoSpace(), Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs_->SetFilesystemActive(false, + 
IOStatus::NoSpace("Out of space")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + 
Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, CorruptionError) { + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + Status s; + 
DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::Corruption("Corruption")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_NOK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + s = Put(Key(1), "val"); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, 
options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + // We should be able to shutdown the database while auto recovery is going + // on in the background + Close(); + DestroyDB(dbname_, options).PermitUncheckedError(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for 
(auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Reopen(options); + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 0; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + 
ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOError()); + } + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + ASSERT_OK(dbfull()->Resume()); + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + Random rnd(301); 
+ + listener->EnableAutoRecovery(); + CreateAndReopenWithCF({"one", "two", "three"}, options); + + { + WriteBatch batch; + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 100; ++j) { + ASSERT_OK(batch.Put(handles_[i], Key(j), rnd.RandomString(1024))); + } + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + // Write to one CF + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(handles_[2], Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsNoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + for (auto i = 1; i < 4; ++i) { + // Every CF should have been flushed + ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); + } + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); + std::vector> fault_envs; + std::vector fault_fs; + std::vector options; + 
std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerFSListener()); + options.emplace_back(GetDefaultOptions()); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); + std::shared_ptr fs(fault_fs.back()); + fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); + options[i].env = fault_envs.back().get(); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_fs[i], 3, + IOStatus::NoSpace("Out of space")); + snprintf(buf, sizeof(buf), "_%d", i); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + 
ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + fault_fs[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + SstFileManagerImpl* sfmImpl = + static_cast_with_check(sfm.get()); + sfmImpl->Close(); + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + delete db[i]; + fault_fs[i]->SetFilesystemActive(true); + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + } + } + options.clear(); + sfm.reset(); + delete def_env; +} + +TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); + std::vector> fault_envs; + std::vector fault_fs; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerFSListener()); + options.emplace_back(GetDefaultOptions()); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); + std::shared_ptr fs(fault_fs.back()); + fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); + options[i].env = fault_envs.back().get(); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + 
options[i].writable_file_max_buffer_size = 32768; + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + switch (i) { + case 0: + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_fs[i], 3, + IOStatus::NoSpace("Out of space")); + break; + case 1: + // Setup for returning error after the 1st SST, which would result + // in a hard error + listener[i]->InjectFileCreationError(fault_fs[i], 2, + IOStatus::NoSpace("Out of space")); + break; + default: + break; + } + snprintf(buf, sizeof(buf), "_%d", i); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + if (i != 1) { + ASSERT_OK(db[i]->Flush(FlushOptions())); + } else { + ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace()); + } + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + switch (i) { + case 0: + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + break; + case 1: + ASSERT_EQ(s.severity(), Status::Severity::kHardError); + break; + case 2: + ASSERT_OK(s); + break; + } + fault_fs[i]->SetFilesystemActive(true); + } + + 
def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + if (i < 2) { + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + } + if (i == 1) { + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); + } + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + SstFileManagerImpl* sfmImpl = + static_cast_with_check(sfm.get()); + sfmImpl->Close(); + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + fault_fs[i]->SetFilesystemActive(true); + delete db[i]; + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + } + } + options.clear(); + delete def_env; +} + +// When Put the KV-pair, the write option is set to disable WAL. +// If retryable error happens in this condition, map the bg error +// to soft error and trigger auto resume. During auto resume, SwitchMemtable +// is disabled to avoid small SST tables. Write can still be applied before +// the bg error is cleaned unless the memtable is full. 
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { + // Activate the FS before the first resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(1), "val1", wo)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:LoopOut", + "FLushWritNoWALRetryableeErrorAutoRecover1:1"}}); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1"); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val1", Get(Key(1))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, 
options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); + ASSERT_OK(Put(Key(2), "val2", wo)); + s = Flush(); + // Since auto resume fails, the bg error is not cleand, flush will + // return the bg_error set before. + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + + // call auto resume + ASSERT_OK(dbfull()->Resume()); + ASSERT_OK(Put(Key(3), "val3", wo)); + // After resume is successful, the flush should be ok. + ASSERT_OK(Flush()); + ASSERT_EQ("val3", Get(Key(3))); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) { + // Activate the FS before the first resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + 
ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); + ASSERT_OK(Put(Key(2), "val2", wo)); + s = Flush(); + // Since auto resume is successful, the bg error is cleaned, flush will + // be successful. + ASSERT_OK(s); + ASSERT_EQ("val2", Get(Key(2))); + Destroy(options); +} + +// Auto resume fromt the flush retryable IO error. Activate the FS before the +// first resume. 
Resume is successful +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) { + // Activate the FS before the first resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + ASSERT_EQ("val1", Get(Key(1))); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(2), "val2")); + ASSERT_OK(Flush()); + ASSERT_EQ("val2", Get(Key(2))); + + Destroy(options); +} + +// Auto resume fromt the flush retryable IO error and set the retry limit count. 
+// Never activate the FS and auto resume should fail at the end +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { + // Fail all the resume and let user to resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"FLushWritRetryableeErrorAutoRecover2:0", + "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:LoopOut", + "FLushWritRetryableeErrorAutoRecover2:1"}}); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0"); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_EQ("val1", Get(Key(1))); + // Auto resume fails due to FS does not recover during resume. User call + // resume manually here. + s = dbfull()->Resume(); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(s); + ASSERT_OK(Put(Key(2), "val2")); + ASSERT_OK(Flush()); + ASSERT_EQ("val2", Get(Key(2))); + + Destroy(options); +} + +// Auto resume fromt the flush retryable IO error and set the retry limit count. 
+// Fail the first resume and let the second resume be successful. +TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeStart", + "ManifestWriteRetryableErrorAutoRecover:0"}, + {"ManifestWriteRetryableErrorAutoRecover:1", + "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteRetryableErrorAutoRecover:2"}}); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2"); + SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + 
Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeStart", + "ManifestWriteNoWALRetryableErrorAutoRecover:0"}, + {"ManifestWriteNoWALRetryableErrorAutoRecover:1", + "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteNoWALRetryableErrorAutoRecover:2"}}); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2"); + 
SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, + CompactionManifestWriteRetryableErrorAutoRecover) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + ASSERT_OK(Flush()); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteErrorAR:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteErrorAR:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, + {"CompactionManifestWriteErrorAR:2", + "RecoverFromRetryableBGIOError:BeforeStart"}, + // Fail the first resume, before the wait in resume + {"RecoverFromRetryableBGIOError:BeforeResume0", + "CompactionManifestWriteErrorAR:3"}, + // Activate the FS before the second resume + {"CompactionManifestWriteErrorAR:4", + 
"RecoverFromRetryableBGIOError:BeforeResume1"}, + // Wait the auto resume be sucessful + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "CompactionManifestWriteErrorAR:5"}}); + // trigger manifest write failure in compaction thread + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1"); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5"); + SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { + // In this test, in the first round of compaction, the FS is set to error. + // So the first compaction fails due to retryable IO error and it is mapped + // to soft error. Then, compaction is rescheduled, in the second round of + // compaction, the FS is set to active and compaction is successful, so + // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync + // point. 
+ std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + std::atomic fail_first(false); + std::atomic fail_second(true); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + {"CompactionJob::FinishCompactionOutputFile1", + "CompactionWriteRetryableErrorAutoRecover0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void*) { fault_fs_->SetFilesystemActive(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", [&](void*) { + if (fail_first.load() && fail_second.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + fail_second.store(false); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_OK(s); + TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0"); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options 
options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"}, + {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}}); + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(true, s.IsIOError()); + TEST_SYNC_POINT("WALWriteErrorDone"); + + TEST_SYNC_POINT("WALWriteError1:0"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("WALWriteError1:1"); + 
TEST_SYNC_POINT("WALWriteError1:2"); + } + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { + // Fail the first recover and try second time. + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. 
+ { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeWait0", "WALWriteError2:0"}, + {"WALWriteError2:1", "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError2:2"}}); + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(true, s.IsIOError()); + + TEST_SYNC_POINT("WALWriteError2:0"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("WALWriteError2:1"); + TEST_SYNC_POINT("WALWriteError2:2"); + } + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +// Fail auto resume from a flush retryable error and verify that +// OnErrorRecoveryEnd listener callback is called +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) { + // Activate the FS before the first resume + std::shared_ptr listener( + new 
ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + ASSERT_EQ(listener->new_bg_error(), Status::Aborted()); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + + Destroy(options); +} + +class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest, + public testing::WithParamInterface {}; + +TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + 
ASSERT_TRUE(s.IsIOFenced()); + Destroy(options); +} + +TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_TRUE(s.IsIOFenced()); + Close(); +} + +TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->EnableAutoRecovery(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", 
[&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_TRUE(s.IsIOFenced()); + Destroy(options); +} + +TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::IOFenced("IO fenced")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOFenced()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + s = 
dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOFenced()); + } + Close(); +} + +INSTANTIATE_TEST_CASE_P(DBErrorHandlingFSTest, DBErrorHandlingFencingTest, + ::testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_test.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,871 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE - -#include "db/db_test_util.h" -#include "port/stack_trace.h" -#include "rocksdb/perf_context.h" -#include "rocksdb/sst_file_manager.h" -#include "test_util/fault_injection_test_env.h" -#if !defined(ROCKSDB_LITE) -#include "test_util/sync_point.h" -#endif - -namespace ROCKSDB_NAMESPACE { - -class DBErrorHandlingTest : public DBTestBase { - public: - DBErrorHandlingTest() : DBTestBase("/db_error_handling_test") {} - - std::string GetManifestNameFromLiveFiles() { - std::vector live_files; - uint64_t manifest_size; - - dbfull()->GetLiveFiles(live_files, &manifest_size, false); - for (auto& file : live_files) { - uint64_t num = 0; - FileType type; - if (ParseFileName(file, &num, &type) && type == kDescriptorFile) { - return file; - } - } - return ""; - } -}; - -class DBErrorHandlingEnv : public EnvWrapper { - public: - DBErrorHandlingEnv() : EnvWrapper(Env::Default()), - trig_no_space(false), trig_io_error(false) {} - - void SetTrigNoSpace() {trig_no_space = true;} - void SetTrigIoError() {trig_io_error = true;} - private: - bool trig_no_space; - bool trig_io_error; -}; - -class ErrorHandlerListener : public EventListener { - public: - ErrorHandlerListener() - : mutex_(), - cv_(&mutex_), - no_auto_recovery_(false), - recovery_complete_(false), - file_creation_started_(false), - override_bg_error_(false), - file_count_(0), - fault_env_(nullptr) {} - - void OnTableFileCreationStarted( - const TableFileCreationBriefInfo& /*ti*/) override { - InstrumentedMutexLock l(&mutex_); - file_creation_started_ = true; - if (file_count_ > 0) { - if (--file_count_ == 0) { - fault_env_->SetFilesystemActive(false, file_creation_error_); - file_creation_error_ = Status::OK(); - } - } - cv_.SignalAll(); - } - - void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, - Status /*bg_error*/, - bool* auto_recovery) override { - if (*auto_recovery && no_auto_recovery_) { - *auto_recovery = false; - } - } - - void OnErrorRecoveryCompleted(Status /*old_bg_error*/) 
override { - InstrumentedMutexLock l(&mutex_); - recovery_complete_ = true; - cv_.SignalAll(); - } - - bool WaitForRecovery(uint64_t /*abs_time_us*/) { - InstrumentedMutexLock l(&mutex_); - while (!recovery_complete_) { - cv_.Wait(/*abs_time_us*/); - } - if (recovery_complete_) { - recovery_complete_ = false; - return true; - } - return false; - } - - void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { - InstrumentedMutexLock l(&mutex_); - while (!file_creation_started_) { - cv_.Wait(/*abs_time_us*/); - } - file_creation_started_ = false; - } - - void OnBackgroundError(BackgroundErrorReason /*reason*/, - Status* bg_error) override { - if (override_bg_error_) { - *bg_error = bg_error_; - override_bg_error_ = false; - } - } - - void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } - - void OverrideBGError(Status bg_err) { - bg_error_ = bg_err; - override_bg_error_ = true; - } - - void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count, - Status s) { - fault_env_ = env; - file_count_ = file_count; - file_creation_error_ = s; - } - - private: - InstrumentedMutex mutex_; - InstrumentedCondVar cv_; - bool no_auto_recovery_; - bool recovery_complete_; - bool file_creation_started_; - bool override_bg_error_; - int file_count_; - Status file_creation_error_; - Status bg_error_; - FaultInjectionTestEnv* fault_env_; -}; - -TEST_F(DBErrorHandlingTest, FLushWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - Put(Key(0), "val"); - SyncPoint::GetInstance()->SetCallBack( - "FlushJob::Start", [&](void *) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - 
SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, ManifestWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - std::string old_manifest; - std::string new_manifest; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - old_manifest = GetManifestNameFromLiveFiles(); - - Put(Key(0), "val"); - Flush(); - Put(Key(1), "val"); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::LogAndApply:WriteManifest", [&](void *) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - - new_manifest = GetManifestNameFromLiveFiles(); - ASSERT_NE(new_manifest, old_manifest); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - Close(); -} - -TEST_F(DBErrorHandlingTest, DoubleManifestWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status 
s; - std::string old_manifest; - std::string new_manifest; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - old_manifest = GetManifestNameFromLiveFiles(); - - Put(Key(0), "val"); - Flush(); - Put(Key(1), "val"); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::LogAndApply:WriteManifest", [&](void *) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_env->SetFilesystemActive(true); - - // This Resume() will attempt to create a new manifest file and fail again - s = dbfull()->Resume(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_env->SetFilesystemActive(true); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - - // A successful Resume() will create a new manifest file - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - - new_manifest = GetManifestNameFromLiveFiles(); - ASSERT_NE(new_manifest, old_manifest); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - Close(); -} - -TEST_F(DBErrorHandlingTest, CompactionManifestWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.listeners.emplace_back(listener); - options.env = fault_env.get(); - Status s; - std::string old_manifest; - std::string new_manifest; - std::atomic fail_manifest(false); - DestroyAndReopen(options); - old_manifest = GetManifestNameFromLiveFiles(); - - Put(Key(0), "val"); - Put(Key(2), "val"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - // Wait for flush of 2nd L0 file before 
starting compaction - {{"DBImpl::FlushMemTable:FlushMemTableFinished", - "BackgroundCallCompaction:0"}, - // Wait for compaction to detect manifest write error - {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, - // Make compaction thread wait for error to be cleared - {"CompactionManifestWriteError:1", - "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, - // Wait for DB instance to clear bg_error before calling - // TEST_WaitForCompact - {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}}); - // trigger manifest write failure in compaction thread - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "VersionSet::LogAndApply:WriteManifest", [&](void*) { - if (fail_manifest.load()) { - fault_env->SetFilesystemActive(false, - Status::NoSpace("Out of space")); - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Put(Key(1), "val"); - // This Flush will trigger a compaction, which will fail when appending to - // the manifest - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - TEST_SYNC_POINT("CompactionManifestWriteError:0"); - // Clear all errors so when the compaction is retried, it will succeed - fault_env->SetFilesystemActive(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("CompactionManifestWriteError:1"); - TEST_SYNC_POINT("CompactionManifestWriteError:2"); - - s = dbfull()->TEST_WaitForCompact(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ASSERT_EQ(s, Status::OK()); - - new_manifest = GetManifestNameFromLiveFiles(); - ASSERT_NE(new_manifest, old_manifest); - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - ASSERT_EQ("val", Get(Key(2))); - Close(); -} - -TEST_F(DBErrorHandlingTest, CompactionWriteError) { - std::unique_ptr fault_env( - new 
FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.listeners.emplace_back(listener); - options.env = fault_env.get(); - Status s; - DestroyAndReopen(options); - - Put(Key(0), "va;"); - Put(Key(2), "va;"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - listener->OverrideBGError( - Status(Status::NoSpace(), Status::Severity::kHardError) - ); - listener->EnableAutoRecovery(false); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::FlushMemTable:FlushMemTableFinished", - "BackgroundCallCompaction:0"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void*) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Put(Key(1), "val"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, CorruptionError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.env = fault_env.get(); - Status s; - DestroyAndReopen(options); - - Put(Key(0), "va;"); - Put(Key(2), "va;"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::FlushMemTable:FlushMemTableFinished", - "BackgroundCallCompaction:0"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void*) { - fault_env->SetFilesystemActive(false, 
Status::Corruption("Corruption")); - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Put(Key(1), "val"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), - ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); - - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_NE(s, Status::OK()); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - - listener->EnableAutoRecovery(); - DestroyAndReopen(options); - - Put(Key(0), "val"); - SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - ASSERT_EQ(listener->WaitForRecovery(5000000), true); - - s = Put(Key(1), "val"); - ASSERT_EQ(s, Status::OK()); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, FailRecoverFlushError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - - listener->EnableAutoRecovery(); - DestroyAndReopen(options); - - Put(Key(0), "val"); - SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - 
fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - // We should be able to shutdown the database while auto recovery is going - // on in the background - Close(); - DestroyDB(dbname_, options); -} - -TEST_F(DBErrorHandlingTest, WALWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.writable_file_max_buffer_size = 32768; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - Random rnd(301); - - listener->EnableAutoRecovery(); - DestroyAndReopen(options); - - { - WriteBatch batch; - - for (auto i = 0; i<100; ++i) { - batch.Put(Key(i), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); - }; - - { - WriteBatch batch; - int write_error = 0; - - for (auto i = 100; i<199; ++i) { - batch.Put(Key(i), RandomString(&rnd, 1024)); - } - - SyncPoint::GetInstance()->SetCallBack("WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { - write_error++; - if (write_error > 2) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - } - }); - SyncPoint::GetInstance()->EnableProcessing(); - WriteOptions wopts; - wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(s, s.NoSpace()); - } - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - ASSERT_EQ(listener->WaitForRecovery(5000000), true); - for (auto i=0; i<199; ++i) { - if (i < 100) { - ASSERT_NE(Get(Key(i)), "NOT_FOUND"); - } else { - ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); - } - } - Reopen(options); - for (auto i=0; i<199; ++i) { - if (i < 100) { - ASSERT_NE(Get(Key(i)), "NOT_FOUND"); - } else { 
- ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); - } - } - Close(); -} - -TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.writable_file_max_buffer_size = 32768; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - Random rnd(301); - - listener->EnableAutoRecovery(); - CreateAndReopenWithCF({"one", "two", "three"}, options); - - { - WriteBatch batch; - - for (auto i = 1; i < 4; ++i) { - for (auto j = 0; j < 100; ++j) { - batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024)); - } - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); - }; - - { - WriteBatch batch; - int write_error = 0; - - // Write to one CF - for (auto i = 100; i < 199; ++i) { - batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024)); - } - - SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { - write_error++; - if (write_error > 2) { - fault_env->SetFilesystemActive(false, - Status::NoSpace("Out of space")); - } - }); - SyncPoint::GetInstance()->EnableProcessing(); - WriteOptions wopts; - wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(s, s.NoSpace()); - } - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - ASSERT_EQ(listener->WaitForRecovery(5000000), true); - - for (auto i = 1; i < 4; ++i) { - // Every CF should have been flushed - ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); - } - - for (auto i = 1; i < 4; ++i) { - for (auto j = 0; j < 199; ++j) { - if (j < 100) { - ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); - } else { - ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); - } - } - } - ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); - for (auto i = 1; i < 4; ++i) { - for (auto j 
= 0; j < 199; ++j) { - if (j < 100) { - ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); - } else { - ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); - } - } - } - Close(); -} - -TEST_F(DBErrorHandlingTest, MultiDBCompactionError) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); - std::vector> fault_env; - std::vector options; - std::vector> listener; - std::vector db; - std::shared_ptr sfm(NewSstFileManager(def_env)); - int kNumDbInstances = 3; - Random rnd(301); - - for (auto i = 0; i < kNumDbInstances; ++i) { - listener.emplace_back(new ErrorHandlerListener()); - options.emplace_back(GetDefaultOptions()); - fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); - options[i].create_if_missing = true; - options[i].level0_file_num_compaction_trigger = 2; - options[i].writable_file_max_buffer_size = 32768; - options[i].env = fault_env[i].get(); - options[i].listeners.emplace_back(listener[i]); - options[i].sst_file_manager = sfm; - DB* dbptr; - char buf[16]; - - listener[i]->EnableAutoRecovery(); - // Setup for returning error for the 3rd SST, which would be level 1 - listener[i]->InjectFileCreationError(fault_env[i].get(), 3, - Status::NoSpace("Out of space")); - snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); - db.emplace_back(dbptr); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - for (auto j = 0; j <= 100; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } - - def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - // Write to one CF - for (auto j = 100; j < 199; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - 
WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(true); - ASSERT_EQ(s.severity(), Status::Severity::kSoftError); - fault_env[i]->SetFilesystemActive(true); - } - - def_env->SetFilesystemActive(true); - for (auto i = 0; i < kNumDbInstances; ++i) { - std::string prop; - ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(0), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 0); - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(1), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 1); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - char buf[16]; - snprintf(buf, sizeof(buf), "_%d", i); - delete db[i]; - fault_env[i]->SetFilesystemActive(true); - if (getenv("KEEP_DB")) { - printf("DB is still at %s%s\n", dbname_.c_str(), buf); - } else { - Status s = DestroyDB(dbname_ + std::string(buf), options[i]); - } - } - options.clear(); - sfm.reset(); - delete def_env; -} - -TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); - std::vector> fault_env; - std::vector options; - std::vector> listener; - std::vector db; - std::shared_ptr sfm(NewSstFileManager(def_env)); - int kNumDbInstances = 3; - Random rnd(301); - - for (auto i = 0; i < kNumDbInstances; ++i) { - listener.emplace_back(new ErrorHandlerListener()); - options.emplace_back(GetDefaultOptions()); - fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); - options[i].create_if_missing = true; - options[i].level0_file_num_compaction_trigger = 2; - options[i].writable_file_max_buffer_size = 32768; - options[i].env = fault_env[i].get(); - 
options[i].listeners.emplace_back(listener[i]); - options[i].sst_file_manager = sfm; - DB* dbptr; - char buf[16]; - - listener[i]->EnableAutoRecovery(); - switch (i) { - case 0: - // Setup for returning error for the 3rd SST, which would be level 1 - listener[i]->InjectFileCreationError(fault_env[i].get(), 3, - Status::NoSpace("Out of space")); - break; - case 1: - // Setup for returning error after the 1st SST, which would result - // in a hard error - listener[i]->InjectFileCreationError(fault_env[i].get(), 2, - Status::NoSpace("Out of space")); - break; - default: - break; - } - snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); - db.emplace_back(dbptr); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - for (auto j = 0; j <= 100; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } - - def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - // Write to one CF - for (auto j = 100; j < 199; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - if (i != 1) { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } else { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace()); - } - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(true); - switch (i) { - case 0: - ASSERT_EQ(s.severity(), Status::Severity::kSoftError); - break; - case 1: - ASSERT_EQ(s.severity(), Status::Severity::kHardError); - break; - case 2: - ASSERT_EQ(s, Status::OK()); - break; - } - fault_env[i]->SetFilesystemActive(true); - } 
- - def_env->SetFilesystemActive(true); - for (auto i = 0; i < kNumDbInstances; ++i) { - std::string prop; - if (i < 2) { - ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - } - if (i == 1) { - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); - } - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(0), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 0); - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(1), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 1); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - char buf[16]; - snprintf(buf, sizeof(buf), "_%d", i); - fault_env[i]->SetFilesystemActive(true); - delete db[i]; - if (getenv("KEEP_DB")) { - printf("DB is still at %s%s\n", dbname_.c_str(), buf); - } else { - DestroyDB(dbname_ + std::string(buf), options[i]); - } - } - options.clear(); - delete def_env; -} - -} // namespace ROCKSDB_NAMESPACE - -int main(int argc, char** argv) { - ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,18 @@ #include "db/event_helpers.h" +#include "rocksdb/convenience.h" +#include "rocksdb/listener.h" +#include "rocksdb/utilities/customizable_util.h" + namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +Status EventListener::CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result) { + return 
LoadSharedObject(config_options, id, nullptr, result); +} +#endif // ROCKSDB_LITE namespace { template @@ -26,6 +37,9 @@ const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, TableFileCreationReason reason) { + if (listeners.empty()) { + return; + } TableFileCreationBriefInfo info; info.db_name = db_name; info.cf_name = cf_name; @@ -43,7 +57,7 @@ BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex, bool* auto_recovery) { #ifndef ROCKSDB_LITE - if (listeners.size() == 0U) { + if (listeners.empty()) { return; } db_mutex->AssertHeld(); @@ -51,6 +65,7 @@ db_mutex->Unlock(); for (auto& listener : listeners) { listener->OnBackgroundError(reason, bg_error); + bg_error->PermitUncheckedError(); if (*auto_recovery) { listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery); } @@ -71,14 +86,18 @@ const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, uint64_t oldest_blob_file_number, const TableProperties& table_properties, - TableFileCreationReason reason, const Status& s) { + TableFileCreationReason reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name) { if (s.ok() && event_logger) { JSONWriter jwriter; AppendCurrentTime(&jwriter); jwriter << "cf_name" << cf_name << "job" << job_id << "event" << "table_file_creation" << "file_number" << fd.GetNumber() << "file_size" - << fd.GetFileSize(); + << fd.GetFileSize() << "file_checksum" + << Slice(file_checksum).ToString(true) << "file_checksum_func_name" + << file_checksum_func_name; // table_properties { @@ -104,6 +123,7 @@ table_properties.num_entries) << "num_data_blocks" << table_properties.num_data_blocks << "num_entries" << table_properties.num_entries + << "num_filter_entries" << table_properties.num_filter_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << 
table_properties.num_merge_operands << "num_range_deletions" << table_properties.num_range_deletions @@ -121,7 +141,14 @@ << table_properties.compression_options << "creation_time" << table_properties.creation_time << "oldest_key_time" << table_properties.oldest_key_time << "file_creation_time" - << table_properties.file_creation_time; + << table_properties.file_creation_time + << "slow_compression_estimated_data_size" + << table_properties.slow_compression_estimated_data_size + << "fast_compression_estimated_data_size" + << table_properties.fast_compression_estimated_data_size + << "db_id" << table_properties.db_id << "db_session_id" + << table_properties.db_session_id << "orig_file_number" + << table_properties.orig_file_number; // user collected properties for (const auto& prop : table_properties.readable_properties) { @@ -140,7 +167,7 @@ } #ifndef ROCKSDB_LITE - if (listeners.size() == 0) { + if (listeners.empty()) { return; } TableFileCreationInfo info; @@ -152,9 +179,12 @@ info.table_properties = table_properties; info.reason = reason; info.status = s; + info.file_checksum = file_checksum; + info.file_checksum_func_name = file_checksum_func_name; for (auto& listener : listeners) { listener->OnTableFileCreated(info); } + info.status.PermitUncheckedError(); #else (void)listeners; (void)db_name; @@ -184,6 +214,9 @@ event_logger->Log(jwriter); #ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } TableFileDeletionInfo info; info.db_name = dbname; info.job_id = job_id; @@ -192,6 +225,7 @@ for (auto& listener : listeners) { listener->OnTableFileDeleted(info); } + info.status.PermitUncheckedError(); #else (void)file_path; (void)dbname; @@ -199,25 +233,126 @@ #endif // !ROCKSDB_LITE } -void EventHelpers::NotifyOnErrorRecoveryCompleted( +void EventHelpers::NotifyOnErrorRecoveryEnd( const std::vector>& listeners, - Status old_bg_error, InstrumentedMutex* db_mutex) { + const Status& old_bg_error, const Status& new_bg_error, + InstrumentedMutex* db_mutex) { 
#ifndef ROCKSDB_LITE - if (listeners.size() == 0U) { - return; - } - db_mutex->AssertHeld(); - // release lock while notifying events - db_mutex->Unlock(); - for (auto& listener : listeners) { - listener->OnErrorRecoveryCompleted(old_bg_error); + if (!listeners.empty()) { + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + BackgroundErrorRecoveryInfo info; + info.old_bg_error = old_bg_error; + info.new_bg_error = new_bg_error; + listener->OnErrorRecoveryCompleted(old_bg_error); + listener->OnErrorRecoveryEnd(info); + info.old_bg_error.PermitUncheckedError(); + info.new_bg_error.PermitUncheckedError(); + } + db_mutex->Lock(); } - db_mutex->Lock(); #else (void)listeners; (void)old_bg_error; + (void)new_bg_error; (void)db_mutex; #endif // ROCKSDB_LITE } +#ifndef ROCKSDB_LITE +void EventHelpers::NotifyBlobFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, + BlobFileCreationReason creation_reason) { + if (listeners.empty()) { + return; + } + BlobFileCreationBriefInfo info(db_name, cf_name, file_path, job_id, + creation_reason); + for (const auto& listener : listeners) { + listener->OnBlobFileCreationStarted(info); + } +} +#endif // !ROCKSDB_LITE + +void EventHelpers::LogAndNotifyBlobFileCreationFinished( + EventLogger* event_logger, + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, uint64_t file_number, + BlobFileCreationReason creation_reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name, uint64_t total_blob_count, + uint64_t total_blob_bytes) { + if (s.ok() && event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + jwriter << "cf_name" << cf_name << "job" << job_id << "event" + << "blob_file_creation" + << "file_number" << file_number << 
"total_blob_count" + << total_blob_count << "total_blob_bytes" << total_blob_bytes + << "file_checksum" << file_checksum << "file_checksum_func_name" + << file_checksum_func_name << "status" << s.ToString(); + + jwriter.EndObject(); + event_logger->Log(jwriter); + } + +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + BlobFileCreationInfo info(db_name, cf_name, file_path, job_id, + creation_reason, total_blob_count, total_blob_bytes, + s, file_checksum, file_checksum_func_name); + for (const auto& listener : listeners) { + listener->OnBlobFileCreated(info); + } + info.status.PermitUncheckedError(); +#else + (void)listeners; + (void)db_name; + (void)file_path; + (void)creation_reason; +#endif +} + +void EventHelpers::LogAndNotifyBlobFileDeletion( + EventLogger* event_logger, + const std::vector>& listeners, int job_id, + uint64_t file_number, const std::string& file_path, const Status& status, + const std::string& dbname) { + if (event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + + jwriter << "job" << job_id << "event" + << "blob_file_deletion" + << "file_number" << file_number; + if (!status.ok()) { + jwriter << "status" << status.ToString(); + } + + jwriter.EndObject(); + event_logger->Log(jwriter); + } +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + BlobFileDeletionInfo info(dbname, file_path, job_id, status); + for (const auto& listener : listeners) { + listener->OnBlobFileDeleted(info); + } + info.status.PermitUncheckedError(); +#else + (void)listeners; + (void)dbname; + (void)file_path; +#endif // !ROCKSDB_LITE +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.h 2025-05-19 16:14:27.000000000 +0000 @@ -35,15 +35,42 @@ const std::string& 
db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, uint64_t oldest_blob_file_number, const TableProperties& table_properties, - TableFileCreationReason reason, const Status& s); + TableFileCreationReason reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name); static void LogAndNotifyTableFileDeletion( EventLogger* event_logger, int job_id, uint64_t file_number, const std::string& file_path, const Status& status, const std::string& db_name, const std::vector>& listeners); - static void NotifyOnErrorRecoveryCompleted( + static void NotifyOnErrorRecoveryEnd( const std::vector>& listeners, - Status bg_error, InstrumentedMutex* db_mutex); + const Status& old_bg_error, const Status& new_bg_error, + InstrumentedMutex* db_mutex); + +#ifndef ROCKSDB_LITE + static void NotifyBlobFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, + BlobFileCreationReason creation_reason); +#endif // !ROCKSDB_LITE + + static void LogAndNotifyBlobFileCreationFinished( + EventLogger* event_logger, + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, uint64_t file_number, + BlobFileCreationReason creation_reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name, uint64_t total_blob_count, + uint64_t total_blob_bytes); + + static void LogAndNotifyBlobFileDeletion( + EventLogger* event_logger, + const std::vector>& listeners, int job_id, + uint64_t file_number, const std::string& file_path, const Status& status, + const std::string& db_name); private: static void LogAndNotifyTableFileCreation( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,11 +6,14 @@ #include #include "db/db_test_util.h" +#include "db/version_edit.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "test_util/fault_injection_test_env.h" +#include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { @@ -19,15 +22,32 @@ : public DBTestBase, public ::testing::WithParamInterface> { public: - ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") { - sst_files_dir_ = dbname_ + "/sst_files/"; - fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); + ExternalSSTFileBasicTest() + : DBTestBase("external_sst_file_basic_test", /*env_do_fsync=*/true) { + sst_files_dir_ = dbname_ + "_sst_files/"; + fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_)); DestroyAndRecreateExternalSSTFilesDir(); + + // Check if the Env supports RandomRWFile + std::string file_path = sst_files_dir_ + "test_random_rw_file"; + std::unique_ptr wfile; + assert(env_->NewWritableFile(file_path, &wfile, EnvOptions()).ok()); + wfile.reset(); + std::unique_ptr rwfile; + Status s = env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()); + if (s.IsNotSupported()) { + random_rwfile_supported_ = false; + } else { + EXPECT_OK(s); + random_rwfile_supported_ = true; + } + rwfile.reset(); + EXPECT_OK(env_->DeleteFile(file_path)); } void DestroyAndRecreateExternalSSTFilesDir() { - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); } Status DeprecatedAddFile(const std::vector& files, @@ -41,6 +61,29 @@ return 
db_->IngestExternalFile(files, opts); } + Status AddFileWithFileChecksum( + const std::vector& files, + const std::vector& files_checksums, + const std::vector& files_checksum_func_names, + bool verify_file_checksum = true, bool move_files = false, + bool skip_snapshot_check = false, bool write_global_seqno = true) { + IngestExternalFileOptions opts; + opts.move_files = move_files; + opts.snapshot_consistency = !skip_snapshot_check; + opts.allow_global_seqno = false; + opts.allow_blocking_flush = false; + opts.write_global_seqno = write_global_seqno; + opts.verify_file_checksum = verify_file_checksum; + + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = files; + arg.options = opts; + arg.files_checksums = files_checksums; + arg.files_checksum_func_names = files_checksum_func_names; + return db_->IngestExternalFiles({arg}); + } + Status GenerateAndAddExternalFile( const Options options, std::vector keys, const std::vector& value_types, @@ -137,12 +180,23 @@ } ~ExternalSSTFileBasicTest() override { - test::DestroyDir(env_, sst_files_dir_); + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); } protected: std::string sst_files_dir_; std::unique_ptr fault_injection_test_env_; + bool random_rwfile_supported_; +#ifndef ROCKSDB_LITE + uint64_t GetSstSizeHelper(Temperature temperature) { + std::string prop; + EXPECT_TRUE( + dbfull()->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature + + ToString(static_cast(temperature)), + &prop)); + return static_cast(std::atoi(prop.c_str())); + } +#endif // ROCKSDB_LITE }; TEST_F(ExternalSSTFileBasicTest, Basic) { @@ -162,7 +216,7 @@ } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); // Current file size should be non-zero after success write. 
ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -174,16 +228,18 @@ ASSERT_EQ(file1_info.num_range_del_entries, 0); ASSERT_EQ(file1_info.smallest_range_del_key, ""); ASSERT_EQ(file1_info.largest_range_del_key, ""); + ASSERT_EQ(file1_info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName); // sst_file_writer already finished, cannot add this value s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); s = sst_file_writer.DeleteRange(Key(100), Key(200)); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); DestroyAndReopen(options); // Add file using file path s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -192,6 +248,391 @@ DestroyAndRecreateExternalSSTFilesDir(); } +class ChecksumVerifyHelper { + private: + Options options_; + + public: + ChecksumVerifyHelper(Options& options) : options_(options) {} + ~ChecksumVerifyHelper() {} + + Status GetSingleFileChecksumAndFuncName( + const std::string& file_path, std::string* file_checksum, + std::string* file_checksum_func_name) { + Status s; + EnvOptions soptions; + std::unique_ptr file_reader; + s = options_.env->NewSequentialFile(file_path, &file_reader, soptions); + if (!s.ok()) { + return s; + } + std::unique_ptr scratch(new char[2048]); + Slice result; + FileChecksumGenFactory* file_checksum_gen_factory = + options_.file_checksum_gen_factory.get(); + if (file_checksum_gen_factory == nullptr) { + *file_checksum = kUnknownFileChecksum; + *file_checksum_func_name = kUnknownFileChecksumFuncName; + return Status::OK(); + } else { + FileChecksumGenContext gen_context; + std::unique_ptr file_checksum_gen = + file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context); + *file_checksum_func_name = 
file_checksum_gen->Name(); + s = file_reader->Read(2048, &result, scratch.get()); + if (!s.ok()) { + return s; + } + while (result.size() != 0) { + file_checksum_gen->Update(scratch.get(), result.size()); + s = file_reader->Read(2048, &result, scratch.get()); + if (!s.ok()) { + return s; + } + } + file_checksum_gen->Finalize(); + *file_checksum = file_checksum_gen->GetChecksum(); + } + return Status::OK(); + } +}; + +TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) { + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + ChecksumVerifyHelper checksum_helper(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + // file1.sst (0 => 99) + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + std::string file_checksum, file_checksum_func_name; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file1, &file_checksum, &file_checksum_func_name)); + + // Current file size should be non-zero after success write. 
+ ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + ASSERT_EQ(file1_info.num_range_del_entries, 0); + ASSERT_EQ(file1_info.smallest_range_del_key, ""); + ASSERT_EQ(file1_info.largest_range_del_key, ""); + ASSERT_EQ(file1_info.file_checksum, file_checksum); + ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name); + // sst_file_writer already finished, cannot add this value + s = sst_file_writer.Put(Key(100), "bad_val"); + ASSERT_NOK(s) << s.ToString(); + s = sst_file_writer.DeleteRange(Key(100), Key(200)); + ASSERT_NOK(s) << s.ToString(); + + DestroyAndReopen(options); + // Add file using file path + s = DeprecatedAddFile({file1}); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 100; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + DestroyAndRecreateExternalSSTFilesDir(); +} + +TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { + Options old_options = CurrentOptions(); + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + const ImmutableCFOptions ioptions(options); + ChecksumVerifyHelper checksum_helper(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file01.sst (1000 => 1099) + std::string file1 = sst_files_dir_ + "file01.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 1000; k < 1100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(1000)); + ASSERT_EQ(file1_info.largest_key, Key(1099)); + std::string file_checksum1, file_checksum_func_name1; + 
ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file1, &file_checksum1, &file_checksum_func_name1)); + ASSERT_EQ(file1_info.file_checksum, file_checksum1); + ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1); + + // file02.sst (1100 => 1299) + std::string file2 = sst_files_dir_ + "file02.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 1100; k < 1300; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 200); + ASSERT_EQ(file2_info.smallest_key, Key(1100)); + ASSERT_EQ(file2_info.largest_key, Key(1299)); + std::string file_checksum2, file_checksum_func_name2; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file2, &file_checksum2, &file_checksum_func_name2)); + ASSERT_EQ(file2_info.file_checksum, file_checksum2); + ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2); + + // file03.sst (1300 => 1499) + std::string file3 = sst_files_dir_ + "file03.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 1300; k < 1500; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 200); + ASSERT_EQ(file3_info.smallest_key, Key(1300)); + ASSERT_EQ(file3_info.largest_key, Key(1499)); + std::string file_checksum3, file_checksum_func_name3; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file3, &file_checksum3, &file_checksum_func_name3)); + ASSERT_EQ(file3_info.file_checksum, file_checksum3); + ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3); + + // file04.sst (1500 => 1799) + std::string file4 = sst_files_dir_ + "file04.sst"; + 
ASSERT_OK(sst_file_writer.Open(file4)); + for (int k = 1500; k < 1800; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file4_info; + s = sst_file_writer.Finish(&file4_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file4_info.file_path, file4); + ASSERT_EQ(file4_info.num_entries, 300); + ASSERT_EQ(file4_info.smallest_key, Key(1500)); + ASSERT_EQ(file4_info.largest_key, Key(1799)); + std::string file_checksum4, file_checksum_func_name4; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file4, &file_checksum4, &file_checksum_func_name4)); + ASSERT_EQ(file4_info.file_checksum, file_checksum4); + ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4); + + // file05.sst (1800 => 1899) + std::string file5 = sst_files_dir_ + "file05.sst"; + ASSERT_OK(sst_file_writer.Open(file5)); + for (int k = 1800; k < 2000; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file5_info; + s = sst_file_writer.Finish(&file5_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file5_info.file_path, file5); + ASSERT_EQ(file5_info.num_entries, 200); + ASSERT_EQ(file5_info.smallest_key, Key(1800)); + ASSERT_EQ(file5_info.largest_key, Key(1999)); + std::string file_checksum5, file_checksum_func_name5; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file5, &file_checksum5, &file_checksum_func_name5)); + ASSERT_EQ(file5_info.file_checksum, file_checksum5); + ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5); + + // file06.sst (2000 => 2199) + std::string file6 = sst_files_dir_ + "file06.sst"; + ASSERT_OK(sst_file_writer.Open(file6)); + for (int k = 2000; k < 2200; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file6_info; + s = sst_file_writer.Finish(&file6_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file6_info.file_path, file6); + ASSERT_EQ(file6_info.num_entries, 
200); + ASSERT_EQ(file6_info.smallest_key, Key(2000)); + ASSERT_EQ(file6_info.largest_key, Key(2199)); + std::string file_checksum6, file_checksum_func_name6; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file6, &file_checksum6, &file_checksum_func_name6)); + ASSERT_EQ(file6_info.file_checksum, file_checksum6); + ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6); + + s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"}, + {file_checksum1}, true, false, false, false); + // does not care the checksum input since db does not enable file checksum + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file1)); + std::vector live_files; + dbfull()->GetLiveFilesMetaData(&live_files); + std::set set1; + for (auto f : live_files) { + set1.insert(f.name); + ASSERT_EQ(f.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(f.file_checksum_func_name, kUnknownFileChecksumFuncName); + } + + // check the temperature of the file being ingested + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[6].files[0].temperature); + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + + // Reopen Db with checksum enabled + Reopen(options); + // Enable verify_file_checksum option + // The checksum vector does not match, fail the ingestion + s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"}, + {file_checksum_func_name2}, true, false, false, + false); + ASSERT_NOK(s) << s.ToString(); + + // Enable verify_file_checksum option + // The checksum name does not match, fail the ingestion + s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false, + false, false); + ASSERT_NOK(s) << 
s.ToString(); + + // Enable verify_file_checksum option + // The checksum itself does not match, fail the ingestion + s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2}, + true, false, false, false); + ASSERT_NOK(s) << s.ToString(); + + // Enable verify_file_checksum option + // All matches, ingestion is successful + s = AddFileWithFileChecksum({file2}, {file_checksum2}, + {file_checksum_func_name2}, true, false, false, + false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files1; + dbfull()->GetLiveFilesMetaData(&live_files1); + for (auto f : live_files1) { + if (set1.find(f.name) == set1.end()) { + ASSERT_EQ(f.file_checksum, file_checksum2); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name2); + set1.insert(f.name); + } + } + ASSERT_OK(env_->FileExists(file2)); + + // Enable verify_file_checksum option + // No checksum information is provided, generate it when ingesting + std::vector checksum, checksum_func; + s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false, + false, false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files2; + dbfull()->GetLiveFilesMetaData(&live_files2); + for (auto f : live_files2) { + if (set1.find(f.name) == set1.end()) { + ASSERT_EQ(f.file_checksum, file_checksum3); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file3)); + + // Does not enable verify_file_checksum options + // The checksum name does not match, fail the ingestion + s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false, + false, false); + ASSERT_NOK(s) << s.ToString(); + + // Does not enable verify_file_checksum options + // Checksum function name matches, store the checksum being ingested. 
+ s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4}, + false, false, false, false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files3; + dbfull()->GetLiveFilesMetaData(&live_files3); + for (auto f : live_files3) { + if (set1.find(f.name) == set1.end()) { + ASSERT_FALSE(f.file_checksum == file_checksum4); + ASSERT_EQ(f.file_checksum, "asd"); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file4)); + + // enable verify_file_checksum options, DB enable checksum, and enable + // write_global_seq. So the checksum stored is different from the one + // ingested due to the sequence number changes. + s = AddFileWithFileChecksum({file5}, {file_checksum5}, + {file_checksum_func_name5}, true, false, false, + true); + ASSERT_OK(s) << s.ToString(); + std::vector live_files4; + dbfull()->GetLiveFilesMetaData(&live_files4); + for (auto f : live_files4) { + if (set1.find(f.name) == set1.end()) { + std::string cur_checksum5, cur_checksum_func_name5; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5)); + ASSERT_EQ(f.file_checksum, cur_checksum5); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file5)); + + // Does not enable verify_file_checksum options and also the ingested file + // checksum information is empty. DB will generate and store the checksum + // in Manifest. 
+ std::vector files_c6, files_name6; + s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false, + false, false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files6; + dbfull()->GetLiveFilesMetaData(&live_files6); + for (auto f : live_files6) { + if (set1.find(f.name) == set1.end()) { + ASSERT_EQ(f.file_checksum, file_checksum6); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file6)); + db_->GetColumnFamilyMetaData(&metadata); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); +} + TEST_F(ExternalSSTFileBasicTest, NoCopy) { Options options = CurrentOptions(); const ImmutableCFOptions ioptions(options); @@ -206,7 +647,7 @@ } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -220,7 +661,7 @@ } ExternalSstFileInfo file2_info; s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -234,23 +675,23 @@ } ExternalSstFileInfo file3_info; s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 15); ASSERT_EQ(file3_info.smallest_key, Key(110)); ASSERT_EQ(file3_info.largest_key, Key(124)); s = DeprecatedAddFile({file1}, true /* move file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + 
ASSERT_OK(s) << s.ToString(); ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); s = DeprecatedAddFile({file2}, false /* copy file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file2)); // This file has overlapping values with the existing data s = DeprecatedAddFile({file3}, true /* move file */); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file3)); for (int k = 0; k < 300; k++) { @@ -706,12 +1147,31 @@ "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}}; for (size_t i = 0; i < test_cases.size(); i++) { + bool no_sync = false; SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) { fault_injection_test_env_->SetFilesystemActive(false); }); SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) { fault_injection_test_env_->SetFilesystemActive(true); }); + if (i == 0) { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) { + Status* status = static_cast(s); + if (status->IsNotSupported()) { + no_sync = true; + } + }); + } + if (i == 2) { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::NewRandomRWFile", [&](void* s) { + Status* status = static_cast(s); + if (status->IsNotSupported()) { + no_sync = true; + } + }); + } SyncPoint::GetInstance()->EnableProcessing(); DestroyAndReopen(options); @@ -720,6 +1180,7 @@ } Options sst_file_writer_options; + sst_file_writer_options.env = fault_injection_test_env_.get(); std::unique_ptr sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = @@ -736,7 +1197,12 @@ if (i == 2) { ingest_opt.write_global_seqno = true; } - ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + Status s = db_->IngestExternalFile({file_name}, ingest_opt); + if (no_sync) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } db_->ReleaseSnapshot(snapshot); 
SyncPoint::GetInstance()->DisableProcessing(); @@ -745,20 +1211,56 @@ } } +TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) { + Options options; + options.create_if_missing = true; + options.env = env_; + + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) { + Status* s = static_cast(arg); + *s = Status::NotSupported(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + Options sst_file_writer_options; + sst_file_writer_options.env = env_; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "reopen_not_supported_test_" + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = true; + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { Options options; options.create_if_missing = true; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_); options.env = &senv; DestroyAndReopen(options); Options sst_file_writer_options; + sst_file_writer_options.env = env_; std::unique_ptr sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst"; ASSERT_OK(sst_file_writer->Open(file_name)); Random rnd(301); - std::string value = DBTestBase::RandomString(&rnd, 4000); + std::string value = rnd.RandomString(4000); for (int i = 0; i < 5000; i++) { ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value)); } @@ -796,6 +1298,45 @@ Destroy(options); } +TEST_F(ExternalSSTFileBasicTest, 
IngestRangeDeletionTombstoneWithGlobalSeqno) { + for (int i = 5; i < 25; i++) { + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i), + Key(i) + "_val")); + } + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file.sst (delete 0 => 30) + std::string file = sst_files_dir_ + "file.sst"; + ASSERT_OK(sst_file_writer.Open(file)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(30))); + ExternalSstFileInfo file_info; + ASSERT_OK(sst_file_writer.Finish(&file_info)); + ASSERT_EQ(file_info.file_path, file); + ASSERT_EQ(file_info.num_entries, 0); + ASSERT_EQ(file_info.smallest_key, ""); + ASSERT_EQ(file_info.largest_key, ""); + ASSERT_EQ(file_info.num_range_del_entries, 1); + ASSERT_EQ(file_info.smallest_range_del_key, Key(0)); + ASSERT_EQ(file_info.largest_range_del_key, Key(30)); + + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.snapshot_consistency = true; + ifo.allow_global_seqno = true; + ifo.write_global_seqno = true; + ifo.verify_checksums_before_ingest = false; + ASSERT_OK(db_->IngestExternalFile({file}, ifo)); + + for (int i = 5; i < 25; i++) { + std::string res; + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &res).IsNotFound()); + } +} + TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { int kNumLevels = 7; Options options = CurrentOptions(); @@ -896,7 +1437,7 @@ ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); ExternalSstFileInfo file8_info; Status s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -911,7 +1452,7 @@ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file9_info; s = sst_file_writer.Finish(&file9_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << 
s.ToString(); ASSERT_EQ(file9_info.file_path, file9); ASSERT_EQ(file9_info.num_entries, 0); ASSERT_EQ(file9_info.smallest_key, ""); @@ -923,7 +1464,7 @@ // Range deletion tombstones are exclusive on their end key, so these SSTs // should not be considered as overlapping. s = DeprecatedAddFile({file8, file9}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); DestroyAndRecreateExternalSSTFilesDir(); } @@ -964,6 +1505,10 @@ } TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { + if (!random_rwfile_supported_) { + ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support"); + return; + } SyncPoint::GetInstance()->DisableProcessing(); int file_id = 0; EnvOptions env_options; @@ -1013,6 +1558,11 @@ TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) { bool verify_checksums_before_ingest = std::get<1>(GetParam()); if (!verify_checksums_before_ingest) { + ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest"); + return; + } + if (!random_rwfile_supported_) { + ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support"); return; } uint64_t props_block_offset = 0; @@ -1111,6 +1661,141 @@ ASSERT_EQ(2, NumTableFilesAtLevel(0)); } +TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { + // Repro https://github.com/facebook/rocksdb/issues/6245. + // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction + // via trivial move. The bug happened when L1 files were incorrectly sorted + // resulting in an old value for "k" returned by `Get()`. + Options options = CurrentOptions(); + + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. 
+ ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("k", "b")); + + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + IngestExternalFileOptions ifo; + s = db_->IngestExternalFile({file1}, ifo); + ASSERT_OK(s); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(Get("k"), "b"); +} + +TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) { + Options options = CurrentOptions(); + const ImmutableCFOptions ioptions(options); + options.bottommost_temperature = Temperature::kWarm; + SstFileWriter sst_file_writer(EnvOptions(), options); + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + + // create file01.sst (1000 => 1099) and ingest it + std::string file1 = sst_files_dir_ + "file01.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 1000; k < 1100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(1000)); + ASSERT_EQ(file1_info.largest_key, Key(1099)); + + std::vector files; + std::vector files_checksums; + std::vector files_checksum_func_names; + Temperature file_temperature = Temperature::kWarm; + + files.push_back(file1); + IngestExternalFileOptions in_opts; + in_opts.move_files = false; + in_opts.snapshot_consistency = true; + in_opts.allow_global_seqno = false; + 
in_opts.allow_blocking_flush = false; + in_opts.write_global_seqno = true; + in_opts.verify_file_checksum = false; + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = files; + arg.options = in_opts; + arg.files_checksums = files_checksums; + arg.files_checksum_func_names = files_checksum_func_names; + arg.file_temperature = file_temperature; + s = db_->IngestExternalFiles({arg}); + ASSERT_OK(s); + + // check the temperature of the file being ingested + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 1); + + // non-bottommost file still has unknown temperature + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // reopen and check the information is persisted + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // check other non-exist temperatures + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty( + 
DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), @@ -1124,5 +1809,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,7 @@ #include "db/version_edit.h" #include "file/file_util.h" #include "file/random_access_file_reader.h" +#include "logging/logging.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" @@ -28,26 +29,39 @@ Status ExternalSstFileIngestionJob::Prepare( const std::vector& external_files_paths, - uint64_t next_file_number, SuperVersion* sv) { + const std::vector& files_checksums, + const std::vector& files_checksum_func_names, + const Temperature& file_temperature, uint64_t next_file_number, + SuperVersion* sv) { Status status; // Read the information of files we are ingesting for (const std::string& file_path : external_files_paths) { IngestedFileInfo file_to_ingest; - status = GetIngestedFileInfo(file_path, &file_to_ingest, sv); + status = + GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv); if (!status.ok()) { return status; } - files_to_ingest_.push_back(file_to_ingest); - } - for (const IngestedFileInfo& f : files_to_ingest_) { - if (f.cf_id != + if (file_to_ingest.cf_id != 
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && - f.cf_id != cfd_->GetID()) { + file_to_ingest.cf_id != cfd_->GetID()) { return Status::InvalidArgument( - "External file column family id dont match"); + "External file column family id don't match"); + } + + if (file_to_ingest.num_entries == 0 && + file_to_ingest.num_range_deletions == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!file_to_ingest.smallest_internal_key.Valid() || + !file_to_ingest.largest_internal_key.Valid()) { + return Status::Corruption("Generated table have corrupted keys"); } + + files_to_ingest_.emplace_back(std::move(file_to_ingest)); } const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); @@ -55,7 +69,7 @@ if (num_files == 0) { return Status::InvalidArgument("The list of files is empty"); } else if (num_files > 1) { - // Verify that passed files dont have overlapping ranges + // Verify that passed files don't have overlapping ranges autovector sorted_files; for (size_t i = 0; i < num_files; i++) { sorted_files.push_back(&files_to_ingest_[i]); @@ -68,7 +82,7 @@ info2->smallest_internal_key) < 0; }); - for (size_t i = 0; i < num_files - 1; i++) { + for (size_t i = 0; i + 1 < num_files; i++) { if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, sorted_files[i + 1]->smallest_internal_key) >= 0) { files_overlap_ = true; @@ -77,24 +91,18 @@ } } - if (ingestion_options_.ingest_behind && files_overlap_) { - return Status::NotSupported("Files have overlapping ranges"); + // Hanlde the file temperature + for (size_t i = 0; i < num_files; i++) { + files_to_ingest_[i].file_temperature = file_temperature; } - for (IngestedFileInfo& f : files_to_ingest_) { - if (f.num_entries == 0 && f.num_range_deletions == 0) { - return Status::InvalidArgument("File contain no entries"); - } - - if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { - return Status::Corruption("Generated table have corrupted keys"); - } 
+ if (ingestion_options_.ingest_behind && files_overlap_) { + return Status::NotSupported("Files have overlapping ranges"); } // Copy/Move external files into DB std::unordered_set ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { - f.fd = FileDescriptor(next_file_number++, 0, f.file_size); f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = @@ -108,17 +116,26 @@ // directory before ingest the file. For integrity of RocksDB we need // to sync the file. std::unique_ptr file_to_sync; - status = fs_->ReopenWritableFile(path_inside_db, env_options_, - &file_to_sync, nullptr); - if (status.ok()) { - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); - status = SyncIngestedFile(file_to_sync.get()); - TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to sync ingested file %s: %s", - path_inside_db.c_str(), status.ToString().c_str()); + Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, + &file_to_sync, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen", + &s); + // Some file systems (especially remote/distributed) don't support + // reopening a file for writing and don't require reopening and + // syncing the file. Ignore the NotSupported error in that case. 
+ if (!s.IsNotSupported()) { + status = s; + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } } } } else if (status.IsNotSupported() && @@ -134,21 +151,26 @@ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", nullptr); // CopyFile also sync the new file. - status = CopyFile(fs_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + status = CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, + db_options_.use_fsync, io_tracer_); } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { break; } f.internal_file_path = path_inside_db; + // Initialize the checksum information of ingested files. + f.file_checksum = kUnknownFileChecksum; + f.file_checksum_func_name = kUnknownFileChecksumFuncName; ingestion_path_ids.insert(f.fd.GetPathId()); } TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); if (status.ok()) { for (auto path_id : ingestion_path_ids) { - status = directories_->GetDataDir(path_id)->Fsync(); + status = directories_->GetDataDir(path_id)->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Failed to sync directory %" ROCKSDB_PRIszt @@ -160,14 +182,141 @@ } TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir"); + // Generate and check the sst file checksum. Note that, if + // IngestExternalFileOptions::write_global_seqno is true, we will not update + // the checksum information in the files_to_ingests_ here, since the file is + // upadted with the new global_seqno. 
After global_seqno is updated, DB will + // generate the new checksum and store it in the Manifest. In all other cases + // if ingestion_options_.write_global_seqno == true and + // verify_file_checksum is false, we only check the checksum function name. + if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) { + if (ingestion_options_.verify_file_checksum == false && + files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Only when verify_file_checksum == false and the checksum for ingested + // files are provided, DB will use the provided checksum and does not + // generate the checksum for ingested files. + need_generate_file_checksum_ = false; + } else { + need_generate_file_checksum_ = true; + } + FileChecksumGenContext gen_context; + std::unique_ptr file_checksum_gen = + db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + std::vector generated_checksums; + std::vector generated_checksum_func_names; + // Step 1: generate the checksum for ingested sst file. 
+ if (need_generate_file_checksum_) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + std::string generated_checksum; + std::string generated_checksum_func_name; + std::string requested_checksum_func_name; + IOStatus io_s = GenerateOneFileChecksum( + fs_.get(), files_to_ingest_[i].internal_file_path, + db_options_.file_checksum_gen_factory.get(), + requested_checksum_func_name, &generated_checksum, + &generated_checksum_func_name, + ingestion_options_.verify_checksums_readahead_size, + db_options_.allow_mmap_reads, io_tracer_, + db_options_.rate_limiter.get()); + if (!io_s.ok()) { + status = io_s; + ROCKS_LOG_WARN(db_options_.info_log, + "Sst file checksum generation of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + if (ingestion_options_.write_global_seqno == false) { + files_to_ingest_[i].file_checksum = generated_checksum; + files_to_ingest_[i].file_checksum_func_name = + generated_checksum_func_name; + } + generated_checksums.push_back(generated_checksum); + generated_checksum_func_names.push_back(generated_checksum_func_name); + } + } + + // Step 2: based on the verify_file_checksum and ingested checksum + // information, do the verification. + if (status.ok()) { + if (files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Verify the checksum and checksum function name. 
+ if (ingestion_options_.verify_file_checksum) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != + generated_checksum_func_names[i]) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + if (files_checksums[i] != generated_checksums[i]) { + status = Status::Corruption( + "Ingested checksum does not match with the generated " + "checksum"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + } + } else { + // If verify_file_checksum is not enabled, we only verify the + // checksum function name. If it does not match, fail the ingestion. + // If matches, we trust the ingested checksum information and store + // in the Manifest. + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != file_checksum_gen->Name()) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + files_to_ingest_[i].file_checksum = files_checksums[i]; + files_to_ingest_[i].file_checksum_func_name = + files_checksum_func_names[i]; + } + } + } else if (files_checksums.size() != files_checksum_func_names.size() || + (files_checksums.size() == files_checksum_func_names.size() && + files_checksums.size() != 0)) { + // The checksum or checksum function name vector are not both empty + // and they are incomplete. 
+ status = Status::InvalidArgument( + "The checksum information of ingested sst files are nonempty and " + "the size of checksums or the size of the checksum function " + "names " + "does not match with the number of ingested sst files"); + ROCKS_LOG_WARN( + db_options_.info_log, + "The ingested sst files checksum information is incomplete: %s", + status.ToString().c_str()); + } + } + } + // TODO: The following is duplicated with Cleanup(). if (!status.ok()) { + IOOptions io_opts; // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -186,8 +335,8 @@ ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), file_to_ingest.largest_internal_key.user_key()); } - Status status = - cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed); + Status status = cfd_->RangesOverlapWithMemtables( + ranges, super_version, db_options_.allow_data_in_errors, flush_needed); if (status.ok() && *flush_needed && !ingestion_options_.allow_blocking_flush) { status = Status::InvalidArgument("External file requires flush"); @@ -205,6 +354,12 @@ // with the files we are ingesting bool need_flush = false; status = NeedsFlush(&need_flush, super_version); + if (!status.ok()) { + return status; + } + if (need_flush) { + return Status::TryAgain(); + } assert(status.ok() && need_flush == false); #endif @@ -212,7 +367,7 @@ if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) { // We need to assign a global sequence number to all the files even - // if the dont overlap with any ranges since we have snapshots + // if the don't overlap with any ranges since we have snapshots force_global_seqno = true; } // It is safe to use this instead of 
LastAllocatedSequence since we are @@ -230,9 +385,32 @@ super_version, force_global_seqno, cfd_->ioptions()->compaction_style, last_seqno, &f, &assigned_seqno); } + + // Modify the smallest/largest internal key to include the sequence number + // that we just learned. Only overwrite sequence number zero. There could + // be a nonzero sequence number already to indicate a range tombstone's + // exclusive endpoint. + ParsedInternalKey smallest_parsed, largest_parsed; + if (status.ok()) { + status = ParseInternalKey(*f.smallest_internal_key.rep(), + &smallest_parsed, false /* log_err_key */); + } + if (status.ok()) { + status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, + false /* log_err_key */); + } if (!status.ok()) { return status; } + if (smallest_parsed.sequence == 0) { + UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + smallest_parsed.type); + } + if (largest_parsed.sequence == 0) { + UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + largest_parsed.type); + } + status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); @@ -245,21 +423,29 @@ return status; } + status = GenerateChecksumForIngestedFile(&f); + if (!status.ok()) { + return status; + } + // We use the import time as the ancester time. This is the time the data // is written to the database. 
int64_t temp_current_time = 0; uint64_t current_time = kUnknownFileCreationTime; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } - - edit_.AddFile( - f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), + FileMetaData f_metadata( + f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, - f.assigned_seqno, false, kInvalidBlobFileNumber, oldest_ancester_time, - current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName); + f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, + oldest_ancester_time, current_time, f.file_checksum, + f.file_checksum_func_name, kDisableUserTimestamp, + kDisableUserTimestamp); + f_metadata.temperature = f.file_temperature; + edit_.AddFile(f.picked_level, f_metadata); } return status; } @@ -268,7 +454,7 @@ // Update internal stats for new ingested files uint64_t total_keys = 0; uint64_t total_l0_files = 0; - uint64_t total_time = env_->NowMicros() - job_start_time_; + uint64_t total_time = clock_->NowMicros() - job_start_time_; EventLoggerStream stream = event_logger_->Log(); stream << "event" @@ -324,6 +510,7 @@ } void ExternalSstFileIngestionJob::Cleanup(const Status& status) { + IOOptions io_opts; if (!status.ok()) { // We failed to add the files to the database // remove all the files we copied @@ -331,7 +518,7 @@ if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -343,7 +530,7 @@ } else if (status.ok() && ingestion_options_.move_files) { // The files were moved and added successfully, remove original file links for 
(IngestedFileInfo& f : files_to_ingest_) { - Status s = env_->DeleteFile(f.external_file_path); + Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN( db_options_.info_log, @@ -356,8 +543,8 @@ } Status ExternalSstFileIngestionJob::GetIngestedFileInfo( - const std::string& external_file, IngestedFileInfo* file_to_ingest, - SuperVersion* sv) { + const std::string& external_file, uint64_t new_file_number, + IngestedFileInfo* file_to_ingest, SuperVersion* sv) { file_to_ingest->external_file_path = external_file; // Get external file size @@ -367,6 +554,10 @@ return status; } + // Assign FD with number + file_to_ingest->fd = + FileDescriptor(new_file_number, 0, file_to_ingest->file_size); + // Create TableReader for external file std::unique_ptr table_reader; std::unique_ptr sst_file; @@ -377,13 +568,18 @@ if (!status.ok()) { return status; } - sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file), - external_file)); + sst_file_reader.reset(new RandomAccessFileReader( + std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); status = cfd_->ioptions()->table_factory->NewTableReader( - TableReaderOptions(*cfd_->ioptions(), - sv->mutable_cf_options.prefix_extractor.get(), - env_options_, cfd_->internal_comparator()), + TableReaderOptions( + *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, + env_options_, cfd_->internal_comparator(), + /*skip_filters*/ false, /*immortal*/ false, + /*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(), + /*cur_file_num*/ new_file_number), std::move(sst_file_reader), file_to_ingest->file_size, &table_reader); if (!status.ok()) { return status; @@ -423,14 +619,12 @@ // Set the global sequence number file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str()); - auto offsets_iter = props->properties_offsets.find( - ExternalSstFilePropertyNames::kGlobalSeqno); 
- if (offsets_iter == props->properties_offsets.end() || - offsets_iter->second == 0) { + if (props->external_sst_file_global_seqno_offset == 0) { file_to_ingest->global_seqno_offset = 0; return Status::Corruption("Was not able to find file global seqno field"); } - file_to_ingest->global_seqno_offset = static_cast(offsets_iter->second); + file_to_ingest->global_seqno_offset = + static_cast(props->external_sst_file_global_seqno_offset); } else if (file_to_ingest->version == 1) { // SST file V1 should not have global seqno field assert(seqno_iter == uprops.end()); @@ -467,22 +661,28 @@ file_to_ingest->largest_internal_key = InternalKey("", 0, ValueType::kTypeValue); bool bounds_set = false; + bool allow_data_in_errors = db_options_.allow_data_in_errors; iter->SeekToFirst(); if (iter->Valid()) { - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->smallest_internal_key.SetFrom(key); iter->SeekToLast(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. 
", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->largest_internal_key.SetFrom(key); @@ -495,8 +695,11 @@ if (range_del_iter != nullptr) { for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); range_del_iter->Next()) { - if (!ParseInternalKey(range_del_iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } RangeTombstone tombstone(key, range_del_iter->value()); @@ -570,10 +773,11 @@ const std::vector& level_files = vstorage->LevelFiles(lvl); const SequenceNumber level_largest_seqno = - (*max_element(level_files.begin(), level_files.end(), - [](FileMetaData* f1, FileMetaData* f2) { - return f1->fd.largest_seqno < f2->fd.largest_seqno; - })) + (*std::max_element(level_files.begin(), level_files.end(), + [](FileMetaData* f1, FileMetaData* f2) { + return f1->fd.largest_seqno < + f2->fd.largest_seqno; + })) ->fd.largest_seqno; // should only assign seqno to current level's largest seqno when // the file fits @@ -588,7 +792,7 @@ continue; } - // We dont overlap with any keys in this level, but we still need to check + // We don't overlap with any keys in this level, but we still need to check // if our file can fit in it if (IngestedFileFitInLevel(file_to_ingest, lvl)) { target_level = lvl; @@ -646,7 +850,7 @@ return Status::InvalidArgument("Global seqno is required, but disabled"); } else if (file_to_ingest->global_seqno_offset == 0) { return Status::InvalidArgument( - "Trying to set global seqno for a file that dont have a global seqno " + "Trying to set global seqno for a file that don't have a global seqno " "field"); } @@ -658,14 +862,18 @@ Status 
status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_, &rwfile, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile", + &status); if (status.ok()) { + FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_, + file_to_ingest->internal_file_path); std::string seqno_val; PutFixed64(&seqno_val, seqno); - status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val, - IOOptions(), nullptr); + status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, + IOOptions(), nullptr); if (status.ok()) { TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); - status = SyncIngestedFile(rwfile.get()); + status = SyncIngestedFile(fsptr.get()); TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -687,6 +895,33 @@ return Status::OK(); } +IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( + IngestedFileInfo* file_to_ingest) { + if (db_options_.file_checksum_gen_factory == nullptr || + need_generate_file_checksum_ == false || + ingestion_options_.write_global_seqno == false) { + // If file_checksum_gen_factory is not set, we are not able to generate + // the checksum. if write_global_seqno is false, it means we will use + // file checksum generated during Prepare(). This step will be skipped. 
+ return IOStatus::OK(); + } + std::string file_checksum; + std::string file_checksum_func_name; + std::string requested_checksum_func_name; + IOStatus io_s = GenerateOneFileChecksum( + fs_.get(), file_to_ingest->internal_file_path, + db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, + &file_checksum, &file_checksum_func_name, + ingestion_options_.verify_checksums_readahead_size, + db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get()); + if (!io_s.ok()) { + return io_s; + } + file_to_ingest->file_checksum = file_checksum; + file_to_ingest->file_checksum_func_name = file_checksum_func_name; + return IOStatus::OK(); +} + bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( const IngestedFileInfo* file_to_ingest, int level) { if (level == 0) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,20 @@ #include #include "db/column_family.h" -#include "db/dbformat.h" #include "db/internal_stats.h" #include "db/snapshot_impl.h" +#include "env/file_system_tracer.h" #include "logging/event_logger.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { class Directories; +class SystemClock; struct IngestedFileInfo { // External file path @@ -63,18 +64,25 @@ // ingestion_options.move_files is false by default, thus copy_file is true // by default. 
bool copy_file = true; + // The checksum of ingested file + std::string file_checksum; + // The name of checksum function that generate the checksum + std::string file_checksum_func_name; + // The temperature of the file to be ingested + Temperature file_temperature = Temperature::kUnknown; }; class ExternalSstFileIngestionJob { public: ExternalSstFileIngestionJob( - Env* env, VersionSet* versions, ColumnFamilyData* cfd, + VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, const IngestExternalFileOptions& ingestion_options, - Directories* directories, EventLogger* event_logger) - : env_(env), - fs_(db_options.fs.get()), + Directories* directories, EventLogger* event_logger, + const std::shared_ptr& io_tracer) + : clock_(db_options.clock), + fs_(db_options.fs, io_tracer), versions_(versions), cfd_(cfd), db_options_(db_options), @@ -83,14 +91,18 @@ ingestion_options_(ingestion_options), directories_(directories), event_logger_(event_logger), - job_start_time_(env_->NowMicros()), - consumed_seqno_count_(0) { + job_start_time_(clock_->NowMicros()), + consumed_seqno_count_(0), + io_tracer_(io_tracer) { assert(directories != nullptr); } // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, - uint64_t next_file_number, SuperVersion* sv); + const std::vector& files_checksums, + const std::vector& files_checksum_func_names, + const Temperature& file_temperature, uint64_t next_file_number, + SuperVersion* sv); // Check if we need to flush the memtable before running the ingestion job // This will be true if the files we are ingesting are overlapping with any @@ -126,10 +138,11 @@ // Open the external file and populate `file_to_ingest` with all the // external information we need to ingest this file. 
Status GetIngestedFileInfo(const std::string& external_file, + uint64_t new_file_number, IngestedFileInfo* file_to_ingest, SuperVersion* sv); - // Assign `file_to_ingest` the appropriate sequence number and the lowest + // Assign `file_to_ingest` the appropriate sequence number and the lowest // possible level that it can be ingested to according to compaction_style. // REQUIRES: Mutex held Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv, @@ -148,6 +161,8 @@ // Set the file global sequence number to `seqno` Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest, SequenceNumber seqno); + // Generate the file checksum and store in the IngestedFileInfo + IOStatus GenerateChecksumForIngestedFile(IngestedFileInfo* file_to_ingest); // Check if `file_to_ingest` can fit in level `level` // REQUIRES: Mutex held @@ -158,8 +173,8 @@ template Status SyncIngestedFile(TWritableFile* file); - Env* env_; - FileSystem* fs_; + SystemClock* clock_; + FileSystemPtr fs_; VersionSet* versions_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; @@ -175,6 +190,10 @@ // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are // ingested in L0 bool files_overlap_{false}; + // Set in ExternalSstFileIngestionJob::Prepare(), if true and DB + // file_checksum_gen_factory is set, DB will generate checksum each file. 
+ bool need_generate_file_checksum_{true}; + std::shared_ptr io_tracer_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,13 +6,19 @@ #ifndef ROCKSDB_LITE #include + #include "db/db_test_util.h" +#include "db/dbformat.h" #include "file/filename.h" +#include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/testutil.h" +#include "util/random.h" +#include "util/thread_guard.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { @@ -21,6 +27,8 @@ public: ExternalSSTTestEnv(Env* t, bool fail_link) : EnvWrapper(t), fail_link_(fail_link) {} + static const char* kClassName() { return "ExternalSSTTestEnv"; } + const char* Name() const override { return kClassName(); } Status LinkFile(const std::string& s, const std::string& t) override { if (fail_link_) { @@ -35,16 +43,33 @@ bool fail_link_; }; +class ExternalSSTFileTestBase : public DBTestBase { + public: + ExternalSSTFileTestBase() + : DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) { + sst_files_dir_ = dbname_ + "/sst_files/"; + DestroyAndRecreateExternalSSTFilesDir(); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); + } + + ~ExternalSSTFileTestBase() override { + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); + } + + protected: + std::string sst_files_dir_; +}; + class ExternSSTFileLinkFailFallbackTest - : public DBTestBase, + : public 
ExternalSSTFileTestBase, public ::testing::WithParamInterface> { public: ExternSSTFileLinkFailFallbackTest() - : DBTestBase("/external_sst_file_test"), - test_env_(new ExternalSSTTestEnv(env_, true)) { - sst_files_dir_ = dbname_ + "/sst_files/"; - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + : test_env_(new ExternalSSTTestEnv(env_, true)) { options_ = CurrentOptions(); options_.disable_auto_compactions = true; options_.env = test_env_; @@ -59,24 +84,15 @@ } protected: - std::string sst_files_dir_; Options options_; ExternalSSTTestEnv* test_env_; }; class ExternalSSTFileTest - : public DBTestBase, + : public ExternalSSTFileTestBase, public ::testing::WithParamInterface> { public: - ExternalSSTFileTest() : DBTestBase("/external_sst_file_test") { - sst_files_dir_ = dbname_ + "/sst_files/"; - DestroyAndRecreateExternalSSTFilesDir(); - } - - void DestroyAndRecreateExternalSSTFilesDir() { - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); - } + ExternalSSTFileTest() {} Status GenerateOneExternalFile( const Options& options, ColumnFamilyHandle* cfh, @@ -111,7 +127,7 @@ for (const auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -166,7 +182,7 @@ for (auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -208,11 +224,10 @@ size_t num_cfs = column_families.size(); assert(ifos.size() == num_cfs); assert(data.size() == num_cfs); - Status s; std::vector args(num_cfs); for (size_t i = 0; i != num_cfs; ++i) { std::string external_file_path; - s = GenerateOneExternalFile( + Status s = GenerateOneExternalFile( options, column_families[i], data[i], file_id, sort_data, &external_file_path, true_data.size() == num_cfs ? 
&true_data[i] : nullptr); @@ -225,8 +240,7 @@ args[i].external_files.push_back(external_file_path); args[i].options = ifos[i]; } - s = db_->IngestExternalFiles(args); - return s; + return db_->IngestExternalFiles(args); } Status GenerateAndAddExternalFile( @@ -277,11 +291,8 @@ return db_->IngestExternalFile(files, opts); } - ~ExternalSSTFileTest() override { test::DestroyDir(env_, sst_files_dir_); } - protected: int last_file_id_ = 0; - std::string sst_files_dir_; }; TEST_F(ExternalSSTFileTest, Basic) { @@ -300,8 +311,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); // Current file size should be non-zero after success write. ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -314,8 +324,7 @@ ASSERT_EQ(file1_info.smallest_range_del_key, ""); ASSERT_EQ(file1_info.largest_range_del_key, ""); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -324,11 +333,9 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -342,9 +349,8 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); + 
ASSERT_OK(sst_file_writer.Finish(&file3_info)); - ASSERT_TRUE(s.ok()) << s.ToString(); // Current file size should be non-zero after success finish. ASSERT_GT(sst_file_writer.FileSize(), 0); ASSERT_EQ(file3_info.file_path, file3); @@ -360,8 +366,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -374,8 +379,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(400)); @@ -384,10 +388,9 @@ // file6.sst (delete 400 => 500) std::string file6 = sst_files_dir_ + "file6.sst"; ASSERT_OK(sst_file_writer.Open(file6)); - sst_file_writer.DeleteRange(Key(400), Key(500)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -399,17 +402,16 @@ // file7.sst (delete 500 => 570, put 520 => 599 divisible by 2) std::string file7 = sst_files_dir_ + "file7.sst"; ASSERT_OK(sst_file_writer.Open(file7)); - sst_file_writer.DeleteRange(Key(500), Key(550)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550))); for (int k = 520; k < 560; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - sst_file_writer.DeleteRange(Key(525), Key(575)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(525), 
Key(575))); for (int k = 560; k < 600; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); ASSERT_EQ(file7_info.num_entries, 40); ASSERT_EQ(file7_info.smallest_key, Key(520)); @@ -421,10 +423,9 @@ // file8.sst (delete 600 => 700) std::string file8 = sst_files_dir_ + "file8.sst"; ASSERT_OK(sst_file_writer.Open(file8)); - sst_file_writer.DeleteRange(Key(600), Key(700)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700))); ExternalSstFileInfo file8_info; - s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file8_info)); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -436,13 +437,11 @@ // Cannot create an empty sst file std::string file_empty = sst_files_dir_ + "file_empty.sst"; ExternalSstFileInfo file_empty_info; - s = sst_file_writer.Finish(&file_empty_info); - ASSERT_NOK(s); + ASSERT_NOK(sst_file_writer.Finish(&file_empty_info)); DestroyAndReopen(options); // Add file using file path - s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile({file1})); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -463,12 +462,10 @@ } // This file has overlapping values with the existing data - s = DeprecatedAddFile({file3}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file3})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file4}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file4})); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -476,17 +473,16 @@ } 
ASSERT_NE(db_->GetLatestSequenceNumber(), 0U); - // Key range of file5 (400 => 499) dont overlap with any keys in DB + // Key range of file5 (400 => 499) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file5})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file6}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file6})); - // Key range of file7 (500 => 598) dont overlap with any keys in DB + // Key range of file7 (500 => 598) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file7})); - // Key range of file7 (600 => 700) dont overlap with any keys in DB + // Key range of file7 (600 => 700) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file8})); // Make sure values are correct before and after flush/compaction @@ -609,15 +605,13 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); ASSERT_EQ(file1_info.largest_key, Key(99)); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -626,11 +620,9 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); 
ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -644,8 +636,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 5); ASSERT_EQ(file3_info.smallest_key, Key(195)); @@ -659,8 +650,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -673,8 +663,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(200)); @@ -686,8 +675,7 @@ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75))); ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -701,8 +689,7 @@ ASSERT_OK(sst_file_writer.Open(file7)); ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201))); ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); 
ASSERT_EQ(file7_info.num_entries, 0); ASSERT_EQ(file7_info.smallest_key, ""); @@ -722,17 +709,13 @@ DestroyAndReopen(options); // These lists of files have key ranges that overlap with each other - s = DeprecatedAddFile(file_list1); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list1)); // Both of the following overlap on the range deletion tombstone. - s = DeprecatedAddFile(file_list4); - ASSERT_FALSE(s.ok()) << s.ToString(); - s = DeprecatedAddFile(file_list5); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list4)); + ASSERT_NOK(DeprecatedAddFile(file_list5)); // Add files using file path list - s = DeprecatedAddFile(file_list0); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile(file_list0)); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 200; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -773,8 +756,7 @@ } // This file list has overlapping values with the existing data - s = DeprecatedAddFile(file_list3); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list3)); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -842,16 +824,14 @@ for (int k = i * 100; k < (i + 1) * 100; k++) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - Status s = sst_file_writer.Finish(&files_info[i]); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&files_info[i])); ASSERT_EQ(files_info[i].file_path, files[i]); ASSERT_EQ(files_info[i].num_entries, 100); ASSERT_EQ(files_info[i].smallest_key, Key(i * 100)); ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1)); } files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst"); - auto s = DeprecatedAddFile(files); - ASSERT_NOK(s) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(files)); for (int k = 0; k < n * 100; k++) { ASSERT_EQ("NOT_FOUND", Get(Key(k))); } @@ -873,17 +853,14 @@ // file1.sst (0 => 500) std::string sst_file_path = 
sst_files_dir_ + "file1.sst"; - Status s = sst_file_writer.Open(sst_file_path); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Open(sst_file_path)); for (int i = 0; i < 500; i++) { std::string k = Key(i); - s = sst_file_writer.Put(k, k + "_val"); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put(k, k + "_val")); } ExternalSstFileInfo sst_file_info; - s = sst_file_writer.Finish(&sst_file_info); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Finish(&sst_file_info)); options.delete_obsolete_files_period_micros = 0; options.disable_auto_compactions = true; @@ -895,12 +872,11 @@ ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", "xxx")); ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - s = DeprecatedAddFile({sst_file_path}); - ASSERT_OK(s); + ASSERT_OK(DeprecatedAddFile({sst_file_path})); for (int i = 0; i < 500; i++) { std::string k = Key(i); @@ -923,8 +899,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -937,8 +912,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -967,8 +941,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); 
ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 100); ASSERT_EQ(file3_info.smallest_key, Key(300)); @@ -985,6 +958,7 @@ } TEST_F(ExternalSSTFileTest, MultiThreaded) { + env_->skip_fsync_ = true; // Bulk load 10 files every file contain 1000 keys int num_files = 10; int keys_per_file = 1000; @@ -1013,8 +987,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k))); } - Status s = sst_file_writer.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish()); }; // Write num_files files in parallel std::vector sst_writer_threads; @@ -1076,8 +1049,7 @@ // Overwrite values of keys divisible by 100 for (int k = 0; k < num_files * keys_per_file; k += 100) { std::string key = Key(k); - Status s = Put(key, key + "_new"); - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(key, key + "_new")); } for (int i = 0; i < 2; i++) { @@ -1097,6 +1069,7 @@ } TEST_F(ExternalSSTFileTest, OverlappingRanges) { + env_->skip_fsync_ = true; Random rnd(301); SequenceNumber assigned_seqno = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -1120,6 +1093,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); do { Options options = CurrentOptions(); + env_->skip_fsync_ = true; DestroyAndReopen(options); SstFileWriter sst_file_writer(EnvOptions(), options); @@ -1159,7 +1133,8 @@ // Generate the file containing the range std::string file_name = sst_files_dir_ + env_->GenerateUniqueId(); - ASSERT_OK(sst_file_writer.Open(file_name)); + s = sst_file_writer.Open(file_name); + ASSERT_OK(s); for (int k = range_start; k <= range_end; k++) { s = sst_file_writer.Put(Key(k), range_val); ASSERT_OK(s); @@ -1204,10 +1179,10 @@ // Flush / Compact the DB if (i && i % 50 == 0) { - Flush(); + ASSERT_OK(Flush()); } if (i && i % 75 == 0) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } } @@ -1228,6 +1203,7 @@ } TEST_P(ExternalSSTFileTest, PickedLevel) { + 
env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 4; @@ -1284,7 +1260,7 @@ // Hold compaction from finishing TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,1,1,2"); size_t kcnt = 0; @@ -1294,6 +1270,7 @@ } TEST_F(ExternalSSTFileTest, PickedLevelBug) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 3; @@ -1319,8 +1296,11 @@ // We have 2 overlapping files in L0 EXPECT_EQ(FilesPerLevel(), "2"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::PickedLevelBug:0"}, + {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", + "ExternalSSTFileTest::PickedLevelBug:0"}, {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"}, {"ExternalSSTFileTest::PickedLevelBug:2", "DBImpl::RunManualCompaction:0"}, @@ -1334,37 +1314,47 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - // While writing the MANIFEST start a thread that will ask for compaction - ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() { - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - }); - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); - - // Start a thread that will ingest a new file - ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() { - file_keys = {1, 2, 3}; - ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, 1)); - }); + Status bg_compact_status; + Status bg_addfile_status; - // Wait for AddFile to start picking levels and writing MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); - - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); - - // We need to verify that no compactions 
can run while AddFile is - // ingesting the files into the levels it find suitable. So we will - // wait for 2 seconds to give a chance for compactions to run during - // this period, and then make sure that no compactions where able to run - env_->SleepForMicroseconds(1000000 * 2); - ASSERT_FALSE(bg_compact_started.load()); - - // Hold AddFile from finishing writing the MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); + { + // While writing the MANIFEST start a thread that will ask for compaction + ThreadGuard bg_compact(port::Thread([&]() { + bg_compact_status = + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + })); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); + + // Start a thread that will ingest a new file + ThreadGuard bg_addfile(port::Thread([&]() { + file_keys = {1, 2, 3}; + bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1); + })); + + // Wait for AddFile to start picking levels and writing MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); + + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); + + // We need to verify that no compactions can run while AddFile is + // ingesting the files into the levels it find suitable. So we will + // wait for 2 seconds to give a chance for compactions to run during + // this period, and then make sure that no compactions where able to run + env_->SleepForMicroseconds(1000000 * 2); + bool bg_compact_started_tmp = bg_compact_started.load(); + + // Hold AddFile from finishing writing the MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); + + // check the status at the end, so even if the ASSERT fails the threads + // could be joined and return. 
+ ASSERT_FALSE(bg_compact_started_tmp); + } - bg_addfile.join(); - bg_compact.join(); + ASSERT_OK(bg_addfile_status); + ASSERT_OK(bg_compact_status); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); int total_keys = 0; Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1401,7 +1391,7 @@ // After full compaction, there should be only 1 file. std::vector files; - env_->GetChildren(dbname_, &files); + ASSERT_OK(env_->GetChildren(dbname_, &files)); int num_sst_files = 0; for (auto& f : files) { uint64_t number; @@ -1413,7 +1403,9 @@ ASSERT_EQ(1, num_sst_files); } +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 2; @@ -1469,8 +1461,10 @@ } } } +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 4; @@ -1521,7 +1515,7 @@ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2"); // Output of the compaction will go to L3 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,0,0,2"); Close(); @@ -1609,15 +1603,15 @@ generated_files[7]}; ASSERT_NOK(DeprecatedAddFile(in_files)); - // These 2 files dont overlap with each other + // These 2 files don't overlap with each other in_files = {generated_files[0], generated_files[2]}; ASSERT_OK(DeprecatedAddFile(in_files)); - // These 2 files dont overlap with each other but overlap with keys in DB + // These 2 files don't overlap with each other but overlap with keys in DB in_files = {generated_files[3], generated_files[7]}; ASSERT_NOK(DeprecatedAddFile(in_files)); - // Files dont 
overlap and dont overlap with DB key range + // Files don't overlap and don't overlap with DB key range in_files = {generated_files[4], generated_files[6], generated_files[8]}; ASSERT_OK(DeprecatedAddFile(in_files)); @@ -1663,7 +1657,7 @@ cro.exclusive_manual_compaction = false; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -1715,9 +1709,9 @@ Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("bar", "v2"); }); + port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); }); TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"); ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1, @@ -1729,7 +1723,9 @@ SyncPoint::GetInstance()->ClearAllCallBacks(); } +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.IncreaseParallelism(20); options.level0_slowdown_writes_trigger = 256; @@ -1746,10 +1742,8 @@ for (int i = 0; i < 500; i++) { std::vector> random_data; for (int j = 0; j < 100; j++) { - std::string k; - std::string v; - test::RandomString(&rnd, rnd.Next() % 20, &k); - test::RandomString(&rnd, rnd.Next() % 50, &v); + std::string k = rnd.RandomString(rnd.Next() % 20); + std::string v = rnd.RandomString(rnd.Next() % 50); random_data.emplace_back(k, v); } @@ -1767,10 +1761,11 @@ } size_t kcnt = 0; VerifyDBFromMap(true_data, &kcnt, false); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &kcnt, false); } } +#endif // !defined(ROCKSDB_VALGRIND_RUN) 
|| defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { Options options = CurrentOptions(); @@ -1797,7 +1792,7 @@ options, file_data, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - // This file dont overlap with anything in the DB, will go to L4 + // This file don't overlap with anything in the DB, will go to L4 ASSERT_EQ("0,0,0,0,1", FilesPerLevel()); // Insert 80 -> 130 using AddFile @@ -1822,7 +1817,7 @@ options, file_data, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - // This file dont overlap with anything in the DB and fit in L4 as well + // This file don't overlap with anything in the DB and fit in L4 as well ASSERT_EQ("2,0,0,0,2", FilesPerLevel()); // Insert 10 -> 40 using AddFile @@ -1851,8 +1846,8 @@ ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); bool write_global_seqno = std::get<0>(GetParam()); @@ -1861,40 +1856,40 @@ ASSERT_OK(GenerateAndAddExternalFile( options, {90, 100, 110}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {19, 20, 21}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); 
ASSERT_EQ(entries_in_memtable, 0); for (int k : {200, 201, 205, 206}) { ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // No need for flush, this file keys fit between the memtable keys ASSERT_OK(GenerateAndAddExternalFile( options, {202, 203, 204}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {206, 207}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_EQ(entries_in_memtable, 0); size_t kcnt = 0; @@ -2001,7 +1996,8 @@ if (running_threads.load() == 0) { break; } - env_->SleepForMicroseconds(500000); + // Make sure we do a "real sleep", not a mock one. 
+ SystemClock::Default()->SleepForMicroseconds(500000); } ASSERT_EQ(running_threads.load(), 0); @@ -2059,16 +2055,16 @@ IngestExternalFileOptions ifo; - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo)); - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo)); // SST CF match ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo)); - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo)); - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo)); // SST CF match ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo)); @@ -2292,7 +2288,7 @@ ASSERT_OK(Put(Key(i), "memtable")); true_data[Key(i)] = "memtable"; } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Universal picker should go at second from the bottom level ASSERT_EQ("0,1", FilesPerLevel()); ASSERT_OK(GenerateAndAddExternalFile( @@ -2306,7 +2302,7 @@ verify_checksums_before_ingest, true /*ingest_behind*/, false /*sort_data*/, &true_data)); ASSERT_EQ("0,1,1", FilesPerLevel()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // bottom level should be empty ASSERT_EQ("0,1", FilesPerLevel()); @@ -2383,20 +2379,67 @@ Random rnd(301); std::vector> random_data; for (int i = 0; i < kNumEntries; i++) { - std::string val; - test::RandomString(&rnd, kNumBytesPerEntry, &val); + std::string val = rnd.RandomString(kNumBytesPerEntry); random_data.emplace_back(Key(i), std::move(val)); } ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data))); ASSERT_EQ(1, num_compression_dicts); } +class ExternalSSTBlockChecksumTest + : public ExternalSSTFileTestBase, + public 
testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P(FormatVersions, ExternalSSTBlockChecksumTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); + +// Very slow, not worth the cost to run regularly +TEST_P(ExternalSSTBlockChecksumTest, DISABLED_HugeBlockChecksum) { + BlockBasedTableOptions table_options; + table_options.format_version = GetParam(); + for (auto t : GetSupportedChecksums()) { + table_options.checksum = t; + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // 2^32 - 1, will lead to data block with more than 2^32 bytes + size_t huge_size = port::kMaxUint32; + + std::string f = sst_files_dir_ + "f.sst"; + ASSERT_OK(sst_file_writer.Open(f)); + { + Random64 r(123); + std::string huge(huge_size, 0); + for (size_t j = 0; j + 7 < huge_size; j += 8) { + EncodeFixed64(&huge[j], r.Next()); + } + ASSERT_OK(sst_file_writer.Put("Huge", huge)); + } + + ExternalSstFileInfo f_info; + ASSERT_OK(sst_file_writer.Finish(&f_info)); + ASSERT_GT(f_info.file_size, uint64_t{huge_size} + 10); + + SstFileReader sst_file_reader(options); + ASSERT_OK(sst_file_reader.Open(f)); + ASSERT_OK(sst_file_reader.VerifyChecksum()); + } +} + TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { std::unique_ptr fault_injection_env( new FaultInjectionTestEnv(env_)); Options options = CurrentOptions(); options.env = fault_injection_env.get(); CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + // Exercise different situations in different column families: two are empty + // (so no new sequence number is needed), but at least one overlaps with the + // DB and needs to bump the sequence number. 
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "oldvalue")); + std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); @@ -2420,9 +2463,8 @@ // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_OK(s); + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, options); @@ -2603,9 +2645,8 @@ std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" @@ -2673,9 +2714,8 @@ std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" @@ -2748,9 +2788,8 @@ std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" @@ -2761,7 +2800,7 @@ "PartialManifestWriteFail:1"); ingest_thread.join(); - 
fault_injection_env->DropUnsyncedFileData(); + ASSERT_OK(fault_injection_env->DropUnsyncedFileData()); fault_injection_env->SetFilesystemActive(true); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, @@ -2796,7 +2835,103 @@ // sure that it won't enter the 2nd writer queue for the second time. std::vector> data; data.push_back(std::make_pair("1001", "v2")); - GenerateAndAddExternalFile(options, data); + ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true)); +} + +TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + constexpr size_t kValueSize = 8; + Random rnd(301); + std::string value = rnd.RandomString(kValueSize); + + // Write some key to make global seqno larger than zero + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("ab" + Key(i), value)); + } + // Get a Snapshot to make RocksDB assign global seqno to ingested sst files. + auto snap = dbfull()->GetSnapshot(); + + std::string fname = sst_files_dir_ + "test_file"; + ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options); + ASSERT_OK(writer.Open(fname)); + std::string key1 = "ab"; + std::string key2 = "ab"; + + // Make the prefix of key2 is same with key1 add zero seqno. 
The tail of every + // key is composed as (seqno << 8 | value_type), and here `1` represents + // ValueType::kTypeValue + + PutFixed64(&key2, PackSequenceAndType(0, kTypeValue)); + key2 += "cdefghijkl"; + + ASSERT_OK(writer.Put(key1, value)); + ASSERT_OK(writer.Put(key2, value)); + + ExternalSstFileInfo info; + ASSERT_OK(writer.Finish(&info)); + + ASSERT_OK(dbfull()->IngestExternalFile({info.file_path}, + IngestExternalFileOptions())); + dbfull()->ReleaseSnapshot(snap); + ASSERT_EQ(value, Get(key1)); + // You will get error here + ASSERT_EQ(value, Get(key2)); +} + +TEST_P(ExternalSSTFileTest, + DeltaEncodingWhileGlobalSeqnoPresentIteratorSwitch) { + // Regression test for bug where global seqno corrupted the shared bytes + // buffer when switching from reverse iteration to forward iteration. + constexpr size_t kValueSize = 8; + Options options = CurrentOptions(); + + Random rnd(301); + std::string value = rnd.RandomString(kValueSize); + + std::string key0 = "aa"; + std::string key1 = "ab"; + // Make the prefix of key2 is same with key1 add zero seqno. The tail of every + // key is composed as (seqno << 8 | value_type), and here `1` represents + // ValueType::kTypeValue + std::string key2 = "ab"; + PutFixed64(&key2, PackSequenceAndType(0, kTypeValue)); + key2 += "cdefghijkl"; + std::string key3 = key2 + "_"; + + // Write some key to make global seqno larger than zero + ASSERT_OK(Put(key0, value)); + + std::string fname = sst_files_dir_ + "test_file"; + ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options); + ASSERT_OK(writer.Open(fname)); + + // key0 is a dummy to ensure the turnaround point (key1) comes from Prev + // cache rather than block (restart keys are pinned in block). 
+ ASSERT_OK(writer.Put(key0, value)); + ASSERT_OK(writer.Put(key1, value)); + ASSERT_OK(writer.Put(key2, value)); + ASSERT_OK(writer.Put(key3, value)); + + ExternalSstFileInfo info; + ASSERT_OK(writer.Finish(&info)); + + ASSERT_OK(dbfull()->IngestExternalFile({info.file_path}, + IngestExternalFileOptions())); + ReadOptions read_opts; + // Prevents Seek() when switching directions, which circumvents the bug. + read_opts.total_order_seek = true; + Iterator* iter = db_->NewIterator(read_opts); + // Scan backwards to key2. File iterator will then be positioned at key1. + iter->Seek(key3); + ASSERT_EQ(key3, iter->key()); + iter->Prev(); + ASSERT_EQ(key2, iter->key()); + // Scan forwards and make sure key3 is present. Previously key3 would be + // corrupted by the global seqno from key1. + iter->Next(); + ASSERT_EQ(key3, iter->key()); + delete iter; } INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/fault_injection_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/fault_injection_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/fault_injection_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/fault_injection_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,17 +16,21 @@ #include "db/version_set.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/mutexlock.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif namespace ROCKSDB_NAMESPACE { @@ -57,7 +61,6 @@ bool sequential_order_; - protected: public: enum 
ExpectedVerifResult { kValExpectFound, kValExpectNoError }; enum ResetMethod { @@ -81,7 +84,11 @@ sync_use_compact_(true), base_env_(nullptr), env_(nullptr), - db_(nullptr) {} + db_(nullptr) { + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &system_env_, &env_guard_)); + EXPECT_NE(system_env_, nullptr); + } ~FaultInjectionTest() override { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -94,7 +101,7 @@ return false; } else { if (option_config_ == kMultiLevels) { - base_env_.reset(new MockEnv(Env::Default())); + base_env_.reset(MockEnv::Create(system_env_)); } return true; } @@ -146,8 +153,7 @@ assert(tiny_cache_ == nullptr); assert(env_ == nullptr); - env_ = - new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default()); + env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : system_env_); options_ = CurrentOptions(); options_.env = env_; @@ -192,7 +198,7 @@ for (int i = start_idx; i < start_idx + num_vals; i++) { Slice key = Key(i, &key_space); batch.Clear(); - batch.Put(key, Value(i, &value_space)); + ASSERT_OK(batch.Put(key, Value(i, &value_space))); ASSERT_OK(db_->Write(write_options, &batch)); } } @@ -249,7 +255,8 @@ // Return the value to associate with the specified key Slice Value(int k, std::string* storage) const { Random r(k); - return test::RandomString(&r, kValueSize, storage); + *storage = r.RandomString(kValueSize); + return Slice(*storage); } void CloseDB() { @@ -271,12 +278,12 @@ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); } - + ASSERT_OK(iter->status()); delete iter; FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); } // rnd cannot be null for kResetDropRandomUnsyncedData @@ -309,7 +316,7 @@ Build(write_options, 0, num_pre_sync); if (sync_use_compact_) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + 
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } write_options.sync = false; Build(write_options, num_pre_sync, num_post_sync); @@ -341,9 +348,13 @@ } void WaitCompactionFinish() { - static_cast(db_->GetRootDB())->TEST_WaitForCompact(); + ASSERT_OK(static_cast(db_->GetRootDB())->TEST_WaitForCompact()); ASSERT_OK(db_->Put(WriteOptions(), "", "")); } + + private: + Env* system_env_; + std::shared_ptr env_guard_; }; class FaultInjectionTestSplitted : public FaultInjectionTest {}; @@ -408,7 +419,7 @@ write_options.sync = true; ASSERT_OK( db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(false)); env_->SetFilesystemActive(false); NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); @@ -449,7 +460,7 @@ Build(WriteOptions(), 0, kNumKeys); FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); ASSERT_OK(db_->Put(WriteOptions(), "", "")); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0"); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1"); @@ -520,9 +531,9 @@ wo.sync = true; wo.disableWAL = false; WriteBatch batch; - batch.Put("cats", "dogs"); + ASSERT_OK(batch.Put("cats", "dogs")); batch.MarkWalTerminationPoint(); - batch.Put("boys", "girls"); + ASSERT_OK(batch.Put("boys", "girls")); ASSERT_OK(db_->Write(wo, &batch)); env_->SetFilesystemActive(false); @@ -535,6 +546,76 @@ ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound()); } +TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) { + auto fault_fs = std::make_shared(FileSystem::Default()); + fault_fs->EnableWriteErrorInjection(); + fault_fs->SetFilesystemDirectWritable(false); + const std::string file_name = NormalizePath(dbname_ + "/test_file"); + std::unique_ptr log_writer = nullptr; + constexpr uint64_t log_number = 0; + { + std::unique_ptr file; + const Status s = + fault_fs->NewWritableFile(file_name, FileOptions(), &file, nullptr); + 
ASSERT_OK(s); + std::unique_ptr fwriter( + new WritableFileWriter(std::move(file), file_name, FileOptions())); + log_writer.reset(new log::Writer(std::move(fwriter), log_number, + /*recycle_log_files=*/false)); + } + + fault_fs->SetRandomWriteError( + 0xdeadbeef, /*one_in=*/1, IOStatus::IOError("Injected IOError"), + /*inject_for_all_file_types=*/true, /*types=*/{}); + + { + VersionEdit edit; + edit.SetColumnFamily(0); + std::string buf; + assert(edit.EncodeTo(&buf)); + const Status s = log_writer->AddRecord(buf); + ASSERT_NOK(s); + } + + fault_fs->DisableWriteErrorInjection(); + + // Closing the log writer will cause WritableFileWriter::Close() and flush + // remaining data from its buffer to underlying file. + log_writer.reset(); + + { + std::unique_ptr file; + Status s = + fault_fs->NewSequentialFile(file_name, FileOptions(), &file, nullptr); + ASSERT_OK(s); + std::unique_ptr freader( + new SequentialFileReader(std::move(file), file_name)); + Status log_read_s; + class LogReporter : public log::Reader::Reporter { + public: + Status* status_; + explicit LogReporter(Status* _s) : status_(_s) {} + void Corruption(size_t /*bytes*/, const Status& _s) override { + if (status_->ok()) { + *status_ = _s; + } + } + } reporter(&log_read_s); + std::unique_ptr log_reader(new log::Reader( + nullptr, std::move(freader), &reporter, /*checksum=*/true, log_number)); + Slice record; + std::string data; + size_t count = 0; + while (log_reader->ReadRecord(&record, &data) && log_read_s.ok()) { + VersionEdit edit; + ASSERT_OK(edit.DecodeFrom(data)); + ++count; + } + // Verify that only one version edit exists in the file. 
+ ASSERT_EQ(1, count); + } +} + INSTANTIATE_TEST_CASE_P( FaultTest, FaultInjectionTest, ::testing::Values(std::make_tuple(false, kDefault, kEnd), @@ -551,5 +632,6 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/filename_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/filename_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/filename_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/filename_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include "file/filename.h" #include "db/dbformat.h" -#include "logging/logging.h" #include "port/port.h" #include "test_util/testharness.h" @@ -35,23 +34,23 @@ FileType type; char mode; } cases[] = { - {"100.log", 100, kLogFile, kAllMode}, - {"0.log", 0, kLogFile, kAllMode}, - {"0.sst", 0, kTableFile, kAllMode}, - {"CURRENT", 0, kCurrentFile, kAllMode}, - {"LOCK", 0, kDBLockFile, kAllMode}, - {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, - {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, - {"METADB-2", 2, kMetaDatabase, kAllMode}, - {"METADB-7", 7, kMetaDatabase, kAllMode}, - {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, - {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, - {"18446744073709551615.log", 18446744073709551615ull, kLogFile, - kAllMode}, }; + {"100.log", 100, kWalFile, kAllMode}, + {"0.log", 0, kWalFile, kAllMode}, + {"0.sst", 0, kTableFile, kAllMode}, + {"CURRENT", 0, kCurrentFile, kAllMode}, + {"LOCK", 0, kDBLockFile, kAllMode}, + {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, + {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, + {"METADB-2", 2, kMetaDatabase, 
kAllMode}, + {"METADB-7", 7, kMetaDatabase, kAllMode}, + {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, + {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, + {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode}, + }; for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) { for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir"); @@ -108,7 +107,7 @@ TEST_F(FileNameTest, InfoLogFileName) { std::string dbname = ("/data/rocksdb"); std::string db_absolute_path; - Env::Default()->GetAbsolutePath(dbname, &db_absolute_path); + ASSERT_OK(Env::Default()->GetAbsolutePath(dbname, &db_absolute_path)); ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, "")); ASSERT_EQ("/data/rocksdb/LOG.old.666", @@ -142,7 +141,7 @@ ASSERT_EQ("foo/", std::string(fname.data(), 4)); ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(192U, number); - ASSERT_EQ(kLogFile, type); + ASSERT_EQ(kWalFile, type); fname = TableFileName({DbPath("bar", 0)}, 200, 0); std::string fname1 = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -39,8 +39,6 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block_based/block.h" -#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include 
"table/two_level_iterator.h" @@ -77,28 +75,32 @@ return "Manual Flush"; case FlushReason::kErrorRecovery: return "Error Recovery"; + case FlushReason::kWalFull: + return "WAL Full"; default: return "Invalid"; } } -FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, - const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, - const uint64_t* max_memtable_id, - const FileOptions& file_options, VersionSet* versions, - InstrumentedMutex* db_mutex, - std::atomic* shutting_down, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, JobContext* job_context, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_file_directory, - CompressionType output_compression, Statistics* stats, - EventLogger* event_logger, bool measure_io_stats, - const bool sync_output_directory, const bool write_manifest, - Env::Priority thread_pri) +FlushJob::FlushJob( + const std::string& dbname, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, JobContext* job_context, + LogBuffer* log_buffer, FSDirectory* db_directory, + FSDirectory* output_file_directory, CompressionType output_compression, + Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) : dbname_(dbname), + db_id_(db_id), + db_session_id_(db_session_id), cfd_(cfd), db_options_(db_options), 
mutable_cf_options_(mutable_cf_options), @@ -123,13 +125,18 @@ edit_(nullptr), base_(nullptr), pick_memtable_called(false), - thread_pri_(thread_pri) { + thread_pri_(thread_pri), + io_tracer_(io_tracer), + clock_(db_options_.clock), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { // Update the thread status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); } FlushJob::~FlushJob() { + io_status_.PermitUncheckedError(); ThreadStatusUtil::ResetThreadStatus(); } @@ -159,7 +166,6 @@ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } - void FlushJob::PickMemTable() { db_mutex_->AssertHeld(); assert(!pick_memtable_called); @@ -190,8 +196,8 @@ base_->Ref(); // it is likely that we do not need this reference } -Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, - FileMetaData* file_meta) { +Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, + bool* switched_to_mempurge) { TEST_SYNC_POINT("FlushJob::Start"); db_mutex_->AssertHeld(); assert(pick_memtable_called); @@ -221,9 +227,43 @@ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); } - - // This will release and re-acquire the mutex. - Status s = WriteLevel0Table(); + Status mempurge_s = Status::NotFound("No MemPurge."); + if ((db_options_.experimental_mempurge_threshold > 0.0) && + (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) && + (!mems_.empty()) && MemPurgeDecider()) { + mempurge_s = MemPurge(); + if (!mempurge_s.ok()) { + // Mempurge is typically aborted when the output + // bytes cannot be contained onto a single output memtable. + if (mempurge_s.IsAborted()) { + ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n", + mempurge_s.ToString().c_str()); + } else { + // However the mempurge process can also fail for + // other reasons (eg: new_mem->Add() fails). 
+ ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n", + mempurge_s.ToString().c_str()); + } + } else { + if (switched_to_mempurge) { + *switched_to_mempurge = true; + } else { + // The mempurge process was successful, but no switch_to_mempurge + // pointer provided so no way to propagate the state of flush job. + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge process succeeded" + "but no 'switched_to_mempurge' ptr provided.\n"); + } + } + } + Status s; + if (mempurge_s.ok()) { + base_->Unref(); + s = Status::OK(); + } else { + // This will release and re-acquire the mutex. + s = WriteLevel0Table(); + } if (s.ok() && cfd_->IsDropped()) { s = Status::ColumnFamilyDropped("Column family dropped during compaction"); @@ -238,10 +278,17 @@ } else if (write_manifest_) { TEST_SYNC_POINT("FlushJob::InstallResults"); // Replace immutable memtable with the generated Table + IOStatus tmp_io_s; s = cfd_->imm()->TryInstallMemtableFlushResults( cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, - log_buffer_, &committed_flush_jobs_info_); + log_buffer_, &committed_flush_jobs_info_, &tmp_io_s, + !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted), + but 'false' if mempurge successful: no new min log number + or new level 0 file path to write to manifest. 
*/); + if (!tmp_io_s.ok()) { + io_status_ = tmp_io_s; + } } if (s.ok() && file_meta != nullptr) { @@ -262,6 +309,13 @@ stream << vstorage->NumLevelFiles(level); } stream.EndArray(); + + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + stream << "blob_file_head" << blob_files.begin()->first; + stream << "blob_file_tail" << blob_files.rbegin()->first; + } + stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed(); if (measure_io_stats_) { @@ -289,13 +343,457 @@ base_->Unref(); } +Status FlushJob::MemPurge() { + Status s; + db_mutex_->AssertHeld(); + db_mutex_->Unlock(); + assert(!mems_.empty()); + + // Measure purging time. + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); + + MemTable* new_mem = nullptr; + // For performance/log investigation purposes: + // look at how much useful payload we harvest in the new_mem. + // This value is then printed to the DB log. + double new_mem_capacity = 0.0; + + // Create two iterators, one for the memtable data (contains + // info from puts + deletes), and one for the memtable + // Range Tombstones (from DeleteRanges). + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + std::vector memtables; + std::vector> + range_del_iters; + for (MemTable* m : mems_) { + memtables.push_back(m->NewIterator(ro, &arena)); + auto* range_del_iter = m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + } + + assert(!memtables.empty()); + SequenceNumber first_seqno = kMaxSequenceNumber; + SequenceNumber earliest_seqno = kMaxSequenceNumber; + // Pick first and earliest seqno as min of all first_seqno + // and earliest_seqno of the mempurged memtables. + for (const auto& mem : mems_) { + first_seqno = mem->GetFirstSequenceNumber() < first_seqno + ? 
mem->GetFirstSequenceNumber() + : first_seqno; + earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno + ? mem->GetEarliestSequenceNumber() + : earliest_seqno; + } + + ScopedArenaIterator iter( + NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(), + static_cast(memtables.size()), &arena)); + + auto* ioptions = cfd_->ioptions(); + + // Place iterator at the First (meaning most recent) key node. + iter->SeekToFirst(); + + std::unique_ptr range_del_agg( + new CompactionRangeDelAggregator(&(cfd_->internal_comparator()), + existing_snapshots_)); + for (auto& rd_iter : range_del_iters) { + range_del_agg->AddTombstones(std::move(rd_iter)); + } + + // If there is valid data in the memtable, + // or at least range tombstones, copy over the info + // to the new memtable. + if (iter->Valid() || !range_del_agg->IsEmpty()) { + // MaxSize is the size of a memtable. + size_t maxSize = mutable_cf_options_.write_buffer_size; + std::unique_ptr compaction_filter; + if (ioptions->compaction_filter_factory != nullptr && + ioptions->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kFlush)) { + CompactionFilter::Context ctx; + ctx.is_full_compaction = false; + ctx.is_manual_compaction = false; + ctx.column_family_id = cfd_->GetID(); + ctx.reason = TableFileCreationReason::kFlush; + compaction_filter = + ioptions->compaction_filter_factory->CreateCompactionFilter(ctx); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return s; + } + } + + new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()), + mutable_cf_options_, cfd_->write_buffer_mgr(), + earliest_seqno, cfd_->GetID()); + assert(new_mem != nullptr); + + Env* env = db_options_.env; + assert(env); + MergeHelper merge( + env, (cfd_->internal_comparator()).user_comparator(), + (ioptions->merge_operator).get(), 
compaction_filter.get(), + ioptions->logger, true /* internal key corruption is not ok */, + existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), + snapshot_checker_); + CompactionIterator c_iter( + iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge, + kMaxSequenceNumber, &existing_snapshots_, + earliest_write_conflict_snapshot_, snapshot_checker_, env, + ShouldReportDetailedTime(env, ioptions->stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + nullptr, ioptions->allow_data_in_errors, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, + /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, ioptions->info_log, + &(cfd_->GetFullHistoryTsLow())); + + // Set earliest sequence number in the new memtable + // to be equal to the earliest sequence number of the + // memtable being flushed (See later if there is a need + // to update this number!). + new_mem->SetEarliestSequenceNumber(earliest_seqno); + // Likewise for first seq number. + new_mem->SetFirstSequenceNumber(first_seqno); + SequenceNumber new_first_seqno = kMaxSequenceNumber; + + c_iter.SeekToFirst(); + + // Key transfer + for (; c_iter.Valid(); c_iter.Next()) { + const ParsedInternalKey ikey = c_iter.ikey(); + const Slice value = c_iter.value(); + new_first_seqno = + ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno; + + // Should we update "OldestKeyTime" ???? -> timestamp appear + // to still be an "experimental" feature. + s = new_mem->Add( + ikey.sequence, ikey.type, ikey.user_key, value, + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. 
+ nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted("Mempurge filled more than one memtable."); + new_mem_capacity = 1.0; + break; + } + } + + // Check status and propagate + // potential error status from c_iter + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } + + // Range tombstone transfer. + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + new_first_seqno = + tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno; + s = new_mem->Add( + tombstone.seq_, // Sequence number + kTypeRangeDeletion, // KV type + tombstone.start_key_, // Key is start key. + tombstone.end_key_, // Value is end key. + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. + nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. + + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + break; + } + } + } + + // If everything happened smoothly and new_mem contains valid data, + // decide if it is flushed to storage or kept in the imm() + // memtable list (memory). 
+ if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) { + // Rectify the first sequence number, which (unlike the earliest seq + // number) needs to be present in the new memtable. + new_mem->SetFirstSequenceNumber(new_first_seqno); + + // The new_mem is added to the list of immutable memtables + // only if it filled at less than 100% capacity and isn't flagged + // as in need of being flushed. + if (new_mem->ApproximateMemoryUsage() < maxSize && + !(new_mem->ShouldFlushNow())) { + db_mutex_->Lock(); + uint64_t new_mem_id = mems_[0]->GetID(); + + new_mem->SetID(new_mem_id); + + // This addition will not trigger another flush, because + // we do not call SchedulePendingFlush(). + cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free); + new_mem->Ref(); +#ifndef ROCKSDB_LITE + // Piggyback FlushJobInfo on the first flushed memtable. + db_mutex_->AssertHeld(); + meta_.fd.file_size = 0; + mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); +#endif // !ROCKSDB_LITE + db_mutex_->Unlock(); + } else { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + if (new_mem) { + job_context_->memtables_to_free.push_back(new_mem); + } + } + } else { + // In this case, the newly allocated new_mem is empty. + assert(new_mem != nullptr); + job_context_->memtables_to_free.push_back(new_mem); + } + } + + // Reacquire the mutex for WriteLevel0 function. + db_mutex_->Lock(); + + // If mempurge successful, don't write input tables to level0, + // but write any full output table to level0. + if (s.ok()) { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful"); + } else { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful"); + } + const uint64_t micros = clock_->NowMicros() - start_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Mempurge lasted %" PRIu64 + " microseconds, and %" PRIu64 + " cpu " + "microseconds. Status is %s ok. 
Perc capacity: %f\n", + cfd_->GetName().c_str(), job_context_->job_id, micros, + cpu_micros, s.ok() ? "" : "not", new_mem_capacity); + + return s; +} + +bool FlushJob::MemPurgeDecider() { + double threshold = db_options_.experimental_mempurge_threshold; + // Never trigger mempurge if threshold is not a strictly positive value. + if (!(threshold > 0.0)) { + return false; + } + if (threshold > (1.0 * mems_.size())) { + return true; + } + // Payload and useful_payload (in bytes). + // The useful payload ratio of a given MemTable + // is estimated to be useful_payload/payload. + uint64_t payload = 0, useful_payload = 0, entry_size = 0; + + // Local variables used repetitively inside the for-loop + // when iterating over the sampled entries. + Slice key_slice, value_slice; + ParsedInternalKey res; + SnapshotImpl min_snapshot; + std::string vget; + Status mget_s, parse_s; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0, sqno = 0, + min_seqno_snapshot = 0; + bool get_res, can_be_useful_payload, not_in_next_mems; + + // If estimated_useful_payload is > threshold, + // then flush to storage, else MemPurge. + double estimated_useful_payload = 0.0; + // Cochran formula for determining sample size. + // 95% confidence interval, 7% precision. + // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 + double n0 = 196.0; + ReadOptions ro; + ro.total_order_seek = true; + + // Iterate over each memtable of the set. + for (auto mem_iter = std::begin(mems_); mem_iter != std::end(mems_); + mem_iter++) { + MemTable* mt = *mem_iter; + + // Else sample from the table. + uint64_t nentries = mt->num_entries(); + // Corrected Cochran formula for small populations + // (converges to n0 for large populations). + uint64_t target_sample_size = + static_cast(ceil(n0 / (1.0 + (n0 / nentries)))); + std::unordered_set sentries = {}; + // Populate sample entries set. 
+ mt->UniqueRandomSample(target_sample_size, &sentries); + + // Estimate the garbage ratio by comparing if + // each sample corresponds to a valid entry. + for (const char* ss : sentries) { + key_slice = GetLengthPrefixedSlice(ss); + parse_s = ParseInternalKey(key_slice, &res, true /*log_err_key*/); + if (!parse_s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Memtable Decider: ParseInternalKey did not parse " + "key_slice %s successfully.", + key_slice.data()); + } + + // Size of the entry is "key size (+ value size if KV entry)" + entry_size = key_slice.size(); + if (res.type == kTypeValue) { + value_slice = + GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + entry_size += value_slice.size(); + } + + // Count entry bytes as payload. + payload += entry_size; + + LookupKey lkey(res.user_key, kMaxSequenceNumber); + + // Paranoia: zero out these values just in case. + max_covering_tombstone_seq = 0; + sqno = 0; + + // Pick the oldest existing snapshot that is more recent + // than the sequence number of the sampled entry. + min_seqno_snapshot = kMaxSequenceNumber; + for (SequenceNumber seq_num : existing_snapshots_) { + if (seq_num > res.sequence && seq_num < min_seqno_snapshot) { + min_seqno_snapshot = seq_num; + } + } + min_snapshot.number_ = min_seqno_snapshot; + ro.snapshot = + min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr; + + // Estimate if the sample entry is valid or not. + get_res = mt->Get(lkey, &vget, nullptr, &mget_s, &merge_context, + &max_covering_tombstone_seq, &sqno, ro); + if (!get_res) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Memtable Get returned false when Get(sampled entry). " + "Yet each sample entry should exist somewhere in the memtable, " + "unrelated to whether it has been deleted or not."); + } + + // TODO(bjlemaire): evaluate typeMerge. + // This is where the sampled entry is estimated to be + // garbage or not. 
Note that this is a garbage *estimation* + // because we do not include certain items such as + // CompactionFitlers triggered at flush, or if the same delete + // has been inserted twice or more in the memtable. + + // Evaluate if the entry can be useful payload + // Situation #1: entry is a KV entry, was found in the memtable mt + // and the sequence numbers match. + can_be_useful_payload = (res.type == kTypeValue) && get_res && + mget_s.ok() && (sqno == res.sequence); + + // Situation #2: entry is a delete entry, was found in the memtable mt + // (because gres==true) and no valid KV entry is found. + // (note: duplicate delete entries are also taken into + // account here, because the sequence number 'sqno' + // in memtable->Get(&sqno) operation is set to be equal + // to the most recent delete entry as well). + can_be_useful_payload |= + ((res.type == kTypeDeletion) || (res.type == kTypeSingleDeletion)) && + mget_s.IsNotFound() && get_res && (sqno == res.sequence); + + // If there is a chance that the entry is useful payload + // Verify that the entry does not appear in the following memtables + // (memtables with greater memtable ID/larger sequence numbers). + if (can_be_useful_payload) { + not_in_next_mems = true; + for (auto next_mem_iter = mem_iter + 1; + next_mem_iter != std::end(mems_); next_mem_iter++) { + if ((*next_mem_iter) + ->Get(lkey, &vget, nullptr, &mget_s, &merge_context, + &max_covering_tombstone_seq, &sqno, ro)) { + not_in_next_mems = false; + break; + } + } + if (not_in_next_mems) { + useful_payload += entry_size; + } + } + } + if (payload > 0) { + // We use the estimated useful payload ratio to + // evaluate how many of the memtable bytes are useful bytes. 
+ estimated_useful_payload += + (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload); + + ROCKS_LOG_INFO( + db_options_.info_log, + "Mempurge sampling - found garbage ratio from sampling: %f.\n", + (payload - useful_payload) * 1.0 / payload); + } else { + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge sampling: null payload measured, and collected " + "sample size is %zu\n.", + sentries.size()); + } + } + // We convert the total number of useful payload bytes + // into the proportion of memtable necessary to store all these bytes. + // We compare this proportion with the threshold value. + return ((estimated_useful_payload / mutable_cf_options_.write_buffer_size) < + threshold); +} + Status FlushJob::WriteLevel0Table() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_FLUSH_WRITE_L0); db_mutex_->AssertHeld(); - const uint64_t start_micros = db_options_.env->NowMicros(); - const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000; + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); Status s; + + std::vector blob_file_additions; + { auto write_hint = cfd_->CalculateSSTWriteHint(0); db_mutex_->Unlock(); @@ -342,7 +840,7 @@ { ScopedArenaIterator iter( - NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], + NewMergingIterator(&cfd_->internal_comparator(), memtables.data(), static_cast(memtables.size()), &arena)); ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started", @@ -352,7 +850,7 @@ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", &output_compression_); int64_t _current_time = 0; - auto status = db_options_.env->GetCurrentTime(&_current_time); + auto status = clock_->GetCurrentTime(&_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed. if (!status.ok()) { ROCKS_LOG_WARN( @@ -368,23 +866,65 @@ // It's not clear whether oldest_key_time is always available. 
In case // it is not available, use current_time. - meta_.oldest_ancester_time = std::min(current_time, oldest_key_time); + uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time); + + TEST_SYNC_POINT_CALLBACK( + "FlushJob::WriteLevel0Table:oldest_ancester_time", + &oldest_ancester_time); + meta_.oldest_ancester_time = oldest_ancester_time; + meta_.file_creation_time = current_time; + uint64_t creation_time = (cfd_->ioptions()->compaction_style == + CompactionStyle::kCompactionStyleFIFO) + ? current_time + : meta_.oldest_ancester_time; + + uint64_t num_input_entries = 0; + uint64_t memtable_payload_bytes = 0; + uint64_t memtable_garbage_bytes = 0; + IOStatus io_s; + const std::string* const full_history_ts_low = + (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_; + TableBuilderOptions tboptions( + *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), output_compression_, + mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kFlush, creation_time, oldest_key_time, + current_time, db_id_, db_session_id_, 0 /* target_file_size */, + meta_.fd.GetNumber()); s = BuildTable( - dbname_, db_options_.env, db_options_.fs.get(), *cfd_->ioptions(), - mutable_cf_options_, file_options_, cfd_->table_cache(), iter.get(), - std::move(range_del_iters), &meta_, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), - cfd_->GetName(), existing_snapshots_, + dbname_, versions_, db_options_, tboptions, file_options_, + cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, + &blob_file_additions, existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, - output_compression_, mutable_cf_options_.sample_for_compression, - cfd_->ioptions()->compression_opts, mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), - 
TableFileCreationReason::kFlush, event_logger_, job_context_->job_id, - Env::IO_HIGH, &table_properties_, 0 /* level */, - meta_.oldest_ancester_time, oldest_key_time, write_hint, - current_time); + &io_s, io_tracer_, BlobFileCreationReason::kFlush, event_logger_, + job_context_->job_id, Env::IO_HIGH, &table_properties_, write_hint, + full_history_ts_low, blob_callback_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); + if (!io_s.ok()) { + io_status_ = io_s; + } + if (num_input_entries != total_num_entries && s.ok()) { + std::string msg = "Expected " + ToString(total_num_entries) + + " entries in memtables, but read " + + ToString(num_input_entries); + ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", + cfd_->GetName().c_str(), job_context_->job_id, + msg.c_str()); + if (db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + if (tboptions.reason == TableFileCreationReason::kFlush) { + TEST_SYNC_POINT("DBImpl::FlushJob:Flush"); + RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + memtable_payload_bytes); + RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + memtable_garbage_bytes); + } LogFlush(db_options_.info_log); } ROCKS_LOG_INFO(db_options_.info_log, @@ -397,7 +937,9 @@ meta_.marked_for_compaction ? " (needs compaction)" : ""); if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { - s = output_file_directory_->Fsync(); + s = output_file_directory_->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); db_mutex_->Lock(); @@ -406,7 +948,10 @@ // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. 
- if (s.ok() && meta_.fd.GetFileSize() > 0) { + const bool has_output = meta_.fd.GetFileSize() > 0; + + if (s.ok() && has_output) { + TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated"); // if we have more than 1 background thread, then we cannot // insert files directly into higher levels because some other // threads could be concurrently producing compacted files for @@ -415,9 +960,13 @@ edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, meta_.fd.smallest_seqno, meta_.fd.largest_seqno, - meta_.marked_for_compaction, meta_.oldest_blob_file_number, - meta_.oldest_ancester_time, meta_.file_creation_time, - meta_.file_checksum, meta_.file_checksum_func_name); + meta_.marked_for_compaction, meta_.temperature, + meta_.oldest_blob_file_number, meta_.oldest_ancester_time, + meta_.file_creation_time, meta_.file_checksum, + meta_.file_checksum_func_name, meta_.min_timestamp, + meta_.max_timestamp); + + edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } #ifndef ROCKSDB_LITE // Piggyback FlushJobInfo on the first first flushed memtable. 
@@ -426,14 +975,36 @@ // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = db_options_.env->NowMicros() - start_micros; - stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros; - stats.bytes_written = meta_.fd.GetFileSize(); + const uint64_t micros = clock_->NowMicros() - start_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; + stats.micros = micros; + stats.cpu_micros = cpu_micros; + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Flush lasted %" PRIu64 + " microseconds, and %" PRIu64 " cpu microseconds.\n", + cfd_->GetName().c_str(), job_context_->job_id, micros, + cpu_micros); + + if (has_output) { + stats.bytes_written = meta_.fd.GetFileSize(); + stats.num_output_files = 1; + } + + const auto& blobs = edit_->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); + } + + stats.num_output_files_blob = static_cast(blobs.size()); + RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros); cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats); - cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - meta_.fd.GetFileSize()); + cfd_->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordFlushIOStats(); + return s; } @@ -455,8 +1026,21 @@ info->largest_seqno = meta_.fd.largest_seqno; info->table_properties = table_properties_; info->flush_reason = cfd_->GetFlushReason(); + info->blob_compression_type = mutable_cf_options_.blob_compression_type; + + // Update BlobFilesInfo. 
+ for (const auto& blob_file : edit_->GetBlobFileAdditions()) { + BlobFileAdditionInfo blob_file_addition_info( + BlobFileName(cfd_->ioptions()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(), + blob_file.GetTotalBlobBytes()); + info->blob_file_addition_infos.emplace_back( + std::move(blob_file_addition_info)); + } return info; } + #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,8 +17,8 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" -#include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" #include "db/job_context.h" @@ -60,18 +60,21 @@ // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, - const uint64_t* max_memtable_id, const FileOptions& file_options, - VersionSet* versions, InstrumentedMutex* db_mutex, - std::atomic* shutting_down, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, SnapshotChecker* snapshot_checker, JobContext* job_context, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_file_directory, CompressionType output_compression, - Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + LogBuffer* log_buffer, FSDirectory* db_directory, + 
FSDirectory* output_file_directory, + CompressionType output_compression, Statistics* stats, + EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, - Env::Priority thread_pri); + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); ~FlushJob(); @@ -79,7 +82,8 @@ // Once PickMemTable() is called, either Run() or Cancel() has to be called. void PickMemTable(); Status Run(LogsWithPrepTracker* prep_tracker = nullptr, - FileMetaData* file_meta = nullptr); + FileMetaData* file_meta = nullptr, + bool* switched_to_mempurge = nullptr); void Cancel(); const autovector& GetMemTables() const { return mems_; } @@ -89,25 +93,52 @@ } #endif // !ROCKSDB_LITE + // Return the IO status + IOStatus io_status() const { return io_status_; } + private: void ReportStartedFlush(); void ReportFlushInputSize(const autovector& mems); void RecordFlushIOStats(); Status WriteLevel0Table(); + + // Memtable Garbage Collection algorithm: a MemPurge takes the list + // of immutable memtables and filters out (or "purge") the outdated bytes + // out of it. The output (the filtered bytes, or "useful payload") is + // then transfered into a new memtable. If this memtable is filled, then + // the mempurge is aborted and rerouted to a regular flush process. Else, + // depending on the heuristics, placed onto the immutable memtable list. + // The addition to the imm list will not trigger a flush operation. The + // flush of the imm list will instead be triggered once the mutable memtable + // is added to the imm list. + // This process is typically intended for workloads with heavy overwrites + // when we want to avoid SSD writes (and reads) as much as possible. + // "MemPurge" is an experimental feature still at a very early stage + // of development. 
At the moment it is only compatible with the Get, Put, + // Delete operations as well as Iterators and CompactionFilters. + // For this early version, "MemPurge" is called by setting the + // options.experimental_mempurge_threshold value as >0.0. When this is + // the case, ALL automatic flush operations (kWRiteBufferManagerFull) will + // first go through the MemPurge process. Therefore, we strongly + // recommend all users not to set this flag as true given that the MemPurge + // process has not matured yet. + Status MemPurge(); + bool MemPurgeDecider(); #ifndef ROCKSDB_LITE std::unique_ptr GetFlushJobInfo() const; #endif // !ROCKSDB_LITE const std::string& dbname_; + const std::string db_id_; + const std::string db_session_id_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; const MutableCFOptions& mutable_cf_options_; - // Pointer to a variable storing the largest memtable id to flush in this + // A variable storing the largest memtable id to flush in this // flush job. RocksDB uses this variable to select the memtables to flush in // this job. All memtables in this column family with an ID smaller than or - // equal to *max_memtable_id_ will be selected for flush. If null, then all - // memtables in the column family will be selected. - const uint64_t* max_memtable_id_; + // equal to max_memtable_id_ will be selected for flush. 
+ uint64_t max_memtable_id_; const FileOptions file_options_; VersionSet* versions_; InstrumentedMutex* db_mutex_; @@ -117,8 +148,8 @@ SnapshotChecker* snapshot_checker_; JobContext* job_context_; LogBuffer* log_buffer_; - Directory* db_directory_; - Directory* output_file_directory_; + FSDirectory* db_directory_; + FSDirectory* output_file_directory_; CompressionType output_compression_; Statistics* stats_; EventLogger* event_logger_; @@ -153,6 +184,13 @@ Version* base_; bool pick_memtable_called; Env::Priority thread_pri_; + IOStatus io_status_; + + const std::shared_ptr io_tracer_; + SystemClock* clock_; + + const std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,22 +3,25 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "db/flush_job.h" + #include #include #include #include -#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" -#include "db/flush_job.h" #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" +#include "rocksdb/file_system.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -26,49 +29,35 @@ // TODO(icanadi) Mock out everything else: // 1. VersionSet // 2. 
Memtable -class FlushJobTest : public testing::Test { - public: - FlushJobTest() +class FlushJobTestBase : public testing::Test { + protected: + FlushJobTestBase(std::string dbname, const Comparator* ucmp) : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("flush_job_test")), + fs_(env_->GetFileSystem()), + dbname_(std::move(dbname)), + ucmp_(ucmp), options_(), db_options_(options_), column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), - mock_table_factory_(new mock::MockTableFactory()) { - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - db_options_.db_paths.emplace_back(dbname_, - std::numeric_limits::max()); - db_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - // TODO(icanadi) Remove this once we mock out VersionSet - NewDB(); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; - for (const auto& cf_name : column_family_names_) { - column_families.emplace_back(cf_name, cf_options_); - } + mock_table_factory_(new mock::MockTableFactory()) {} - db_options_.env = env_; - db_options_.fs = fs_; - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - /*block_cache_tracer=*/nullptr)); - EXPECT_OK(versions_->Recover(column_families, false)); + virtual ~FlushJobTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still in %s\n", dbname_.c_str()); + } else { + // destroy versions_ to release all file handles + versions_.reset(); + EXPECT_OK(DestroyDir(env_, dbname_)); + } } void NewDB() { - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); 
- } + new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -80,6 +69,7 @@ VersionEdit new_cf; new_cf.AddColumnFamily(column_family_names_[i]); new_cf.SetColumnFamily(cf_id++); + new_cf.SetComparatorName(ucmp_->Name()); new_cf.SetLogNumber(0); new_cf.SetNextFile(2); new_cf.SetLastSequence(last_seq++); @@ -87,17 +77,19 @@ } const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, EnvOptions())); + { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); + ASSERT_OK(s); for (const auto& e : new_cfs) { record.clear(); @@ -108,12 +100,42 @@ } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. 
- s = SetCurrentFile(env_, dbname_, 1, nullptr); + s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + } + + void SetUp() override { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + // TODO(icanadi) Remove this once we mock out VersionSet + NewDB(); + + db_options_.env = env_; + db_options_.fs = fs_; + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.statistics = CreateDBStatistics(); + + cf_options_.comparator = ucmp_; + + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + for (const auto& cf_name : column_family_names_) { + column_families.emplace_back(cf_name, cf_options_); + } + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families, false)); } Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; Options options_; ImmutableDBOptions db_options_; @@ -128,19 +150,26 @@ std::shared_ptr mock_table_factory_; }; +class FlushJobTest : public FlushJobTestBase { + public: + FlushJobTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"), + BytewiseComparator()) {} +}; + TEST_F(FlushJobTest, Empty) { JobContext job_context(0); auto cfd = versions_->GetColumnFamilySet()->GetDefault(); EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - nullptr /* memtable_id */, env_options_, versions_.get(), - &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, - kNoCompression, nullptr, &event_logger, false, - true /* sync_output_directory */, - true /* write_manifest */, 
Env::Priority::USER); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, {}, + kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, + nullptr, kNoCompression, nullptr, &event_logger, false, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); { InstrumentedMutexLock l(&mutex_); flush_job.PickMemTable(); @@ -164,25 +193,26 @@ for (int i = 1; i < 10000; ++i) { std::string key(ToString((i + 1000) % 10000)); std::string value("value" + key); - new_mem->Add(SequenceNumber(i), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value, + nullptr /* kv_prot_info */)); if ((i + 1000) % 10000 < 9995) { InternalKey internal_key(key, SequenceNumber(i), kTypeValue); - inserted_keys.insert({internal_key.Encode().ToString(), value}); + inserted_keys.push_back({internal_key.Encode().ToString(), value}); } } { - new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); + ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", + "9999a", nullptr /* kv_prot_info */)); InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); - inserted_keys.insert({internal_key.Encode().ToString(), "9999a"}); + inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"}); } -#ifndef ROCKSDB_LITE // Note: the first two blob references will not be considered when resolving // the oldest blob file referenced (the first one is inlined TTL, while the // second one is TTL and thus points to a TTL blob file). 
- constexpr std::array blob_file_numbers{ - kInvalidBlobFileNumber, 5, 103, 17, 102, 101}; + constexpr std::array blob_file_numbers{{ + kInvalidBlobFileNumber, 5, 103, 17, 102, 101}}; for (size_t i = 0; i < blob_file_numbers.size(); ++i) { std::string key(ToString(i + 10001)); std::string blob_index; @@ -200,13 +230,13 @@ } const SequenceNumber seq(i + 10001); - new_mem->Add(seq, kTypeBlobIndex, key, blob_index); + ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index, + nullptr /* kv_prot_info */)); InternalKey internal_key(key, seq, kTypeBlobIndex); - inserted_keys.emplace_hint(inserted_keys.end(), - internal_key.Encode().ToString(), blob_index); + inserted_keys.push_back({internal_key.Encode().ToString(), blob_index}); } -#endif + mock::SortKVVector(&inserted_keys); autovector to_delete; cfd->imm()->Add(new_mem, &to_delete); @@ -216,14 +246,14 @@ EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - nullptr /* memtable_id */, env_options_, versions_.get(), - &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, - kNoCompression, db_options_.statistics.get(), - &event_logger, true, true /* sync_output_directory */, - true /* write_manifest */, Env::Priority::USER); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, {}, + kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); HistogramData hist; FileMetaData file_meta; @@ -237,12 +267,8 @@ 
ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); ASSERT_EQ("9999a", file_meta.largest.user_key().ToString()); ASSERT_EQ(1, file_meta.fd.smallest_seqno); -#ifndef ROCKSDB_LITE ASSERT_EQ(10006, file_meta.fd.largest_seqno); ASSERT_EQ(17, file_meta.oldest_blob_file_number); -#else - ASSERT_EQ(10000, file_meta.fd.largest_seqno); -#endif mock_table_factory_->AssertSingleFile(inserted_keys); job_context.Clean(); } @@ -266,8 +292,8 @@ for (size_t j = 0; j < num_keys_per_table; ++j) { std::string key(ToString(j + i * num_keys_per_table)); std::string value("value" + key); - mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key, - value); + ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, + key, value, nullptr /* kv_prot_info */)); } } @@ -282,15 +308,14 @@ assert(memtable_ids.size() == num_mems); uint64_t smallest_memtable_id = memtable_ids.front(); uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; - - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - &flush_memtable_id, env_options_, versions_.get(), &mutex_, - &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, - &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, - true /* write_manifest */, Env::Priority::USER); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, + versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); HistogramData hist; FileMetaData file_meta; mutex_.Lock(); @@ -340,7 +365,8 @@ for (size_t 
j = 0; j != num_keys_per_memtable; ++j) { std::string key(ToString(j + i * num_keys_per_memtable)); std::string value("value" + key); - mem->Add(curr_seqno++, kTypeValue, key, value); + ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value, + nullptr /* kv_prot_info */)); } cfd->imm()->Add(mem, &to_delete); @@ -357,12 +383,12 @@ std::vector snapshot_seqs; flush_jobs.emplace_back(new FlushJob( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), - &memtable_ids[k], env_options_, versions_.get(), &mutex_, + memtable_ids[k], env_options_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, false /* sync_output_directory */, false /* write_manifest */, - Env::Priority::USER)); + Env::Priority::USER, nullptr /*IOTracer*/)); k++; } HistogramData hist; @@ -392,10 +418,18 @@ for (auto cfd : all_cfds) { mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); } + autovector>*> + committed_flush_jobs_info; +#ifndef ROCKSDB_LITE + for (auto& job : flush_jobs) { + committed_flush_jobs_info.push_back(job->GetCommittedFlushJobsInfo()); + } +#endif //! 
ROCKSDB_LITE Status s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free, + versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs, + committed_flush_jobs_info, &job_context.memtables_to_free, nullptr /* db_directory */, nullptr /* log_buffer */); ASSERT_OK(s); @@ -448,9 +482,10 @@ std::string key(ToString(i)); int insertions = rnd.Uniform(max_inserts_per_keys); for (int j = 0; j < insertions; ++j) { - std::string value(test::RandomHumanReadableString(&rnd, 10)); + std::string value(rnd.HumanReadableString(10)); auto seqno = ++current_seqno; - new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value, + nullptr /* kv_prot_info */)); // a key is visible only if: // 1. it's the last one written (j == insertions - 1) // 2. there's a snapshot pointing at it @@ -458,10 +493,11 @@ (snapshots_set.find(seqno) != snapshots_set.end()); if (visible) { InternalKey internal_key(key, seqno, kTypeValue); - inserted_keys.insert({internal_key.Encode().ToString(), value}); + inserted_keys.push_back({internal_key.Encode().ToString(), value}); } } } + mock::SortKVVector(&inserted_keys); autovector to_delete; cfd->imm()->Add(new_mem, &to_delete); @@ -471,14 +507,14 @@ EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - nullptr /* memtable_id */, env_options_, versions_.get(), - &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, - kNoCompression, db_options_.statistics.get(), - &event_logger, true, true /* sync_output_directory */, - true /* write_manifest */, Env::Priority::USER); + FlushJob flush_job( + dbname_, 
versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots, + kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); mutex_.Lock(); flush_job.PickMemTable(); ASSERT_OK(flush_job.Run()); @@ -490,6 +526,136 @@ job_context.Clean(); } +class FlushJobTimestampTest : public FlushJobTestBase { + public: + FlushJobTimestampTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"), + test::ComparatorWithU64Ts()) {} + + void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts, + SequenceNumber seq, ValueType value_type, + Slice value) { + std::string key_str(std::move(key)); + PutFixed64(&key_str, ts); + ASSERT_OK(memtable->Add(seq, value_type, key_str, value, + nullptr /* kv_prot_info */)); + } + + protected: + static constexpr uint64_t kStartTs = 10; + static constexpr SequenceNumber kStartSeq = 0; + SequenceNumber curr_seq_{kStartSeq}; + std::atomic curr_ts_{kStartTs}; +}; + +TEST_F(FlushJobTimestampTest, AllKeysExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeDeletionWithTimestamp, ""); + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector 
snapshots; + constexpr SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string key = test::EncodeInt(0); + key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp); + ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode()); + } + + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(FlushJobTimestampTest, NoKeyExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector snapshots; + SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + 
std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 0); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string ukey = test::EncodeInt(0); + std::string smallest_key = + ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); + std::string largest_key = ukey + test::EncodeInt(kStartTs); + InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); + InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); + ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(largest.Encode(), fmeta.largest.Encode()); + } + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_scheduler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_scheduler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_scheduler.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_scheduler.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,11 @@ #pragma once -#include #include +#include #include #include + #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,10 +33,11 @@ // iter.Next() class ForwardLevelIterator : public InternalIterator { public: - ForwardLevelIterator(const ColumnFamilyData* const cfd, - const ReadOptions& read_options, - const std::vector& files, - const SliceTransform* prefix_extractor) + ForwardLevelIterator( + const ColumnFamilyData* const cfd, const ReadOptions& read_options, + const std::vector& files, + const std::shared_ptr& prefix_extractor, + bool allow_unprepared_value) : cfd_(cfd), read_options_(read_options), files_(files), @@ -44,7 +45,10 @@ file_index_(std::numeric_limits::max()), file_iter_(nullptr), pinned_iters_mgr_(nullptr), - prefix_extractor_(prefix_extractor) {} + prefix_extractor_(prefix_extractor), + allow_unprepared_value_(allow_unprepared_value) { + status_.PermitUncheckedError(); // Allow uninitialized status through + } ~ForwardLevelIterator() override { // Reset current pointer @@ -82,8 +86,9 @@ prefix_extractor_, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -171,6 +176,16 @@ } return Status::OK(); } + bool PrepareValue() override { + assert(valid_); + if (file_iter_->PrepareValue()) { + return true; + } + + assert(!file_iter_->Valid()); + valid_ = false; + return false; + } bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_->IsKeyPinned(); @@ -196,17 +211,21 @@ Status status_; InternalIterator* file_iter_; 
PinnedIteratorsManager* pinned_iters_mgr_; - const SliceTransform* prefix_extractor_; + // Kept alive by ForwardIterator::sv_->mutable_cf_options + const std::shared_ptr& prefix_extractor_; + const bool allow_unprepared_value_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, ColumnFamilyData* cfd, - SuperVersion* current_sv) + SuperVersion* current_sv, + bool allow_unprepared_value) : db_(db), read_options_(read_options), cfd_(cfd), prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()), user_comparator_(cfd->user_comparator()), + allow_unprepared_value_(allow_unprepared_value), immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), sv_(current_sv), mutable_iter_(nullptr), @@ -222,6 +241,12 @@ if (sv_) { RebuildIterators(false); } + + // immutable_status_ is a local aggregation of the + // status of the immutable Iterators. + // We have to PermitUncheckedError in case it is never + // used, otherwise it will fail ASSERT_STATUS_CHECKED. + immutable_status_.PermitUncheckedError(); } ForwardIterator::~ForwardIterator() { @@ -402,7 +427,7 @@ if (seek_to_first) { l0_iters_[i]->SeekToFirst(); } else { - // If the target key passes over the larget key, we are sure Next() + // If the target key passes over the largest key, we are sure Next() // won't go over this file. 
if (user_comparator_->Compare(target_user_key, l0[i]->largest.user_key()) > 0) { @@ -560,6 +585,22 @@ return immutable_status_; } +bool ForwardIterator::PrepareValue() { + assert(valid_); + if (current_->PrepareValue()) { + return true; + } + + assert(!current_->Valid()); + assert(!current_->status().ok()); + assert(current_ != mutable_iter_); // memtable iterator can't fail + assert(immutable_status_.ok()); + + valid_ = false; + immutable_status_ = current_->status(); + return false; +} + Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) { assert(prop != nullptr); if (prop_name == "rocksdb.iterator.super-version-number") { @@ -629,8 +670,10 @@ sv_->mem->NewRangeTombstoneIterator( read_options_, sv_->current->version_set()->LastSequence())); range_del_agg.AddTombstones(std::move(range_del_iter)); - sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, - &range_del_agg); + // Always return Status::OK(). + Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, + &range_del_agg); + assert(temp_s.ok()); } has_iter_trimmed_for_upper_bound_ = false; @@ -650,14 +693,15 @@ l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - sv_->mutable_cf_options.prefix_extractor.get(), + sv_->mutable_cf_options.prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); } - BuildLevelIterators(vstorage); + BuildLevelIterators(vstorage, sv_); current_ = nullptr; is_prev_set_ = false; @@ -691,8 +735,10 @@ svnew->mem->NewRangeTombstoneIterator( read_options_, sv_->current->version_set()->LastSequence())); range_del_agg.AddTombstones(std::move(range_del_iter)); - svnew->imm->AddRangeTombstoneIterators(read_options_, &arena_, - &range_del_agg); + // Always return Status::OK(). + Status temp_s = svnew->imm->AddRangeTombstoneIterators( + read_options_, &arena_, &range_del_agg); + assert(temp_s.ok()); } const auto* vstorage = sv_->current->storage_info(); @@ -727,12 +773,13 @@ read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files_new[inew], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - svnew->mutable_cf_options.prefix_extractor.get(), + svnew->mutable_cf_options.prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); } for (auto* f : l0_iters_) { @@ -745,7 +792,7 @@ DeleteIterator(l); } level_iters_.clear(); - BuildLevelIterators(vstorage_new); + BuildLevelIterators(vstorage_new, svnew); current_ = nullptr; is_prev_set_ = false; SVCleanup(); @@ -759,7 +806,8 @@ } } -void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage) { +void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, + SuperVersion* sv) { level_iters_.reserve(vstorage->num_levels() - 1); for (int32_t level = 1; level < vstorage->num_levels(); ++level) { const auto& level_files = vstorage->LevelFiles(level); @@ -775,7 +823,7 @@ } else { level_iters_.push_back(new ForwardLevelIterator( cfd_, read_options_, level_files, - sv_->mutable_cf_options.prefix_extractor.get())); + sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_)); } } } @@ -791,12 +839,13 @@ l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files[i], /*range_del_agg=*/nullptr, - sv_->mutable_cf_options.prefix_extractor.get(), + sv_->mutable_cf_options.prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } @@ -945,9 +994,9 @@ 
uint32_t ForwardIterator::FindFileInRange( const std::vector& files, const Slice& internal_key, uint32_t left, uint32_t right) { - auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool { + auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool { return cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), key) < 0; + f->largest.Encode(), k) < 0; }; const auto &b = files.begin(); return static_cast(std::lower_bound(b + left, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include #include -#include "db/dbformat.h" #include "memory/arena.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" @@ -39,8 +38,9 @@ const Comparator* comparator_; }; -typedef std::priority_queue, - MinIterComparator> MinIterHeap; +using MinIterHeap = + std::priority_queue, + MinIterComparator>; /** * ForwardIterator is a special type of iterator that only supports Seek() @@ -52,7 +52,8 @@ class ForwardIterator : public InternalIterator { public: ForwardIterator(DBImpl* db, const ReadOptions& read_options, - ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr); + ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr, + bool allow_unprepared_value = false); virtual ~ForwardIterator(); void SeekForPrev(const Slice& /*target*/) override { @@ -75,6 +76,7 @@ virtual Slice key() const override; virtual Slice value() const override; virtual Status status() const override; + virtual bool PrepareValue() override; virtual Status GetProperty(std::string prop_name, std::string* prop) override; virtual void SetPinnedItersMgr( PinnedIteratorsManager* pinned_iters_mgr) override; @@ -95,7 +97,8 @@ void RebuildIterators(bool refresh_sv); void 
RenewIterators(); - void BuildLevelIterators(const VersionStorageInfo* vstorage); + void BuildLevelIterators(const VersionStorageInfo* vstorage, + SuperVersion* sv); void ResetIncompleteIterators(); void SeekInternal(const Slice& internal_key, bool seek_to_first); void UpdateCurrent(); @@ -120,6 +123,7 @@ ColumnFamilyData* const cfd_; const SliceTransform* const prefix_extractor_; const Comparator* user_comparator_; + const bool allow_unprepared_value_; MinIterHeap immutable_min_heap_; SuperVersion* sv_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -272,7 +272,6 @@ : db_(db), thread_(&StatsThread::run, this) {} void run() { - // using namespace std::chrono; auto tstart = std::chrono::steady_clock::now(), tlast = tstart; uint64_t wlast = 0, rlast = 0; while (!done_.load()) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include "db/version_edit.h" #include "file/file_util.h" #include "file/random_access_file_reader.h" +#include "logging/logging.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" @@ -26,14 +27,14 @@ for (const auto& file_metadata : metadata_) { const auto file_path = file_metadata.db_path + "/" + file_metadata.name; IngestedFileInfo file_to_import; - status = GetIngestedFileInfo(file_path, &file_to_import, sv); 
+ status = + GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv); if (!status.ok()) { return status; } files_to_import_.push_back(file_to_import); } - const auto ucmp = cfd_->internal_comparator().user_comparator(); auto num_files = files_to_import_.size(); if (num_files == 0) { return Status::InvalidArgument("The list of files is empty"); @@ -55,17 +56,18 @@ } } - std::sort(sorted_files.begin(), sorted_files.end(), - [&ucmp](const IngestedFileInfo* info1, - const IngestedFileInfo* info2) { - return sstableKeyCompare(ucmp, info1->smallest_internal_key, - info2->smallest_internal_key) < 0; - }); - - for (size_t i = 0; i < sorted_files.size() - 1; i++) { - if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, - sorted_files[i + 1]->smallest_internal_key) >= - 0) { + std::sort( + sorted_files.begin(), sorted_files.end(), + [this](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { + return cfd_->internal_comparator().Compare( + info1->smallest_internal_key, + info2->smallest_internal_key) < 0; + }); + + for (size_t i = 0; i + 1 < sorted_files.size(); i++) { + if (cfd_->internal_comparator().Compare( + sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= 0) { return Status::InvalidArgument("Files have overlapping ranges"); } } @@ -85,8 +87,6 @@ // Copy/Move external files into DB auto hardlink_files = import_options_.move_files; for (auto& f : files_to_import_) { - f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - const auto path_outside_db = f.external_file_path; const auto path_inside_db = TableFileName( cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); @@ -100,8 +100,8 @@ } } if (!hardlink_files) { - status = CopyFile(fs_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + status = CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, + db_options_.use_fsync, io_tracer_); } if (!status.ok()) { break; @@ -140,7 +140,7 @@ int64_t temp_current_time = 
0; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; uint64_t current_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } @@ -152,9 +152,10 @@ edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, file_metadata.smallest_seqno, - file_metadata.largest_seqno, false, kInvalidBlobFileNumber, - oldest_ancester_time, current_time, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + file_metadata.largest_seqno, false, file_metadata.temperature, + kInvalidBlobFileNumber, oldest_ancester_time, current_time, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); // If incoming sequence number is higher, update local sequence number. if (file_metadata.largest_seqno > versions_->LastSequence()) { @@ -196,8 +197,8 @@ } Status ImportColumnFamilyJob::GetIngestedFileInfo( - const std::string& external_file, IngestedFileInfo* file_to_import, - SuperVersion* sv) { + const std::string& external_file, uint64_t new_file_number, + IngestedFileInfo* file_to_import, SuperVersion* sv) { file_to_import->external_file_path = external_file; // Get external file size @@ -207,6 +208,10 @@ return status; } + // Assign FD with number + file_to_import->fd = + FileDescriptor(new_file_number, 0, file_to_import->file_size); + // Create TableReader for external file std::unique_ptr table_reader; std::unique_ptr sst_file; @@ -217,13 +222,18 @@ if (!status.ok()) { return status; } - sst_file_reader.reset( - new RandomAccessFileReader(std::move(sst_file), external_file)); + sst_file_reader.reset(new RandomAccessFileReader( + std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); status = cfd_->ioptions()->table_factory->NewTableReader( - TableReaderOptions(*cfd_->ioptions(), - 
sv->mutable_cf_options.prefix_extractor.get(), - env_options_, cfd_->internal_comparator()), + TableReaderOptions( + *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, + env_options_, cfd_->internal_comparator(), + /*skip_filters*/ false, /*immortal*/ false, + /*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(), + /*cur_file_num*/ new_file_number), std::move(sst_file_reader), file_to_import->file_size, &table_reader); if (!status.ok()) { return status; @@ -252,15 +262,21 @@ // Get first (smallest) key from file iter->SeekToFirst(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. ", + pik_status.getState()); } file_to_import->smallest_internal_key.SetFrom(key); // Get last (largest) key from file iter->SeekToLast(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. 
", + pik_status.getState()); } file_to_import->largest_internal_key.SetFrom(key); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -4,35 +4,37 @@ #include #include "db/column_family.h" -#include "db/dbformat.h" #include "db/external_sst_file_ingestion_job.h" #include "db/snapshot_impl.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/metadata.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +struct EnvOptions; +class SystemClock; // Imports a set of sst files as is into a new column family. Logic is similar // to ExternalSstFileIngestionJob. class ImportColumnFamilyJob { public: - ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd, + ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, const ImportColumnFamilyOptions& import_options, - const std::vector& metadata) - : env_(env), + const std::vector& metadata, + const std::shared_ptr& io_tracer) + : clock_(db_options.clock), versions_(versions), cfd_(cfd), db_options_(db_options), - fs_(db_options_.fs.get()), + fs_(db_options_.fs, io_tracer), env_options_(env_options), import_options_(import_options), - metadata_(metadata) {} + metadata_(metadata), + io_tracer_(io_tracer) {} // Prepare the job by copying external files into the DB. Status Prepare(uint64_t next_file_number, SuperVersion* sv); @@ -54,19 +56,21 @@ // Open the external file and populate `file_to_import` with all the // external information we need to import this file. 
Status GetIngestedFileInfo(const std::string& external_file, + uint64_t new_file_number, IngestedFileInfo* file_to_import, SuperVersion* sv); - Env* env_; + SystemClock* clock_; VersionSet* versions_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; - FileSystem* fs_; + const FileSystemPtr fs_; const EnvOptions& env_options_; autovector files_to_import_; VersionEdit edit_; const ImportColumnFamilyOptions& import_options_; std::vector metadata_; + const std::shared_ptr io_tracer_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -1,20 +1,23 @@ #ifndef ROCKSDB_LITE #include + #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" #include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class ImportColumnFamilyTest : public DBTestBase { public: - ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") { + ImportColumnFamilyTest() + : DBTestBase("import_column_family_test", /*env_do_fsync=*/true) { sst_files_dir_ = dbname_ + "/sst_files/"; + export_files_dir_ = test::PerThreadDBPath(env_, "export"); DestroyAndRecreateExternalSSTFilesDir(); - export_files_dir_ = test::TmpDir(env_) + "/export"; import_cfh_ = nullptr; import_cfh2_ = nullptr; metadata_ptr_ = nullptr; @@ -22,27 +25,27 @@ ~ImportColumnFamilyTest() { if (import_cfh_) { - db_->DropColumnFamily(import_cfh_); - db_->DestroyColumnFamilyHandle(import_cfh_); + EXPECT_OK(db_->DropColumnFamily(import_cfh_)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); import_cfh_ = nullptr; } if (import_cfh2_) { - 
db_->DropColumnFamily(import_cfh2_); - db_->DestroyColumnFamilyHandle(import_cfh2_); + EXPECT_OK(db_->DropColumnFamily(import_cfh2_)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh2_)); import_cfh2_ = nullptr; } if (metadata_ptr_) { delete metadata_ptr_; metadata_ptr_ = nullptr; } - test::DestroyDir(env_, sst_files_dir_); - test::DestroyDir(env_, export_files_dir_); + EXPECT_OK(DestroyDir(env_, sst_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir_)); } void DestroyAndRecreateExternalSSTFilesDir() { - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); - test::DestroyDir(env_, export_files_dir_); + EXPECT_OK(DestroyDir(env_, sst_files_dir_)); + EXPECT_OK(env_->CreateDir(sst_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir_)); } LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path, @@ -101,9 +104,9 @@ ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value)); ASSERT_EQ(value, "V2"); ASSERT_OK(db_->DropColumnFamily(import_cfh_)); ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); @@ -122,9 +125,9 @@ ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value)); ASSERT_EQ(value, "V2"); } } @@ -140,7 +143,7 @@ const std::string file3_sst = sst_files_dir_ + file3_sst_name; ASSERT_OK(sfw_cf1.Open(file3_sst)); for (int i = 0; i < 100; ++i) { - sfw_cf1.Put(Key(i), Key(i) + "_val"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_val")); } ASSERT_OK(sfw_cf1.Finish()); @@ -149,7 +152,7 @@ const std::string 
file2_sst = sst_files_dir_ + file2_sst_name; ASSERT_OK(sfw_cf1.Open(file2_sst)); for (int i = 0; i < 100; i += 2) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite1")); } ASSERT_OK(sfw_cf1.Finish()); @@ -158,7 +161,7 @@ const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; ASSERT_OK(sfw_cf1.Open(file1a_sst)); for (int i = 0; i < 52; i += 4) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2")); } ASSERT_OK(sfw_cf1.Finish()); @@ -167,7 +170,7 @@ const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; ASSERT_OK(sfw_cf1.Open(file1b_sst)); for (int i = 52; i < 100; i += 4) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2")); } ASSERT_OK(sfw_cf1.Finish()); @@ -176,7 +179,7 @@ const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; ASSERT_OK(sfw_cf1.Open(file0a_sst)); for (int i = 0; i < 100; i += 16) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite3")); } ASSERT_OK(sfw_cf1.Finish()); @@ -185,7 +188,7 @@ const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; ASSERT_OK(sfw_cf1.Open(file0b_sst)); for (int i = 0; i < 100; i += 16) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite4")); } ASSERT_OK(sfw_cf1.Finish()); @@ -211,7 +214,7 @@ for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 16 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite4"); } else if (i % 4 == 0) { @@ -232,7 +235,7 @@ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + 
"_overwrite5"); } else if (i % 16 == 0) { @@ -251,7 +254,7 @@ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite5"); } else if (i % 16 == 0) { @@ -271,7 +274,7 @@ CreateAndReopenWithCF({"koko"}, options); for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_val"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_val")); } ASSERT_OK(Flush(1)); @@ -280,13 +283,13 @@ // Overwrite the value in the same set of keys. for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_overwrite"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite")); } // Flush to create L0 file. ASSERT_OK(Flush(1)); for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2")); } // Flush again to create another L0 file. It should have higher sequencer. 
@@ -315,12 +318,12 @@ std::string value1, value2; for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Get(1, Key(i)), value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -337,16 +340,16 @@ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -360,16 +363,16 @@ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } } @@ -379,7 +382,7 @@ CreateAndReopenWithCF({"koko"}, options); for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_val"); + ASSERT_OK(Put(1, Key(i), Key(i) + 
"_val")); } ASSERT_OK(Flush(1)); @@ -389,14 +392,14 @@ // Overwrite the value in the same set of keys. for (int i = 0; i < 50; ++i) { - Put(1, Key(i), Key(i) + "_overwrite"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite")); } // Flush to create L0 file. ASSERT_OK(Flush(1)); for (int i = 0; i < 25; ++i) { - Put(1, Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2")); } // Flush again to create another L0 file. It should have higher sequencer. @@ -411,7 +414,7 @@ // Create a new db and import the files. DB* db_copy; - test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); ColumnFamilyHandle* cfh = nullptr; ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", @@ -421,13 +424,75 @@ for (int i = 0; i < 100; ++i) { std::string value; - db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value)); ASSERT_EQ(Get(1, Key(i)), value); } - db_copy->DropColumnFamily(cfh); - db_copy->DestroyColumnFamilyHandle(cfh); + ASSERT_OK(db_copy->DropColumnFamily(cfh)); + ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh)); + delete db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); +} + +TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) { + // Imports a column family containing a level where two files overlap at their + // endpoints. "Overlap" means the largest user key in one file is the same as + // the smallest user key in the second file. + const int kFileBytes = 128 << 10; // 128KB + const int kValueBytes = 1 << 10; // 1KB + const int kNumFiles = 4; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 2; + CreateAndReopenWithCF({"koko"}, options); + + Random rnd(301); + // Every key is snapshot protected to ensure older versions will not be + // dropped during compaction. 
+ std::vector snapshots; + snapshots.reserve(kFileBytes / kValueBytes * kNumFiles); + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = rnd.RandomString(kValueBytes); + ASSERT_OK(Put(1, "key", value)); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Flush(1)); + } + + // Compact to create overlapping L1 files. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + // Create a new db and import the files. + DB* db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + + { + std::string value; + ASSERT_OK(db_copy->Get(ReadOptions(), cfh, "key", &value)); + } + ASSERT_OK(db_copy->DropColumnFamily(cfh)); + ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh)); delete db_copy; - test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); + for (const Snapshot* snapshot : snapshots) { + db_->ReleaseSnapshot(snapshot); + } } TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,14 +12,21 @@ #include #include +#include #include +#include #include 
#include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_entry_stats.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" -#include "table/block_based/block_based_table_factory.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "table/block_based/cachable_entry.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -49,6 +56,27 @@ {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}}, {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}}, {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}}, + {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}}, + {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}}, +}; + +const std::map + InternalStats::db_stats_type_to_info = { + {InternalStats::kIntStatsWalFileBytes, + DBStatInfo{"db.wal_bytes_written"}}, + {InternalStats::kIntStatsWalFileSynced, DBStatInfo{"db.wal_syncs"}}, + {InternalStats::kIntStatsBytesWritten, + DBStatInfo{"db.user_bytes_written"}}, + {InternalStats::kIntStatsNumKeysWritten, + DBStatInfo{"db.user_keys_written"}}, + {InternalStats::kIntStatsWriteDoneByOther, + DBStatInfo{"db.user_writes_by_other"}}, + {InternalStats::kIntStatsWriteDoneBySelf, + DBStatInfo{"db.user_writes_by_self"}}, + {InternalStats::kIntStatsWriteWithWal, + DBStatInfo{"db.user_writes_with_wal"}}, + {InternalStats::kIntStatsWriteStallMicros, + DBStatInfo{"db.user_write_stall_micros"}}, }; namespace { @@ -60,12 +88,14 @@ const std::string& group_by) { int written_size = snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str()); + written_size = std::min(written_size, static_cast(len)); auto hdr = [](LevelStatType t) { return InternalStats::compaction_level_stats.at(t).header_name.c_str(); }; int line_size = snprintf( buf + written_size, len - written_size, - "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s " + "%s\n", // Note that we 
skip COMPACTED_FILES and merge it with Files column group_by.c_str(), hdr(LevelStatType::NUM_FILES), hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), @@ -76,9 +106,11 @@ hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), - hdr(LevelStatType::KEY_DROP)); + hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB), + hdr(LevelStatType::W_BLOB_GB)); written_size += line_size; + written_size = std::min(written_size, static_cast(len)); snprintf(buf + written_size, len - written_size, "%s\n", std::string(line_size, '-').c_str()); } @@ -87,10 +119,12 @@ int num_files, int being_compacted, double total_file_size, double score, double w_amp, const InternalStats::CompactionStats& stats) { - uint64_t bytes_read = - stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; - double elapsed = (stats.micros + 1) / kMicrosInSec; + const uint64_t bytes_read = stats.bytes_read_non_output_levels + + stats.bytes_read_output_level + + stats.bytes_read_blob; + const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; + const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; + const double elapsed = (stats.micros + 1) / kMicrosInSec; (*level_stats)[LevelStatType::NUM_FILES] = num_files; (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted; @@ -105,8 +139,7 @@ (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB; (*level_stats)[LevelStatType::WRITE_AMP] = w_amp; (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed; - (*level_stats)[LevelStatType::WRITE_MBPS] = - stats.bytes_written / kMB / elapsed; + (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed; (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec; 
(*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec; (*level_stats)[LevelStatType::COMP_COUNT] = stats.count; @@ -116,6 +149,8 @@ static_cast(stats.num_input_records); (*level_stats)[LevelStatType::KEY_DROP] = static_cast(stats.num_dropped_records); + (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB; + (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB; } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -140,7 +175,9 @@ "%9d " /* Comp(cnt) */ "%8.3f " /* Avg(sec) */ "%7s " /* KeyIn */ - "%6s\n", /* KeyDrop */ + "%6s " /* KeyDrop */ + "%9.1f " /* Rblob(GB) */ + "%9.1f\n", /* Wblob(GB) */ name.c_str(), static_cast(stat_value.at(LevelStatType::NUM_FILES)), static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), BytesToHumanString( @@ -165,7 +202,9 @@ .c_str(), NumberToHumanString( static_cast(stat_value.at(LevelStatType::KEY_DROP))) - .c_str()); + .c_str(), + stat_value.at(LevelStatType::R_BLOB_GB), + stat_value.at(LevelStatType::W_BLOB_GB)); } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -206,6 +245,7 @@ static const std::string cf_file_histogram = "cf-file-histogram"; static const std::string dbstats = "dbstats"; static const std::string levelstats = "levelstats"; +static const std::string block_cache_entry_stats = "block-cache-entry-stats"; static const std::string num_immutable_mem_table = "num-immutable-mem-table"; static const std::string num_immutable_mem_table_flushed = "num-immutable-mem-table-flushed"; @@ -242,6 +282,8 @@ static const std::string base_level_str = "base-level"; static const std::string total_sst_files_size = "total-sst-files-size"; static const std::string live_sst_files_size = "live-sst-files-size"; +static const std::string live_sst_files_size_at_temperature = + "live-sst-files-size-at-temperature"; static const std::string estimate_pending_comp_bytes = "estimate-pending-compaction-bytes"; static const std::string 
aggregated_table_properties = @@ -258,6 +300,10 @@ static const std::string block_cache_usage = "block-cache-usage"; static const std::string block_cache_pinned_usage = "block-cache-pinned-usage"; static const std::string options_statistics = "options-statistics"; +static const std::string num_blob_files = "num-blob-files"; +static const std::string blob_stats = "blob-stats"; +static const std::string total_blob_file_size = "total-blob-file-size"; +static const std::string live_blob_file_size = "live-blob-file-size"; const std::string DB::Properties::kNumFilesAtLevelPrefix = rocksdb_prefix + num_files_at_level_prefix; @@ -272,6 +318,8 @@ rocksdb_prefix + cf_file_histogram; const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; +const std::string DB::Properties::kBlockCacheEntryStats = + rocksdb_prefix + block_cache_entry_stats; const std::string DB::Properties::kNumImmutableMemTable = rocksdb_prefix + num_immutable_mem_table; const std::string DB::Properties::kNumImmutableMemTableFlushed = @@ -347,6 +395,15 @@ rocksdb_prefix + block_cache_pinned_usage; const std::string DB::Properties::kOptionsStatistics = rocksdb_prefix + options_statistics; +const std::string DB::Properties::kLiveSstFilesSizeAtTemperature = + rocksdb_prefix + live_sst_files_size_at_temperature; +const std::string DB::Properties::kNumBlobFiles = + rocksdb_prefix + num_blob_files; +const std::string DB::Properties::kBlobStats = rocksdb_prefix + blob_stats; +const std::string DB::Properties::kTotalBlobFileSize = + rocksdb_prefix + total_blob_file_size; +const std::string DB::Properties::kLiveBlobFileSize = + rocksdb_prefix + live_blob_file_size; const std::unordered_map InternalStats::ppt_name_to_info = { @@ -370,15 +427,20 @@ {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr, nullptr}}, {DB::Properties::kDBStats, - {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}}, + {false, 
&InternalStats::HandleDBStats, nullptr, + &InternalStats::HandleDBMapStats, nullptr}}, + {DB::Properties::kBlockCacheEntryStats, + {true, &InternalStats::HandleBlockCacheEntryStats, nullptr, + &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, {DB::Properties::kSSTables, {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}}, {DB::Properties::kAggregatedTableProperties, {false, &InternalStats::HandleAggregatedTableProperties, nullptr, - nullptr, nullptr}}, + &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}}, {DB::Properties::kAggregatedTablePropertiesAtLevel, {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, - nullptr, nullptr, nullptr}}, + nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap, + nullptr}}, {DB::Properties::kNumImmutableMemTable, {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr, nullptr}}, @@ -456,6 +518,9 @@ {DB::Properties::kLiveSstFilesSize, {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr, nullptr}}, + {DB::Properties::kLiveSstFilesSizeAtTemperature, + {true, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr, + nullptr, nullptr}}, {DB::Properties::kEstimatePendingCompactionBytes, {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes, nullptr, nullptr}}, @@ -484,10 +549,253 @@ {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr, nullptr}}, {DB::Properties::kOptionsStatistics, - {false, nullptr, nullptr, nullptr, + {true, nullptr, nullptr, nullptr, &DBImpl::GetPropertyHandleOptionsStatistics}}, + {DB::Properties::kNumBlobFiles, + {false, nullptr, &InternalStats::HandleNumBlobFiles, nullptr, + nullptr}}, + {DB::Properties::kBlobStats, + {false, &InternalStats::HandleBlobStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kTotalBlobFileSize, + {false, nullptr, &InternalStats::HandleTotalBlobFileSize, nullptr, + nullptr}}, + {DB::Properties::kLiveBlobFileSize, + {false, nullptr, 
&InternalStats::HandleLiveBlobFileSize, nullptr, + nullptr}}, }; +InternalStats::InternalStats(int num_levels, SystemClock* clock, + ColumnFamilyData* cfd) + : db_stats_{}, + cf_stats_value_{}, + cf_stats_count_{}, + comp_stats_(num_levels), + comp_stats_by_pri_(Env::Priority::TOTAL), + file_read_latency_(num_levels), + bg_error_count_(0), + number_levels_(num_levels), + clock_(clock), + cfd_(cfd), + started_at_(clock->NowMicros()) { + Cache* block_cache = nullptr; + bool ok = GetBlockCacheForStats(&block_cache); + if (ok) { + assert(block_cache); + // Extract or create stats collector. Could fail in rare cases. + Status s = CacheEntryStatsCollector::GetShared( + block_cache, clock_, &cache_entry_stats_collector_); + if (s.ok()) { + assert(cache_entry_stats_collector_); + } else { + assert(!cache_entry_stats_collector_); + } + } else { + assert(!block_cache); + } +} + +void InternalStats::TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, + bool foreground) { + CollectCacheEntryStats(foreground); + if (cache_entry_stats_collector_) { + cache_entry_stats_collector_->GetStats(stats); + } +} + +void InternalStats::CollectCacheEntryStats(bool foreground) { + // This function is safe to call from any thread because + // cache_entry_stats_collector_ field is const after constructor + // and ->GetStats does its own synchronization, which also suffices for + // cache_entry_stats_. + + if (!cache_entry_stats_collector_) { + return; // nothing to do (e.g. no block cache) + } + + // For "background" collections, strictly cap the collection time by + // expanding effective cache TTL. For foreground, be more aggressive about + // getting latest data. + int min_interval_seconds = foreground ? 10 : 180; + // 1/500 = max of 0.2% of one CPU thread + int min_interval_factor = foreground ? 
10 : 500; + cache_entry_stats_collector_->CollectStats(min_interval_seconds, + min_interval_factor); +} + +std::function +InternalStats::CacheEntryRoleStats::GetEntryCallback() { + return [&](const Slice& /*key*/, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + auto e = role_map_.find(deleter); + size_t role_idx; + if (e == role_map_.end()) { + role_idx = static_cast(CacheEntryRole::kMisc); + } else { + role_idx = static_cast(e->second); + } + entry_counts[role_idx]++; + total_charges[role_idx] += charge; + }; +} + +void InternalStats::CacheEntryRoleStats::BeginCollection( + Cache* cache, SystemClock*, uint64_t start_time_micros) { + Clear(); + last_start_time_micros_ = start_time_micros; + ++collection_count; + role_map_ = CopyCacheDeleterRoleMap(); + std::ostringstream str; + str << cache->Name() << "@" << static_cast(cache) << "#" + << port::GetProcessID(); + cache_id = str.str(); + cache_capacity = cache->GetCapacity(); +} + +void InternalStats::CacheEntryRoleStats::EndCollection( + Cache*, SystemClock*, uint64_t end_time_micros) { + last_end_time_micros_ = end_time_micros; +} + +void InternalStats::CacheEntryRoleStats::SkippedCollection() { + ++copies_of_last_collection; +} + +uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const { + if (last_end_time_micros_ > last_start_time_micros_) { + return last_end_time_micros_ - last_start_time_micros_; + } else { + return 0U; + } +} + +std::string InternalStats::CacheEntryRoleStats::ToString( + SystemClock* clock) const { + std::ostringstream str; + str << "Block cache " << cache_id + << " capacity: " << BytesToHumanString(cache_capacity) + << " collections: " << collection_count + << " last_copies: " << copies_of_last_collection + << " last_secs: " << (GetLastDurationMicros() / 1000000.0) + << " secs_since: " + << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n"; + str << "Block cache entry stats(count,size,portion):"; + for (size_t i = 0; i < kNumCacheEntryRoles; 
++i) { + if (entry_counts[i] > 0) { + str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i] + << "," << BytesToHumanString(total_charges[i]) << "," + << (100.0 * total_charges[i] / cache_capacity) << "%)"; + } + } + str << "\n"; + return str.str(); +} + +void InternalStats::CacheEntryRoleStats::ToMap( + std::map* values, SystemClock* clock) const { + values->clear(); + auto& v = *values; + v["id"] = cache_id; + v["capacity"] = ROCKSDB_NAMESPACE::ToString(cache_capacity); + v["secs_for_last_collection"] = + ROCKSDB_NAMESPACE::ToString(GetLastDurationMicros() / 1000000.0); + v["secs_since_last_collection"] = ROCKSDB_NAMESPACE::ToString( + (clock->NowMicros() - last_end_time_micros_) / 1000000U); + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + std::string role = kCacheEntryRoleToHyphenString[i]; + v["count." + role] = ROCKSDB_NAMESPACE::ToString(entry_counts[i]); + v["bytes." + role] = ROCKSDB_NAMESPACE::ToString(total_charges[i]); + v["percent." + role] = + ROCKSDB_NAMESPACE::ToString(100.0 * total_charges[i] / cache_capacity); + } +} + +bool InternalStats::HandleBlockCacheEntryStats(std::string* value, + Slice /*suffix*/) { + if (!cache_entry_stats_collector_) { + return false; + } + CollectCacheEntryStats(/*foreground*/ true); + CacheEntryRoleStats stats; + cache_entry_stats_collector_->GetStats(&stats); + *value = stats.ToString(clock_); + return true; +} + +bool InternalStats::HandleBlockCacheEntryStatsMap( + std::map* values, Slice /*suffix*/) { + if (!cache_entry_stats_collector_) { + return false; + } + CollectCacheEntryStats(/*foreground*/ true); + CacheEntryRoleStats stats; + cache_entry_stats_collector_->GetStats(&stats); + stats.ToMap(values, clock_); + return true; +} + +bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value, + Slice suffix) { + uint64_t temperature; + bool ok = ConsumeDecimalNumber(&suffix, &temperature) && suffix.empty(); + if (!ok) { + return false; + } + + uint64_t size = 0; + const auto* 
vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + for (const auto& file_meta : vstorage->LevelFiles(level)) { + if (static_cast(file_meta->temperature) == temperature) { + size += file_meta->fd.GetFileSize(); + } + } + } + + *value = ToString(size); + return true; +} + +bool InternalStats::HandleNumBlobFiles(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + const auto* vstorage = cfd_->current()->storage_info(); + const auto& blob_files = vstorage->GetBlobFiles(); + *value = blob_files.size(); + return true; +} + +bool InternalStats::HandleBlobStats(std::string* value, Slice /*suffix*/) { + std::ostringstream oss; + auto* current_version = cfd_->current(); + const auto& blob_files = current_version->storage_info()->GetBlobFiles(); + uint64_t current_num_blob_files = blob_files.size(); + uint64_t current_file_size = 0; + uint64_t current_garbage_size = 0; + for (const auto& pair : blob_files) { + const auto& meta = pair.second; + current_file_size += meta->GetBlobFileSize(); + current_garbage_size += meta->GetGarbageBlobBytes(); + } + oss << "Number of blob files: " << current_num_blob_files + << "\nTotal size of blob files: " << current_file_size + << "\nTotal size of garbage in blob files: " << current_garbage_size + << '\n'; + value->append(oss.str()); + return true; +} + +bool InternalStats::HandleTotalBlobFileSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetTotalBlobFileSize(); + return true; +} + +bool InternalStats::HandleLiveBlobFileSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->GetTotalBlobFileSize(); + return true; +} + const DBPropertyInfo* GetPropertyInfo(const Slice& property) { std::string ppt_name = GetPropertyNameAndArg(property).first.ToString(); auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name); @@ -507,11 +815,12 @@ } bool 
InternalStats::GetMapProperty(const DBPropertyInfo& property_info, - const Slice& /*property*/, + const Slice& property, std::map* value) { assert(value != nullptr); assert(property_info.handle_map != nullptr); - return (this->*(property_info.handle_map))(value); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_map))(value, arg); } bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info, @@ -587,7 +896,7 @@ } bool InternalStats::HandleCFMapStats( - std::map* cf_stats) { + std::map* cf_stats, Slice /*suffix*/) { DumpCFMapStats(cf_stats); return true; } @@ -609,6 +918,12 @@ return true; } +bool InternalStats::HandleDBMapStats( + std::map* db_stats, Slice /*suffix*/) { + DumpDBMapStats(db_stats); + return true; +} + bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) { DumpDBStats(value); return true; @@ -631,7 +946,27 @@ return true; } -bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, +static std::map MapUint64ValuesToString( + const std::map& from) { + std::map to; + for (const auto& e : from) { + to[e.first] = ToString(e.second); + } + return to; +} + +bool InternalStats::HandleAggregatedTablePropertiesMap( + std::map* values, Slice /*suffix*/) { + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, Slice suffix) { uint64_t level; bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); @@ -644,7 +979,24 @@ if (!s.ok()) { return false; } - *value = tp->ToString(); + *values = tp->ToString(); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || 
static_cast(level) >= number_levels_) { + return false; + } + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties( + &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); return true; } @@ -698,21 +1050,24 @@ bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable - *value = cfd_->mem()->ApproximateMemoryUsage(); + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast(); return true; } bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable + immutable memtables - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); return true; } bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateMemoryUsage(); return true; } @@ -798,7 +1153,7 @@ bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, Version* /*version*/) { - *value = db->IsFileDeletionsEnabled(); + *value = db->IsFileDeletionsEnabled() ? 
1 : 0; return true; } @@ -903,29 +1258,19 @@ return *value > 0 && *value < std::numeric_limits::max(); } -bool InternalStats::HandleBlockCacheStat(Cache** block_cache) { +bool InternalStats::GetBlockCacheForStats(Cache** block_cache) { assert(block_cache != nullptr); - auto* table_factory = cfd_->ioptions()->table_factory; + auto* table_factory = cfd_->ioptions()->table_factory.get(); assert(table_factory != nullptr); - if (BlockBasedTableFactory::kName != table_factory->Name()) { - return false; - } - auto* table_options = - reinterpret_cast(table_factory->GetOptions()); - if (table_options == nullptr) { - return false; - } - *block_cache = table_options->block_cache.get(); - if (table_options->no_block_cache || *block_cache == nullptr) { - return false; - } - return true; + *block_cache = + table_factory->GetOptions(TableFactory::kBlockCacheOpts()); + return *block_cache != nullptr; } bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { Cache* block_cache; - bool ok = HandleBlockCacheStat(&block_cache); + bool ok = GetBlockCacheForStats(&block_cache); if (!ok) { return false; } @@ -936,7 +1281,7 @@ bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { Cache* block_cache; - bool ok = HandleBlockCacheStat(&block_cache); + bool ok = GetBlockCacheForStats(&block_cache); if (!ok) { return false; } @@ -947,7 +1292,7 @@ bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { Cache* block_cache; - bool ok = HandleBlockCacheStat(&block_cache); + bool ok = GetBlockCacheForStats(&block_cache); if (!ok) { return false; } @@ -955,10 +1300,21 @@ return true; } +void InternalStats::DumpDBMapStats( + std::map* db_stats) { + for (int i = 0; i < static_cast(kIntStatsNumMax); ++i) { + InternalDBStatsType type = static_cast(i); + (*db_stats)[db_stats_type_to_info.at(type).property_name] = + std::to_string(GetDBStats(type)); + } + double 
seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec; + (*db_stats)["db.uptime"] = std::to_string(seconds_up); +} + void InternalStats::DumpDBStats(std::string* value) { char buf[1000]; // DB-level stats, only available from default column family - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec; double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n", @@ -995,8 +1351,10 @@ NumberToHumanString(write_other + write_self).c_str(), NumberToHumanString(num_keys_written).c_str(), NumberToHumanString(write_self).c_str(), - (write_other + write_self) / static_cast(write_self + 1), - user_bytes_written / kGB, user_bytes_written / kMB / seconds_up); + (write_other + write_self) / + std::max(1.0, static_cast(write_self)), + user_bytes_written / kGB, + user_bytes_written / kMB / std::max(seconds_up, 0.001)); value->append(buf); // WAL snprintf(buf, sizeof(buf), @@ -1004,8 +1362,8 @@ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", NumberToHumanString(write_with_wal).c_str(), NumberToHumanString(wal_synced).c_str(), - write_with_wal / static_cast(wal_synced + 1), - wal_bytes / kGB, wal_bytes / kMB / seconds_up); + write_with_wal / std::max(1.0, static_cast(wal_synced)), + wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001)); value->append(buf); // Stall AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true); @@ -1028,7 +1386,7 @@ NumberToHumanString(interval_num_keys_written).c_str(), NumberToHumanString(interval_write_self).c_str(), static_cast(interval_write_other + interval_write_self) / - (interval_write_self + 1), + std::max(1.0, static_cast(interval_write_self)), (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB, (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB / std::max(interval_seconds_up, 0.001)), 
@@ -1039,15 +1397,15 @@ uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced; uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes; - snprintf( - buf, sizeof(buf), - "Interval WAL: %s writes, %s syncs, " - "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n", - NumberToHumanString(interval_write_with_wal).c_str(), - NumberToHumanString(interval_wal_synced).c_str(), - interval_write_with_wal / static_cast(interval_wal_synced + 1), - interval_wal_bytes / kGB, - interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); + snprintf(buf, sizeof(buf), + "Interval WAL: %s writes, %s syncs, " + "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", + NumberToHumanString(interval_write_with_wal).c_str(), + NumberToHumanString(interval_wal_synced).c_str(), + interval_write_with_wal / + std::max(1.0, static_cast(interval_wal_synced)), + interval_wal_bytes / kGB, + interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); value->append(buf); // Stall @@ -1080,9 +1438,10 @@ */ void InternalStats::DumpCFMapStats( std::map* cf_stats) { + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); CompactionStats compaction_stats_sum; std::map> levels_stats; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (auto const& level_ent : levels_stats) { auto level_str = level_ent.first == -1 ? 
"Sum" : "L" + ToString(level_ent.first); @@ -1099,9 +1458,10 @@ } void InternalStats::DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* levels_stats, CompactionStats* compaction_stats_sum) { - const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); + assert(vstorage); int num_levels_to_check = (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) @@ -1142,12 +1502,14 @@ if (level == 0) { input_bytes = curr_ingest; } else { - input_bytes = comp_stats_[level].bytes_read_non_output_levels; + input_bytes = comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; } double w_amp = (input_bytes == 0) ? 0.0 - : static_cast(comp_stats_[level].bytes_written) / + : static_cast(comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob) / input_bytes; std::map level_stats; PrepareLevelStats(&level_stats, files, files_being_compacted[level], @@ -1157,8 +1519,11 @@ } } // Cumulative summary - double w_amp = compaction_stats_sum->bytes_written / - static_cast(curr_ingest + 1); + double w_amp = (0 == curr_ingest) + ? 
0.0 + : (compaction_stats_sum->bytes_written + + compaction_stats_sum->bytes_written_blob) / + static_cast(curr_ingest); // Stats summary across levels std::map sum_stats; PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted, @@ -1224,9 +1589,10 @@ value->append(buf); // Print stats for each level + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); std::map> levels_stats; CompactionStats compaction_stats_sum; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (int l = 0; l < number_levels_; ++l) { if (levels_stats.find(l) != levels_stats.end()) { PrintLevelStats(buf, sizeof(buf), "L" + ToString(l), levels_stats[l]); @@ -1262,7 +1628,8 @@ CompactionStats interval_stats(compaction_stats_sum); interval_stats.Subtract(cf_stats_snapshot_.comp_stats); double w_amp = - interval_stats.bytes_written / static_cast(interval_ingest); + (interval_stats.bytes_written + interval_stats.bytes_written_blob) / + static_cast(interval_ingest); PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats); value->append(buf); @@ -1281,7 +1648,14 @@ } } - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + snprintf(buf, sizeof(buf), + "\nBlob file count: %" ROCKSDB_PRIszt ", total size: %.1f GB\n\n", + vstorage->GetBlobFiles().size(), + vstorage->GetTotalBlobFileSize() / kGB); + value->append(buf); + + uint64_t now_micros = clock_->NowMicros(); + double seconds_up = (now_micros - started_at_) / kMicrosInSec; double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", seconds_up, interval_seconds_up); @@ -1321,16 +1695,20 @@ uint64_t compact_micros = 0; for (int level = 0; level < number_levels_; level++) { compact_bytes_read += comp_stats_[level].bytes_read_output_level + - comp_stats_[level].bytes_read_non_output_levels; - compact_bytes_write += 
comp_stats_[level].bytes_written; + comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; + compact_bytes_write += comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob; compact_micros += comp_stats_[level].micros; } snprintf(buf, sizeof(buf), "Cumulative compaction: %.2f GB write, %.2f MB/s write, " "%.2f GB read, %.2f MB/s read, %.1f seconds\n", - compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up, - compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up, + compact_bytes_write / kGB, + compact_bytes_write / kMB / std::max(seconds_up, 0.001), + compact_bytes_read / kGB, + compact_bytes_read / kMB / std::max(seconds_up, 0.001), compact_micros / kMicrosInSec); value->append(buf); @@ -1393,24 +1771,45 @@ cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile; cf_stats_snapshot_.comp_stats = compaction_stats_sum; cf_stats_snapshot_.stall_count = total_stall_count; + + // Do not gather cache entry stats during CFStats because DB + // mutex is held. 
Only dump last cached collection (rely on DB + // periodic stats dump to update) + if (cache_entry_stats_collector_) { + CacheEntryRoleStats stats; + // thread safe + cache_entry_stats_collector_->GetStats(&stats); + + constexpr uint64_t kDayInMicros = uint64_t{86400} * 1000000U; + + // Skip if stats are extremely old (> 1 day, incl not yet populated) + if (now_micros - stats.last_end_time_micros_ < kDayInMicros) { + value->append(stats.ToString(clock_)); + } + } } void InternalStats::DumpCFFileHistogram(std::string* value) { - char buf[2000]; - snprintf(buf, sizeof(buf), - "\n** File Read Latency Histogram By Level [%s] **\n", - cfd_->GetName().c_str()); - value->append(buf); + assert(value); + assert(cfd_); + + std::ostringstream oss; + oss << "\n** File Read Latency Histogram By Level [" << cfd_->GetName() + << "] **\n"; for (int level = 0; level < number_levels_; level++) { if (!file_read_latency_[level].Empty()) { - char buf2[5000]; - snprintf(buf2, sizeof(buf2), - "** Level %d read latency histogram (micros):\n%s\n", level, - file_read_latency_[level].ToString().c_str()); - value->append(buf2); + oss << "** Level " << level << " read latency histogram (micros):\n" + << file_read_latency_[level].ToString() << '\n'; } } + + if (!blob_file_read_latency_.Empty()) { + oss << "** Blob file read latency histogram (micros):\n" + << blob_file_read_latency_.ToString() << '\n'; + } + + value->append(oss.str()); } #else diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,16 +9,22 @@ // #pragma once + #include +#include #include #include +#include "cache/cache_entry_roles.h" #include "db/version_set.h" +#include "rocksdb/system_clock.h" class ColumnFamilyData; namespace ROCKSDB_NAMESPACE { 
+template +class CacheEntryStatsCollector; class DBImpl; class MemTableList; @@ -44,7 +50,9 @@ Version* version); // @param props Map of general properties to populate - bool (InternalStats::*handle_map)(std::map* props); + // @param suffix Argument portion of the property. (see handle_string) + bool (InternalStats::*handle_map)(std::map* props, + Slice suffix); // handle the string type properties rely on DBImpl methods // @param value Value-result argument for storing the property's string value @@ -76,6 +84,8 @@ AVG_SEC, KEY_IN, KEY_DROP, + R_BLOB_GB, + W_BLOB_GB, TOTAL // total number of types }; @@ -86,6 +96,11 @@ std::string header_name; }; +struct DBStatInfo { + // This what will be property_name in the flat map returned to the user + std::string property_name; +}; + class InternalStats { public: static const std::map compaction_level_stats; @@ -120,18 +135,9 @@ kIntStatsNumMax, }; - InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) - : db_stats_{}, - cf_stats_value_{}, - cf_stats_count_{}, - comp_stats_(num_levels), - comp_stats_by_pri_(Env::Priority::TOTAL), - file_read_latency_(num_levels), - bg_error_count_(0), - number_levels_(num_levels), - env_(env), - cfd_(cfd), - started_at_(env->NowMicros()) {} + static const std::map db_stats_type_to_info; + + InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd); // Per level compaction stats. comp_stats_[level] stores the stats for // compactions that produced data for the specified "level". @@ -139,32 +145,42 @@ uint64_t micros; uint64_t cpu_micros; - // The number of bytes read from all non-output levels + // The number of bytes read from all non-output levels (table files) uint64_t bytes_read_non_output_levels; - // The number of bytes read from the compaction output level. 
+ // The number of bytes read from the compaction output level (table files) uint64_t bytes_read_output_level; - // Total number of bytes written during compaction + // The number of bytes read from blob files + uint64_t bytes_read_blob; + + // Total number of bytes written to table files during compaction uint64_t bytes_written; - // Total number of bytes moved to the output level + // Total number of bytes written to blob files during compaction + uint64_t bytes_written_blob; + + // Total number of bytes moved to the output level (table files) uint64_t bytes_moved; - // The number of compaction input files in all non-output levels. + // The number of compaction input files in all non-output levels (table + // files) int num_input_files_in_non_output_levels; - // The number of compaction input files in the output level. + // The number of compaction input files in the output level (table files) int num_input_files_in_output_level; - // The number of compaction output files. + // The number of compaction output files (table files) int num_output_files; + // The number of compaction output files (blob files) + int num_output_files_blob; + // Total incoming entries during compaction between levels N and N+1 uint64_t num_input_records; // Accumulated diff number of entries - // (num input entries - num output entires) for compaction levels N and N+1 + // (num input entries - num output entries) for compaction levels N and N+1 uint64_t num_dropped_records; // Number of compactions done @@ -178,11 +194,14 @@ cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(0) { @@ -197,11 +216,14 @@ cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), 
bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(c) { @@ -222,12 +244,15 @@ cpu_micros(c.cpu_micros), bytes_read_non_output_levels(c.bytes_read_non_output_levels), bytes_read_output_level(c.bytes_read_output_level), + bytes_read_blob(c.bytes_read_blob), bytes_written(c.bytes_written), + bytes_written_blob(c.bytes_written_blob), bytes_moved(c.bytes_moved), num_input_files_in_non_output_levels( c.num_input_files_in_non_output_levels), num_input_files_in_output_level(c.num_input_files_in_output_level), num_output_files(c.num_output_files), + num_output_files_blob(c.num_output_files_blob), num_input_records(c.num_input_records), num_dropped_records(c.num_dropped_records), count(c.count) { @@ -242,12 +267,15 @@ cpu_micros = c.cpu_micros; bytes_read_non_output_levels = c.bytes_read_non_output_levels; bytes_read_output_level = c.bytes_read_output_level; + bytes_read_blob = c.bytes_read_blob; bytes_written = c.bytes_written; + bytes_written_blob = c.bytes_written_blob; bytes_moved = c.bytes_moved; num_input_files_in_non_output_levels = c.num_input_files_in_non_output_levels; num_input_files_in_output_level = c.num_input_files_in_output_level; num_output_files = c.num_output_files; + num_output_files_blob = c.num_output_files_blob; num_input_records = c.num_input_records; num_dropped_records = c.num_dropped_records; count = c.count; @@ -264,11 +292,14 @@ this->cpu_micros = 0; this->bytes_read_non_output_levels = 0; this->bytes_read_output_level = 0; + this->bytes_read_blob = 0; this->bytes_written = 0; + this->bytes_written_blob = 0; this->bytes_moved = 0; this->num_input_files_in_non_output_levels = 0; this->num_input_files_in_output_level = 0; this->num_output_files = 0; + this->num_output_files_blob = 0; this->num_input_records = 0; this->num_dropped_records = 0; this->count = 0; @@ 
-283,13 +314,16 @@ this->cpu_micros += c.cpu_micros; this->bytes_read_non_output_levels += c.bytes_read_non_output_levels; this->bytes_read_output_level += c.bytes_read_output_level; + this->bytes_read_blob += c.bytes_read_blob; this->bytes_written += c.bytes_written; + this->bytes_written_blob += c.bytes_written_blob; this->bytes_moved += c.bytes_moved; this->num_input_files_in_non_output_levels += c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level += c.num_input_files_in_output_level; this->num_output_files += c.num_output_files; + this->num_output_files_blob += c.num_output_files_blob; this->num_input_records += c.num_input_records; this->num_dropped_records += c.num_dropped_records; this->count += c.count; @@ -304,13 +338,16 @@ this->cpu_micros -= c.cpu_micros; this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels; this->bytes_read_output_level -= c.bytes_read_output_level; + this->bytes_read_blob -= c.bytes_read_blob; this->bytes_written -= c.bytes_written; + this->bytes_written_blob -= c.bytes_written_blob; this->bytes_moved -= c.bytes_moved; this->num_input_files_in_non_output_levels -= c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level -= c.num_input_files_in_output_level; this->num_output_files -= c.num_output_files; + this->num_output_files_blob -= c.num_output_files_blob; this->num_input_records -= c.num_input_records; this->num_dropped_records -= c.num_dropped_records; this->count -= c.count; @@ -321,6 +358,39 @@ } }; + // For use with CacheEntryStatsCollector + struct CacheEntryRoleStats { + uint64_t cache_capacity = 0; + std::string cache_id; + std::array total_charges; + std::array entry_counts; + uint32_t collection_count = 0; + uint32_t copies_of_last_collection = 0; + uint64_t last_start_time_micros_ = 0; + uint64_t last_end_time_micros_ = 0; + + void Clear() { + // Wipe everything except collection_count + uint32_t saved_collection_count = collection_count; + *this = 
CacheEntryRoleStats(); + collection_count = saved_collection_count; + } + + void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); + std::function + GetEntryCallback(); + void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); + void SkippedCollection(); + + std::string ToString(SystemClock* clock) const; + void ToMap(std::map* values, + SystemClock* clock) const; + + private: + std::unordered_map role_map_; + uint64_t GetLastDurationMicros() const; + }; + void Clear() { for (int i = 0; i < kIntStatsNumMax; i++) { db_stats_[i].store(0); @@ -335,10 +405,11 @@ for (auto& h : file_read_latency_) { h.Clear(); } + blob_file_read_latency_.Clear(); cf_stats_snapshot_.Clear(); db_stats_snapshot_.Clear(); bg_error_count_ = 0; - started_at_ = env_->NowMicros(); + started_at_ = clock_->NowMicros(); } void AddCompactionStats(int level, Env::Priority thread_pri, @@ -375,6 +446,8 @@ return &file_read_latency_[level]; } + HistogramImpl* GetBlobFileReadHist() { return &blob_file_read_latency_; } + uint64_t GetBackgroundErrorCount() const { return bg_error_count_; } uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } @@ -392,18 +465,31 @@ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info, Version* version, uint64_t* value); + // Unless there is a recent enough collection of the stats, collect and + // saved new cache entry stats. If `foreground`, require data to be more + // recent to skip re-collection. + // + // This should only be called while NOT holding the DB mutex. + void CollectCacheEntryStats(bool foreground); + + const uint64_t* TEST_GetCFStatsValue() const { return cf_stats_value_; } + const std::vector& TEST_GetCompactionStats() const { return comp_stats_; } + void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground); + // Store a mapping from the user-facing DB::Properties string to our // DBPropertyInfo struct used internally for retrieving properties. 
static const std::unordered_map ppt_name_to_info; private: + void DumpDBMapStats(std::map* db_stats); void DumpDBStats(std::string* value); void DumpCFMapStats(std::map* cf_stats); void DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* level_stats, CompactionStats* compaction_stats_sum); void DumpCFMapStatsByPriority( @@ -413,17 +499,25 @@ void DumpCFStatsNoFileHistogram(std::string* value); void DumpCFFileHistogram(std::string* value); - bool HandleBlockCacheStat(Cache** block_cache); + bool GetBlockCacheForStats(Cache** block_cache); // Per-DB stats std::atomic db_stats_[kIntStatsNumMax]; // Per-ColumnFamily stats uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX]; uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX]; + // Initialize/reference the collector in constructor so that we don't need + // additional synchronization in InternalStats, relying on synchronization + // in CacheEntryStatsCollector::GetStats. This collector is pinned in cache + // (through a shared_ptr) so that it does not get immediately ejected from + // a full cache, which would force a re-scan on the next GetStats. 
+ std::shared_ptr> + cache_entry_stats_collector_; // Per-ColumnFamily/level compaction stats std::vector comp_stats_; std::vector comp_stats_by_pri_; std::vector file_read_latency_; + HistogramImpl blob_file_read_latency_; // Used to compute per-interval statistics struct CFStatsSnapshot { @@ -519,14 +613,21 @@ bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix); bool HandleLevelStats(std::string* value, Slice suffix); bool HandleStats(std::string* value, Slice suffix); - bool HandleCFMapStats(std::map* compaction_stats); + bool HandleCFMapStats(std::map* compaction_stats, + Slice suffix); bool HandleCFStats(std::string* value, Slice suffix); bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix); bool HandleCFFileHistogram(std::string* value, Slice suffix); + bool HandleDBMapStats(std::map* compaction_stats, + Slice suffix); bool HandleDBStats(std::string* value, Slice suffix); bool HandleSsTables(std::string* value, Slice suffix); bool HandleAggregatedTableProperties(std::string* value, Slice suffix); bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); + bool HandleAggregatedTablePropertiesMap( + std::map* values, Slice suffix); + bool HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix); bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, Version* version); bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db, @@ -581,6 +682,14 @@ bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version); bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlockCacheEntryStats(std::string* value, Slice suffix); + bool HandleBlockCacheEntryStatsMap(std::map* values, + Slice suffix); + bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix); + bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlobStats(std::string* value, Slice suffix); + bool 
HandleTotalBlobFileSize(uint64_t* value, DBImpl* db, Version* version); + bool HandleLiveBlobFileSize(uint64_t* value, DBImpl* db, Version* version); // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. The failure can // be caused by any possible reason, including file system errors, out of @@ -589,7 +698,7 @@ uint64_t bg_error_count_; const int number_levels_; - Env* env_; + SystemClock* clock_; ColumnFamilyData* cfd_; uint64_t started_at_; }; @@ -628,18 +737,22 @@ kIntStatsNumMax, }; - InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {} + InternalStats(int /*num_levels*/, SystemClock* /*clock*/, + ColumnFamilyData* /*cfd*/) {} struct CompactionStats { uint64_t micros; uint64_t cpu_micros; uint64_t bytes_read_non_output_levels; uint64_t bytes_read_output_level; + uint64_t bytes_read_blob; uint64_t bytes_written; + uint64_t bytes_written_blob; uint64_t bytes_moved; int num_input_files_in_non_output_levels; int num_input_files_in_output_level; int num_output_files; + int num_output_files_blob; uint64_t num_input_records; uint64_t num_dropped_records; int count; @@ -667,6 +780,8 @@ HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; } + HistogramImpl* GetBlobFileReadHist() { return nullptr; } + uint64_t GetBackgroundErrorCount() const { return 0; } uint64_t BumpAndGetBackgroundErrorCount() { return 0; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/job_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/job_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/job_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/job_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,8 +12,9 @@ #include #include -#include "db/log_writer.h" #include "db/column_family.h" +#include "db/log_writer.h" +#include "db/version_set.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +24,7 @@ struct SuperVersionContext { struct 
WriteStallNotification { WriteStallInfo write_stall_info; - const ImmutableCFOptions* immutable_cf_options; + const ImmutableOptions* immutable_options; }; autovector superversions_to_free; @@ -57,15 +58,16 @@ #endif } - void PushWriteStallNotification( - WriteStallCondition old_cond, WriteStallCondition new_cond, - const std::string& name, const ImmutableCFOptions* ioptions) { + void PushWriteStallNotification(WriteStallCondition old_cond, + WriteStallCondition new_cond, + const std::string& name, + const ImmutableOptions* ioptions) { #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) WriteStallNotification notif; notif.write_stall_info.cf_name = name; notif.write_stall_info.condition.prev = old_cond; notif.write_stall_info.condition.cur = new_cond; - notif.immutable_cf_options = ioptions; + notif.immutable_options = ioptions; write_stall_notifications.push_back(notif); #else (void)old_cond; @@ -79,7 +81,7 @@ #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) // notify listeners on changed write stall conditions for (auto& notif : write_stall_notifications) { - for (auto& listener : notif.immutable_cf_options->listeners) { + for (auto& listener : notif.immutable_options->listeners) { listener->OnStallConditionsChanged(notif.write_stall_info); } } @@ -102,8 +104,9 @@ struct JobContext { inline bool HaveSomethingToDelete() const { - return full_scan_candidate_files.size() || sst_delete_files.size() || - log_delete_files.size() || manifest_delete_files.size(); + return !(full_scan_candidate_files.empty() && sst_delete_files.empty() && + blob_delete_files.empty() && log_delete_files.empty() && + manifest_delete_files.empty()); } inline bool HaveSomethingToClean() const { @@ -115,7 +118,7 @@ } } return memtables_to_free.size() > 0 || logs_to_free.size() > 0 || - sv_have_sth; + job_snapshot != nullptr || sv_have_sth; } // Structure to store information for candidate files to delete. 
@@ -140,11 +143,17 @@ std::vector full_scan_candidate_files; // the list of all live sst files that cannot be deleted - std::vector sst_live; + std::vector sst_live; - // a list of sst files that we need to delete + // the list of sst files that we need to delete std::vector sst_delete_files; + // the list of all live blob files that cannot be deleted + std::vector blob_live; + + // the list of blob files that we need to delete + std::vector blob_delete_files; + // a list of log files that we need to delete std::vector log_delete_files; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/kv_checksum.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/kv_checksum.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/kv_checksum.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/kv_checksum.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,394 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file contains classes containing fields to protect individual entries. +// The classes are named "ProtectionInfo", where indicates the +// combination of fields that are covered. Each field has a single letter +// abbreviation as follows. +// +// K = key +// V = value +// O = optype aka value type +// S = seqno +// C = CF ID +// +// Then, for example, a class that protects an entry consisting of key, value, +// optype, and CF ID (i.e., a `WriteBatch` entry) would be named +// `ProtectionInfoKVOC`. +// +// The `ProtectionInfo.*` classes are templated on the integer type used to hold +// the XOR of hashes for each field. Only unsigned integer types are supported, +// and the maximum supported integer width is 64 bits. 
When the integer type is +// narrower than the hash values, we lop off the most significant bits to make +// them fit. +// +// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do +// not currently make the byte order consistent for integer fields before +// hashing them, so the resulting values are endianness-dependent. + +#pragma once + +#include + +#include "db/dbformat.h" +#include "rocksdb/types.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +template +class ProtectionInfo; +template +class ProtectionInfoKVO; +template +class ProtectionInfoKVOC; +template +class ProtectionInfoKVOS; + +// Aliases for 64-bit protection infos. +using ProtectionInfo64 = ProtectionInfo; +using ProtectionInfoKVO64 = ProtectionInfoKVO; +using ProtectionInfoKVOC64 = ProtectionInfoKVOC; +using ProtectionInfoKVOS64 = ProtectionInfoKVOS; + +template +class ProtectionInfo { + public: + ProtectionInfo() = default; + + Status GetStatus() const; + ProtectionInfoKVO ProtectKVO(const Slice& key, const Slice& value, + ValueType op_type) const; + ProtectionInfoKVO ProtectKVO(const SliceParts& key, + const SliceParts& value, + ValueType op_type) const; + + private: + friend class ProtectionInfoKVO; + friend class ProtectionInfoKVOS; + friend class ProtectionInfoKVOC; + + // Each field is hashed with an independent value so we can catch fields being + // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall, + // and we should instead vary our seeds by a large odd number. This value by + // which we increment (0xD28AAD72F49BD50B) was taken from + // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an odd + // number. The values are computed manually since the Windows C++ compiler + // complains about the overflow when adding constants. 
+ static const uint64_t kSeedK = 0; + static const uint64_t kSeedV = 0xD28AAD72F49BD50B; + static const uint64_t kSeedO = 0xA5155AE5E937AA16; + static const uint64_t kSeedS = 0x77A00858DDD37F21; + static const uint64_t kSeedC = 0x4A2AB5CBD26F542C; + + ProtectionInfo(T val) : val_(val) { + static_assert(sizeof(ProtectionInfo) == sizeof(T), ""); + } + + T GetVal() const { return val_; } + void SetVal(T val) { val_ = val; } + + T val_ = 0; +}; + +template +class ProtectionInfoKVO { + public: + ProtectionInfoKVO() = default; + + ProtectionInfo StripKVO(const Slice& key, const Slice& value, + ValueType op_type) const; + ProtectionInfo StripKVO(const SliceParts& key, const SliceParts& value, + ValueType op_type) const; + + ProtectionInfoKVOC ProtectC(ColumnFamilyId column_family_id) const; + ProtectionInfoKVOS ProtectS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key); + void UpdateK(const SliceParts& old_key, const SliceParts& new_key); + void UpdateV(const Slice& old_value, const Slice& new_value); + void UpdateV(const SliceParts& old_value, const SliceParts& new_value); + void UpdateO(ValueType old_op_type, ValueType new_op_type); + + private: + friend class ProtectionInfo; + friend class ProtectionInfoKVOS; + friend class ProtectionInfoKVOC; + + explicit ProtectionInfoKVO(T val) : info_(val) { + static_assert(sizeof(ProtectionInfoKVO) == sizeof(T), ""); + } + + T GetVal() const { return info_.GetVal(); } + void SetVal(T val) { info_.SetVal(val); } + + ProtectionInfo info_; +}; + +template +class ProtectionInfoKVOC { + public: + ProtectionInfoKVOC() = default; + + ProtectionInfoKVO StripC(ColumnFamilyId column_family_id) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + 
kvo_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvo_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType old_op_type, ValueType new_op_type) { + kvo_.UpdateO(old_op_type, new_op_type); + } + void UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id); + + private: + friend class ProtectionInfoKVO; + + explicit ProtectionInfoKVOC(T val) : kvo_(val) { + static_assert(sizeof(ProtectionInfoKVOC) == sizeof(T), ""); + } + + T GetVal() const { return kvo_.GetVal(); } + void SetVal(T val) { kvo_.SetVal(val); } + + ProtectionInfoKVO kvo_; +}; + +template +class ProtectionInfoKVOS { + public: + ProtectionInfoKVOS() = default; + + ProtectionInfoKVO StripS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + kvo_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvo_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType old_op_type, ValueType new_op_type) { + kvo_.UpdateO(old_op_type, new_op_type); + } + void UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number); + + private: + friend class ProtectionInfoKVO; + + explicit ProtectionInfoKVOS(T val) : kvo_(val) { + static_assert(sizeof(ProtectionInfoKVOS) == sizeof(T), ""); + } + + T GetVal() const { return kvo_.GetVal(); } + void SetVal(T val) { kvo_.SetVal(val); } + + ProtectionInfoKVO kvo_; +}; + +template +Status ProtectionInfo::GetStatus() const { + if (val_ != 0) { + return Status::Corruption("ProtectionInfo mismatch"); + } + return Status::OK(); +} + +template +ProtectionInfoKVO ProtectionInfo::ProtectKVO(const Slice& key, + const Slice& value, + ValueType op_type) 
const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfoKVO(val); +} + +template +ProtectionInfoKVO ProtectionInfo::ProtectKVO(const SliceParts& key, + const SliceParts& value, + ValueType op_type) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfoKVO(val); +} + +template +void ProtectionInfoKVO::UpdateK(const Slice& old_key, const Slice& new_key) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSliceNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateK(const SliceParts& old_key, + const SliceParts& new_key) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateV(const Slice& old_value, + const Slice& new_value) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(GetSliceNPHash64(new_value, ProtectionInfo::kSeedV)); + SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateV(const SliceParts& old_value, + const SliceParts& new_value) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_value, ProtectionInfo::kSeedV)); + 
SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateO(ValueType old_op_type, + ValueType new_op_type) { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&old_op_type), + sizeof(old_op_type), + ProtectionInfo::kSeedO)); + val = val ^ static_cast(NPHash64(reinterpret_cast(&new_op_type), + sizeof(new_op_type), + ProtectionInfo::kSeedO)); + SetVal(val); +} + +template +ProtectionInfo ProtectionInfoKVO::StripKVO(const Slice& key, + const Slice& value, + ValueType op_type) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfo(val); +} + +template +ProtectionInfo ProtectionInfoKVO::StripKVO(const SliceParts& key, + const SliceParts& value, + ValueType op_type) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfo(val); +} + +template +ProtectionInfoKVOC ProtectionInfoKVO::ProtectC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVOC(val); +} + +template +ProtectionInfoKVO ProtectionInfoKVOC::StripC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVO(val); +} + +template +void ProtectionInfoKVOC::UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id) { + T val = 
GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_column_family_id), + sizeof(old_column_family_id), ProtectionInfo::kSeedC)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_column_family_id), + sizeof(new_column_family_id), ProtectionInfo::kSeedC)); + SetVal(val); +} + +template +ProtectionInfoKVOS ProtectionInfoKVO::ProtectS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVOS(val); +} + +template +ProtectionInfoKVO ProtectionInfoKVOS::StripS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVO(val); +} + +template +void ProtectionInfoKVOS::UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number) { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_sequence_number), + sizeof(old_sequence_number), ProtectionInfo::kSeedS)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_sequence_number), + sizeof(new_sequence_number), ProtectionInfo::kSeedS)); + SetVal(val); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/listener_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/listener_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/listener_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/listener_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,15 +3,13 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" -#include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -24,8 +22,6 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block_based/block_based_table_factory.h" -#include "table/plain/plain_table_factory.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -41,7 +37,7 @@ class EventListenerTest : public DBTestBase { public: - EventListenerTest() : DBTestBase("/listener_test") {} + EventListenerTest() : DBTestBase("listener_test", /*env_do_fsync=*/true) {} static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, uint64_t size) { @@ -195,10 +191,10 @@ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size()); @@ -214,6 +210,10 @@ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { db_closed = false; } + + virtual ~TestFlushListener() { + prev_fc_info_.status.PermitUncheckedError(); // Ignore the status + } void OnTableFileCreated( const TableFileCreationInfo& info) override { // remember the info for later checking the FlushJobInfo. 
@@ -227,6 +227,8 @@ ASSERT_GT(info.table_properties.raw_value_size, 0U); ASSERT_GT(info.table_properties.num_data_blocks, 0U); ASSERT_GT(info.table_properties.num_entries, 0U); + ASSERT_EQ(info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); #ifdef ROCKSDB_USING_THREAD_STATUS // Verify the id of the current thread that created this table @@ -272,6 +274,9 @@ ASSERT_TRUE(test_); if (db == test_->db_) { std::vector> files_by_level; + ASSERT_LT(info.cf_id, test_->handles_.size()); + ASSERT_GE(info.cf_id, 0u); + ASSERT_NE(test_->handles_[info.cf_id], nullptr); test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id], &files_by_level); @@ -334,7 +339,7 @@ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); } @@ -353,32 +358,39 @@ #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env, this); - options.listeners.emplace_back(listener); - options.table_properties_collector_factories.push_back( - std::make_shared()); - std::vector cf_names = { - "pikachu", "ilya", "muromec", "dobrynia", - "nikitich", "alyosha", "popovich"}; - CreateAndReopenWithCF(cf_names, options); - - ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); - ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); - ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); - ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); - ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); - ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); - ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); - for (int i = 1; i < 8; ++i) { - ASSERT_OK(Flush(i)); - 
ASSERT_EQ(listener->flushed_dbs_.size(), i); - ASSERT_EQ(listener->flushed_column_family_names_.size(), i); - } + for (auto atomic_flush : {false, true}) { + options.atomic_flush = atomic_flush; + options.create_if_missing = true; + DestroyAndReopen(options); + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); + options.table_properties_collector_factories.push_back( + std::make_shared()); + std::vector cf_names = {"pikachu", "ilya", "muromec", + "dobrynia", "nikitich", "alyosha", + "popovich"}; + CreateAndReopenWithCF(cf_names, options); + + ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); + for (int i = 1; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(listener->flushed_dbs_.size(), i); + ASSERT_EQ(listener->flushed_column_family_names_.size(), i); + } - // make sure callback functions are called in the right order - for (size_t i = 0; i < cf_names.size(); i++) { - ASSERT_EQ(listener->flushed_dbs_[i], db_); - ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + // make sure callback functions are called in the right order + for (size_t i = 0; i < cf_names.size(); i++) { + ASSERT_EQ(listener->flushed_dbs_[i], db_); + ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + } + Close(); } } @@ -418,7 +430,7 @@ ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db)); for (size_t c = 0; c < cf_names.size(); ++c) { ColumnFamilyHandle* handle; - db->CreateColumnFamily(cf_opts, cf_names[c], &handle); + ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle)); 
handles.push_back(handle); } @@ -436,7 +448,8 @@ for (size_t c = 0; c < cf_names.size(); ++c) { for (int d = 0; d < kNumDBs; ++d) { ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c])); - reinterpret_cast(dbs[d])->TEST_WaitForFlushMemTable(); + ASSERT_OK( + static_cast_with_check(dbs[d])->TEST_WaitForFlushMemTable()); } } @@ -495,13 +508,16 @@ // keep writing until writes are forced to stop. for (int i = 0; static_cast(cf_meta.file_count) < kSlowdownTrigger * 10; ++i) { - Put(1, ToString(i), std::string(10000, 'x'), WriteOptions()); + ASSERT_OK(Put(1, ToString(i), std::string(10000, 'x'), WriteOptions())); FlushOptions fo; fo.allow_write_stall = true; - db_->Flush(fo, handles_[1]); + ASSERT_OK(db_->Flush(fo, handles_[1])); db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); } ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); + // We don't want the listener executing during DBTestBase::Close() due to + // race on handles_. + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); } class TestCompactionReasonListener : public EventListener { @@ -519,8 +535,8 @@ Options options; options.env = CurrentOptions().env; options.create_if_missing = true; - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); TestCompactionReasonListener* listener = new TestCompactionReasonListener(); options.listeners.emplace_back(listener); @@ -535,7 +551,7 @@ for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(listener->compaction_reasons_.size(), 1); ASSERT_EQ(listener->compaction_reasons_[0], @@ -552,14 +568,14 @@ } // Do a trivial move from L0 -> L1 - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); 
options.max_bytes_for_level_base = 1; Close(); listener->compaction_reasons_.clear(); Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 1); for (auto compaction_reason : listener->compaction_reasons_) { @@ -571,7 +587,7 @@ listener->compaction_reasons_.clear(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -585,8 +601,8 @@ Options options; options.env = CurrentOptions().env; options.create_if_missing = true; - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); TestCompactionReasonListener* listener = new TestCompactionReasonListener(); options.listeners.emplace_back(listener); @@ -605,7 +621,7 @@ for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -623,7 +639,7 @@ for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -635,7 +651,7 @@ listener->compaction_reasons_.clear(); Reopen(options); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -647,8 +663,8 @@ Options options; options.env = CurrentOptions().env; 
options.create_if_missing = true; - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); TestCompactionReasonListener* listener = new TestCompactionReasonListener(); options.listeners.emplace_back(listener); @@ -664,7 +680,7 @@ for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -676,7 +692,9 @@ public: class TestEnv : public EnvWrapper { public: - TestEnv() : EnvWrapper(Env::Default()) {} + explicit TestEnv(Env* t) : EnvWrapper(t) {} + static const char* kClassName() { return "TestEnv"; } + const char* Name() const override { return kClassName(); } void SetStatus(Status s) { status_ = s; } @@ -688,7 +706,7 @@ return status_; } } - return Env::Default()->NewWritableFile(fname, result, options); + return target()->NewWritableFile(fname, result, options); } private: @@ -751,6 +769,8 @@ ASSERT_GT(info.cf_name.size(), 0U); ASSERT_GT(info.file_path.size(), 0U); ASSERT_GT(info.job_id, 0); + ASSERT_EQ(info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); if (info.status.ok()) { ASSERT_GT(info.table_properties.data_size, 0U); ASSERT_GT(info.table_properties.raw_key_size, 0U); @@ -760,57 +780,72 @@ } else { if (idx >= 0) { failure_[idx]++; + last_failure_ = info.status; } } } - TestEnv test_env; int started_[2]; int finished_[2]; int failure_[2]; + Status last_failure_; }; TEST_F(EventListenerTest, TableFileCreationListenersTest) { auto listener = std::make_shared(); Options options; + std::unique_ptr test_env( + new TableFileCreationListener::TestEnv(CurrentOptions().env)); options.create_if_missing = true; 
options.listeners.push_back(listener); - options.env = &listener->test_env; + options.env = test_env.get(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); ASSERT_OK(Put("bar", "bbb")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); - ASSERT_OK(Put("foo", "aaa1")); ASSERT_OK(Put("bar", "bbb1")); - listener->test_env.SetStatus(Status::NotSupported("not supported")); + test_env->SetStatus(Status::NotSupported("not supported")); ASSERT_NOK(Flush()); listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); - listener->test_env.SetStatus(Status::OK()); + ASSERT_TRUE(listener->last_failure_.IsNotSupported()); + test_env->SetStatus(Status::OK()); Reopen(options); ASSERT_OK(Put("foo", "aaa2")); ASSERT_OK(Put("bar", "bbb2")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); const Slice kRangeStart = "a"; const Slice kRangeEnd = "z"; - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0); + // Verify that an empty table file that is immediately deleted gives Aborted + // status to listener. 
+ ASSERT_OK(Put("baz", "z")); + ASSERT_OK(SingleDelete("baz")); + ASSERT_OK(Flush()); + listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); + ASSERT_TRUE(listener->last_failure_.IsAborted()); + ASSERT_OK(Put("foo", "aaa3")); ASSERT_OK(Put("bar", "bbb3")); ASSERT_OK(Flush()); - listener->test_env.SetStatus(Status::NotSupported("not supported")); - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + test_env->SetStatus(Status::NotSupported("not supported")); + ASSERT_NOK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1); + ASSERT_TRUE(listener->last_failure_.IsNotSupported()); + Close(); } class MemTableSealedListener : public EventListener { @@ -831,6 +866,7 @@ TEST_F(EventListenerTest, MemTableSealedListenerTest) { auto listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.listeners.push_back(listener); DestroyAndReopen(options); @@ -895,7 +931,7 @@ // can succeed. 
*bg_error = Status::OK(); env_->drop_writes_.store(false, std::memory_order_release); - env_->no_slowdown_ = false; + env_->SetMockSleep(false); } ++counter_; } @@ -909,7 +945,7 @@ options.create_if_missing = true; options.env = env_; options.listeners.push_back(listener); - options.memtable_factory.reset(new SpecialSkipListFactory(1)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); options.paranoid_checks = true; DestroyAndReopen(options); @@ -921,7 +957,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); env_->drop_writes_.store(true, std::memory_order_release); - env_->no_slowdown_ = true; + env_->SetMockSleep(); ASSERT_OK(Put("key0", "val")); ASSERT_OK(Put("key1", "val")); @@ -940,7 +976,7 @@ options.env = env_; options.level0_file_num_compaction_trigger = 2; options.listeners.push_back(listener); - options.memtable_factory.reset(new SpecialSkipListFactory(2)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); options.paranoid_checks = true; DestroyAndReopen(options); @@ -955,7 +991,7 @@ ASSERT_EQ(2, NumTableFilesAtLevel(0)); env_->drop_writes_.store(true, std::memory_order_release); - env_->no_slowdown_ = true; + env_->SetMockSleep(); ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, listener->counter()); @@ -977,6 +1013,21 @@ file_reads_success_.store(0); file_writes_.store(0); file_writes_success_.store(0); + file_flushes_.store(0); + file_flushes_success_.store(0); + file_closes_.store(0); + file_closes_success_.store(0); + file_syncs_.store(0); + file_syncs_success_.store(0); + file_truncates_.store(0); + file_truncates_success_.store(0); + file_seq_reads_.store(0); + blob_file_reads_.store(0); + blob_file_writes_.store(0); + blob_file_flushes_.store(0); + blob_file_closes_.store(0); + blob_file_syncs_.store(0); + blob_file_truncates_.store(0); } void OnFileReadFinish(const FileOperationInfo& info) override { @@ 
-984,6 +1035,12 @@ if (info.status.ok()) { ++file_reads_success_; } + if (info.path.find("MANIFEST") != std::string::npos) { + ++file_seq_reads_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_reads_; + } ReportDuration(info); } @@ -992,6 +1049,53 @@ if (info.status.ok()) { ++file_writes_success_; } + if (EndsWith(info.path, ".blob")) { + ++blob_file_writes_; + } + ReportDuration(info); + } + + void OnFileFlushFinish(const FileOperationInfo& info) override { + ++file_flushes_; + if (info.status.ok()) { + ++file_flushes_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_flushes_; + } + ReportDuration(info); + } + + void OnFileCloseFinish(const FileOperationInfo& info) override { + ++file_closes_; + if (info.status.ok()) { + ++file_closes_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_closes_; + } + ReportDuration(info); + } + + void OnFileSyncFinish(const FileOperationInfo& info) override { + ++file_syncs_; + if (info.status.ok()) { + ++file_syncs_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_syncs_; + } + ReportDuration(info); + } + + void OnFileTruncateFinish(const FileOperationInfo& info) override { + ++file_truncates_; + if (info.status.ok()) { + ++file_truncates_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_truncates_; + } ReportDuration(info); } @@ -1001,12 +1105,25 @@ std::atomic file_reads_success_; std::atomic file_writes_; std::atomic file_writes_success_; + std::atomic file_flushes_; + std::atomic file_flushes_success_; + std::atomic file_closes_; + std::atomic file_closes_success_; + std::atomic file_syncs_; + std::atomic file_syncs_success_; + std::atomic file_truncates_; + std::atomic file_truncates_success_; + std::atomic file_seq_reads_; + std::atomic blob_file_reads_; + std::atomic blob_file_writes_; + std::atomic blob_file_flushes_; + std::atomic blob_file_closes_; + std::atomic blob_file_syncs_; + std::atomic blob_file_truncates_; private: void ReportDuration(const 
FileOperationInfo& info) const { - auto duration = std::chrono::duration_cast( - info.finish_timestamp - info.start_timestamp); - ASSERT_GT(duration.count(), 0); + ASSERT_GT(info.duration.count(), 0); } }; @@ -1018,18 +1135,430 @@ TestFileOperationListener* listener = new TestFileOperationListener(); options.listeners.emplace_back(listener); + options.use_direct_io_for_flush_and_compaction = false; + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + options.use_direct_io_for_flush_and_compaction = false; + } else { + ASSERT_OK(s); + } DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); - dbfull()->Flush(FlushOptions()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_GE(listener->file_writes_.load(), listener->file_writes_success_.load()); ASSERT_GT(listener->file_writes_.load(), 0); + ASSERT_GE(listener->file_flushes_.load(), + listener->file_flushes_success_.load()); + ASSERT_GT(listener->file_flushes_.load(), 0); Close(); Reopen(options); ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load()); ASSERT_GT(listener->file_reads_.load(), 0); + ASSERT_GE(listener->file_closes_.load(), + listener->file_closes_success_.load()); + ASSERT_GT(listener->file_closes_.load(), 0); + ASSERT_GE(listener->file_syncs_.load(), listener->file_syncs_success_.load()); + ASSERT_GT(listener->file_syncs_.load(), 0); + if (true == options.use_direct_io_for_flush_and_compaction) { + ASSERT_GE(listener->file_truncates_.load(), + listener->file_truncates_success_.load()); + ASSERT_GT(listener->file_truncates_.load(), 0); + } +} + +TEST_F(EventListenerTest, OnBlobFileOperationTest) { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + TestFileOperationListener* listener = new TestFileOperationListener(); + options.listeners.emplace_back(listener); + options.disable_auto_compactions = true; + 
options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + ASSERT_GT(listener->blob_file_writes_.load(), 0U); + ASSERT_GT(listener->blob_file_flushes_.load(), 0U); + Close(); + + Reopen(options); + ASSERT_GT(listener->blob_file_closes_.load(), 0U); + ASSERT_GT(listener->blob_file_syncs_.load(), 0U); + if (true == options.use_direct_io_for_flush_and_compaction) { + ASSERT_GT(listener->blob_file_truncates_.load(), 0U); + } +} + +TEST_F(EventListenerTest, ReadManifestAndWALOnRecovery) { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + + TestFileOperationListener* listener = new TestFileOperationListener(); + options.listeners.emplace_back(listener); + + options.use_direct_io_for_flush_and_compaction = false; + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + options.use_direct_io_for_flush_and_compaction = false; + } else { + ASSERT_OK(s); + } + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "aaa")); + Close(); + + size_t seq_reads = listener->file_seq_reads_.load(); + Reopen(options); + ASSERT_GT(listener->file_seq_reads_.load(), seq_reads); +} + +class BlobDBJobLevelEventListenerTest : public EventListener { + public: + explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test) + : test_(test), call_count_(0) {} + + std::shared_ptr GetBlobFileMetaData( + const VersionStorageInfo::BlobFiles& blob_files, + uint64_t blob_file_number) { + const auto it = 
blob_files.find(blob_file_number); + + if (it == blob_files.end()) { + return nullptr; + } + + const auto& meta = it->second; + assert(meta); + + return meta; + } + + const VersionStorageInfo::BlobFiles& GetBlobFiles() { + VersionSet* const versions = test_->dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + EXPECT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + EXPECT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + EXPECT_NE(storage_info, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + return blob_files; + } + + std::vector GetFlushedFiles() { + std::lock_guard lock(mutex_); + std::vector result; + for (const auto& fname : flushed_files_) { + result.push_back(fname); + } + return result; + } + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + call_count_++; + EXPECT_FALSE(info.blob_file_addition_infos.empty()); + const auto& blob_files = GetBlobFiles(); + { + std::lock_guard lock(mutex_); + flushed_files_.push_back(info.file_path); + } + EXPECT_EQ(info.blob_compression_type, kNoCompression); + + for (const auto& blob_file_addition_info : info.blob_file_addition_infos) { + const auto meta = GetBlobFileMetaData( + blob_files, blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetBlobFileNumber(), + blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetTotalBlobBytes(), + blob_file_addition_info.total_blob_bytes); + EXPECT_EQ(meta->GetTotalBlobCount(), + blob_file_addition_info.total_blob_count); + EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty()); + } + } + + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + call_count_++; + EXPECT_FALSE(ci.blob_file_garbage_infos.empty()); + const auto& blob_files = GetBlobFiles(); + EXPECT_EQ(ci.blob_compression_type, kNoCompression); + + for (const auto& blob_file_addition_info 
: ci.blob_file_addition_infos) { + const auto meta = GetBlobFileMetaData( + blob_files, blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetBlobFileNumber(), + blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetTotalBlobBytes(), + blob_file_addition_info.total_blob_bytes); + EXPECT_EQ(meta->GetTotalBlobCount(), + blob_file_addition_info.total_blob_count); + EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty()); + } + + for (const auto& blob_file_garbage_info : ci.blob_file_garbage_infos) { + EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U); + EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty()); + } + } + + EventListenerTest* test_; + uint32_t call_count_; + + private: + std::vector flushed_files_; + std::mutex mutex_; +}; + +// Test OnFlushCompleted EventListener called for blob files +TEST_F(EventListenerTest, BlobDBOnFlushCompleted) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + + options.min_blob_size = 0; + BlobDBJobLevelEventListenerTest* blob_event_listener = + new BlobDBJobLevelEventListenerTest(this); + options.listeners.emplace_back(blob_event_listener); + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("Key1"), "blob_value1"); + ASSERT_EQ(Get("Key2"), "blob_value2"); + ASSERT_EQ(Get("Key3"), "blob_value3"); + + ASSERT_GT(blob_event_listener->call_count_, 0U); +} + +// Test OnCompactionCompleted EventListener called for blob files +TEST_F(EventListenerTest, BlobDBOnCompactionCompleted) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + 
options.create_if_missing = true; + options.disable_auto_compactions = true; + options.min_blob_size = 0; + BlobDBJobLevelEventListenerTest* blob_event_listener = + new BlobDBJobLevelEventListenerTest(this); + options.listeners.emplace_back(blob_event_listener); + + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + blob_event_listener->call_count_ = 0; + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + // On compaction, because of blob_garbage_collection_age_cutoff, it will + // delete the oldest blob file and create new blob file during compaction. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + // Make sure, OnCompactionCompleted is called. + ASSERT_GT(blob_event_listener->call_count_, 0U); +} + +// Test CompactFiles calls OnCompactionCompleted EventListener for blob files +// and populate the blob files info. 
+TEST_F(EventListenerTest, BlobDBCompactFiles) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + BlobDBJobLevelEventListenerTest* blob_event_listener = + new BlobDBJobLevelEventListenerTest(this); + options.listeners.emplace_back(blob_event_listener); + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + std::vector output_file_names; + CompactionJobInfo compaction_job_info; + + // On compaction, because of blob_garbage_collection_age_cutoff, it will + // delete the oldest blob file and create new blob file during compaction + // which will be populated in output_files_names. 
+ ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), blob_event_listener->GetFlushedFiles(), 1, -1, + &output_file_names, &compaction_job_info)); + + bool is_blob_in_output = false; + for (const auto& file : output_file_names) { + if (EndsWith(file, ".blob")) { + is_blob_in_output = true; + } + } + ASSERT_TRUE(is_blob_in_output); + + for (const auto& blob_file_addition_info : + compaction_job_info.blob_file_addition_infos) { + EXPECT_GT(blob_file_addition_info.blob_file_number, 0U); + EXPECT_GT(blob_file_addition_info.total_blob_bytes, 0U); + EXPECT_GT(blob_file_addition_info.total_blob_count, 0U); + EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty()); + } + + for (const auto& blob_file_garbage_info : + compaction_job_info.blob_file_garbage_infos) { + EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U); + EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty()); + } +} + +class BlobDBFileLevelEventListener : public EventListener { + public: + void OnBlobFileCreationStarted( + const BlobFileCreationBriefInfo& info) override { + files_started_++; + EXPECT_FALSE(info.db_name.empty()); + EXPECT_FALSE(info.cf_name.empty()); + EXPECT_FALSE(info.file_path.empty()); + EXPECT_GT(info.job_id, 0); + } + + void OnBlobFileCreated(const BlobFileCreationInfo& info) override { + files_created_++; + EXPECT_FALSE(info.db_name.empty()); + EXPECT_FALSE(info.cf_name.empty()); + EXPECT_FALSE(info.file_path.empty()); + EXPECT_GT(info.job_id, 0); + EXPECT_GT(info.total_blob_count, 0U); + EXPECT_GT(info.total_blob_bytes, 0U); + EXPECT_EQ(info.file_checksum, kUnknownFileChecksum); + EXPECT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); + EXPECT_TRUE(info.status.ok()); + } + + void OnBlobFileDeleted(const BlobFileDeletionInfo& info) override { + files_deleted_++; + EXPECT_FALSE(info.db_name.empty()); + 
EXPECT_FALSE(info.file_path.empty()); + EXPECT_GT(info.job_id, 0); + EXPECT_TRUE(info.status.ok()); + } + + void CheckCounters() { + EXPECT_EQ(files_started_, files_created_); + EXPECT_GT(files_started_, 0U); + EXPECT_GT(files_deleted_, 0U); + EXPECT_LT(files_deleted_, files_created_); + } + + private: + std::atomic files_started_{}; + std::atomic files_created_{}; + std::atomic files_deleted_{}; +}; + +TEST_F(EventListenerTest, BlobDBFileTest) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + BlobDBFileLevelEventListener* blob_event_listener = + new BlobDBFileLevelEventListener(); + options.listeners.emplace_back(blob_event_listener); + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + // On compaction, because of blob_garbage_collection_age_cutoff, it will + // delete the oldest blob file and create new blob file during compaction. 
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + blob_event_listener->CheckCounters(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,11 +11,11 @@ #include #include "file/sequence_file_reader.h" +#include "port/lang.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/util.h" namespace ROCKSDB_NAMESPACE { namespace log { @@ -119,16 +119,26 @@ break; case kBadHeader: - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. ReportCorruption(drop_size, "truncated header"); } FALLTHROUGH_INTENDED; case kEof: if (in_fragmented_record) { - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. 
Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -142,8 +152,13 @@ if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) { // Treat a record from a previous instance of the log as EOF. if (in_fragmented_record) { - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, + // which higher layers can choose to ignore when it's provable + // there is no hole. ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -164,6 +179,20 @@ break; case kBadRecordLen: + if (eof_) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. 
+ ReportCorruption(drop_size, "truncated record body"); + } + return false; + } + FALLTHROUGH_INTENDED; + case kBadRecordChecksum: if (recycled_ && wal_recovery_mode == @@ -202,6 +231,10 @@ return last_record_offset_; } +uint64_t Reader::LastRecordEnd() { + return end_of_buffer_offset_ - buffer_.size(); +} + void Reader::UnmarkEOF() { if (read_error_) { return; @@ -281,6 +314,7 @@ // Last read was a full read, so this is a trailer to skip buffer_.clear(); Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status); end_of_buffer_offset_ += buffer_.size(); if (!status.ok()) { buffer_.clear(); @@ -350,18 +384,14 @@ } } if (header_size + length > buffer_.size()) { + assert(buffer_.size() >= static_cast(header_size)); *drop_size = buffer_.size(); buffer_.clear(); - if (!eof_) { - return kBadRecordLen; - } - // If the end of the file has been reached without reading |length| - // bytes of payload, assume the writer died in the middle of writing the - // record. Don't report a corruption unless requested. - if (*drop_size) { - return kBadHeader; - } - return kEof; + // If the end of the read has been reached without seeing + // `header_size + length` bytes of payload, report a corruption. The + // higher layers can decide how to handle it based on the recovery mode, + // whether this occurred at EOF, whether this is the final WAL, etc. + return kBadRecordLen; } if (type == kZeroType && length == 0) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -49,7 +49,6 @@ // // If "checksum" is true, verify checksums if available. 
Reader(std::shared_ptr info_log, - // @lint-ignore TXT2 T25377293 Grandfathered in std::unique_ptr&& file, Reporter* reporter, bool checksum, uint64_t log_num); // No copying allowed @@ -72,6 +71,11 @@ // Undefined before the first call to ReadRecord. uint64_t LastRecordOffset(); + // Returns the first physical offset after the last record returned by + // ReadRecord, or zero before first call to ReadRecord. This can also be + // thought of as the "current" position in processing the file bytes. + uint64_t LastRecordEnd(); + // returns true if the reader has encountered an eof condition. bool IsEOF() { return eof_; @@ -159,7 +163,6 @@ class FragmentBufferedReader : public Reader { public: FragmentBufferedReader(std::shared_ptr info_log, - // @lint-ignore TXT2 T25377293 Grandfathered in std::unique_ptr&& _file, Reporter* reporter, bool checksum, uint64_t log_num) : Reader(info_log, std::move(_file), reporter, checksum, log_num), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,6 @@ #include "db/log_reader.h" #include "db/log_writer.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "rocksdb/env.h" @@ -50,7 +49,7 @@ // get<1>(tuple): true if allow retry after read EOF, false otherwise class LogTest : public ::testing::TestWithParam> { private: - class StringSource : public SequentialFile { + class StringSource : public FSSequentialFile { public: Slice& contents_; bool force_error_; @@ -68,7 +67,8 @@ returned_partial_(false), fail_after_read_partial_(fail_after_read_partial) {} - Status Read(size_t n, Slice* result, char* scratch) override { + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* 
scratch, IODebugContext* /*dbg*/) override { if (fail_after_read_partial_) { EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error"; } @@ -81,7 +81,7 @@ contents_.remove_prefix(force_error_position_); force_error_ = false; returned_partial_ = true; - return Status::Corruption("read error"); + return IOStatus::Corruption("read error"); } } @@ -106,28 +106,21 @@ *result = Slice(scratch, n); contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } - Status Skip(uint64_t n) override { + IOStatus Skip(uint64_t n) override { if (n > contents_.size()) { contents_.clear(); - return Status::NotFound("in-memory file skipepd past end"); + return IOStatus::NotFound("in-memory file skipepd past end"); } contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } }; - inline StringSource* GetStringSourceFromLegacyReader( - SequentialFileReader* reader) { - LegacySequentialFileWrapper* file = - static_cast(reader->file()); - return static_cast(file->target()); - } - class ReportCollector : public Reader::Reporter { public: size_t dropped_bytes_; @@ -140,29 +133,17 @@ } }; - std::string& dest_contents() { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } - - const std::string& dest_contents() const { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } - - void reset_source_contents() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - assert(src); - src->contents_ = dest_contents(); - } + std::string& dest_contents() { return sink_->contents_; } + + const std::string& dest_contents() const { return sink_->contents_; } + + void reset_source_contents() { source_->contents_ = dest_contents(); } Slice reader_contents_; - std::unique_ptr dest_holder_; - std::unique_ptr source_holder_; + test::StringSink* sink_; + StringSource* source_; ReportCollector report_; - Writer writer_; + std::unique_ptr 
writer_; std::unique_ptr reader_; protected: @@ -171,19 +152,23 @@ public: LogTest() : reader_contents_(), - dest_holder_(test::GetWritableFileWriter( - new test::StringSink(&reader_contents_), "" /* don't care */)), - source_holder_(test::GetSequentialFileReader( - new StringSource(reader_contents_, !std::get<1>(GetParam())), - "" /* file name */)), - writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())), + sink_(new test::StringSink(&reader_contents_)), + source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))), allow_retry_read_(std::get<1>(GetParam())) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink_holder), "" /* don't care */, FileOptions())); + writer_.reset( + new Writer(std::move(file_writer), 123, std::get<0>(GetParam()))); + std::unique_ptr source_holder(source_); + std::unique_ptr file_reader( + new SequentialFileReader(std::move(source_holder), "" /* file name */)); if (allow_retry_read_) { - reader_.reset(new FragmentBufferedReader( - nullptr, std::move(source_holder_), &report_, true /* checksum */, - 123 /* log_number */)); + reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader), + &report_, true /* checksum */, + 123 /* log_number */)); } else { - reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_, + reader_.reset(new Reader(nullptr, std::move(file_reader), &report_, true /* checksum */, 123 /* log_number */)); } } @@ -191,7 +176,7 @@ Slice* get_reader_contents() { return &reader_contents_; } void Write(const std::string& msg) { - writer_.AddRecord(Slice(msg)); + ASSERT_OK(writer_->AddRecord(Slice(msg))); } size_t WrittenBytes() const { @@ -219,11 +204,7 @@ dest_contents()[offset] = new_byte; } - void ShrinkSize(int bytes) { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - dest->Drop(bytes); - } + void ShrinkSize(int bytes) { sink_->Drop(bytes); } void FixChecksum(int header_offset, int len, 
bool recyclable) { // Compute crc of type/len/data @@ -235,9 +216,8 @@ } void ForceError(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_error_ = true; - src->force_error_position_ = position; + source_->force_error_ = true; + source_->force_error_position_ = position; } size_t DroppedBytes() const { @@ -249,14 +229,12 @@ } void ForceEOF(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_eof_ = true; - src->force_eof_position_ = position; + source_->force_eof_ = true; + source_->force_eof_position_ = position; } void UnmarkEOF() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->returned_partial_ = false; + source_->returned_partial_ = false; reader_->UnmarkEOF(); } @@ -465,7 +443,7 @@ ShrinkSize(1); ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); - ASSERT_EQ("OK", MatchError("Corruption: truncated header")); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); } TEST_P(LogTest, ChecksumMismatch) { @@ -573,9 +551,7 @@ ShrinkSize(1); ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); - ASSERT_EQ("OK", MatchError( - "Corruption: truncated headerCorruption: " - "error reading trailing data")); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); } TEST_P(LogTest, ErrorJoinsRecords) { @@ -687,12 +663,13 @@ while (get_reader_contents()->size() < log::kBlockSize * 2) { Write("xxxxxxxxxxxxxxxx"); } - std::unique_ptr dest_holder(test::GetWritableFileWriter( - new test::OverwritingStringSink(get_reader_contents()), - "" /* don't care */)); + std::unique_ptr sink( + new test::OverwritingStringSink(get_reader_contents())); + std::unique_ptr dest_holder(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Writer recycle_writer(std::move(dest_holder), 123, true); - recycle_writer.AddRecord(Slice("foooo")); - 
recycle_writer.AddRecord(Slice("bar")); + ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); ASSERT_EQ("foooo", Read()); ASSERT_EQ("bar", Read()); @@ -720,10 +697,9 @@ }; Slice contents_; - std::unique_ptr dest_holder_; + test::StringSink* sink_; std::unique_ptr log_writer_; Env* env_; - EnvOptions env_options_; const std::string test_dir_; const std::string log_file_; std::unique_ptr writer_; @@ -734,61 +710,58 @@ public: RetriableLogTest() : contents_(), - dest_holder_(nullptr), + sink_(new test::StringSink(&contents_)), log_writer_(nullptr), env_(Env::Default()), test_dir_(test::PerThreadDBPath("retriable_log_test")), log_file_(test_dir_ + "/log"), writer_(nullptr), reader_(nullptr), - log_reader_(nullptr) {} + log_reader_(nullptr) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr wfw(new WritableFileWriter( + std::move(sink_holder), "" /* file name */, FileOptions())); + log_writer_.reset(new Writer(std::move(wfw), 123, GetParam())); + } Status SetupTestEnv() { - dest_holder_.reset(test::GetWritableFileWriter( - new test::StringSink(&contents_), "" /* file name */)); - assert(dest_holder_ != nullptr); - log_writer_.reset(new Writer(std::move(dest_holder_), 123, GetParam())); - assert(log_writer_ != nullptr); - Status s; - s = env_->CreateDirIfMissing(test_dir_); - std::unique_ptr writable_file; + FileOptions fopts; + auto fs = env_->GetFileSystem(); + s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr); + std::unique_ptr writable_file; if (s.ok()) { - s = env_->NewWritableFile(log_file_, &writable_file, env_options_); + s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr); } if (s.ok()) { - writer_.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), log_file_, - env_options_)); - assert(writer_ != nullptr); + writer_.reset( + new WritableFileWriter(std::move(writable_file), 
log_file_, fopts)); + EXPECT_NE(writer_, nullptr); } - std::unique_ptr seq_file; + std::unique_ptr seq_file; if (s.ok()) { - s = env_->NewSequentialFile(log_file_, &seq_file, env_options_); + s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr); } if (s.ok()) { - reader_.reset(new SequentialFileReader( - NewLegacySequentialFileWrapper(seq_file), log_file_)); - assert(reader_ != nullptr); + reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); + EXPECT_NE(reader_, nullptr); log_reader_.reset(new FragmentBufferedReader( nullptr, std::move(reader_), &report_, true /* checksum */, 123 /* log_number */)); - assert(log_reader_ != nullptr); + EXPECT_NE(log_reader_, nullptr); } return s; } - std::string contents() { - auto file = test::GetStringSinkFromLegacyWriter(log_writer_->file()); - assert(file != nullptr); - return file->contents_; - } + std::string contents() { return sink_->contents_; } - void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); } + void Encode(const std::string& msg) { + ASSERT_OK(log_writer_->AddRecord(Slice(msg))); + } void Write(const Slice& data) { - writer_->Append(data); - writer_->Sync(true); + ASSERT_OK(writer_->Append(data)); + ASSERT_OK(writer_->Sync(true)); } bool TryRead(std::string* result) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,14 +33,14 @@ Writer::~Writer() { if (dest_) { - WriteBuffer(); + WriteBuffer().PermitUncheckedError(); } } -Status Writer::WriteBuffer() { return dest_->Flush(); } +IOStatus Writer::WriteBuffer() { return dest_->Flush(); } -Status Writer::Close() { - Status s; +IOStatus Writer::Close() { + IOStatus s; if (dest_) { s = dest_->Close(); dest_.reset(); @@ -48,7 +48,7 @@ return s; } 
-Status Writer::AddRecord(const Slice& slice) { +IOStatus Writer::AddRecord(const Slice& slice) { const char* ptr = slice.data(); size_t left = slice.size(); @@ -59,7 +59,7 @@ // Fragment the record if necessary and emit it. Note that if slice // is empty, we still want to iterate once to emit a single // zero-length record - Status s; + IOStatus s; bool begin = true; do { const int64_t leftover = kBlockSize - block_offset_; @@ -114,7 +114,7 @@ bool Writer::TEST_BufferIsEmpty() { return dest_->TEST_BufferIsEmpty(); } -Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { +IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes size_t header_size; @@ -145,14 +145,17 @@ } // Compute the crc of the record type and the payload. - crc = crc32c::Extend(crc, ptr, n); + uint32_t payload_crc = crc32c::Value(ptr, n); + crc = crc32c::Crc32cCombine(crc, payload_crc, n); crc = crc32c::Mask(crc); // Adjust for storage + TEST_SYNC_POINT_CALLBACK("LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", + &crc); EncodeFixed32(buf, crc); // Write the header and the payload - Status s = dest_->Append(Slice(buf, header_size)); + IOStatus s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { - s = dest_->Append(Slice(ptr, n)); + s = dest_->Append(Slice(ptr, n), payload_crc); } block_offset_ += header_size + n; return s; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,11 +8,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#include - +#include #include #include "db/log_format.h" +#include "rocksdb/io_status.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -79,16 +79,16 @@ ~Writer(); - Status AddRecord(const Slice& slice); + IOStatus AddRecord(const Slice& slice); WritableFileWriter* file() { return dest_.get(); } const WritableFileWriter* file() const { return dest_.get(); } uint64_t get_log_number() const { return log_number_; } - Status WriteBuffer(); + IOStatus WriteBuffer(); - Status Close(); + IOStatus Close(); bool TEST_BufferIsEmpty(); @@ -103,7 +103,7 @@ // record type stored in the header. uint32_t type_crc_[kMaxRecordType + 1]; - Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + IOStatus EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); // If true, it does not flush after each write. Instead it relies on the upper // layer to manually does the flush by calling ::WriteBuffer() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,8 @@ // #pragma once -#include #include +#include #include #include #include diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/lookup_key.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/lookup_key.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/lookup_key.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/lookup_key.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #pragma once #include #include -#include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/types.h" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/malloc_stats.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/malloc_stats.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/malloc_stats.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/malloc_stats.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,10 +19,10 @@ #ifdef ROCKSDB_JEMALLOC -typedef struct { +struct MallocStatus { char* cur; char* end; -} MallocStatus; +}; static void GetJemallocStatus(void* mstat_arg, const char* status) { MallocStatus* mstat = reinterpret_cast(mstat_arg); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/manual_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/manual_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/manual_compaction_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/manual_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,8 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // // Test for issue 178: a manual compaction causes deleted data to reappear. -#include -#include #include #include "port/port.h" @@ -15,7 +13,19 @@ #include "rocksdb/write_batch.h" #include "test_util/testharness.h" -using namespace ROCKSDB_NAMESPACE; +using ROCKSDB_NAMESPACE::CompactionFilter; +using ROCKSDB_NAMESPACE::CompactionStyle; +using ROCKSDB_NAMESPACE::CompactRangeOptions; +using ROCKSDB_NAMESPACE::CompressionType; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DestroyDB; +using ROCKSDB_NAMESPACE::FlushOptions; +using ROCKSDB_NAMESPACE::Iterator; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteOptions; namespace { @@ -40,8 +50,9 @@ public: ManualCompactionTest() { // Get rid of any state from an old run. 
- dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath("rocksdb_cbug_test"); - DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options()); + dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath( + "rocksdb_manual_compaction_test"); + DestroyDB(dbname_, Options()); } std::string dbname_; @@ -60,28 +71,55 @@ const char* Name() const override { return "DestroyAllCompactionFilter"; } }; +class LogCompactionFilter : public CompactionFilter { + public: + const char* Name() const override { return "LogCompactionFilter"; } + + bool Filter(int level, const Slice& key, const Slice& /*existing_value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + key_level_[key.ToString()] = level; + return false; + } + + void Reset() { key_level_.clear(); } + + size_t NumKeys() const { return key_level_.size(); } + + int KeyLevel(const Slice& key) { + auto it = key_level_.find(key.ToString()); + if (it == key_level_.end()) { + return -1; + } + return it->second; + } + + private: + mutable std::map key_level_; +}; + TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { for (int iter = 0; iter < 2; ++iter) { DB* db; Options options; if (iter == 0) { // level compaction options.num_levels = 3; - options.compaction_style = kCompactionStyleLevel; + options.compaction_style = CompactionStyle::kCompactionStyleLevel; } else { // universal compaction - options.compaction_style = kCompactionStyleUniversal; + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; } options.create_if_missing = true; - options.compression = ROCKSDB_NAMESPACE::kNoCompression; + options.compression = CompressionType::kNoCompression; options.compaction_filter = new DestroyAllCompactionFilter(); ASSERT_OK(DB::Open(options, dbname_, &db)); - db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key3"), Slice("value3")); - db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + 
ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy"))); Slice key4("key4"); - db->CompactRange(CompactRangeOptions(), nullptr, &key4); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &key4)); Iterator* itr = db->NewIterator(ReadOptions()); itr->SeekToFirst(); ASSERT_TRUE(itr->Valid()); @@ -100,46 +138,45 @@ // Open database. Disable compression since it affects the creation // of layers and the code below is trying to test against a very // specific scenario. - ROCKSDB_NAMESPACE::DB* db; - ROCKSDB_NAMESPACE::Options db_options; + DB* db; + Options db_options; db_options.write_buffer_size = 1024; db_options.create_if_missing = true; - db_options.compression = ROCKSDB_NAMESPACE::kNoCompression; - ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(db_options, dbname_, &db)); + db_options.compression = CompressionType::kNoCompression; + ASSERT_OK(DB::Open(db_options, dbname_, &db)); // create first key range - ROCKSDB_NAMESPACE::WriteBatch batch; + WriteBatch batch; for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key1(i), "value for range 1 key"); + ASSERT_OK(batch.Put(Key1(i), "value for range 1 key")); } - ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + ASSERT_OK(db->Write(WriteOptions(), &batch)); // create second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key2(i), "value for range 2 key"); + ASSERT_OK(batch.Put(Key2(i), "value for range 2 key")); } - ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + ASSERT_OK(db->Write(WriteOptions(), &batch)); // delete second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Delete(Key2(i)); + ASSERT_OK(batch.Delete(Key2(i))); } - ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + 
ASSERT_OK(db->Write(WriteOptions(), &batch)); // compact database std::string start_key = Key1(0); std::string end_key = Key1(kNumKeys - 1); - ROCKSDB_NAMESPACE::Slice least(start_key.data(), start_key.size()); - ROCKSDB_NAMESPACE::Slice greatest(end_key.data(), end_key.size()); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); // commenting out the line below causes the example to work correctly - db->CompactRange(CompactRangeOptions(), &least, &greatest); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest)); // count the keys - ROCKSDB_NAMESPACE::Iterator* iter = - db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + Iterator* iter = db->NewIterator(ReadOptions()); int num_keys = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { num_keys++; @@ -149,7 +186,119 @@ // close database delete db; - DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options()); + DestroyDB(dbname_, Options()); +} + +TEST_F(ManualCompactionTest, SkipLevel) { + DB* db; + Options options; + options.num_levels = 3; + // Initially, flushed L0 files won't exceed 100. 
+ options.level0_file_num_compaction_trigger = 100; + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.create_if_missing = true; + options.compression = CompressionType::kNoCompression; + LogCompactionFilter* filter = new LogCompactionFilter(); + options.compaction_filter = filter; + ASSERT_OK(DB::Open(options, dbname_, &db)); + + WriteOptions wo; + FlushOptions fo; + ASSERT_OK(db->Put(wo, "1", "")); + ASSERT_OK(db->Flush(fo)); + ASSERT_OK(db->Put(wo, "2", "")); + ASSERT_OK(db->Flush(fo)); + ASSERT_OK(db->Put(wo, "4", "")); + ASSERT_OK(db->Put(wo, "8", "")); + ASSERT_OK(db->Flush(fo)); + + { + // L0: 1, 2, [4, 8] + // no file has keys in range [5, 7] + Slice start("5"); + Slice end("7"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2, [4, 8] + // [3, 7] overlaps with 4 in L0 + Slice start("3"); + Slice end("7"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(2, filter->NumKeys()); + ASSERT_EQ(0, filter->KeyLevel("4")); + ASSERT_EQ(0, filter->KeyLevel("8")); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // no file has keys in range (-inf, 0] + Slice end("0"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // no file has keys in range [9, inf) + Slice start("9"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // [2, 2] overlaps with 2 in L0 + Slice start("2"); + Slice end("2"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(1, filter->NumKeys()); + ASSERT_EQ(0, filter->KeyLevel("2")); + } + + { + // L0: 1 + // L1: 2, [4, 8] + // [2, 5] overlaps with 2 and [4, 8) in L1, skip L0 + Slice start("2"); + Slice 
end("5"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(3, filter->NumKeys()); + ASSERT_EQ(1, filter->KeyLevel("2")); + ASSERT_EQ(1, filter->KeyLevel("4")); + ASSERT_EQ(1, filter->KeyLevel("8")); + } + + { + // L0: 1 + // L1: [2, 4, 8] + // [0, inf) overlaps all files + Slice start("0"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(4, filter->NumKeys()); + // 1 is first compacted to L1 and then further compacted into [2, 4, 8], + // so finally the logged level for 1 is L1. + ASSERT_EQ(1, filter->KeyLevel("1")); + ASSERT_EQ(1, filter->KeyLevel("2")); + ASSERT_EQ(1, filter->KeyLevel("4")); + ASSERT_EQ(1, filter->KeyLevel("8")); + } + + delete filter; + delete db; + DestroyDB(dbname_, options); } } // anonymous namespace diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,22 +13,27 @@ #include #include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" +#include "logging/logging.h" #include "memory/arena.h" #include "memory/memory_usage.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" +#include "port/lang.h" #include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/types.h" #include "rocksdb/write_buffer_manager.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" @@ -36,12 +41,11 @@ #include "util/autovector.h" 
#include "util/coding.h" #include "util/mutexlock.h" -#include "util/util.h" namespace ROCKSDB_NAMESPACE { ImmutableMemTableOptions::ImmutableMemTableOptions( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options) : arena_block_size(mutable_cf_options.arena_block_size), memtable_prefix_bloom_bits( @@ -56,12 +60,13 @@ inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), inplace_callback(ioptions.inplace_callback), max_successive_merges(mutable_cf_options.max_successive_merges), - statistics(ioptions.statistics), - merge_operator(ioptions.merge_operator), - info_log(ioptions.info_log) {} + statistics(ioptions.stats), + merge_operator(ioptions.merge_operator.get()), + info_log(ioptions.logger), + allow_data_in_errors(ioptions.allow_data_in_errors) {} MemTable::MemTable(const InternalKeyComparator& cmp, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber latest_seq, uint32_t column_family_id) @@ -79,9 +84,9 @@ mutable_cf_options.memtable_huge_page_size), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), - ioptions.info_log, column_family_id)), + ioptions.logger, column_family_id)), range_del_table_(SkipListFactory().CreateMemTableRep( - comparator_, &arena_, nullptr /* transform */, ioptions.info_log, + comparator_, &arena_, nullptr /* transform */, ioptions.logger, column_family_id)), is_range_del_table_empty_(true), data_size_(0), @@ -101,9 +106,9 @@ : 0), prefix_extractor_(mutable_cf_options.prefix_extractor.get()), flush_state_(FLUSH_NOT_REQUESTED), - env_(ioptions.env), + clock_(ioptions.clock), insert_with_hint_prefix_extractor_( - ioptions.memtable_insert_with_hint_prefix_extractor), + ioptions.memtable_insert_with_hint_prefix_extractor.get()), oldest_key_time_(std::numeric_limits::max()), 
atomic_flush_seqno_(kMaxSequenceNumber), approximate_memory_usage_(0) { @@ -117,7 +122,7 @@ bloom_filter_.reset( new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, 6 /* hard coded 6 probes */, - moptions_.memtable_huge_page_size, ioptions.info_log)); + moptions_.memtable_huge_page_size, ioptions.logger)); } } @@ -220,7 +225,7 @@ uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed); if (oldest_key_time == std::numeric_limits::max()) { int64_t current_time = 0; - auto s = env_->GetCurrentTime(¤t_time); + auto s = clock_->GetCurrentTime(¤t_time); if (s.ok()) { assert(current_time >= 0); // If fail, the timestamp is already set. @@ -327,9 +332,11 @@ PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { // iterator should only use prefix bloom filter - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -344,9 +351,11 @@ PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -375,8 +384,19 @@ PERF_COUNTER_ADD(next_on_memtable_count, 1); assert(Valid()); 
iter_->Next(); + TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); valid_ = iter_->Valid(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = valid_; + if (is_valid) { + result->key = key(); + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = true; + } + return is_valid; + } void Prev() override { PERF_COUNTER_ADD(prev_on_memtable_count, 1); assert(Valid()); @@ -428,11 +448,13 @@ is_range_del_table_empty_.load(std::memory_order_relaxed)) { return nullptr; } + return NewRangeTombstoneIteratorInternal(read_options, read_seq); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq) { auto* unfragmented_iter = new MemTableIterator( *this, read_options, nullptr /* arena */, true /* use_range_del_table */); - if (unfragmented_iter == nullptr) { - return nullptr; - } auto fragmented_tombstone_list = std::make_shared( std::unique_ptr(unfragmented_iter), @@ -444,7 +466,7 @@ } port::RWMutex* MemTable::GetLock(const Slice& key) { - return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())]; + return &locks_[GetSliceRangedNPHash(key, locks_.size())]; } MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, @@ -468,10 +490,52 @@ return {entry_count * (data_size / n), entry_count}; } -bool MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, /* user key */ - const Slice& value, bool allow_concurrent, - MemTablePostProcessInfo* post_process_info, void** hint) { +Status MemTable::VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOS64& kv_prot_info) { + uint32_t ikey_len = 0; + if (!GetVarint32(&encoded, &ikey_len)) { + return Status::Corruption("Unable to parse internal key length"); + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (ikey_len < 8 + ts_sz) { + return Status::Corruption("Internal key length too short"); + } + 
if (ikey_len > encoded.size()) { + return Status::Corruption("Internal key length too long"); + } + uint32_t value_len = 0; + const size_t user_key_len = ikey_len - 8; + Slice key(encoded.data(), user_key_len); + encoded.remove_prefix(user_key_len); + + uint64_t packed = DecodeFixed64(encoded.data()); + ValueType value_type = kMaxValue; + SequenceNumber sequence_number = kMaxSequenceNumber; + UnPackSequenceAndType(packed, &sequence_number, &value_type); + encoded.remove_prefix(8); + + if (!GetVarint32(&encoded, &value_len)) { + return Status::Corruption("Unable to parse value length"); + } + if (value_len < encoded.size()) { + return Status::Corruption("Value length too short"); + } + if (value_len > encoded.size()) { + return Status::Corruption("Value length too long"); + } + Slice value(encoded.data(), value_len); + + return kv_prot_info.StripS(sequence_number) + .StripKVO(key, value, value_type) + .GetStatus(); +} + +Status MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info, + bool allow_concurrent, + MemTablePostProcessInfo* post_process_info, void** hint) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -498,7 +562,17 @@ p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + if (kv_prot_info != nullptr) { + Slice encoded(buf, encoded_len); + TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded); + Status status = VerifyEncodedEntry(encoded, *kv_prot_info); + if (!status.ok()) { + return status; + } + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz); if (!allow_concurrent) { // Extract prefix for insert with hint. 
@@ -507,12 +581,12 @@ Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } else { bool res = table->InsertKey(handle); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } @@ -528,11 +602,11 @@ } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->Add(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->Add(key_without_ts); } // The first sequence number inserted into the memtable @@ -553,7 +627,7 @@ ? table->InsertKeyConcurrently(handle) : table->InsertKeyWithHintConcurrently(handle, hint); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } assert(post_process_info != nullptr); @@ -564,11 +638,12 @@ } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->AddConcurrently( + prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->AddConcurrently(key_without_ts); } // atomically update first_seqno_ and earliest_seqno_. 
@@ -587,7 +662,7 @@ is_range_del_table_empty_.store(false, std::memory_order_relaxed); } UpdateOldestKeyTime(); - return true; + return Status::OK(); } // Callback from MemTable::Get() @@ -600,6 +675,7 @@ bool* merge_in_progress; std::string* value; SequenceNumber seq; + std::string* timestamp; const MergeOperator* merge_operator; // the merge operations encountered; MergeContext* merge_context; @@ -609,10 +685,11 @@ Statistics* statistics; bool inplace_update_support; bool do_merge; - Env* env_; + SystemClock* clock; + ReadCallback* callback_; bool* is_blob_index; - + bool allow_data_in_errors; bool CheckCallback(SequenceNumber _seq) { if (callback_) { return callback_->IsVisible(_seq); @@ -640,12 +717,15 @@ // Check that it belongs to same user key. We do not check the // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + assert(key_length >= 8); Slice user_key_slice = Slice(key_ptr, key_length - 8); - if (s->mem->GetInternalKeyComparator() - .user_comparator() - ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { + const Comparator* user_comparator = + s->mem->GetInternalKeyComparator().user_comparator(); + size_t ts_sz = user_comparator->timestamp_size(); + if (user_comparator->EqualWithoutTimestamp(user_key_slice, + s->key->user_key())) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -690,7 +770,7 @@ *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), &v, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { // Preserve the value with the goal of returning it as part of @@ -713,9 +793,15 @@ if (s->is_blob_index != nullptr) { *(s->is_blob_index) = 
(type == kTypeBlobIndex); } + + if (ts_sz > 0 && s->timestamp != nullptr) { + Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz); + s->timestamp->assign(ts.data(), ts.size()); + } return false; } case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: { if (*(s->merge_in_progress)) { @@ -723,7 +809,7 @@ *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { *(s->status) = Status::NotFound(); @@ -751,15 +837,24 @@ *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, - s->env_, nullptr /* result_operand */, true); + s->clock, nullptr /* result_operand */, true); *(s->found_final_value) = true; return false; } return true; } - default: - assert(false); - return true; + default: { + std::string msg("Corrupted value not expected."); + if (s->allow_data_in_errors) { + msg.append("Unrecognized value type: " + + std::to_string(static_cast(type)) + ". "); + msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) + + ". 
"); + msg.append("seq: " + std::to_string(seq) + "."); + } + *(s->status) = Status::Corruption(msg.c_str()); + return false; + } } } @@ -767,7 +862,8 @@ return false; } -bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, +bool MemTable::Get(const LookupKey& key, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, @@ -788,22 +884,21 @@ range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key())); } - Slice user_key = key.user_key(); bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = - bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); + may_contain = bloom_filter_->MayContain(user_key_without_ts); } else { assert(prefix_extractor_); - may_contain = - !prefix_extractor_->InDomain(user_key) || - bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); + may_contain = !prefix_extractor_->InDomain(user_key_without_ts) || + bloom_filter_->MayContain( + prefix_extractor_->Transform(user_key_without_ts)); } } @@ -816,7 +911,7 @@ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback, - is_blob_index, value, s, merge_context, seq, + is_blob_index, value, timestamp, s, merge_context, seq, &found_final_value, &merge_in_progress); } @@ -831,7 +926,8 @@ void MemTable::GetFromTable(const LookupKey& key, SequenceNumber max_covering_tombstone_seq, bool do_merge, ReadCallback* callback, - bool* is_blob_index, std::string* value, 
Status* s, + bool* is_blob_index, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* seq, bool* found_final_value, bool* merge_in_progress) { Saver saver; @@ -840,6 +936,7 @@ saver.merge_in_progress = merge_in_progress; saver.key = &key; saver.value = value; + saver.timestamp = timestamp; saver.seq = kMaxSequenceNumber; saver.mem = this; saver.merge_context = merge_context; @@ -848,16 +945,17 @@ saver.logger = moptions_.info_log; saver.inplace_update_support = moptions_.inplace_update_support; saver.statistics = moptions_.statistics; - saver.env_ = env_; + saver.clock = clock_; saver.callback_ = callback; saver.is_blob_index = is_blob_index; saver.do_merge = do_merge; + saver.allow_data_in_errors = moptions_.allow_data_in_errors; table_->Get(key, &saver, SaveValue); *seq = saver.seq; } void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -865,52 +963,59 @@ } PERF_TIMER_GUARD(get_from_memtable_time); + // For now, memtable Bloom filter is effectively disabled if there are any + // range tombstones. This is the simplest way to ensure range tombstones are + // handled. 
TODO: allow Bloom checks where max_covering_tombstone_seq==0 + bool no_range_del = read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed); MultiGetRange temp_range(*range, range->begin(), range->end()); - if (bloom_filter_) { - std::array keys; - std::array may_match = {{true}}; - autovector prefixes; + if (bloom_filter_ && no_range_del) { + bool whole_key = + !prefix_extractor_ || moptions_.memtable_whole_key_filtering; + std::array bloom_keys; + std::array may_match; + std::array range_indexes; int num_keys = 0; for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (!prefix_extractor_) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor_->InDomain(iter->ukey)) { - prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey)); - keys[num_keys++] = &prefixes.back(); - } - } - bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); - int idx = 0; - for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) { + if (whole_key) { + bloom_keys[num_keys] = iter->ukey_without_ts; + range_indexes[num_keys++] = iter.index(); + } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) { + bloom_keys[num_keys] = + prefix_extractor_->Transform(iter->ukey_without_ts); + range_indexes[num_keys++] = iter.index(); + } else { + // TODO: consider not counting these as Bloom hits to more closely + // match bloom_sst_hit_count PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); - continue; } - if (!may_match[idx]) { - temp_range.SkipKey(iter); + } + bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]); + for (int i = 0; i < num_keys; ++i) { + if (!may_match[i]) { + temp_range.SkipIndex(range_indexes[i]); PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); } else { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } - idx++; } } for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - 
SequenceNumber seq = kMaxSequenceNumber; bool found_final_value{false}; bool merge_in_progress = iter->s->IsMergeInProgress(); - std::unique_ptr range_del_iter( - NewRangeTombstoneIterator( - read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); - if (range_del_iter != nullptr) { + if (!no_range_del) { + std::unique_ptr range_del_iter( + NewRangeTombstoneIteratorInternal( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); iter->max_covering_tombstone_seq = std::max( iter->max_covering_tombstone_seq, range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); } + SequenceNumber dummy_seq; GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, - callback, is_blob, iter->value->GetSelf(), iter->s, - &(iter->merge_context), &seq, &found_final_value, - &merge_in_progress); + callback, &iter->is_blob_index, iter->value->GetSelf(), + iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq, + &found_final_value, &merge_in_progress); if (!found_final_value && merge_in_progress) { *(iter->s) = Status::MergeInProgress(); @@ -918,16 +1023,26 @@ if (found_final_value) { iter->value->PinSelf(); + range->AddValueSize(iter->value->size()); range->MarkKeyDone(iter); RecordTick(moptions_.statistics, MEMTABLE_HIT); + if (range->GetValueSize() > read_options.value_size_soft_limit) { + // Set all remaining keys in range to Abort + for (auto range_iter = range->begin(); range_iter != range->end(); + ++range_iter) { + range->MarkKeyDone(range_iter); + *(range_iter->s) = Status::Aborted(); + } + break; + } } } PERF_COUNTER_ADD(get_from_memtable_count, 1); } -void MemTable::Update(SequenceNumber seq, - const Slice& key, - const Slice& value) { +Status MemTable::Update(SequenceNumber seq, const Slice& key, + const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info) { LookupKey lkey(key, seq); Slice mem_key = lkey.memtable_key(); @@ -971,22 +1086,26 @@ (unsigned)(VarintLength(key_length) + key_length + VarintLength(value.size()) + 
value.size())); RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); - return; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. + updated_kv_prot_info.UpdateS(seq, existing_seq); + Slice encoded(entry, p + value.size() - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } } } } - // key doesn't exist - bool add_res __attribute__((__unused__)); - add_res = Add(seq, kTypeValue, key, value); - // We already checked unused != seq above. In that case, Add should not fail. - assert(add_res); + // The latest value is not `kTypeValue` or key doesn't exist + return Add(seq, kTypeValue, key, value, kv_prot_info); } -bool MemTable::UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta) { +Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOS64* kv_prot_info) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -1012,8 +1131,8 @@ // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; - uint64_t unused; - UnPackSequenceAndType(tag, &unused, &type); + uint64_t existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); switch (type) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); @@ -1040,16 +1159,35 @@ } RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); UpdateFlushState(); - return true; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. 
+ updated_kv_prot_info.UpdateS(seq, existing_seq); + updated_kv_prot_info.UpdateV(delta, + Slice(prev_buffer, new_prev_size)); + Slice encoded(entry, prev_buffer + new_prev_size - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } else if (status == UpdateStatus::UPDATED) { - Add(seq, kTypeValue, key, Slice(str_value)); + Status s; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(delta, str_value); + s = Add(seq, kTypeValue, key, Slice(str_value), + &updated_kv_prot_info); + } else { + s = Add(seq, kTypeValue, key, Slice(str_value), + nullptr /* kv_prot_info */); + } RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); UpdateFlushState(); - return true; + return s; } else if (status == UpdateStatus::UPDATE_FAILED) { - // No action required. Return. + // `UPDATE_FAILED` is named incorrectly. It indicates no update + // happened. It does not indicate a failure happened. 
UpdateFlushState(); - return true; + return Status::OK(); } } default: @@ -1057,9 +1195,8 @@ } } } - // If the latest value is not kTypeValue - // or key doesn't exist - return false; + // The latest value is not `kTypeValue` or key doesn't exist + return Status::NotFound(); } size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,8 +14,11 @@ #include #include #include +#include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" @@ -24,7 +27,6 @@ #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "table/multiget_context.h" #include "util/dynamic_bloom.h" @@ -36,9 +38,10 @@ class Mutex; class MemTableIterator; class MergeContext; +class SystemClock; struct ImmutableMemTableOptions { - explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, + explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options); size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; @@ -54,6 +57,7 @@ Statistics* statistics; MergeOperator* merge_operator; Logger* info_log; + bool allow_data_in_errors; }; // Batched counters to updated when inserting keys in one write batch. @@ -69,7 +73,7 @@ // Note: Many of the methods in this class have comments indicating that // external synchronization is required as these methods are not thread-safe. // It is up to higher layers of code to decide how to prevent concurrent -// invokation of these methods. 
This is usually done by acquiring either +// invocation of these methods. This is usually done by acquiring either // the db mutex or the single writer thread. // // Some of these methods are documented to only require external @@ -100,7 +104,7 @@ // used, but this may prevent some transactions from succeeding until the // first key is inserted into the memtable. explicit MemTable(const InternalKeyComparator& comparator, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber earliest_seq, uint32_t column_family_id); @@ -136,12 +140,39 @@ // operations on the same MemTable (unless this Memtable is immutable). size_t ApproximateMemoryUsage(); - // As a cheap version of `ApproximateMemoryUsage()`, this function doens't + // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't // require external synchronization. The value may be less accurate though size_t ApproximateMemoryUsageFast() const { return approximate_memory_usage_.load(std::memory_order_relaxed); } + // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast + size_t MemoryAllocatedBytes() const { + return table_->ApproximateMemoryUsage() + + range_del_table_->ApproximateMemoryUsage() + + arena_.MemoryAllocatedBytes(); + } + + // Returns a vector of unique random memtable entries of size 'sample_size'. + // + // Note: the entries are stored in the unordered_set as length-prefixed keys, + // hence their representation in the set as "const char*". + // Note2: the size of the output set 'entries' is not enforced to be strictly + // equal to 'target_sample_size'. Its final size might be slightly + // greater or slightly less than 'target_sample_size' + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + // REQUIRES: SkipList memtable representation. 
This function is not + // implemented for any other type of memtable representation (vectorrep, + // hashskiplist,...). + void UniqueRandomSample(const uint64_t& target_sample_size, + std::unordered_set* entries) { + // TODO(bjlemaire): at the moment, only supported by skiplistrep. + // Extend it to all other memtable representations. + table_->UniqueRandomSample(num_entries(), target_sample_size, entries); + } + // This method heuristically determines if the memtable should continue to // host more data. bool ShouldScheduleFlush() const { @@ -174,6 +205,9 @@ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options, SequenceNumber read_seq); + Status VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOS64& kv_prot_info); + // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. // Typically value will be empty if type==kTypeDeletion. @@ -181,12 +215,14 @@ // REQUIRES: if allow_concurrent = false, external synchronization to prevent // simultaneous operations on the same MemTable. // - // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and - // the already exists. - bool Add(SequenceNumber seq, ValueType type, const Slice& key, - const Slice& value, bool allow_concurrent = false, - MemTablePostProcessInfo* post_process_info = nullptr, - void** hint = nullptr); + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. + Status Add(SequenceNumber seq, ValueType type, const Slice& key, + const Slice& value, const ProtectionInfoKVOS64* kv_prot_info, + bool allow_concurrent = false, + MemTablePostProcessInfo* post_process_info = nullptr, + void** hint = nullptr); // Used to Get value associated with key or Get Merge Operands associated // with key. 
@@ -212,50 +248,62 @@ MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr, bool do_merge = true) { + return Get(key, value, /*timestamp=*/nullptr, s, merge_context, + max_covering_tombstone_seq, seq, read_opts, callback, + is_blob_index, do_merge); + } + + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, bool do_merge = true); - bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, bool do_merge = true) { SequenceNumber seq; - return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index, do_merge); + return Get(key, value, timestamp, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, callback, + is_blob_index, do_merge); } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); + ReadCallback* callback); - // Attempts to update the new_value inplace, else does normal Add - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // if new sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else add(key, new_value) + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. 
Otherwise + // adds the new value to the memtable out-of-place. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. - void Update(SequenceNumber seq, - const Slice& key, - const Slice& value); - - // If prev_value for key exists, attempts to update it inplace. - // else returns false - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // new_value = delta(prev_value) - // if sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else return false + Status Update(SequenceNumber seq, const Slice& key, const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info); + + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // if `key` exists in current memtable with type `kTypeValue`, adds the new + // value to the memtable out-of-place. + // + // Returns `Status::NotFound` if `key` does not exist in current memtable or + // the latest version of `key` does not have `kTypeValue`. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. 
- bool UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta); + Status UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOS64* kv_prot_info); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -321,6 +369,14 @@ return first_seqno_.load(std::memory_order_relaxed); } + // Returns the sequence number of the first element that was inserted + // into the memtable. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + void SetFirstSequenceNumber(SequenceNumber first_seqno) { + return first_seqno_.store(first_seqno, std::memory_order_relaxed); + } + // Returns the sequence number that is guaranteed to be smaller than or equal // to the sequence number of any key that could be inserted into this // memtable. It can then be assumed that any write with a larger(or equal) @@ -332,6 +388,15 @@ return earliest_seqno_.load(std::memory_order_relaxed); } + // Sets the sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into this + // memtable. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + // Used only for MemPurge operation + void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) { + return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed); + } + // DB's latest sequence ID when the memtable is created. This number // may be updated to a more recent one before any key is inserted. 
SequenceNumber GetCreationSeq() const { return creation_seq_; } @@ -434,6 +499,9 @@ } #endif // !ROCKSDB_LITE + // Returns a heuristic flush decision + bool ShouldFlushNow(); + private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; @@ -492,7 +560,7 @@ std::atomic flush_state_; - Env* env_; + SystemClock* clock_; // Extract sequential insert prefixes. const SliceTransform* insert_with_hint_prefix_extractor_; @@ -513,7 +581,7 @@ SequenceNumber atomic_flush_seqno_; // keep track of memory usage in table_, arena_, and range_del_table_. - // Gets refrshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` + // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` std::atomic approximate_memory_usage_; #ifndef ROCKSDB_LITE @@ -521,9 +589,6 @@ std::unique_ptr flush_job_info_; #endif // !ROCKSDB_LITE - // Returns a heuristic flush decision - bool ShouldFlushNow(); - // Updates flush_state_ using ShouldFlushNow() void UpdateFlushState(); @@ -532,9 +597,13 @@ void GetFromTable(const LookupKey& key, SequenceNumber max_covering_tombstone_seq, bool do_merge, ReadCallback* callback, bool* is_blob_index, - std::string* value, Status* s, MergeContext* merge_context, - SequenceNumber* seq, bool* found_final_value, - bool* merge_in_progress); + std::string* value, std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress); + + // Always returns non-null and assumes certain pre-checks are done + FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq); }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,15 +5,18 @@ // #include "db/memtable_list.h" +#include #include #include #include #include + #include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -43,22 +46,20 @@ } MemTableListVersion::MemTableListVersion( - size_t* parent_memtable_list_memory_usage, MemTableListVersion* old) + size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old) : max_write_buffer_number_to_maintain_( - old->max_write_buffer_number_to_maintain_), + old.max_write_buffer_number_to_maintain_), max_write_buffer_size_to_maintain_( - old->max_write_buffer_size_to_maintain_), + old.max_write_buffer_size_to_maintain_), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) { - if (old != nullptr) { - memlist_ = old->memlist_; - for (auto& m : memlist_) { - m->Ref(); - } + memlist_ = old.memlist_; + for (auto& m : memlist_) { + m->Ref(); + } - memlist_history_ = old->memlist_history_; - for (auto& m : memlist_history_) { - m->Ref(); - } + memlist_history_ = old.memlist_history_; + for (auto& m : memlist_history_) { + m->Ref(); } } @@ -104,20 +105,21 @@ // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. 
bool MemTableListVersion::Get(const LookupKey& key, std::string* value, - Status* s, MergeContext* merge_context, + std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) { - return GetFromList(&memlist_, key, value, s, merge_context, + return GetFromList(&memlist_, key, value, timestamp, s, merge_context, max_covering_tombstone_seq, seq, read_opts, callback, is_blob_index); } void MemTableListVersion::MultiGet(const ReadOptions& read_options, - MultiGetRange* range, ReadCallback* callback, - bool* is_blob) { + MultiGetRange* range, + ReadCallback* callback) { for (auto memtable : memlist_) { - memtable->MultiGet(read_options, range, callback, is_blob); + memtable->MultiGet(read_options, range, callback); if (range->empty()) { return; } @@ -128,9 +130,9 @@ const LookupKey& key, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { for (MemTable* memtable : memlist_) { - bool done = memtable->Get(key, nullptr, s, merge_context, - max_covering_tombstone_seq, read_opts, nullptr, - nullptr, false); + bool done = memtable->Get(key, /*value*/ nullptr, /*timestamp*/ nullptr, s, + merge_context, max_covering_tombstone_seq, + read_opts, nullptr, nullptr, false); if (done) { return true; } @@ -139,17 +141,17 @@ } bool MemTableListVersion::GetFromHistory( - const LookupKey& key, std::string* value, Status* s, + const LookupKey& key, std::string* value, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { - return GetFromList(&memlist_history_, key, value, s, merge_context, + return GetFromList(&memlist_history_, key, value, timestamp, s, merge_context, max_covering_tombstone_seq, seq, read_opts, nullptr /*read_callback*/, is_blob_index); } 
bool MemTableListVersion::GetFromList( std::list* list, const LookupKey& key, std::string* value, - Status* s, MergeContext* merge_context, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) { *seq = kMaxSequenceNumber; @@ -157,9 +159,9 @@ for (auto& memtable : *list) { SequenceNumber current_seq = kMaxSequenceNumber; - bool done = - memtable->Get(key, value, s, merge_context, max_covering_tombstone_seq, - ¤t_seq, read_opts, callback, is_blob_index); + bool done = memtable->Get(key, value, timestamp, s, merge_context, + max_covering_tombstone_seq, ¤t_seq, + read_opts, callback, is_blob_index); if (*seq == kMaxSequenceNumber) { // Store the most recent sequence number of any operation on this key. // Since we only care about the most recent change, we only need to @@ -257,8 +259,8 @@ void MemTableListVersion::Add(MemTable* m, autovector* to_delete) { assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable AddMemTable(m); - - TrimHistory(to_delete, m->ApproximateMemoryUsage()); + // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast + TrimHistory(to_delete, 0); } // Removes m from list of memtables not flushed. Caller should NOT Unref m. 
@@ -280,16 +282,16 @@ } // return the total memory usage assuming the oldest flushed memtable is dropped -size_t MemTableListVersion::ApproximateMemoryUsageExcludingLast() const { +size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const { size_t total_memtable_size = 0; for (auto& memtable : memlist_) { - total_memtable_size += memtable->ApproximateMemoryUsage(); + total_memtable_size += memtable->MemoryAllocatedBytes(); } for (auto& memtable : memlist_history_) { - total_memtable_size += memtable->ApproximateMemoryUsage(); + total_memtable_size += memtable->MemoryAllocatedBytes(); } if (!memlist_history_.empty()) { - total_memtable_size -= memlist_history_.back()->ApproximateMemoryUsage(); + total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes(); } return total_memtable_size; } @@ -299,7 +301,7 @@ // calculate the total memory usage after dropping the oldest flushed // memtable, compare with max_write_buffer_size_to_maintain_ to decide // whether to trim history - return ApproximateMemoryUsageExcludingLast() + usage >= + return MemoryAllocatedBytesExcludingLast() + usage >= static_cast(max_write_buffer_size_to_maintain_); } else if (max_write_buffer_number_to_maintain_ > 0) { return memlist_.size() + memlist_history_.size() > @@ -310,14 +312,17 @@ } // Make sure we don't use up too much space in history -void MemTableListVersion::TrimHistory(autovector* to_delete, +bool MemTableListVersion::TrimHistory(autovector* to_delete, size_t usage) { + bool ret = false; while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) { MemTable* x = memlist_history_.back(); memlist_history_.pop_back(); UnrefMemTable(to_delete, x); + ret = true; } + return ret; } // Returns true if there is at least one memtable on which flush has @@ -332,18 +337,26 @@ } // Returns the memtables that need to be flushed. 
-void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, +void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, autovector* ret) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); const auto& memlist = current_->memlist_; bool atomic_flush = false; + + // Note: every time MemTableList::Add(mem) is called, it adds the new mem + // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by + // iterating through the memlist starting at the end, the vector + // ret is filled with memtables already sorted in increasing MemTable ID. + // However, when the mempurge feature is activated, new memtables with older + // IDs will be added to the memlist. Therefore we std::sort(ret) at the end to + // return a vector of memtables sorted by increasing memtable ID. for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { atomic_flush = true; } - if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { + if (m->GetID() > max_memtable_id) { break; } if (!m->flush_in_progress_) { @@ -359,6 +372,15 @@ if (!atomic_flush || num_flush_not_started_ == 0) { flush_requested_ = false; // start-flush request is complete } + + // Sort the list of memtables by increasing memtable ID. + // This is useful when the mempurge feature is activated + // and the memtables are not guaranteed to be sorted in + // the memlist vector. 
+ std::sort(ret->begin(), ret->end(), + [](const MemTable* m1, const MemTable* m2) -> bool { + return m1->GetID() < m2->GetID(); + }); } void MemTableList::RollbackMemtableFlush(const autovector& mems, @@ -387,9 +409,10 @@ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, const autovector& mems, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, - autovector* to_delete, Directory* db_directory, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer, - std::list>* committed_flush_jobs_info) { + std::list>* committed_flush_jobs_info, + IOStatus* io_s, bool write_edits) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); @@ -442,9 +465,18 @@ } if (it == memlist.rbegin() || batch_file_number != m->file_number_) { batch_file_number = m->file_number_; - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 " started", - cfd->GetName().c_str(), m->file_number_); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 " started", + cfd->GetName().c_str(), m->file_number_); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files) started", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size()); + } + edit_list.push_back(&m->edit_); memtables_to_flush.push_back(m); #ifndef ROCKSDB_LITE @@ -461,67 +493,67 @@ // TODO(myabandeh): Not sure how batch_count could be 0 here. 
if (batch_count > 0) { + uint64_t min_wal_number_to_keep = 0; + assert(edit_list.size() > 0); if (vset->db_options()->allow_2pc) { - assert(edit_list.size() > 0); - // We piggyback the information of earliest log file to keep in the + // Note that if mempurge is successful, the edit_list will + // not be applicable (contains info of new min_log number to keep, + // and level 0 file path of SST file created during normal flush, + // so both pieces of information are irrelevant after a successful + // mempurge operation). + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, *cfd, edit_list, memtables_to_flush, prep_tracker); + + // We piggyback the information of earliest log file to keep in the // manifest entry for the last file flushed. - edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep( - vset, *cfd, edit_list, memtables_to_flush, prep_tracker)); + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); } + edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep); - // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, - db_directory); - - // we will be changing the version in the next code path, - // so we better create a new one, since versions are immutable - InstallNewVersion(); - - // All the later memtables that have the same filenum - // are part of the same batch. They can be committed now. - uint64_t mem_id = 1; // how many memtables have been flushed. - - // commit new state only if the column family is NOT dropped. - // The reason is as follows (refer to - // ColumnFamilyTest.FlushAndDropRaceCondition). - // If the column family is dropped, then according to LogAndApply, its - // corresponding flush operation is NOT written to the MANIFEST. This - // means the DB is not aware of the L0 files generated from the flush. - // By committing the new state, we remove the memtable from the memtable - // list. 
Creating an iterator on this column family will not be able to - // read full data since the memtable is removed, and the DB is not aware - // of the L0 files, causing MergingIterator unable to build child - // iterators. RocksDB contract requires that the iterator can be created - // on a dropped column family, and we must be able to - // read full data as long as column family handle is not deleted, even if - // the column family is dropped. - if (s.ok() && !cfd->IsDropped()) { // commit new state - while (batch_count-- > 0) { - MemTable* m = current_->memlist_.back(); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - cfd->GetName().c_str(), m->file_number_, mem_id); - assert(m->file_number_ > 0); - current_->Remove(m, to_delete); - UpdateCachedValuesFromMemTableListVersion(); - ResetTrimHistoryNeeded(); - ++mem_id; + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (min_wal_number_to_keep > + vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_list.push_back(wal_deletion.get()); } + TEST_SYNC_POINT_CALLBACK( + "MemTableList::TryInstallMemtableFlushResults:" + "AfterComputeMinWalToKeep", + nullptr); + } + + const auto manifest_write_cb = [this, cfd, batch_count, log_buffer, + to_delete, mu](const Status& status) { + RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer, + to_delete, mu); + }; + if (write_edits) { + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, + db_directory, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, + manifest_write_cb); + *io_s = vset->io_status(); } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { - MemTable* m = *it; - // commit failed. setup state so that we can flush again. 
- ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - m->file_number_, mem_id); - m->flush_completed_ = false; - m->flush_in_progress_ = false; - m->edit_.Clear(); - num_flush_not_started_++; - m->file_number_ = 0; - imm_flush_needed.store(true, std::memory_order_release); - ++mem_id; - } + // If write_edit is false (e.g: successful mempurge), + // then remove old memtables, wake up manifest write queue threads, + // and don't commit anything to the manifest file. + RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer, + to_delete, mu); + // Note: cfd->SetLogNumber is only called when a VersionEdit + // is written to MANIFEST. When mempurge is succesful, we skip + // this step, therefore cfd->GetLogNumber is always is + // earliest log with data unflushed. + // Notify new head of manifest write queue. + // wake up all the waiting writers + // TODO(bjlemaire): explain full reason WakeUpWaitingManifestWriters + // needed or investigate more. + vset->WakeUpWaitingManifestWriters(); + *io_s = IOStatus::OK(); } } } @@ -535,7 +567,7 @@ InstallNewVersion(); // this method is used to move mutable memtable into an immutable list. // since mutable memtable is already refcounted by the DBImpl, - // and when moving to the imutable list we don't unref it, + // and when moving to the immutable list we don't unref it, // we don't have to ref the memtable here. we just take over the // reference from the DBImpl. current_->Add(m, to_delete); @@ -548,11 +580,12 @@ ResetTrimHistoryNeeded(); } -void MemTableList::TrimHistory(autovector* to_delete, size_t usage) { +bool MemTableList::TrimHistory(autovector* to_delete, size_t usage) { InstallNewVersion(); - current_->TrimHistory(to_delete, usage); + bool ret = current_->TrimHistory(to_delete, usage); UpdateCachedValuesFromMemTableListVersion(); ResetTrimHistoryNeeded(); + return ret; } // Returns an estimate of the number of bytes of data in use. 
@@ -566,9 +599,9 @@ size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; } -size_t MemTableList::ApproximateMemoryUsageExcludingLast() const { - const size_t usage = - current_memory_usage_excluding_last_.load(std::memory_order_relaxed); +size_t MemTableList::MemoryAllocatedBytesExcludingLast() const { + const size_t usage = current_memory_allocted_bytes_excluding_last_.load( + std::memory_order_relaxed); return usage; } @@ -579,9 +612,9 @@ void MemTableList::UpdateCachedValuesFromMemTableListVersion() { const size_t total_memtable_size = - current_->ApproximateMemoryUsageExcludingLast(); - current_memory_usage_excluding_last_.store(total_memtable_size, - std::memory_order_relaxed); + current_->MemoryAllocatedBytesExcludingLast(); + current_memory_allocted_bytes_excluding_last_.store( + total_memtable_size, std::memory_order_relaxed); const bool has_history = current_->HasHistory(); current_has_history_.store(has_history, std::memory_order_relaxed); @@ -600,27 +633,99 @@ } else { // somebody else holds the current version, we need to create new one MemTableListVersion* version = current_; - current_ = new MemTableListVersion(¤t_memory_usage_, current_); + current_ = new MemTableListVersion(¤t_memory_usage_, *version); current_->Ref(); version->Unref(); } } +void MemTableList::RemoveMemTablesOrRestoreFlags( + const Status& s, ColumnFamilyData* cfd, size_t batch_count, + LogBuffer* log_buffer, autovector* to_delete, + InstrumentedMutex* mu) { + assert(mu); + mu->AssertHeld(); + assert(to_delete); + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. + + // commit new state only if the column family is NOT dropped. 
+ // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); + ++mem_id; + } + } else { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. 
+ if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64 + " failed", + m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } + } +} + uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush) { + const std::unordered_set* memtables_to_flush) { uint64_t min_log = 0; for (auto& m : current_->memlist_) { - // Assume the list is very short, we can live with O(m*n). We can optimize - // if the performance has some problem. - bool should_skip = false; - for (MemTable* m_to_flush : memtables_to_flush) { - if (m == m_to_flush) { - should_skip = true; - break; - } - } - if (should_skip) { + if (memtables_to_flush && memtables_to_flush->count(m)) { continue; } @@ -640,8 +745,11 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_metas, - autovector* to_delete, Directory* db_directory, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_metas, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); @@ -652,6 +760,10 @@ if (imm_lists != nullptr) { assert(imm_lists->size() == num); } + if (num == 0) { + return Status::OK(); + } + for (size_t k = 0; k != num; ++k) { #ifndef NDEBUG const auto* imm = @@ -666,6 +778,17 @@ 
(*mems_list[k])[i]->SetFlushCompleted(true); (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber()); } +#ifndef ROCKSDB_LITE + if (committed_flush_jobs_info[k]) { + assert(!mems_list[k]->empty()); + assert((*mems_list[k])[0]); + std::unique_ptr flush_job_info = + (*mems_list[k])[0]->ReleaseFlushJobInfo(); + committed_flush_jobs_info[k]->push_back(std::move(flush_job_info)); + } +#else //! ROCKSDB_LITE + (void)committed_flush_jobs_info; +#endif // ROCKSDB_LITE } Status s; @@ -680,12 +803,36 @@ ++num_entries; edit_lists.emplace_back(edits); } + + WalNumber min_wal_number_to_keep = 0; + if (vset->db_options()->allow_2pc) { + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, cfds, edit_lists, mems_list, prep_tracker); + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); + } + edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep); + + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_lists.back().push_back(wal_deletion.get()); + ++num_entries; + } + } + // Mark the version edits as an atomic group if the number of version edits // exceeds 1. 
if (cfds.size() > 1) { - for (auto& edits : edit_lists) { - assert(edits.size() == 1); - edits[0]->MarkAtomicGroup(--num_entries); + for (size_t i = 0; i < edit_lists.size(); i++) { + assert((edit_lists[i].size() == 1) || + ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1))); + for (auto& e : edit_lists[i]) { + e->MarkAtomicGroup(--num_entries); + } } assert(0 == num_entries); } @@ -708,11 +855,25 @@ for (auto m : *mems_list[i]) { assert(m->GetFileNumber() > 0); uint64_t mem_id = m->GetID(); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - cfds[i]->GetName().c_str(), m->GetFileNumber(), - mem_id); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + imm->current_->Remove(m, to_delete); imm->UpdateCachedValuesFromMemTableListVersion(); imm->ResetTrimHistoryNeeded(); @@ -723,11 +884,25 @@ auto* imm = (imm_lists == nullptr) ? 
cfds[i]->imm() : imm_lists->at(i); for (auto m : *mems_list[i]) { uint64_t mem_id = m->GetID(); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - cfds[i]->GetName().c_str(), m->GetFileNumber(), - mem_id); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + m->SetFlushCompleted(false); m->SetFlushInProgress(false); m->GetEdits()->Clear(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include -#include "db/dbformat.h" #include "db/logs_with_prep_tracker.h" #include "db/memtable.h" #include "db/range_del_aggregator.h" @@ -44,7 +43,7 @@ class MemTableListVersion { public: explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, - MemTableListVersion* old = nullptr); + const MemTableListVersion& old); explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, int max_write_buffer_number_to_maintain, int64_t max_write_buffer_size_to_maintain); @@ -58,24 +57,25 @@ // If any operation was found for this key, its most recent sequence number // will be stored in *seq on success (regardless of whether true/false is // returned). Otherwise, *seq will be set to kMaxSequenceNumber. 
- bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); - bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr) { SequenceNumber seq; - return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index); + return Get(key, value, timestamp, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, callback, + is_blob_index); } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); + ReadCallback* callback); // Returns all the merge operands corresponding to the key by searching all // memtables starting from the most recent one. @@ -88,18 +88,20 @@ // have already been flushed. Should only be used from in-memory only // queries (such as Transaction validation) as the history may contain // writes that are also present in the SST files. 
- bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, + bool GetFromHistory(const LookupKey& key, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index = nullptr); - bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, + bool GetFromHistory(const LookupKey& key, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, bool* is_blob_index = nullptr) { SequenceNumber seq; - return GetFromHistory(key, value, s, merge_context, + return GetFromHistory(key, value, timestamp, s, merge_context, max_covering_tombstone_seq, &seq, read_opts, is_blob_index); } @@ -135,9 +137,11 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, - autovector* to_delete, Directory* db_directory, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); // REQUIRE: m is an immutable memtable @@ -145,10 +149,12 @@ // REQUIRE: m is an immutable memtable void Remove(MemTable* m, autovector* to_delete); - void TrimHistory(autovector* to_delete, size_t usage); + // Return true if memtable is trimmed + bool TrimHistory(autovector* to_delete, size_t usage); bool GetFromList(std::list* list, const LookupKey& key, - std::string* value, Status* s, MergeContext* merge_context, + std::string* value, std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, @@ -162,7 +168,7 @@ // excluding 
the last MemTable in memlist_history_. The reason for excluding // the last MemTable is to see if dropping the last MemTable will keep total // memory usage above or equal to max_write_buffer_size_to_maintain_ - size_t ApproximateMemoryUsageExcludingLast() const; + size_t MemoryAllocatedBytesExcludingLast() const; // Whether this version contains flushed memtables that are only kept around // for transaction conflict checking. @@ -215,7 +221,7 @@ commit_in_progress_(false), flush_requested_(false), current_memory_usage_(0), - current_memory_usage_excluding_last_(0), + current_memory_allocted_bytes_excluding_last_(0), current_has_history_(false) { current_->Ref(); } @@ -246,7 +252,7 @@ // Returns the earliest memtables that needs to be flushed. The returned // memtables are guaranteed to be in the ascending order of created time. - void PickMemtablesToFlush(const uint64_t* max_memtable_id, + void PickMemtablesToFlush(uint64_t max_memtable_id, autovector* mems); // Reset status of the given memtable list back to pending state so that @@ -260,33 +266,39 @@ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, const autovector& m, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, - autovector* to_delete, Directory* db_directory, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer, - std::list>* committed_flush_jobs_info); + std::list>* committed_flush_jobs_info, + IOStatus* io_s, bool write_edits = true); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). + // By default, adding memtables will flag that the memtable list needs to be + // flushed, but in certain situations, like after a mempurge, we may want to + // avoid flushing the memtable list upon addition of a memtable. void Add(MemTable* m, autovector* to_delete); // Returns an estimate of the number of bytes of data in use. 
size_t ApproximateMemoryUsage(); - // Returns the cached current_memory_usage_excluding_last_ value. - size_t ApproximateMemoryUsageExcludingLast() const; + // Returns the cached current_memory_allocted_bytes_excluding_last_ value. + size_t MemoryAllocatedBytesExcludingLast() const; // Returns the cached current_has_history_ value. bool HasHistory() const; - // Updates current_memory_usage_excluding_last_ and current_has_history_ - // from MemTableListVersion. Must be called whenever InstallNewVersion is - // called. + // Updates current_memory_allocted_bytes_excluding_last_ and + // current_has_history_ from MemTableListVersion. Must be called whenever + // InstallNewVersion is called. void UpdateCachedValuesFromMemTableListVersion(); // `usage` is the current size of the mutable Memtable. When // max_write_buffer_size_to_maintain is used, total size of mutable and // immutable memtables is checked against it to decide whether to trim // memtable list. - void TrimHistory(autovector* to_delete, size_t usage); + // + // Return true if memtable is trimmed + bool TrimHistory(autovector* to_delete, size_t usage); // Returns an estimate of the number of bytes of data used by // the unflushed mem-tables. @@ -300,7 +312,18 @@ // non-empty (regardless of the min_write_buffer_number_to_merge // parameter). This flush request will persist until the next time // PickMemtablesToFlush() is called. - void FlushRequested() { flush_requested_ = true; } + void FlushRequested() { + flush_requested_ = true; + // If there are some memtables stored in imm() that dont trigger + // flush (eg: mempurge output memtable), then update imm_flush_needed. + // Note: if race condition and imm_flush_needed is set to true + // when there is num_flush_not_started_==0, then there is no + // impact whatsoever. Imm_flush_needed is only used in an assert + // in IsFlushPending(). 
+ if (num_flush_not_started_ > 0) { + imm_flush_needed.store(true, std::memory_order_release); + } + } bool HasFlushRequested() { return flush_requested_; } @@ -327,7 +350,7 @@ // Returns the min log containing the prep section after memtables listsed in // `memtables_to_flush` are flushed and their status is persisted in manifest. uint64_t PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush); + const std::unordered_set* memtables_to_flush = nullptr); uint64_t GetEarliestMemTableID() const { auto& memlist = current_->memlist_; @@ -373,14 +396,23 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, - autovector* to_delete, Directory* db_directory, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); // DB mutex held void InstallNewVersion(); + // DB mutex held + // Called after writing to MANIFEST + void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd, + size_t batch_count, LogBuffer* log_buffer, + autovector* to_delete, + InstrumentedMutex* mu); + const int min_write_buffer_number_to_merge_; MemTableListVersion* current_; @@ -398,8 +430,8 @@ // The current memory usage. size_t current_memory_usage_; - // Cached value of current_->ApproximateMemoryUsageExcludingLast(). - std::atomic current_memory_usage_excluding_last_; + // Cached value of current_->MemoryAllocatedBytesExcludingLast(). + std::atomic current_memory_allocted_bytes_excluding_last_; // Cached value of current_->HasHistory(). 
std::atomic current_has_history_; @@ -416,7 +448,10 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_meta, - autovector* to_delete, Directory* db_directory, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_meta, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,14 +30,14 @@ MemTableListTest() : db(nullptr), file_number(1) { dbname = test::PerThreadDBPath("memtable_list_test"); options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); } // Create a test db if not yet created void CreateDB() { if (db == nullptr) { options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); // Open DB only with default column family ColumnFamilyOptions cf_options; std::vector cf_descs; @@ -65,18 +65,20 @@ ~MemTableListTest() override { if (db) { std::vector cf_descs(handles.size()); +#ifndef ROCKSDB_LITE for (int i = 0; i != static_cast(handles.size()); ++i) { - handles[i]->GetDescriptor(&cf_descs[i]); + EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i])); } +#endif // !ROCKSDB_LITE for (auto h : handles) { if (h) { - db->DestroyColumnFamilyHandle(h); + EXPECT_OK(db->DestroyColumnFamilyHandle(h)); } } handles.clear(); delete db; db = nullptr; - DestroyDB(dbname, options, cf_descs); + EXPECT_OK(DestroyDB(dbname, options, cf_descs)); } } @@ -92,7 +94,6 @@ CreateDB(); // Create a mock 
VersionSet DBOptions db_options; - db_options.file_system = FileSystem::Default(); ImmutableDBOptions immutable_db_options(db_options); EnvOptions env_options; std::shared_ptr table_cache(NewLRUCache(50000, 16)); @@ -101,7 +102,8 @@ VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller, /*block_cache_tracer=*/nullptr); + &write_controller, /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -115,13 +117,15 @@ auto cfd = column_family_set->GetDefault(); EXPECT_TRUE(nullptr != cfd); uint64_t file_num = file_number.fetch_add(1); + IOStatus io_s; // Create dummy mutex. InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); std::list> flush_jobs_info; Status s = list->TryInstallMemtableFlushResults( cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex, - file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info); + file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info, &io_s); + EXPECT_OK(io_s); return s; } @@ -139,7 +143,6 @@ CreateDB(); // Create a mock VersionSet DBOptions db_options; - db_options.file_system.reset(new LegacyFileSystemWrapper(db_options.env)); ImmutableDBOptions immutable_db_options(db_options); EnvOptions env_options; @@ -149,7 +152,8 @@ VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller, /*block_cache_tracer=*/nullptr); + &write_controller, /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -178,11 +182,21 @@ for (auto& meta : file_metas) { file_meta_ptrs.push_back(&meta); } + std::vector>> + 
committed_flush_jobs_info_storage(cf_ids.size()); + autovector>*> + committed_flush_jobs_info; + for (int i = 0; i < static_cast(cf_ids.size()); ++i) { + committed_flush_jobs_info.push_back( + &committed_flush_jobs_info_storage[i]); + } + InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); return InstallMemtableAtomicFlushResults( - &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex, - file_meta_ptrs, to_delete, nullptr, &log_buffer); + &lists, cfds, mutable_cf_options_list, mems_list, &versions, + nullptr /* prep_tracker */, &mutex, file_meta_ptrs, + committed_flush_jobs_info, to_delete, nullptr, &log_buffer); } }; @@ -195,7 +209,7 @@ ASSERT_FALSE(list.IsFlushPending()); autovector mems; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &mems); ASSERT_EQ(0, mems.size()); autovector to_delete; @@ -221,15 +235,16 @@ autovector to_delete; LookupKey lkey("key1", seq); - bool found = list.current()->Get(lkey, &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + bool found = list.current()->Get( + lkey, &value, /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Create a MemTable InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -237,26 +252,33 @@ mem->Ref(); // Write some keys to this memtable. 
- mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key1", "value1"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); - found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value1"); merge_context.Clear(); - found = mem->Get(LookupKey("key1", 2), &value, &s, &merge_context, + found = mem->Get(LookupKey("key1", 2), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); // MemTable found out that this key is *not* found (at this sequence#) ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.2"); @@ -275,36 +297,39 @@ kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key2", "value2.3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); // Fetch keys via MemTableList merge_context.Clear(); - found = - 
list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get( + LookupKey("key1", seq), &value, /*timestamp*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = list.current()->Get(LookupKey("key1", saved_seq), &value, &s, - &merge_context, &max_covering_tombstone_seq, - ReadOptions()); + found = list.current()->Get( + LookupKey("key1", saved_seq), &value, /*timestamp*/nullptr, + &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ("value1", value); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get( + LookupKey("key2", seq), &value, /*timestamp*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.3"); merge_context.Clear(); - found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get( + LookupKey("key2", 1), &value, /*timestamp*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); ASSERT_EQ(2, list.NumNotFlushed()); @@ -319,7 +344,7 @@ // Create MemTableList int min_write_buffer_number_to_merge = 2; int max_write_buffer_number_to_maintain = 2; - int64_t max_write_buffer_size_to_maintain = 2000; + int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize; MemTableList list(min_write_buffer_number_to_merge, max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain); @@ -333,15 +358,16 @@ autovector to_delete; LookupKey lkey("key1", seq); - bool found = list.current()->Get(lkey, &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + bool found = 
list.current()->Get( + lkey, &value, /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Create a MemTable InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -349,19 +375,24 @@ mem->Ref(); // Write some keys to this memtable. - mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); - found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); // MemTable found out that this key is *not* found (at this sequence#) ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.2"); @@ -372,22 +403,22 @@ // Fetch keys via MemTableList merge_context.Clear(); - found = - list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, 
ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ("value2.2", value); // Flush this memtable from the list. // (It will then be a part of the memtable history). autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); MutableCFOptions mutable_cf_options(options); @@ -400,27 +431,27 @@ // Verify keys are no longer in MemTableList merge_context.Clear(); - found = - list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Verify keys are present in history merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key1", seq), &value, &s, &merge_context, + LookupKey("key1", seq), &value, /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key2", seq), &value, &s, &merge_context, + LookupKey("key2", seq), &value, /*timestamp*/nullptr, &s, &merge_context, 
&max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found); ASSERT_EQ("value2.2", value); @@ -431,15 +462,17 @@ kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key3", "value3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); ASSERT_EQ(0, to_delete.size()); to_flush.clear(); - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); // Flush second memtable @@ -462,42 +495,42 @@ // Verify keys are no longer in MemTableList merge_context.Clear(); - found = - list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key3", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key3", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Verify that the second memtable's keys are in the history merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key1", seq), &value, &s, &merge_context, + 
LookupKey("key1", seq), &value, /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key3", seq), &value, &s, &merge_context, + LookupKey("key3", seq), &value, /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found); ASSERT_EQ("value3", value); // Verify that key2 from the first memtable is no longer in the history merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Cleanup @@ -515,7 +548,7 @@ auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); autovector to_delete; @@ -542,11 +575,16 @@ std::string value; MergeContext merge_context; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr 
/* kv_prot_info */)); tables.push_back(mem); } @@ -555,7 +593,7 @@ ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); // Request a flush even though there is nothing to flush @@ -564,7 +602,7 @@ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Attempt to 'flush' to clear request for flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -588,7 +626,7 @@ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(2, to_flush.size()); ASSERT_EQ(2, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -609,7 +647,7 @@ ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(3, to_flush.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -617,7 +655,7 @@ // Pick tables to flush again autovector to_flush2; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(0, to_flush2.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -635,7 +673,7 @@ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush again - 
list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(1, to_flush2.size()); ASSERT_EQ(4, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -656,7 +694,7 @@ ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); // Should pick 4 of 5 since 1 table has been picked in to_flush2 ASSERT_EQ(4, to_flush.size()); ASSERT_EQ(5, list.NumNotFlushed()); @@ -665,7 +703,7 @@ // Pick tables to flush again autovector to_flush3; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush3); ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -726,7 +764,7 @@ autovector to_flush4; list.FlushRequested(); ASSERT_TRUE(list.HasFlushRequested()); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_TRUE(to_flush4.empty()); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -737,7 +775,7 @@ // equal to 5. Therefore, only tables[5] will be selected. 
memtable_id = 5; list.FlushRequested(); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_EQ(1, static_cast(to_flush4.size())); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -779,7 +817,7 @@ auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); @@ -811,11 +849,16 @@ std::string value; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr /* kv_prot_info */)); elem.push_back(mem); } @@ -829,7 +872,8 @@ auto* list = lists[i]; ASSERT_FALSE(list->IsFlushPending()); ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); - list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]); + list->PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, + &flush_candidates[i]); ASSERT_EQ(0, flush_candidates[i].size()); } // Request flush even though there is nothing to flush @@ -859,8 +903,7 @@ // Pick memtables to flush for (auto i = 0; i != num_cfs; ++i) { flush_candidates[i].clear(); - 
lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i], - &flush_candidates[i]); + lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]); ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, static_cast(flush_candidates[i].size())); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ } // Get the operand at the index. - Slice GetOperand(int index) { + Slice GetOperand(int index) const { assert(operand_list_); SetDirectionForward(); @@ -76,13 +76,21 @@ } // Same as GetOperandsDirectionForward - const std::vector& GetOperands() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperands() const { return GetOperandsDirectionForward(); } // Return all the operands in the order as they were merged (passed to // FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionForward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperandsDirectionForward() const { if (!operand_list_) { return empty_operand_list; } @@ -93,7 +101,11 @@ // Return all the operands in the reversed order relative to how they were // merged (passed to FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionBackward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. 
+ const std::vector& GetOperandsDirectionBackward() const { if (!operand_list_) { return empty_operand_list; } @@ -110,14 +122,14 @@ } } - void SetDirectionForward() { + void SetDirectionForward() const { if (operands_reversed_ == true) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = false; } } - void SetDirectionBackward() { + void SetDirectionBackward() const { if (operands_reversed_ == false) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = true; @@ -125,10 +137,10 @@ } // List of operands - std::unique_ptr> operand_list_; + mutable std::unique_ptr> operand_list_; // Copy of operands that are not pinned. std::unique_ptr>> copied_operands_; - bool operands_reversed_ = true; + mutable bool operands_reversed_ = true; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,10 @@ #include +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_index.h" +#include "db/blob/prefetch_buffer_collection.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" @@ -14,6 +18,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "table/format.h" #include "table/internal_iterator.h" @@ -28,6 +33,7 @@ Statistics* stats, const std::atomic* shutting_down) : env_(env), + clock_(env->GetSystemClock().get()), user_comparator_(user_comparator), user_merge_operator_(user_merge_operator), compaction_filter_(compaction_filter), @@ -39,7 +45,7 @@ snapshot_checker_(snapshot_checker), level_(level), keys_(), - 
filter_timer_(env_), + filter_timer_(clock_), total_filter_time_(0U), stats_(stats) { assert(user_comparator_ != nullptr); @@ -52,7 +58,7 @@ const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand, bool update_num_ops_stats) { assert(merge_operator != nullptr); @@ -75,7 +81,7 @@ MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); { // Setup to time the merge - StopWatchNano timer(env, statistics != nullptr); + StopWatchNano timer(clock, statistics != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); // Do the merge @@ -116,7 +122,11 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, CompactionRangeDelAggregator* range_del_agg, const SequenceNumber stop_before, - const bool at_bottom) { + const bool at_bottom, + const bool allow_data_in_errors, + const BlobFetcher* blob_fetcher, + PrefetchBufferCollection* prefetch_buffers, + CompactionIterationStats* c_iter_stats) { // Get a copy of the internal key, before it's invalidated by iter->Next() // Also maintain the list of merge operands seen. 
assert(HasOperator()); @@ -138,27 +148,27 @@ // orig_ikey is backed by original_key if keys_.empty() // orig_ikey is backed by keys_.back() if !keys_.empty() ParsedInternalKey orig_ikey; - bool succ = ParseInternalKey(original_key, &orig_ikey); - assert(succ); - if (!succ) { - return Status::Corruption("Cannot parse key in MergeUntil"); - } - Status s; + Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors); + assert(s.ok()); + if (!s.ok()) return s; + bool hit_the_next_user_key = false; for (; iter->Valid(); iter->Next(), original_key_is_iter = false) { if (IsShuttingDown()) { - return Status::ShutdownInProgress(); + s = Status::ShutdownInProgress(); + return s; } ParsedInternalKey ikey; assert(keys_.size() == merge_context_.GetNumOperands()); - if (!ParseInternalKey(iter->key(), &ikey)) { + Status pik_status = + ParseInternalKey(iter->key(), &ikey, allow_data_in_errors); + if (!pik_status.ok()) { // stop at corrupted key if (assert_valid_internal_key_) { - assert(!"Corrupted internal key not expected."); - return Status::Corruption("Corrupted internal key not expected."); + return pik_status; } break; } else if (first_key) { @@ -182,7 +192,6 @@ assert(IsValueType(ikey.type)); if (ikey.type != kTypeMerge) { - // hit a put/delete/single delete // => merge the put value or a nullptr with operands_ // => store result in operands_.back() (and update keys_.back()) @@ -193,7 +202,7 @@ // the compaction iterator to write out the key we're currently at, which // is the put/delete we just encountered. if (keys_.empty()) { - return Status::OK(); + return s; } // TODO(noetzli) If the merge operator returns false, we are currently @@ -201,19 +210,52 @@ // want. Also if we're in compaction and it's a put, it would be nice to // run compaction filter on it. 
const Slice val = iter->value(); + PinnableSlice blob_value; const Slice* val_ptr; - if (kTypeValue == ikey.type && + if ((kTypeValue == ikey.type || kTypeBlobIndex == ikey.type) && (range_del_agg == nullptr || !range_del_agg->ShouldDelete( ikey, RangeDelPositioningMode::kForwardTraversal))) { - val_ptr = &val; + if (ikey.type == kTypeBlobIndex) { + BlobIndex blob_index; + + s = blob_index.DecodeFrom(val); + if (!s.ok()) { + return s; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher); + + s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, + prefetch_buffer, &blob_value, + &bytes_read); + if (!s.ok()) { + return s; + } + + val_ptr = &blob_value; + + if (c_iter_stats) { + ++c_iter_stats->num_blobs_read; + c_iter_stats->total_blob_bytes_read += bytes_read; + } + } else { + val_ptr = &val; + } } else { val_ptr = nullptr; } std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) @@ -268,7 +310,10 @@ if (keys_.size() == 1) { // we need to re-anchor the orig_ikey because it was anchored by // original_key before - ParseInternalKey(keys_.back(), &orig_ikey); + pik_status = + ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); } if (filter == CompactionFilter::Decision::kKeep) { merge_context_.PushOperand( @@ -284,14 +329,14 @@ keys_.clear(); merge_context_.Clear(); has_compaction_filter_skip_until_ = true; - return Status::OK(); + return s; } } } if (merge_context_.GetNumOperands() == 0) { // we filtered out all the merge operands - return Status::OK(); + return s; } // We are sure we have seen 
this key's entire history if: @@ -321,7 +366,7 @@ std::string merge_result; s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of @@ -344,7 +389,7 @@ bool merge_success = false; std::string merge_result; { - StopWatchNano timer(env_, stats_ != nullptr); + StopWatchNano timer(clock_, stats_ != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); merge_success = user_merge_operator_->PartialMergeMulti( orig_ikey.user_key, @@ -410,7 +455,9 @@ kValueTypeForSeek); } } - total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) { + total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + } return ret; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,12 @@ #include #include -#include "db/dbformat.h" #include "db/merge_context.h" #include "db/range_del_aggregator.h" #include "db/snapshot_checker.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "util/stop_watch.h" @@ -25,6 +25,10 @@ class Logger; class MergeOperator; class Statistics; +class SystemClock; +class BlobFetcher; +class PrefetchBufferCollection; +struct CompactionIterationStats; class MergeHelper { public: @@ -48,7 +52,7 @@ const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand = nullptr, bool 
update_num_ops_stats = false); @@ -66,6 +70,12 @@ // 0 means no restriction // at_bottom: (IN) true if the iterator covers the bottem level, which means // we could reach the start of the history of this user key. + // allow_data_in_errors: (IN) if true, data details will be displayed in + // error/log messages. + // blob_fetcher: (IN) blob fetcher object for the compaction's input version. + // prefetch_buffers: (IN/OUT) a collection of blob file prefetch buffers + // used for compaction readahead. + // c_iter_stats: (OUT) compaction iteration statistics. // // Returns one of the following statuses: // - OK: Entries were successfully merged. @@ -78,9 +88,12 @@ // // REQUIRED: The first key in the input is not corrupted. Status MergeUntil(InternalIterator* iter, - CompactionRangeDelAggregator* range_del_agg = nullptr, - const SequenceNumber stop_before = 0, - const bool at_bottom = false); + CompactionRangeDelAggregator* range_del_agg, + const SequenceNumber stop_before, const bool at_bottom, + const bool allow_data_in_errors, + const BlobFetcher* blob_fetcher, + PrefetchBufferCollection* prefetch_buffers, + CompactionIterationStats* c_iter_stats); // Filters a merge operand using the compaction filter specified // in the constructor. Returns the decision that the filter made. @@ -137,6 +150,7 @@ private: Env* env_; + SystemClock* clock_; const Comparator* user_comparator_; const MergeOperator* user_merge_operator_; const CompactionFilter* compaction_filter_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,34 +3,39 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "db/merge_helper.h" + #include #include #include -#include "db/merge_helper.h" +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/vector_iterator.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { class MergeHelperTest : public testing::Test { public: - MergeHelperTest() { env_ = Env::Default(); } + MergeHelperTest() : icmp_(BytewiseComparator()) { env_ = Env::Default(); } ~MergeHelperTest() override = default; Status Run(SequenceNumber stop_before, bool at_bottom, SequenceNumber latest_snapshot = 0) { - iter_.reset(new test::VectorIterator(ks_, vs_)); + iter_.reset(new VectorIterator(ks_, vs_, &icmp_)); iter_->SeekToFirst(); - merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(), + merge_helper_.reset(new MergeHelper(env_, icmp_.user_comparator(), merge_op_.get(), filter_.get(), nullptr, false, latest_snapshot)); - return merge_helper_->MergeUntil(iter_.get(), nullptr /* range_del_agg */, - stop_before, at_bottom); + return merge_helper_->MergeUntil( + iter_.get(), nullptr /* range_del_agg */, stop_before, at_bottom, + false /* allow_data_in_errors */, nullptr /* blob_fetcher */, + nullptr /* prefetch_buffers */, nullptr /* c_iter_stats */); } void AddKeyVal(const std::string& user_key, const SequenceNumber& seq, @@ -45,7 +50,8 @@ } Env* env_; - std::unique_ptr iter_; + InternalKeyComparator icmp_; + std::unique_ptr iter_; std::shared_ptr merge_op_; std::unique_ptr merge_helper_; std::vector ks_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,8 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). 
// #include -#include + #include +#include #include "db/db_impl/db_impl.h" #include "db/dbformat.h" @@ -18,6 +19,7 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" +#include "util/coding.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -47,12 +49,8 @@ return true; } - return mergeOperator_->PartialMerge( - key, - *existing_value, - value, - new_value, - logger); + return mergeOperator_->PartialMerge(key, *existing_value, value, new_value, + logger); } bool PartialMergeMulti(const Slice& key, @@ -71,6 +69,31 @@ std::shared_ptr mergeOperator_; }; +class EnvMergeTest : public EnvWrapper { + public: + EnvMergeTest() : EnvWrapper(Env::Default()) {} + static const char* kClassName() { return "MergeEnv"; } + const char* Name() const override { return kClassName(); } + // ~EnvMergeTest() override {} + + uint64_t NowNanos() override { + ++now_nanos_count_; + return target()->NowNanos(); + } + + static uint64_t now_nanos_count_; + + static std::unique_ptr singleton_; + + static EnvMergeTest* GetInstance() { + if (nullptr == singleton_) singleton_.reset(new EnvMergeTest); + return singleton_.get(); + } +}; + +uint64_t EnvMergeTest::now_nanos_count_{0}; +std::unique_ptr EnvMergeTest::singleton_; + std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, const size_t max_successive_merges = 0) { DB* db; @@ -78,8 +101,9 @@ options.create_if_missing = true; options.merge_operator = std::make_shared(); options.max_successive_merges = max_successive_merges; + options.env = EnvMergeTest::GetInstance(); + EXPECT_OK(DestroyDB(dbname, Options())); Status s; - DestroyDB(dbname, Options()); // DBWithTTL is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE if (ttl) { @@ -93,10 +117,11 @@ assert(!ttl); s = DB::Open(options, dbname, &db); #endif // !ROCKSDB_LITE - if (!s.ok()) { - std::cerr << s.ToString() << std::endl; - assert(false); - } + EXPECT_OK(s); + assert(s.ok()); + // 
Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for + // session ID) + EnvMergeTest::now_nanos_count_ = 0; return std::shared_ptr(db); } @@ -106,7 +131,6 @@ // set, add, get and remove // This is a quick implementation without a Merge operation. class Counters { - protected: std::shared_ptr db_; @@ -190,7 +214,6 @@ return get(key, &base) && set(key, base + value); } - // convenience functions for testing void assert_set(const std::string& key, uint64_t value) { assert(set(key, value)); @@ -202,27 +225,25 @@ uint64_t value = default_; int result = get(key, &value); assert(result); - if (result == 0) exit(1); // Disable unused variable warning. + if (result == 0) exit(1); // Disable unused variable warning. return value; } void assert_add(const std::string& key, uint64_t value) { int result = add(key, value); assert(result); - if (result == 0) exit(1); // Disable unused variable warning. + if (result == 0) exit(1); // Disable unused variable warning. } }; // Implement 'add' directly with the new Merge operation class MergeBasedCounters : public Counters { private: - WriteOptions merge_option_; // for merge + WriteOptions merge_option_; // for merge public: explicit MergeBasedCounters(std::shared_ptr db, uint64_t defaultCount = 0) - : Counters(db, defaultCount), - merge_option_() { - } + : Counters(db, defaultCount), merge_option_() {} // mapped to a rocksdb Merge operation bool add(const std::string& key, uint64_t value) override { @@ -243,34 +264,37 @@ void dumpDb(DB* db) { auto it = std::unique_ptr(db->NewIterator(ReadOptions())); for (it->SeekToFirst(); it->Valid(); it->Next()) { - //uint64_t value = DecodeFixed64(it->value().data()); - //std::cout << it->key().ToString() << ": " << value << std::endl; + // uint64_t value = DecodeFixed64(it->value().data()); + // std::cout << it->key().ToString() << ": " << value << std::endl; } assert(it->status().ok()); // Check for any errors found during the scan } void testCounters(Counters& counters, 
DB* db, bool test_compaction) { - FlushOptions o; o.wait = true; counters.assert_set("a", 1); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } - assert(counters.assert_get("a") == 1); + ASSERT_EQ(counters.assert_get("a"), 1); counters.assert_remove("b"); // defaut value is 0 if non-existent - assert(counters.assert_get("b") == 0); + ASSERT_EQ(counters.assert_get("b"), 0); counters.assert_add("a", 2); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } // 1+2 = 3 - assert(counters.assert_get("a")== 3); + ASSERT_EQ(counters.assert_get("a"), 3); dumpDb(db); @@ -280,25 +304,114 @@ counters.assert_add("b", i); sum += i; } - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("b"), sum); dumpDb(db); if (test_compaction) { - db->Flush(o); + ASSERT_OK(db->Flush(o)); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); dumpDb(db); - assert(counters.assert_get("a")== 3); - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("a"), 3); + ASSERT_EQ(counters.assert_get("b"), sum); } } +void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { + ASSERT_OK(db->Put({}, "1", "1")); + ASSERT_OK(db->Flush(FlushOptions())); + + std::atomic cnt{0}; + const auto get_thread_id = [&cnt]() { + thread_local int thread_id{cnt++}; + return thread_id; + }; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (1 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_compact_thread:0"); + } else if (2 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:0"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + 
"VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (0 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:0"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:1"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) { + auto* mutex = reinterpret_cast(arg); + mutex->AssertHeld(); + int thread_id = get_thread_id(); + ASSERT_EQ(2, thread_id); + mutex->Unlock(); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:1"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:2"); + mutex->Lock(); + }); + SyncPoint::GetInstance()->LoadDependency({ + {"testCountersWithFlushAndCompaction::set_options_thread:0", + "testCountersWithCompactionAndFlush:BeforeCompact"}, + {"testCountersWithFlushAndCompaction::bg_compact_thread:0", + "testCountersWithFlushAndCompaction:BeforeIncCounters"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:0", + "testCountersWithFlushAndCompaction::set_options_thread:1"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:1", + "testCountersWithFlushAndCompaction:BeforeVerification"}, + {"testCountersWithFlushAndCompaction:AfterGet", + "testCountersWithFlushAndCompaction::bg_flush_thread:2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread set_options_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->SetOptions( + {{"disable_auto_compactions", "false"}})); + }); + TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact"); + port::Thread compact_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->CompactRange( + CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr)); + }); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters"); + counters.add("test-key", 1); + + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db->Flush(flush_opts)); + + 
TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification"); + std::string expected; + PutFixed64(&expected, 1); + std::string actual; + Status s = db->Get(ReadOptions(), "test-key", &actual); + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet"); + set_options_thread.join(); + compact_thread.join(); + ASSERT_OK(s); + ASSERT_EQ(expected, actual); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + void testSuccessiveMerge(Counters& counters, size_t max_num_merges, size_t num_merges) { - counters.assert_remove("z"); uint64_t sum = 0; @@ -308,14 +421,14 @@ sum += i; if (i % (max_num_merges + 1) == 0) { - assert(num_merge_operator_calls == max_num_merges + 1); + ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1); } else { - assert(num_merge_operator_calls == 0); + ASSERT_EQ(num_merge_operator_calls, 0); } resetNumMergeOperatorCalls(); - assert(counters.assert_get("z") == sum); - assert(num_merge_operator_calls == i % (max_num_merges + 1)); + ASSERT_EQ(counters.assert_get("z"), sum); + ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1)); } } @@ -332,8 +445,8 @@ counters->assert_add("b", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("b")); if (count > max_merge) { // in this case, FullMerge should be called instead. @@ -346,20 +459,23 @@ // Test case 2: partial merge should not be called when a put is found. 
resetNumPartialMergeCalls(); tmp_sum = 0; - db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"); + ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10")); for (size_t i = 1; i <= count; i++) { counters->assert_add("c", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("c")); ASSERT_EQ(num_partial_merge_calls, 0U); + // NowNanos was previously called in MergeHelper::FilterMerge(), which + // harmed performance. + ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U); } void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, size_t num_merges) { - assert(num_merges > max_num_merges); + ASSERT_GT(num_merges, max_num_merges); Slice key("BatchSuccessiveMerge"); uint64_t merge_value = 1; @@ -370,15 +486,12 @@ // Create the batch WriteBatch batch; for (size_t i = 0; i < num_merges; ++i) { - batch.Merge(key, merge_value_slice); + ASSERT_OK(batch.Merge(key, merge_value_slice)); } // Apply to memtable and count the number of merges resetNumMergeOperatorCalls(); - { - Status s = db->Write(WriteOptions(), &batch); - assert(s.ok()); - } + ASSERT_OK(db->Write(WriteOptions(), &batch)); ASSERT_EQ( num_merge_operator_calls, static_cast(num_merges - (num_merges % (max_num_merges + 1)))); @@ -386,10 +499,7 @@ // Get the value resetNumMergeOperatorCalls(); std::string get_value_str; - { - Status s = db->Get(ReadOptions(), key, &get_value_str); - assert(s.ok()); - } + ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str)); assert(get_value_str.size() == sizeof(uint64_t)); uint64_t get_value = DecodeFixed64(&get_value_str[0]); ASSERT_EQ(get_value, num_merges * merge_value); @@ -398,7 +508,6 @@ } void runTest(const std::string& dbname, const bool use_ttl = false) { - { auto db = OpenDb(dbname, use_ttl); @@ -413,7 +522,7 @@ } } - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, 
Options())); { size_t max_merge = 5; @@ -422,7 +531,8 @@ testCounters(counters, db.get(), use_compression); testSuccessiveMerge(counters, max_merge, max_merge * 2); testSingleBatchSuccessiveMerge(db.get(), 5, 7); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { @@ -433,14 +543,16 @@ auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, count); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, min_merge * 10); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } } @@ -451,15 +563,15 @@ counters.add("test-key", 1); counters.add("test-key", 1); counters.add("test-key", 1); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } DB* reopen_db; ASSERT_OK(DB::Open(Options(), dbname, &reopen_db)); std::string value; - ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok())); + ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value)); delete reopen_db; - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, Options())); } /* Temporary remove this test @@ -488,6 +600,19 @@ runTest(test::PerThreadDBPath("merge_testdbttl"), true); // Run test on TTL database } + +TEST_F(MergeTest, MergeWithCompactionAndFlush) { + const std::string dbname = + test::PerThreadDBPath("merge_with_compaction_and_flush"); + { + auto db = OpenDb(dbname); + { + MergeBasedCounters counters(db, 0); + testCountersWithFlushAndCompaction(counters, db.get()); + } + } + ASSERT_OK(DestroyDB(dbname, Options())); +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/obsolete_files_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/obsolete_files_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/obsolete_files_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/obsolete_files_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #ifndef ROCKSDB_LITE #include +#include #include #include #include @@ -27,17 +28,14 @@ #include "test_util/testutil.h" #include "util/string_util.h" -using std::cerr; -using std::cout; -using std::endl; -using std::flush; namespace ROCKSDB_NAMESPACE { class ObsoleteFilesTest : public DBTestBase { public: ObsoleteFilesTest() - : DBTestBase("/obsolete_files_test"), wal_dir_(dbname_ + "/wal_files") {} + : DBTestBase("obsolete_files_test", /*env_do_fsync=*/true), + wal_dir_(dbname_ + "/wal_files") {} void AddKeys(int numkeys, int startkey) { WriteOptions options; @@ -56,14 +54,16 @@ AddKeys(numKeysPerFile, startKey); startKey += numKeysPerFile; ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK( + dbfull()->TEST_WaitForCompact()); // wait for background flush (flush + // is also a kind of compaction). 
} } void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0; int sst_cnt = 0; @@ -72,7 +72,7 @@ uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -96,6 +96,12 @@ options.WAL_ttl_seconds = 300; // Used to test log files options.WAL_size_limit_MB = 1024; // Used to test log files options.wal_dir = wal_dir_; + + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the + // periodic stat dumping thread. + options.stats_dump_period_sec = 0; + Destroy(options); Reopen(options); } @@ -145,18 +151,6 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { ReopenDB(); - SyncPoint::GetInstance()->DisableProcessing(); - std::vector optsfiles_nums; - std::vector optsfiles_keep; - SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", [&](void* arg) { - optsfiles_nums.push_back(*reinterpret_cast(arg)); - }); - SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", [&](void* arg) { - optsfiles_keep.push_back(*reinterpret_cast(arg)); - }); - SyncPoint::GetInstance()->EnableProcessing(); createLevel0Files(2, 50000); CheckFileTypeCounts(wal_dir_, 1, 0, 0); @@ -172,7 +166,6 @@ } } ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */)); - ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size()); Close(); @@ -193,15 +186,127 @@ ASSERT_EQ(2, opts_file_count); } -} // namespace ROCKSDB_NAMESPACE +TEST_F(ObsoleteFilesTest, BlobFiles) { + ReopenDB(); -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); 
+ VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const ImmutableCFOptions* const ioptions = cfd->ioptions(); + assert(ioptions); + assert(!ioptions->cf_paths.empty()); + + const std::string& path = ioptions->cf_paths.front().path; + + // Add an obsolete blob file. + constexpr uint64_t first_blob_file_number = 234; + versions->AddObsoleteBlobFile(first_blob_file_number, path); + + // Add a live blob file. + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + constexpr uint64_t second_blob_file_number = 456; + constexpr uint64_t second_total_blob_count = 100; + constexpr uint64_t second_total_blob_bytes = 2000000; + constexpr char second_checksum_method[] = "CRC32B"; + constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a"; + + auto shared_meta = SharedBlobFileMetaData::Create( + second_blob_file_number, second_total_blob_count, second_total_blob_bytes, + second_checksum_method, second_checksum_value); + + constexpr uint64_t second_garbage_blob_count = 0; + constexpr uint64_t second_garbage_blob_bytes = 0; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + second_garbage_blob_count, second_garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + + // Check for obsolete files and make sure the first blob file is picked up + // and grabbed for purge. The second blob file should be on the live list. 
+ constexpr int job_id = 0; + JobContext job_context{job_id}; + + dbfull()->TEST_LockMutex(); + constexpr bool force_full_scan = false; + dbfull()->FindObsoleteFiles(&job_context, force_full_scan); + dbfull()->TEST_UnlockMutex(); + + ASSERT_TRUE(job_context.HaveSomethingToDelete()); + ASSERT_EQ(job_context.blob_delete_files.size(), 1); + ASSERT_EQ(job_context.blob_delete_files[0].GetBlobFileNumber(), + first_blob_file_number); + + const auto& files_grabbed_for_purge = + dbfull()->TEST_GetFilesGrabbedForPurge(); + ASSERT_NE(files_grabbed_for_purge.find(first_blob_file_number), + files_grabbed_for_purge.end()); + + ASSERT_EQ(job_context.blob_live.size(), 1); + ASSERT_EQ(job_context.blob_live[0], second_blob_file_number); + + // Hack the job context a bit by adding a few files to the full scan + // list and adjusting the pending file number. We add the two files + // above as well as two additional ones, where one is old + // and should be cleaned up, and the other is still pending. + constexpr uint64_t old_blob_file_number = 123; + constexpr uint64_t pending_blob_file_number = 567; + + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(old_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(first_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(second_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(pending_blob_file_number), path); + + job_context.min_pending_output = pending_blob_file_number; + + // Purge obsolete files and make sure we purge the old file and the first file + // (and keep the second file and the pending file). 
+ std::vector deleted_files; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", [&](void* arg) { + const std::string* file = static_cast(arg); + assert(file); + + constexpr char blob_extension[] = ".blob"; + + if (file->find(blob_extension) != std::string::npos) { + deleted_files.emplace_back(*file); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + dbfull()->PurgeObsoleteFiles(job_context); + job_context.Clean(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(files_grabbed_for_purge.find(first_blob_file_number), + files_grabbed_for_purge.end()); + + std::sort(deleted_files.begin(), deleted_files.end()); + const std::vector expected_deleted_files{ + BlobFileName(path, old_blob_file_number), + BlobFileName(path, first_blob_file_number)}; + + ASSERT_EQ(deleted_files, expected_deleted_files); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/options_file_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/options_file_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/options_file_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/options_file_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -25,7 +25,7 @@ std::unordered_set* filename_history, int* options_files_count) { std::vector filenames; - db->GetEnv()->GetChildren(db->GetName(), &filenames); + EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames)); uint64_t number; FileType type; *options_files_count = 0; @@ -42,7 +42,7 @@ DB* db, const std::unordered_set& past_filenames) { std::vector filenames; std::unordered_set current_filenames; - db->GetEnv()->GetChildren(db->GetName(), 
&filenames); + EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames)); uint64_t number; FileType type; for (auto filename : filenames) { @@ -65,7 +65,7 @@ const int kReopenCount = 20; Options opt; opt.create_if_missing = true; - DestroyDB(dbname_, opt); + ASSERT_OK(DestroyDB(dbname_, opt)); std::unordered_set filename_history; DB* db; for (int i = 0; i < kReopenCount; ++i) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/output_validator.h" + +#include "test_util/sync_point.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { +Status OutputValidator::Add(const Slice& key, const Slice& value) { + if (enable_hash_) { + // Generate a rolling 64-bit hash of the key and values + paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_); + paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_); + } + if (enable_order_check_) { + TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check", + /*arg=*/nullptr); + if (key.size() < kNumInternalBytes) { + return Status::Corruption( + "Compaction tries to write a key without internal bytes."); + } + // prev_key_ starts with empty. 
+ if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) { + return Status::Corruption("Compaction sees out-of-order keys."); + } + prev_key_.assign(key.data(), key.size()); + } + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +// A class that validates key/value that is inserted to an SST file. +// Pass every key/value of the file using OutputValidator::Add() +// and the class validates key order and optionally calculate a hash +// of all the key and value. +class OutputValidator { + public: + explicit OutputValidator(const InternalKeyComparator& icmp, + bool enable_order_check, bool enable_hash, + uint64_t precalculated_hash = 0) + : icmp_(icmp), + paranoid_hash_(precalculated_hash), + enable_order_check_(enable_order_check), + enable_hash_(enable_hash) {} + + // Add a key to the KV sequence, and return whether the key follows + // criteria, e.g. key is ordered. + Status Add(const Slice& key, const Slice& value); + + // Compare result of two key orders are the same. It can be used + // to compare the keys inserted into a file, and what is read back. + // Return true if the validation passes. 
+ bool CompareValidator(const OutputValidator& other_validator) { + return GetHash() == other_validator.GetHash(); + } + + // Not (yet) intended to be persisted, so subject to change + // without notice between releases. + uint64_t GetHash() const { return paranoid_hash_; } + + private: + const InternalKeyComparator& icmp_; + std::string prev_key_; + uint64_t paranoid_hash_ = 0; + bool enable_order_check_; + bool enable_hash_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/perf_context_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/perf_context_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/perf_context_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/perf_context_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // +#include "rocksdb/perf_context.h" + #include #include #include @@ -15,8 +17,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -76,12 +78,12 @@ std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { std::string key = "k" + ToString(i); - db->Delete(write_options, key); + ASSERT_OK(db->Delete(write_options, key)); } HistogramImpl hist_get; @@ -91,7 +93,7 @@ std::string value; get_perf_context()->Reset(); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); timer.Start(); auto status = db->Get(read_options, key, &value); auto elapsed_nanos = timer.ElapsedNanos(); @@ -110,16 +112,15 @@ 
std::unique_ptr iter(db->NewIterator(read_options)); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->SeekToFirst(); hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count); auto elapsed_nanos = timer.ElapsedNanos(); if (FLAGS_verbose) { - std::cout << "SeekToFirst uesr key comparison: \n" - << hist_seek_to_first.ToString() - << "ikey skipped: " << get_perf_context()->internal_key_skipped_count - << "\n" + std::cout << "SeekToFirst user key comparison: \n" + << hist_seek_to_first.ToString() << "ikey skipped: " + << get_perf_context()->internal_key_skipped_count << "\n" << "idelete skipped: " << get_perf_context()->internal_delete_skipped_count << "\n" << "elapsed: " << elapsed_nanos << "\n"; @@ -132,7 +133,7 @@ std::string key = "k" + ToString(i); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); auto elapsed_nanos = timer.ElapsedNanos(); hist_seek.Add(get_perf_context()->user_key_comparison_count); @@ -146,7 +147,7 @@ get_perf_context()->Reset(); ASSERT_TRUE(iter->Valid()); - StopWatchNano timer2(Env::Default(), true); + StopWatchNano timer2(SystemClock::Default().get(), true); iter->Next(); auto elapsed_nanos2 = timer2.ElapsedNanos(); if (FLAGS_verbose) { @@ -156,7 +157,7 @@ } if (FLAGS_verbose) { - std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString(); + std::cout << "Seek user key comparison: \n" << hist_seek.ToString(); } } @@ -165,7 +166,7 @@ const int kTotalIterations = 1000000; std::vector timings(kTotalIterations); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); for (auto& timing : timings) { timing = timer.ElapsedNanos(true /* reset */); } @@ -186,7 +187,7 @@ uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(Env::Default(), nullptr, 0, &elapsed); + StopWatch 
timer(SystemClock::Default().get(), nullptr, 0, &elapsed); for (auto& timing : timings) { timing = elapsed; } @@ -251,7 +252,7 @@ } if (FLAGS_random_key) { - std::random_shuffle(keys.begin(), keys.end()); + RandomShuffle(std::begin(keys), std::end(keys)); } #ifndef NDEBUG ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U); @@ -270,7 +271,7 @@ std::vector values; get_perf_context()->Reset(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); if (++num_mutex_waited > 3) { #ifndef NDEBUG ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); @@ -314,7 +315,10 @@ hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -324,9 +328,10 @@ } if (FLAGS_verbose) { - std::cout << "Put uesr key comparison: \n" << hist_put.ToString() - << "Get uesr key comparison: \n" << hist_get.ToString() - << "MultiGet uesr key comparison: \n" << hist_get.ToString(); + std::cout << "Put user key comparison: \n" + << hist_put.ToString() << "Get user key comparison: \n" + << hist_get.ToString() << "MultiGet user key comparison: \n" + << hist_get.ToString(); std::cout << "Put(): Pre and Post Process Time: \n" << hist_write_pre_post.ToString() << " Writing WAL time: \n" << hist_write_wal_time.ToString() << "\n" @@ -428,7 +433,10 @@ hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } 
hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -438,8 +446,9 @@ } if (FLAGS_verbose) { - std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString() - << "ReadOnly MultiGet uesr key comparison: \n" + std::cout << "ReadOnly Get user key comparison: \n" + << hist_get.ToString() + << "ReadOnly MultiGet user key comparison: \n" << hist_mget.ToString(); std::cout << "ReadOnly Get(): Time to get snapshot: \n" @@ -524,7 +533,7 @@ } if (FLAGS_random_key) { - std::random_shuffle(keys.begin(), keys.end()); + RandomShuffle(std::begin(keys), std::end(keys)); } HistogramImpl hist_put_time; @@ -532,14 +541,14 @@ HistogramImpl hist_time_diff; SetPerfLevel(kEnableTime); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); for (const int i : keys) { std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); get_perf_context()->Reset(); timer.Start(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); hist_wal_time.Add(get_perf_context()->write_wal_time); @@ -573,7 +582,7 @@ iter->Next(); hist_next.Add(get_perf_context()->user_key_comparison_count); } - + ASSERT_OK(iter->status()); if (FLAGS_verbose) { std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n" << hist_next.ToString(); @@ -585,25 +594,26 @@ for (PerfLevel perf_level_test : {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) { for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); - mutex.Lock(); - ROCKSDB_NAMESPACE::port::Thread child_thread([&] { - SetPerfLevel(perf_level_test); - get_perf_context()->Reset(); - ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + 
stats_code[c]); mutex.Lock(); - mutex.Unlock(); - if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || - stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ROCKSDB_NAMESPACE::port::Thread child_thread([&] { + SetPerfLevel(perf_level_test); + get_perf_context()->Reset(); ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); - } else { - // increment the counter only when it's a DB Mutex - ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); - } - }); - Env::Default()->SleepForMicroseconds(100); - mutex.Unlock(); - child_thread.join(); + mutex.Lock(); + mutex.Unlock(); + if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || + stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + } else { + // increment the counter only when it's a DB Mutex + ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); + } + }); + SystemClock::Default()->SleepForMicroseconds(100); + mutex.Unlock(); + child_thread.join(); } } } @@ -612,7 +622,8 @@ SetPerfLevel(kEnableTime); int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + stats_code[c]); InstrumentedCondVar lock(&mutex); get_perf_context()->Reset(); mutex.Lock(); @@ -817,6 +828,11 @@ } TEST_F(PerfContextTest, CPUTimer) { + if (SystemClock::Default()->CPUNanos() == 0) { + ROCKSDB_GTEST_SKIP("Target without CPUNanos support"); + return; + } + DestroyDB(kDbName, Options()); auto db = OpenDb(); WriteOptions write_options; @@ -830,7 +846,7 @@ std::string value = "v" + i_str; max_str = max_str > i_str ? 
max_str : i_str; - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } std::string last_key = "k" + max_str; std::string last_value = "v" + max_str; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/periodic_work_scheduler.h" + +#include "db/db_impl/db_impl.h" +#include "rocksdb/system_clock.h" + +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { + +PeriodicWorkScheduler::PeriodicWorkScheduler( + const std::shared_ptr& clock) { + timer = std::unique_ptr(new Timer(clock.get())); +} + +void PeriodicWorkScheduler::Register(DBImpl* dbi, + unsigned int stats_dump_period_sec, + unsigned int stats_persist_period_sec) { + MutexLock l(&timer_mu_); + static std::atomic initial_delay(0); + timer->Start(); + if (stats_dump_period_sec > 0) { + timer->Add([dbi]() { dbi->DumpStats(); }, GetTaskName(dbi, "dump_st"), + initial_delay.fetch_add(1) % + static_cast(stats_dump_period_sec) * + kMicrosInSecond, + static_cast(stats_dump_period_sec) * kMicrosInSecond); + } + if (stats_persist_period_sec > 0) { + timer->Add( + [dbi]() { dbi->PersistStats(); }, GetTaskName(dbi, "pst_st"), + initial_delay.fetch_add(1) % + static_cast(stats_persist_period_sec) * kMicrosInSecond, + static_cast(stats_persist_period_sec) * kMicrosInSecond); + } + timer->Add([dbi]() { dbi->FlushInfoLog(); }, + GetTaskName(dbi, "flush_info_log"), + 
initial_delay.fetch_add(1) % kDefaultFlushInfoLogPeriodSec * + kMicrosInSecond, + kDefaultFlushInfoLogPeriodSec * kMicrosInSecond); +} + +void PeriodicWorkScheduler::Unregister(DBImpl* dbi) { + MutexLock l(&timer_mu_); + timer->Cancel(GetTaskName(dbi, "dump_st")); + timer->Cancel(GetTaskName(dbi, "pst_st")); + timer->Cancel(GetTaskName(dbi, "flush_info_log")); + if (!timer->HasPendingTask()) { + timer->Shutdown(); + } +} + +PeriodicWorkScheduler* PeriodicWorkScheduler::Default() { + // Always use the default SystemClock for the scheduler, as we only use the + // NowMicros which is the same for all clocks. The Env could only be + // overridden in test. + static PeriodicWorkScheduler scheduler(SystemClock::Default()); + return &scheduler; +} + +std::string PeriodicWorkScheduler::GetTaskName(DBImpl* dbi, + const std::string& func_name) { + std::string db_session_id; + // TODO: Should this error be ignored? + dbi->GetDbSessionId(db_session_id).PermitUncheckedError(); + return db_session_id + ":" + func_name; +} + +#ifndef NDEBUG + +// Get the static scheduler. For a new SystemClock, it needs to re-create the +// internal timer, so only re-create it when there's no running task. Otherwise, +// return the existing scheduler. Which means if the unittest needs to update +// MockClock, Close all db instances and then re-open them. 
+PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default( + const std::shared_ptr& clock) { + static PeriodicWorkTestScheduler scheduler(clock); + static port::Mutex mutex; + { + MutexLock l(&mutex); + if (scheduler.timer.get() != nullptr && + scheduler.timer->TEST_GetPendingTaskNum() == 0) { + { + MutexLock timer_mu_guard(&scheduler.timer_mu_); + scheduler.timer->Shutdown(); + } + scheduler.timer.reset(new Timer(clock.get())); + } + } + return &scheduler; +} + +void PeriodicWorkTestScheduler::TEST_WaitForRun( + std::function callback) const { + if (timer != nullptr) { + timer->TEST_WaitForRun(callback); + } +} + +size_t PeriodicWorkTestScheduler::TEST_GetValidTaskNum() const { + if (timer != nullptr) { + return timer->TEST_GetPendingTaskNum(); + } + return 0; +} + +PeriodicWorkTestScheduler::PeriodicWorkTestScheduler( + const std::shared_ptr& clock) + : PeriodicWorkScheduler(clock) {} + +#endif // !NDEBUG +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "db/db_impl/db_impl.h" +#include "util/timer.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; + +// PeriodicWorkScheduler is a singleton object, which is scheduling/running +// DumpStats(), PersistStats(), and FlushInfoLog() for all DB instances. All DB +// instances use the same object from `Default()`. 
+// +// Internally, it uses a single threaded timer object to run the periodic work +// functions. Timer thread will always be started since the info log flushing +// cannot be disabled. +class PeriodicWorkScheduler { + public: + static PeriodicWorkScheduler* Default(); + + PeriodicWorkScheduler() = delete; + PeriodicWorkScheduler(const PeriodicWorkScheduler&) = delete; + PeriodicWorkScheduler(PeriodicWorkScheduler&&) = delete; + PeriodicWorkScheduler& operator=(const PeriodicWorkScheduler&) = delete; + PeriodicWorkScheduler& operator=(PeriodicWorkScheduler&&) = delete; + + void Register(DBImpl* dbi, unsigned int stats_dump_period_sec, + unsigned int stats_persist_period_sec); + + void Unregister(DBImpl* dbi); + + // Periodically flush info log out of application buffer at a low frequency. + // This improves debuggability in case of RocksDB hanging since it ensures the + // log messages leading up to the hang will eventually become visible in the + // log. + static const uint64_t kDefaultFlushInfoLogPeriodSec = 10; + + protected: + std::unique_ptr timer; + // `timer_mu_` serves two purposes currently: + // (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as + // they are currently not implemented in a thread-safe way; and + // (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and + // the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically. + port::Mutex timer_mu_; + + explicit PeriodicWorkScheduler(const std::shared_ptr& clock); + + private: + std::string GetTaskName(DBImpl* dbi, const std::string& func_name); +}; + +#ifndef NDEBUG +// PeriodicWorkTestScheduler is for unittest, which can specify the SystemClock +// It also contains functions for unittest. 
+class PeriodicWorkTestScheduler : public PeriodicWorkScheduler { + public: + static PeriodicWorkTestScheduler* Default( + const std::shared_ptr& clock); + + void TEST_WaitForRun(std::function callback) const; + + size_t TEST_GetValidTaskNum() const; + + private: + explicit PeriodicWorkTestScheduler(const std::shared_ptr& clock); +}; +#endif // !NDEBUG + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,236 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/periodic_work_scheduler.h" + +#include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "test_util/mock_time_env.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class PeriodicWorkSchedulerTest : public DBTestBase { + public: + PeriodicWorkSchedulerTest() + : DBTestBase("periodic_work_scheduler_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } + + protected: + std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { + auto* periodic_work_scheduler_ptr = + reinterpret_cast(arg); + *periodic_work_scheduler_ptr = + PeriodicWorkTestScheduler::Default(mock_clock_); + }); + } +}; + +TEST_F(PeriodicWorkSchedulerTest, Basic) { + constexpr unsigned int kPeriodSec = + PeriodicWorkScheduler::kDefaultFlushInfoLogPeriodSec; + Close(); + Options options; + options.stats_dump_period_sec = kPeriodSec; + options.stats_persist_period_sec = kPeriodSec; + options.create_if_missing = true; + options.env = mock_env_.get(); + + int dump_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:StartRunning", + [&](void*) { dump_st_counter++; }); + + int pst_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning", + [&](void*) { pst_st_counter++; }); + + int flush_info_log_counter = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushInfoLog:StartRunning", + [&](void*) { flush_info_log_counter++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); + + ASSERT_GT(kPeriodSec, 1u); + dbfull()->TEST_WaitForStatsDumpRun([&] { + 
mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec) - 1); + }); + + auto scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); + ASSERT_NE(nullptr, scheduler); + ASSERT_EQ(3, scheduler->TEST_GetValidTaskNum()); + + ASSERT_EQ(1, dump_st_counter); + ASSERT_EQ(1, pst_st_counter); + ASSERT_EQ(1, flush_info_log_counter); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + + ASSERT_EQ(2, dump_st_counter); + ASSERT_EQ(2, pst_st_counter); + ASSERT_EQ(2, flush_info_log_counter); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + + ASSERT_EQ(3, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(3, flush_info_log_counter); + + // Disable scheduler with SetOption + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_dump_period_sec", "0"}, {"stats_persist_period_sec", "0"}})); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); + + // Info log flush should still run. 
+ dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + ASSERT_EQ(3, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(4, flush_info_log_counter); + + scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); + ASSERT_EQ(1u, scheduler->TEST_GetValidTaskNum()); + + // Re-enable one task + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}})); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); + + scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); + ASSERT_NE(nullptr, scheduler); + ASSERT_EQ(2, scheduler->TEST_GetValidTaskNum()); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + ASSERT_EQ(4, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(5, flush_info_log_counter); + + Close(); +} + +TEST_F(PeriodicWorkSchedulerTest, MultiInstances) { + constexpr int kPeriodSec = 5; + const int kInstanceNum = 10; + + Close(); + Options options; + options.stats_dump_period_sec = kPeriodSec; + options.stats_persist_period_sec = kPeriodSec; + options.create_if_missing = true; + options.env = mock_env_.get(); + + int dump_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:2", + [&](void*) { dump_st_counter++; }); + + int pst_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning", + [&](void*) { pst_st_counter++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + auto dbs = std::vector(kInstanceNum); + for (int i = 0; i < kInstanceNum; i++) { + ASSERT_OK( + DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i]))); + } + + auto dbi = static_cast_with_check(dbs[kInstanceNum - 1]); + auto scheduler = dbi->TEST_GetPeriodicWorkScheduler(); + ASSERT_EQ(kInstanceNum * 3, scheduler->TEST_GetValidTaskNum()); + + int expected_run = kInstanceNum; + dbi->TEST_WaitForStatsDumpRun( + 
[&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + expected_run += kInstanceNum; + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + expected_run += kInstanceNum; + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + int half = kInstanceNum / 2; + for (int i = 0; i < half; i++) { + delete dbs[i]; + } + + expected_run += (kInstanceNum - half) * 2; + + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + for (int i = half; i < kInstanceNum; i++) { + ASSERT_OK(dbs[i]->Close()); + delete dbs[i]; + } +} + +TEST_F(PeriodicWorkSchedulerTest, MultiEnv) { + constexpr int kDumpPeriodSec = 5; + constexpr int kPersistPeriodSec = 10; + Close(); + Options options1; + options1.stats_dump_period_sec = kDumpPeriodSec; + options1.stats_persist_period_sec = kPersistPeriodSec; + options1.create_if_missing = true; + options1.env = mock_env_.get(); + + Reopen(options1); + + std::unique_ptr mock_env2( + new CompositeEnvWrapper(Env::Default(), mock_clock_)); + Options options2; + options2.stats_dump_period_sec = kDumpPeriodSec; + options2.stats_persist_period_sec = kPersistPeriodSec; + options2.create_if_missing = true; + options1.env = mock_env2.get(); + + std::string dbname = test::PerThreadDBPath("multi_env_test"); + DB* db; + ASSERT_OK(DB::Open(options2, dbname, &db)); + DBImpl* dbi = static_cast_with_check(db); + + ASSERT_EQ(dbi->TEST_GetPeriodicWorkScheduler(), + dbfull()->TEST_GetPeriodicWorkScheduler()); + + 
ASSERT_OK(db->Close()); + delete db; + Close(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -43,7 +43,7 @@ } } - typedef void (*ReleaseFunction)(void* arg1); + using ReleaseFunction = void (*)(void* arg1); void PinPtr(void* ptr, ReleaseFunction release_func) { assert(pinning_enabled); if (ptr == nullptr) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/plain_table_db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/plain_table_db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/plain_table_db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/plain_table_db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,7 +16,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -32,26 +31,27 @@ #include "table/table_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/hash.h" #include "util/mutexlock.h" +#include "util/random.h" #include "util/string_util.h" #include "utilities/merge_operators.h" -using std::unique_ptr; namespace ROCKSDB_NAMESPACE { class PlainTableKeyDecoderTest : public testing::Test {}; TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) { - std::string tmp; Random rnd(301); const uint32_t kLength = 2222; - Slice contents = test::RandomString(&rnd, kLength, &tmp); + std::string 
tmp = rnd.RandomString(kLength); + Slice contents(tmp); test::StringSource* string_source = new test::StringSource(contents, 0, false); - + std::unique_ptr holder(string_source); std::unique_ptr file_reader( - test::GetRandomAccessFileReader(string_source)); + new RandomAccessFileReader(std::move(holder), "test")); std::unique_ptr file_info( new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(), kLength)); @@ -146,9 +146,7 @@ return options; } - DBImpl* dbfull() { - return reinterpret_cast(db_); - } + DBImpl* dbfull() { return static_cast_with_check(db_); } void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); @@ -222,8 +220,8 @@ int NumTableFilesAtLevel(int level) { std::string property; - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + EXPECT_TRUE(db_->GetProperty("rocksdb.num-files-at-level" + ToString(level), + &property)); return atoi(property.c_str()); } @@ -264,31 +262,26 @@ class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& env_options, - const InternalKeyComparator& icomparator, - EncodingType encoding_type, uint64_t file_size, - int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, - const TableProperties* table_properties, - std::unique_ptr&& file, - const ImmutableCFOptions& ioptions, - const SliceTransform* prefix_extractor, - bool* expect_bloom_not_match, bool store_index_in_file, - uint32_t column_family_id, - const std::string& column_family_name) + TestPlainTableReader( + const EnvOptions& env_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + std::unique_ptr&& props, + std::unique_ptr&& file, + const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, + bool* expect_bloom_not_match, bool store_index_in_file, + uint32_t column_family_id, const 
std::string& column_family_name) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, - encoding_type, file_size, table_properties, + encoding_type, file_size, props.get(), prefix_extractor), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataIfNeeded(); EXPECT_TRUE(s.ok()); - s = PopulateIndex(const_cast(table_properties), - bloom_bits_per_key, hash_table_ratio, index_sparseness, - 2 * 1024 * 1024); + s = PopulateIndex(props.get(), bloom_bits_per_key, hash_table_ratio, + index_sparseness, 2 * 1024 * 1024); EXPECT_TRUE(s.ok()); - TableProperties* props = const_cast(table_properties); EXPECT_EQ(column_family_id, static_cast(props->column_family_id)); EXPECT_EQ(column_family_name, props->column_family_name); if (store_index_in_file) { @@ -302,7 +295,7 @@ EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); } } - table_properties_.reset(props); + table_properties_ = std::move(props); } ~TestPlainTableReader() override {} @@ -336,31 +329,30 @@ column_family_id_(column_family_id), column_family_name_(std::move(column_family_name)) {} + using PlainTableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const override { - TableProperties* props = nullptr; - auto s = - ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, &props, - true /* compression_type_missing */); + std::unique_ptr props; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, &props); EXPECT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; - s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, - BloomBlockBuilder::kBloomBlock, 
&bloom_block_handle, - /* compression_type_missing */ true); + s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, + BloomBlockBuilder::kBloomBlock, + &bloom_block_handle); EXPECT_TRUE(s.ok()); BlockHandle index_block_handle; - s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, - PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_handle, /* compression_type_missing */ true); + s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, + PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_handle); EXPECT_TRUE(s.ok()); } @@ -374,9 +366,9 @@ std::unique_ptr new_reader(new TestPlainTableReader( table_reader_options.env_options, table_reader_options.internal_comparator, encoding_type, file_size, - bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), table_reader_options.ioptions, - table_reader_options.prefix_extractor, expect_bloom_not_match_, + bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, + std::move(props), std::move(file), table_reader_options.ioptions, + table_reader_options.prefix_extractor.get(), expect_bloom_not_match_, store_index_in_file_, column_family_id_, column_family_name_)); *table = std::move(new_reader); @@ -396,7 +388,7 @@ TEST_P(PlainTableDBTest, BadOptions1) { // Build with a prefix extractor ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open without a prefix extractor Options options = CurrentOptions(); @@ -427,7 +419,9 @@ // Build without a prefix extractor // (apparently works even if hash_table_ratio > 0) ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + // Build without a prefix extractor, this call will fail and returns the + // status for this bad attempt. 
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor Status s = TryReopen(&options); @@ -502,14 +496,15 @@ ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-table-readers-mem", &int_num)); ASSERT_GT(int_num, 0U); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK( + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); auto row = ptc.begin(); auto tp = row->second; @@ -594,23 +589,23 @@ DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put("1000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v2", Get("1000000000000foo")); ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v3", Get("0000000000000eee")); ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); ASSERT_OK(Put("0000000000000eee", "v5")); ASSERT_OK(Put("9000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("0000000000000eee")); // Test Bloom Filter @@ -650,7 +645,7 @@ DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); int copied = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -728,7 +723,7 @@ 
ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -798,7 +793,7 @@ expect_bloom_not_match = false; } } - + ASSERT_OK(iter->status()); delete iter; } } @@ -839,7 +834,7 @@ for (unsigned i = 0; i < 2345; ++i) { ASSERT_OK(Put(NthKey(i, 'y'), "added")); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("added", Get(NthKey(42, 'y'))); for (unsigned i = 0; i < 32; ++i) { @@ -897,7 +892,7 @@ ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -945,7 +940,7 @@ ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -980,7 +975,7 @@ ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -1058,7 +1053,7 @@ ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1119,6 +1114,7 @@ iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1152,7 +1148,7 @@ ASSERT_OK(Put("2000000000000fo2", "v")); 
ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1212,6 +1208,7 @@ iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1234,7 +1231,7 @@ ASSERT_OK(Put("5000000000000fo1", "v2")); ASSERT_OK(Put("5000000000000fo2", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1258,6 +1255,7 @@ iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } @@ -1267,15 +1265,9 @@ return std::string(buf); } -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - TEST_P(PlainTableDBTest, CompactionTrigger) { Options options = CurrentOptions(); - options.write_buffer_size = 120 << 10; // 100KB + options.write_buffer_size = 120 << 10; // 120KB options.num_levels = 3; options.level0_file_num_compaction_trigger = 3; Reopen(&options); @@ -1287,22 +1279,22 @@ std::vector values; // Write 120KB (10 values, each 12K) for (int i = 0; i < 10; i++) { - values.push_back(RandomString(&rnd, 12000)); + values.push_back(rnd.RandomString(12 << 10)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); } //generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); + values.push_back(rnd.RandomString(10000)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); 
ASSERT_EQ(NumTableFilesAtLevel(1), 1); @@ -1318,7 +1310,7 @@ ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); options.create_if_missing = false; std::shared_ptr block_based_factory( @@ -1334,7 +1326,7 @@ ASSERT_OK(Put("2000000000000foo", "v4")); ASSERT_OK(Put("3000000000000bar", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v4", Get("2000000000000foo")); ASSERT_EQ("v5", Get("3000000000000bar")); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/pre_release_callback.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/pre_release_callback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/pre_release_callback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/pre_release_callback.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,11 +6,10 @@ #pragma once #include "rocksdb/status.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -class DB; - class PreReleaseCallback { public: virtual ~PreReleaseCallback() {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/prefix_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/prefix_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/prefix_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/prefix_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -25,8 +25,10 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "test_util/testharness.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/gflags_compat.h" #include "util/random.h" @@ -310,7 +312,7 @@ ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006")); ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011")); 
ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -330,7 +332,7 @@ ASSERT_OK(db->Put(write_options, "pikachu", "1")); ASSERT_OK(db->Put(write_options, "Meowth", "1")); ASSERT_OK(db->Put(write_options, "Mewtwo", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -350,7 +352,7 @@ std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -370,9 +372,11 @@ ASSERT_TRUE(v16 == iter->value()); iter->Next(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); SeekIterator(iter.get(), 2, 0); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5)); @@ -396,9 +400,11 @@ ASSERT_TRUE(v17 == iter->value()); iter->Next(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); SeekIterator(iter.get(), 2, 0); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); // 3. Insert an entry for the same prefix as the head of the bucket. 
Slice v15("v15"); @@ -523,7 +529,7 @@ while (NextOptions(num_buckets)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -538,11 +544,11 @@ PutKey(db.get(), write_options, 12345, 8, v18); PutKey(db.get(), write_options, 12345, 9, v19); PutKey(db.get(), write_options, 12346, 8, v16); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); TestKey test_key(12346, 8); std::string s; - db->Delete(write_options, TestKeyToSlice(s, test_key)); - db->Flush(FlushOptions()); + ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key))); + ASSERT_OK(db->Flush(FlushOptions())); read_options.prefix_same_as_start = true; std::unique_ptr iter(db->NewIterator(read_options)); SeekIterator(iter.get(), 12345, 6); @@ -567,6 +573,7 @@ // Verify seeking past the prefix won't return a result. SeekIterator(iter.get(), 12345, 10); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); } } } @@ -575,7 +582,7 @@ while (NextOptions(FLAGS_bucket_count)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -586,12 +593,11 @@ } if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); + RandomShuffle(prefixes.begin(), prefixes.end()); } HistogramImpl hist_put_time; HistogramImpl hist_put_comparison; - // insert x random prefix, each with y continuous element. 
for (auto prefix : prefixes) { for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { @@ -602,7 +608,7 @@ std::string value(FLAGS_value_size, 0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); ASSERT_OK(db->Put(write_options, key, value)); hist_put_time.Add(timer.ElapsedNanos()); hist_put_comparison.Add(get_perf_context()->user_key_comparison_count); @@ -625,7 +631,7 @@ std::string value = "v" + ToString(0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); auto key_prefix = options.prefix_extractor->Transform(key); uint64_t total_keys = 0; for (iter->Seek(key); @@ -659,11 +665,12 @@ Slice key = TestKeyToSlice(s, test_key); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); hist_no_seek_time.Add(timer.ElapsedNanos()); hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); } std::cout << "non-existing Seek key comparison: \n" @@ -682,7 +689,7 @@ for (size_t m = 1; m < 100; m++) { std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -707,7 +714,7 @@ } } if (i < 2) { - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); } } @@ -767,6 +774,7 @@ SliceToTestKey(iter->key()).prefix != stored_prefix) { break; } + ASSERT_OK(iter->status()); stored_prefix = SliceToTestKey(iter->key()).prefix; ASSERT_TRUE(iter->Valid()); ASSERT_NE(it, whole_map.end()); @@ -798,7 +806,7 @@ options.memtable_factory.reset(new SkipListFactory); options.write_buffer_size = 1024 * 1024; std::string v13("v13"); - 
DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -806,17 +814,20 @@ PutKey(db.get(), write_options, TestKey(1, 4), "v14"); PutKey(db.get(), write_options, TestKey(3, 3), "v33"); PutKey(db.get(), write_options, TestKey(3, 4), "v34"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); PutKey(db.get(), write_options, TestKey(1, 1), "v11"); PutKey(db.get(), write_options, TestKey(1, 3), "v13"); PutKey(db.get(), write_options, TestKey(2, 1), "v21"); PutKey(db.get(), write_options, TestKey(2, 2), "v22"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); std::unique_ptr iter(db->NewIterator(read_options)); SeekIterator(iter.get(), 1, 5); iter->Prev(); + ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->value(), v13); } @@ -831,27 +842,29 @@ Slice upper_bound = TestKeyToSlice(s, upper_bound_key); { - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; read_options.iterate_upper_bound = &upper_bound; PutKey(db.get(), write_options, TestKey(1, 2), "v12"); PutKey(db.get(), write_options, TestKey(1, 4), "v14"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); PutKey(db.get(), write_options, TestKey(1, 1), "v11"); PutKey(db.get(), write_options, TestKey(1, 3), "v13"); PutKey(db.get(), write_options, TestKey(2, 1), "v21"); PutKey(db.get(), write_options, TestKey(2, 2), "v22"); - db->Flush(FlushOptions()); - 
reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); std::unique_ptr iter(db->NewIterator(read_options)); iter->SeekToLast(); ASSERT_EQ(iter->value(), v14); } { - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -860,12 +873,14 @@ PutKey(db.get(), write_options, TestKey(1, 4), "v14"); PutKey(db.get(), write_options, TestKey(3, 3), "v33"); PutKey(db.get(), write_options, TestKey(3, 4), "v34"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); PutKey(db.get(), write_options, TestKey(1, 1), "v11"); PutKey(db.get(), write_options, TestKey(1, 3), "v13"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); std::unique_ptr iter(db->NewIterator(read_options)); iter->SeekToLast(); ASSERT_EQ(iter->value(), v14); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,17 +33,22 @@ if (smallest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_smallest = pinned_bounds_.back(); - if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) { - assert(false); - } + Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); 
+ smallest_ = &parsed_smallest; } if (largest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_largest = pinned_bounds_.back(); - if (!ParseInternalKey(largest->Encode(), &parsed_largest)) { - assert(false); - } + + Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + if (parsed_largest.type == kTypeRangeDeletion && parsed_largest.sequence == kMaxSequenceNumber) { // The file boundary has been artificially extended by a range tombstone. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.h 2025-05-19 16:14:27.000000000 +0000 @@ -43,12 +43,12 @@ void InternalNext(); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the earliest tombstone that ends after target. void Seek(const Slice& target); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the latest tombstone that starts before target. 
void SeekForPrev(const Slice& target); @@ -283,9 +283,14 @@ bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { ParsedInternalKey parsed; - if (!ParseInternalKey(key, &parsed)) { + + Status pik_status = + ParseInternalKey(key, &parsed, false /* log_err_key */); // TODO + assert(pik_status.ok()); + if (!pik_status.ok()) { return false; } + return ShouldDelete(parsed, mode); } virtual bool ShouldDelete(const ParsedInternalKey& parsed, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,24 +11,24 @@ } #else -#include #include +#include #include #include #include #include #include +#include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" -#include "rocksdb/env.h" -#include "test_util/testutil.h" +#include "rocksdb/system_clock.h" #include "util/coding.h" +#include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" - -#include "util/gflags_compat.h" +#include "util/vector_iterator.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; @@ -147,8 +147,8 @@ keys.push_back(key_and_value.first.Encode().ToString()); values.push_back(key_and_value.second.ToString()); } - return std::unique_ptr( - new test::VectorIterator(keys, values)); + return std::unique_ptr( + new VectorIterator(keys, values, &icmp)); } // convert long to a big-endian slice key @@ -172,6 +172,8 @@ ParseCommandLineFlags(&argc, &argv, true); Stats stats; + ROCKSDB_NAMESPACE::SystemClock* clock = + ROCKSDB_NAMESPACE::SystemClock::Default().get(); ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed); std::default_random_engine random_gen(FLAGS_seed); 
std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, @@ -206,8 +208,6 @@ ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j); } - auto range_del_iter = - ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones); fragmented_range_tombstone_lists.emplace_back( new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList( ROCKSDB_NAMESPACE::MakeRangeDelIterator( @@ -220,7 +220,7 @@ ROCKSDB_NAMESPACE::kMaxSequenceNumber)); ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.AddTombstones(std::move(fragmented_range_del_iter)); stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); } @@ -237,7 +237,7 @@ parsed_key.user_key = key_string; ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.ShouldDelete(parsed_key, mode); uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" #include "test_util/testutil.h" +#include "util/vector_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -30,8 +31,8 @@ keys.push_back(key_and_value.first.Encode().ToString()); values.push_back(key_and_value.second.ToString()); } - return std::unique_ptr( - new test::VectorIterator(keys, values)); + return std::unique_ptr( + new VectorIterator(keys, values, &bytewise_icmp)); } std::vector> diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,12 +6,11 @@ #include "db/range_tombstone_fragmenter.h" #include +#include +#include #include #include -#include -#include - #include "util/autovector.h" #include "util/kv_map.h" #include "util/vector_iterator.h" @@ -26,12 +25,15 @@ return; } bool is_sorted = true; - int num_tombstones = 0; InternalKey pinned_last_start_key; Slice last_start_key; + num_unfragmented_tombstones_ = 0; + total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); - unfragmented_tombstones->Next(), num_tombstones++) { - if (num_tombstones > 0 && + unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); + if (num_unfragmented_tombstones_ > 0 && icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) { is_sorted = false; break; @@ -51,10 +53,14 @@ // Sort the tombstones before fragmenting them. std::vector keys, values; - keys.reserve(num_tombstones); - values.reserve(num_tombstones); + keys.reserve(num_unfragmented_tombstones_); + values.reserve(num_unfragmented_tombstones_); + // Reset the counter to zero for the next iteration over keys. 
+ total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); unfragmented_tombstones->Next()) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); keys.emplace_back(unfragmented_tombstones->key().data(), unfragmented_tombstones->key().size()); values.emplace_back(unfragmented_tombstones->value().data(), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h 2025-05-19 16:14:27.000000000 +0000 @@ -68,6 +68,14 @@ // number in [lower, upper]. bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const; + uint64_t num_unfragmented_tombstones() const { + return num_unfragmented_tombstones_; + } + + uint64_t total_tombstone_payload_bytes() const { + return total_tombstone_payload_bytes_; + } + private: // Given an ordered range tombstone iterator unfragmented_tombstones, // "fragment" the tombstones into non-overlapping pieces, and store them in @@ -82,6 +90,8 @@ std::set seq_set_; std::list pinned_slices_; PinnedIteratorsManager pinned_iters_mgr_; + uint64_t num_unfragmented_tombstones_; + uint64_t total_tombstone_payload_bytes_; }; // FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del @@ -180,6 +190,13 @@ SequenceNumber upper_bound() const { return upper_bound_; } SequenceNumber lower_bound() const { return lower_bound_; } + uint64_t num_unfragmented_tombstones() const { + return tombstones_->num_unfragmented_tombstones(); + } + uint64_t total_tombstone_payload_bytes() const { + return tombstones_->total_tombstone_payload_bytes(); + } + private: using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack; diff 
-Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,8 +6,10 @@ #include "db/range_tombstone_fragmenter.h" #include "db/db_test_util.h" +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "test_util/testutil.h" +#include "util/vector_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -25,8 +27,8 @@ keys.push_back(key_and_value.first.Encode().ToString()); values.push_back(key_and_value.second.ToString()); } - return std::unique_ptr( - new test::VectorIterator(keys, values)); + return std::unique_ptr( + new VectorIterator(keys, values, &bytewise_icmp)); } void CheckIterPosition(const RangeTombstone& tombstone, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/read_callback.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/read_callback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/read_callback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/read_callback.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,13 +5,14 @@ #pragma once +#include "db/dbformat.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class ReadCallback { public: - ReadCallback(SequenceNumber last_visible_seq) + explicit ReadCallback(SequenceNumber last_visible_seq) : max_visible_seq_(last_visible_seq) {} ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted) : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair.cc 2025-05-19 16:14:27.000000000 +0000 @@ -62,6 +62,7 @@ #ifndef ROCKSDB_LITE #include + #include "db/builder.h" #include "db/db_impl/db_impl.h" #include "db/dbformat.h" @@ -71,9 +72,9 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -94,15 +95,16 @@ const ColumnFamilyOptions& default_cf_opts, const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs) : dbname_(dbname), + db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)), env_(db_options.env), - env_options_(), + file_options_(), db_options_(SanitizeOptions(dbname_, db_options)), immutable_db_options_(ImmutableDBOptions(db_options_)), icmp_(default_cf_opts.comparator), default_cf_opts_( SanitizeOptions(immutable_db_options_, default_cf_opts)), - default_cf_iopts_( - ImmutableCFOptions(immutable_db_options_, default_cf_opts_)), + default_iopts_( + ImmutableOptions(immutable_db_options_, default_cf_opts_)), unknown_cf_opts_( SanitizeOptions(immutable_db_options_, unknown_cf_opts)), create_unknown_cfs_(create_unknown_cfs), @@ -110,16 +112,19 @@ // TableCache can be small since we expect each table to be opened // once. 
NewLRUCache(10, db_options_.table_cache_numshardbits)), - table_cache_(new TableCache(default_cf_iopts_, env_options_, + table_cache_(new TableCache(default_iopts_, &file_options_, raw_table_cache_.get(), - /*block_cache_tracer=*/nullptr)), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, db_session_id_)), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), - vset_(dbname_, &immutable_db_options_, env_options_, + vset_(dbname_, &immutable_db_options_, file_options_, raw_table_cache_.get(), &wb_, &wc_, - /*block_cache_tracer=*/nullptr), + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + db_session_id_), next_file_number_(1), - db_lock_(nullptr) { + db_lock_(nullptr), + closed_(false) { for (const auto& cfd : column_families) { cf_name_to_opts_[cfd.name] = cfd.options; } @@ -163,29 +168,37 @@ return status; } - ~Repairer() { - if (db_lock_ != nullptr) { - env_->UnlockFile(db_lock_); + Status Close() { + Status s = Status::OK(); + if (!closed_) { + if (db_lock_ != nullptr) { + s = env_->UnlockFile(db_lock_); + db_lock_ = nullptr; + } + closed_ = true; } - delete table_cache_; + return s; } + ~Repairer() { Close().PermitUncheckedError(); } + Status Run() { Status status = env_->LockFile(LockFileName(dbname_), &db_lock_); if (!status.ok()) { return status; } status = FindFiles(); + DBImpl* db_impl = nullptr; if (status.ok()) { // Discard older manifests and start a fresh one for (size_t i = 0; i < manifests_.size(); i++) { ArchiveFile(dbname_ + "/" + manifests_[i]); } // Just create a DBImpl temporarily so we can reuse NewDB() - DBImpl* db_impl = new DBImpl(db_options_, dbname_); - status = db_impl->NewDB(); - delete db_impl; + db_impl = new DBImpl(db_options_, dbname_); + status = db_impl->NewDB(/*new_filenames=*/nullptr); } + delete db_impl; if (status.ok()) { // Recover using the fresh manifest created by NewDB() @@ -229,17 +242,18 @@ }; std::string const dbname_; + std::string db_session_id_; Env* const env_; - const 
EnvOptions env_options_; + const FileOptions file_options_; const DBOptions db_options_; const ImmutableDBOptions immutable_db_options_; const InternalKeyComparator icmp_; const ColumnFamilyOptions default_cf_opts_; - const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference + const ImmutableOptions default_iopts_; // table_cache_ holds reference const ColumnFamilyOptions unknown_cf_opts_; const bool create_unknown_cfs_; std::shared_ptr raw_table_cache_; - TableCache* table_cache_; + std::unique_ptr table_cache_; WriteBufferManager wb_; WriteController wc_; VersionSet vset_; @@ -254,6 +268,7 @@ // Lock over the persistent DB state. Non-nullptr iff successfully // acquired. FileLock* db_lock_; + bool closed_; Status FindFiles() { std::vector filenames; @@ -265,21 +280,15 @@ } // search wal_dir if user uses a customize wal_dir - bool same = false; - Status status = env_->AreFilesSame(db_options_.wal_dir, dbname_, &same); - if (status.IsNotSupported()) { - same = db_options_.wal_dir == dbname_; - status = Status::OK(); - } else if (!status.ok()) { - return status; - } - + bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_); if (!same) { - to_search_paths.push_back(db_options_.wal_dir); + to_search_paths.push_back(immutable_db_options_.wal_dir); } for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) { - status = env_->GetChildren(to_search_paths[path_id], &filenames); + ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n", + to_search_paths[path_id].c_str()); + Status status = env_->GetChildren(to_search_paths[path_id], &filenames); if (!status.ok()) { return status; } @@ -297,7 +306,7 @@ if (number + 1 > next_file_number_) { next_file_number_ = number + 1; } - if (type == kLogFile) { + if (type == kWalFile) { logs_.push_back(number); } else if (type == kTableFile) { table_fds_.emplace_back(number, static_cast(path_id), @@ -316,10 +325,11 @@ } void ConvertLogFilesToTables() { + const auto& wal_dir = 
immutable_db_options_.GetWalDir(); for (size_t i = 0; i < logs_.size(); i++) { // we should use LogFileName(wal_dir, logs_[i]) here. user might uses wal_dir option. - std::string logname = LogFileName(db_options_.wal_dir, logs_[i]); - Status status = ConvertLogToTable(logs_[i]); + std::string logname = LogFileName(wal_dir, logs_[i]); + Status status = ConvertLogToTable(wal_dir, logs_[i]); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring conversion error: %s", @@ -329,7 +339,7 @@ } } - Status ConvertLogToTable(uint64_t log) { + Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) { struct LogReporter : public log::Reader::Reporter { Env* env; std::shared_ptr info_log; @@ -342,15 +352,15 @@ }; // Open the log file - std::string logname = LogFileName(db_options_.wal_dir, log); - std::unique_ptr lfile; - Status status = env_->NewSequentialFile( - logname, &lfile, env_->OptimizeForLogRead(env_options_)); + std::string logname = LogFileName(wal_dir, log); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr lfile_reader; + Status status = SequentialFileReader::Create( + fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader, + nullptr); if (!status.ok()) { return status; } - std::unique_ptr lfile_reader(new SequentialFileReader( - NewLegacySequentialFileWrapper(lfile), logname)); // Create the log reader. 
LogReporter reporter; @@ -382,15 +392,16 @@ record.size(), Status::Corruption("log record too small")); continue; } - WriteBatchInternal::SetContents(&batch, record); - status = - WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); - if (status.ok()) { + Status record_status = WriteBatchInternal::SetContents(&batch, record); + if (record_status.ok()) { + record_status = + WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); + } + if (record_status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s", - log, status.ToString().c_str()); - status = Status::OK(); // Keep going with rest of file + log, record_status.ToString().c_str()); } } @@ -410,7 +421,8 @@ Arena arena; ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); int64_t _current_time = 0; - status = env_->GetCurrentTime(&_current_time); // ignore error + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); @@ -423,18 +435,26 @@ range_del_iters.emplace_back(range_del_iter); } - LegacyFileSystemWrapper fs(env_); - status = BuildTable( - dbname_, env_, &fs, *cfd->ioptions(), - *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_, - iter.get(), std::move(range_del_iters), &meta, + IOStatus io_s; + CompressionOptions default_compression; + TableBuilderOptions tboptions( + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber, - snapshot_checker, kNoCompression, 0 /* sample_for_compression */, - CompressionOptions(), false, nullptr /* internal_stats */, - TableFileCreationReason::kRecovery, nullptr /* event_logger */, - 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level 
*/, current_time, write_hint); + kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), + -1 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "DB Repairer" /* db_id */, db_session_id_, 0 /*target_file_size*/, + meta.fd.GetNumber()); + status = BuildTable( + dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, + file_options_, table_cache_.get(), iter.get(), + std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, + {}, kMaxSequenceNumber, snapshot_checker, + false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, + nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery, + nullptr /* event_logger */, 0 /* job_id */, Env::IO_HIGH, + nullptr /* table_properties */, write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), @@ -481,8 +501,8 @@ file_size); std::shared_ptr props; if (status.ok()) { - status = table_cache_->GetTableProperties(env_options_, icmp_, t->meta.fd, - &props); + status = table_cache_->GetTableProperties(file_options_, icmp_, + t->meta.fd, &props); } if (status.ok()) { t->column_family_id = static_cast(props->column_family_id); @@ -522,20 +542,24 @@ ReadOptions ropts; ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( - ropts, env_options_, cfd->internal_comparator(), t->meta, + ropts, file_options_, cfd->internal_comparator(), t->meta, nullptr /* range_del_agg */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), + cfd->GetLatestMutableCFOptions()->prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, - /*level=*/-1, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, + 
/*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); - if (!ParseInternalKey(key, &parsed)) { + Status pik_status = + ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, - "Table #%" PRIu64 ": unparsable key %s", - t->meta.fd.GetNumber(), EscapeString(key).c_str()); + "Table #%" PRIu64 ": unparsable key - %s", + t->meta.fd.GetNumber(), pik_status.getState()); continue; } @@ -553,6 +577,30 @@ t->meta.fd.GetNumber(), counter, status.ToString().c_str()); } + if (status.ok()) { + // XXX/FIXME: This is just basic, naive handling of range tombstones, + // like call to UpdateBoundariesForRange in builder.cc where we assume + // an SST file is a full sorted run. This probably needs the extra logic + // from compaction_job.cc around call to UpdateBoundariesForRange (to + // handle range tombstones extendingg beyond range of other entries). 
+ ReadOptions ropts; + std::unique_ptr r_iter; + status = table_cache_->GetRangeTombstoneIterator( + ropts, cfd->internal_comparator(), t->meta, &r_iter); + + if (r_iter) { + r_iter->SeekToFirst(); + + while (r_iter->Valid()) { + auto tombstone = r_iter->Tombstone(); + auto kv = tombstone.Serialize(); + t->meta.UpdateBoundariesForRange( + kv.first, tombstone.SerializeEndKey(), tombstone.seq_, + cfd->internal_comparator()); + r_iter->Next(); + } + } + } return status; } @@ -585,9 +633,10 @@ table->meta.fd.GetFileSize(), table->meta.smallest, table->meta.largest, table->meta.fd.smallest_seqno, table->meta.fd.largest_seqno, table->meta.marked_for_compaction, - table->meta.oldest_blob_file_number, + table->meta.temperature, table->meta.oldest_blob_file_number, table->meta.oldest_ancester_time, table->meta.file_creation_time, - table->meta.file_checksum, table->meta.file_checksum_func_name); + table->meta.file_checksum, table->meta.file_checksum_func_name, + table->meta.min_timestamp, table->meta.max_timestamp); } assert(next_file_number_ > 0); vset_.MarkFileNumberUsed(next_file_number_ - 1); @@ -614,7 +663,7 @@ new_dir.assign(fname.data(), slash - fname.data()); } new_dir.append("/lost"); - env_->CreateDir(new_dir); // Ignore error + env_->CreateDir(new_dir).PermitUncheckedError(); // Ignore error std::string new_file = new_dir; new_file.append("/"); new_file.append((slash == nullptr) ? 
fname.c_str() : slash + 1); @@ -646,12 +695,16 @@ ) { ColumnFamilyOptions default_cf_opts; Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (!status.ok()) { + return status; + } + + Repairer repairer(dbname, db_options, column_families, default_cf_opts, + ColumnFamilyOptions() /* unknown_cf_opts */, + false /* create_unknown_cfs */); + status = repairer.Run(); if (status.ok()) { - Repairer repairer(dbname, db_options, column_families, - default_cf_opts, - ColumnFamilyOptions() /* unknown_cf_opts */, - false /* create_unknown_cfs */); - status = repairer.Run(); + status = repairer.Close(); } return status; } @@ -661,29 +714,33 @@ const ColumnFamilyOptions& unknown_cf_opts) { ColumnFamilyOptions default_cf_opts; Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (!status.ok()) { + return status; + } + + Repairer repairer(dbname, db_options, column_families, default_cf_opts, + unknown_cf_opts, true /* create_unknown_cfs */); + status = repairer.Run(); if (status.ok()) { - Repairer repairer(dbname, db_options, - column_families, default_cf_opts, - unknown_cf_opts, true /* create_unknown_cfs */); - status = repairer.Run(); + status = repairer.Close(); } return status; } Status RepairDB(const std::string& dbname, const Options& options) { Options opts(options); - if (opts.file_system == nullptr) { - opts.file_system.reset(new LegacyFileSystemWrapper(opts.env)); - ; - } - DBOptions db_options(opts); ColumnFamilyOptions cf_options(opts); + Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */, cf_options /* unknown_cf_opts */, true /* create_unknown_cfs */); - return repairer.Run(); + Status status = repairer.Run(); + if (status.ok()) { + status = repairer.Close(); + } + return status; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "rocksdb/options.h" #ifndef ROCKSDB_LITE #include @@ -22,30 +23,35 @@ #ifndef ROCKSDB_LITE class RepairTest : public DBTestBase { public: - RepairTest() : DBTestBase("/repair_test") {} + RepairTest() : DBTestBase("repair_test", /*env_do_fsync=*/true) {} - std::string GetFirstSstPath() { + Status GetFirstSstPath(std::string* first_sst_path) { + assert(first_sst_path != nullptr); + first_sst_path->clear(); uint64_t manifest_size; std::vector files; - db_->GetLiveFiles(files, &manifest_size); - auto sst_iter = - std::find_if(files.begin(), files.end(), [](const std::string& file) { - uint64_t number; - FileType type; - bool ok = ParseFileName(file, &number, &type); - return ok && type == kTableFile; - }); - return sst_iter == files.end() ? "" : dbname_ + *sst_iter; + Status s = db_->GetLiveFiles(files, &manifest_size); + if (s.ok()) { + auto sst_iter = + std::find_if(files.begin(), files.end(), [](const std::string& file) { + uint64_t number; + FileType type; + bool ok = ParseFileName(file, &number, &type); + return ok && type == kTableFile; + }); + *first_sst_path = sst_iter == files.end() ? "" : dbname_ + *sst_iter; + } + return s; } }; TEST_F(RepairTest, LostManifest) { // Add a couple SST files, delete the manifest, and verify RepairDB() saves // the day. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); // Need to get path before Close() deletes db_, but delete it after Close() to // ensure Close() didn't change the manifest. 
std::string manifest_path = @@ -61,12 +67,41 @@ ASSERT_EQ(Get("key2"), "val2"); } +TEST_F(RepairTest, LostManifestMoreDbFeatures) { + // Add a couple SST files, delete the manifest, and verify RepairDB() saves + // the day. + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Put("key3", "val3")); + ASSERT_OK(Put("key4", "val4")); + ASSERT_OK(Flush()); + // Test an SST file containing only a range tombstone + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key2", + "key3z")); + ASSERT_OK(Flush()); + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() didn't change the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("key2"), "NOT_FOUND"); + ASSERT_EQ(Get("key3"), "NOT_FOUND"); + ASSERT_EQ(Get("key4"), "val4"); +} + TEST_F(RepairTest, CorruptManifest) { // Manifest is in an invalid format. Expect a full recovery. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); // Need to get path before Close() deletes db_, but overwrite it after Close() // to ensure Close() didn't change the manifest. 
std::string manifest_path = @@ -75,8 +110,8 @@ Close(); ASSERT_OK(env_->FileExists(manifest_path)); - LegacyFileSystemWrapper fs(env_); - CreateFile(&fs, manifest_path, "blah", false /* use_fsync */); + ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah", + false /* use_fsync */)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); Reopen(CurrentOptions()); @@ -87,13 +122,13 @@ TEST_F(RepairTest, IncompleteManifest) { // In this case, the manifest is valid but does not reference all of the SST // files. Expect a full recovery. - Put("key", "val"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); std::string orig_manifest_path = DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); CopyFile(orig_manifest_path, orig_manifest_path + ".tmp"); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); // Need to get path before Close() deletes db_, but overwrite it after Close() // to ensure Close() didn't change the manifest. std::string new_manifest_path = @@ -113,10 +148,10 @@ TEST_F(RepairTest, PostRepairSstFileNumbering) { // Verify after a DB is repaired, new files will be assigned higher numbers // than old files. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); Close(); @@ -130,11 +165,12 @@ TEST_F(RepairTest, LostSst) { // Delete one of the SST files but preserve the manifest that refers to it, // then verify the DB is still usable for the intact SST. 
- Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); - auto sst_path = GetFirstSstPath(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + std::string sst_path; + ASSERT_OK(GetFirstSstPath(&sst_path)); ASSERT_FALSE(sst_path.empty()); ASSERT_OK(env_->DeleteFile(sst_path)); @@ -149,15 +185,16 @@ TEST_F(RepairTest, CorruptSst) { // Corrupt one of the SST files but preserve the manifest that refers to it, // then verify the DB is still usable for the intact SST. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); - auto sst_path = GetFirstSstPath(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + std::string sst_path; + ASSERT_OK(GetFirstSstPath(&sst_path)); ASSERT_FALSE(sst_path.empty()); - LegacyFileSystemWrapper fs(env_); - CreateFile(&fs, sst_path, "blah", false /* use_fsync */); + ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah", + false /* use_fsync */)); Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); @@ -170,13 +207,16 @@ TEST_F(RepairTest, UnflushedSst) { // This test case invokes repair while some data is unflushed, then verifies // that data is in the db. - Put("key", "val"); + ASSERT_OK(Put("key", "val")); VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 1); - uint64_t total_ssts_size; - GetAllSSTFiles(&total_ssts_size); - ASSERT_EQ(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_EQ(total_ssts_size, 0); + } // Need to get path before Close() deletes db_, but delete it after Close() to // ensure Close() didn't change the manifest. 
std::string manifest_path = @@ -190,8 +230,12 @@ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 0); - GetAllSSTFiles(&total_ssts_size); - ASSERT_GT(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_GT(total_ssts_size, 0); + } ASSERT_EQ(Get("key"), "val"); } @@ -199,14 +243,17 @@ do { Options options = CurrentOptions(); DestroyAndReopen(options); - Put("key", "val"); - Put("foo", "bar"); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Put("foo", "bar")); VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 1); - uint64_t total_ssts_size; - GetAllSSTFiles(&total_ssts_size); - ASSERT_EQ(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_EQ(total_ssts_size, 0); + } std::string manifest_path = DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); @@ -221,8 +268,12 @@ Reopen(options); ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 0); - GetAllSSTFiles(&total_ssts_size); - ASSERT_GT(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_GT(total_ssts_size, 0); + } ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("foo"), "bar"); @@ -238,13 +289,13 @@ CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions()); for (int i = 0; i < kNumCfs; ++i) { for (int j = 0; j < kEntriesPerCf; ++j) { - Put(i, "key" + ToString(j), "val" + ToString(j)); + ASSERT_OK(Put(i, "key" + ToString(j), "val" + ToString(j))); if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) { // Leave one unflushed so we can verify WAL entries are properly // associated with column families. 
continue; } - Flush(i); + ASSERT_OK(Flush(i)); } } @@ -283,12 +334,12 @@ std::vector{opts, rev_opts}); for (int i = 0; i < kNumCfs; ++i) { for (int j = 0; j < kEntriesPerCf; ++j) { - Put(i, "key" + ToString(j), "val" + ToString(j)); + ASSERT_OK(Put(i, "key" + ToString(j), "val" + ToString(j))); if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) { // Leave one unflushed so we can verify RepairDB's flush logic continue; } - Flush(i); + ASSERT_OK(Flush(i)); } } Close(); @@ -308,7 +359,7 @@ // Examine table properties to verify RepairDB() used the right options when // converting WAL->SST TablePropertiesCollection fname_to_props; - db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props); + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props)); ASSERT_EQ(fname_to_props.size(), 2U); for (const auto& fname_and_props : fname_to_props) { std::string comparator_name ( @@ -342,8 +393,8 @@ } } - Put("key", "val"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); Close(); ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions())); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/snapshot_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/snapshot_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/snapshot_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/snapshot_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #pragma once #include +#include "db/dbformat.h" #include "rocksdb/db.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +24,7 @@ SequenceNumber number_; // const after creation // It indicates the smallest uncommitted data at the time the snapshot was // taken. This is currently used by WritePrepared transactions to limit the - // scope of queries to IsInSnpashot. + // scope of queries to IsInSnapshot. 
SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; virtual SequenceNumber GetSequenceNumber() const override { return number_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,9 +13,11 @@ #include "db/range_tombstone_fragmenter.h" #include "db/snapshot_impl.h" #include "db/version_edit.h" +#include "file/file_util.h" #include "file/filename.h" #include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/advanced_options.h" #include "rocksdb/statistics.h" #include "table/block_based/block_based_table_reader.h" #include "table/get_context.h" @@ -62,14 +64,21 @@ } // namespace -TableCache::TableCache(const ImmutableCFOptions& ioptions, - const FileOptions& file_options, Cache* const cache, - BlockCacheTracer* const block_cache_tracer) +const int kLoadConcurency = 128; + +TableCache::TableCache(const ImmutableOptions& ioptions, + const FileOptions* file_options, Cache* const cache, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : ioptions_(ioptions), - file_options_(file_options), + file_options_(*file_options), cache_(cache), immortal_tables_(false), - block_cache_tracer_(block_cache_tracer) { + block_cache_tracer_(block_cache_tracer), + loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr), + io_tracer_(io_tracer), + db_session_id_(db_session_id) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. 
@@ -89,38 +98,54 @@ } Status TableCache::GetTableReader( - const FileOptions& file_options, + const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, bool skip_filters, int level, - bool prefetch_index_and_filter_in_cache) { + const std::shared_ptr& prefix_extractor, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; - Status s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, - nullptr); - RecordTick(ioptions_.statistics, NO_FILE_OPENS); - if (s.IsPathNotFound()) { + FileOptions fopts = file_options; + fopts.temperature = file_temperature; + Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + if (s.ok()) { + s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); + } + if (s.ok()) { + RecordTick(ioptions_.stats, NO_FILE_OPENS); + } else if (s.IsPathNotFound()) { fname = Rocks2LevelTableFileName(fname); - s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, nullptr); - RecordTick(ioptions_.statistics, NO_FILE_OPENS); + s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + if (s.ok()) { + s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, + nullptr); + } + if (s.ok()) { + RecordTick(ioptions_.stats, NO_FILE_OPENS); + } } if (s.ok()) { if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } - StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); std::unique_ptr file_reader( new RandomAccessFileReader( - 
std::move(file), fname, ioptions_.env, - record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); + std::move(file), fname, ioptions_.clock, io_tracer_, + record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS, + file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners, + file_temperature)); s = ioptions_.table_factory->NewTableReader( - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, skip_filters, immortal_tables_, - level, fd.largest_seqno, block_cache_tracer_), + ro, + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + skip_filters, immortal_tables_, false /* force_direct_prefetch */, + level, fd.largest_seqno, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, fd.GetNumber()), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -135,16 +160,15 @@ cache_->Erase(key); } -Status TableCache::FindTable(const FileOptions& file_options, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, Cache::Handle** handle, - const SliceTransform* prefix_extractor, - const bool no_io, bool record_read_stats, - HistogramImpl* file_read_hist, bool skip_filters, - int level, - bool prefetch_index_and_filter_in_cache) { - PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env); - Status s; +Status TableCache::FindTable( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, + Cache::Handle** handle, + const std::shared_ptr& prefix_extractor, + const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { + 
PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); *handle = cache_->Lookup(key); @@ -152,17 +176,25 @@ const_cast(&no_io)); if (*handle == nullptr) { - if (no_io) { // Don't do IO and return a not-found status + if (no_io) { return Status::Incomplete("Table not found in table_cache, no_io is set"); } + MutexLock load_lock(loader_mutex_.get(key)); + // We check the cache again under loading mutex + *handle = cache_->Lookup(key); + if (*handle != nullptr) { + return Status::OK(); + } + std::unique_ptr table_reader; - s = GetTableReader(file_options, internal_comparator, fd, - false /* sequential mode */, record_read_stats, - file_read_hist, &table_reader, prefix_extractor, - skip_filters, level, prefetch_index_and_filter_in_cache); + Status s = GetTableReader( + ro, file_options, internal_comparator, fd, false /* sequential mode */, + record_read_stats, file_read_hist, &table_reader, prefix_extractor, + skip_filters, level, prefetch_index_and_filter_in_cache, + max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(ioptions_.statistics, NO_FILE_ERRORS); + RecordTick(ioptions_.stats, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. 
} else { @@ -173,18 +205,21 @@ table_reader.release(); } } + return s; } - return s; + return Status::OK(); } InternalIterator* TableCache::NewIterator( const ReadOptions& options, const FileOptions& file_options, const InternalKeyComparator& icomparator, const FileMetaData& file_meta, - RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, + RangeDelAggregator* range_del_agg, + const std::shared_ptr& prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, - const InternalKey* largest_compaction_key) { + const InternalKey* largest_compaction_key, bool allow_unprepared_value) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; @@ -197,10 +232,12 @@ auto& fd = file_meta.fd; table_reader = fd.table_reader; if (table_reader == nullptr) { - s = FindTable(file_options, icomparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record_read_stats */, file_read_hist, - skip_filters, level); + s = FindTable( + options, file_options, icomparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { table_reader = GetTableReaderFromHandle(handle); } @@ -211,9 +248,9 @@ !options.table_filter(*table_reader->GetTableProperties())) { result = NewEmptyInternalIterator(arena); } else { - result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, caller, - file_options.compaction_readahead_size); + result = table_reader->NewIterator( + options, prefix_extractor.get(), arena, skip_filters, caller, + file_options.compaction_readahead_size, allow_unprepared_value); } if (handle 
!= nullptr) { result->RegisterCleanup(&UnrefEntry, cache_, handle); @@ -265,19 +302,27 @@ const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::unique_ptr* out_iter) { + assert(out_iter); const FileDescriptor& fd = file_meta.fd; Status s; TableReader* t = fd.table_reader; Cache::Handle* handle = nullptr; if (t == nullptr) { - s = FindTable(file_options_, internal_comparator, fd, &handle); + s = FindTable(options, file_options_, internal_comparator, fd, &handle); if (s.ok()) { t = GetTableReaderFromHandle(handle); } } if (s.ok()) { + // Note: NewRangeTombstoneIterator could return nullptr out_iter->reset(t->NewRangeTombstoneIterator(options)); - assert(out_iter); + } + if (handle) { + if (*out_iter) { + (*out_iter)->RegisterCleanup(&UnrefEntry, cache_, handle); + } else { + ReleaseHandle(handle); + } } return s; } @@ -303,8 +348,7 @@ // Maybe we can include the whole file ifsnapshot == fd.largest_seqno. if (options.snapshot != nullptr && (get_context->has_callback() || - static_cast_with_check( - options.snapshot) + static_cast_with_check(options.snapshot) ->GetSequenceNumber() <= fd.largest_seqno)) { // We should consider to use options.snapshot->GetSequenceNumber() // instead of GetInternalKeySeqno(k), which will make the code @@ -346,22 +390,22 @@ ioptions_.row_cache.get(), row_handle); replayGetContextLog(*found_row_cache_entry, user_key, get_context, &value_pinner); - RecordTick(ioptions_.statistics, ROW_CACHE_HIT); + RecordTick(ioptions_.stats, ROW_CACHE_HIT); found = true; } else { - RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + RecordTick(ioptions_.stats, ROW_CACHE_MISS); } return found; } #endif // ROCKSDB_LITE -Status TableCache::Get(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, - GetContext* get_context, - const SliceTransform* prefix_extractor, - HistogramImpl* file_read_hist, bool skip_filters, - int level) { +Status TableCache::Get( 
+ const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const std::shared_ptr& prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin) { auto& fd = file_meta.fd; std::string* row_cache_entry = nullptr; bool done = false; @@ -384,12 +428,15 @@ Status s; TableReader* t = fd.table_reader; Cache::Handle* handle = nullptr; - if (!done && s.ok()) { + if (!done) { + assert(s.ok()); if (t == nullptr) { - s = FindTable( - file_options_, internal_comparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, level); + s = FindTable(options, file_options_, internal_comparator, fd, &handle, + prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { t = GetTableReaderFromHandle(handle); } @@ -408,7 +455,7 @@ } if (s.ok()) { get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. - s = t->Get(options, k, get_context, prefix_extractor, skip_filters); + s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set @@ -424,8 +471,11 @@ size_t charge = row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string); void* row_ptr = new std::string(std::move(*row_cache_entry)); - ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge, - &DeleteEntry); + // If row cache is full, it's OK to continue. 
+ ioptions_.row_cache + ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -436,13 +486,12 @@ } // Batched version of TableCache::MultiGet. -Status TableCache::MultiGet(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - const MultiGetContext::Range* mget_range, - const SliceTransform* prefix_extractor, - HistogramImpl* file_read_hist, bool skip_filters, - int level) { +Status TableCache::MultiGet( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + const std::shared_ptr& prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, int level) { auto& fd = file_meta.fd; Status s; TableReader* t = fd.table_reader; @@ -467,8 +516,8 @@ for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { - const Slice& user_key = miter->ukey; - ; + const Slice& user_key = miter->ukey_with_ts; + GetContext* get_context = miter->get_context; if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, @@ -486,10 +535,12 @@ // found in the row cache and thus the range may now be empty if (s.ok() && !table_range.empty()) { if (t == nullptr) { - s = FindTable( - file_options_, internal_comparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, level); + s = FindTable(options, file_options_, internal_comparator, fd, &handle, + prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature); TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); if (s.ok()) { t = GetTableReaderFromHandle(handle); @@ -504,14 +555,14 @@ 
++iter) { SequenceNumber* max_covering_tombstone_seq = iter->get_context->max_covering_tombstone_seq(); - *max_covering_tombstone_seq = - std::max(*max_covering_tombstone_seq, - range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey)); + *max_covering_tombstone_seq = std::max( + *max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts)); } } } if (s.ok()) { - t->MultiGet(options, &table_range, prefix_extractor, skip_filters); + t->MultiGet(options, &table_range, prefix_extractor.get(), skip_filters); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { Status* status = iter->s; @@ -531,7 +582,7 @@ for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { std::string& row_cache_entry = row_cache_entries[row_idx++]; - const Slice& user_key = miter->ukey; + const Slice& user_key = miter->ukey_with_ts; ; GetContext* get_context = miter->get_context; @@ -544,8 +595,11 @@ size_t charge = row_cache_key.Size() + row_cache_entry.size() + sizeof(std::string); void* row_ptr = new std::string(std::move(row_cache_entry)); - ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge, - &DeleteEntry); + // If row cache is full, it's OK. + ioptions_.row_cache + ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry) + .PermitUncheckedError(); } } } @@ -561,19 +615,18 @@ const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, - const SliceTransform* prefix_extractor, bool no_io) { - Status s; + const std::shared_ptr& prefix_extractor, bool no_io) { auto table_reader = fd.table_reader; // table already been pre-loaded? 
if (table_reader) { *properties = table_reader->GetTableProperties(); - return s; + return Status::OK(); } Cache::Handle* table_handle = nullptr; - s = FindTable(file_options, internal_comparator, fd, &table_handle, - prefix_extractor, no_io); + Status s = FindTable(ReadOptions(), file_options, internal_comparator, fd, + &table_handle, prefix_extractor, no_io); if (!s.ok()) { return s; } @@ -587,8 +640,7 @@ size_t TableCache::GetMemoryUsageByTableReader( const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - const SliceTransform* prefix_extractor) { - Status s; + const std::shared_ptr& prefix_extractor) { auto table_reader = fd.table_reader; // table already been pre-loaded? if (table_reader) { @@ -596,8 +648,8 @@ } Cache::Handle* table_handle = nullptr; - s = FindTable(file_options, internal_comparator, fd, &table_handle, - prefix_extractor, true); + Status s = FindTable(ReadOptions(), file_options, internal_comparator, fd, + &table_handle, prefix_extractor, true); if (!s.ok()) { return 0; } @@ -608,6 +660,16 @@ return ret; } +bool TableCache::HasEntry(Cache* cache, uint64_t file_number) { + Cache::Handle* handle = cache->Lookup(GetSliceForFileNumber(&file_number)); + if (handle) { + cache->Release(handle); + return true; + } else { + return false; + } +} + void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } @@ -615,14 +677,14 @@ uint64_t TableCache::ApproximateOffsetOf( const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor) { + const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = fd.table_reader; Cache::Handle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = FindTable(file_options_, internal_comparator, fd, 
&table_handle, - prefix_extractor, false /* no_io */, + Status s = FindTable(ReadOptions(), file_options_, internal_comparator, fd, + &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = GetTableReaderFromHandle(table_handle); @@ -642,14 +704,14 @@ uint64_t TableCache::ApproximateSize( const Slice& start, const Slice& end, const FileDescriptor& fd, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor) { + const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = fd.table_reader; Cache::Handle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = FindTable(file_options_, internal_comparator, fd, &table_handle, - prefix_extractor, false /* no_io */, + Status s = FindTable(ReadOptions(), file_options_, internal_comparator, fd, + &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = GetTableReaderFromHandle(table_handle); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,9 @@ // Thread-safe (provides internal synchronization) #pragma once +#include #include #include -#include #include "db/dbformat.h" #include "db/range_del_aggregator.h" @@ -48,9 +48,11 @@ // ioptions.row_cache class TableCache { public: - TableCache(const ImmutableCFOptions& ioptions, - const FileOptions& storage_options, Cache* cache, - BlockCacheTracer* const block_cache_tracer); + TableCache(const ImmutableOptions& ioptions, + const FileOptions* storage_options, Cache* cache, + BlockCacheTracer* const 
block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -60,6 +62,7 @@ // the returned iterator. The returned "*table_reader_ptr" object is owned // by the cache and should not be deleted, and is valid for as long as the // returned iterator is live. + // @param options Must outlive the returned iterator. // @param range_del_agg If non-nullptr, adds range deletions to the // aggregator. If an error occurs, returns it in a NewErrorInternalIterator // @param for_compaction If true, a new TableReader may be allocated (but @@ -70,10 +73,12 @@ const ReadOptions& options, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, - HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, - bool skip_filters, int level, const InternalKey* smallest_compaction_key, - const InternalKey* largest_compaction_key); + const std::shared_ptr& prefix_extractor, + TableReader** table_reader_ptr, HistogramImpl* file_read_hist, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin, + const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key, bool allow_unprepared_value); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -85,13 +90,13 @@ // recorded // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" - Status Get(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, - GetContext* get_context, - const SliceTransform* prefix_extractor = nullptr, - HistogramImpl* file_read_hist = 
nullptr, bool skip_filters = false, - int level = -1); + Status Get( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const std::shared_ptr& prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, size_t max_file_size_for_l0_meta_pin = 0); // Return the range delete tombstone iterator of the file specified by // `file_meta`. @@ -110,17 +115,20 @@ // in the embedded GetContext // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" - Status MultiGet(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - const MultiGetContext::Range* mget_range, - const SliceTransform* prefix_extractor = nullptr, - HistogramImpl* file_read_hist = nullptr, - bool skip_filters = false, int level = -1); + Status MultiGet( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + const std::shared_ptr& prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); + // Query whether specified file number is currently in cache + static bool HasEntry(Cache* cache, uint64_t file_number); + // Clean table handle and erase it from the table cache // Used in DB close, or the file is not live anymore. 
void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle); @@ -128,14 +136,16 @@ // Find table reader // @param skip_filters Disables loading/accessing the filter block // @param level == -1 means not specified - Status FindTable(const FileOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, Cache::Handle**, - const SliceTransform* prefix_extractor = nullptr, - const bool no_io = false, bool record_read_stats = true, - HistogramImpl* file_read_hist = nullptr, - bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true); + Status FindTable( + const ReadOptions& ro, const FileOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileDescriptor& file_fd, Cache::Handle**, + const std::shared_ptr& prefix_extractor = nullptr, + const bool no_io = false, bool record_read_stats = true, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Get TableReader from a cache handle. TableReader* GetTableReaderFromHandle(Cache::Handle* handle); @@ -146,12 +156,13 @@ // @returns: `properties` will be reset on success. Please note that we will // return Status::Incomplete() if table is not present in cache and // we set `no_io` to be true. 
- Status GetTableProperties(const FileOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_meta, - std::shared_ptr* properties, - const SliceTransform* prefix_extractor = nullptr, - bool no_io = false); + Status GetTableProperties( + const FileOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileDescriptor& file_meta, + std::shared_ptr* properties, + const std::shared_ptr& prefix_extractor = nullptr, + bool no_io = false); // Return total memory usage of the table reader of the file. // 0 if table reader of the file is not loaded. @@ -159,27 +170,28 @@ const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - const SliceTransform* prefix_extractor = nullptr); + const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated offset of a key in a file represented by fd. uint64_t ApproximateOffsetOf( const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor = nullptr); + const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated data size between start and end keys in a file // represented by fd (the start key must not be greater than the end key). - uint64_t ApproximateSize(const Slice& start, const Slice& end, - const FileDescriptor& fd, TableReaderCaller caller, - const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor = nullptr); + uint64_t ApproximateSize( + const Slice& start, const Slice& end, const FileDescriptor& fd, + TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const std::shared_ptr& prefix_extractor = nullptr); // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); Cache* get_cache() const { return cache_; } - // Capacity of the backing Cache that indicates inifinite TableCache capacity. 
+ // Capacity of the backing Cache that indicates infinite TableCache capacity. // For example when max_open_files is -1 we set the backing Cache to this. static const int kInfiniteCapacity = 0x400000; @@ -193,14 +205,16 @@ private: // Build a table reader - Status GetTableReader(const FileOptions& file_options, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, bool sequential_mode, - bool record_read_stats, HistogramImpl* file_read_hist, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor = nullptr, - bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true); + Status GetTableReader( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, + const FileDescriptor& fd, bool sequential_mode, bool record_read_stats, + HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + const std::shared_ptr& prefix_extractor = nullptr, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Create a key prefix for looking up the row cache. The prefix is of the // format row_cache_id + fd_number + seq_no. 
Later, the user key can be @@ -215,12 +229,15 @@ bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, size_t prefix_size, GetContext* get_context); - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const FileOptions& file_options_; Cache* const cache_; std::string row_cache_id_; bool immortal_tables_; BlockCacheTracer* const block_cache_tracer_; + Striped loader_mutex_; + std::shared_ptr io_tracer_; + std::string db_session_id_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,8 +33,9 @@ const Slice& value, uint64_t file_size) { ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - return Status::InvalidArgument("Invalid internal key"); + Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!s.ok()) { + return s; } return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type), @@ -42,10 +43,10 @@ } void UserKeyTablePropertiesCollector::BlockAdd( - uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) { - return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) { + return collector_->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } Status UserKeyTablePropertiesCollector::Finish( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,12 +6,14 @@ // This file defines a collection of statistics collectors. #pragma once -#include "rocksdb/table_properties.h" - #include #include #include +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table_properties.h" + namespace ROCKSDB_NAMESPACE { // Base class for internal table properties collector. @@ -27,9 +29,9 @@ virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) = 0; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) = 0; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) = 0; virtual UserCollectedProperties GetReadableProperties() const = 0; @@ -42,12 +44,15 @@ virtual ~IntTblPropCollectorFactory() {} // has to be thread-safe virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) = 0; + uint32_t column_family_id, int level_at_creation) = 0; // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; }; +using IntTblPropCollectorFactories = + std::vector>; + // When rocksdb creates a new table, it will encode all "user keys" into // "internal keys", which contains meta information of a given entry. 
// @@ -64,9 +69,9 @@ virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) override; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) override; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override; virtual Status Finish(UserCollectedProperties* properties) override; @@ -89,9 +94,10 @@ std::shared_ptr user_collector_factory) : user_collector_factory_(user_collector_factory) {} virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { + uint32_t column_family_id, int level_at_creation) override { TablePropertiesCollectorFactory::Context context; context.column_family_id = column_family_id; + context.level_at_creation = level_at_creation; return new UserKeyTablePropertiesCollector( user_collector_factory_->CreateTablePropertiesCollector(context)); } @@ -104,4 +110,66 @@ std::shared_ptr user_collector_factory_; }; +// When rocksdb creates a newtable, it will encode all "user keys" into +// "internal keys". This class collects min/max timestamp from the encoded +// internal key when Add() is invoked. +// +// @param cmp the user comparator to compare the timestamps in internal key. 
+class TimestampTablePropertiesCollector : public IntTblPropCollector { + public: + explicit TimestampTablePropertiesCollector(const Comparator* cmp) + : cmp_(cmp), + timestamp_min_(kDisableUserTimestamp), + timestamp_max_(kDisableUserTimestamp) {} + + Status InternalAdd(const Slice& key, const Slice& /* value */, + uint64_t /* file_size */) override { + auto user_key = ExtractUserKey(key); + assert(cmp_ && cmp_->timestamp_size() > 0); + if (user_key.size() < cmp_->timestamp_size()) { + return Status::Corruption( + "User key size mismatch when comparing to timestamp size."); + } + auto timestamp_in_key = + ExtractTimestampFromUserKey(user_key, cmp_->timestamp_size()); + if (timestamp_max_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_in_key, timestamp_max_) > 0) { + timestamp_max_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + if (timestamp_min_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_min_, timestamp_in_key) > 0) { + timestamp_min_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + return; + } + + Status Finish(UserCollectedProperties* properties) override { + assert(timestamp_min_.size() == timestamp_max_.size() && + timestamp_max_.size() == cmp_->timestamp_size()); + properties->insert({"rocksdb.timestamp_min", timestamp_min_}); + properties->insert({"rocksdb.timestamp_max", timestamp_max_}); + return Status::OK(); + } + + const char* Name() const override { + return "TimestampTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + return {{"rocksdb.timestamp_min", Slice(timestamp_min_).ToString(true)}, + {"rocksdb.timestamp_max", Slice(timestamp_max_).ToString(true)}}; + } + + protected: + const Comparator* const cmp_; + std::string timestamp_min_; + std::string 
timestamp_max_; +}; + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "db/table_properties_collector.h" + #include #include #include @@ -11,11 +13,10 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "db/table_properties_collector.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" +#include "rocksdb/flush_block_policy.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" #include "table/meta_blocks.h" @@ -39,24 +40,23 @@ namespace { static const uint32_t kTestColumnFamilyId = 66; static const std::string kTestColumnFamilyName = "test_column_fam"; +static const int kTestLevel = 1; -void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, - const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - std::unique_ptr* writable, - std::unique_ptr* builder) { - std::unique_ptr wf(new test::StringSink); +void MakeBuilder( + const Options& options, const ImmutableOptions& ioptions, + const MutableCFOptions& moptions, + const InternalKeyComparator& internal_comparator, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + std::unique_ptr* writable, + std::unique_ptr* builder) { + std::unique_ptr wf(new test::StringSink); writable->reset( - new 
WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)), - "" /* don't care */, EnvOptions())); - int unknown_level = -1; - builder->reset(NewTableBuilder( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); + TableBuilderOptions tboptions( ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, - kTestColumnFamilyId, kTestColumnFamilyName, writable->get(), - options.compression, options.sample_for_compression, - options.compression_opts, unknown_level)); + options.compression, options.compression_opts, kTestColumnFamilyId, + kTestColumnFamilyName, kTestLevel); + builder->reset(NewTableBuilder(tboptions, writable->get())); } } // namespace @@ -176,9 +176,9 @@ return Status::OK(); } - void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Nothing to do. 
return; } @@ -199,6 +199,7 @@ TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) override { EXPECT_EQ(kTestColumnFamilyId, context.column_family_id); + EXPECT_EQ(kTestLevel, context.level_at_creation); if (!backward_mode_) { return new RegularKeysStartWithA(); } else { @@ -206,7 +207,7 @@ } } IntTblPropCollector* CreateIntTblPropCollector( - uint32_t /*column_family_id*/) override { + uint32_t /*column_family_id*/, int /* level_at_creation */) override { return new RegularKeysStartWithAInternal(); } const char* Name() const override { return "RegularKeysStartWithA"; } @@ -262,10 +263,9 @@ // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writer; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (test_int_tbl_prop_collector) { int_tbl_prop_collector_factories.emplace_back( new RegularKeysStartWithAFactory(backward_mode)); @@ -284,17 +284,16 @@ writer->Flush(); // -- Step 2: Read properties - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writer->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr fake_file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); - TableProperties* props; + new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr props; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), - magic_number, ioptions, &props, - true /* compression_type_missing */); - std::unique_ptr props_guard(props); + magic_number, ioptions, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; @@ -394,8 +393,7 @@ Options 
options; test::PlainInternalKeyComparator pikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; options.table_factory = table_factory; if (sanitized) { options.table_properties_collector_factories.emplace_back( @@ -408,11 +406,11 @@ options.info_log = std::make_shared(); options = SanitizeOptions("db", // just a place holder options); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); options.comparator = comparator; } - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); MutableCFOptions moptions(options); for (int iter = 0; iter < 2; ++iter) { @@ -425,19 +423,18 @@ ASSERT_OK(builder->Finish()); writable->Flush(); - LegacyWritableFileWrapper* file = - static_cast(writable->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writable->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); - TableProperties* props; - Status s = - ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, - ioptions, &props, true /* compression_type_missing */); + new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr props; + Status s = ReadTableProperties(reader.get(), fwf->contents().size(), + magic_number, ioptions, &props); ASSERT_OK(s); - std::unique_ptr props_guard(props); auto user_collected = props->user_collected_properties; uint64_t deleted = GetDeletedKeys(user_collected); ASSERT_EQ(5u, deleted); // deletes + single-deletes diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seq, std::unique_ptr files, VersionSet const* const versions, - const bool seq_per_batch) + const bool seq_per_batch, const std::shared_ptr& io_tracer) : dir_(dir), options_(options), read_options_(read_options), @@ -30,10 +30,11 @@ current_batch_seq_(0), current_last_seq_(0), versions_(versions), - seq_per_batch_(seq_per_batch) { + seq_per_batch_(seq_per_batch), + io_tracer_(io_tracer) { assert(files_ != nullptr); assert(versions_ != nullptr); - + current_status_.PermitUncheckedError(); // Clear on start reporter_.env = options_->env; reporter_.info_log = options_->info_log.get(); SeekToStartSequence(); // Seek till starting sequence @@ -42,7 +43,7 @@ Status TransactionLogIteratorImpl::OpenLogFile( const LogFile* log_file, std::unique_ptr* file_reader) { - FileSystem* fs = options_->fs.get(); + FileSystemPtr fs(options_->fs, io_tracer_); std::unique_ptr file; std::string fname; Status s; @@ -62,7 +63,8 @@ } } if (s.ok()) { - file_reader->reset(new SequentialFileReader(std::move(file), fname)); + file_reader->reset(new SequentialFileReader( + std::move(file), fname, io_tracer_, options_->listeners)); } return s; } @@ -223,7 +225,8 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { std::unique_ptr batch(new WriteBatch()); - WriteBatchInternal::SetContents(batch.get(), record); + Status s = WriteBatchInternal::SetContents(batch.get(), record); + s.PermitUncheckedError(); // TODO: What should we do with this error? 
SequenceNumber expected_seq = current_last_seq_ + 1; // If the iterator has started, then confirm that we get continuous batches @@ -263,6 +266,10 @@ sequence_++; return Status::OK(); } + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + ++sequence_; + return Status::OK(); + } Status PutCF(uint32_t /*cf*/, const Slice& /*key*/, const Slice& /*val*/) override { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include "db/log_reader.h" #include "db/version_set.h" #include "file/filename.h" +#include "logging/logging.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" @@ -63,7 +64,7 @@ const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seqNum, std::unique_ptr files, VersionSet const* const versions, - const bool seq_per_batch); + const bool seq_per_batch, const std::shared_ptr& io_tracer); virtual bool Valid() override; @@ -122,6 +123,7 @@ // Update current batch if a continuous batch is found, else return false void UpdateCurrentWriteBatch(const Slice& record); Status OpenLogReader(const LogFile* file); + std::shared_ptr io_tracer_; }; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,13 +14,16 @@ #include #include #include +#include #include +#include #include #include #include #include 
#include +#include "db/blob/blob_file_meta.h" #include "db/dbformat.h" #include "db/internal_stats.h" #include "db/table_cache.h" @@ -31,90 +34,242 @@ namespace ROCKSDB_NAMESPACE { -bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { - if (a->fd.largest_seqno != b->fd.largest_seqno) { - return a->fd.largest_seqno > b->fd.largest_seqno; - } - if (a->fd.smallest_seqno != b->fd.smallest_seqno) { - return a->fd.smallest_seqno > b->fd.smallest_seqno; - } - // Break ties by file number - return a->fd.GetNumber() > b->fd.GetNumber(); -} +class VersionBuilder::Rep { + class NewestFirstBySeqNo { + public: + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); -namespace { -bool BySmallestKey(FileMetaData* a, FileMetaData* b, - const InternalKeyComparator* cmp) { - int r = cmp->Compare(a->smallest, b->smallest); - if (r != 0) { - return (r < 0); - } - // Break ties by file number - return (a->fd.GetNumber() < b->fd.GetNumber()); -} -} // namespace + if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { + return lhs->fd.largest_seqno > rhs->fd.largest_seqno; + } -class VersionBuilder::Rep { - private: - // Helper to sort files_ in v - // kLevel0 -- NewestFirstBySeqNo - // kLevelNon0 -- BySmallestKey - struct FileComparator { - enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method; - const InternalKeyComparator* internal_comparator; - - FileComparator() : internal_comparator(nullptr) {} - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - switch (sort_method) { - case kLevel0: - return NewestFirstBySeqNo(f1, f2); - case kLevelNon0: - return BySmallestKey(f1, f2, internal_comparator); + if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { + return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; } - assert(false); - return false; + + // Break ties by file number + return lhs->fd.GetNumber() > rhs->fd.GetNumber(); } }; + class BySmallestKey { + public: + explicit BySmallestKey(const 
InternalKeyComparator* cmp) : cmp_(cmp) {} + + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + assert(cmp_); + + const int r = cmp_->Compare(lhs->smallest, rhs->smallest); + if (r != 0) { + return (r < 0); + } + + // Break ties by file number + return (lhs->fd.GetNumber() < rhs->fd.GetNumber()); + } + + private: + const InternalKeyComparator* cmp_; + }; + struct LevelState { std::unordered_set deleted_files; // Map from file number to file meta data. std::unordered_map added_files; }; + // A class that represents the accumulated changes (like additional garbage or + // newly linked/unlinked SST files) for a given blob file after applying a + // series of VersionEdits. + class BlobFileMetaDataDelta { + public: + bool IsEmpty() const { + return !additional_garbage_count_ && !additional_garbage_bytes_ && + newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty(); + } + + uint64_t GetAdditionalGarbageCount() const { + return additional_garbage_count_; + } + + uint64_t GetAdditionalGarbageBytes() const { + return additional_garbage_bytes_; + } + + const std::unordered_set& GetNewlyLinkedSsts() const { + return newly_linked_ssts_; + } + + const std::unordered_set& GetNewlyUnlinkedSsts() const { + return newly_unlinked_ssts_; + } + + void AddGarbage(uint64_t count, uint64_t bytes) { + additional_garbage_count_ += count; + additional_garbage_bytes_ += bytes; + } + + void LinkSst(uint64_t sst_file_number) { + assert(newly_linked_ssts_.find(sst_file_number) == + newly_linked_ssts_.end()); + + // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) 
+ auto it = newly_unlinked_ssts_.find(sst_file_number); + + if (it != newly_unlinked_ssts_.end()) { + newly_unlinked_ssts_.erase(it); + } else { + newly_linked_ssts_.emplace(sst_file_number); + } + } + + void UnlinkSst(uint64_t sst_file_number) { + assert(newly_unlinked_ssts_.find(sst_file_number) == + newly_unlinked_ssts_.end()); + + // Reconcile with newly linked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) + auto it = newly_linked_ssts_.find(sst_file_number); + + if (it != newly_linked_ssts_.end()) { + newly_linked_ssts_.erase(it); + } else { + newly_unlinked_ssts_.emplace(sst_file_number); + } + } + + private: + uint64_t additional_garbage_count_ = 0; + uint64_t additional_garbage_bytes_ = 0; + std::unordered_set newly_linked_ssts_; + std::unordered_set newly_unlinked_ssts_; + }; + + // A class that represents the state of a blob file after applying a series of + // VersionEdits. In addition to the resulting state, it also contains the + // delta (see BlobFileMetaDataDelta above). The resulting state can be used to + // identify obsolete blob files, while the delta makes it possible to + // efficiently detect trivial moves. 
+ class MutableBlobFileMetaData { + public: + // To be used for brand new blob files + explicit MutableBlobFileMetaData( + std::shared_ptr&& shared_meta) + : shared_meta_(std::move(shared_meta)) {} + + // To be used for pre-existing blob files + explicit MutableBlobFileMetaData( + const std::shared_ptr& meta) + : shared_meta_(meta->GetSharedMeta()), + linked_ssts_(meta->GetLinkedSsts()), + garbage_blob_count_(meta->GetGarbageBlobCount()), + garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {} + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + + bool HasDelta() const { return !delta_.IsEmpty(); } + + const std::unordered_set& GetLinkedSsts() const { + return linked_ssts_; + } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + bool AddGarbage(uint64_t count, uint64_t bytes) { + assert(shared_meta_); + + if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() || + garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) { + return false; + } + + delta_.AddGarbage(count, bytes); + + garbage_blob_count_ += count; + garbage_blob_bytes_ += bytes; + + return true; + } + + void LinkSst(uint64_t sst_file_number) { + delta_.LinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end()); + linked_ssts_.emplace(sst_file_number); + } + + void UnlinkSst(uint64_t sst_file_number) { + delta_.UnlinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end()); + linked_ssts_.erase(sst_file_number); + } + + private: + std::shared_ptr shared_meta_; + // Accumulated changes + BlobFileMetaDataDelta delta_; + // Resulting state after applying the changes + BlobFileMetaData::LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; + 
}; + const FileOptions& file_options_; - Logger* info_log_; + const ImmutableCFOptions* const ioptions_; TableCache* table_cache_; VersionStorageInfo* base_vstorage_; + VersionSet* version_set_; int num_levels_; LevelState* levels_; - // Store states of levels larger than num_levels_. We do this instead of + // Store sizes of levels larger than num_levels_. We do this instead of // storing them in levels_ to avoid regression in case there are no files // on invalid levels. The version is not consistent if in the end the files // on invalid levels don't cancel out. - std::map> invalid_levels_; + std::unordered_map invalid_level_sizes_; // Whether there are invalid new files or invalid deletion on levels larger // than num_levels_. bool has_invalid_levels_; - FileComparator level_zero_cmp_; - FileComparator level_nonzero_cmp_; + // Current levels of table files affected by additions/deletions. + std::unordered_map table_file_levels_; + NewestFirstBySeqNo level_zero_cmp_; + BySmallestKey level_nonzero_cmp_; + + // Mutable metadata objects for all blob files affected by the series of + // version edits. 
+ std::map mutable_blob_file_metas_; public: - Rep(const FileOptions& file_options, Logger* info_log, - TableCache* table_cache, - VersionStorageInfo* base_vstorage) + Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, + TableCache* table_cache, VersionStorageInfo* base_vstorage, + VersionSet* version_set) : file_options_(file_options), - info_log_(info_log), + ioptions_(ioptions), table_cache_(table_cache), base_vstorage_(base_vstorage), + version_set_(version_set), num_levels_(base_vstorage->num_levels()), - has_invalid_levels_(false) { + has_invalid_levels_(false), + level_nonzero_cmp_(base_vstorage_->InternalComparator()) { + assert(ioptions_); + levels_ = new LevelState[num_levels_]; - level_zero_cmp_.sort_method = FileComparator::kLevel0; - level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; - level_nonzero_cmp_.internal_comparator = - base_vstorage_->InternalComparator(); } ~Rep() { @@ -140,204 +295,836 @@ } } - Status CheckConsistency(VersionStorageInfo* vstorage) { + // Mapping used for checking the consistency of links between SST files and + // blob files. It is built using the forward links (table file -> blob file), + // and is subsequently compared with the inverse mapping stored in the + // BlobFileMetaData objects. 
+ using ExpectedLinkedSsts = + std::unordered_map; + + static void UpdateExpectedLinkedSsts( + uint64_t table_file_number, uint64_t blob_file_number, + ExpectedLinkedSsts* expected_linked_ssts) { + assert(expected_linked_ssts); + + if (blob_file_number == kInvalidBlobFileNumber) { + return; + } + + (*expected_linked_ssts)[blob_file_number].emplace(table_file_number); + } + + template + Status CheckConsistencyDetailsForLevel( + const VersionStorageInfo* vstorage, int level, Checker checker, + const std::string& sync_point, + ExpectedLinkedSsts* expected_linked_ssts) const { #ifdef NDEBUG - if (!vstorage->force_consistency_checks()) { - // Dont run consistency checks in release mode except if - // explicitly asked to + (void)sync_point; +#endif + + assert(vstorage); + assert(level >= 0 && level < num_levels_); + assert(expected_linked_ssts); + + const auto& level_files = vstorage->LevelFiles(level); + + if (level_files.empty()) { return Status::OK(); } -#endif - // make sure the files are sorted correctly - for (int level = 0; level < num_levels_; level++) { - auto& level_files = vstorage->LevelFiles(level); - for (size_t i = 1; i < level_files.size(); i++) { - auto f1 = level_files[i - 1]; - auto f2 = level_files[i]; + + assert(level_files[0]); + UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(), + level_files[0]->oldest_blob_file_number, + expected_linked_ssts); + + for (size_t i = 1; i < level_files.size(); ++i) { + assert(level_files[i]); + UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(), + level_files[i]->oldest_blob_file_number, + expected_linked_ssts); + + auto lhs = level_files[i - 1]; + auto rhs = level_files[i]; + #ifndef NDEBUG - auto pair = std::make_pair(&f1, &f2); - TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency", &pair); + auto pair = std::make_pair(&lhs, &rhs); + TEST_SYNC_POINT_CALLBACK(sync_point, &pair); #endif - if (level == 0) { - if (!level_zero_cmp_(f1, f2)) { - fprintf(stderr, "L0 files are not sorted properly"); - 
return Status::Corruption("L0 files are not sorted properly"); + + const Status s = checker(lhs, rhs); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); + } + + // Make sure table files are sorted correctly and that the links between + // table files and blob files are consistent. + Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const { + assert(vstorage); + + ExpectedLinkedSsts expected_linked_ssts; + + if (num_levels_ > 0) { + // Check L0 + { + auto l0_checker = [this](const FileMetaData* lhs, + const FileMetaData* rhs) { + assert(lhs); + assert(rhs); + + if (!level_zero_cmp_(lhs, rhs)) { + std::ostringstream oss; + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); + + return Status::Corruption("VersionBuilder", oss.str()); } - if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { + if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) { // This is an external file that we ingested - SequenceNumber external_file_seqno = f2->fd.smallest_seqno; - if (!(external_file_seqno < f1->fd.largest_seqno || + const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno; + + if (!(external_file_seqno < lhs->fd.largest_seqno || external_file_seqno == 0)) { - fprintf(stderr, - "L0 file with seqno %" PRIu64 " %" PRIu64 - " vs. file with global_seqno %" PRIu64 "\n", - f1->fd.smallest_seqno, f1->fd.largest_seqno, - external_file_seqno); - return Status::Corruption( - "L0 file with seqno " + - NumberToString(f1->fd.smallest_seqno) + " " + - NumberToString(f1->fd.largest_seqno) + - " vs. file with global_seqno" + - NumberToString(external_file_seqno) + " with fileNumber " + - NumberToString(f1->fd.GetNumber())); + std::ostringstream oss; + oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " + << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno + << " vs. 
file #" << rhs->fd.GetNumber() + << " with global_seqno " << external_file_seqno; + + return Status::Corruption("VersionBuilder", oss.str()); } - } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { - fprintf(stderr, - "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64 - " %" PRIu64 "\n", - f1->fd.smallest_seqno, f1->fd.largest_seqno, - f2->fd.smallest_seqno, f2->fd.largest_seqno); - return Status::Corruption( - "L0 files seqno " + NumberToString(f1->fd.smallest_seqno) + - " " + NumberToString(f1->fd.largest_seqno) + " " + - NumberToString(f1->fd.GetNumber()) + " vs. " + - NumberToString(f2->fd.smallest_seqno) + " " + - NumberToString(f2->fd.largest_seqno) + " " + - NumberToString(f2->fd.GetNumber())); + } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) { + std::ostringstream oss; + oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " + << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno + << " vs. file #" << rhs->fd.GetNumber() << " with seqno " + << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno; + + return Status::Corruption("VersionBuilder", oss.str()); } - } else { - if (!level_nonzero_cmp_(f1, f2)) { - fprintf(stderr, "L%d files are not sorted properly", level); - return Status::Corruption("L" + NumberToString(level) + - " files are not sorted properly"); + + return Status::OK(); + }; + + const Status s = CheckConsistencyDetailsForLevel( + vstorage, /* level */ 0, l0_checker, + "VersionBuilder::CheckConsistency0", &expected_linked_ssts); + if (!s.ok()) { + return s; + } + } + + // Check L1 and up + const InternalKeyComparator* const icmp = vstorage->InternalComparator(); + assert(icmp); + + for (int level = 1; level < num_levels_; ++level) { + auto checker = [this, level, icmp](const FileMetaData* lhs, + const FileMetaData* rhs) { + assert(lhs); + assert(rhs); + + if (!level_nonzero_cmp_(lhs, rhs)) { + std::ostringstream oss; + oss << 'L' << level << " files are not sorted properly: files #" + << lhs->fd.GetNumber() 
<< ", #" << rhs->fd.GetNumber(); + + return Status::Corruption("VersionBuilder", oss.str()); } - // Make sure there is no overlap in levels > 0 - if (vstorage->InternalComparator()->Compare(f1->largest, - f2->smallest) >= 0) { - fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level, - (f1->largest).DebugString(true).c_str(), - (f2->smallest).DebugString(true).c_str()); - return Status::Corruption( - "L" + NumberToString(level) + " have overlapping ranges " + - (f1->largest).DebugString(true) + " vs. " + - (f2->smallest).DebugString(true)); + // Make sure there is no overlap in level + if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) { + std::ostringstream oss; + oss << 'L' << level << " has overlapping ranges: file #" + << lhs->fd.GetNumber() + << " largest key: " << lhs->largest.DebugString(true) + << " vs. file #" << rhs->fd.GetNumber() + << " smallest key: " << rhs->smallest.DebugString(true); + + return Status::Corruption("VersionBuilder", oss.str()); } + + return Status::OK(); + }; + + const Status s = CheckConsistencyDetailsForLevel( + vstorage, level, checker, "VersionBuilder::CheckConsistency1", + &expected_linked_ssts); + if (!s.ok()) { + return s; } } } - return Status::OK(); + + // Make sure that all blob files in the version have non-garbage data and + // the links between them and the table files are consistent. 
+ const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& blob_file_meta = pair.second; + assert(blob_file_meta); + + if (blob_file_meta->GetGarbageBlobCount() >= + blob_file_meta->GetTotalBlobCount()) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number + << " consists entirely of garbage"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (blob_file_meta->GetLinkedSsts() != + expected_linked_ssts[blob_file_number]) { + std::ostringstream oss; + oss << "Links are inconsistent between table files and blob file #" + << blob_file_number; + + return Status::Corruption("VersionBuilder", oss.str()); + } + } + + Status ret_s; + TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn", + &ret_s); + return ret_s; } - Status CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, - int level) { + Status CheckConsistency(const VersionStorageInfo* vstorage) const { + assert(vstorage); + + // Always run consistency checks in debug build #ifdef NDEBUG - if (!base_vstorage_->force_consistency_checks()) { - // Dont run consistency checks in release mode except if - // explicitly asked to + if (!vstorage->force_consistency_checks()) { return Status::OK(); } #endif - // a file to be deleted better exist in the previous version - bool found = false; - for (int l = 0; !found && l < num_levels_; l++) { - const std::vector& base_files = - base_vstorage_->LevelFiles(l); - for (size_t i = 0; i < base_files.size(); i++) { - FileMetaData* f = base_files[i]; - if (f->fd.GetNumber() == number) { - found = true; - break; - } + Status s = CheckConsistencyDetails(vstorage); + if (s.IsCorruption() && s.getState()) { + // Make it clear the error is due to force_consistency_checks = 1 or + // debug build +#ifdef NDEBUG + auto prefix = "force_consistency_checks"; +#else + auto prefix = "force_consistency_checks(DEBUG)"; +#endif + s = 
Status::Corruption(prefix, s.getState()); + } else { + // was only expecting corruption with message, or OK + assert(s.ok()); + } + return s; + } + + bool CheckConsistencyForNumLevels() const { + // Make sure there are no files on or beyond num_levels(). + if (has_invalid_levels_) { + return false; + } + + for (const auto& pair : invalid_level_sizes_) { + const size_t level_size = pair.second; + if (level_size != 0) { + return false; } } - // if the file did not exist in the previous version, then it - // is possibly moved from lower level to higher level in current - // version - for (int l = level + 1; !found && l < num_levels_; l++) { - auto& level_added = levels_[l].added_files; - auto got = level_added.find(number); - if (got != level_added.end()) { - found = true; - break; + + return true; + } + + bool IsBlobFileInVersion(uint64_t blob_file_number) const { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return true; + } + + assert(base_vstorage_); + + const auto& base_blob_files = base_vstorage_->GetBlobFiles(); + + auto base_it = base_blob_files.find(blob_file_number); + if (base_it != base_blob_files.end()) { + return true; + } + + return false; + } + + MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData( + uint64_t blob_file_number) { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return &mutable_it->second; + } + + assert(base_vstorage_); + + const auto& base_blob_files = base_vstorage_->GetBlobFiles(); + + auto base_it = base_blob_files.find(blob_file_number); + if (base_it != base_blob_files.end()) { + assert(base_it->second); + + mutable_it = mutable_blob_file_metas_ + .emplace(blob_file_number, + MutableBlobFileMetaData(base_it->second)) + .first; + return &mutable_it->second; + } + + return nullptr; + } + + Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) { + const 
uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber(); + + if (IsBlobFileInVersion(blob_file_number)) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " already added"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + // Note: we use C++11 for now but in C++14, this could be done in a more + // elegant way using generalized lambda capture. + VersionSet* const vs = version_set_; + const ImmutableCFOptions* const ioptions = ioptions_; + + auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) { + if (vs) { + assert(ioptions); + assert(!ioptions->cf_paths.empty()); + assert(shared_meta); + + vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(), + ioptions->cf_paths.front().path); } + + delete shared_meta; + }; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, blob_file_addition.GetTotalBlobCount(), + blob_file_addition.GetTotalBlobBytes(), + blob_file_addition.GetChecksumMethod(), + blob_file_addition.GetChecksumValue(), deleter); + + mutable_blob_file_metas_.emplace( + blob_file_number, MutableBlobFileMetaData(std::move(shared_meta))); + + return Status::OK(); + } + + Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) { + const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber(); + + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + + if (!mutable_meta) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " not found"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(), + blob_file_garbage.GetGarbageBlobBytes())) { + std::ostringstream oss; + oss << "Garbage overflow for blob file #" << blob_file_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + + return Status::OK(); + } + + int GetCurrentLevelForTableFile(uint64_t file_number) const { + auto it = 
table_file_levels_.find(file_number); + if (it != table_file_levels_.end()) { + return it->second; + } + + assert(base_vstorage_); + return base_vstorage_->GetFileLocation(file_number).GetLevel(); + } + + uint64_t GetOldestBlobFileNumberForTableFile(int level, + uint64_t file_number) const { + assert(level < num_levels_); + + const auto& added_files = levels_[level].added_files; + + auto it = added_files.find(file_number); + if (it != added_files.end()) { + const FileMetaData* const meta = it->second; + assert(meta); + + return meta->oldest_blob_file_number; } - // maybe this file was added in a previous edit that was Applied - if (!found) { - auto& level_added = levels_[level].added_files; - auto got = level_added.find(number); - if (got != level_added.end()) { - found = true; + assert(base_vstorage_); + const FileMetaData* const meta = + base_vstorage_->GetFileMetaDataByNumber(file_number); + assert(meta); + + return meta->oldest_blob_file_number; + } + + Status ApplyFileDeletion(int level, uint64_t file_number) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (level != current_level) { + if (level >= num_levels_) { + has_invalid_levels_ = true; + } + + std::ostringstream oss; + oss << "Cannot delete table file #" << file_number << " from level " + << level << " since it is "; + if (current_level == + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + oss << "not in the LSM tree"; + } else { + oss << "on level " << current_level; } + + return Status::Corruption("VersionBuilder", oss.str()); } - if (!found) { - fprintf(stderr, "not found %" PRIu64 "\n", number); - return Status::Corruption("not found " + NumberToString(number)); + + if (level >= num_levels_) { + assert(invalid_level_sizes_[level] > 0); + --invalid_level_sizes_[level]; + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + + return 
Status::OK(); + } + + const uint64_t blob_file_number = + GetOldestBlobFileNumberForTableFile(level, file_number); + + if (blob_file_number != kInvalidBlobFileNumber) { + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + if (mutable_meta) { + mutable_meta->UnlinkSst(file_number); + } } + + auto& level_state = levels_[level]; + + auto& add_files = level_state.added_files; + auto add_it = add_files.find(file_number); + if (add_it != add_files.end()) { + UnrefFile(add_it->second); + add_files.erase(add_it); + } + + auto& del_files = level_state.deleted_files; + assert(del_files.find(file_number) == del_files.end()); + del_files.emplace(file_number); + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + return Status::OK(); } - bool CheckConsistencyForNumLevels() { - // Make sure there are no files on or beyond num_levels(). - if (has_invalid_levels_) { - return false; + Status ApplyFileAddition(int level, const FileMetaData& meta) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const uint64_t file_number = meta.fd.GetNumber(); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (current_level != + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + if (level >= num_levels_) { + has_invalid_levels_ = true; + } + + std::ostringstream oss; + oss << "Cannot add table file #" << file_number << " to level " << level + << " since it is already in the LSM tree on level " << current_level; + return Status::Corruption("VersionBuilder", oss.str()); } - for (auto& level : invalid_levels_) { - if (level.second.size() > 0) { - return false; + + if (level >= num_levels_) { + ++invalid_level_sizes_[level]; + table_file_levels_[file_number] = level; + + return Status::OK(); + } + + auto& level_state = levels_[level]; + + auto& del_files = level_state.deleted_files; + auto del_it = del_files.find(file_number); + if (del_it 
!= del_files.end()) { + del_files.erase(del_it); + } + + FileMetaData* const f = new FileMetaData(meta); + f->refs = 1; + + auto& add_files = level_state.added_files; + assert(add_files.find(file_number) == add_files.end()); + add_files.emplace(file_number, f); + + const uint64_t blob_file_number = f->oldest_blob_file_number; + + if (blob_file_number != kInvalidBlobFileNumber) { + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + if (mutable_meta) { + mutable_meta->LinkSst(file_number); } } - return true; + + table_file_levels_[file_number] = level; + + return Status::OK(); } // Apply all of the edits in *edit to the current state. - Status Apply(VersionEdit* edit) { - Status s = CheckConsistency(base_vstorage_); - if (!s.ok()) { - return s; + Status Apply(const VersionEdit* edit) { + { + const Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } } - // Delete files - const auto& del = edit->GetDeletedFiles(); - for (const auto& del_file : del) { - const auto level = del_file.first; - const auto number = del_file.second; - if (level < num_levels_) { - levels_[level].deleted_files.insert(number); - CheckConsistencyForDeletes(edit, number, level); - - auto exising = levels_[level].added_files.find(number); - if (exising != levels_[level].added_files.end()) { - UnrefFile(exising->second); - levels_[level].added_files.erase(exising); - } - } else { - if (invalid_levels_[level].erase(number) == 0) { - // Deleting an non-existing file on invalid level. - has_invalid_levels_ = true; - } + // Note: we process the blob file related changes first because the + // table file addition/deletion logic depends on the blob files + // already being there. 
+ + // Add new blob files + for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) { + const Status s = ApplyBlobFileAddition(blob_file_addition); + if (!s.ok()) { + return s; + } + } + + // Increase the amount of garbage for blob files affected by GC + for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) { + const Status s = ApplyBlobFileGarbage(blob_file_garbage); + if (!s.ok()) { + return s; + } + } + + // Delete table files + for (const auto& deleted_file : edit->GetDeletedFiles()) { + const int level = deleted_file.first; + const uint64_t file_number = deleted_file.second; + + const Status s = ApplyFileDeletion(level, file_number); + if (!s.ok()) { + return s; } } - // Add new files + // Add new table files for (const auto& new_file : edit->GetNewFiles()) { const int level = new_file.first; - if (level < num_levels_) { - FileMetaData* f = new FileMetaData(new_file.second); - f->refs = 1; - - assert(levels_[level].added_files.find(f->fd.GetNumber()) == - levels_[level].added_files.end()); - levels_[level].deleted_files.erase(f->fd.GetNumber()); - levels_[level].added_files[f->fd.GetNumber()] = f; + const FileMetaData& meta = new_file.second; + + const Status s = ApplyFileAddition(level, meta); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); + } + + // Helper function template for merging the blob file metadata from the base + // version with the mutable metadata representing the state after applying the + // edits. The function objects process_base and process_mutable are + // respectively called to handle a base version object when there is no + // matching mutable object, and a mutable object when there is no matching + // base version object. process_both is called to perform the merge when a + // given blob file appears both in the base version and the mutable list. The + // helper stops processing objects if a function object returns false. Blob + // files with a file number below first_blob_file are not processed. 
+ template + void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base, + ProcessMutable process_mutable, + ProcessBoth process_both) const { + assert(base_vstorage_); + + const auto& base_blob_files = base_vstorage_->GetBlobFiles(); + auto base_it = base_blob_files.lower_bound(first_blob_file); + const auto base_it_end = base_blob_files.end(); + + auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file); + const auto mutable_it_end = mutable_blob_file_metas_.end(); + + while (base_it != base_it_end && mutable_it != mutable_it_end) { + const uint64_t base_blob_file_number = base_it->first; + const uint64_t mutable_blob_file_number = mutable_it->first; + + if (base_blob_file_number < mutable_blob_file_number) { + const auto& base_meta = base_it->second; + + if (!process_base(base_meta)) { + return; + } + + ++base_it; + } else if (mutable_blob_file_number < base_blob_file_number) { + const auto& mutable_meta = mutable_it->second; + + if (!process_mutable(mutable_meta)) { + return; + } + + ++mutable_it; } else { - uint64_t number = new_file.second.fd.GetNumber(); - auto& lvls = invalid_levels_[level]; - if (lvls.count(number) == 0) { - lvls.insert(number); - } else { - // Creating an already existing file on invalid level. - has_invalid_levels_ = true; + assert(base_blob_file_number == mutable_blob_file_number); + + const auto& base_meta = base_it->second; + const auto& mutable_meta = mutable_it->second; + + if (!process_both(base_meta, mutable_meta)) { + return; } + + ++base_it; + ++mutable_it; } } - return s; + + while (base_it != base_it_end) { + const auto& base_meta = base_it->second; + + if (!process_base(base_meta)) { + return; + } + + ++base_it; + } + + while (mutable_it != mutable_it_end) { + const auto& mutable_meta = mutable_it->second; + + if (!process_mutable(mutable_meta)) { + return; + } + + ++mutable_it; + } + } + + // Helper function template for finding the first blob file that has linked + // SSTs. 
+ template + static bool CheckLinkedSsts(const Meta& meta, + uint64_t* min_oldest_blob_file_num) { + assert(min_oldest_blob_file_num); + + if (!meta.GetLinkedSsts().empty()) { + assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber); + + *min_oldest_blob_file_num = meta.GetBlobFileNumber(); + + return false; + } + + return true; + } + + // Find the oldest blob file that has linked SSTs. + uint64_t GetMinOldestBlobFileNumber() const { + uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber; + + auto process_base = + [&min_oldest_blob_file_num]( + const std::shared_ptr& base_meta) { + assert(base_meta); + + return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num); + }; + + auto process_mutable = [&min_oldest_blob_file_num]( + const MutableBlobFileMetaData& mutable_meta) { + return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num); + }; + + auto process_both = [&min_oldest_blob_file_num]( + const std::shared_ptr& base_meta, + const MutableBlobFileMetaData& mutable_meta) { +#ifndef NDEBUG + assert(base_meta); + assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta()); +#else + (void)base_meta; +#endif + + // Look at mutable_meta since it supersedes *base_meta + return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num); + }; + + MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable, + process_both); + + return min_oldest_blob_file_num; } - // Save the current state in *v. - Status SaveTo(VersionStorageInfo* vstorage) { + static std::shared_ptr CreateBlobFileMetaData( + const MutableBlobFileMetaData& mutable_meta) { + return BlobFileMetaData::Create( + mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(), + mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes()); + } + + // Add the blob file specified by meta to *vstorage if it is determined to + // contain valid data (blobs). 
+ template + static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) { + assert(vstorage); + assert(meta); + + if (meta->GetLinkedSsts().empty() && + meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) { + return; + } + + vstorage->AddBlobFile(std::forward(meta)); + } + + // Merge the blob file metadata from the base version with the changes (edits) + // applied, and save the result into *vstorage. + void SaveBlobFilesTo(VersionStorageInfo* vstorage) const { + assert(vstorage); + + const uint64_t oldest_blob_file_with_linked_ssts = + GetMinOldestBlobFileNumber(); + + auto process_base = + [vstorage](const std::shared_ptr& base_meta) { + assert(base_meta); + + AddBlobFileIfNeeded(vstorage, base_meta); + + return true; + }; + + auto process_mutable = + [vstorage](const MutableBlobFileMetaData& mutable_meta) { + AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); + + return true; + }; + + auto process_both = [vstorage]( + const std::shared_ptr& base_meta, + const MutableBlobFileMetaData& mutable_meta) { + assert(base_meta); + assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta()); + + if (!mutable_meta.HasDelta()) { + assert(base_meta->GetGarbageBlobCount() == + mutable_meta.GetGarbageBlobCount()); + assert(base_meta->GetGarbageBlobBytes() == + mutable_meta.GetGarbageBlobBytes()); + assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts()); + + AddBlobFileIfNeeded(vstorage, base_meta); + + return true; + } + + AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); + + return true; + }; + + MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base, + process_mutable, process_both); + } + + void MaybeAddFile(VersionStorageInfo* vstorage, int level, + FileMetaData* f) const { + const uint64_t file_number = f->fd.GetNumber(); + + const auto& level_state = levels_[level]; + + const auto& del_files = level_state.deleted_files; + const auto del_it = del_files.find(file_number); + + if 
(del_it != del_files.end()) { + // f is to-be-deleted table file + vstorage->RemoveCurrentStats(f); + } else { + const auto& add_files = level_state.added_files; + const auto add_it = add_files.find(file_number); + + // Note: if the file appears both in the base version and in the added + // list, the added FileMetaData supersedes the one in the base version. + if (add_it != add_files.end() && add_it->second != f) { + vstorage->RemoveCurrentStats(f); + } else { + vstorage->AddFile(level, f); + } + } + } + + template + void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const { + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *vstorage. + const auto& base_files = base_vstorage_->LevelFiles(level); + const auto& unordered_added_files = levels_[level].added_files; + vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); + + // Sort added files for the level. + std::vector added_files; + added_files.reserve(unordered_added_files.size()); + for (const auto& pair : unordered_added_files) { + added_files.push_back(pair.second); + } + std::sort(added_files.begin(), added_files.end(), cmp); + + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + auto added_iter = added_files.begin(); + auto added_end = added_files.end(); + while (added_iter != added_end || base_iter != base_end) { + if (base_iter == base_end || + (added_iter != added_end && cmp(*added_iter, *base_iter))) { + MaybeAddFile(vstorage, level, *added_iter++); + } else { + MaybeAddFile(vstorage, level, *base_iter++); + } + } + } + + void SaveSSTFilesTo(VersionStorageInfo* vstorage) const { + assert(vstorage); + + if (!num_levels_) { + return; + } + + SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_); + + for (int level = 1; level < num_levels_; ++level) { + SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_); + } + } + + // Save the current state in *vstorage. 
+ Status SaveTo(VersionStorageInfo* vstorage) const { Status s = CheckConsistency(base_vstorage_); if (!s.ok()) { return s; @@ -348,56 +1135,19 @@ return s; } - for (int level = 0; level < num_levels_; level++) { - const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; - // Merge the set of added files with the set of pre-existing files. - // Drop any deleted files. Store the result in *v. - const auto& base_files = base_vstorage_->LevelFiles(level); - const auto& unordered_added_files = levels_[level].added_files; - vstorage->Reserve(level, - base_files.size() + unordered_added_files.size()); - - // Sort added files for the level. - std::vector added_files; - added_files.reserve(unordered_added_files.size()); - for (const auto& pair : unordered_added_files) { - added_files.push_back(pair.second); - } - std::sort(added_files.begin(), added_files.end(), cmp); + SaveSSTFilesTo(vstorage); -#ifndef NDEBUG - FileMetaData* prev_added_file = nullptr; - for (const auto& added : added_files) { - if (level > 0 && prev_added_file != nullptr) { - assert(base_vstorage_->InternalComparator()->Compare( - prev_added_file->smallest, added->smallest) <= 0); - } - prev_added_file = added; - } -#endif - - auto base_iter = base_files.begin(); - auto base_end = base_files.end(); - auto added_iter = added_files.begin(); - auto added_end = added_files.end(); - while (added_iter != added_end || base_iter != base_end) { - if (base_iter == base_end || - (added_iter != added_end && cmp(*added_iter, *base_iter))) { - MaybeAddFile(vstorage, level, *added_iter++); - } else { - MaybeAddFile(vstorage, level, *base_iter++); - } - } - } + SaveBlobFilesTo(vstorage); s = CheckConsistency(vstorage); return s; } - Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, - const SliceTransform* prefix_extractor) { + Status LoadTableHandlers( + InternalStats* internal_stats, int max_threads, + bool 
prefetch_index_and_filter_in_cache, bool is_initial_load, + const std::shared_ptr& prefix_extractor, + size_t max_file_size_for_l0_meta_pin) { assert(table_cache_ != nullptr); size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity(); @@ -405,7 +1155,7 @@ size_t max_load = port::kMaxSizet; if (!always_load) { - // If it is initial loading and not set to always laoding all the + // If it is initial loading and not set to always loading all the // files, we only load up to kInitialLoadLimit files, to limit the // time reopening the DB. const size_t kInitialLoadLimit = 16; @@ -462,11 +1212,13 @@ auto* file_meta = files_meta[file_idx].first; int level = files_meta[file_idx].second; statuses[file_idx] = table_cache_->FindTable( - file_options_, *(base_vstorage_->InternalComparator()), - file_meta->fd, &file_meta->table_reader_handle, prefix_extractor, - false /*no_io */, true /* record_read_stats */, + ReadOptions(), file_options_, + *(base_vstorage_->InternalComparator()), file_meta->fd, + &file_meta->table_reader_handle, prefix_extractor, false /*no_io */, + true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, - prefetch_index_and_filter_in_cache); + prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, + file_meta->temperature); if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( @@ -483,63 +1235,75 @@ for (auto& t : threads) { t.join(); } + Status ret; for (const auto& s : statuses) { if (!s.ok()) { - return s; + if (ret.ok()) { + ret = s; + } } } - return Status::OK(); - } - - void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { - if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { - // f is to-be-deleted table file - vstorage->RemoveCurrentStats(f); - } else { - vstorage->AddFile(level, f, info_log_); - } + return ret; } }; VersionBuilder::VersionBuilder(const FileOptions& 
file_options, + const ImmutableCFOptions* ioptions, TableCache* table_cache, VersionStorageInfo* base_vstorage, - Logger* info_log) - : rep_(new Rep(file_options, info_log, table_cache, base_vstorage)) {} + VersionSet* version_set) + : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage, + version_set)) {} -VersionBuilder::~VersionBuilder() { delete rep_; } - -Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { - return rep_->CheckConsistency(vstorage); -} - -Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, - uint64_t number, int level) { - return rep_->CheckConsistencyForDeletes(edit, number, level); -} +VersionBuilder::~VersionBuilder() = default; bool VersionBuilder::CheckConsistencyForNumLevels() { return rep_->CheckConsistencyForNumLevels(); } -Status VersionBuilder::Apply(VersionEdit* edit) { return rep_->Apply(edit); } +Status VersionBuilder::Apply(const VersionEdit* edit) { + return rep_->Apply(edit); +} -Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { +Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const { return rep_->SaveTo(vstorage); } Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, - const SliceTransform* prefix_extractor) { - return rep_->LoadTableHandlers(internal_stats, max_threads, - prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor); + const std::shared_ptr& prefix_extractor, + size_t max_file_size_for_l0_meta_pin) { + return rep_->LoadTableHandlers( + internal_stats, max_threads, prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); +} + +uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { + return rep_->GetMinOldestBlobFileNumber(); +} + +BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( + ColumnFamilyData* cfd) + : version_builder_(new VersionBuilder( + 
cfd->current()->version_set()->file_options(), cfd->ioptions(), + cfd->table_cache(), cfd->current()->storage_info(), + cfd->current()->version_set())), + version_(cfd->current()) { + version_->Ref(); +} + +BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( + ColumnFamilyData* cfd, Version* v) + : version_builder_(new VersionBuilder( + cfd->current()->version_set()->file_options(), cfd->ioptions(), + cfd->table_cache(), v->storage_info(), v->version_set())), + version_(v) { + assert(version_ != cfd->current()); } -void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, - FileMetaData* f) { - rep_->MaybeAddFile(vstorage, level, f); +BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() { + version_->Unref(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,41 +8,62 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. // #pragma once + +#include + #include "rocksdb/file_system.h" #include "rocksdb/slice_transform.h" namespace ROCKSDB_NAMESPACE { +struct ImmutableCFOptions; class TableCache; class VersionStorageInfo; class VersionEdit; struct FileMetaData; class InternalStats; +class Version; +class VersionSet; +class ColumnFamilyData; // A helper class so we can efficiently apply a whole sequence // of edits to a particular state without creating intermediate // Versions that contain full copies of the intermediate state. 
class VersionBuilder { public: - VersionBuilder(const FileOptions& file_options, TableCache* table_cache, - VersionStorageInfo* base_vstorage, Logger* info_log = nullptr); + VersionBuilder(const FileOptions& file_options, + const ImmutableCFOptions* ioptions, TableCache* table_cache, + VersionStorageInfo* base_vstorage, VersionSet* version_set); ~VersionBuilder(); - Status CheckConsistency(VersionStorageInfo* vstorage); - Status CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, - int level); + bool CheckConsistencyForNumLevels(); - Status Apply(VersionEdit* edit); - Status SaveTo(VersionStorageInfo* vstorage); - Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, - const SliceTransform* prefix_extractor); - void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); + Status Apply(const VersionEdit* edit); + Status SaveTo(VersionStorageInfo* vstorage) const; + Status LoadTableHandlers( + InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, bool is_initial_load, + const std::shared_ptr& prefix_extractor, + size_t max_file_size_for_l0_meta_pin); + uint64_t GetMinOldestBlobFileNumber() const; private: class Rep; - Rep* rep_; + std::unique_ptr rep_; +}; + +// A wrapper of version builder which references the current version in +// constructor and unref it in the destructor. +// Both of the constructor and destructor need to be called inside DB Mutex. 
+class BaseReferencedVersionBuilder { + public: + explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd); + BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v); + ~BaseReferencedVersionBuilder(); + VersionBuilder* version_builder() const { return version_builder_.get(); } + + private: + std::unique_ptr version_builder_; + Version* version_; }; -extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include +#include +#include +#include #include + #include "db/version_edit.h" #include "db/version_set.h" -#include "logging/logging.h" +#include "rocksdb/advanced_options.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -18,7 +23,7 @@ const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; uint32_t file_num_; @@ -52,19 +57,22 @@ return InternalKey(ukey, smallest_seq, kTypeValue); } - void Add(int level, uint32_t file_number, const char* smallest, + void Add(int level, uint64_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, uint64_t num_entries = 0, uint64_t num_deletions = 0, bool sampled = false, SequenceNumber smallest_seqno = 0, - SequenceNumber largest_seqno = 0) { + SequenceNumber 
largest_seqno = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq), GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, - /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + /* marked_for_compact */ false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -75,8 +83,77 @@ } } + void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value, + BlobFileMetaData::LinkedSsts linked_ssts, + uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) { + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value)); + auto meta = + BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes); + + vstorage_.AddBlobFile(std::move(meta)); + } + + void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) { + constexpr int level = 0; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr uint64_t file_size = 100; + constexpr uint32_t path_id = 0; + constexpr SequenceNumber smallest_seq = 0; + constexpr SequenceNumber largest_seq = 0; + constexpr uint64_t num_entries = 0; + constexpr uint64_t num_deletions = 0; + constexpr bool sampled = false; + + Add(level, table_file_number, smallest, largest, file_size, path_id, + smallest_seq, 
largest_seq, num_entries, num_deletions, sampled, + smallest_seq, largest_seq, blob_file_number); + } + + void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number, + uint64_t blob_file_number) { + assert(edit); + + constexpr int level = 0; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 100; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 300; + constexpr bool marked_for_compaction = false; + + edit->AddFile(level, table_file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + } + + static std::shared_ptr GetBlobFileMetaData( + const VersionStorageInfo::BlobFiles& blob_files, + uint64_t blob_file_number) { + const auto it = blob_files.find(blob_file_number); + + if (it == blob_files.end()) { + return std::shared_ptr(); + } + + const auto& meta = it->second; + assert(meta); + + return meta; + } + void UpdateVersionStorageInfo() { - vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_.UpdateFilesByCompactionPri(ioptions_, mutable_cf_options_); vstorage_.UpdateNumNonEmptyLevels(); vstorage_.GenerateFileIndexer(); vstorage_.GenerateLevelFilesBrief(); @@ -115,19 +192,23 @@ VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, 
kDisableUserTimestamp); version_edit.DeleteFile(3, 27U); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2)); ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3)); @@ -152,20 +233,24 @@ VersionEdit version_edit; version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0)); ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3)); @@ -192,9 +277,10 @@ VersionEdit 
version_edit; version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -202,13 +288,16 @@ version_edit.DeleteFile(4, 8U); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0)); ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4)); @@ -223,38 +312,46 @@ VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - 
kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - 
version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2)); @@ -265,60 +362,1277 @@ UpdateVersionStorageInfo(); EnvOptions env_options; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + 
kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); - version_builder.Apply(&version_edit); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); - version_builder.Apply(&version_edit2); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, 
kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit2)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2)); UnrefFilesInVersion(&new_vstorage); } +TEST_F(VersionBuilderTest, ApplyFileDeletionIncorrectLevel) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + + Add(level, file_number, smallest, largest); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int incorrect_level = 3; + + edit.DeleteFile(incorrect_level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #2345 from level 3 since " + "it is on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 1234; + + edit.DeleteFile(level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #1234 from level 3 since " + "it is not in the LSM tree")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr uint64_t file_size = 10000; + constexpr uint32_t path_id = 0; + constexpr 
SequenceNumber smallest_seq = 100; + constexpr SequenceNumber largest_seq = 500; + constexpr uint64_t num_entries = 0; + constexpr uint64_t num_deletions = 0; + constexpr bool sampled = false; + constexpr SequenceNumber smallest_seqno = 1; + constexpr SequenceNumber largest_seqno = 1000; + + Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq, + largest_seq, num_entries, num_deletions, sampled, smallest_seqno, + largest_seqno); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + VersionEdit addition; + + constexpr bool marked_for_compaction = false; + + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + ASSERT_OK(builder.Apply(&addition)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + + Add(level, file_number, smallest, largest); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr 
VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int new_level = 2; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, 
kDisableUserTimestamp, + kDisableUserTimestamp); + + ASSERT_OK(builder.Apply(&edit)); + + VersionEdit other_edit; + + constexpr int new_level = 2; + + other_edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + const Status s = builder.Apply(&other_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 3")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + ASSERT_OK(builder.Apply(&addition)); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + constexpr bool force_consistency_checks = 
false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_FALSE(new_vstorage.GetFileLocation(file_number).IsValid()); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAddition) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. 
+ constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&edit, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + ASSERT_EQ(new_meta->GetGarbageBlobCount(), 0); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), 0); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyInBase) { + // Attempt to add a blob file that is already present in the base version. 
+ + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value, BlobFileMetaData::LinkedSsts(), garbage_blob_count, + garbage_blob_bytes); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added")); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyApplied) { + // Attempt to add the same blob file twice using version edits. 
+ + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + ASSERT_OK(builder.Apply(&edit)); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added")); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) { + // Increase the amount of garbage for a blob file present in the base version. + + constexpr uint64_t table_file_number = 1; + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value, BlobFileMetaData::LinkedSsts{table_file_number}, + garbage_blob_count, garbage_blob_bytes); + + const auto meta = + GetBlobFileMetaData(vstorage_.GetBlobFiles(), blob_file_number); + ASSERT_NE(meta, nullptr); + + // Add dummy table file to ensure the blob file is referenced. 
+ AddDummyFile(table_file_number, blob_file_number); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t new_garbage_blob_count = 456; + constexpr uint64_t new_garbage_blob_bytes = 111111; + + edit.AddBlobFileGarbage(blob_file_number, new_garbage_blob_count, + new_garbage_blob_bytes); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetSharedMeta(), meta->GetSharedMeta()); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + ASSERT_EQ(new_meta->GetGarbageBlobCount(), + garbage_blob_count + new_garbage_blob_count); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), + garbage_blob_bytes + new_garbage_blob_bytes); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) { + // Increase the amount of garbage for a blob file added using a version edit. 
+ + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. + constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&addition)); + + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + ASSERT_OK(builder.Apply(&garbage)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + 
ASSERT_EQ(new_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileNotFound) { + // Attempt to increase the amount of garbage for a blob file that is + // neither in the base version, nor was it added using a version edit. + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t garbage_blob_count = 5678; + constexpr uint64_t garbage_blob_bytes = 999999; + + edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 not found")); +} + +TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) { + // Test that VersionEdits that would result in the count/total size of garbage + // exceeding the count/total size of all blobs are rejected. + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. 
+ constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&addition)); + + { + // Garbage blob count overflow + constexpr uint64_t garbage_blob_count = 5679; + constexpr uint64_t garbage_blob_bytes = 999999; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&garbage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Garbage overflow for blob file #1234")); + } + + { + // Garbage blob bytes overflow + constexpr uint64_t garbage_blob_count = 5678; + constexpr uint64_t garbage_blob_bytes = 1000000; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&garbage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Garbage overflow for blob file #1234")); + } +} + +TEST_F(VersionBuilderTest, SaveBlobFilesTo) { + // Add three blob files to base version. + for (uint64_t i = 3; i >= 1; --i) { + const uint64_t table_file_number = i; + const uint64_t blob_file_number = i; + const uint64_t total_blob_count = i * 1000; + const uint64_t total_blob_bytes = i * 1000000; + const uint64_t garbage_blob_count = i * 100; + const uint64_t garbage_blob_bytes = i * 20000; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), + BlobFileMetaData::LinkedSsts{table_file_number}, garbage_blob_count, + garbage_blob_bytes); + + // Add dummy table file to ensure the blob file is referenced. 
+ AddDummyFile(table_file_number, blob_file_number); + } + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + // Add some garbage to the second and third blob files. The second blob file + // remains valid since it does not consist entirely of garbage yet. The third + // blob file is all garbage after the edit and will not be part of the new + // version. The corresponding dummy table file is also removed for + // consistency. + edit.AddBlobFileGarbage(/* blob_file_number */ 2, + /* garbage_blob_count */ 200, + /* garbage_blob_bytes */ 100000); + edit.AddBlobFileGarbage(/* blob_file_number */ 3, + /* garbage_blob_count */ 2700, + /* garbage_blob_bytes */ 2940000); + edit.DeleteFile(/* level */ 0, /* file_number */ 3); + + // Add a fourth blob file. + edit.AddBlobFile(/* blob_file_number */ 4, /* total_blob_count */ 4000, + /* total_blob_bytes */ 4000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 3); + + const auto meta1 = GetBlobFileMetaData(new_blob_files, 1); + + ASSERT_NE(meta1, nullptr); + ASSERT_EQ(meta1->GetBlobFileNumber(), 1); + ASSERT_EQ(meta1->GetTotalBlobCount(), 1000); + ASSERT_EQ(meta1->GetTotalBlobBytes(), 1000000); + ASSERT_EQ(meta1->GetGarbageBlobCount(), 100); + ASSERT_EQ(meta1->GetGarbageBlobBytes(), 20000); + + const auto meta2 = GetBlobFileMetaData(new_blob_files, 2); + + ASSERT_NE(meta2, nullptr); + ASSERT_EQ(meta2->GetBlobFileNumber(), 2); + 
ASSERT_EQ(meta2->GetTotalBlobCount(), 2000); + ASSERT_EQ(meta2->GetTotalBlobBytes(), 2000000); + ASSERT_EQ(meta2->GetGarbageBlobCount(), 400); + ASSERT_EQ(meta2->GetGarbageBlobBytes(), 140000); + + const auto meta4 = GetBlobFileMetaData(new_blob_files, 4); + + ASSERT_NE(meta4, nullptr); + ASSERT_EQ(meta4->GetBlobFileNumber(), 4); + ASSERT_EQ(meta4->GetTotalBlobCount(), 4000); + ASSERT_EQ(meta4->GetTotalBlobBytes(), 4000000); + ASSERT_EQ(meta4->GetGarbageBlobCount(), 0); + ASSERT_EQ(meta4->GetGarbageBlobBytes(), 0); + + // Delete the first table file, which makes the first blob file obsolete + // since it's at the head and unreferenced. + VersionBuilder second_builder(env_options, &ioptions_, table_cache, + &new_vstorage, version_set); + + VersionEdit second_edit; + second_edit.DeleteFile(/* level */ 0, /* file_number */ 1); + + ASSERT_OK(second_builder.Apply(&second_edit)); + + VersionStorageInfo newer_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &new_vstorage, + force_consistency_checks); + + ASSERT_OK(second_builder.SaveTo(&newer_vstorage)); + + const auto& newer_blob_files = newer_vstorage.GetBlobFiles(); + ASSERT_EQ(newer_blob_files.size(), 2); + + const auto newer_meta1 = GetBlobFileMetaData(newer_blob_files, 1); + + ASSERT_EQ(newer_meta1, nullptr); + + UnrefFilesInVersion(&newer_vstorage); + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { + // When multiple background jobs (flushes/compactions) are executing in + // parallel, it is possible for the VersionEdit adding blob file K to be + // applied *after* the VersionEdit adding blob file N (for N > K). This test + // case makes sure this is handled correctly. + + // Add blob file #4 (referenced by table file #3) to base version. 
+ constexpr uint64_t base_table_file_number = 3; + constexpr uint64_t base_blob_file_number = 4; + constexpr uint64_t base_total_blob_count = 100; + constexpr uint64_t base_total_blob_bytes = 1 << 20; + + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = "\xfa\xce\xb0\x0c"; + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + AddDummyFile(base_table_file_number, base_blob_file_number); + AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes, + checksum_method, checksum_value, + BlobFileMetaData::LinkedSsts{base_table_file_number}, + garbage_blob_count, garbage_blob_bytes); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + // Add blob file #2 (referenced by table file #1). + constexpr int level = 0; + constexpr uint64_t table_file_number = 1; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 1 << 12; + constexpr char smallest[] = "key1"; + constexpr char largest[] = "key987"; + constexpr SequenceNumber smallest_seqno = 0; + constexpr SequenceNumber largest_seqno = 0; + constexpr bool marked_for_compaction = false; + + constexpr uint64_t blob_file_number = 2; + static_assert(blob_file_number < base_blob_file_number, + "Added blob file should have a smaller file number"); + + constexpr uint64_t total_blob_count = 234; + constexpr uint64_t total_blob_bytes = 1 << 22; + + edit.AddFile( + level, table_file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, checksum_value, + checksum_method, kDisableUserTimestamp, kDisableUserTimestamp); + edit.AddBlobFile(blob_file_number, total_blob_count, 
total_blob_bytes, + checksum_method, checksum_value); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 2); + + const auto base_meta = + GetBlobFileMetaData(new_blob_files, base_blob_file_number); + + ASSERT_NE(base_meta, nullptr); + ASSERT_EQ(base_meta->GetBlobFileNumber(), base_blob_file_number); + ASSERT_EQ(base_meta->GetTotalBlobCount(), base_total_blob_count); + ASSERT_EQ(base_meta->GetTotalBlobBytes(), base_total_blob_bytes); + ASSERT_EQ(base_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(base_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + ASSERT_EQ(base_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(base_meta->GetChecksumValue(), checksum_value); + + const auto added_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(added_meta, nullptr); + ASSERT_EQ(added_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(added_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(added_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(added_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(added_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + ASSERT_EQ(added_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(added_meta->GetChecksumValue(), checksum_value); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { + // Initialize base version. The first table file points to a valid blob file + // in this version; the second one does not refer to any blob files. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + Add(/* level */ 1, /* file_number */ 23, /* smallest */ "201", + /* largest */ "300", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 200, /* largest_seq */ 200, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 200, /* largest_seqno */ 200, + kInvalidBlobFileNumber); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000); + + UpdateVersionStorageInfo(); + + // Add a new table file that points to the existing blob file, and add a + // new table file--blob file pair. 
+ EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddFile(/* level */ 1, /* file_number */ 606, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("701"), + /* largest */ GetInternalKey("750"), /* smallest_seqno */ 200, + /* largest_seqno */ 200, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("801"), + /* largest */ GetInternalKey("850"), /* smallest_seqno */ 200, + /* largest_seqno */ 200, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, + /* total_blob_bytes */ 200000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + + ASSERT_OK(builder.Apply(&edit)); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesInconsistentLinks) { + // Initialize base version. Links between the table file and the blob file + // are inconsistent. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 256); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr( + s.getState(), + "Links are inconsistent between table files and blob file #16")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbage) { + // Initialize base version. The table file points to a blob file that is + // all garbage. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Blob file #16 consists entirely of garbage")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbageLinkedSsts) { + // Initialize base version, with a table file pointing to a blob file + // that has no garbage at this point. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 0, /* garbage_blob_bytes */ 0); + + UpdateVersionStorageInfo(); + + // Mark the entire blob file garbage but do not remove the linked SST. + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddBlobFileGarbage(/* blob_file_number */ 16, + /* garbage_blob_count */ 1000, + /* garbage_blob_bytes */ 1000000); + + ASSERT_OK(builder.Apply(&edit)); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Blob file #16 consists entirely of garbage")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { + // Initialize base version. Table files 1..10 are linked to blob files 1..5, + // while table files 11..20 are not linked to any blob files. 
+ + for (uint64_t i = 1; i <= 10; ++i) { + std::ostringstream oss; + oss << std::setw(2) << std::setfill('0') << i; + + const std::string key = oss.str(); + + Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(), + /* largest */ key.c_str(), /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ i * 100, + /* largest_seqno */ i * 100, + /* oldest_blob_file_number */ ((i - 1) % 5) + 1); + } + + for (uint64_t i = 1; i <= 5; ++i) { + AddBlob(/* blob_file_number */ i, /* total_blob_count */ 2000, + /* total_blob_bytes */ 2000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), + BlobFileMetaData::LinkedSsts{i, i + 5}, + /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000); + } + + for (uint64_t i = 11; i <= 20; ++i) { + std::ostringstream oss; + oss << std::setw(2) << std::setfill('0') << i; + + const std::string key = oss.str(); + + Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(), + /* largest */ key.c_str(), /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ i * 100, + /* largest_seqno */ i * 100, kInvalidBlobFileNumber); + } + + UpdateVersionStorageInfo(); + + { + const auto& blob_files = vstorage_.GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 5); + + const std::vector expected_linked_ssts{ + {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + + for (size_t i = 0; i < 5; ++i) { + const auto meta = + GetBlobFileMetaData(blob_files, /* blob_file_number */ i + 1); + ASSERT_NE(meta, nullptr); + ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]); + } + } + + VersionEdit edit; + + // Add an SST that references a blob file. 
+ edit.AddFile( + /* level */ 1, /* file_number */ 21, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("21", 2100), + /* largest */ GetInternalKey("21", 2100), /* smallest_seqno */ 2100, + /* largest_seqno */ 2100, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + // Add an SST that does not reference any blob files. + edit.AddFile( + /* level */ 1, /* file_number */ 22, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("22", 2200), + /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200, + /* largest_seqno */ 2200, /* marked_for_compaction */ false, + Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + // Delete a file that references a blob file. + edit.DeleteFile(/* level */ 1, /* file_number */ 6); + + // Delete a file that does not reference any blob files. + edit.DeleteFile(/* level */ 1, /* file_number */ 16); + + // Trivially move a file that references a blob file. Note that we save + // the original BlobFileMetaData object so we can check that no new object + // gets created. 
+ auto meta3 = + GetBlobFileMetaData(vstorage_.GetBlobFiles(), /* blob_file_number */ 3); + + edit.DeleteFile(/* level */ 1, /* file_number */ 3); + edit.AddFile(/* level */ 2, /* file_number */ 3, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("03", 300), + /* largest */ GetInternalKey("03", 300), + /* smallest_seqno */ 300, + /* largest_seqno */ 300, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + // Trivially move a file that does not reference any blob files. + edit.DeleteFile(/* level */ 1, /* file_number */ 13); + edit.AddFile(/* level */ 2, /* file_number */ 13, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("13", 1300), + /* largest */ GetInternalKey("13", 1300), + /* smallest_seqno */ 1300, + /* largest_seqno */ 1300, /* marked_for_compaction */ false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + // Add one more SST file that references a blob file, then promptly + // delete it in a second version edit before the new version gets saved. + // This file should not show up as linked to the blob file in the new version. 
+ edit.AddFile(/* level */ 1, /* file_number */ 23, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("23", 2300), + /* largest */ GetInternalKey("23", 2300), + /* smallest_seqno */ 2300, + /* largest_seqno */ 2300, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + VersionEdit edit2; + + edit2.DeleteFile(/* level */ 1, /* file_number */ 23); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + ASSERT_OK(builder.Apply(&edit)); + ASSERT_OK(builder.Apply(&edit2)); + + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + { + const auto& blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 5); + + const std::vector expected_linked_ssts{ + {1, 21}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + + for (size_t i = 0; i < 5; ++i) { + const auto meta = + GetBlobFileMetaData(blob_files, /* blob_file_number */ i + 1); + ASSERT_NE(meta, nullptr); + ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]); + } + + // Make sure that no new BlobFileMetaData got created for the blob file + // affected by the trivial move. 
+ ASSERT_EQ(GetBlobFileMetaData(blob_files, /* blob_file_number */ 3), meta3); + } + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { + Add(0, 1U, "150", "200", 100U); + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.DeleteFile(0, 1U); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + VersionBuilder version_builder2(env_options, &ioptions_, table_cache, + &new_vstorage, version_set); + VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_NOK(version_builder2.Apply(&version_edit)); + + UnrefFilesInVersion(&new_vstorage); + UnrefFilesInVersion(&new_vstorage2); +} + TEST_F(VersionBuilderTest, EstimatedActiveKeys) { const uint32_t kTotalSamples = 20; const uint32_t kNumLevels = 5; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,7 @@ #include "db/version_edit.h" -#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/version_set.h" #include "logging/event_logger.h" #include "rocksdb/slice.h" @@ -18,61 +18,10 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -// The unknown file checksum. 
-const std::string kUnknownFileChecksum(""); -// The unknown sst file checksum function name. -const std::string kUnknownFileChecksumFuncName("Unknown"); -// Mask for an identified tag from the future which can be safely ignored. -const uint32_t kTagSafeIgnoreMask = 1 << 13; - -// Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. The number should be forward compatible so -// users can down-grade RocksDB safely. A future Tag is ignored by doing '&' -// between Tag and kTagSafeIgnoreMask field. -enum Tag : uint32_t { - kComparator = 1, - kLogNumber = 2, - kNextFileNumber = 3, - kLastSequence = 4, - kCompactPointer = 5, - kDeletedFile = 6, - kNewFile = 7, - // 8 was used for large value refs - kPrevLogNumber = 9, - kMinLogNumberToKeep = 10, - // Ignore-able field - kDbId = kTagSafeIgnoreMask + 1, - - // these are new formats divergent from open source leveldb - kNewFile2 = 100, - kNewFile3 = 102, - kNewFile4 = 103, // 4th (the latest) format version of adding files - kColumnFamily = 200, // specify column family for version edit - kColumnFamilyAdd = 201, - kColumnFamilyDrop = 202, - kMaxColumnFamily = 203, - - kInAtomicGroup = 300, -}; - -enum CustomTag : uint32_t { - kTerminate = 1, // The end of customized fields - kNeedCompaction = 2, - // Since Manifest is not entirely currently forward-compatible, and the only - // forward-compatible part is the CutsomtTag of kNewFile, we currently encode - // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be - // removed when manifest becomes forward-comptabile. - kMinLogNumberToKeepHack = 3, - kOldestBlobFileNumber = 4, - kOldestAncesterTime = 5, - kFileCreationTime = 6, - kFileChecksum = 7, - kFileChecksumFuncName = 8, - kPathId = 65, -}; -// If this bit for the custom tag is set, opening DB should fail if -// we don't know this field. 
-uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6; + +namespace { + +} // anonymous namespace uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { assert(number <= kFileNumberMask); @@ -89,7 +38,6 @@ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); fd.largest_seqno = std::max(fd.largest_seqno, seqno); -#ifndef ROCKSDB_LITE if (value_type == kTypeBlobIndex) { BlobIndex blob_index; const Status s = blob_index.DecodeFrom(value); @@ -116,10 +64,6 @@ oldest_blob_file_number = blob_index.file_number(); } } -#else - (void)value; - (void)value_type; -#endif } void VersionEdit::Clear() { @@ -142,12 +86,17 @@ has_last_sequence_ = false; deleted_files_.clear(); new_files_.clear(); + blob_file_additions_.clear(); + blob_file_garbages_.clear(); + wal_additions_.clear(); + wal_deletion_.Reset(); column_family_ = 0; is_column_family_add_ = false; is_column_family_drop_ = false; column_family_name_.clear(); is_in_atomic_group_ = false; remaining_entries_ = 0; + full_history_ts_low_.clear(); } bool VersionEdit::EncodeTo(std::string* dst) const { @@ -217,45 +166,60 @@ // tag kNeedCompaction: // now only can take one char value 1 indicating need-compaction // - PutVarint32(dst, CustomTag::kOldestAncesterTime); + PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime); std::string varint_oldest_ancester_time; PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", &varint_oldest_ancester_time); PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); - PutVarint32(dst, CustomTag::kFileCreationTime); + PutVarint32(dst, NewFileCustomTag::kFileCreationTime); std::string varint_file_creation_time; PutVarint64(&varint_file_creation_time, f.file_creation_time); TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", &varint_file_creation_time); PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); - PutVarint32(dst, CustomTag::kFileChecksum); + 
PutVarint32(dst, NewFileCustomTag::kFileChecksum); PutLengthPrefixedSlice(dst, Slice(f.file_checksum)); - PutVarint32(dst, CustomTag::kFileChecksumFuncName); + PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName); PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name)); + if (f.max_timestamp != kDisableUserTimestamp) { + if (f.min_timestamp.size() != f.max_timestamp.size()) { + assert(false); + return false; + } + PutVarint32(dst, NewFileCustomTag::kMinTimestamp); + PutLengthPrefixedSlice(dst, Slice(f.min_timestamp)); + PutVarint32(dst, NewFileCustomTag::kMaxTimestamp); + PutLengthPrefixedSlice(dst, Slice(f.max_timestamp)); + } if (f.fd.GetPathId() != 0) { - PutVarint32(dst, CustomTag::kPathId); + PutVarint32(dst, NewFileCustomTag::kPathId); char p = static_cast(f.fd.GetPathId()); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + if (f.temperature != Temperature::kUnknown) { + PutVarint32(dst, NewFileCustomTag::kTemperature); + char p = static_cast(f.temperature); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } if (f.marked_for_compaction) { - PutVarint32(dst, CustomTag::kNeedCompaction); + PutVarint32(dst, NewFileCustomTag::kNeedCompaction); char p = static_cast(1); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } if (has_min_log_number_to_keep_ && !min_log_num_written) { - PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack); + PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack); std::string varint_log_number; PutFixed64(&varint_log_number, min_log_number_to_keep_); PutLengthPrefixedSlice(dst, Slice(varint_log_number)); min_log_num_written = true; } if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { - PutVarint32(dst, CustomTag::kOldestBlobFileNumber); + PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber); std::string oldest_blob_file_number; PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); @@ -263,7 +227,31 @@ 
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); - PutVarint32(dst, CustomTag::kTerminate); + PutVarint32(dst, NewFileCustomTag::kTerminate); + } + + for (const auto& blob_file_addition : blob_file_additions_) { + PutVarint32(dst, kBlobFileAddition); + blob_file_addition.EncodeTo(dst); + } + + for (const auto& blob_file_garbage : blob_file_garbages_) { + PutVarint32(dst, kBlobFileGarbage); + blob_file_garbage.EncodeTo(dst); + } + + for (const auto& wal_addition : wal_additions_) { + PutVarint32(dst, kWalAddition2); + std::string encoded; + wal_addition.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); + } + + if (!wal_deletion_.IsEmpty()) { + PutVarint32(dst, kWalDeletion2); + std::string encoded; + wal_deletion_.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } // 0 is default and does not need to be explicitly written @@ -284,6 +272,11 @@ PutVarint32(dst, kInAtomicGroup); PutVarint32(dst, remaining_entries_); } + + if (HasFullHistoryTsLow()) { + PutVarint32(dst, kFullHistoryTsLow); + PutLengthPrefixedSlice(dst, full_history_ts_low_); + } return true; } @@ -319,9 +312,6 @@ uint64_t file_size = 0; SequenceNumber smallest_seqno = 0; SequenceNumber largest_seqno = kMaxSequenceNumber; - // Since this is the only forward-compatible part of the code, we hack new - // extension into this record. When we do, we set this boolean to distinguish - // the record from the normal NewFile records. 
if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && GetInternalKey(input, &f.largest) && @@ -335,6 +325,10 @@ return "new-file4 custom field"; } if (custom_tag == kTerminate) { + if (f.min_timestamp.size() != f.max_timestamp.size()) { + assert(false); + return "new-file4 custom field timestamp size mismatch error"; + } break; } if (!GetLengthPrefixedSlice(input, &field)) { @@ -385,6 +379,22 @@ return "invalid oldest blob file number"; } break; + case kTemperature: + if (field.size() != 1) { + return "temperature field wrong size"; + } else { + Temperature casted_field = static_cast(field[0]); + if (casted_field <= Temperature::kCold) { + f.temperature = casted_field; + } + } + break; + case kMinTimestamp: + f.min_timestamp = field.ToString(); + break; + case kMaxTimestamp: + f.max_timestamp = field.ToString(); + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -404,6 +414,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); +#ifndef NDEBUG + bool ignore_ignorable_tags = false; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags", + &ignore_ignorable_tags); +#endif Slice input = src; const char* msg = nullptr; uint32_t tag = 0; @@ -414,6 +429,11 @@ Slice str; InternalKey key; while (msg == nullptr && GetVarint32(&input, &tag)) { +#ifndef NDEBUG + if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) { + tag = kTagSafeIgnoreMask; + } +#endif switch (tag) { case kDbId: if (GetLengthPrefixedSlice(&input, &str)) { @@ -571,6 +591,86 @@ break; } + case kBlobFileAddition: + case kBlobFileAddition_DEPRECATED: { + BlobFileAddition blob_file_addition; + const Status s = blob_file_addition.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + AddBlobFile(std::move(blob_file_addition)); + break; + } + + case kBlobFileGarbage: + case kBlobFileGarbage_DEPRECATED: { + BlobFileGarbage 
blob_file_garbage; + const Status s = blob_file_garbage.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + AddBlobFileGarbage(std::move(blob_file_garbage)); + break; + } + + case kWalAddition: { + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalAddition2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalAddition not prefixed by length"; + break; + } + + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalDeletion: { + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kWalDeletion2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalDeletion not prefixed by length"; + break; + } + + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + case kColumnFamily: if (!GetVarint32(&input, &column_family_)) { if (!msg) { @@ -603,6 +703,16 @@ } break; + case kFullHistoryTsLow: + if (!GetLengthPrefixedSlice(&input, &str)) { + msg = "full_history_ts_low"; + } else if (str.empty()) { + msg = "full_history_ts_low: empty"; + } else { + full_history_ts_low_.assign(str.data(), str.size()); + } + break; + default: if (tag & kTagSafeIgnoreMask) { // Tag from future which can be safely ignored. 
@@ -691,15 +801,49 @@ r.append(" blob_file:"); AppendNumberTo(&r, f.oldest_blob_file_number); } + if (f.min_timestamp != kDisableUserTimestamp) { + assert(f.max_timestamp != kDisableUserTimestamp); + r.append(" min_timestamp:"); + r.append(Slice(f.min_timestamp).ToString(true)); + r.append(" max_timestamp:"); + r.append(Slice(f.max_timestamp).ToString(true)); + } r.append(" oldest_ancester_time:"); AppendNumberTo(&r, f.oldest_ancester_time); r.append(" file_creation_time:"); AppendNumberTo(&r, f.file_creation_time); r.append(" file_checksum:"); - r.append(f.file_checksum); + r.append(Slice(f.file_checksum).ToString(true)); r.append(" file_checksum_func_name: "); r.append(f.file_checksum_func_name); + if (f.temperature != Temperature::kUnknown) { + r.append(" temperature: "); + // Maybe change to human readable format whenthe feature becomes + // permanent + r.append(ToString(static_cast(f.temperature))); + } + } + + for (const auto& blob_file_addition : blob_file_additions_) { + r.append("\n BlobFileAddition: "); + r.append(blob_file_addition.DebugString()); + } + + for (const auto& blob_file_garbage : blob_file_garbages_) { + r.append("\n BlobFileGarbage: "); + r.append(blob_file_garbage.DebugString()); } + + for (const auto& wal_addition : wal_additions_) { + r.append("\n WalAddition: "); + r.append(wal_addition.DebugString()); + } + + if (!wal_deletion_.IsEmpty()) { + r.append("\n WalDeletion: "); + r.append(wal_deletion_.DebugString()); + } + r.append("\n ColumnFamily: "); AppendNumberTo(&r, column_family_); if (is_column_family_add_) { @@ -714,6 +858,10 @@ AppendNumberTo(&r, remaining_entries_); r.append(" entries remains"); } + if (HasFullHistoryTsLow()) { + r.append("\n FullHistoryTsLow: "); + r.append(Slice(full_history_ts_low_).ToString(hex_key)); + } r.append("\n}\n"); return r; } @@ -773,15 +921,81 @@ jw << "FileSize" << f.fd.GetFileSize(); jw << "SmallestIKey" << f.smallest.DebugString(hex_key); jw << "LargestIKey" << f.largest.DebugString(hex_key); + 
if (f.min_timestamp != kDisableUserTimestamp) { + assert(f.max_timestamp != kDisableUserTimestamp); + jw << "MinTimestamp" << Slice(f.min_timestamp).ToString(true); + jw << "MaxTimestamp" << Slice(f.max_timestamp).ToString(true); + } + jw << "OldestAncesterTime" << f.oldest_ancester_time; + jw << "FileCreationTime" << f.file_creation_time; + jw << "FileChecksum" << Slice(f.file_checksum).ToString(true); + jw << "FileChecksumFuncName" << f.file_checksum_func_name; + if (f.temperature != Temperature::kUnknown) { + jw << "temperature" << ToString(static_cast(f.temperature)); + } if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { jw << "OldestBlobFile" << f.oldest_blob_file_number; } + if (f.temperature != Temperature::kUnknown) { + // Maybe change to human readable format whenthe feature becomes + // permanent + jw << "Temperature" << static_cast(f.temperature); + } + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!blob_file_additions_.empty()) { + jw << "BlobFileAdditions"; + + jw.StartArray(); + + for (const auto& blob_file_addition : blob_file_additions_) { + jw.StartArrayedObject(); + jw << blob_file_addition; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!blob_file_garbages_.empty()) { + jw << "BlobFileGarbages"; + + jw.StartArray(); + + for (const auto& blob_file_garbage : blob_file_garbages_) { + jw.StartArrayedObject(); + jw << blob_file_garbage; jw.EndArrayedObject(); } jw.EndArray(); } + if (!wal_additions_.empty()) { + jw << "WalAdditions"; + + jw.StartArray(); + + for (const auto& wal_addition : wal_additions_) { + jw.StartArrayedObject(); + jw << wal_addition; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!wal_deletion_.IsEmpty()) { + jw << "WalDeletion"; + jw.StartObject(); + jw << wal_deletion_; + jw.EndObject(); + } + jw << "ColumnFamily" << column_family_; if (is_column_family_add_) { @@ -794,6 +1008,10 @@ jw << "AtomicGroup" << remaining_entries_; } + if (HasFullHistoryTsLow()) { + jw << 
"FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key); + } + jw.EndObject(); return jw.Get(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,24 +13,93 @@ #include #include #include + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_garbage.h" #include "db/dbformat.h" +#include "db/wal_edit.h" #include "memory/arena.h" +#include "rocksdb/advanced_options.h" #include "rocksdb/cache.h" #include "table/table_reader.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. The number should be forward compatible so +// users can down-grade RocksDB safely. A future Tag is ignored by doing '&' +// between Tag and kTagSafeIgnoreMask field. +enum Tag : uint32_t { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + // 8 was used for large value refs + kPrevLogNumber = 9, + kMinLogNumberToKeep = 10, + + // these are new formats divergent from open source leveldb + kNewFile2 = 100, + kNewFile3 = 102, + kNewFile4 = 103, // 4th (the latest) format version of adding files + kColumnFamily = 200, // specify column family for version edit + kColumnFamilyAdd = 201, + kColumnFamilyDrop = 202, + kMaxColumnFamily = 203, + + kInAtomicGroup = 300, + + kBlobFileAddition = 400, + kBlobFileGarbage, + + // Mask for an unidentified tag from the future which can be safely ignored. 
+ kTagSafeIgnoreMask = 1 << 13, + + // Forward compatible (aka ignorable) records + kDbId, + kBlobFileAddition_DEPRECATED, + kBlobFileGarbage_DEPRECATED, + kWalAddition, + kWalDeletion, + kFullHistoryTsLow, + kWalAddition2, + kWalDeletion2, +}; + +enum NewFileCustomTag : uint32_t { + kTerminate = 1, // The end of customized fields + kNeedCompaction = 2, + // Since Manifest is not entirely forward-compatible, we currently encode + // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed + // when manifest becomes forward-compatible. + kMinLogNumberToKeepHack = 3, + kOldestBlobFileNumber = 4, + kOldestAncesterTime = 5, + kFileCreationTime = 6, + kFileChecksum = 7, + kFileChecksumFuncName = 8, + kTemperature = 9, + kMinTimestamp = 10, + kMaxTimestamp = 11, + + // If this bit for the custom tag is set, opening DB should fail if + // we don't know this field. + kCustomTagNonSafeIgnoreMask = 1 << 6, + + // Forward incompatible (aka unignorable) fields + kPathId, +}; + class VersionSet; constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; -constexpr uint64_t kInvalidBlobFileNumber = 0; constexpr uint64_t kUnknownOldestAncesterTime = 0; constexpr uint64_t kUnknownFileCreationTime = 0; -extern const std::string kUnknownFileChecksum; -extern const std::string kUnknownFileChecksumFuncName; - extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); // A copyable structure contains information needed to read data from an SST @@ -123,6 +192,7 @@ bool marked_for_compaction = false; // True if client asked us nicely to // compact this file. + Temperature temperature = Temperature::kUnknown; // Used only in BlobDB. The file number of the oldest blob file this SST file // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. @@ -130,7 +200,7 @@ // The file could be the compaction output from other SST files, which could // in turn be outputs for compact older SST files. 
We track the memtable - // flush timestamp for the oldest SST file that eventaully contribute data + // flush timestamp for the oldest SST file that eventually contribute data // to this file. 0 means the information is not available. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; @@ -142,6 +212,10 @@ // File checksum function name std::string file_checksum_func_name = kUnknownFileChecksumFuncName; + // Min (oldest) timestamp of keys in this file + std::string min_timestamp; + // Max (newest) timestamp of keys in this file + std::string max_timestamp; FileMetaData() = default; @@ -149,18 +223,23 @@ const InternalKey& smallest_key, const InternalKey& largest_key, const SequenceNumber& smallest_seq, const SequenceNumber& largest_seq, bool marked_for_compact, - uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, - uint64_t _file_creation_time, const std::string& _file_checksum, - const std::string& _file_checksum_func_name) + Temperature _temperature, uint64_t oldest_blob_file, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + const std::string& _file_checksum, + const std::string& _file_checksum_func_name, + std::string _min_timestamp, std::string _max_timestamp) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), marked_for_compaction(marked_for_compact), + temperature(_temperature), oldest_blob_file_number(oldest_blob_file), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), file_checksum(_file_checksum), - file_checksum_func_name(_file_checksum_func_name) { + file_checksum_func_name(_file_checksum_func_name), + min_timestamp(std::move(_min_timestamp)), + max_timestamp(std::move(_max_timestamp)) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -307,16 +386,16 @@ bool HasLastSequence() const { return has_last_sequence_; } SequenceNumber GetLastSequence() const { return last_sequence_; } - // Delete the specified "file" 
from the specified "level". + // Delete the specified table file from the specified level. void DeleteFile(int level, uint64_t file) { deleted_files_.emplace(level, file); } - // Retrieve the files deleted as well as their associated levels. + // Retrieve the table files deleted as well as their associated levels. using DeletedFiles = std::set>; const DeletedFiles& GetDeletedFiles() const { return deleted_files_; } - // Add the specified file at the specified level. + // Add the specified table file at the specified level. // REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file @@ -325,29 +404,120 @@ uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno, bool marked_for_compaction, - uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, - uint64_t file_creation_time, const std::string& file_checksum, - const std::string& file_checksum_func_name) { + Temperature temperature, uint64_t oldest_blob_file_number, + uint64_t oldest_ancester_time, uint64_t file_creation_time, + const std::string& file_checksum, + const std::string& file_checksum_func_name, + const std::string& min_timestamp, + const std::string& max_timestamp) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( - level, FileMetaData(file, file_path_id, file_size, smallest, largest, - smallest_seqno, largest_seqno, - marked_for_compaction, oldest_blob_file_number, - oldest_ancester_time, file_creation_time, - file_checksum, file_checksum_func_name)); + level, + FileMetaData(file, file_path_id, file_size, smallest, largest, + smallest_seqno, largest_seqno, marked_for_compaction, + temperature, oldest_blob_file_number, oldest_ancester_time, + file_creation_time, file_checksum, file_checksum_func_name, + min_timestamp, 
max_timestamp)); + if (!HasLastSequence() || largest_seqno > GetLastSequence()) { + SetLastSequence(largest_seqno); + } } void AddFile(int level, const FileMetaData& f) { assert(f.fd.smallest_seqno <= f.fd.largest_seqno); new_files_.emplace_back(level, f); + if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) { + SetLastSequence(f.fd.largest_seqno); + } } - // Retrieve the files added as well as their associated levels. + // Retrieve the table files added as well as their associated levels. using NewFiles = std::vector>; const NewFiles& GetNewFiles() const { return new_files_; } + // Add a new blob file. + void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) { + blob_file_additions_.emplace_back( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value)); + } + + void AddBlobFile(BlobFileAddition blob_file_addition) { + blob_file_additions_.emplace_back(std::move(blob_file_addition)); + } + + // Retrieve all the blob files added. + using BlobFileAdditions = std::vector; + const BlobFileAdditions& GetBlobFileAdditions() const { + return blob_file_additions_; + } + + void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) { + assert(blob_file_additions_.empty()); + blob_file_additions_ = std::move(blob_file_additions); + } + + // Add garbage for an existing blob file. Note: intentionally broken English + // follows. + void AddBlobFileGarbage(uint64_t blob_file_number, + uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) { + blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + } + + void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) { + blob_file_garbages_.emplace_back(std::move(blob_file_garbage)); + } + + // Retrieve all the blob file garbage added. 
+ using BlobFileGarbages = std::vector; + const BlobFileGarbages& GetBlobFileGarbages() const { + return blob_file_garbages_; + } + + void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) { + assert(blob_file_garbages_.empty()); + blob_file_garbages_ = std::move(blob_file_garbages); + } + + // Add a WAL (either just created or closed). + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. + void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) { + assert(NumEntries() == wal_additions_.size()); + wal_additions_.emplace_back(number, std::move(metadata)); + } + + // Retrieve all the added WALs. + const WalAdditions& GetWalAdditions() const { return wal_additions_; } + + bool IsWalAddition() const { return !wal_additions_.empty(); } + + // Delete a WAL (either directly deleted or archived). + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. + void DeleteWalsBefore(WalNumber number) { + assert((NumEntries() == 1) == !wal_deletion_.IsEmpty()); + wal_deletion_ = WalDeletion(number); + } + + const WalDeletion& GetWalDeletion() const { return wal_deletion_; } + + bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); } + + bool IsWalManipulation() const { + size_t entries = NumEntries(); + return (entries > 0) && ((entries == wal_additions_.size()) || + (entries == !wal_deletion_.IsEmpty())); + } + // Number of edits - size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); } + size_t NumEntries() const { + return new_files_.size() + deleted_files_.size() + + blob_file_additions_.size() + blob_file_garbages_.size() + + wal_additions_.size() + !wal_deletion_.IsEmpty(); + } void SetColumnFamily(uint32_t column_family_id) { column_family_ = column_family_id; @@ -375,6 +545,10 @@ return is_column_family_add_ || is_column_family_drop_; } + bool IsColumnFamilyAdd() const { return is_column_family_add_; } + + bool IsColumnFamilyDrop() const { return is_column_family_drop_; } + void 
MarkAtomicGroup(uint32_t remaining_entries) { is_in_atomic_group_ = true; remaining_entries_ = remaining_entries; @@ -382,6 +556,16 @@ bool IsInAtomicGroup() const { return is_in_atomic_group_; } uint32_t GetRemainingEntries() const { return remaining_entries_; } + bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); } + const std::string& GetFullHistoryTsLow() const { + assert(HasFullHistoryTsLow()); + return full_history_ts_low_; + } + void SetFullHistoryTsLow(std::string full_history_ts_low) { + assert(!full_history_ts_low.empty()); + full_history_ts_low_ = std::move(full_history_ts_low); + } + // return true on success. bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); @@ -391,6 +575,11 @@ private: friend class ReactiveVersionSet; + friend class VersionEditHandlerBase; + friend class ListColumnFamiliesHandler; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class VersionSet; friend class Version; friend class AtomicGroupReadBuffer; @@ -421,6 +610,12 @@ DeletedFiles deleted_files_; NewFiles new_files_; + BlobFileAdditions blob_file_additions_; + BlobFileGarbages blob_file_garbages_; + + WalAdditions wal_additions_; + WalDeletion wal_deletion_; + // Each version edit record should have column_family_ set // If it's not set, it is default (0) uint32_t column_family_ = 0; @@ -433,6 +628,8 @@ bool is_in_atomic_group_ = false; uint32_t remaining_entries_ = 0; + + std::string full_history_ts_low_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,980 @@ +// Copyright (c) 2011-present, Facebook, 
Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit_handler.h" + +#include +#include + +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "logging/logging.h" +#include "monitoring/persistent_stats_history.h" + +namespace ROCKSDB_NAMESPACE { + +void VersionEditHandlerBase::Iterate(log::Reader& reader, + Status* log_read_status) { + Slice record; + std::string scratch; + assert(log_read_status); + assert(log_read_status->ok()); + + size_t recovered_edits = 0; + Status s = Initialize(); + while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() && + reader.ReadRecord(&record, &scratch) && log_read_status->ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + s = read_buffer_.AddEdit(&edit); + if (!s.ok()) { + break; + } + ColumnFamilyData* cfd = nullptr; + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyVersionEdit(e, &cfd); + if (!s.ok()) { + break; + } + ++recovered_edits; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + s = ApplyVersionEdit(edit, &cfd); + if (s.ok()) { + ++recovered_edits; + } + } + } + if (!log_read_status->ok()) { + s = *log_read_status; + } + + CheckIterationResult(reader, &s); + + if (!s.ok()) { + if (s.IsCorruption()) { + // when we find a Corruption error, something is + // wrong with the underlying file. 
in this case we + // want to report the filename, so in here we append + // the filename to the Corruption message + assert(reader.file()); + + // build a new error message + std::stringstream message; + // append previous dynamic state message + const char* state = s.getState(); + if (state != nullptr) { + message << state; + message << ' '; + } + // append the filename to the corruption message + message << "in file " << reader.file()->file_name(); + // overwrite the status with the extended status + s = Status(s.code(), s.subcode(), s.severity(), message.str()); + } + status_ = s; + } + TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish", + &recovered_edits); +} + +Status ListColumnFamiliesHandler::ApplyVersionEdit( + VersionEdit& edit, ColumnFamilyData** /*unused*/) { + Status s; + if (edit.is_column_family_add_) { + if (column_family_names_.find(edit.column_family_) != + column_family_names_.end()) { + s = Status::Corruption("Manifest adding the same column family twice"); + } else { + column_family_names_.insert( + {edit.column_family_, edit.column_family_name_}); + } + } else if (edit.is_column_family_drop_) { + if (column_family_names_.find(edit.column_family_) == + column_family_names_.end()) { + s = Status::Corruption("Manifest - dropping non-existing column family"); + } else { + column_family_names_.erase(edit.column_family_); + } + } + return s; +} + +Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) { + for (const auto& deleted_file : edit.GetDeletedFiles()) { + Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second); + if (!s.ok()) { + return s; + } + } + for (const auto& new_file : edit.GetNewFiles()) { + Status s = file_checksum_list_.InsertOneFileChecksum( + new_file.second.fd.GetNumber(), new_file.second.file_checksum, + new_file.second.file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + for (const auto& new_blob_file : edit.GetBlobFileAdditions()) { + 
std::string checksum_value = new_blob_file.GetChecksumValue(); + std::string checksum_method = new_blob_file.GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (checksum_method.empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + Status s = file_checksum_list_.InsertOneFileChecksum( + new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +VersionEditHandler::VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, const std::shared_ptr& io_tracer, + bool skip_load_table_files) + : VersionEditHandlerBase(), + read_only_(read_only), + column_families_(std::move(column_families)), + version_set_(version_set), + track_missing_files_(track_missing_files), + no_error_if_files_missing_(no_error_if_files_missing), + io_tracer_(io_tracer), + skip_load_table_files_(skip_load_table_files), + initialized_(false) { + assert(version_set_ != nullptr); +} + +Status VersionEditHandler::Initialize() { + Status s; + if (!initialized_) { + for (const auto& cf_desc : column_families_) { + name_to_options_.emplace(cf_desc.name, cf_desc.options); + } + auto default_cf_iter = name_to_options_.find(kDefaultColumnFamilyName); + if (default_cf_iter == name_to_options_.end()) { + s = Status::InvalidArgument("Default column family not specified"); + } + if (s.ok()) { + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* cfd = + CreateCfAndInit(default_cf_iter->second, default_cf_edit); + assert(cfd != nullptr); +#ifdef NDEBUG + (void)cfd; +#endif + initialized_ = true; + } + } + return s; +} + +Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) { + Status s; + if (edit.is_column_family_add_) { + s = 
OnColumnFamilyAdd(edit, cfd); + } else if (edit.is_column_family_drop_) { + s = OnColumnFamilyDrop(edit, cfd); + } else if (edit.IsWalAddition()) { + s = OnWalAddition(edit); + } else if (edit.IsWalDeletion()) { + s = OnWalDeletion(edit); + } else { + s = OnNonCfOperation(edit, cfd); + } + if (s.ok()) { + assert(cfd != nullptr); + s = ExtractInfoFromVersionEdit(*cfd, edit); + } + return s; +} + +Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit, + ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + Status s; + if (cf_in_builders || cf_in_not_found) { + s = Status::Corruption("MANIFEST adding the same column family twice: " + + edit.column_family_name_); + } + if (s.ok()) { + auto cf_options = name_to_options_.find(edit.column_family_name_); + // implicitly add persistent_stats column family without requiring user + // to specify + ColumnFamilyData* tmp_cfd = nullptr; + bool is_persistent_stats_column_family = + edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + if (cf_options == name_to_options_.end() && + !is_persistent_stats_column_family) { + column_families_not_found_.emplace(edit.column_family_, + edit.column_family_name_); + } else { + if (is_persistent_stats_column_family) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + tmp_cfd = CreateCfAndInit(cfo, edit); + } else { + tmp_cfd = CreateCfAndInit(cf_options->second, edit); + } + *cfd = tmp_cfd; + } + } + return s; +} + +Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit, + ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + ColumnFamilyData* tmp_cfd = nullptr; + Status s; + if (cf_in_builders) { + tmp_cfd = DestroyCfAndCleanup(edit); + } else if 
(cf_in_not_found) { + column_families_not_found_.erase(edit.column_family_); + } else { + s = Status::Corruption("MANIFEST - dropping non-existing column family"); + } + *cfd = tmp_cfd; + return s; +} + +Status VersionEditHandler::OnWalAddition(VersionEdit& edit) { + assert(edit.IsWalAddition()); + return version_set_->wals_.AddWals(edit.GetWalAdditions()); +} + +Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) { + assert(edit.IsWalDeletion()); + return version_set_->wals_.DeleteWalsBefore( + edit.GetWalDeletion().GetLogNumber()); +} + +Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit, + ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + Status s; + if (!cf_in_not_found) { + if (!cf_in_builders) { + s = Status::Corruption( + "MANIFEST record referencing unknown column family"); + } + ColumnFamilyData* tmp_cfd = nullptr; + if (s.ok()) { + auto builder_iter = builders_.find(edit.column_family_); + assert(builder_iter != builders_.end()); + tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily( + edit.column_family_); + assert(tmp_cfd != nullptr); + s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false); + if (s.ok()) { + s = builder_iter->second->version_builder()->Apply(&edit); + } + } + *cfd = tmp_cfd; + } + return s; +} + +// TODO maybe cache the computation result +bool VersionEditHandler::HasMissingFiles() const { + bool ret = false; + for (const auto& elem : cf_to_missing_files_) { + const auto& missing_files = elem.second; + if (!missing_files.empty()) { + ret = true; + break; + } + } + if (!ret) { + for (const auto& elem : cf_to_missing_blob_files_high_) { + if (elem.second != kInvalidBlobFileNumber) { + ret = true; + break; + } + } + } + return ret; +} + +void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit, + bool* cf_in_not_found, + bool* 
cf_in_builders) const { + assert(cf_in_not_found != nullptr); + assert(cf_in_builders != nullptr); + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. + bool in_not_found = column_families_not_found_.find(edit.column_family_) != + column_families_not_found_.end(); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool in_builders = builders_.find(edit.column_family_) != builders_.end(); + // They cannot both be true + assert(!(in_not_found && in_builders)); + *cf_in_not_found = in_not_found; + *cf_in_builders = in_builders; +} + +void VersionEditHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + assert(s != nullptr); + if (!s->ok()) { + // Do nothing here. + } else if (!version_edit_params_.has_log_number_ || + !version_edit_params_.has_next_file_number_ || + !version_edit_params_.has_last_sequence_) { + std::string msg("no "); + if (!version_edit_params_.has_log_number_) { + msg.append("log_file_number, "); + } + if (!version_edit_params_.has_next_file_number_) { + msg.append("next_file_number, "); + } + if (!version_edit_params_.has_last_sequence_) { + msg.append("last_sequence, "); + } + msg = msg.substr(0, msg.size() - 2); + msg.append(" entry in MANIFEST"); + *s = Status::Corruption(msg); + } + // There were some column families in the MANIFEST that weren't specified + // in the argument. 
This is OK in read_only mode + if (s->ok() && MustOpenAllColumnFamilies() && + !column_families_not_found_.empty()) { + std::string msg; + for (const auto& cf : column_families_not_found_) { + msg.append(", "); + msg.append(cf.second); + } + msg = msg.substr(2); + *s = Status::InvalidArgument("Column families not opened: " + msg); + } + if (s->ok()) { + version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily( + version_edit_params_.max_column_family_); + version_set_->MarkMinLogNumberToKeep( + version_edit_params_.min_log_number_to_keep_); + version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_); + version_set_->MarkFileNumberUsed(version_edit_params_.log_number_); + for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + auto* builder = builder_iter->second->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + *s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + if (s->ok()) { + for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } + if (read_only_) { + cfd->table_cache()->SetTablesAreImmortal(); + } + *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false, + /*is_initial_load=*/true); + if (!s->ok()) { + // If s is IOError::PathNotFound, then we mark the db as corrupted. 
+ if (s->IsPathNotFound()) { + *s = Status::Corruption("Corruption: " + s->ToString()); + } + break; + } + } + } + if (s->ok()) { + for (auto* cfd : *(version_set_->column_family_set_)) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + VersionEdit edit; + *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true); + if (!s->ok()) { + break; + } + } + } + if (s->ok()) { + version_set_->manifest_file_size_ = reader.GetReadOffset(); + assert(version_set_->manifest_file_size_ > 0); + version_set_->next_file_number_.store( + version_edit_params_.next_file_number_ + 1); + SequenceNumber last_seq = version_edit_params_.last_sequence_; + assert(last_seq != kMaxSequenceNumber); + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_allocated_sequence_.load()) { + version_set_->last_allocated_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_published_sequence_.load()) { + version_set_->last_published_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_sequence_.load()) { + version_set_->last_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->descriptor_last_sequence_) { + // This is the maximum last sequence of all `VersionEdit`s iterated. It + // may be greater than the maximum `largest_seqno` of all files in case + // the newest data referred to by the MANIFEST has been dropped or had its + // sequence number zeroed through compaction. 
+ version_set_->descriptor_last_sequence_ = last_seq; + } + version_set_->prev_log_number_ = version_edit_params_.prev_log_number_; + } +} + +ColumnFamilyData* VersionEditHandler::CreateCfAndInit( + const ColumnFamilyOptions& cf_options, const VersionEdit& edit) { + ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit); + assert(cfd != nullptr); + cfd->set_initialized(); + assert(builders_.find(edit.column_family_) == builders_.end()); + builders_.emplace(edit.column_family_, + VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd))); + if (track_missing_files_) { + cf_to_missing_files_.emplace(edit.column_family_, + std::unordered_set()); + cf_to_missing_blob_files_high_.emplace(edit.column_family_, + kInvalidBlobFileNumber); + } + return cfd; +} + +ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup( + const VersionEdit& edit) { + auto builder_iter = builders_.find(edit.column_family_); + assert(builder_iter != builders_.end()); + builders_.erase(builder_iter); + if (track_missing_files_) { + auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_); + assert(missing_files_iter != cf_to_missing_files_.end()); + cf_to_missing_files_.erase(missing_files_iter); + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(edit.column_family_); + assert(missing_blob_files_high_iter != + cf_to_missing_blob_files_high_.end()); + cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter); + } + ColumnFamilyData* ret = + version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_); + assert(ret != nullptr); + ret->SetDropped(); + ret->UnrefAndTryDelete(); + ret = nullptr; + return ret; +} + +Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, + ColumnFamilyData* cfd, + bool force_create_version) { + assert(cfd->initialized()); + Status s; + if (force_create_version) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + auto* 
builder = builder_iter->second->version_builder(); + auto* v = new Version(cfd, version_set_, version_set_->file_options_, + *cfd->GetLatestMutableCFOptions(), io_tracer_, + version_set_->current_version_number_++); + s = builder->SaveTo(v->storage_info()); + if (s.ok()) { + // Install new version + v->PrepareApply( + *cfd->GetLatestMutableCFOptions(), + !(version_set_->db_options_->skip_stats_update_on_db_open)); + version_set_->AppendVersion(cfd, v); + } else { + delete v; + } + } + return s; +} + +Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load) { + bool skip_load_table_files = skip_load_table_files_; + TEST_SYNC_POINT_CALLBACK( + "VersionEditHandler::LoadTables:skip_load_table_files", + &skip_load_table_files); + if (skip_load_table_files) { + return Status::OK(); + } + assert(cfd != nullptr); + assert(!cfd->IsDropped()); + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + assert(builder_iter->second != nullptr); + VersionBuilder* builder = builder_iter->second->version_builder(); + assert(builder); + Status s = builder->LoadTableHandlers( + cfd->internal_stats(), + version_set_->db_options_->max_file_opening_threads, + prefetch_index_and_filter_in_cache, is_initial_load, + cfd->GetLatestMutableCFOptions()->prefix_extractor, + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { + s = Status::OK(); + } + if (!s.ok() && !version_set_->db_options_->paranoid_checks) { + s = Status::OK(); + } + return s; +} + +Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& edit) { + Status s; + if (edit.has_db_id_) { + version_set_->db_id_ = edit.GetDbId(); + version_edit_params_.SetDBId(edit.db_id_); + } + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + ROCKS_LOG_WARN( 
+ version_set_->db_options()->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + version_edit_params_.SetLogNumber(edit.log_number_); + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + if (!cf_to_cmp_names_) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } else { + cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_); + } + } + if (edit.HasFullHistoryTsLow()) { + const std::string& new_ts = edit.GetFullHistoryTsLow(); + cfd->SetFullHistoryTsLow(new_ts); + } + } + + if (s.ok()) { + if (edit.has_prev_log_number_) { + version_edit_params_.SetPrevLogNumber(edit.prev_log_number_); + } + if (edit.has_next_file_number_) { + version_edit_params_.SetNextFile(edit.next_file_number_); + } + if (edit.has_max_column_family_) { + version_edit_params_.SetMaxColumnFamily(edit.max_column_family_); + } + if (edit.has_min_log_number_to_keep_) { + version_edit_params_.min_log_number_to_keep_ = + std::max(version_edit_params_.min_log_number_to_keep_, + edit.min_log_number_to_keep_); + } + if (edit.has_last_sequence_) { + // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This + // is legacy behavior that cannot change without breaking downgrade + // compatibility. 
+ assert(!version_edit_params_.has_last_sequence_ || + version_edit_params_.last_sequence_ <= edit.last_sequence_); + version_edit_params_.SetLastSequence(edit.last_sequence_); + } + if (!version_edit_params_.has_prev_log_number_) { + version_edit_params_.SetPrevLogNumber(0); + } + } + return s; +} + +VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( + bool read_only, std::vector column_families, + VersionSet* version_set, const std::shared_ptr& io_tracer) + : VersionEditHandler(read_only, column_families, version_set, + /*track_missing_files=*/true, + /*no_error_if_files_missing=*/true, io_tracer) {} + +VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { + for (const auto& elem : versions_) { + delete elem.second; + } + versions_.clear(); +} + +void VersionEditHandlerPointInTime::CheckIterationResult( + const log::Reader& reader, Status* s) { + VersionEditHandler::CheckIterationResult(reader, s); + assert(s != nullptr); + if (s->ok()) { + for (auto* cfd : *(version_set_->column_family_set_)) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + auto v_iter = versions_.find(cfd->GetID()); + if (v_iter != versions_.end()) { + assert(v_iter->second != nullptr); + + version_set_->AppendVersion(cfd, v_iter->second); + versions_.erase(v_iter); + } + } + } else { + for (const auto& elem : versions_) { + delete elem.second; + } + versions_.clear(); + } +} + +ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup( + const VersionEdit& edit) { + ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit); + auto v_iter = versions_.find(edit.column_family_); + if (v_iter != versions_.end()) { + delete v_iter->second; + versions_.erase(v_iter); + } + return cfd; +} + +Status VersionEditHandlerPointInTime::MaybeCreateVersion( + const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) { + assert(cfd != nullptr); + if (!force_create_version) { + assert(edit.column_family_ == cfd->GetID()); 
+ } + auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID()); + assert(missing_files_iter != cf_to_missing_files_.end()); + std::unordered_set& missing_files = missing_files_iter->second; + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(cfd->GetID()); + assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end()); + const uint64_t prev_missing_blob_file_high = + missing_blob_files_high_iter->second; + + VersionBuilder* builder = nullptr; + + if (prev_missing_blob_file_high != kInvalidBlobFileNumber) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + } + + // At this point, we have not yet applied the new version edits read from the + // MANIFEST. We check whether we have any missing table and blob files. + const bool prev_has_missing_files = + !missing_files.empty() || + (prev_missing_blob_file_high != kInvalidBlobFileNumber && + prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber()); + + for (const auto& file : edit.GetDeletedFiles()) { + uint64_t file_num = file.second; + auto fiter = missing_files.find(file_num); + if (fiter != missing_files.end()) { + missing_files.erase(fiter); + } + } + + assert(!cfd->ioptions()->cf_paths.empty()); + Status s; + for (const auto& elem : edit.GetNewFiles()) { + const FileMetaData& meta = elem.second; + const FileDescriptor& fd = meta.fd; + uint64_t file_num = fd.GetNumber(); + const std::string fpath = + MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); + s = VerifyFile(fpath, meta); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + missing_files.insert(file_num); + s = Status::OK(); + } else if (!s.ok()) { + break; + } + } + + uint64_t missing_blob_file_num = prev_missing_blob_file_high; + for (const auto& elem : edit.GetBlobFileAdditions()) { + uint64_t file_num = elem.GetBlobFileNumber(); + s = 
VerifyBlobFile(cfd, file_num, elem); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + missing_blob_file_num = std::max(missing_blob_file_num, file_num); + s = Status::OK(); + } else if (!s.ok()) { + break; + } + } + + bool has_missing_blob_files = false; + if (missing_blob_file_num != kInvalidBlobFileNumber && + missing_blob_file_num >= prev_missing_blob_file_high) { + missing_blob_files_high_iter->second = missing_blob_file_num; + has_missing_blob_files = true; + } else if (missing_blob_file_num < prev_missing_blob_file_high) { + assert(false); + } + + // We still have not applied the new version edit, but have tried to add new + // table and blob files after verifying their presence and consistency. + // Therefore, we know whether we will see new missing table and blob files + // later after actually applying the version edit. We perform the check here + // and record the result. + const bool has_missing_files = + !missing_files.empty() || has_missing_blob_files; + + bool missing_info = !version_edit_params_.has_log_number_ || + !version_edit_params_.has_next_file_number_ || + !version_edit_params_.has_last_sequence_; + + // Create version before apply edit. The version will represent the state + // before applying the version edit. + // A new version will created if: + // 1) no error has occurred so far, and + // 2) log_number_, next_file_number_ and last_sequence_ are known, and + // 3) any of the following: + // a) no missing file before, but will have missing file(s) after applying + // this version edit. + // b) no missing file after applying the version edit, and the caller + // explicitly request that a new version be created. 
+ if (s.ok() && !missing_info && + ((has_missing_files && !prev_has_missing_files) || + (!has_missing_files && force_create_version))) { + if (!builder) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder); + } + + auto* version = new Version(cfd, version_set_, version_set_->file_options_, + *cfd->GetLatestMutableCFOptions(), io_tracer_, + version_set_->current_version_number_++); + s = builder->SaveTo(version->storage_info()); + if (s.ok()) { + version->PrepareApply( + *cfd->GetLatestMutableCFOptions(), + !version_set_->db_options_->skip_stats_update_on_db_open); + auto v_iter = versions_.find(cfd->GetID()); + if (v_iter != versions_.end()) { + delete v_iter->second; + v_iter->second = version; + } else { + versions_.emplace(cfd->GetID(), version); + } + } else { + delete version; + } + } + return s; +} + +Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + return version_set_->VerifyFileMetadata(fpath, fmeta); +} + +Status VersionEditHandlerPointInTime::VerifyBlobFile( + ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition) { + BlobFileCache* blob_file_cache = cfd->blob_file_cache(); + assert(blob_file_cache); + CacheHandleGuard blob_file_reader; + Status s = + blob_file_cache->GetBlobFileReader(blob_file_num, &blob_file_reader); + if (!s.ok()) { + return s; + } + // TODO: verify checksum + (void)blob_addition; + return s; +} + +Status ManifestTailer::Initialize() { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::Initialize(); + } + assert(Mode::kCatchUp == mode_); + Status s; + if (!initialized_) { + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* default_cfd = cfd_set->GetDefault(); + assert(default_cfd); + auto builder_iter = builders_.find(default_cfd->GetID()); + assert(builder_iter != 
builders_.end()); + + Version* dummy_version = default_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(default_cfd, base_version)); + builder_iter->second = std::move(new_builder); + + initialized_ = true; + } + return s; +} + +Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) { + Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd); + if (s.ok()) { + assert(cfd); + if (*cfd) { + cfds_changed_.insert(*cfd); + } + } + return s; +} + +Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit, + ColumnFamilyData** cfd) { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::OnColumnFamilyAdd(edit, cfd); + } + assert(Mode::kCatchUp == mode_); + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily()); + assert(cfd); + *cfd = tmp_cfd; + if (!tmp_cfd) { + // For now, ignore new column families created after Recover() succeeds. 
+ return Status::OK(); + } + auto builder_iter = builders_.find(edit.GetColumnFamily()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = tmp_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(tmp_cfd, base_version)); + builder_iter->second = std::move(new_builder); + +#ifndef NDEBUG + auto version_iter = versions_.find(edit.GetColumnFamily()); + assert(version_iter == versions_.end()); +#endif // !NDEBUG + return Status::OK(); +} + +void ManifestTailer::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandlerPointInTime::CheckIterationResult(reader, s); + assert(s); + if (s->ok()) { + if (Mode::kRecovery == mode_) { + mode_ = Mode::kCatchUp; + } else { + assert(Mode::kCatchUp == mode_); + } + } +} + +Status ManifestTailer::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta); + // TODO: Open file or create hard link to prevent the file from being + // deleted. 
+ return s; +} + +void DumpManifestHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandler::CheckIterationResult(reader, s); + if (!s->ok()) { + fprintf(stdout, "%s\n", s->ToString().c_str()); + return; + } + assert(cf_to_cmp_names_); + for (auto* cfd : *(version_set_->column_family_set_)) { + fprintf(stdout, + "--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber()); + auto it = cf_to_cmp_names_->find(cfd->GetID()); + if (it != cf_to_cmp_names_->end()) { + fprintf(stdout, + "comparator: <%s>, but the comparator object is not available.\n", + it->second.c_str()); + } else { + fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name()); + } + assert(cfd->current()); + + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char), + cfd->current()->DebugString(hex_).size(), stdout); + } + fprintf(stdout, + "next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep %" PRIu64 "\n", + version_set_->current_next_file_number(), + version_set_->LastSequence(), version_set_->prev_log_number(), + version_set_->column_family_set_->GetMaxColumnFamily(), + version_set_->min_log_number_to_keep()); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,309 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/version_builder.h" +#include "db/version_edit.h" +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +struct FileMetaData; + +class VersionEditHandlerBase { + public: + explicit VersionEditHandlerBase() + : max_manifest_read_size_(std::numeric_limits::max()) {} + + virtual ~VersionEditHandlerBase() {} + + void Iterate(log::Reader& reader, Status* log_read_status); + + const Status& status() const { return status_; } + + AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; } + + protected: + explicit VersionEditHandlerBase(uint64_t max_read_size) + : max_manifest_read_size_(max_read_size) {} + virtual Status Initialize() { return Status::OK(); } + + virtual Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) = 0; + + virtual void CheckIterationResult(const log::Reader& /*reader*/, + Status* /*s*/) {} + + void ClearReadBuffer() { read_buffer_.Clear(); } + + Status status_; + + private: + AtomicGroupReadBuffer read_buffer_; + const uint64_t max_manifest_read_size_; +}; + +class ListColumnFamiliesHandler : public VersionEditHandlerBase { + public: + ListColumnFamiliesHandler() : VersionEditHandlerBase() {} + + ~ListColumnFamiliesHandler() override {} + + const std::map GetColumnFamilyNames() const { + return column_family_names_; + } + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + // default column family is always implicitly there + std::map column_family_names_{ + {0, kDefaultColumnFamilyName}}; +}; + +class 
FileChecksumRetriever : public VersionEditHandlerBase { + public: + FileChecksumRetriever(uint64_t max_read_size, + FileChecksumList& file_checksum_list) + : VersionEditHandlerBase(max_read_size), + file_checksum_list_(file_checksum_list) {} + + ~FileChecksumRetriever() override {} + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + FileChecksumList& file_checksum_list_; +}; + +using VersionBuilderUPtr = std::unique_ptr; + +// A class used for scanning MANIFEST file. +// VersionEditHandler reads a MANIFEST file, parses the version edits, and +// builds the version set's in-memory state, e.g. the version storage info for +// the versions of column families. +// To use this class and its subclasses, +// 1. Create an object of VersionEditHandler or its subclasses. +// VersionEditHandler handler(read_only, column_families, version_set, +// track_missing_files, +// no_error_if_files_missing); +// 2. Status s = handler.Iterate(reader, &db_id); +// 3. Check s and handle possible errors. +// +// Not thread-safe, external synchronization is necessary if an object of +// VersionEditHandler is shared by multiple threads. 
+class VersionEditHandler : public VersionEditHandlerBase { + public: + explicit VersionEditHandler( + bool read_only, + const std::vector& column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer) + : VersionEditHandler(read_only, column_families, version_set, + track_missing_files, no_error_if_files_missing, + io_tracer, /*skip_load_table_files=*/false) {} + + ~VersionEditHandler() override {} + + const VersionEditParams& GetVersionEditParams() const { + return version_edit_params_; + } + + bool HasMissingFiles() const; + + void GetDbId(std::string* db_id) const { + if (db_id && version_edit_params_.has_db_id_) { + *db_id = version_edit_params_.db_id_; + } + } + + protected: + explicit VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer, bool skip_load_table_files); + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd); + + Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd); + + Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd); + + Status OnWalAddition(VersionEdit& edit); + + Status OnWalDeletion(VersionEdit& edit); + + Status Initialize() override; + + void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found, + bool* cf_in_builders) const; + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options, + const VersionEdit& edit); + + virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit); + + virtual Status MaybeCreateVersion(const VersionEdit& edit, + ColumnFamilyData* cfd, + bool force_create_version); + + Status LoadTables(ColumnFamilyData* cfd, + bool 
prefetch_index_and_filter_in_cache, + bool is_initial_load); + + virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } + + const bool read_only_; + std::vector column_families_; + VersionSet* version_set_; + std::unordered_map builders_; + std::unordered_map name_to_options_; + // Keeps track of column families in manifest that were not found in + // column families parameters. if those column families are not dropped + // by subsequent manifest records, Recover() will return failure status. + std::unordered_map column_families_not_found_; + VersionEditParams version_edit_params_; + const bool track_missing_files_; + std::unordered_map> + cf_to_missing_files_; + std::unordered_map cf_to_missing_blob_files_high_; + bool no_error_if_files_missing_; + std::shared_ptr io_tracer_; + bool skip_load_table_files_; + bool initialized_; + std::unique_ptr> cf_to_cmp_names_; + + private: + Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& edit); +}; + +// A class similar to its base class, i.e. VersionEditHandler. +// VersionEditHandlerPointInTime restores the versions to the most recent point +// in time such that at this point, the version does not have missing files. +// +// Not thread-safe, external synchronization is necessary if an object of +// VersionEditHandlerPointInTime is shared by multiple threads. 
+class VersionEditHandlerPointInTime : public VersionEditHandler { + public: + VersionEditHandlerPointInTime( + bool read_only, std::vector column_families, + VersionSet* version_set, const std::shared_ptr& io_tracer); + ~VersionEditHandlerPointInTime() override; + + protected: + void CheckIterationResult(const log::Reader& reader, Status* s) override; + ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; + Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, + bool force_create_version) override; + virtual Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta); + virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition); + + std::unordered_map versions_; +}; + +class ManifestTailer : public VersionEditHandlerPointInTime { + public: + explicit ManifestTailer(std::vector column_families, + VersionSet* version_set, + const std::shared_ptr& io_tracer) + : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, + version_set, io_tracer), + mode_(Mode::kRecovery) {} + + void PrepareToReadNewManifest() { + initialized_ = false; + ClearReadBuffer(); + } + + std::unordered_set& GetUpdatedColumnFamilies() { + return cfds_changed_; + } + + protected: + Status Initialize() override; + + bool MustOpenAllColumnFamilies() const override { return false; } + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override; + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) override; + + enum Mode : uint8_t { + kRecovery = 0, + kCatchUp = 1, + }; + + Mode mode_; + std::unordered_set cfds_changed_; +}; + +class DumpManifestHandler : public VersionEditHandler { + public: + DumpManifestHandler(std::vector column_families, + VersionSet* version_set, + const 
std::shared_ptr& io_tracer, bool verbose, + bool hex, bool json) + : VersionEditHandler( + /*read_only=*/true, column_families, version_set, + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer, + /*skip_load_table_files=*/true), + verbose_(verbose), + hex_(hex), + json_(json), + count_(0) { + cf_to_cmp_names_.reset(new std::unordered_map()); + } + + ~DumpManifestHandler() override {} + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override { + // Write out each individual edit + if (verbose_ && !json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } else if (json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } + ++count_; + return VersionEditHandler::ApplyVersionEdit(edit, cfd); + } + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + private: + const bool verbose_; + const bool hex_; + const bool json_; + int count_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,9 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_edit.h" + +#include "rocksdb/advanced_options.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -36,8 +40,9 @@ edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), - kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber, - 888, 678, "234", "crc32c"); + kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, + kInvalidBlobFileNumber, 888, 678, "234", "crc32c", "123", + "345"); edit.DeleteFile(4, kBig + 700 + i); } @@ -54,23 +59,27 @@ VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true, kInvalidBlobFileNumber, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", + "234"); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false, kInvalidBlobFileNumber, + kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "345", + "543"); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, - kBig + 602, true, kInvalidBlobFileNumber, 666, 888, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, + 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + "456", "567"); edit.AddFile(5, 303, 0, 
100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, - kBig + 603, true, 1001, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + kBig + 603, true, Temperature::kUnknown, 1001, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "678", + "789"); ; edit.DeleteFile(4, 700); @@ -102,6 +111,14 @@ ASSERT_EQ(kInvalidBlobFileNumber, new_files[2].second.oldest_blob_file_number); ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number); + ASSERT_EQ("123", new_files[0].second.min_timestamp); + ASSERT_EQ("234", new_files[0].second.max_timestamp); + ASSERT_EQ("345", new_files[1].second.min_timestamp); + ASSERT_EQ("543", new_files[1].second.max_timestamp); + ASSERT_EQ("456", new_files[2].second.min_timestamp); + ASSERT_EQ("567", new_files[2].second.max_timestamp); + ASSERT_EQ("678", new_files[3].second.min_timestamp); + ASSERT_EQ("789", new_files[3].second.max_timestamp); } TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { @@ -109,13 +126,15 @@ VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true, kInvalidBlobFileNumber, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", + "234"); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false, kInvalidBlobFileNumber, 686, 868, "234", - "crc32c"); + kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, + 686, 868, "234", "crc32c", kDisableUserTimestamp, + kDisableUserTimestamp); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -154,6 
+173,10 @@ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); ASSERT_EQ(1u, parsed.GetDeletedFiles().size()); + ASSERT_EQ("123", new_files[0].second.min_timestamp); + ASSERT_EQ("234", new_files[0].second.max_timestamp); + ASSERT_EQ(kDisableUserTimestamp, new_files[1].second.min_timestamp); + ASSERT_EQ(kDisableUserTimestamp, new_files[1].second.max_timestamp); } TEST_F(VersionEditTest, NewFile4NotSupportedField) { @@ -161,9 +184,10 @@ VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true, kInvalidBlobFileNumber, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -191,9 +215,10 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { VersionEdit edit; edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } @@ -278,6 +303,314 @@ TestEncodeDecode(edit); } +TEST_F(VersionEditTest, BlobFileAdditionAndGarbage) { + VersionEdit edit; + + const std::string checksum_method_prefix = "Hash"; + const std::string checksum_value_prefix = "Value"; + + for (uint64_t blob_file_number = 1; blob_file_number <= 10; + ++blob_file_number) { + const uint64_t total_blob_count = blob_file_number << 10; + const uint64_t total_blob_bytes = 
blob_file_number << 20; + + std::string checksum_method(checksum_method_prefix); + AppendNumberTo(&checksum_method, blob_file_number); + + std::string checksum_value(checksum_value_prefix); + AppendNumberTo(&checksum_value, blob_file_number); + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + const uint64_t garbage_blob_count = total_blob_count >> 2; + const uint64_t garbage_blob_bytes = total_blob_bytes >> 1; + + edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + } + + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, AddWalEncodeDecode) { + VersionEdit edit; + for (uint64_t log_number = 1; log_number <= 20; log_number++) { + WalMetadata meta; + bool has_size = rand() % 2 == 0; + if (has_size) { + meta.SetSyncedSizeInBytes(rand() % 1000); + } + edit.AddWal(log_number, meta); + } + TestEncodeDecode(edit); +} + +static std::string PrefixEncodedWalAdditionWithLength( + const std::string& encoded) { + std::string ret; + PutVarint32(&ret, Tag::kWalAddition2); + PutLengthPrefixedSlice(&ret, encoded); + return ret; +} + +TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { + std::string encoded; + + { + // No log number. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != + std::string::npos) + << s.ToString(); + } + + { + // log number should be varint64, + // but we only encode 128 which is not a valid representation of varint64. 
+ char c = 0; + unsigned char* ptr = reinterpret_cast(&c); + *ptr = 128; + encoded.append(1, c); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionEditTest, AddWalDecodeBadTag) { + constexpr WalNumber kLogNumber = 100; + constexpr uint64_t kSizeInBytes = 100; + + std::string encoded; + PutVarint64(&encoded, kLogNumber); + + { + // No tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) + << s.ToString(); + } + + { + // Only has size tag, no terminate tag. + std::string encoded_with_size = encoded; + PutVarint32(&encoded_with_size, + static_cast(WalAdditionTag::kSyncedSize)); + PutVarint64(&encoded_with_size, kSizeInBytes); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_size); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) + << s.ToString(); + } + + { + // Only has terminate tag. 
+ std::string encoded_with_terminate = encoded; + PutVarint32(&encoded_with_terminate, + static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_terminate); + VersionEdit edit; + ASSERT_OK(edit.DecodeFrom(encoded_edit)); + auto& wal_addition = edit.GetWalAdditions()[0]; + ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber); + ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize()); + } +} + +TEST_F(VersionEditTest, AddWalDecodeNoSize) { + constexpr WalNumber kLogNumber = 100; + + std::string encoded; + PutVarint64(&encoded, kLogNumber); + PutVarint32(&encoded, static_cast(WalAdditionTag::kSyncedSize)); + // No real size after the size tag. + + { + // Without terminate tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") != + std::string::npos) + << s.ToString(); + } + + { + // With terminate tag. + PutVarint32(&encoded, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + // The terminate tag is misunderstood as the size. 
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionEditTest, AddWalDebug) { + constexpr int n = 2; + constexpr std::array kLogNumbers{{10, 20}}; + constexpr std::array kSizeInBytes{{100, 200}}; + + VersionEdit edit; + for (int i = 0; i < n; i++) { + edit.AddWal(kLogNumbers[i], WalMetadata(kSizeInBytes[i])); + } + + const WalAdditions& wals = edit.GetWalAdditions(); + + ASSERT_TRUE(edit.IsWalAddition()); + ASSERT_EQ(wals.size(), n); + for (int i = 0; i < n; i++) { + const WalAddition& wal = wals[i]; + ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]); + ASSERT_EQ(wal.GetMetadata().GetSyncedSizeInBytes(), kSizeInBytes[i]); + } + + std::string expected_str = "VersionEdit {\n"; + for (int i = 0; i < n; i++) { + std::stringstream ss; + ss << " WalAddition: log_number: " << kLogNumbers[i] + << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n"; + expected_str += ss.str(); + } + expected_str += " ColumnFamily: 0\n}\n"; + ASSERT_EQ(edit.DebugString(true), expected_str); + + std::string expected_json = "{\"EditNumber\": 4, \"WalAdditions\": ["; + for (int i = 0; i < n; i++) { + std::stringstream ss; + ss << "{\"LogNumber\": " << kLogNumbers[i] << ", " + << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}"; + if (i < n - 1) ss << ", "; + expected_json += ss.str(); + } + expected_json += "], \"ColumnFamily\": 0}"; + ASSERT_EQ(edit.DebugJSON(4, true), expected_json); +} + +TEST_F(VersionEditTest, DeleteWalEncodeDecode) { + VersionEdit edit; + edit.DeleteWalsBefore(rand() % 100); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, DeleteWalDebug) { + constexpr int n = 2; + constexpr std::array kLogNumbers{{10, 20}}; + + VersionEdit edit; + edit.DeleteWalsBefore(kLogNumbers[n - 1]); + + const WalDeletion& wal = edit.GetWalDeletion(); + + ASSERT_TRUE(edit.IsWalDeletion()); + ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[n - 1]); + + std::string expected_str = "VersionEdit {\n"; + { + std::stringstream ss; + ss << 
" WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n"; + expected_str += ss.str(); + } + expected_str += " ColumnFamily: 0\n}\n"; + ASSERT_EQ(edit.DebugString(true), expected_str); + + std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": "; + { + std::stringstream ss; + ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}"; + expected_json += ss.str(); + } + expected_json += ", \"ColumnFamily\": 0}"; + ASSERT_EQ(edit.DebugJSON(4, true), expected_json); +} + +TEST_F(VersionEditTest, FullHistoryTsLow) { + VersionEdit edit; + ASSERT_FALSE(edit.HasFullHistoryTsLow()); + std::string ts = test::EncodeInt(0); + edit.SetFullHistoryTsLow(ts); + TestEncodeDecode(edit); +} + +// Tests that if RocksDB is downgraded, the new types of VersionEdits +// that have a tag larger than kTagSafeIgnoreMask can be safely ignored. +TEST_F(VersionEditTest, IgnorableTags) { + SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) { + bool* ignore = static_cast(arg); + *ignore = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t kPrevLogNumber = 100; + constexpr uint64_t kLogNumber = 200; + constexpr uint64_t kNextFileNumber = 300; + constexpr uint64_t kColumnFamilyId = 400; + + VersionEdit edit; + // Add some ignorable entries. + for (int i = 0; i < 2; i++) { + edit.AddWal(i + 1, WalMetadata(i + 2)); + } + edit.SetDBId("db_id"); + // Add unignorable entries. + edit.SetPrevLogNumber(kPrevLogNumber); + edit.SetLogNumber(kLogNumber); + // Add more ignorable entries. + edit.DeleteWalsBefore(100); + // Add unignorable entry. + edit.SetNextFile(kNextFileNumber); + // Add more ignorable entries. + edit.SetFullHistoryTsLow("ts"); + // Add unignorable entry. + edit.SetColumnFamily(kColumnFamilyId); + + std::string encoded; + ASSERT_TRUE(edit.EncodeTo(&encoded)); + + VersionEdit decoded; + ASSERT_OK(decoded.DecodeFrom(encoded)); + + // Check that all ignorable entries are ignored. 
+ ASSERT_FALSE(decoded.HasDbId()); + ASSERT_FALSE(decoded.HasFullHistoryTsLow()); + ASSERT_FALSE(decoded.IsWalAddition()); + ASSERT_FALSE(decoded.IsWalDeletion()); + ASSERT_TRUE(decoded.GetWalAdditions().empty()); + ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty()); + + // Check that unignorable entries are still present. + ASSERT_EQ(edit.GetPrevLogNumber(), kPrevLogNumber); + ASSERT_EQ(edit.GetLogNumber(), kLogNumber); + ASSERT_EQ(edit.GetNextFile(), kNextFileNumber); + ASSERT_EQ(edit.GetColumnFamily(), kColumnFamilyId); + + SyncPoint::GetInstance()->DisableProcessing(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,17 +9,24 @@ #include "db/version_set.h" -#include #include #include #include +#include #include #include #include #include #include #include -#include "compaction/compaction.h" + +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/compaction/compaction.h" +#include "db/compaction/file_pri.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -29,13 +36,16 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "db/version_edit_handler.h" #include "file/filename.h" #include "file/random_access_file_reader.h" #include "file/read_write_util.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/persistent_stats_history.h" +#include 
"options/options_helper.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -49,6 +59,7 @@ #include "table/table_reader.h" #include "table/two_level_iterator.h" #include "test_util/sync_point.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -88,9 +99,9 @@ *overlap = false; if (iter->Valid()) { ParsedInternalKey seek_result; - if (!ParseInternalKey(iter->key(), &seek_result)) { - return Status::Corruption("DB have corrupted keys"); - } + Status s = ParseInternalKey(iter->key(), &seek_result, + false /* log_err_key */); // TODO + if (!s.ok()) return s; if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= 0) { @@ -109,10 +120,9 @@ // are MergeInProgress). class FilePicker { public: - FilePicker(std::vector* files, const Slice& user_key, - const Slice& ikey, autovector* file_levels, - unsigned int num_levels, FileIndexer* file_indexer, - const Comparator* user_comparator, + FilePicker(const Slice& user_key, const Slice& ikey, + autovector* file_levels, unsigned int num_levels, + FileIndexer* file_indexer, const Comparator* user_comparator, const InternalKeyComparator* internal_comparator) : num_levels_(num_levels), curr_level_(static_cast(-1)), @@ -120,9 +130,6 @@ hit_file_level_(static_cast(-1)), search_left_bound_(0), search_right_bound_(FileIndexer::kLevelMaxIndex), -#ifndef NDEBUG - files_(files), -#endif level_files_brief_(file_levels), is_hit_file_last_in_level_(false), curr_file_level_(nullptr), @@ -131,9 +138,6 @@ file_indexer_(file_indexer), user_comparator_(user_comparator), internal_comparator_(internal_comparator) { -#ifdef NDEBUG - (void)files; -#endif // Setup member variables to search first level. 
search_ended_ = !PrepareNextLevel(); if (!search_ended_) { @@ -203,23 +207,7 @@ } } } -#ifndef NDEBUG - // Sanity check to make sure that the files are correctly sorted - if (prev_file_) { - if (curr_level_ != 0) { - int comp_sign = internal_comparator_->Compare( - prev_file_->largest_key, f->smallest_key); - assert(comp_sign < 0); - } else { - // level == 0, the current file cannot be newer than the previous - // one. Use compressed data structure, has no attribute seqNo - assert(curr_index_in_curr_level_ > 0); - assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_], - files_[0][curr_index_in_curr_level_-1])); - } - } - prev_file_ = f; -#endif + returned_file_level_ = curr_level_; if (curr_level_ > 0 && cmp_largest < 0) { // No more files to search in this level. @@ -251,9 +239,6 @@ unsigned int hit_file_level_; int32_t search_left_bound_; int32_t search_right_bound_; -#ifndef NDEBUG - std::vector* files_; -#endif autovector* level_files_brief_; bool search_ended_; bool is_hit_file_last_in_level_; @@ -265,9 +250,6 @@ FileIndexer* file_indexer_; const Comparator* user_comparator_; const InternalKeyComparator* internal_comparator_; -#ifndef NDEBUG - FdWithKeyRange* prev_file_; -#endif // Setup local variables to search next level. // Returns false if there are no more levels to search. @@ -337,9 +319,7 @@ } start_index_in_curr_level_ = start_index; curr_index_in_curr_level_ = start_index; -#ifndef NDEBUG - prev_file_ = nullptr; -#endif + return true; } // curr_level_ = num_levels_. So, no more levels to search. 
@@ -364,6 +344,7 @@ range_(range), batch_iter_(range->begin()), batch_iter_prev_(range->begin()), + upper_key_(range->begin()), maybe_repeat_key_(false), current_level_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()), @@ -400,7 +381,7 @@ int GetCurrentLevel() const { return curr_level_; } // Iterates through files in the current level until it finds a file that - // contains atleast one key from the MultiGet batch + // contains at least one key from the MultiGet batch bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range, size_t* file_index, FdWithKeyRange** fd, bool* is_last_key_in_file) { @@ -432,7 +413,7 @@ !file_hit)) { struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level]; - Slice& user_key = batch_iter_->ukey; + Slice& user_key = batch_iter_->ukey_without_ts; // Do key range filtering of files or/and fractional cascading if: // (1) not all the files are in level 0, or @@ -446,17 +427,17 @@ // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->smallest_key), true); + assert(curr_level_ == 0 || fp_ctx.curr_index_in_curr_level == fp_ctx.start_index_in_curr_level || - user_comparator_->Compare(user_key, - ExtractUserKey(f->smallest_key)) <= 0); + cmp_smallest <= 0); - int cmp_smallest = user_comparator_->Compare( - user_key, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare( - user_key, ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->largest_key), true); } else { cmp_largest = -1; } @@ -480,9 +461,20 @@ } if (cmp_largest == 0) { // cmp_largest is 0, which means the next key will not be in this - // file, so stop looking further. Also don't increment megt_iter_ - // as we may have to look for this key in the next file if we don't - // find it in this one + // file, so stop looking further. However, its possible there are + // duplicates in the batch, so find the upper bound for the batch + // in this file (upper_key_) by skipping past the duplicates. 
We + // leave batch_iter_ as is since we may have to pick up from there + // for the next file, if this file has a merge value rather than + // final value + upper_key_ = batch_iter_; + ++upper_key_; + while (upper_key_ != current_level_range_.end() && + user_comparator_->CompareWithoutTimestamp( + batch_iter_->ukey_without_ts, false, + upper_key_->ukey_without_ts, false) == 0) { + ++upper_key_; + } break; } else { if (curr_level_ == 0) { @@ -502,6 +494,12 @@ *fd = f; *file_index = curr_file_index; *is_last_key_in_file = cmp_largest == 0; + if (!*is_last_key_in_file) { + // If the largest key in the batch overlapping the file is not the + // largest key in the file, upper_ley_ would not have been updated so + // update it here + upper_key_ = batch_iter_; + } return file_hit; } @@ -523,7 +521,7 @@ // file regardless for all keys not found yet if (current_level_range_.CheckKeyDone(batch_iter_) || curr_level_ == 0) { - ++batch_iter_; + batch_iter_ = upper_key_; } } // batch_iter_prev_ will become the start key for the next file @@ -543,18 +541,20 @@ &is_last_key_in_file)) { search_ended_ = !PrepareNextLevel(); } else { - MultiGetRange::Iterator upper_key = batch_iter_; if (is_last_key_in_file) { // Since cmp_largest is 0, batch_iter_ still points to the last key // that falls in this file, instead of the next one. 
Increment - // upper_key so we can set the range properly for SST MultiGet - ++upper_key; - ++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level); + // the file index for all keys between batch_iter_ and upper_key_ + auto tmp_iter = batch_iter_; + while (tmp_iter != upper_key_) { + ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level); + ++tmp_iter; + } maybe_repeat_key_ = true; } // Set the range for this file current_file_range_ = - MultiGetRange(next_file_range, batch_iter_prev_, upper_key); + MultiGetRange(next_file_range, batch_iter_prev_, upper_key_); returned_file_level_ = curr_level_; hit_file_level_ = curr_level_; is_hit_file_last_in_level_ = @@ -606,6 +606,7 @@ // key found in the previous SST file, in order to serve as the start of // the batch key range for the next SST file MultiGetRange::Iterator batch_iter_prev_; + MultiGetRange::Iterator upper_key_; bool maybe_repeat_key_; MultiGetRange current_level_range_; MultiGetRange current_file_range_; @@ -625,7 +626,7 @@ if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < curr_file_level_->num_files) { batch_iter_prev_ = current_level_range_.begin(); - batch_iter_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); return true; } } @@ -720,7 +721,7 @@ } if (level_contains_keys) { batch_iter_prev_ = current_level_range_.begin(); - batch_iter_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); return true; } curr_level_++; @@ -852,15 +853,18 @@ class LevelIterator final : public InternalIterator { public: + // @param read_options Must outlive this iterator. 
LevelIterator(TableCache* table_cache, const ReadOptions& read_options, const FileOptions& file_options, const InternalKeyComparator& icomparator, const LevelFilesBrief* flevel, - const SliceTransform* prefix_extractor, bool should_sample, - HistogramImpl* file_read_hist, TableReaderCaller caller, - bool skip_filters, int level, RangeDelAggregator* range_del_agg, + const std::shared_ptr& prefix_extractor, + bool should_sample, HistogramImpl* file_read_hist, + TableReaderCaller caller, bool skip_filters, int level, + RangeDelAggregator* range_del_agg, const std::vector* - compaction_boundaries = nullptr) + compaction_boundaries = nullptr, + bool allow_unprepared_value = false) : table_cache_(table_cache), read_options_(read_options), file_options_(file_options), @@ -872,11 +876,13 @@ should_sample_(should_sample), caller_(caller), skip_filters_(skip_filters), + allow_unprepared_value_(allow_unprepared_value), file_index_(flevel_->num_files), level_(level), range_del_agg_(range_del_agg), pinned_iters_mgr_(nullptr), - compaction_boundaries_(compaction_boundaries) { + compaction_boundaries_(compaction_boundaries), + is_next_read_sequential_(false) { // Empty level is not supported. assert(flevel_ != nullptr && flevel_->num_files > 0); } @@ -906,14 +912,21 @@ return file_iter_.iter() ? 
file_iter_.status() : Status::OK(); } + bool PrepareValue() override { + return file_iter_.PrepareValue(); + } + inline bool MayBeOutOfLowerBound() override { assert(Valid()); return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); } - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return file_iter_.MayBeOutOfUpperBound(); + inline IterBoundCheck UpperBoundCheckResult() override { + if (Valid()) { + return file_iter_.UpperBoundCheckResult(); + } else { + return IterBoundCheck::kUnknown; + } } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { @@ -940,13 +953,6 @@ void SetFileIterator(InternalIterator* iter); void InitFileIterator(size_t new_file_index); - // Called by both of Next() and NextAndGetResult(). Force inline. - void NextImpl() { - assert(Valid()); - file_iter_.Next(); - SkipEmptyFileForward(); - } - const Slice& file_smallest_key(size_t file_index) { assert(file_index < flevel_->num_files); return flevel_->files[file_index].smallest_key; @@ -955,8 +961,8 @@ bool KeyReachedUpperBound(const Slice& internal_key) { return read_options_.iterate_upper_bound != nullptr && user_comparator_.CompareWithoutTimestamp( - ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + ExtractUserKey(internal_key), /*a_has_ts=*/true, + *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0; } InternalIterator* NewFileIterator() { @@ -977,8 +983,9 @@ read_options_, file_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, nullptr /* don't need reference to table */, file_read_hist_, caller_, - /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key, - largest_compaction_key); + /*arena=*/nullptr, skip_filters_, level_, + /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, + largest_compaction_key, allow_unprepared_value_); } // Check if current file being fully within iterate_lower_bound. 
@@ -989,14 +996,14 @@ if (read_options_.iterate_lower_bound != nullptr && file_index_ < flevel_->num_files) { may_be_out_of_lower_bound_ = - user_comparator_.Compare( - ExtractUserKey(file_smallest_key(file_index_)), - *read_options_.iterate_lower_bound) < 0; + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true, + *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0; } } TableCache* table_cache_; - const ReadOptions read_options_; + const ReadOptions& read_options_; const FileOptions& file_options_; const InternalKeyComparator& icomparator_; const UserComparatorWrapper user_comparator_; @@ -1005,12 +1012,13 @@ // `prefix_extractor_` may be non-null even for total order seek. Checking // this variable is not the right way to identify whether prefix iterator // is used. - const SliceTransform* prefix_extractor_; + const std::shared_ptr& prefix_extractor_; HistogramImpl* file_read_hist_; bool should_sample_; TableReaderCaller caller_; bool skip_filters_; + bool allow_unprepared_value_; bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; @@ -1021,6 +1029,8 @@ // To be propagated to RangeDelAggregator in order to safely truncate range // tombstones. const std::vector* compaction_boundaries_; + + bool is_next_read_sequential_; }; void LevelIterator::Seek(const Slice& target) { @@ -1063,13 +1073,17 @@ // next key after the prefix, or make the iterator invalid. // A side benefit will be that it invalidates the iterator earlier so that // the upper level merging iterator can merge fewer child iterators. 
- Slice target_user_key = ExtractUserKey(target); - Slice file_user_key = ExtractUserKey(file_iter_.key()); - if (prefix_extractor_->InDomain(target_user_key) && - (!prefix_extractor_->InDomain(file_user_key) || - user_comparator_.Compare( - prefix_extractor_->Transform(target_user_key), - prefix_extractor_->Transform(file_user_key)) != 0)) { + size_t ts_sz = user_comparator_.timestamp_size(); + Slice target_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(target, ts_sz); + Slice file_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(file_iter_.key(), ts_sz); + if (prefix_extractor_->InDomain(target_user_key_without_ts) && + (!prefix_extractor_->InDomain(file_user_key_without_ts) || + user_comparator_.CompareWithoutTimestamp( + prefix_extractor_->Transform(target_user_key_without_ts), false, + prefix_extractor_->Transform(file_user_key_without_ts), + false) != 0)) { SetFileIterator(nullptr); } } @@ -1108,14 +1122,28 @@ CheckMayBeOutOfLowerBound(); } -void LevelIterator::Next() { NextImpl(); } +void LevelIterator::Next() { + assert(Valid()); + file_iter_.Next(); + SkipEmptyFileForward(); +} bool LevelIterator::NextAndGetResult(IterateResult* result) { - NextImpl(); - bool is_valid = Valid(); - if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + assert(Valid()); + bool is_valid = file_iter_.NextAndGetResult(result); + if (!is_valid) { + is_next_read_sequential_ = true; + SkipEmptyFileForward(); + is_next_read_sequential_ = false; + is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = file_iter_.UpperBoundCheckResult(); + // Ideally, we should return the real file_iter_.value_prepared but the + // information is not here. It would casue an extra PrepareValue() + // for the first key of a file. 
+ result->value_prepared = !allow_unprepared_value_; + } } return is_valid; } @@ -1130,7 +1158,8 @@ bool seen_empty_file = false; while (file_iter_.iter() == nullptr || (!file_iter_.Valid() && file_iter_.status().ok() && - !file_iter_.iter()->IsOutOfBound())) { + file_iter_.iter()->UpperBoundCheckResult() != + IterBoundCheck::kOutOfBound)) { seen_empty_file = true; // Move to next file if (file_index_ >= flevel_->num_files - 1) { @@ -1172,6 +1201,12 @@ } InternalIterator* old_iter = file_iter_.Set(iter); + + // Update the read pattern for PrefetchBuffer. + if (is_next_read_sequential_) { + file_iter_.UpdateReadaheadState(old_iter); + } + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { pinned_iters_mgr_->PinIterator(old_iter); } else { @@ -1202,28 +1237,6 @@ } } // anonymous namespace -// A wrapper of version builder which references the current version in -// constructor and unref it in the destructor. -// Both of the constructor and destructor need to be called inside DB Mutex. 
-class BaseReferencedVersionBuilder { - public: - explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) - : version_builder_(new VersionBuilder( - cfd->current()->version_set()->file_options(), cfd->table_cache(), - cfd->current()->storage_info(), cfd->ioptions()->info_log)), - version_(cfd->current()) { - version_->Ref(); - } - ~BaseReferencedVersionBuilder() { - version_->Unref(); - } - VersionBuilder* version_builder() { return version_builder_.get(); } - - private: - std::unique_ptr version_builder_; - Version* version_; -}; - Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) const { @@ -1231,7 +1244,7 @@ auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( file_options_, cfd_->internal_comparator(), file_meta->fd, tp, - mutable_cf_options_.prefix_extractor.get(), true /* no io */); + mutable_cf_options_.prefix_extractor, true /* no io */); if (s.ok()) { return s; } @@ -1259,24 +1272,23 @@ return s; } - TableProperties* raw_table_properties; - // By setting the magic number to kInvalidTableMagicNumber, we can by - // pass the magic number check in the footer. + // By setting the magic number to kNullTableMagicNumber, we can bypass + // the magic number check in the footer. 
std::unique_ptr file_reader( new RandomAccessFileReader( - std::move(file), file_name, nullptr /* env */, nullptr /* stats */, - 0 /* hist_type */, nullptr /* file_read_hist */, + std::move(file), file_name, nullptr /* env */, io_tracer_, + nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, nullptr /* rate_limiter */, ioptions->listeners)); + std::unique_ptr props; s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), - Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, - &raw_table_properties, false /* compression_type_missing */); + Footer::kNullTableMagicNumber /* table's magic number */, *ioptions, + &props); if (!s.ok()) { return s; } - RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); - - *tp = std::shared_ptr(raw_table_properties); + *tp = std::move(props); + RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); return s; } @@ -1425,7 +1437,7 @@ for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( file_options_, cfd_->internal_comparator(), file_level.files[i].fd, - mutable_cf_options_.prefix_extractor.get()); + mutable_cf_options_.prefix_extractor); } } return total_usage; @@ -1440,6 +1452,10 @@ cf_meta->file_count = 0; cf_meta->levels.clear(); + cf_meta->blob_file_size = 0; + cf_meta->blob_file_count = 0; + cf_meta->blob_files.clear(); + auto* ioptions = cfd_->ioptions(); auto* vstorage = storage_info(); @@ -1457,15 +1473,16 @@ file_path = ioptions->cf_paths.back().path; } const uint64_t file_number = file->fd.GetNumber(); - files.emplace_back(SstFileMetaData{ + files.emplace_back( MakeTableFileName("", file_number), file_number, file_path, static_cast(file->fd.GetFileSize()), file->fd.smallest_seqno, file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted, 
file->oldest_blob_file_number, - file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(), - file->file_checksum, file->file_checksum_func_name}); + file->being_compacted, file->temperature, + file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), + file->TryGetFileCreationTime(), file->file_checksum, + file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); @@ -1474,6 +1491,17 @@ level, level_size, std::move(files)); cf_meta->size += level_size; } + for (const auto& iter : vstorage->GetBlobFiles()) { + const auto meta = iter.second.get(); + cf_meta->blob_files.emplace_back( + meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()), + ioptions->cf_paths.front().path, meta->GetBlobFileSize(), + meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(), + meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(), + meta->GetChecksumMethod(), meta->GetChecksumValue()); + cf_meta->blob_file_count++; + cf_meta->blob_file_size += meta->GetBlobFileSize(); + } } uint64_t Version::GetSstFilesSize() { @@ -1554,12 +1582,13 @@ void Version::AddIterators(const ReadOptions& read_options, const FileOptions& soptions, MergeIteratorBuilder* merge_iter_builder, - RangeDelAggregator* range_del_agg) { + RangeDelAggregator* range_del_agg, + bool allow_unprepared_value) { assert(storage_info_.finalized_); for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level, - range_del_agg); + range_del_agg, allow_unprepared_value); } } @@ -1567,7 +1596,8 @@ const FileOptions& soptions, MergeIteratorBuilder* merge_iter_builder, int level, - RangeDelAggregator* range_del_agg) { + RangeDelAggregator* range_del_agg, + bool allow_unprepared_value) { assert(storage_info_.finalized_); if (level >= storage_info_.num_non_empty_levels()) { // This is an empty level @@ -1587,12 
+1617,12 @@ merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), *file.file_metadata, range_del_agg, - mutable_cf_options_.prefix_extractor.get(), nullptr, + mutable_cf_options_.prefix_extractor, nullptr, cfd_->internal_stats()->GetFileReadHist(0), TableReaderCaller::kUserIterator, arena, - /*skip_filters=*/false, /*level=*/0, + /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value)); } if (should_sample) { // Count ones for every L0 files. This is done per iterator creation @@ -1611,10 +1641,11 @@ merge_iter_builder->AddIterator(new (mem) LevelIterator( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), + mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - range_del_agg, /*largest_compaction_key=*/nullptr)); + range_del_agg, + /*compaction_boundaries=*/nullptr, allow_unprepared_value)); } } @@ -1645,12 +1676,13 @@ ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( read_options, file_options, cfd_->internal_comparator(), *file->file_metadata, &range_del_agg, - mutable_cf_options_.prefix_extractor.get(), nullptr, + mutable_cf_options_.prefix_extractor, nullptr, cfd_->internal_stats()->GetFileReadHist(0), TableReaderCaller::kUserIterator, &arena, - /*skip_filters=*/false, /*level=*/0, + /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, 
iter.get(), overlap); if (!status.ok() || *overlap) { @@ -1662,7 +1694,7 @@ ScopedArenaIterator iter(new (mem) LevelIterator( cfd_->table_cache(), read_options, file_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), + mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, &range_del_agg)); @@ -1726,15 +1758,17 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, const FileOptions& file_opt, const MutableCFOptions mutable_cf_options, + const std::shared_ptr& io_tracer, uint64_t version_number) : env_(vset->env_), + clock_(vset->clock_), cfd_(column_family_data), - info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log), - db_statistics_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->statistics), + info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger), + db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats), table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), - merge_operator_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->merge_operator), + blob_file_cache_(cfd_ ? cfd_->blob_file_cache() : nullptr), + merge_operator_( + (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()), storage_info_( (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), (cfd_ == nullptr) ? 
nullptr : cfd_->user_comparator(), @@ -1751,12 +1785,188 @@ refs_(0), file_options_(file_opt), mutable_cf_options_(mutable_cf_options), - version_number_(version_number) {} + max_file_size_for_l0_meta_pin_( + MaxFileSizeForL0MetaPin(mutable_cf_options_)), + version_number_(version_number), + io_tracer_(io_tracer) {} + +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* value, uint64_t* bytes_read) const { + BlobIndex blob_index; + + { + Status s = blob_index.DecodeFrom(blob_index_slice); + if (!s.ok()) { + return s; + } + } + + return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value, + bytes_read); +} + +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* value, uint64_t* bytes_read) const { + assert(value); + + if (read_options.read_tier == kBlockCacheTier) { + return Status::Incomplete("Cannot read blob: no disk I/O allowed"); + } + + if (blob_index.HasTTL() || blob_index.IsInlined()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + const auto& blob_files = storage_info_.GetBlobFiles(); + + const uint64_t blob_file_number = blob_index.file_number(); + + const auto it = blob_files.find(blob_file_number); + if (it == blob_files.end()) { + return Status::Corruption("Invalid blob file number"); + } + + CacheHandleGuard blob_file_reader; + + { + assert(blob_file_cache_); + const Status s = blob_file_cache_->GetBlobFileReader(blob_file_number, + &blob_file_reader); + if (!s.ok()) { + return s; + } + } + + assert(blob_file_reader.GetValue()); + const Status s = blob_file_reader.GetValue()->GetBlob( + read_options, user_key, blob_index.offset(), blob_index.size(), + blob_index.compression(), prefetch_buffer, value, bytes_read); + + return s; +} + +void Version::MultiGetBlob( + const ReadOptions& 
read_options, MultiGetRange& range, + std::unordered_map& blob_rqs) { + if (read_options.read_tier == kBlockCacheTier) { + Status s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed"); + for (const auto& elem : blob_rqs) { + for (const auto& blob_rq : elem.second) { + const KeyContext& key_context = blob_rq.second; + assert(key_context.s); + assert(key_context.s->ok()); + *(key_context.s) = s; + assert(key_context.get_context); + auto& get_context = *(key_context.get_context); + get_context.MarkKeyMayExist(); + } + } + return; + } + + assert(!blob_rqs.empty()); + Status status; + const auto& blob_files = storage_info_.GetBlobFiles(); + for (auto& elem : blob_rqs) { + uint64_t blob_file_number = elem.first; + if (blob_files.find(blob_file_number) == blob_files.end()) { + auto& blobs_in_file = elem.second; + for (const auto& blob : blobs_in_file) { + const KeyContext& key_context = blob.second; + *(key_context.s) = Status::Corruption("Invalid blob file number"); + } + continue; + } + CacheHandleGuard blob_file_reader; + assert(blob_file_cache_); + status = blob_file_cache_->GetBlobFileReader(blob_file_number, + &blob_file_reader); + assert(!status.ok() || blob_file_reader.GetValue()); + + auto& blobs_in_file = elem.second; + if (!status.ok()) { + for (const auto& blob : blobs_in_file) { + const KeyContext& key_context = blob.second; + *(key_context.s) = status; + } + continue; + } + + assert(blob_file_reader.GetValue()); + const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize(); + const CompressionType compression = + blob_file_reader.GetValue()->GetCompressionType(); + + // sort blobs_in_file by file offset. 
+ std::sort( + blobs_in_file.begin(), blobs_in_file.end(), + [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool { + assert(lhs.first.file_number() == rhs.first.file_number()); + return lhs.first.offset() < rhs.first.offset(); + }); + + autovector> blob_read_key_contexts; + autovector> user_keys; + autovector offsets; + autovector value_sizes; + autovector statuses; + autovector values; + for (const auto& blob : blobs_in_file) { + const auto& blob_index = blob.first; + const KeyContext& key_context = blob.second; + if (blob_index.HasTTL() || blob_index.IsInlined()) { + *(key_context.s) = + Status::Corruption("Unexpected TTL/inlined blob index"); + continue; + } + const uint64_t key_size = key_context.ukey_with_ts.size(); + const uint64_t offset = blob_index.offset(); + const uint64_t value_size = blob_index.size(); + if (!IsValidBlobOffset(offset, key_size, value_size, file_size)) { + *(key_context.s) = Status::Corruption("Invalid blob offset"); + continue; + } + if (blob_index.compression() != compression) { + *(key_context.s) = + Status::Corruption("Compression type mismatch when reading a blob"); + continue; + } + blob_read_key_contexts.emplace_back(std::cref(key_context)); + user_keys.emplace_back(std::cref(key_context.ukey_with_ts)); + offsets.push_back(blob_index.offset()); + value_sizes.push_back(blob_index.size()); + statuses.push_back(key_context.s); + values.push_back(key_context.value); + } + blob_file_reader.GetValue()->MultiGetBlob(read_options, user_keys, offsets, + value_sizes, statuses, values, + /*bytes_read=*/nullptr); + size_t num = blob_read_key_contexts.size(); + assert(num == user_keys.size()); + assert(num == offsets.size()); + assert(num == value_sizes.size()); + assert(num == statuses.size()); + assert(num == values.size()); + for (size_t i = 0; i < num; ++i) { + if (statuses[i]->ok()) { + range.AddValueSize(blob_read_key_contexts[i].get().value->size()); + if (range.GetValueSize() > read_options.value_size_soft_limit) { + 
*(blob_read_key_contexts[i].get().s) = Status::Aborted(); + } + } + } + } +} void Version::Get(const ReadOptions& read_options, const LookupKey& k, - PinnableSlice* value, Status* status, + PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, - SequenceNumber* max_covering_tombstone_seq, bool* value_found, + SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); @@ -1769,29 +1979,37 @@ *key_exists = true; } - PinnedIteratorsManager pinned_iters_mgr; uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; if (vset_ && vset_->block_cache_tracer_ && vset_->block_cache_tracer_->is_tracing_enabled()) { tracing_get_id = vset_->block_cache_tracer_->NextGetId(); } + + // Note: the old StackableDB-based BlobDB passes in + // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we + // need to provide it here. + bool is_blob_index = false; + bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; + BlobFetcher blob_fetcher(this, read_options); + + assert(pinned_iters_mgr); GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - do_merge ? value : nullptr, value_found, merge_context, do_merge, - max_covering_tombstone_seq, this->env_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_get_id); + do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found, + merge_context, do_merge, max_covering_tombstone_seq, clock_, seq, + merge_operator_ ? 
pinned_iters_mgr : nullptr, callback, is_blob_to_use, + tracing_get_id, &blob_fetcher); // Pin blocks that we read to hold merge operands if (merge_operator_) { - pinned_iters_mgr.StartPinning(); + pinned_iters_mgr->StartPinning(); } - FilePicker fp( - storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_, - storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, - user_comparator(), internal_comparator()); + FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_, + storage_info_.num_non_empty_levels_, + &storage_info_.file_indexer_, user_comparator(), + internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { @@ -1807,20 +2025,23 @@ bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); + StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, - &get_context, mutable_cf_options_.prefix_extractor.get(), + &get_context, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), - fp.GetCurrentLevel()); + fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_); // TODO: examine the behavior for corrupted key if (timer_enabled) { PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), - fp.GetCurrentLevel()); + fp.GetHitFileLevel()); } if (!status->ok()) { + if (db_statistics_ != nullptr) { + get_context.ReportCounters(); + } return; } @@ -1845,8 +2066,26 @@ } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + + if (is_blob_index) { + if (do_merge && value) { + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr 
uint64_t* bytes_read = nullptr; + + *status = GetBlob(read_options, user_key, *value, prefetch_buffer, + value, bytes_read); + if (!status->ok()) { + if (status->IsIncomplete()) { + get_context.MarkKeyMayExist(); + } + return; + } + } + } + return; case GetContext::kDeleted: // Use empty error message for speed @@ -1855,7 +2094,7 @@ case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. Please open DB with " @@ -1882,7 +2121,7 @@ std::string* str_value = value != nullptr ? value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, merge_context->GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(value != nullptr)) { value->PinSelf(); @@ -1896,7 +2135,7 @@ } void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { PinnedIteratorsManager pinned_iters_mgr; // Pin blocks that we read to hold merge operands @@ -1913,15 +2152,16 @@ // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive autovector get_ctx; + BlobFetcher blob_fetcher(this, read_options); for (auto iter = range->begin(); iter != range->end(); ++iter) { assert(iter->s->ok() || iter->s->IsMergeInProgress()); get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, - iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, nullptr, &(iter->merge_context), true, - &iter->max_covering_tombstone_seq, this->env_, nullptr, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_mget_id); + iter->s->ok() ? 
GetContext::kNotFound : GetContext::kMerge, + iter->ukey_with_ts, iter->value, iter->timestamp, nullptr, + &(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_, + nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, + &iter->is_blob_index, tracing_mget_id, &blob_fetcher); // MergeInProgress status, if set, has been transferred to the get_context // state, so we set status to ok here. From now on, the iter status will // be used for IO errors, and get_context state will be used for any @@ -1940,24 +2180,52 @@ &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); + Status s; + uint64_t num_index_read = 0; + uint64_t num_filter_read = 0; + uint64_t num_data_read = 0; + uint64_t num_sst_read = 0; + + MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end()); + // blob_file => [[blob_idx, it], ...] + std::unordered_map blob_rqs; + int level = -1; while (f != nullptr) { MultiGetRange file_range = fp.CurrentFileRange(); bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); - Status s = table_cache_->MultiGet( + + // Report MultiGet stats per level. + if (level >= 0 && level != (int)fp.GetHitFileLevel()) { + // Dump the stats if the search has moved to the next level and + // reset for next level. 
+ RecordInHistogram(db_statistics_, + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + num_index_read + num_filter_read); + RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL, + num_data_read); + RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read); + num_filter_read = 0; + num_index_read = 0; + num_data_read = 0; + num_sst_read = 0; + level = fp.GetHitFileLevel(); + } + + StopWatchNano timer(clock_, timer_enabled /* auto_start */); + s = table_cache_->MultiGet( read_options, *internal_comparator(), *f->file_metadata, &file_range, - mutable_cf_options_.prefix_extractor.get(), + mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), - fp.GetCurrentLevel()); + fp.GetHitFileLevel()); // TODO: examine the behavior for corrupted key if (timer_enabled) { PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), - fp.GetCurrentLevel()); + fp.GetHitFileLevel()); } if (!s.ok()) { // TODO: Set status for individual keys appropriately @@ -1968,7 +2236,8 @@ return; } uint64_t batch_size = 0; - for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) { + for (auto iter = file_range.begin(); s.ok() && iter != file_range.end(); + ++iter) { GetContext& get_context = *iter->get_context; Status* status = iter->s; // The Status in the KeyContext takes precedence over GetContext state @@ -1985,6 +2254,16 @@ sample_file_read_inc(f->file_metadata); } batch_size++; + num_index_read += get_context.get_context_stats_.num_index_read; + num_filter_read += get_context.get_context_stats_.num_filter_read; + num_data_read += get_context.get_context_stats_.num_data_read; + num_sst_read += get_context.get_context_stats_.num_sst_read; + // Reset these stats since they're specific to a level + get_context.get_context_stats_.num_index_read = 0; + get_context.get_context_stats_.num_filter_read = 0; + 
get_context.get_context_stats_.num_data_read = 0; + get_context.get_context_stats_.num_sst_read = 0; + // report the counters before returning if (get_context.State() != GetContext::kNotFound && get_context.State() != GetContext::kMerge && @@ -2012,9 +2291,33 @@ } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + file_range.MarkKeyDone(iter); + + if (iter->is_blob_index) { + if (iter->value) { + const Slice& blob_index_slice = *(iter->value); + BlobIndex blob_index; + Status tmp_s = blob_index.DecodeFrom(blob_index_slice); + if (tmp_s.ok()) { + const uint64_t blob_file_num = blob_index.file_number(); + blob_rqs[blob_file_num].emplace_back( + std::make_pair(blob_index, std::cref(*iter))); + } else { + *(iter->s) = tmp_s; + } + } + } else { + file_range.AddValueSize(iter->value->size()); + if (file_range.GetValueSize() > + read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; + } + } continue; case GetContext::kDeleted: // Use empty error message for speed @@ -2026,7 +2329,7 @@ Status::Corruption("corrupted key for ", iter->lkey->user_key()); file_range.MarkKeyDone(iter); continue; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. 
Please open DB with " @@ -2035,15 +2338,27 @@ continue; } } + RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size); - if (file_picker_range.empty()) { + if (!s.ok() || file_picker_range.empty()) { break; } f = fp.GetNextFile(); } + // Dump stats for most recent level + RecordInHistogram(db_statistics_, NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + num_index_read + num_filter_read); + RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL, + num_data_read); + RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read); + + if (s.ok() && !blob_rqs.empty()) { + MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs); + } + // Process any left over keys - for (auto iter = range->begin(); iter != range->end(); ++iter) { + for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) { GetContext& get_context = *iter->get_context; Status* status = iter->s; Slice user_key = iter->lkey->user_key(); @@ -2064,16 +2379,27 @@ iter->value != nullptr ? iter->value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(iter->value != nullptr)) { iter->value->PinSelf(); + range->AddValueSize(iter->value->size()); + range->MarkKeyDone(iter); + if (range->GetValueSize() > read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; + } } } else { range->MarkKeyDone(iter); *status = Status::NotFound(); // Use an empty error message for speed } } + + for (auto iter = range->begin(); iter != range->end(); ++iter) { + range->MarkKeyDone(iter); + *(iter->s) = s; + } } bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) { @@ -2095,10 +2421,14 @@ void Version::PrepareApply( const MutableCFOptions& mutable_cf_options, bool update_stats) { + TEST_SYNC_POINT_CALLBACK( + 
"Version::PrepareApply:forced_check", + reinterpret_cast(&storage_info_.force_consistency_checks_)); UpdateAccumulatedStats(update_stats); storage_info_.UpdateNumNonEmptyLevels(); storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options); - storage_info_.UpdateFilesByCompactionPri(cfd_->ioptions()->compaction_pri); + storage_info_.UpdateFilesByCompactionPri(*cfd_->ioptions(), + mutable_cf_options); storage_info_.GenerateFileIndexer(); storage_info_.GenerateLevelFilesBrief(); storage_info_.GenerateLevel0NonOverlapping(); @@ -2343,13 +2673,13 @@ } namespace { -uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, +uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const std::vector& files) { uint32_t ttl_expired_files_count = 0; int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (status.ok()) { const uint64_t current_time = static_cast(_current_time); for (FileMetaData* f : files) { @@ -2367,7 +2697,7 @@ } // anonymous namespace void VersionStorageInfo::ComputeCompactionScore( - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options) { for (int level = 0; level <= MaxInputLevel(); level++) { double score; @@ -2396,6 +2726,11 @@ // compaction score for the whole DB. Adding other levels as if // they are L0 files. for (int i = 1; i < num_levels(); i++) { + // Its possible that a subset of the files in a level may be in a + // compaction, due to delete triggered compaction or trivial move. + // In that case, the below check may not catch a level being + // compacted as it only checks the first file. The worst that can + // happen is a scheduled compaction thread will find nothing to do. 
if (!files_[i].empty() && !files_[i][0]->being_compacted) { num_sorted_runs++; } @@ -2405,7 +2740,12 @@ if (compaction_style_ == kCompactionStyleFIFO) { score = static_cast(total_size) / mutable_cf_options.compaction_options_fifo.max_table_files_size; - if (mutable_cf_options.compaction_options_fifo.allow_compaction) { + if (mutable_cf_options.compaction_options_fifo.allow_compaction || + mutable_cf_options.compaction_options_fifo.age_for_warm > 0) { + // Warm tier move can happen at any time. It's too expensive to + // check very file's timestamp now. For now, just trigger it + // slightly more frequently than FIFO compaction so that this + // happens first. score = std::max( static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger, @@ -2414,10 +2754,9 @@ if (mutable_cf_options.ttl > 0) { score = std::max( static_cast(GetExpiredTtlFilesCount( - immutable_cf_options, mutable_cf_options, files_[level])), + immutable_options, mutable_cf_options, files_[level])), score); } - } else { score = static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger; @@ -2425,9 +2764,21 @@ // Level-based involves L0->L0 compactions that can lead to oversized // L0 files. Take into account size as well to avoid later giant // compactions to the base level. - score = std::max( - score, static_cast(total_size) / - mutable_cf_options.max_bytes_for_level_base); + uint64_t l0_target_size = mutable_cf_options.max_bytes_for_level_base; + if (immutable_options.level_compaction_dynamic_level_bytes && + level_multiplier_ != 0.0) { + // Prevent L0 to Lbase fanout from growing larger than + // `level_multiplier_`. This prevents us from getting stuck picking + // L0 forever even when it is hurting write-amp. That could happen + // in dynamic level compaction's write-burst mode where the base + // level's target size can grow to be enormous. 
+ l0_target_size = + std::max(l0_target_size, + static_cast(level_max_bytes_[base_level_] / + level_multiplier_)); + } + score = + std::max(score, static_cast(total_size) / l0_target_size); } } } else { @@ -2462,12 +2813,21 @@ ComputeFilesMarkedForCompaction(); ComputeBottommostFilesMarkedForCompaction(); if (mutable_cf_options.ttl > 0) { - ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl); + ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); } if (mutable_cf_options.periodic_compaction_seconds > 0) { ComputeFilesMarkedForPeriodicCompaction( - immutable_cf_options, mutable_cf_options.periodic_compaction_seconds); + immutable_options, mutable_cf_options.periodic_compaction_seconds); } + + if (mutable_cf_options.enable_blob_garbage_collection && + mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 && + mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) { + ComputeFilesMarkedForForcedBlobGC( + mutable_cf_options.blob_garbage_collection_age_cutoff, + mutable_cf_options.blob_garbage_collection_force_threshold); + } + EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -2495,13 +2855,13 @@ } void VersionStorageInfo::ComputeExpiredTtlFiles( - const ImmutableCFOptions& ioptions, const uint64_t ttl) { + const ImmutableOptions& ioptions, const uint64_t ttl) { assert(ttl > 0); expired_ttl_files_.clear(); int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (!status.ok()) { return; } @@ -2521,14 +2881,14 @@ } void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds) { assert(periodic_compaction_seconds > 0); files_marked_for_periodic_compaction_.clear(); int64_t temp_current_time; - auto status = ioptions.env->GetCurrentTime(&temp_current_time); + auto status = 
ioptions.clock->GetCurrentTime(&temp_current_time); if (!status.ok()) { return; } @@ -2562,7 +2922,7 @@ status = ioptions.env->GetFileModificationTime( file_path, &file_modification_time); if (!status.ok()) { - ROCKS_LOG_WARN(ioptions.info_log, + ROCKS_LOG_WARN(ioptions.logger, "Can't get file modification time: %s: %s", file_path.c_str(), status.ToString().c_str()); continue; @@ -2577,6 +2937,106 @@ } } +void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold) { + files_marked_for_forced_blob_gc_.clear(); + + if (blob_files_.empty()) { + return; + } + + // Number of blob files eligible for GC based on age + const size_t cutoff_count = static_cast( + blob_garbage_collection_age_cutoff * blob_files_.size()); + if (!cutoff_count) { + return; + } + + // Compute the sum of total and garbage bytes over the oldest batch of blob + // files. The oldest batch is defined as the set of blob files which are + // kept alive by the same SSTs as the very oldest one. Here is a toy example. + // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11, + // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and + // potentially some higher-numbered ones, while SST 3 relies on blob file 12 + // and potentially some higher-numbered ones. Then, the SST to oldest blob + // file mapping is as follows: + // + // SST file number Oldest blob file number + // 1 10 + // 2 10 + // 3 12 + // + // This is what the same thing looks like from the blob files' POV. (Note that + // the linked SSTs simply denote the inverse mapping of the above.) + // + // Blob file number Linked SST set + // 10 {1, 2} + // 11 {} + // 12 {3} + // 13 {} + // + // Then, the oldest batch of blob files consists of blob files 10 and 11, + // and we can get rid of them by forcing the compaction of SSTs 1 and 2. 
+ // + // Note that the overall ratio of garbage computed for the batch has to exceed + // blob_garbage_collection_force_threshold and the entire batch has to be + // eligible for GC according to blob_garbage_collection_age_cutoff in order + // for us to schedule any compactions. + const auto oldest_it = blob_files_.begin(); + + const auto& oldest_meta = oldest_it->second; + assert(oldest_meta); + + const auto& linked_ssts = oldest_meta->GetLinkedSsts(); + assert(!linked_ssts.empty()); + + size_t count = 1; + uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes(); + uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes(); + + auto it = oldest_it; + for (++it; it != blob_files_.end(); ++it) { + const auto& meta = it->second; + assert(meta); + + if (!meta->GetLinkedSsts().empty()) { + break; + } + + if (++count > cutoff_count) { + return; + } + + sum_total_blob_bytes += meta->GetTotalBlobBytes(); + sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); + } + + if (sum_garbage_blob_bytes < + blob_garbage_collection_force_threshold * sum_total_blob_bytes) { + return; + } + + for (uint64_t sst_file_number : linked_ssts) { + const FileLocation location = GetFileLocation(sst_file_number); + assert(location.IsValid()); + + const int level = location.GetLevel(); + assert(level >= 0); + + const size_t pos = location.GetPosition(); + + FileMetaData* const sst_meta = files_[level][pos]; + assert(sst_meta); + + if (sst_meta->being_compacted) { + continue; + } + + files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta); + } +} + namespace { // used to sort files by size @@ -2585,7 +3045,7 @@ FileMetaData* file; }; -// Compator that is used to sort files based on their size +// Comparator that is used to sort files based on their size // In normal mode: descending size bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > @@ -2593,31 +3053,29 @@ } } // anonymous namespace -void 
VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { - auto* level_files = &files_[level]; - // Must not overlap -#ifndef NDEBUG - if (level > 0 && !level_files->empty() && - internal_comparator_->Compare( - (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) { - auto* f2 = (*level_files)[level_files->size() - 1]; - if (info_log != nullptr) { - Error(info_log, "Adding new file %" PRIu64 - " range (%s, %s) to level %d but overlapping " - "with existing file %" PRIu64 " %s %s", - f->fd.GetNumber(), f->smallest.DebugString(true).c_str(), - f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(), - f2->smallest.DebugString(true).c_str(), - f2->largest.DebugString(true).c_str()); - LogFlush(info_log); - } - assert(false); - } -#else - (void)info_log; -#endif +void VersionStorageInfo::AddFile(int level, FileMetaData* f) { + auto& level_files = files_[level]; + level_files.push_back(f); + f->refs++; - level_files->push_back(f); + + const uint64_t file_number = f->fd.GetNumber(); + + assert(file_locations_.find(file_number) == file_locations_.end()); + file_locations_.emplace(file_number, + FileLocation(level, level_files.size() - 1)); +} + +void VersionStorageInfo::AddBlobFile( + std::shared_ptr blob_file_meta) { + assert(blob_file_meta); + + const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber(); + + auto it = blob_files_.lower_bound(blob_file_number); + assert(it == blob_files_.end() || it->first != blob_file_number); + + blob_files_.emplace_hint(it, blob_file_number, std::move(blob_file_meta)); } // Version::PrepareApply() need to be called before calling the function, or @@ -2681,11 +3139,22 @@ // Sort `temp` based on ratio of overlapping size over file size void SortFileByOverlappingRatio( const InternalKeyComparator& icmp, const std::vector& files, - const std::vector& next_level_files, + const std::vector& next_level_files, SystemClock* clock, + int level, int num_non_empty_levels, uint64_t ttl, 
std::vector* temp) { std::unordered_map file_to_order; auto next_level_it = next_level_files.begin(); + int64_t curr_time; + Status status = clock->GetCurrentTime(&curr_time); + if (!status.ok()) { + // If we can't get time, disable TTL. + ttl = 0; + } + + FileTtlBooster ttl_booster(static_cast(curr_time), ttl, + num_non_empty_levels, level); + for (auto& file : files) { uint64_t overlapping_bytes = 0; // Skip files in next level that is smaller than current file @@ -2705,9 +3174,12 @@ next_level_it++; } + uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1; + assert(ttl_boost_score > 0); assert(file->compensated_file_size != 0); - file_to_order[file->fd.GetNumber()] = - overlapping_bytes * 1024u / file->compensated_file_size; + file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U / + file->compensated_file_size / + ttl_boost_score; } std::sort(temp->begin(), temp->end(), @@ -2719,7 +3191,7 @@ } // namespace void VersionStorageInfo::UpdateFilesByCompactionPri( - CompactionPri compaction_pri) { + const ImmutableOptions& ioptions, const MutableCFOptions& options) { if (compaction_style_ == kCompactionStyleNone || compaction_style_ == kCompactionStyleFIFO || compaction_style_ == kCompactionStyleUniversal) { @@ -2744,7 +3216,7 @@ if (num > temp.size()) { num = temp.size(); } - switch (compaction_pri) { + switch (ioptions.compaction_pri) { case kByCompensatedSize: std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), CompareCompensatedSizeDescending); @@ -2765,7 +3237,8 @@ break; case kMinOverlappingRatio: SortFileByOverlappingRatio(*internal_comparator_, files_[level], - files_[level + 1], &temp); + files_[level + 1], ioptions.clock, level, + num_non_empty_levels_, options.ttl, &temp); break; default: assert(false); @@ -2846,8 +3319,7 @@ bottommost_files_mark_threshold_ = kMaxSequenceNumber; for (auto& level_and_file : bottommost_files_) { if (!level_and_file.second->being_compacted && - 
level_and_file.second->fd.largest_seqno != 0 && - level_and_file.second->num_deletions > 1) { + level_and_file.second->fd.largest_seqno != 0) { // largest_seqno might be nonzero due to containing the final key in an // earlier compaction, whose seqnum we didn't zero out. Multiple deletions // ensures the file really contains deleted or overwritten keys. @@ -3006,7 +3478,7 @@ // specified range. From that file, iterate backwards and // forwards to find all overlapping files. // if within_range is set, then only store the maximum clean inputs -// within range [begin, end]. "clean" means there is a boudnary +// within range [begin, end]. "clean" means there is a boundary // between the files in "*inputs" and the surrounding files void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int level, const InternalKey* begin, const InternalKey* end, @@ -3173,7 +3645,7 @@ return scratch->buffer; } -int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { +uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; for (int level = 1; level < num_levels() - 1; level++) { @@ -3196,7 +3668,7 @@ return level_max_bytes_[level]; } -void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, +void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options) { // Special logic to set number of sorted runs. // It is to match the previous behavior when all files are in L0. @@ -3286,7 +3758,7 @@ // base_bytes_min. We set it be base_bytes_min. base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_INFO(ioptions.info_log, + ROCKS_LOG_INFO(ioptions.logger, "More existing levels in DB than needed. " "max_bytes_for_level_multiplier may not be guaranteed."); } else { @@ -3317,7 +3789,7 @@ // 1. the L0 size is larger than level size base, or // 2. 
number of L0 files reaches twice the L0->L1 compaction trigger // We don't do this otherwise to keep the LSM-tree structure stable - // unless the L0 compation is backlogged. + // unless the L0 compaction is backlogged. base_level_size = l0_size; if (base_level_ == num_levels_ - 1) { level_multiplier_ = 1.0; @@ -3345,22 +3817,23 @@ } uint64_t VersionStorageInfo::EstimateLiveDataSize() const { - // Estimate the live data size by adding up the size of the last level for all - // key ranges. Note: Estimate depends on the ordering of files in level 0 - // because files in level 0 can be overlapping. + // Estimate the live data size by adding up the size of a maximal set of + // sst files with no range overlap in same or higher level. The less + // compacted, the more optimistic (smaller) this estimate is. Also, + // for multiple sorted runs within a level, file order will matter. uint64_t size = 0; auto ikey_lt = [this](InternalKey* x, InternalKey* y) { return internal_comparator_->Compare(*x, *y) < 0; }; - // (Ordered) map of largest keys in non-overlapping files + // (Ordered) map of largest keys in files being included in size estimate std::map ranges(ikey_lt); for (int l = num_levels_ - 1; l >= 0; l--) { bool found_end = false; for (auto file : files_[l]) { - // Find the first file where the largest key is larger than the smallest - // key of the current file. If this file does not overlap with the + // Find the first file already included with largest key is larger than + // the smallest key of `file`. If that file does not overlap with the // current file, none of the files in the map does. If there is // no potential overlap, we can safely insert the rest of this level // (if the level is not 0) into the map without checking again because @@ -3375,6 +3848,14 @@ } } } + // For BlobDB, the result also includes the exact value of live bytes in the + // blob files of the version. 
+ const auto& blobFiles = GetBlobFiles(); + for (const auto& pair : blobFiles) { + const auto& meta = pair.second; + size += meta->GetTotalBlobBytes(); + size -= meta->GetGarbageBlobBytes(); + } return size; } @@ -3409,13 +3890,27 @@ return false; } -void Version::AddLiveFiles(std::vector* live) { - for (int level = 0; level < storage_info_.num_levels(); level++) { - const std::vector& files = storage_info_.files_[level]; - for (const auto& file : files) { - live->push_back(file->fd); +void Version::AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const { + assert(live_table_files); + assert(live_blob_files); + + for (int level = 0; level < storage_info_.num_levels(); ++level) { + const auto& level_files = storage_info_.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + + live_table_files->emplace_back(meta->fd.GetNumber()); } } + + const auto& blob_files = storage_info_.GetBlobFiles(); + for (const auto& pair : blob_files) { + const auto& meta = pair.second; + assert(meta); + + live_blob_files->emplace_back(meta->GetBlobFileNumber()); + } } std::string Version::DebugString(bool hex, bool print_stats) const { @@ -3462,6 +3957,21 @@ r.append("\n"); } } + + const auto& blob_files = storage_info_.GetBlobFiles(); + if (!blob_files.empty()) { + r.append("--- blob files --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + for (const auto& pair : blob_files) { + const auto& blob_file_meta = pair.second; + assert(blob_file_meta); + + r.append(blob_file_meta->DebugString()); + r.push_back('\n'); + } + } + return r; } @@ -3473,15 +3983,30 @@ ColumnFamilyData* cfd; const MutableCFOptions mutable_cf_options; const autovector& edit_list; + const std::function manifest_write_callback; - explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, - const MutableCFOptions& cf_options, - const autovector& e) + explicit ManifestWriter( + InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const 
MutableCFOptions& cf_options, const autovector& e, + const std::function& manifest_wcb) : done(false), cv(mu), cfd(_cfd), mutable_cf_options(cf_options), - edit_list(e) {} + edit_list(e), + manifest_write_callback(manifest_wcb) {} + ~ManifestWriter() { status.PermitUncheckedError(); } + + bool IsAllWalEdits() const { + bool all_wal_edits = true; + for (const auto& e : edit_list) { + if (!e->IsWalManipulation()) { + all_wal_edits = false; + break; + } + } + return all_wal_edits; + } }; Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { @@ -3534,17 +4059,23 @@ const FileOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer) - : column_family_set_(new ColumnFamilySet( - dbname, _db_options, storage_options, table_cache, - write_buffer_manager, write_controller, block_cache_tracer)), + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) + : column_family_set_( + new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, + block_cache_tracer, io_tracer, db_session_id)), + table_cache_(table_cache), env_(_db_options->env), - fs_(_db_options->fs.get()), + fs_(_db_options->fs, io_tracer), + clock_(_db_options->clock), dbname_(dbname), db_options_(_db_options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() options_file_number_(0), + options_file_size_(0), pending_manifest_file_number_(0), last_sequence_(0), last_allocated_sequence_(0), @@ -3553,21 +4084,50 @@ current_version_number_(0), manifest_file_size_(0), file_options_(storage_options), - block_cache_tracer_(block_cache_tracer) {} + block_cache_tracer_(block_cache_tracer), + io_tracer_(io_tracer), + db_session_id_(db_session_id) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet - Cache* 
table_cache = column_family_set_->get_table_cache(); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { - table_cache->Release(file.metadata->table_reader_handle); - TableCache::Evict(table_cache, file.metadata->fd.GetNumber()); + table_cache_->Release(file.metadata->table_reader_handle); + TableCache::Evict(table_cache_, file.metadata->fd.GetNumber()); } file.DeleteMetadata(); } obsolete_files_.clear(); + io_status_.PermitUncheckedError(); +} + +void VersionSet::Reset() { + if (column_family_set_) { + WriteBufferManager* wbm = column_family_set_->write_buffer_manager(); + WriteController* wc = column_family_set_->write_controller(); + column_family_set_.reset(new ColumnFamilySet( + dbname_, db_options_, file_options_, table_cache_, wbm, wc, + block_cache_tracer_, io_tracer_, db_session_id_)); + } + db_id_.clear(); + next_file_number_.store(2); + min_log_number_to_keep_.store(0); + manifest_file_number_ = 0; + options_file_number_ = 0; + pending_manifest_file_number_ = 0; + last_sequence_.store(0); + last_allocated_sequence_.store(0); + last_published_sequence_.store(0); + prev_log_number_ = 0; + descriptor_log_.reset(); + current_version_number_ = 0; + manifest_writers_.clear(); + manifest_file_size_ = 0; + obsolete_files_.clear(); + obsolete_manifests_.clear(); + wals_.Reset(); } void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, @@ -3600,8 +4160,9 @@ Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, - Directory* db_directory, bool new_descriptor_log, + FSDirectory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options) { + mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); ManifestWriter* last_writer = &first_writer; @@ -3614,9 +4175,16 @@ autovector mutable_cf_options_ptrs; std::vector> builder_guards; + // Tracking `max_last_sequence` is needed to ensure we write + // 
`VersionEdit::last_sequence_`s in non-decreasing order according to the + // recovery code's requirement. It also allows us to defer updating + // `descriptor_last_sequence_` until the apply phase, after the log phase + // succeeds. + SequenceNumber max_last_sequence = descriptor_last_sequence_; + if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) { // No group commits for column family add or drop - LogAndApplyCFHelper(first_writer.edit_list.front()); + LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence); batch_edits.push_back(first_writer.edit_list.front()); } else { auto it = manifest_writers_.cbegin(); @@ -3678,16 +4246,22 @@ } } if (version == nullptr) { - version = new Version(last_writer->cfd, this, file_options_, - last_writer->mutable_cf_options, - current_version_number_++); - versions.push_back(version); - mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); - builder_guards.emplace_back( - new BaseReferencedVersionBuilder(last_writer->cfd)); - builder = builder_guards.back()->version_builder(); + // WAL manipulations do not need to be applied to versions. 
+ if (!last_writer->IsAllWalEdits()) { + version = new Version(last_writer->cfd, this, file_options_, + last_writer->mutable_cf_options, io_tracer_, + current_version_number_++); + versions.push_back(version); + mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); + builder_guards.emplace_back( + new BaseReferencedVersionBuilder(last_writer->cfd)); + builder = builder_guards.back()->version_builder(); + } + assert(last_writer->IsAllWalEdits() || builder); + assert(last_writer->IsAllWalEdits() || version); + TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion", + version); } - assert(builder != nullptr); // make checker happy for (const auto& e : last_writer->edit_list) { if (e->is_in_atomic_group_) { if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || @@ -3698,7 +4272,8 @@ } else if (group_start != std::numeric_limits::max()) { group_start = std::numeric_limits::max(); } - Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu); + Status s = LogAndApplyHelper(last_writer->cfd, builder, e, + &max_last_sequence, mu); if (!s.ok()) { // free up the allocated memory for (auto v : versions) { @@ -3760,9 +4335,6 @@ } #endif // NDEBUG - uint64_t new_manifest_file_size = 0; - Status s; - assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || manifest_file_size_ > db_options_->max_manifest_file_size) { @@ -3776,6 +4348,7 @@ // reads its content after releasing db mutex to avoid race with // SwitchMemtable(). 
std::unordered_map curr_state; + VersionEdit wal_additions; if (new_descriptor_log) { pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_.load()); @@ -3788,15 +4361,25 @@ } for (const auto* cfd : *column_family_set_) { assert(curr_state.find(cfd->GetID()) == curr_state.end()); - curr_state[cfd->GetID()] = {cfd->GetLogNumber()}; + curr_state.emplace(std::make_pair( + cfd->GetID(), + MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow()))); + } + + for (const auto& wal : wals_.GetWals()) { + wal_additions.AddWal(wal.first, wal.second); } } + uint64_t new_manifest_file_size = 0; + Status s; + IOStatus io_s; + IOStatus manifest_io_status; { FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); mu->Unlock(); - - TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart"); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr); if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { for (int i = 0; i < static_cast(versions.size()); ++i) { assert(!builder_guards.empty() && @@ -3805,10 +4388,11 @@ builder_guards.size() == versions.size()); ColumnFamilyData* cfd = versions[i]->cfd_; s = builder_guards[i]->version_builder()->LoadTableHandlers( - cfd->internal_stats(), cfd->ioptions()->optimize_filters_for_hits, + cfd->internal_stats(), 1 /* max_threads */, true /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, - mutable_cf_options_ptrs[i]->prefix_extractor.get()); + mutable_cf_options_ptrs[i]->prefix_extractor, + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i])); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -3827,18 +4411,24 @@ std::string descriptor_fname = DescriptorFileName(dbname_, pending_manifest_file_number_); std::unique_ptr descriptor_file; - s = NewWritableFile(fs_, descriptor_fname, &descriptor_file, - opt_file_opts); - if (s.ok()) { + io_s = 
NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file, + opt_file_opts); + if (io_s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - + FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( - std::move(descriptor_file), descriptor_fname, opt_file_opts, env_, - nullptr, db_options_->listeners)); + std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, + io_tracer_, nullptr, db_options_->listeners, nullptr, + tmp_set.Contains(FileType::kDescriptorFile), + tmp_set.Contains(FileType::kDescriptorFile))); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get()); + s = WriteCurrentStateToManifest(curr_state, wal_additions, + descriptor_log_.get(), io_s); + } else { + manifest_io_status = io_s; + s = io_s; } } @@ -3860,8 +4450,8 @@ e->DebugString(true)); break; } - TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord", + REDUCE_ODDS2); #ifndef NDEBUG if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { TEST_SYNC_POINT_CALLBACK( @@ -3872,15 +4462,21 @@ } ++idx; #endif /* !NDEBUG */ - s = descriptor_log_->AddRecord(record); - if (!s.ok()) { + io_s = descriptor_log_->AddRecord(record); + if (!io_s.ok()) { + s = io_s; + manifest_io_status = io_s; break; } } if (s.ok()) { - s = SyncManifest(env_, db_options_, descriptor_log_->file()); + io_s = SyncManifest(db_options_, descriptor_log_->file()); + manifest_io_status = io_s; + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); } - if (!s.ok()) { + if (!io_s.ok()) { + s = io_s; ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n", s.ToString().c_str()); } @@ -3888,10 +4484,15 @@ // If we just created a new descriptor file, install it by writing a 
// new CURRENT file that points to it. + if (s.ok()) { + assert(manifest_io_status.ok()); + } if (s.ok() && new_descriptor_log) { - s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, - db_directory); - TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest"); + io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_, + db_directory); + if (!io_s.ok()) { + s = io_s; + } } if (s.ok()) { @@ -3910,6 +4511,28 @@ mu->Lock(); } + if (s.ok()) { + // Apply WAL edits, DB mutex must be held. + for (auto& e : batch_edits) { + if (e->IsWalAddition()) { + s = wals_.AddWals(e->GetWalAdditions()); + } else if (e->IsWalDeletion()) { + s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber()); + } + if (!s.ok()) { + break; + } + } + } + + if (!io_s.ok()) { + if (io_status_.ok()) { + io_status_ = io_s; + } + } else if (!io_status_.ok()) { + io_status_ = io_s; + } + // Append the old manifest file to the obsolete_manifest_ list to be deleted // by PurgeObsoleteFiles later. if (s.ok() && new_descriptor_log) { @@ -3922,32 +4545,34 @@ if (first_writer.edit_list.front()->is_column_family_add_) { assert(batch_edits.size() == 1); assert(new_cf_options != nullptr); + assert(max_last_sequence == descriptor_last_sequence_); CreateColumnFamily(*new_cf_options, first_writer.edit_list.front()); } else if (first_writer.edit_list.front()->is_column_family_drop_) { assert(batch_edits.size() == 1); + assert(max_last_sequence == descriptor_last_sequence_); first_writer.cfd->SetDropped(); first_writer.cfd->UnrefAndTryDelete(); } else { // Each version in versions corresponds to a column family. // For each column family, update its log number indicating that logs // with number smaller than this should be ignored. 
- for (const auto version : versions) { - uint64_t max_log_number_in_batch = 0; - uint32_t cf_id = version->cfd_->GetID(); - for (const auto& e : batch_edits) { - if (e->has_log_number_ && e->column_family_ == cf_id) { - max_log_number_in_batch = - std::max(max_log_number_in_batch, e->log_number_); + uint64_t last_min_log_number_to_keep = 0; + for (const auto& e : batch_edits) { + ColumnFamilyData* cfd = nullptr; + if (!e->IsColumnFamilyManipulation()) { + cfd = column_family_set_->GetColumnFamily(e->column_family_); + // e would not have been added to batch_edits if its corresponding + // column family is dropped. + assert(cfd); + } + if (cfd) { + if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) { + cfd->SetLogNumber(e->log_number_); + } + if (e->HasFullHistoryTsLow()) { + cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow()); } } - if (max_log_number_in_batch != 0) { - assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch); - version->cfd_->SetLogNumber(max_log_number_in_batch); - } - } - - uint64_t last_min_log_number_to_keep = 0; - for (auto& e : batch_edits) { if (e->has_min_log_number_to_keep_) { last_min_log_number_to_keep = std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_); @@ -3955,8 +4580,7 @@ } if (last_min_log_number_to_keep != 0) { - // Should only be set in 2PC mode. 
- MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep); + MarkMinLogNumberToKeep(last_min_log_number_to_keep); } for (int i = 0; i < static_cast(versions.size()); ++i) { @@ -3964,6 +4588,8 @@ AppendVersion(cfd, versions[i]); } } + assert(max_last_sequence >= descriptor_last_sequence_); + descriptor_last_sequence_ = max_last_sequence; manifest_file_number_ = pending_manifest_file_number_; manifest_file_size_ = new_manifest_file_size; prev_log_number_ = first_writer.edit_list.front()->prev_log_number_; @@ -3978,22 +4604,75 @@ for (auto v : versions) { delete v; } + if (manifest_io_status.ok()) { + manifest_file_number_ = pending_manifest_file_number_; + manifest_file_size_ = new_manifest_file_size; + } // If manifest append failed for whatever reason, the file could be // corrupted. So we need to force the next version update to start a // new manifest file. descriptor_log_.reset(); - if (new_descriptor_log) { + // If manifest operations failed, then we know the CURRENT file still + // points to the original MANIFEST. Therefore, we can safely delete the + // new MANIFEST. + // If manifest operations succeeded, and we are here, then it is possible + // that renaming tmp file to CURRENT failed. + // + // On local POSIX-compliant FS, the CURRENT must point to the original + // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also + // keep it. Future recovery will ignore this MANIFEST. It's also ok for the + // process not to crash and continue using the db. Any future LogAndApply() + // call will switch to a new MANIFEST and update CURRENT, still ignoring + // this one. + // + // On non-local FS, it is + // possible that the rename operation succeeded on the server (remote) + // side, but the client somehow returns a non-ok status to RocksDB. Note + // that this does not violate atomicity. Should we delete the new MANIFEST + // successfully, a subsequent recovery attempt will likely see the CURRENT + // pointing to the new MANIFEST, thus fail. 
We will not be able to open the + // DB again. Therefore, if manifest operations succeed, we should keep the + // the new MANIFEST. If the process proceeds, any future LogAndApply() call + // will switch to a new MANIFEST and update CURRENT. If user tries to + // re-open the DB, + // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present. + // b) CURRENT points to the original MANIFEST, and the original MANIFEST + // also exists. + if (new_descriptor_log && !manifest_io_status.ok()) { ROCKS_LOG_INFO(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", - manifest_file_number_, pending_manifest_file_number_); - env_->DeleteFile( + pending_manifest_file_number_, manifest_file_number_); + Status manifest_del_status = env_->DeleteFile( DescriptorFileName(dbname_, pending_manifest_file_number_)); + if (!manifest_del_status.ok()) { + ROCKS_LOG_WARN(db_options_->info_log, + "Failed to delete manifest %" PRIu64 ": %s", + pending_manifest_file_number_, + manifest_del_status.ToString().c_str()); + } } } pending_manifest_file_number_ = 0; +#ifndef NDEBUG + // This is here kind of awkwardly because there's no other consistency + // checks on `VersionSet`'s updates for the new `Version`s. We might want + // to move it to a dedicated function, or remove it if we gain enough + // confidence in `descriptor_last_sequence_`. 
+ if (s.ok()) { + for (const auto* v : versions) { + const auto* vstorage = v->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + for (const auto& file : vstorage->LevelFiles(level)) { + assert(file->fd.largest_seqno <= descriptor_last_sequence_); + } + } + } + } +#endif // NDEBUG + // wake up all the waiting writers while (true) { ManifestWriter* ready = manifest_writers_.front(); @@ -4007,6 +4686,9 @@ } ready->status = s; ready->done = true; + if (ready->manifest_write_callback) { + (ready->manifest_write_callback)(s); + } if (need_signal) { ready->cv.Signal(); } @@ -4020,14 +4702,23 @@ return s; } -// 'datas' is gramatically incorrect. We still use this notation to indicate +void VersionSet::WakeUpWaitingManifestWriters() { + // wake up all the waiting writers + // Notify new head of manifest write queue. + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } +} + +// 'datas' is grammatically incorrect. We still use this notation to indicate // that this variable represents a collection of column_family_data. Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, const autovector>& edit_lists, - InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options, + const std::vector>& manifest_wcbs) { mu->AssertHeld(); int num_edits = 0; for (const auto& elist : edit_lists) { @@ -4057,12 +4748,16 @@ assert(static_cast(num_cfds) == edit_lists.size()); } for (int i = 0; i < num_cfds; ++i) { + const auto wcb = + manifest_wcbs.empty() ? 
[](const Status&) {} : manifest_wcbs[i]; writers.emplace_back(mu, column_family_datas[i], - *mutable_cf_options_list[i], edit_lists[i]); + *mutable_cf_options_list[i], edit_lists[i], wcb); manifest_writers_.push_back(&writers[i]); } assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting", + nullptr); while (!first_writer.done && &first_writer != manifest_writers_.front()) { first_writer.cv.Wait(); } @@ -4074,6 +4769,7 @@ for (const auto& writer : writers) { assert(writer.done); } + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu); #endif /* !NDEBUG */ return first_writer.status; } @@ -4100,16 +4796,13 @@ new_cf_options); } -void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { +void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, + SequenceNumber* max_last_sequence) { + assert(max_last_sequence != nullptr); assert(edit->IsColumnFamilyManipulation()); edit->SetNextFile(next_file_number_.load()); - // The log might have data that is not visible to memtbale and hence have not - // updated the last_sequence_ yet. It is also possible that the log has is - // expecting some new data that is not written yet. Since LastSequence is an - // upper bound on the sequence, it is ok to record - // last_allocated_sequence_ as the last sequence. - edit->SetLastSequence(db_options_->two_write_queues ? 
last_allocated_sequence_ - : last_sequence_); + assert(!edit->HasLastSequence()); + edit->SetLastSequence(*max_last_sequence); if (edit->is_column_family_drop_) { // if we drop column family, we have to make sure to save max column family, // so that we don't reuse existing ID @@ -4119,12 +4812,14 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* builder, VersionEdit* edit, + SequenceNumber* max_last_sequence, InstrumentedMutex* mu) { #ifdef NDEBUG (void)cfd; #endif mu->AssertHeld(); assert(!edit->IsColumnFamilyManipulation()); + assert(max_last_sequence != nullptr); if (edit->has_log_number_) { assert(edit->log_number_ >= cfd->GetLogNumber()); @@ -4135,161 +4830,17 @@ edit->SetPrevLogNumber(prev_log_number_); } edit->SetNextFile(next_file_number_.load()); - // The log might have data that is not visible to memtbale and hence have not - // updated the last_sequence_ yet. It is also possible that the log has is - // expecting some new data that is not written yet. Since LastSequence is an - // upper bound on the sequence, it is ok to record - // last_allocated_sequence_ as the last sequence. - edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_ - : last_sequence_); - - Status s = builder->Apply(edit); - - return s; -} - -Status VersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - VersionEditParams* version_edit_params) { - // Not found means that user didn't supply that column - // family option AND we encountered column family add - // record. Once we encounter column family drop record, - // we will delete the column family from - // column_families_not_found. 
- bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) != - column_families_not_found.end()); - // in builders means that user supplied that column family - // option AND that we encountered column family add record - bool cf_in_builders = builders.find(edit.column_family_) != builders.end(); - - // they can't both be true - assert(!(cf_in_not_found && cf_in_builders)); - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders || cf_in_not_found) { - return Status::Corruption( - "Manifest adding the same column family twice: " + - edit.column_family_name_); - } - auto cf_options = name_to_options.find(edit.column_family_name_); - // implicitly add persistent_stats column family without requiring user - // to specify - bool is_persistent_stats_column_family = - edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; - if (cf_options == name_to_options.end() && - !is_persistent_stats_column_family) { - column_families_not_found.insert( - {edit.column_family_, edit.column_family_name_}); - } else { - // recover persistent_stats CF from a DB that already contains it - if (is_persistent_stats_column_family) { - ColumnFamilyOptions cfo; - OptimizeForPersistentStats(&cfo); - cfd = CreateColumnFamily(cfo, &edit); - } else { - cfd = CreateColumnFamily(cf_options->second, &edit); - } - cfd->set_initialized(); - builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } - } else if (edit.is_column_family_drop_) { - if (cf_in_builders) { - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - builders.erase(builder); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } else { - // who else can have reference to cfd!? 
- assert(false); - } - } else if (cf_in_not_found) { - column_families_not_found.erase(edit.column_family_); - } else { - return Status::Corruption( - "Manifest - dropping non-existing column family"); - } - } else if (!cf_in_not_found) { - if (!cf_in_builders) { - return Status::Corruption( - "Manifest record referencing unknown column family"); - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - Status s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - return s; - } - } - return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params); -} - -Status VersionSet::ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& from_edit, - VersionEditParams* version_edit_params) { - if (cfd != nullptr) { - if (from_edit.has_db_id_) { - version_edit_params->SetDBId(from_edit.db_id_); - } - if (from_edit.has_log_number_) { - if (cfd->GetLogNumber() > from_edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - Log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(from_edit.log_number_); - version_edit_params->SetLogNumber(from_edit.log_number_); - } - } - if (from_edit.has_comparator_ && - from_edit.comparator_ != cfd->user_comparator()->Name()) { - return Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + from_edit.comparator_); - } - } - - if (from_edit.has_prev_log_number_) { - version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_); - } - - if (from_edit.has_next_file_number_) { - version_edit_params->SetNextFile(from_edit.next_file_number_); - } - - if 
(from_edit.has_max_column_family_) { - version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_); - } - - if (from_edit.has_min_log_number_to_keep_) { - version_edit_params->min_log_number_to_keep_ = - std::max(version_edit_params->min_log_number_to_keep_, - from_edit.min_log_number_to_keep_); + if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) { + *max_last_sequence = edit->GetLastSequence(); + } else { + edit->SetLastSequence(*max_last_sequence); } - if (from_edit.has_last_sequence_) { - version_edit_params->SetLastSequence(from_edit.last_sequence_); - } - return Status::OK(); + // The builder can be nullptr only if edit is WAL manipulation, + // because WAL edits do not need to be applied to versions, + // we return Status::OK() in this case. + assert(builder || edit->IsWalManipulation()); + return builder ? builder->Apply(edit) : Status::OK(); } Status VersionSet::GetCurrentManifestPath(const std::string& dbname, @@ -4319,91 +4870,16 @@ if (dbname.back() != '/') { manifest_path->push_back('/'); } - *manifest_path += fname; + manifest_path->append(fname); return Status::OK(); } -Status VersionSet::ReadAndRecover( - log::Reader* reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - VersionEditParams* version_edit_params, std::string* db_id) { - assert(reader != nullptr); - assert(read_buffer != nullptr); - Status s; - Slice record; - std::string scratch; - size_t recovered_edits = 0; - while (reader->ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.has_db_id_) { - db_id_ = edit.GetDbId(); - if (db_id != nullptr) { - db_id->assign(edit.GetDbId()); - } - } - s = read_buffer->AddEdit(&edit); - if (!s.ok()) { - break; - } - if (edit.is_in_atomic_group_) { - if (read_buffer->IsFull()) { - // Apply edits in an atomic group when 
we have read all edits in the - // group. - for (auto& e : read_buffer->replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (!s.ok()) { - break; - } - recovered_edits++; - } - if (!s.ok()) { - break; - } - read_buffer->Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (s.ok()) { - recovered_edits++; - } - } - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer->Clear(); - } - TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", - &recovered_edits); - return s; -} - Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id) { - std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.emplace(cf.name, cf.options); - } - // keeps track of column families in manifest that were not found in - // column families parameters. 
if those column families are not dropped - // by subsequent manifest records, Recover() will return failure status - std::unordered_map column_families_not_found; - // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; - Status s = GetCurrentManifestPath(dbname_, fs_, &manifest_path, + Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, &manifest_file_number_); if (!s.ok()) { return s; @@ -4421,140 +4897,34 @@ if (!s.ok()) { return s; } - manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path, - db_options_->log_readahead_size)); - } - - std::unordered_map> - builders; - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. 
- default_cfd->set_initialized(); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + } uint64_t current_manifest_file_size = 0; - VersionEditParams version_edit_params; + uint64_t log_number = 0; { VersionSet::LogReporter reporter; - reporter.status = &s; + Status log_read_status; + reporter.status = &log_read_status; log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - AtomicGroupReadBuffer read_buffer; - s = ReadAndRecover(&reader, &read_buffer, cf_name_to_options, - column_families_not_found, builders, - &version_edit_params, db_id); - current_manifest_file_size = reader.GetReadOffset(); - assert(current_manifest_file_size != 0); - } - - if (s.ok()) { - if (!version_edit_params.has_next_file_number_) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!version_edit_params.has_log_number_) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!version_edit_params.has_last_sequence_) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!version_edit_params.has_prev_log_number_) { - version_edit_params.SetPrevLogNumber(0); - } - - column_family_set_->UpdateMaxColumnFamily( - version_edit_params.max_column_family_); - - // When reading DB generated using old release, min_log_number_to_keep=0. - // All log files will be scanned for potential prepare entries. - MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit_params.prev_log_number_); - MarkFileNumberUsed(version_edit_params.log_number_); - } - - // there were some column families in the MANIFEST that weren't specified - // in the argument. 
This is OK in read_only mode - if (read_only == false && !column_families_not_found.empty()) { - std::string list_of_not_found; - for (const auto& cf : column_families_not_found) { - list_of_not_found += ", " + cf.second; - } - list_of_not_found = list_of_not_found.substr(2); - s = Status::InvalidArgument( - "You have to open all column families. Column families not opened: " + - list_of_not_found); - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto* builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } + VersionEditHandler handler(read_only, column_families, + const_cast(this), + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer_); + handler.Iterate(reader, &log_read_status); + s = handler.status(); + if (s.ok()) { + log_number = handler.GetVersionEditParams().log_number_; + current_manifest_file_size = reader.GetReadOffset(); + assert(current_manifest_file_size != 0); + handler.GetDbId(db_id); } } if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - if (read_only) { - cfd->table_cache()->SetTablesAreImmortal(); - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - // unlimited table cache. Pre-load table handle now. - // Need to do it out of the mutex. 
- s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - if (!s.ok()) { - if (db_options_->paranoid_checks) { - return s; - } - s = Status::OK(); - } - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(v->storage_info()); - - // Install recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } - manifest_file_size_ = current_manifest_file_size; - next_file_number_.store(version_edit_params.next_file_number_ + 1); - last_allocated_sequence_ = version_edit_params.last_sequence_; - last_published_sequence_ = version_edit_params.last_sequence_; - last_sequence_ = version_edit_params.last_sequence_; - prev_log_number_ = version_edit_params.prev_log_number_; - ROCKS_LOG_INFO( db_options_->info_log, "Recovered from manifest file:%s succeeded," @@ -4563,9 +4933,8 @@ ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), - last_sequence_.load(), version_edit_params.log_number_, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { @@ -4581,10 +4950,152 @@ return s; } +namespace { +class ManifestPicker { + public: + explicit ManifestPicker(const std::string& dbname, + const std::vector& files_in_dbname); + // REQUIRES Valid() == true + std::string GetNextManifest(uint64_t* file_number, std::string* file_name); + bool Valid() const { return manifest_file_iter_ != 
manifest_files_.end(); } + + private: + const std::string& dbname_; + // MANIFEST file names(s) + std::vector manifest_files_; + std::vector::const_iterator manifest_file_iter_; +}; + +ManifestPicker::ManifestPicker(const std::string& dbname, + const std::vector& files_in_dbname) + : dbname_(dbname) { + // populate manifest files + assert(!files_in_dbname.empty()); + for (const auto& fname : files_in_dbname) { + uint64_t file_num = 0; + FileType file_type; + bool parse_ok = ParseFileName(fname, &file_num, &file_type); + if (parse_ok && file_type == kDescriptorFile) { + manifest_files_.push_back(fname); + } + } + // seek to first manifest + std::sort(manifest_files_.begin(), manifest_files_.end(), + [](const std::string& lhs, const std::string& rhs) { + uint64_t num1 = 0; + uint64_t num2 = 0; + FileType type1; + FileType type2; + bool parse_ok1 = ParseFileName(lhs, &num1, &type1); + bool parse_ok2 = ParseFileName(rhs, &num2, &type2); +#ifndef NDEBUG + assert(parse_ok1); + assert(parse_ok2); +#else + (void)parse_ok1; + (void)parse_ok2; +#endif + return num1 > num2; + }); + manifest_file_iter_ = manifest_files_.begin(); +} + +std::string ManifestPicker::GetNextManifest(uint64_t* number, + std::string* file_name) { + assert(Valid()); + std::string ret; + if (manifest_file_iter_ != manifest_files_.end()) { + ret.assign(dbname_); + if (ret.back() != kFilePathSeparator) { + ret.push_back(kFilePathSeparator); + } + ret.append(*manifest_file_iter_); + if (number) { + FileType type; + bool parse = ParseFileName(*manifest_file_iter_, number, &type); + assert(type == kDescriptorFile); +#ifndef NDEBUG + assert(parse); +#else + (void)parse; +#endif + } + if (file_name) { + *file_name = *manifest_file_iter_; + } + ++manifest_file_iter_; + } + return ret; +} +} // namespace + +Status VersionSet::TryRecover( + const std::vector& column_families, bool read_only, + const std::vector& files_in_dbname, std::string* db_id, + bool* has_missing_table_file) { + ManifestPicker 
manifest_picker(dbname_, files_in_dbname); + if (!manifest_picker.Valid()) { + return Status::Corruption("Cannot locate MANIFEST file in " + dbname_); + } + Status s; + std::string manifest_path = + manifest_picker.GetNextManifest(&manifest_file_number_, nullptr); + while (!manifest_path.empty()) { + s = TryRecoverFromOneManifest(manifest_path, column_families, read_only, + db_id, has_missing_table_file); + if (s.ok() || !manifest_picker.Valid()) { + break; + } + Reset(); + manifest_path = + manifest_picker.GetNextManifest(&manifest_file_number_, nullptr); + } + return s; +} + +Status VersionSet::TryRecoverFromOneManifest( + const std::string& manifest_path, + const std::vector& column_families, bool read_only, + std::string* db_id, bool* has_missing_table_file) { + ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n", + manifest_path.c_str()); + std::unique_ptr manifest_file_reader; + Status s; + { + std::unique_ptr manifest_file; + s = fs_->NewSequentialFile(manifest_path, + fs_->OptimizeForManifestRead(file_options_), + &manifest_file, nullptr); + if (!s.ok()) { + return s; + } + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + } + + assert(s.ok()); + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, + /*checksum=*/true, /*log_num=*/0); + VersionEditHandlerPointInTime handler_pit( + read_only, column_families, const_cast(this), io_tracer_); + + handler_pit.Iterate(reader, &s); + + handler_pit.GetDbId(db_id); + + assert(nullptr != has_missing_table_file); + *has_missing_table_file = handler_pit.HasMissingFiles(); + + return handler_pit.status(); +} + Status VersionSet::ListColumnFamilies(std::vector* column_families, const std::string& dbname, FileSystem* fs) { - // these are just for performance reasons, not correcntes, + // these are 
just for performance reasons, not correctness, // so we're fine using the defaults FileOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file @@ -4603,51 +5114,27 @@ if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), manifest_path)); + file_reader.reset(new SequentialFileReader(std::move(file), manifest_path, + nullptr /*IOTracer*/)); } - std::map column_family_names; - // default column family is always implicitly there - column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.is_column_family_add_) { - if (column_family_names.find(edit.column_family_) != - column_family_names.end()) { - s = Status::Corruption("Manifest adding the same column family twice"); - break; - } - column_family_names.insert( - {edit.column_family_, edit.column_family_name_}); - } else if (edit.is_column_family_drop_) { - if (column_family_names.find(edit.column_family_) == - column_family_names.end()) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - column_family_names.erase(edit.column_family_); - } - } + ListColumnFamiliesHandler handler; + handler.Iterate(reader, &s); + + assert(column_families); column_families->clear(); - if (s.ok()) { - for (const auto& iter : column_family_names) { + if (handler.status().ok()) { + for (const auto& iter : handler.GetColumnFamilyNames()) { column_families->push_back(iter.second); } } - return s; + return handler.status(); } #ifndef ROCKSDB_LITE @@ -4667,7 +5154,8 @@ WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); VersionSet 
versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, - /*block_cache_tracer=*/nullptr); + nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, + /*db_session_id*/ ""); Status status; std::vector dummy; @@ -4720,7 +5208,19 @@ } if (first_nonempty_level > 0) { - new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level); + auto& new_last_level = new_files_list[new_levels - 1]; + + new_last_level = vstorage->LevelFiles(first_nonempty_level); + + for (size_t i = 0; i < new_last_level.size(); ++i) { + const FileMetaData* const meta = new_last_level[i]; + assert(meta); + + const uint64_t file_number = meta->fd.GetNumber(); + + vstorage->file_locations_[file_number] = + VersionStorageInfo::FileLocation(new_levels - 1, i); + } } delete[] vstorage -> files_; @@ -4737,14 +5237,16 @@ } // Get the checksum information including the checksum and checksum function -// name of all SST files in VersionSet. Store the information in +// name of all SST and blob files in VersionSet. Store the information in // FileChecksumList which contains a map from file number to its checksum info. // If DB is not running, make sure call VersionSet::Recover() to load the file // metadata from Manifest to VersionSet before calling this function. Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { // Clean the previously stored checksum information if any. 
+ Status s; if (checksum_list == nullptr) { - return Status::InvalidArgument("checksum_list is nullptr"); + s = Status::InvalidArgument("checksum_list is nullptr"); + return s; } checksum_list->reset(); @@ -4752,16 +5254,45 @@ if (cfd->IsDropped() || !cfd->initialized()) { continue; } + /* SST files */ for (int level = 0; level < cfd->NumberLevels(); level++) { for (const auto& file : cfd->current()->storage_info()->LevelFiles(level)) { - checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), - file->file_checksum, - file->file_checksum_func_name); + s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), + file->file_checksum, + file->file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + } + + /* Blob files */ + const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + + assert(meta); + assert(blob_file_number == meta->GetBlobFileNumber()); + + std::string checksum_value = meta->GetChecksumValue(); + std::string checksum_method = meta->GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (meta->GetChecksumMethod().empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + + s = checksum_list->InsertOneFileChecksum(blob_file_number, checksum_value, + checksum_method); + if (!s.ok()) { + return s; } } } - return Status::OK(); + + return s; } Status VersionSet::DumpManifest(Options& options, std::string& dscname, @@ -4771,205 +5302,31 @@ Status s; { std::unique_ptr file; - s = options.file_system->NewSequentialFile( + const std::shared_ptr& fs = options.env->GetFileSystem(); + s = fs->NewSequentialFile( dscname, - options.file_system->OptimizeForManifestRead(file_options_), &file, + fs->OptimizeForManifestRead(file_options_), &file, nullptr); if (!s.ok()) { return s; } file_reader.reset(new SequentialFileReader( - 
std::move(file), dscname, db_options_->log_readahead_size)); + std::move(file), dscname, db_options_->log_readahead_size, io_tracer_)); } - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t previous_log_number = 0; - int count = 0; - std::unordered_map comparators; - std::unordered_map> - builders; - - // add default column family - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - + std::vector column_families( + 1, ColumnFamilyDescriptor(kDefaultColumnFamilyName, options)); + DumpManifestHandler handler(column_families, this, io_tracer_, verbose, hex, + json); { VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Write out each individual edit - if (verbose && !json) { - printf("%s\n", edit.DebugString(hex).c_str()); - } else if (json) { - printf("%s\n", edit.DebugJSON(count, hex).c_str()); - } - count++; - - bool cf_in_builders = - builders.find(edit.column_family_) != builders.end(); - - if (edit.has_comparator_) { - comparators.insert({edit.column_family_, edit.comparator_}); - } - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders) { - s = Status::Corruption( - "Manifest adding the same column family twice"); - break; - } - cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); - cfd->set_initialized(); - 
builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } else if (edit.is_column_family_drop_) { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - auto builder_iter = builders.find(edit.column_family_); - builders.erase(builder_iter); - comparators.erase(edit.column_family_); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - cfd->UnrefAndTryDelete(); - cfd = nullptr; - } else { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest record referencing unknown column family"); - break; - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - break; - } - } - - if (cfd != nullptr && edit.has_log_number_) { - cfd->SetLogNumber(edit.log_number_); - } - - - if (edit.has_prev_log_number_) { - previous_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - - if (edit.has_max_column_family_) { - column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); - } - - if (edit.has_min_log_number_to_keep_) { - MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_); - } - } - } - file_reader.reset(); - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - printf("no meta-nextfile entry in descriptor"); - } else if 
(!have_last_sequence) { - printf("no last-sequence-number entry in descriptor"); - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!have_prev_log_number) { - previous_log_number = 0; - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(v->storage_info()); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false); - - printf("--------------- Column family \"%s\" (ID %" PRIu32 - ") --------------\n", - cfd->GetName().c_str(), cfd->GetID()); - printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); - auto comparator = comparators.find(cfd->GetID()); - if (comparator != comparators.end()) { - printf("comparator: %s\n", comparator->second.c_str()); - } else { - printf("comparator: \n"); - } - printf("%s \n", v->DebugString(hex).c_str()); - delete v; - } - - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; - - printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 - " prev_log_number %" PRIu64 " max_column_family %" PRIu32 - " min_log_number_to_keep " - "%" PRIu64 "\n", - next_file_number_.load(), last_sequence, previous_log_number, - column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + handler.Iterate(reader, &s); } - return s; + return handler.status(); } #endif // ROCKSDB_LITE @@ -4982,15 +5339,15 @@ } // Called only either from ::LogAndApply which is protected by mutex or during // recovery which is single-threaded. 
-void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { - if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) { - min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed); +void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { + if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) { + min_log_number_to_keep_.store(number, std::memory_order_relaxed); } } Status VersionSet::WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log) { + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { // TODO: Break up into multiple records to reduce memory usage on recovery? // WARNING: This method doesn't hold a mutex!! @@ -4999,6 +5356,7 @@ // LogAndApply. Column family manipulations can only happen within LogAndApply // (the same single thread), so we're safe to iterate. + assert(io_s.ok()); if (db_options_->write_dbid_to_manifest) { VersionEdit edit_for_db_id; assert(!db_id_.empty()); @@ -5008,13 +5366,30 @@ return Status::Corruption("Unable to Encode VersionEdit:" + edit_for_db_id.DebugString(true)); } - Status add_record = log->AddRecord(db_id_record); - if (!add_record.ok()) { - return add_record; + io_s = log->AddRecord(db_id_record); + if (!io_s.ok()) { + return io_s; + } + } + + // Save WALs. 
+ if (!wal_additions.GetWalAdditions().empty()) { + TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal", + const_cast(&wal_additions)); + std::string record; + if (!wal_additions.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_additions.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; } } for (auto cfd : *column_family_set_) { + assert(cfd); + if (cfd->IsDropped()) { continue; } @@ -5035,9 +5410,9 @@ return Status::Corruption( "Unable to Encode VersionEdit:" + edit.DebugString(true)); } - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; } } @@ -5046,29 +5421,69 @@ VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); + assert(cfd->current()); + assert(cfd->current()->storage_info()); + for (int level = 0; level < cfd->NumberLevels(); level++) { for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) { - edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, - f->file_checksum, f->file_checksum_func_name); + edit.AddFile( + level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), + f->smallest, f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, + f->file_checksum_func_name, f->min_timestamp, f->max_timestamp); } } + + const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + + assert(meta); + assert(blob_file_number == meta->GetBlobFileNumber()); + + 
edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(), + meta->GetTotalBlobBytes(), meta->GetChecksumMethod(), + meta->GetChecksumValue()); + if (meta->GetGarbageBlobCount() > 0) { + edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(), + meta->GetGarbageBlobBytes()); + } + } + const auto iter = curr_state.find(cfd->GetID()); assert(iter != curr_state.end()); uint64_t log_number = iter->second.log_number; edit.SetLogNumber(log_number); + + if (cfd->GetID() == 0) { + // min_log_number_to_keep is for the whole db, not for specific column family. + // So it does not need to be set for every column family, just need to be set once. + // Since default CF can never be dropped, we set the min_log to the default CF here. + uint64_t min_log = min_log_number_to_keep(); + if (min_log != 0) { + edit.SetMinLogNumberToKeep(min_log); + } + } + + const std::string& full_history_ts_low = iter->second.full_history_ts_low; + if (!full_history_ts_low.empty()) { + edit.SetFullHistoryTsLow(full_history_ts_low); + } + + edit.SetLastSequence(descriptor_last_sequence_); + std::string record; if (!edit.EncodeTo(&record)) { return Status::Corruption( "Unable to Encode VersionEdit:" + edit.DebugString(true)); } - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; } } } @@ -5193,7 +5608,8 @@ static_cast(total_full_size * margin)) { total_full_size += total_intersecting_size / 2; } else { - // Estimate for all the first files, at each level + // Estimate for all the first files (might also be last files), at each + // level for (const auto file_ptr : first_files) { total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); } @@ -5230,7 +5646,7 @@ if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( key, f.file_metadata->fd, caller, icmp, - v->GetMutableCFOptions().prefix_extractor.get()); + v->GetMutableCFOptions().prefix_extractor); } } return result; @@ 
-5270,64 +5686,82 @@ } return table_cache->ApproximateSize( start, end, f.file_metadata->fd, caller, icmp, - v->GetMutableCFOptions().prefix_extractor.get()); + v->GetMutableCFOptions().prefix_extractor); } -void VersionSet::AddLiveFiles(std::vector* live_list) { +void VersionSet::AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const { + assert(live_table_files); + assert(live_blob_files); + // pre-calculate space requirement - int64_t total_files = 0; + size_t total_table_files = 0; + size_t total_blob_files = 0; + + assert(column_family_set_); for (auto cfd : *column_family_set_) { + assert(cfd); + if (!cfd->initialized()) { continue; } - Version* dummy_versions = cfd->dummy_versions(); + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + assert(v); + const auto* vstorage = v->storage_info(); - for (int level = 0; level < vstorage->num_levels(); level++) { - total_files += vstorage->LevelFiles(level).size(); + assert(vstorage); + + for (int level = 0; level < vstorage->num_levels(); ++level) { + total_table_files += vstorage->LevelFiles(level).size(); } + + total_blob_files += vstorage->GetBlobFiles().size(); } } // just one time extension to the right size - live_list->reserve(live_list->size() + static_cast(total_files)); + live_table_files->reserve(live_table_files->size() + total_table_files); + live_blob_files->reserve(live_blob_files->size() + total_blob_files); + assert(column_family_set_); for (auto cfd : *column_family_set_) { + assert(cfd); if (!cfd->initialized()) { continue; } + auto* current = cfd->current(); bool found_current = false; - Version* dummy_versions = cfd->dummy_versions(); + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - v->AddLiveFiles(live_list); + 
v->AddLiveFiles(live_table_files, live_blob_files); if (v == current) { found_current = true; } } + if (!found_current && current != nullptr) { // Should never happen unless it is a bug. assert(false); - current->AddLiveFiles(live_list); + current->AddLiveFiles(live_table_files, live_blob_files); } } } InternalIterator* VersionSet::MakeInputIterator( - const Compaction* c, RangeDelAggregator* range_del_agg, + const ReadOptions& read_options, const Compaction* c, + RangeDelAggregator* range_del_agg, const FileOptions& file_options_compactions) { auto cfd = c->column_family_data(); - ReadOptions read_options; - read_options.verify_checksums = true; - read_options.fill_cache = false; - // Compaction iterators shouldn't be confined to a single prefix. - // Compactions use Seek() for - // (a) concurrent compactions, - // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. - read_options.total_order_seek = true; - // Level-0 files have to be merged together. For other levels, // we will make a concatenating iterator per level. 
// TODO(opt): use concatenating iterator for level-0 if there is no overlap @@ -5343,26 +5777,28 @@ for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( read_options, file_options_compactions, - cfd->internal_comparator(), - *flevel->files[i].file_metadata, range_del_agg, - c->mutable_cf_options()->prefix_extractor.get(), + cfd->internal_comparator(), *flevel->files[i].file_metadata, + range_del_agg, c->mutable_cf_options()->prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, /*arena=*/nullptr, - /*skip_filters=*/false, /*level=*/static_cast(which), + /*skip_filters=*/false, + /*level=*/static_cast(c->level(which)), + MaxFileSizeForL0MetaPin(*c->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); } } else { // Create concatenating iterator for the files from this level list[num++] = new LevelIterator( cfd->table_cache(), read_options, file_options_compactions, cfd->internal_comparator(), c->input_levels(which), - c->mutable_cf_options()->prefix_extractor.get(), + c->mutable_cf_options()->prefix_extractor, /*should_sample=*/false, /*no per level latency histogram=*/nullptr, TableReaderCaller::kCompaction, /*skip_filters=*/false, - /*level=*/static_cast(which), range_del_agg, + /*level=*/static_cast(c->level(which)), range_del_agg, c->boundaries(which)); } } @@ -5375,57 +5811,6 @@ return result; } -// verify that the files listed in this compaction are present -// in the current version -bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { -#ifndef NDEBUG - Version* version = c->column_family_data()->current(); - const VersionStorageInfo* vstorage = version->storage_info(); - if (c->input_version() != version) { - ROCKS_LOG_INFO( - db_options_->info_log, - "[%s] compaction output being applied to a different base version from" - " input 
version", - c->column_family_data()->GetName().c_str()); - - if (vstorage->compaction_style_ == kCompactionStyleLevel && - c->start_level() == 0 && c->num_input_levels() > 2U) { - // We are doing a L0->base_level compaction. The assumption is if - // base level is not L1, levels from L1 to base_level - 1 is empty. - // This is ensured by having one compaction from L0 going on at the - // same time in level-based compaction. So that during the time, no - // compaction/flush can put files to those levels. - for (int l = c->start_level() + 1; l < c->output_level(); l++) { - if (vstorage->NumLevelFiles(l) != 0) { - return false; - } - } - } - } - - for (size_t input = 0; input < c->num_input_levels(); ++input) { - int level = c->level(input); - for (size_t i = 0; i < c->num_input_files(input); ++i) { - uint64_t number = c->input(input, i)->fd.GetNumber(); - bool found = false; - for (size_t j = 0; j < vstorage->files_[level].size(); j++) { - FileMetaData* f = vstorage->files_[level][j]; - if (f->fd.GetNumber() == number) { - found = true; - break; - } - } - if (!found) { - return false; // input files non existent in current version - } - } - } -#else - (void)c; -#endif - return true; // everything good -} - Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, FileMetaData** meta, ColumnFamilyData** cfd) { @@ -5483,6 +5868,9 @@ filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; filemetadata.file_checksum = file->file_checksum; filemetadata.file_checksum_func_name = file->file_checksum_func_name; + filemetadata.temperature = file->temperature; + filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); + filemetadata.file_creation_time = file->TryGetFileCreationTime(); metadata->push_back(filemetadata); } } @@ -5490,28 +5878,46 @@ } void VersionSet::GetObsoleteFiles(std::vector* files, + std::vector* blob_files, std::vector* manifest_filenames, uint64_t min_pending_output) { + assert(files); + assert(blob_files); + 
assert(manifest_filenames); + assert(files->empty()); + assert(blob_files->empty()); assert(manifest_filenames->empty()); - obsolete_manifests_.swap(*manifest_filenames); + std::vector pending_files; for (auto& f : obsolete_files_) { if (f.metadata->fd.GetNumber() < min_pending_output) { - files->push_back(std::move(f)); + files->emplace_back(std::move(f)); } else { - pending_files.push_back(std::move(f)); + pending_files.emplace_back(std::move(f)); } } obsolete_files_.swap(pending_files); + + std::vector pending_blob_files; + for (auto& blob_file : obsolete_blob_files_) { + if (blob_file.GetBlobFileNumber() < min_pending_output) { + blob_files->emplace_back(std::move(blob_file)); + } else { + pending_blob_files.emplace_back(std::move(blob_file)); + } + } + obsolete_blob_files_.swap(pending_blob_files); + + obsolete_manifests_.swap(*manifest_filenames); } ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& cf_options, VersionEdit* edit) { + const ColumnFamilyOptions& cf_options, const VersionEdit* edit) { assert(edit->is_column_family_add_); MutableCFOptions dummy_cf_options; Version* dummy_versions = - new Version(nullptr, this, file_options_, dummy_cf_options); + new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_); // Ref() dummy version once so that later we can call Unref() to delete it // by avoiding calling "delete" explicitly (~Version is private) dummy_versions->Ref(); @@ -5520,7 +5926,7 @@ cf_options); Version* v = new Version(new_cfd, this, file_options_, - *new_cfd->GetLatestMutableCFOptions(), + *new_cfd->GetLatestMutableCFOptions(), io_tracer_, current_version_number_++); // Fill level target base information. 
@@ -5561,16 +5967,46 @@ return total_files_size; } -ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, - const ImmutableDBOptions* _db_options, - const FileOptions& _file_options, - Cache* table_cache, - WriteBufferManager* write_buffer_manager, - WriteController* write_controller) +uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { + std::unordered_set unique_blob_files; + uint64_t all_v_blob_file_size = 0; + for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + // iterate all the versions + auto* vstorage = v->storage_info(); + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& pair : blob_files) { + if (unique_blob_files.find(pair.first) == unique_blob_files.end()) { + // find Blob file that has not been counted + unique_blob_files.insert(pair.first); + const auto& meta = pair.second; + all_v_blob_file_size += meta->GetBlobFileSize(); + } + } + } + return all_v_blob_file_size; +} + +Status VersionSet::VerifyFileMetadata(const std::string& fpath, + const FileMetaData& meta) const { + uint64_t fsize = 0; + Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr); + if (status.ok()) { + if (fsize != meta.fd.GetFileSize()) { + status = Status::Corruption("File size mismatch: " + fpath); + } + } + return status; +} + +ReactiveVersionSet::ReactiveVersionSet( + const std::string& dbname, const ImmutableDBOptions* _db_options, + const FileOptions& _file_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, WriteController* write_controller, + const std::shared_ptr& io_tracer) : VersionSet(dbname, _db_options, _file_options, table_cache, write_buffer_manager, write_controller, - /*block_cache_tracer=*/nullptr), - number_of_edits_to_skip_(0) {} + /*block_cache_tracer=*/nullptr, io_tracer, + /*db_session_id*/ "") {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -5583,423 +6019,124 @@ assert(manifest_reporter != nullptr); assert(manifest_reader_status != nullptr); - 
std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.insert({cf.name, cf.options}); - } - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. - default_cfd->set_initialized(); - std::unordered_map> - builders; - std::unordered_map column_families_not_found; - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - manifest_reader_status->reset(new Status()); manifest_reporter->reset(new LogReporter()); - static_cast(manifest_reporter->get())->status = + static_cast_with_check(manifest_reporter->get())->status = manifest_reader_status->get(); Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); - log::Reader* reader = manifest_reader->get(); - - int retry = 0; - VersionEdit version_edit; - while (s.ok() && retry < 1) { - assert(reader != nullptr); - Slice record; - std::string scratch; - s = ReadAndRecover(reader, &read_buffer_, cf_name_to_options, - column_families_not_found, builders, &version_edit); - if (s.ok()) { - bool enough = version_edit.has_next_file_number_ && - version_edit.has_log_number_ && - version_edit.has_last_sequence_; - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - if (cfd == nullptr) { - enough = false; - break; - } - } - } - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - 
assert(cfd != nullptr); - if (!cfd->IsDropped()) { - auto builder_iter = builders.find(cfd->GetID()); - assert(builder_iter != builders.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - if (!s.ok()) { - enough = false; - if (s.IsPathNotFound()) { - s = Status::OK(); - } - break; - } - } - } - } - if (enough) { - break; - } - } - ++retry; + if (!s.ok()) { + return s; } + log::Reader* reader = manifest_reader->get(); + assert(reader); - if (s.ok()) { - if (!version_edit.has_prev_log_number_) { - version_edit.prev_log_number_ = 0; - } - column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_); - - MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit.prev_log_number_); - MarkFileNumberUsed(version_edit.log_number_); + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_)); - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } + manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->version_builder(); - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(v->storage_info()); - - // Install 
recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } - next_file_number_.store(version_edit.next_file_number_ + 1); - last_allocated_sequence_ = version_edit.last_sequence_; - last_published_sequence_ = version_edit.last_sequence_; - last_sequence_ = version_edit.last_sequence_; - prev_log_number_ = version_edit.prev_log_number_; - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - ROCKS_LOG_INFO(db_options_->info_log, - "Column family [%s] (ID %u), log number is %" PRIu64 "\n", - cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); - } - } - return s; + return manifest_tailer_->status(); } Status ReactiveVersionSet::ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed) { assert(manifest_reader != nullptr); assert(cfds_changed != nullptr); mu->AssertHeld(); Status s; - uint64_t applied_edits = 0; - while (s.ok()) { - Slice record; - std::string scratch; - log::Reader* reader = manifest_reader->get(); - std::string old_manifest_path = reader->file()->file_name(); - while (reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Skip the first VersionEdits of each MANIFEST generated by - // VersionSet::WriteCurrentStatetoManifest. - if (number_of_edits_to_skip_ > 0) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - if (cfd != nullptr && !cfd->IsDropped()) { - --number_of_edits_to_skip_; - } - continue; - } - - s = read_buffer_.AddEdit(&edit); - if (!s.ok()) { - break; - } - VersionEdit temp_edit; - if (edit.is_in_atomic_group_) { - if (read_buffer_.IsFull()) { - // Apply edits in an atomic group when we have read all edits in the - // group. 
- for (auto& e : read_buffer_.replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit); - if (!s.ok()) { - break; - } - applied_edits++; - } - if (!s.ok()) { - break; - } - read_buffer_.Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit); - if (s.ok()) { - applied_edits++; - } - } - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer_.Clear(); - } - // It's possible that: - // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. - // 2) we have finished reading the current MANIFEST. - // 3) we have encountered an IOError reading the current MANIFEST. - // We need to look for the next MANIFEST and start from there. If we cannot - // find the next MANIFEST, we should exit the loop. - s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); - reader = manifest_reader->get(); - if (s.ok()) { - if (reader->file()->file_name() == old_manifest_path) { - // Still processing the same MANIFEST, thus no need to continue this - // loop since no record is available if we have reached here. - break; - } else { - // We have switched to a new MANIFEST whose first records have been - // generated by VersionSet::WriteCurrentStatetoManifest. Since the - // secondary instance has already finished recovering upon start, there - // is no need for the secondary to process these records. Actually, if - // the secondary were to replay these records, the secondary may end up - // adding the same SST files AGAIN to each column family, causing - // consistency checks done by VersionBuilder to fail. Therefore, we - // record the number of records to skip at the beginning of the new - // MANIFEST and ignore them. 
- number_of_edits_to_skip_ = 0; - for (auto* cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - // Increase number_of_edits_to_skip by 2 because - // WriteCurrentStatetoManifest() writes 2 version edits for each - // column family at the beginning of the newly-generated MANIFEST. - // TODO(yanqin) remove hard-coded value. - if (db_options_->write_dbid_to_manifest) { - number_of_edits_to_skip_ += 3; - } else { - number_of_edits_to_skip_ += 2; - } - } - } - } + log::Reader* reader = manifest_reader->get(); + assert(reader); + s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); + if (!s.ok()) { + return s; } - + manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status); + s = manifest_tailer_->status(); if (s.ok()) { - for (auto cfd : *column_family_set_) { - auto builder_iter = active_version_builders_.find(cfd->GetID()); - if (builder_iter == active_version_builders_.end()) { - continue; - } - auto builder = builder_iter->second->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } + *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies()); } - TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", - &applied_edits); + return s; } -Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, std::unordered_set* cfds_changed, - VersionEdit* version_edit) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. It is also possible that the secondary instance opens only a subset - // of column families. Ignore it for now. 
- if (nullptr == cfd) { - return Status::OK(); +Status ReactiveVersionSet::MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader) { + assert(manifest_reader != nullptr); + Status s; + std::string manifest_path; + s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, + &manifest_file_number_); + if (!s.ok()) { + return s; } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end() && - !cfd->IsDropped()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - - if (edit.is_column_family_add_) { - // TODO (yanqin) for now the secondary ignores column families created - // after Open. This also simplifies handling of switching to a new MANIFEST - // and processing the snapshot of the system at the beginning of the + std::unique_ptr manifest_file; + if (manifest_reader->get() != nullptr && + manifest_reader->get()->file()->file_name() == manifest_path) { + // CURRENT points to the same MANIFEST as before, no need to switch // MANIFEST. - } else if (edit.is_column_family_drop_) { - // Drop the column family by setting it to be 'dropped' without destroying - // the column family handle. - // TODO (haoyu) figure out how to handle column faimly drop for - // secondary instance. (Is it possible that the ref count for cfd is 0 but - // the ref count for its versions is higher than 0?) 
- cfd->SetDropped(); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } - active_version_builders_.erase(builder_iter); - } else { - Status s = builder->Apply(&edit); - if (!s.ok()) { - return s; - } - } - Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit); - if (!s.ok()) { return s; } - - if (cfd != nullptr && !cfd->IsDropped()) { - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" - "AfterLoadTableHandlers", - &s); - - if (s.ok()) { - auto version = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); - } - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } - // Some other error has occurred during LoadTableHandlers. 
- } - - if (version_edit->HasNextFile()) { - next_file_number_.store(version_edit->next_file_number_ + 1); - } - if (version_edit->has_last_sequence_) { - last_allocated_sequence_ = version_edit->last_sequence_; - last_published_sequence_ = version_edit->last_sequence_; - last_sequence_ = version_edit->last_sequence_; - } - if (version_edit->has_prev_log_number_) { - prev_log_number_ = version_edit->prev_log_number_; - MarkFileNumberUsed(version_edit->prev_log_number_); + assert(nullptr == manifest_reader->get() || + manifest_reader->get()->file()->file_name() != manifest_path); + s = fs_->FileExists(manifest_path, IOOptions(), nullptr); + if (s.IsNotFound()) { + return Status::TryAgain( + "The primary may have switched to a new MANIFEST and deleted the old " + "one."); + } else if (!s.ok()) { + return s; } - if (version_edit->has_log_number_) { - MarkFileNumberUsed(version_edit->log_number_); + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:0"); + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:1"); + // The primary can also delete the MANIFEST while the secondary is reading + // it. This is OK on POSIX. For other file systems, maybe create a hard link + // to MANIFEST. The hard link should be cleaned up later by the secondary. 
+ s = fs_->NewSequentialFile(manifest_path, + fs_->OptimizeForManifestRead(file_options_), + &manifest_file, nullptr); + std::unique_ptr manifest_file_reader; + if (s.ok()) { + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + manifest_reader->reset(new log::FragmentBufferedReader( + nullptr, std::move(manifest_file_reader), reporter, true /* checksum */, + 0 /* log_number */)); + ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", + manifest_path.c_str()); + if (manifest_tailer_) { + manifest_tailer_->PrepareToReadNewManifest(); + } + } else if (s.IsPathNotFound()) { + // This can happen if the primary switches to a new MANIFEST after the + // secondary reads the CURRENT file but before the secondary actually tries + // to open the MANIFEST. + s = Status::TryAgain( + "The primary may have switched to a new MANIFEST and deleted the old " + "one."); } - column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_); - MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_); return s; } -Status ReactiveVersionSet::MaybeSwitchManifest( - log::Reader::Reporter* reporter, - std::unique_ptr* manifest_reader) { - assert(manifest_reader != nullptr); - Status s; - do { - std::string manifest_path; - s = GetCurrentManifestPath(dbname_, fs_, &manifest_path, - &manifest_file_number_); - std::unique_ptr manifest_file; - if (s.ok()) { - if (nullptr == manifest_reader->get() || - manifest_reader->get()->file()->file_name() != manifest_path) { - TEST_SYNC_POINT( - "ReactiveVersionSet::MaybeSwitchManifest:" - "AfterGetCurrentManifestPath:0"); - TEST_SYNC_POINT( - "ReactiveVersionSet::MaybeSwitchManifest:" - "AfterGetCurrentManifestPath:1"); - s = fs_->NewSequentialFile(manifest_path, - env_->OptimizeForManifestRead(file_options_), - &manifest_file, nullptr); - } else { - // No need to switch manifest. 
- break; - } - } - std::unique_ptr manifest_file_reader; - if (s.ok()) { - manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path, - db_options_->log_readahead_size)); - manifest_reader->reset(new log::FragmentBufferedReader( - nullptr, std::move(manifest_file_reader), reporter, - true /* checksum */, 0 /* log_number */)); - ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", - manifest_path.c_str()); - // TODO (yanqin) every time we switch to a new MANIFEST, we clear the - // active_version_builders_ map because we choose to construct the - // versions from scratch, thanks to the first part of each MANIFEST - // written by VersionSet::WriteCurrentStatetoManifest. This is not - // necessary, but we choose this at present for the sake of simplicity. - active_version_builders_.clear(); - } - } while (s.IsPathNotFound()); - return s; +#ifndef NDEBUG +uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group(); +} +#endif // !NDEBUG + +std::vector& ReactiveVersionSet::replay_buffer() { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().replay_buffer(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,8 +11,9 @@ // newest version is called "current". Older versions may be kept // around to provide a consistent view to live iterators. // -// Each Version keeps track of a set of Table files per level. The -// entire set of versions is maintained in a VersionSet. +// Each Version keeps track of a set of table files per level, as well as a +// set of blob files. 
The entire set of versions is maintained in a +// VersionSet. // // Version,VersionSet are thread-compatible, but require external // synchronization on all accesses. @@ -25,9 +26,12 @@ #include #include #include +#include #include #include +#include "cache/cache_helpers.h" +#include "db/blob/blob_file_meta.h" #include "db/column_family.h" #include "db/compaction/compaction.h" #include "db/compaction/compaction_picker.h" @@ -40,6 +44,7 @@ #include "db/version_builder.h" #include "db/version_edit.h" #include "db/write_controller.h" +#include "env/file_system_tracer.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -55,6 +60,7 @@ class Writer; } +class BlobIndex; class Compaction; class LogBuffer; class LookupKey; @@ -65,6 +71,8 @@ class MergeContext; class ColumnFamilySet; class MergeIteratorBuilder; +class SystemClock; +class ManifestTailer; // VersionEdit is always supposed to be valid and it is used to point at // entries in Manifest. Ideally it should not be used as a container to @@ -102,7 +110,7 @@ // Information of the storage associated with each Version, including number of // levels of LSM tree, files information at each level, files marked for -// compaction, etc. +// compaction, blob files, etc. class VersionStorageInfo { public: VersionStorageInfo(const InternalKeyComparator* internal_comparator, @@ -117,7 +125,9 @@ void Reserve(int level, size_t size) { files_[level].reserve(size); } - void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr); + void AddFile(int level, FileMetaData* f); + + void AddBlobFile(std::shared_ptr blob_file_meta); void SetFinalized(); @@ -140,7 +150,7 @@ // We use compaction scores to figure out which compaction to do next // REQUIRES: db_mutex held!! // TODO find a better way to pass compaction_options_fifo. 
- void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options, + void ComputeCompactionScore(const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options); // Estimate est_comp_needed_bytes_ @@ -153,13 +163,13 @@ // This computes ttl_expired_files_ and is called by // ComputeCompactionScore() - void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions, + void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions, const uint64_t ttl); // This computes files_marked_for_periodic_compaction_ and is called by // ComputeCompactionScore() void ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds); // This computes bottommost_files_marked_for_compaction_ and is called by @@ -174,12 +184,21 @@ // REQUIRES: DB mutex held void ComputeBottommostFilesMarkedForCompaction(); + // This computes files_marked_for_forced_blob_gc_ and is called by + // ComputeCompactionScore() + // + // REQUIRES: DB mutex held + void ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold); + // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); // Sort all files for this version based on their file size and // record results in files_by_compaction_pri_. The largest files are listed // first. 
- void UpdateFilesByCompactionPri(CompactionPri compaction_pri); + void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options); void GenerateLevel0NonOverlapping(); bool level0_non_overlapping() const { @@ -279,6 +298,75 @@ return files_[level]; } + class FileLocation { + public: + FileLocation() = default; + FileLocation(int level, size_t position) + : level_(level), position_(position) {} + + int GetLevel() const { return level_; } + size_t GetPosition() const { return position_; } + + bool IsValid() const { return level_ >= 0; } + + bool operator==(const FileLocation& rhs) const { + return level_ == rhs.level_ && position_ == rhs.position_; + } + + bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); } + + static FileLocation Invalid() { return FileLocation(); } + + private: + int level_ = -1; + size_t position_ = 0; + }; + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + FileLocation GetFileLocation(uint64_t file_number) const { + const auto it = file_locations_.find(file_number); + + if (it == file_locations_.end()) { + return FileLocation::Invalid(); + } + + assert(it->second.GetLevel() < num_levels_); + assert(it->second.GetPosition() < files_[it->second.GetLevel()].size()); + assert(files_[it->second.GetLevel()][it->second.GetPosition()]); + assert(files_[it->second.GetLevel()][it->second.GetPosition()] + ->fd.GetNumber() == file_number); + + return it->second; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const { + auto location = GetFileLocation(file_number); + + if (!location.IsValid()) { + return nullptr; + } + + return files_[location.GetLevel()][location.GetPosition()]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + using BlobFiles = std::map>; + const BlobFiles& GetBlobFiles() const { return blob_files_; } + + uint64_t 
GetTotalBlobFileSize() const { + uint64_t total_blob_bytes = 0; + + for (const auto& pair : blob_files_) { + const auto& meta = pair.second; + assert(meta); + + total_blob_bytes += meta->GetBlobFileSize(); + } + + return total_blob_bytes; + } + const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const { assert(level < static_cast(level_files_brief_.size())); return level_files_brief_[level]; @@ -325,6 +413,14 @@ return bottommost_files_marked_for_compaction_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& FilesMarkedForForcedBlobGC() + const { + assert(finalized_); + return files_marked_for_forced_blob_gc_; + } + int base_level() const { return base_level_; } double level_multiplier() const { return level_multiplier_; } @@ -368,7 +464,7 @@ // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); + uint64_t MaxNextLevelOverlappingBytes(); // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; @@ -395,7 +491,7 @@ next_file_to_compact_by_size_[level] = 0; } - const InternalKeyComparator* InternalComparator() { + const InternalKeyComparator* InternalComparator() const { return internal_comparator_; } @@ -403,7 +499,7 @@ uint64_t MaxBytesForLevel(int level) const; // Must be called after any change to MutableCFOptions. - void CalculateBaseBytes(const ImmutableCFOptions& ioptions, + void CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options); // Returns an estimate of the amount of live data in bytes. @@ -453,6 +549,14 @@ // in increasing order of keys std::vector* files_; + // Map of all table files in version. Maps file number to (level, position on + // level). + using FileLocations = std::unordered_map; + FileLocations file_locations_; + + // Map of blob files in version by number. 
+ BlobFiles blob_files_; + // Level that L0 data should be compacted to. All levels < base_level_ should // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; @@ -499,6 +603,8 @@ autovector> bottommost_files_marked_for_compaction_; + autovector> files_marked_for_forced_blob_gc_; + // Threshold for needing to mark another bottommost file. Maintain it so we // can quickly check when releasing a snapshot whether more bottommost files // became eligible for compaction. It's defined as the min of the max nonzero @@ -553,20 +659,28 @@ }; using MultiGetRange = MultiGetContext::Range; -// A column family's version consists of the SST files owned by the column -// family at a certain point in time. +// A column family's version consists of the table and blob files owned by +// the column family at a certain point in time. class Version { public: // Append to *iters a sequence of iterators that will // yield the contents of this Version when merged together. - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const FileOptions& soptions, + // @param read_options Must outlive any iterator built by + // `merger_iter_builder`. + // REQUIRES: This version has been saved (see VersionSet::SaveTo). + void AddIterators(const ReadOptions& read_options, + const FileOptions& soptions, MergeIteratorBuilder* merger_iter_builder, - RangeDelAggregator* range_del_agg); + RangeDelAggregator* range_del_agg, + bool allow_unprepared_value); - void AddIteratorsForLevel(const ReadOptions&, const FileOptions& soptions, + // @param read_options Must outlive any iterator built by + // `merger_iter_builder`. 
+ void AddIteratorsForLevel(const ReadOptions& read_options, + const FileOptions& soptions, MergeIteratorBuilder* merger_iter_builder, - int level, RangeDelAggregator* range_del_agg); + int level, RangeDelAggregator* range_del_agg, + bool allow_unprepared_value); Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&, const Slice& smallest_user_key, @@ -594,15 +708,39 @@ // If the key has any merge operands then store them in // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held + // REQUIRES: pinned_iters_mgr != nullptr void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, - Status* status, MergeContext* merge_context, + std::string* timestamp, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, - ReadCallback* callback = nullptr, bool* is_blob = nullptr); + ReadCallback* callback = nullptr); + + // Interprets blob_index_slice as a blob reference, and (assuming the + // corresponding blob file is part of this Version) retrieves the blob and + // saves it in *value. + // REQUIRES: blob_index_slice stores an encoded blob reference + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + // Retrieves a blob using a blob reference and saves it in *value, + // assuming the corresponding blob file is part of this Version. 
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + using BlobReadRequest = + std::pair>; + using BlobReadRequests = std::vector; + void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range, + std::unordered_map& blob_rqs); // Loads some stats information from files. Call without mutex held. It needs // to be called before applying the version to the version set. @@ -616,8 +754,10 @@ // and return true. Otherwise, return false. bool Unref(); - // Add all files listed in the current version to *live. - void AddLiveFiles(std::vector* live); + // Add all files listed in the current version to *live_table_files and + // *live_blob_files. + void AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const; // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false, bool print_stats = false) const; @@ -662,14 +802,13 @@ ColumnFamilyData* cfd() const { return cfd_; } - // Return the next Version in the linked list. Used for debug only - Version* TEST_Next() const { - return next_; - } + // Return the next Version in the linked list. 
+ Version* Next() const { return next_; } int TEST_refs() const { return refs_; } VersionStorageInfo* storage_info() { return &storage_info_; } + const VersionStorageInfo* storage_info() const { return &storage_info_; } VersionSet* version_set() { return vset_; } @@ -685,9 +824,12 @@ private: Env* env_; - FileSystem* fs_; + SystemClock* clock_; + friend class ReactiveVersionSet; friend class VersionSet; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; const InternalKeyComparator* internal_comparator() const { return storage_info_.internal_comparator_; @@ -696,10 +838,6 @@ return storage_info_.user_comparator_; } - bool PrefixMayMatch(const ReadOptions& read_options, - InternalIterator* level_iter, - const Slice& internal_prefix) const; - // Returns true if the filter blocks in the specified level will not be // checked during read operations. In certain cases (trivial move or preload), // the filter block may already be cached, but we still do not access it such @@ -715,15 +853,11 @@ // This accumulated stats will be used in compaction. void UpdateAccumulatedStats(bool update_stats); - // Sort all files for this version based on their file size and - // record results in files_by_compaction_pri_. The largest files are listed - // first. - void UpdateFilesByCompactionPri(); - ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs Logger* info_log_; Statistics* db_statistics_; TableCache* table_cache_; + BlobFileCache* blob_file_cache_; const MergeOperator* merge_operator_; VersionStorageInfo storage_info_; @@ -733,13 +867,18 @@ int refs_; // Number of live refs to this version const FileOptions file_options_; const MutableCFOptions mutable_cf_options_; + // Cached value to avoid recomputing it on every read. + const size_t max_file_size_for_l0_meta_pin_; // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. 
uint64_t version_number_; + std::shared_ptr io_tracer_; Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, - MutableCFOptions mutable_cf_options, uint64_t version_number = 0); + MutableCFOptions mutable_cf_options, + const std::shared_ptr& io_tracer, + uint64_t version_number = 0); ~Version(); @@ -778,10 +917,24 @@ } }; +class ObsoleteBlobFileInfo { + public: + ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path) + : blob_file_number_(blob_file_number), path_(std::move(path)) {} + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + const std::string& GetPath() const { return path_; } + + private: + uint64_t blob_file_number_; + std::string path_; +}; + class BaseReferencedVersionBuilder; class AtomicGroupReadBuffer { public: + AtomicGroupReadBuffer() = default; Status AddEdit(VersionEdit* edit); void Clear(); bool IsFull() const; @@ -806,13 +959,26 @@ const FileOptions& file_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer); + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); // No copying allowed VersionSet(const VersionSet&) = delete; void operator=(const VersionSet&) = delete; virtual ~VersionSet(); + Status LogAndApplyToDefaultColumnFamily( + VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr) { + ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); + const MutableCFOptions* cf_options = + default_cf->GetLatestMutableCFOptions(); + return LogAndApply(default_cf, *cf_options, edit, mu, db_directory, + new_descriptor_log, column_family_options); + } + // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new // current version. 
Will release *mu while actually writing to the file. @@ -822,7 +988,7 @@ Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, VersionEdit* edit, - InstrumentedMutex* mu, Directory* db_directory = nullptr, + InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { autovector cfds; @@ -842,8 +1008,9 @@ ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, const autovector& edit_list, InstrumentedMutex* mu, - Directory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = nullptr) { + FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr, + const std::function& manifest_wcb = {}) { autovector cfds; cfds.emplace_back(column_family_data); autovector mutable_cf_options_list; @@ -851,7 +1018,8 @@ autovector> edit_lists; edit_lists.emplace_back(edit_list); return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - db_directory, new_descriptor_log, column_family_options); + db_directory, new_descriptor_log, column_family_options, + {manifest_wcb}); } // The across-multi-cf batch version. 
If edit_lists contain more than @@ -861,14 +1029,17 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector>& edit_lists, - InstrumentedMutex* mu, Directory* db_directory = nullptr, + InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* new_cf_options = nullptr); + const ColumnFamilyOptions* new_cf_options = nullptr, + const std::vector>& manifest_wcbs = + {}); static Status GetCurrentManifestPath(const std::string& dbname, FileSystem* fs, std::string* manifest_filename, uint64_t* manifest_file_number); + void WakeUpWaitingManifestWriters(); // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families @@ -876,6 +1047,18 @@ Status Recover(const std::vector& column_families, bool read_only = false, std::string* db_id = nullptr); + Status TryRecover(const std::vector& column_families, + bool read_only, + const std::vector& files_in_dbname, + std::string* db_id, bool* has_missing_table_file); + + // Try to recover the version set to the most recent consistent state + // recorded in the specified manifest. + Status TryRecoverFromOneManifest( + const std::string& manifest_path, + const std::vector& column_families, + bool read_only, std::string* db_id, bool* has_missing_table_file); + // Reads a manifest file and returns a list of column families in // column_families. 
static Status ListColumnFamilies(std::vector* column_families, @@ -905,6 +1088,8 @@ #endif // ROCKSDB_LITE + const std::string& DbSessionId() const { return db_session_id_; } + // Return the current manifest file number uint64_t manifest_file_number() const { return manifest_file_number_; } @@ -916,8 +1101,8 @@ uint64_t current_next_file_number() const { return next_file_number_.load(); } - uint64_t min_log_number_to_keep_2pc() const { - return min_log_number_to_keep_2pc_.load(); + uint64_t min_log_number_to_keep() const { + return min_log_number_to_keep_.load(); } // Allocate and return a new file number @@ -975,7 +1160,7 @@ // Mark the specified log number as deleted // REQUIRED: this is only called during single-threaded recovery or repair, or // from ::LogAndApply where the global mutex is held. - void MarkMinLogNumberToKeep2PC(uint64_t number); + void MarkMinLogNumberToKeep(uint64_t number); // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. @@ -984,15 +1169,35 @@ // Returns the minimum log number which still has data not flushed to any SST // file. // In non-2PC mode, all the log numbers smaller than this number can be safely - // deleted. + // deleted, although we still use `min_log_number_to_keep_` to determine when + // to delete a WAL file. uint64_t MinLogNumberWithUnflushedData() const { return PreComputeMinLogNumberWithUnflushedData(nullptr); } + + // Returns the minimum log number which still has data not flushed to any SST + // file. + // Empty column families' log number is considered to be + // new_log_number_for_empty_cf. + uint64_t PreComputeMinLogNumberWithUnflushedData( + uint64_t new_log_number_for_empty_cf) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + uint64_t num = + cfd->IsEmpty() ? 
new_log_number_for_empty_cf : cfd->GetLogNumber(); + if (min_log_num > num && !cfd->IsDropped()) { + min_log_num = num; + } + } + return min_log_num; + } // Returns the minimum log number which still has data not flushed to any SST // file, except data from `cfd_to_skip`. uint64_t PreComputeMinLogNumberWithUnflushedData( const ColumnFamilyData* cfd_to_skip) const { - uint64_t min_log_num = std::numeric_limits::max(); + uint64_t min_log_num = port::kMaxUint64; for (auto cfd : *column_family_set_) { if (cfd == cfd_to_skip) { continue; @@ -1005,15 +1210,36 @@ } return min_log_num; } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfds_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const std::unordered_set& cfds_to_skip) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + if (cfds_to_skip.count(cfd)) { + continue; + } + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. + // @param read_options Must outlive the returned iterator. InternalIterator* MakeInputIterator( - const Compaction* c, RangeDelAggregator* range_del_agg, + const ReadOptions& read_options, const Compaction* c, + RangeDelAggregator* range_del_agg, const FileOptions& file_options_compactions); - // Add all files listed in any live version to *live. - void AddLiveFiles(std::vector* live_list); + // Add all files listed in any live version to *live_table_files and + // *live_blob_files. Note that these lists may contain duplicates. 
+ void AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const; // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). If end_level == -1 it will search @@ -1026,23 +1252,30 @@ // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } - // verify that the files that we started with for a compaction - // still exist in the current version and in the same original level. - // This ensures that a concurrent compaction did not erroneously - // pick the same files to compact. - bool VerifyCompactionFileConsistency(Compaction* c); - Status GetMetadataForFile(uint64_t number, int* filelevel, FileMetaData** metadata, ColumnFamilyData** cfd); // This function doesn't support leveldb SST filenames void GetLiveFilesMetaData(std::vector *metadata); + void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) { + assert(table_cache_); + + table_cache_->Erase(GetSlice(&blob_file_number)); + + obsolete_blob_files_.emplace_back(blob_file_number, std::move(path)); + } + void GetObsoleteFiles(std::vector* files, + std::vector* blob_files, std::vector* manifest_filenames, uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + RefedColumnFamilySet GetRefedColumnFamilySet() { + return RefedColumnFamilySet(GetColumnFamilySet()); + } + const FileOptions& file_options() { return file_options_; } void ChangeFileOptions(const MutableDBOptions& new_options) { file_options_.writable_file_max_buffer_size = @@ -1055,20 +1288,51 @@ static uint64_t GetTotalSstFilesSize(Version* dummy_versions); + static uint64_t GetTotalBlobFileSize(Version* dummy_versions); + + // Get the IO Status returned by written Manifest. + const IOStatus& io_status() const { return io_status_; } + + // The returned WalSet needs to be accessed with DB mutex held. 
+ const WalSet& GetWalSet() const { return wals_; } + + void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) { + assert(cfd); + + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + Version* const version = + new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); + + constexpr bool update_stats = false; + version->PrepareApply(mutable_cf_options, update_stats); + AppendVersion(cfd, version); + } + protected: + using VersionBuilderMap = + std::unordered_map>; + struct ManifestWriter; friend class Version; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class DBImpl; friend class DBImplReadOnly; struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t /*bytes*/, const Status& s) override { - if (this->status->ok()) *this->status = s; + if (status->ok()) { + *status = s; + } } }; + void Reset(); + // Returns approximated offset of a key in a file for a given version. 
uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, const Slice& key, TableReaderCaller caller); @@ -1081,59 +1345,52 @@ struct MutableCFState { uint64_t log_number; + std::string full_history_ts_low; + + explicit MutableCFState() = default; + explicit MutableCFState(uint64_t _log_number, std::string ts_low) + : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {} }; // Save current contents to *log Status WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log); + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s); void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, - VersionEdit* edit); + const VersionEdit* edit); - Status ReadAndRecover( - log::Reader* reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& - name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - VersionEditParams* version_edit, std::string* db_id = nullptr); + Status VerifyFileMetadata(const std::string& fpath, + const FileMetaData& meta) const; - // REQUIRES db mutex - Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_opts, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - VersionEditParams* version_edit); - - Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, - const VersionEdit& from_edit, - VersionEditParams* version_edit_params); + // Protected by DB mutex. 
+ WalSet wals_; std::unique_ptr column_family_set_; - + Cache* table_cache_; Env* const env_; - FileSystem* const fs_; + FileSystemPtr const fs_; + SystemClock* const clock_; const std::string dbname_; std::string db_id_; const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; - // Any log number equal or lower than this should be ignored during recovery, - // and is qualified for being deleted in 2PC mode. In non-2PC mode, this - // number is ignored. - std::atomic min_log_number_to_keep_2pc_ = {0}; + // Any WAL number smaller than this should be ignored during recovery, + // and is qualified for being deleted. + std::atomic min_log_number_to_keep_ = {0}; uint64_t manifest_file_number_; uint64_t options_file_number_; + uint64_t options_file_size_; uint64_t pending_manifest_file_number_; // The last seq visible to reads. It normally indicates the last sequence in // the memtable but when using two write queues it could also indicate the // last sequence in the WAL visible to reads. std::atomic last_sequence_; + // The last sequence number of data committed to the descriptor (manifest + // file). + SequenceNumber descriptor_last_sequence_ = 0; // The last seq that is already allocated. It is applicable only when we have // two write queues. In that case seq might or might not have appreated in // memtable but it is expected to appear in the WAL. @@ -1160,6 +1417,7 @@ uint64_t manifest_file_size_; std::vector obsolete_files_; + std::vector obsolete_blob_files_; std::vector obsolete_manifests_; // env options for all reads and writes except compactions @@ -1167,16 +1425,25 @@ BlockCacheTracer* const block_cache_tracer_; + // Store the IO status when Manifest is written + IOStatus io_status_; + + std::shared_ptr io_tracer_; + + std::string db_session_id_; + private: // REQUIRES db mutex at beginning. 
may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, - InstrumentedMutex* mu, Directory* db_directory, + InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options); - void LogAndApplyCFHelper(VersionEdit* edit); + void LogAndApplyCFHelper(VersionEdit* edit, + SequenceNumber* max_last_sequence); Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, - VersionEdit* edit, InstrumentedMutex* mu); + VersionEdit* edit, SequenceNumber* max_last_sequence, + InstrumentedMutex* mu); }; // ReactiveVersionSet represents a collection of versions of the column @@ -1189,30 +1456,28 @@ const ImmutableDBOptions* _db_options, const FileOptions& _file_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + const std::shared_ptr& io_tracer); ~ReactiveVersionSet() override; Status ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed); Status Recover(const std::vector& column_families, std::unique_ptr* manifest_reader, std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); +#ifndef NDEBUG + uint64_t TEST_read_edits_in_atomic_group() const; +#endif //! 
NDEBUG - uint64_t TEST_read_edits_in_atomic_group() const { - return read_buffer_.TEST_read_edits_in_atomic_group(); - } - std::vector& replay_buffer() { - return read_buffer_.replay_buffer(); - } + std::vector& replay_buffer(); protected: - using VersionSet::ApplyOneVersionEditToBuilder; - // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, std::unordered_set* cfds_changed, @@ -1223,12 +1488,7 @@ std::unique_ptr* manifest_reader); private: - std::unordered_map> - active_version_builders_; - AtomicGroupReadBuffer read_buffer_; - // Number of version edits to skip by ReadAndApply at the beginning of a new - // MANIFEST created by primary. - int number_of_edits_to_skip_; + std::unique_ptr manifest_tailer_; using VersionSet::LogAndApply; using VersionSet::Recover; @@ -1237,9 +1497,10 @@ const autovector& /*cfds*/, const autovector& /*mutable_cf_options_list*/, const autovector>& /*edit_lists*/, - InstrumentedMutex* /*mu*/, Directory* /*db_directory*/, - bool /*new_descriptor_log*/, - const ColumnFamilyOptions* /*new_cf_option*/) override { + InstrumentedMutex* /*mu*/, FSDirectory* /*db_directory*/, + bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, + const std::vector>& /*manifest_wcbs*/) + override { return Status::NotSupported("not supported in reactive mode"); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,9 +8,15 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_set.h" + +#include + #include "db/db_impl/db_impl.h" #include "db/log_writer.h" -#include "logging/logging.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -39,9 +45,11 @@ files_.size() + 1, 0, 0, InternalKey(smallest, smallest_seq, kTypeValue), InternalKey(largest, largest_seq, kTypeValue), smallest_seq, - largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + largest_seq, /* marked_for_compact */ false, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); files_.push_back(f); } @@ -95,13 +103,13 @@ return opt; } -class VersionStorageInfoTest : public testing::Test { +class VersionStorageInfoTestBase : public testing::Test { public: const Comparator* ucmp_; InternalKeyComparator icmp_; std::shared_ptr logger_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; @@ -110,17 +118,19 @@ return InternalKey(ukey, smallest_seq, kTypeValue); } - VersionStorageInfoTest() - : ucmp_(BytewiseComparator()), + explicit VersionStorageInfoTestBase(const Comparator* ucmp) + : ucmp_(ucmp), icmp_(ucmp_), logger_(new CountingLogger()), options_(GetOptionsWithNumLevels(6, logger_)), ioptions_(options_), mutable_cf_options_(options_), - vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, nullptr, false) {} + vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, + /*src_vstorage=*/nullptr, + /*_force_consistency_checks=*/false) {} - ~VersionStorageInfoTest() override { - for (int i = 
0; i < vstorage_.num_levels(); i++) { + ~VersionStorageInfoTestBase() override { + for (int i = 0; i < vstorage_.num_levels(); ++i) { for (auto* f : vstorage_.LevelFiles(i)) { if (--f->refs == 0) { delete f; @@ -130,31 +140,56 @@ } void Add(int level, uint32_t file_number, const char* smallest, - const char* largest, uint64_t file_size = 0) { - assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData( - file_number, 0, file_size, GetInternalKey(smallest, 0), - GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, - /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); - f->compensated_file_size = file_size; - vstorage_.AddFile(level, f); + const char* largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + constexpr SequenceNumber dummy_seq = 0; + + Add(level, file_number, GetInternalKey(smallest, dummy_seq), + GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number); } void Add(int level, uint32_t file_number, const InternalKey& smallest, - const InternalKey& largest, uint64_t file_size = 0) { + const InternalKey& largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, /* largest_seq */ 0, /* marked_for_compact */ false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, oldest_blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } + void AddBlob(uint64_t blob_file_number, uint64_t 
total_blob_count, + uint64_t total_blob_bytes, + BlobFileMetaData::LinkedSsts linked_ssts, + uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) { + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + auto meta = + BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes); + + vstorage_.AddBlobFile(std::move(meta)); + } + + void Finalize() { + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + vstorage_.UpdateFilesByCompactionPri(ioptions_, mutable_cf_options_); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); + vstorage_.GenerateLevel0NonOverlapping(); + vstorage_.GenerateBottommostFiles(); + + vstorage_.SetFinalized(); + } + std::string GetOverlappingFiles(int level, const InternalKey& begin, const InternalKey& end) { std::vector inputs; @@ -171,6 +206,13 @@ } }; +class VersionStorageInfoTest : public VersionStorageInfoTestBase { + public: + VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {} + + ~VersionStorageInfoTest() override {} +}; + TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) { ioptions_.level_compaction_dynamic_level_bytes = false; mutable_cf_options_.max_bytes_for_level_base = 10; @@ -362,19 +404,19 @@ Add(2, 3U, "6", "8", 1U); // Partial overlap with last level Add(3, 4U, "1", "9", 1U); // Contains range of last level Add(4, 5U, "4", "5", 1U); // Inside range of last level - Add(4, 5U, "6", "7", 1U); // Inside range of last level - Add(5, 6U, "4", "7", 10U); + Add(4, 6U, "6", "7", 1U); // Inside range of last level + Add(5, 7U, "4", "7", 10U); ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize()); } TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered - Add(0, 1U, "5", "6", 1U); // Ignored because 
of [5,6] in l1 - Add(1, 1U, "1", "2", 1U); // Ignored because of [2,3] in l2 - Add(1, 2U, "3", "4", 1U); // Ignored because of [2,3] in l2 - Add(1, 3U, "5", "6", 1U); - Add(2, 4U, "2", "3", 1U); - Add(3, 5U, "7", "8", 1U); + Add(0, 2U, "5", "6", 1U); // Ignored because of [5,6] in l1 + Add(1, 3U, "1", "2", 1U); // Ignored because of [2,3] in l2 + Add(1, 4U, "3", "4", 1U); // Ignored because of [2,3] in l2 + Add(1, 5U, "5", "6", 1U); + Add(2, 6U, "2", "3", 1U); + Add(3, 7U, "7", "8", 1U); ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize()); } @@ -411,6 +453,244 @@ 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); } +TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) { + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + + Add(2, 7U, "1", "2", 8000U); + + ASSERT_EQ(vstorage_.GetFileLocation(11U), + VersionStorageInfo::FileLocation(0, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(11U), nullptr); + + ASSERT_EQ(vstorage_.GetFileLocation(12U), + VersionStorageInfo::FileLocation(0, 1)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(12U), nullptr); + + ASSERT_EQ(vstorage_.GetFileLocation(7U), + VersionStorageInfo::FileLocation(2, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(7U), nullptr); + + ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid()); + ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) { + // No SST or blob files in VersionStorageInfo + Finalize(); + + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.75; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGC) { + // Add three L0 SSTs (1, 2, and 3) and four blob files (10, 11, 12, and 13). 
+ // The first two SSTs have the same oldest blob file, namely, the very oldest + // one (10), while the third SST's oldest blob file reference points to the + // third blob file (12). Thus, the oldest batch of blob files contains the + // first two blob files 10 and 11, and assuming they are eligible for GC based + // on the age cutoff, compacting away the SSTs 1 and 2 will eliminate them. + + constexpr int level = 0; + + constexpr uint64_t first_sst = 1; + constexpr uint64_t second_sst = 2; + constexpr uint64_t third_sst = 3; + + constexpr uint64_t first_blob = 10; + constexpr uint64_t second_blob = 11; + constexpr uint64_t third_blob = 12; + constexpr uint64_t fourth_blob = 13; + + { + constexpr char smallest[] = "bar1"; + constexpr char largest[] = "foo1"; + constexpr uint64_t file_size = 1000; + + Add(level, first_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar2"; + constexpr char largest[] = "foo2"; + constexpr uint64_t file_size = 2000; + + Add(level, second_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar3"; + constexpr char largest[] = "foo3"; + constexpr uint64_t file_size = 3000; + + Add(level, third_sst, smallest, largest, file_size, third_blob); + } + + { + constexpr uint64_t total_blob_count = 10; + constexpr uint64_t total_blob_bytes = 100000; + constexpr uint64_t garbage_blob_count = 2; + constexpr uint64_t garbage_blob_bytes = 15000; + + AddBlob(first_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{first_sst, second_sst}, + garbage_blob_count, garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 4; + constexpr uint64_t total_blob_bytes = 400000; + constexpr uint64_t garbage_blob_count = 3; + constexpr uint64_t garbage_blob_bytes = 235000; + + AddBlob(second_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t 
total_blob_count = 20; + constexpr uint64_t total_blob_bytes = 1000000; + constexpr uint64_t garbage_blob_count = 8; + constexpr uint64_t garbage_blob_bytes = 123456; + + AddBlob(third_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 128; + constexpr uint64_t total_blob_bytes = 789012345; + constexpr uint64_t garbage_blob_count = 67; + constexpr uint64_t garbage_blob_bytes = 88888888; + + AddBlob(fourth_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + Finalize(); + + assert(vstorage_.num_levels() > 0); + const auto& level_files = vstorage_.LevelFiles(level); + + assert(level_files.size() == 3); + assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst); + assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst); + assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst); + + // No blob files eligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.1; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Part of the oldest batch of blob files (specifically, the second file) is + // ineligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.25; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff but its overall garbage ratio + // is below threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.6; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + 
ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff and its overall garbage ratio + // meets threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.5; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); + ASSERT_EQ(ssts_to_be_compacted.size(), 2); + + std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(), + [](const std::pair& lhs, + const std::pair& rhs) { + assert(lhs.second); + assert(rhs.second); + return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber(); + }); + + const autovector> + expected_ssts_to_be_compacted{{level, level_files[0]}, + {level, level_files[1]}}; + + ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); + ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); + } +} + +class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase { + public: + VersionStorageInfoTimestampTest() + : VersionStorageInfoTestBase(test::ComparatorWithU64Ts()) {} + ~VersionStorageInfoTimestampTest() override {} + std::string Timestamp(uint64_t ts) const { + std::string ret; + PutFixed64(&ret, ts); + return ret; + } + std::string PackUserKeyAndTimestamp(const Slice& ukey, uint64_t ts) const { + std::string ret; + ret.assign(ukey.data(), ukey.size()); + PutFixed64(&ret, ts); + return ret; + } +}; + +TEST_F(VersionStorageInfoTimestampTest, GetOverlappingInputs) { + Add(/*level=*/1, /*file_number=*/1, /*smallest=*/ + {PackUserKeyAndTimestamp("a", /*ts=*/9), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("a", /*ts=*/8), /*s=*/0, kTypeValue}, + /*file_size=*/100); + Add(/*level=*/1, /*file_number=*/2, /*smallest=*/ + {PackUserKeyAndTimestamp("a", /*ts=*/5), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("b", /*ts=*/10), /*s=*/0, kTypeValue}, + /*file_size=*/100); + Add(/*level=*/1, 
/*file_number=*/3, /*smallest=*/ + {PackUserKeyAndTimestamp("c", /*ts=*/12), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("d", /*ts=*/1), /*s=*/0, kTypeValue}, + /*file_size=*/100); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateLevelFilesBrief(); + ASSERT_EQ( + "1,2", + GetOverlappingFiles( + /*level=*/1, + {PackUserKeyAndTimestamp("a", /*ts=*/12), /*s=*/0, kTypeValue}, + {PackUserKeyAndTimestamp("a", /*ts=*/11), /*s=*/0, kTypeValue})); + ASSERT_EQ("3", + GetOverlappingFiles( + /*level=*/1, + {PackUserKeyAndTimestamp("c", /*ts=*/15), /*s=*/0, kTypeValue}, + {PackUserKeyAndTimestamp("c", /*ts=*/2), /*s=*/0, kTypeValue})); +} class FindLevelFileTest : public testing::Test { public: @@ -611,40 +891,69 @@ const static std::string kColumnFamilyName3; int num_initial_edits_; - VersionSetTestBase() - : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("version_set_test")), - db_options_(), + explicit VersionSetTestBase(const std::string& name) + : env_(nullptr), + dbname_(test::PerThreadDBPath(name)), + options_(), + db_options_(options_), + cf_options_(options_), + immutable_options_(db_options_, cf_options_), mutable_cf_options_(cf_options_), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), mock_table_factory_(std::make_shared()) { - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); + if (env_ == Env::Default() && getenv("MEM_ENV")) { + env_guard_.reset(NewMemEnv(Env::Default())); + env_ = env_guard_.get(); + } + EXPECT_NE(nullptr, env_); + fs_ = env_->GetFileSystem(); + EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr)); + + options_.env = env_; db_options_.env = env_; db_options_.fs = fs_; - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - 
/*block_cache_tracer=*/nullptr)), - reactive_versions_ = std::make_shared( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_); + immutable_options_.env = env_; + immutable_options_.fs = fs_; + immutable_options_.clock = env_->GetSystemClock().get(); + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + reactive_versions_ = std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, nullptr); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); } - void PrepareManifest(std::vector* column_families, - SequenceNumber* last_seqno, - std::unique_ptr* log_writer) { + virtual ~VersionSetTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "DB is still at %s\n", dbname_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(dbname_, options)); + } + } + + protected: + virtual void PrepareManifest( + std::vector* column_families, + SequenceNumber* last_seqno, std::unique_ptr* log_writer) { assert(column_families != nullptr); assert(last_seqno != nullptr); assert(log_writer != nullptr); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); @@ -671,13 +980,13 @@ } *last_seqno = last_seq; num_initial_edits_ = static_cast(new_cfs.size() + 1); + std::unique_ptr file_writer; const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = 
env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; @@ -700,27 +1009,104 @@ // Create DB with 3 column families. void NewDB() { - std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; SetIdentityFile(env_, dbname_); - PrepareManifest(&column_families, &last_seqno, &log_writer); + PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. - Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); } + void ReopenDB() { + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families_, false)); + } + + void VerifyManifest(std::string* manifest_path) const { + assert(manifest_path != nullptr); + uint64_t manifest_file_number = 0; + Status s = versions_->GetCurrentManifestPath( + dbname_, fs_.get(), manifest_path, &manifest_file_number); + ASSERT_OK(s); + ASSERT_EQ(1, manifest_file_number); + } + + Status LogAndApplyToDefaultCF(VersionEdit& edit) { + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + return s; + } + 
+ Status LogAndApplyToDefaultCF( + const autovector>& edits) { + autovector vedits; + for (auto& e : edits) { + vedits.push_back(e.get()); + } + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, vedits, &mutex_); + mutex_.Unlock(); + return s; + } + + void CreateNewManifest() { + constexpr FSDirectory* db_directory = nullptr; + constexpr bool new_descriptor_log = true; + mutex_.Lock(); + VersionEdit dummy; + ASSERT_OK(versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + &dummy, &mutex_, db_directory, new_descriptor_log)); + mutex_.Unlock(); + } + + ColumnFamilyData* CreateColumnFamily(const std::string& cf_name, + const ColumnFamilyOptions& cf_options) { + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + new_cf.SetColumnFamily(new_id); + new_cf.SetLogNumber(0); + new_cf.SetComparatorName(cf_options.comparator->Name()); + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(/*column_family_data=*/nullptr, + MutableCFOptions(cf_options), &new_cf, &mutex_, + /*db_directory=*/nullptr, + /*new_descriptor_log=*/false, &cf_options); + mutex_.Unlock(); + EXPECT_OK(s); + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(cf_name); + EXPECT_NE(nullptr, cfd); + return cfd; + } + + Env* mem_env_; Env* env_; + std::shared_ptr env_guard_; std::shared_ptr fs_; const std::string dbname_; EnvOptions env_options_; + Options options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; + ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; std::shared_ptr table_cache_; WriteController write_controller_; @@ -730,6 +1116,7 @@ InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; + std::vector column_families_; }; const std::string VersionSetTestBase::kColumnFamilyName1 = "alice"; @@ -738,7 
+1125,7 @@ class VersionSetTest : public VersionSetTestBase, public testing::Test { public: - VersionSetTest() : VersionSetTestBase() {} + VersionSetTest() : VersionSetTestBase("version_set_test") {} }; TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { @@ -777,10 +1164,849 @@ EXPECT_EQ(kGroupSize - 1, count); } +TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) { + // Initialize the database and add a couple of blob files, one with some + // garbage in it, and one without any garbage. + NewDB(); + + assert(versions_); + assert(versions_->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t total_blob_count = 456; + constexpr uint64_t total_blob_bytes = 77777777; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c" + "\x52\x5c\xbd"; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + constexpr uint64_t garbage_blob_count = 89; + constexpr uint64_t garbage_blob_bytes = 1000000; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + } + + { + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + constexpr 
uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + } + + // Force the creation of a new manifest file and make sure metadata for + // the blob files is re-persisted. + size_t addition_encoded = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileAddition::EncodeTo::CustomFields", + [&](void* /* arg */) { ++addition_encoded; }); + + size_t garbage_encoded = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", + [&](void* /* arg */) { ++garbage_encoded; }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateNewManifest(); + + ASSERT_EQ(addition_encoded, 2); + ASSERT_EQ(garbage_encoded, 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(VersionSetTest, AddLiveBlobFiles) { + // Initialize the database and add a blob file. 
+ NewDB(); + + assert(versions_); + assert(versions_->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const first_version = cfd->current(); + assert(first_version); + + VersionStorageInfo* const first_storage_info = first_version->storage_info(); + assert(first_storage_info); + + constexpr uint64_t first_blob_file_number = 234; + constexpr uint64_t first_total_blob_count = 555; + constexpr uint64_t first_total_blob_bytes = 66666; + constexpr char first_checksum_method[] = "CRC32"; + constexpr char first_checksum_value[] = "\x3d\x87\xff\x57"; + + auto first_shared_meta = SharedBlobFileMetaData::Create( + first_blob_file_number, first_total_blob_count, first_total_blob_bytes, + first_checksum_method, first_checksum_value); + + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + auto first_meta = BlobFileMetaData::Create( + std::move(first_shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + first_storage_info->AddBlobFile(first_meta); + + // Reference the version so it stays alive even after the following version + // edit. + first_version->Ref(); + + // Get live files directly from version. + std::vector version_table_files; + std::vector version_blob_files; + + first_version->AddLiveFiles(&version_table_files, &version_blob_files); + + ASSERT_EQ(version_blob_files.size(), 1); + ASSERT_EQ(version_blob_files[0], first_blob_file_number); + + // Create a new version containing an additional blob file. 
+ versions_->TEST_CreateAndAppendVersion(cfd); + + Version* const second_version = cfd->current(); + assert(second_version); + assert(second_version != first_version); + + VersionStorageInfo* const second_storage_info = + second_version->storage_info(); + assert(second_storage_info); + + constexpr uint64_t second_blob_file_number = 456; + constexpr uint64_t second_total_blob_count = 100; + constexpr uint64_t second_total_blob_bytes = 2000000; + constexpr char second_checksum_method[] = "CRC32B"; + constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a"; + + auto second_shared_meta = SharedBlobFileMetaData::Create( + second_blob_file_number, second_total_blob_count, second_total_blob_bytes, + second_checksum_method, second_checksum_value); + + auto second_meta = BlobFileMetaData::Create( + std::move(second_shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + second_storage_info->AddBlobFile(std::move(first_meta)); + second_storage_info->AddBlobFile(std::move(second_meta)); + + // Get all live files from version set. Note that the result contains + // duplicates. + std::vector all_table_files; + std::vector all_blob_files; + + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + + ASSERT_EQ(all_blob_files.size(), 3); + ASSERT_EQ(all_blob_files[0], first_blob_file_number); + ASSERT_EQ(all_blob_files[1], first_blob_file_number); + ASSERT_EQ(all_blob_files[2], second_blob_file_number); + + // Clean up previous version. + first_version->Unref(); +} + +TEST_F(VersionSetTest, ObsoleteBlobFile) { + // Initialize the database and add a blob file that is entirely garbage + // and thus can immediately be marked obsolete. 
+ NewDB(); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes); + + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + + ASSERT_OK(s); + + // Make sure blob files from the pending number range are not returned + // as obsolete. + { + std::vector table_files; + std::vector blob_files; + std::vector manifest_files; + constexpr uint64_t min_pending_output = blob_file_number; + + versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files, + min_pending_output); + + ASSERT_TRUE(blob_files.empty()); + } + + // Make sure the blob file is returned as obsolete if it's not in the pending + // range. + { + std::vector table_files; + std::vector blob_files; + std::vector manifest_files; + constexpr uint64_t min_pending_output = blob_file_number + 1; + + versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files, + min_pending_output); + + ASSERT_EQ(blob_files.size(), 1); + ASSERT_EQ(blob_files[0].GetBlobFileNumber(), blob_file_number); + } + + // Make sure it's not returned a second time. + { + std::vector table_files; + std::vector blob_files; + std::vector manifest_files; + constexpr uint64_t min_pending_output = blob_file_number + 1; + + versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files, + min_pending_output); + + ASSERT_TRUE(blob_files.empty()); + } +} + +TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) { + NewDB(); + + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. 
+ for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. + ASSERT_EQ(versions.size(), 1); + ASSERT_EQ(versions[0], nullptr); +} + +// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit. +TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) { + NewDB(); + + const std::string kDBId = "db_db"; + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. 
+ ASSERT_EQ(versions.size(), 1); + ASSERT_NE(versions[0], nullptr); +} + +TEST_F(VersionSetTest, WalAddition) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced for several times before closing. + { + for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) { + uint64_t size = kSizeInBytes - size_delta; + WalMetadata wal(size); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size); + } + } + + // The WAL is closed. + { + WalMetadata wal(kSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Recover a new VersionSet. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalCloseWithoutSync) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced before closing. + { + WalMetadata wal(kSyncedSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } + + // A new WAL with larger log number is created, + // implicitly marking the current WAL closed. 
+ { + VersionEdit edit; + edit.AddWal(kLogNumber + 1); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize()); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalDeletion) { + NewDB(); + + constexpr WalNumber kClosedLogNumber = 10; + constexpr WalNumber kNonClosedLogNumber = 20; + constexpr uint64_t kSizeInBytes = 111; + + // Add a non-closed and a closed WAL. + { + VersionEdit edit; + edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes)); + edit.AddWal(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Delete the closed WAL. 
+ { + VersionEdit edit; + edit.DeleteWalsBefore(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Recover a new VersionSet, only the non-closed WAL should show up. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Force the creation of a new MANIFEST file, + // only the non-closed WAL should be written to the new MANIFEST. + { + std::vector wal_additions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) { + VersionEdit* edit = reinterpret_cast(arg); + ASSERT_TRUE(edit->IsWalAddition()); + for (auto& addition : edit->GetWalAdditions()) { + wal_additions.push_back(addition); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateNewManifest(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(wal_additions.size(), 1); + ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber); + ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize()); + } + + // Recover from the new MANIFEST, only the non-closed WAL should show up. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } +} + +TEST_F(VersionSetTest, WalCreateTwice) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); +} + +TEST_F(VersionSetTest, WalCreateAfterClose) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + edit.AddWal(kLogNumber); + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Create the same WAL again. + VersionEdit edit; + edit.AddWal(kLogNumber); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, AddWalWithSmallerSize) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Add the same WAL with smaller synced size. 
+ VersionEdit edit; + WalMetadata wal(kSizeInBytes / 2); + edit.AddWal(kLogNumber, wal); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + s.ToString().find( + "WAL 10 must not have smaller synced size than previous one") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { + NewDB(); + + constexpr WalNumber kLogNumber0 = 10; + constexpr WalNumber kLogNumber1 = 20; + constexpr WalNumber kNonExistingNumber = 15; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add closed WALs. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber0, wal); + edit.AddWal(kLogNumber1, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Delete WALs before a non-existing WAL. + VersionEdit edit; + edit.DeleteWalsBefore(kNonExistingNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, WAL0 is deleted, WAL1 is not. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber1) != wals.end()); + } +} + +TEST_F(VersionSetTest, DeleteAllWals) { + NewDB(); + + constexpr WalNumber kMaxLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kMaxLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + VersionEdit edit; + edit.DeleteWalsBefore(kMaxLogNumber + 10); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, all WALs are deleted. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 0); + } +} + +TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { + NewDB(); + + constexpr int kAtomicGroupSize = 7; + constexpr uint64_t kNumWals = 5; + const std::string kDBId = "db_db"; + + int remaining = kAtomicGroupSize; + autovector> edits; + // Add 5 WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + edits.back()->MarkAtomicGroup(--remaining); + } + // One edit with the min log number set. + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + edits.back()->MarkAtomicGroup(--remaining); + // Delete the first added 4 WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals); + edits.back()->MarkAtomicGroup(--remaining); + ASSERT_EQ(remaining, 0); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + // Recover a new VersionSet, the min log number and the last WAL should be + // kept. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + std::string db_id; + ASSERT_OK( + new_versions->Recover(column_families_, /*read_only=*/false, &db_id)); + + ASSERT_EQ(db_id, kDBId); + + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNumWals) != wals.end()); + ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize()); + ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals); + } +} + +class VersionSetWithTimestampTest : public VersionSetTest { + public: + static const std::string kNewCfName; + + explicit VersionSetWithTimestampTest() : VersionSetTest() {} + + void SetUp() override { + NewDB(); + Options options; + options.comparator = test::ComparatorWithU64Ts(); + cfd_ = CreateColumnFamily(kNewCfName, options); + EXPECT_NE(nullptr, cfd_); + EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions()); + column_families_.emplace_back(kNewCfName, options); + } + + void TearDown() override { + for (auto* e : edits_) { + delete e; + } + edits_.clear(); + } + + void GenVersionEditsToSetFullHistoryTsLow( + const std::vector& ts_lbs) { + for (const auto ts_lb : ts_lbs) { + VersionEdit* edit = new VersionEdit; + edit->SetColumnFamily(cfd_->GetID()); + std::string ts_str = test::EncodeInt(ts_lb); + edit->SetFullHistoryTsLow(ts_str); + edits_.emplace_back(edit); + } + } + + void VerifyFullHistoryTsLow(uint64_t expected_ts_low) { + std::unique_ptr vset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false, + /*db_id=*/nullptr)); + for (auto* cfd : *(vset->GetColumnFamilySet())) { + ASSERT_NE(nullptr, cfd); + if 
(cfd->GetName() == kNewCfName) { + ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow()); + } else { + ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty()); + } + } + } + + void DoTest(const std::vector& ts_lbs) { + if (ts_lbs.empty()) { + return; + } + + GenVersionEditsToSetFullHistoryTsLow(ts_lbs); + + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), + edits_, &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); + } + + protected: + ColumnFamilyData* cfd_{nullptr}; + // edits_ must contain and own pointers to heap-alloc VersionEdit objects. + autovector edits_; +}; + +const std::string VersionSetWithTimestampTest::kNewCfName("new_cf"); + +TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) { + constexpr uint64_t kTsLow = 100; + DoTest({kTsLow}); +} + +// Simulate the application increasing full_history_ts_low. +TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) { + const std::vector ts_lbs = {100, 101, 102, 103}; + DoTest(ts_lbs); +} + +// Simulate the application trying to decrease full_history_ts_low +// unsuccessfully. If the application calls public API sequentially to +// decrease the lower bound ts, RocksDB will return an InvalidArgument +// status before involving VersionSet. Only when multiple threads trying +// to decrease the lower bound concurrently will this case ever happen. Even +// so, the lower bound cannot be decreased. The application will be notified +// via return value of the API. 
+TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) { + const std::vector ts_lbs = {103, 102, 101, 100}; + DoTest(ts_lbs); +} + class VersionSetAtomicGroupTest : public VersionSetTestBase, public testing::Test { public: - VersionSetAtomicGroupTest() : VersionSetTestBase() {} + VersionSetAtomicGroupTest() + : VersionSetTestBase("version_set_atomic_group_test") {} void SetUp() override { PrepareManifest(&column_families_, &last_seqno_, &log_writer_); @@ -796,7 +2022,7 @@ edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { @@ -808,7 +2034,7 @@ edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupCorruptedAtomicGroup(int atomic_group_size) { @@ -822,7 +2048,7 @@ } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupIncorrectAtomicGroup(int atomic_group_size) { @@ -838,7 +2064,7 @@ } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupTestSyncPoints() { @@ -860,13 +2086,10 @@ last_in_atomic_group_ = true; }); SyncPoint::GetInstance()->SetCallBack( - "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { - num_recovered_edits_ = *reinterpret_cast(arg); + "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) { + num_recovered_edits_ = *reinterpret_cast(arg); }); SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AppliedEdits", - [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); - 
SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:AtomicGroup", [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); SyncPoint::GetInstance()->SetCallBack( @@ -904,8 +2127,7 @@ bool first_in_atomic_group_ = false; bool last_in_atomic_group_ = false; int num_edits_in_atomic_group_ = 0; - int num_recovered_edits_ = 0; - int num_applied_edits_ = 0; + size_t num_recovered_edits_ = 0; VersionEdit corrupted_edit_; VersionEdit edit_with_incorrect_group_size_; std::unique_ptr log_writer_; @@ -921,7 +2143,6 @@ EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -943,7 +2164,6 @@ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -956,20 +2176,20 @@ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, &manifest_reporter, &manifest_reader_status)); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); AddNewEditsToLog(kAtomicGroupSize); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); // The recover should clean up the replay buffer. 
EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); + EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -985,7 +2205,6 @@ EXPECT_FALSE(last_in_atomic_group_); EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1017,14 +2236,13 @@ InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); // Reactive version set should be empty now. EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1041,13 +2259,14 @@ &manifest_reader_status)); EXPECT_EQ(column_families_.size(), reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); // Write a few edits in an atomic group. 
AddNewEditsToLog(kNumberOfPersistedVersionEdits); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_FALSE(last_in_atomic_group_); @@ -1056,8 +2275,6 @@ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == kNumberOfPersistedVersionEdits); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1104,8 +2321,8 @@ // Write the corrupted edits. AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), corrupted_edit_.DebugString()); @@ -1154,8 +2371,8 @@ &manifest_reader_status)); AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[1].DebugString(), edit_with_incorrect_group_size_.DebugString()); @@ -1164,7 +2381,8 @@ class VersionSetTestDropOneCF : public VersionSetTestBase, public testing::TestWithParam { public: - VersionSetTestDropOneCF() : VersionSetTestBase() {} + VersionSetTestDropOneCF() + : VersionSetTestBase("version_set_test_drop_one_cf") {} }; // This test simulates the following execution sequence @@ -1189,7 +2407,7 @@ SequenceNumber last_seqno; std::unique_ptr log_writer; PrepareManifest(&column_families, &last_seqno, &log_writer); 
- Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); @@ -1268,10 +2486,7 @@ mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); - if (cfd_to_drop->Unref()) { - delete cfd_to_drop; - cfd_to_drop = nullptr; - } + cfd_to_drop->UnrefAndTryDelete(); } INSTANTIATE_TEST_CASE_P( @@ -1279,6 +2494,737 @@ testing::Values(VersionSetTestBase::kColumnFamilyName1, VersionSetTestBase::kColumnFamilyName2, VersionSetTestBase::kColumnFamilyName3)); + +class EmptyDefaultCfNewManifest : public VersionSetTestBase, + public testing::Test { + public: + EmptyDefaultCfNewManifest() : VersionSetTestBase("version_set_new_db_test") {} + // Emulate DBImpl::NewDB() + void PrepareManifest(std::vector* /*column_families*/, + SequenceNumber* /*last_seqno*/, + std::unique_ptr* log_writer) override { + assert(log_writer != nullptr); + VersionEdit new_db; + new_db.SetLogNumber(0); + const std::string manifest_path = DescriptorFileName(dbname_, 1); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); + ASSERT_OK(s); + log_writer->reset(new log::Writer(std::move(file_writer), 0, true)); + std::string record; + ASSERT_TRUE(new_db.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + // Create new column family + VersionEdit new_cf; + new_cf.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1); + new_cf.SetColumnFamily(1); + new_cf.SetLastSequence(2); + new_cf.SetNextFile(2); + record.clear(); + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + + protected: + bool write_dbid_to_manifest_ = false; + std::unique_ptr log_writer_; +}; + +// Create db, create column family. Cf creation will switch to a new MANIFEST. 
+// Then reopen db, trying to recover. +TEST_F(EmptyDefaultCfNewManifest, Recover) { + PrepareManifest(nullptr, nullptr, &log_writer_); + log_writer_.reset(); + Status s = + SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + std::string manifest_path; + VerifyManifest(&manifest_path); + std::vector column_families; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + column_families.emplace_back(VersionSetTestBase::kColumnFamilyName1, + cf_options_); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families, false, &db_id, &has_missing_table_file); + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); +} + +class VersionSetTestEmptyDb + : public VersionSetTestBase, + public testing::TestWithParam< + std::tuple>> { + public: + static const std::string kUnknownColumnFamilyName; + VersionSetTestEmptyDb() : VersionSetTestBase("version_set_test_empty_db") {} + + protected: + void PrepareManifest(std::vector* /*column_families*/, + SequenceNumber* /*last_seqno*/, + std::unique_ptr* log_writer) override { + assert(nullptr != log_writer); + VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } + const std::string manifest_path = DescriptorFileName(dbname_, 1); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); + ASSERT_OK(s); + { + log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); + std::string record; + new_db.EncodeTo(&record); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + } + + std::unique_ptr log_writer_; +}; + +const 
std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown"; + +TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + log_writer_.reset(); + Status s = + SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector cf_names = std::get<2>(GetParam()); + + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Only a subset of column families in the MANIFEST. 
+ VersionEdit new_cf1; + new_cf1.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1); + new_cf1.SetColumnFamily(1); + Status s; + { + std::string record; + new_cf1.EncodeTo(&record); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Write all column families but no log_number, next_file_number and + // last_sequence. 
+ const std::vector all_cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; + Status s; + for (size_t i = 1; i != all_cf_names.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(all_cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Write all column families but no log_number, next_file_number and + // last_sequence. 
+ const std::vector all_cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; + Status s; + for (size_t i = 1; i != all_cf_names.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(all_cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + { + VersionEdit tmp_edit; + tmp_edit.SetColumnFamily(4); + tmp_edit.SetLogNumber(0); + tmp_edit.SetNextFile(2); + tmp_edit.SetLastSequence(0); + std::string record; + ASSERT_TRUE(tmp_edit.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Write all column families but no log_number, next_file_number and + // last_sequence. 
+ const std::vector all_cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; + Status s; + for (size_t i = 1; i != all_cf_names.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(all_cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + { + VersionEdit tmp_edit; + tmp_edit.SetLogNumber(0); + tmp_edit.SetNextFile(2); + tmp_edit.SetLastSequence(0); + std::string record; + ASSERT_TRUE(tmp_edit.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else if (read_only) { + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + } else if (cf_names.size() == all_cf_names.size()) { + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + } else if (cf_names.size() < all_cf_names.size()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily( + kUnknownColumnFamilyName); + ASSERT_EQ(nullptr, cfd); + } +} + +INSTANTIATE_TEST_CASE_P( + BestEffortRecovery, VersionSetTestEmptyDb, + 
testing::Combine( + /*write_dbid_to_manifest=*/testing::Bool(), + /*read_only=*/testing::Bool(), + /*cf_names=*/ + testing::Values( + std::vector(), + std::vector({kDefaultColumnFamilyName}), + std::vector({VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3}), + std::vector({kDefaultColumnFamilyName, + VersionSetTestBase::kColumnFamilyName1}), + std::vector({kDefaultColumnFamilyName, + VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3}), + std::vector( + {kDefaultColumnFamilyName, + VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3, + VersionSetTestEmptyDb::kUnknownColumnFamilyName})))); + +class VersionSetTestMissingFiles : public VersionSetTestBase, + public testing::Test { + public: + VersionSetTestMissingFiles() + : VersionSetTestBase("version_set_test_missing_files"), + block_based_table_options_(), + table_factory_(std::make_shared( + block_based_table_options_)), + internal_comparator_( + std::make_shared(options_.comparator)) {} + + protected: + void PrepareManifest(std::vector* column_families, + SequenceNumber* last_seqno, + std::unique_ptr* log_writer) override { + assert(column_families != nullptr); + assert(last_seqno != nullptr); + assert(log_writer != nullptr); + const std::string manifest = DescriptorFileName(dbname_, 1); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); + ASSERT_OK(s); + log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); + VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); + std::string db_id; + 
impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } + { + std::string record; + ASSERT_TRUE(new_db.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + const std::vector cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; // default cf id is 0 + cf_options_.table_factory = table_factory_; + for (const auto& cf_name : cf_names) { + column_families->emplace_back(cf_name, cf_options_); + if (cf_name == kDefaultColumnFamilyName) { + continue; + } + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_name); + new_cf.SetColumnFamily(cf_id); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + + VersionEdit cf_files; + cf_files.SetColumnFamily(cf_id); + cf_files.SetLogNumber(0); + record.clear(); + ASSERT_TRUE(cf_files.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + ++cf_id; + } + SequenceNumber seq = 2; + { + VersionEdit edit; + edit.SetNextFile(7); + edit.SetLastSequence(seq); + std::string record; + ASSERT_TRUE(edit.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + *last_seqno = seq + 1; + } + + struct SstInfo { + uint64_t file_number; + std::string column_family; + std::string key; // the only key + int level = 0; + SstInfo(uint64_t file_num, const std::string& cf_name, + const std::string& _key) + : SstInfo(file_num, cf_name, _key, 0) {} + SstInfo(uint64_t file_num, const std::string& cf_name, + const std::string& _key, int lvl) + : file_number(file_num), + column_family(cf_name), + key(_key), + level(lvl) {} + }; + + // Create dummy sst, return their metadata. Note that only file name and size + // are used. 
+ void CreateDummyTableFiles(const std::vector& file_infos, + std::vector* file_metas) { + assert(file_metas != nullptr); + for (const auto& info : file_infos) { + uint64_t file_num = info.file_number; + std::string fname = MakeTableFileName(dbname_, file_num); + std::unique_ptr file; + Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr); + ASSERT_OK(s); + std::unique_ptr fwriter(new WritableFileWriter( + std::move(file), fname, FileOptions(), env_->GetSystemClock().get())); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(table_factory_->NewTableBuilder( + TableBuilderOptions( + immutable_options_, mutable_cf_options_, *internal_comparator_, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + info.column_family, info.level), + fwriter.get())); + InternalKey ikey(info.key, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), "value"); + ASSERT_OK(builder->Finish()); + fwriter->Flush(); + uint64_t file_size = 0; + s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); + ASSERT_OK(s); + ASSERT_NE(0, file_size); + file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, + ikey, 0, 0, false, Temperature::kUnknown, 0, 0, + 0, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + } + } + + // This method updates last_sequence_. 
+ void WriteFileAdditionAndDeletionToManifest( + uint32_t cf, const std::vector>& added_files, + const std::vector>& deleted_files) { + VersionEdit edit; + edit.SetColumnFamily(cf); + for (const auto& elem : added_files) { + int level = elem.first; + edit.AddFile(level, elem.second); + } + for (const auto& elem : deleted_files) { + int level = elem.first; + edit.DeleteFile(level, elem.second); + } + edit.SetLastSequence(last_seqno_); + ++last_seqno_; + assert(log_writer_.get() != nullptr); + std::string record; + ASSERT_TRUE(edit.EncodeTo(&record)); + Status s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + + BlockBasedTableOptions block_based_table_options_; + std::shared_ptr table_factory_; + std::shared_ptr internal_comparator_; + std::vector column_families_; + SequenceNumber last_seqno_; + std::unique_ptr log_writer_; +}; + +TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { + std::vector existing_files = { + SstInfo(100, kDefaultColumnFamilyName, "a"), + SstInfo(102, kDefaultColumnFamilyName, "b"), + SstInfo(103, kDefaultColumnFamilyName, "c"), + SstInfo(107, kDefaultColumnFamilyName, "d"), + SstInfo(110, kDefaultColumnFamilyName, "e")}; + std::vector file_metas; + CreateDummyTableFiles(existing_files, &file_metas); + + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + std::vector> added_files; + for (uint64_t file_num = 10; file_num < 15; ++file_num) { + std::string smallest_ukey = "a"; + std::string largest_ukey = "b"; + InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue); + InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue); + FileMetaData meta = FileMetaData( + file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, + largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + added_files.emplace_back(0, meta); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, 
std::vector>()); + std::vector> deleted_files; + deleted_files.emplace_back(0, 10); + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, std::vector>(), deleted_files); + log_writer_.reset(); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + std::string manifest_path; + VerifyManifest(&manifest_path); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, + /*read_only=*/false, &db_id, + &has_missing_table_file); + ASSERT_OK(s); + ASSERT_TRUE(has_missing_table_file); + for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + const std::vector& files = vstorage->LevelFiles(0); + ASSERT_TRUE(files.empty()); + } +} + +TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { + std::vector existing_files = { + SstInfo(100, kDefaultColumnFamilyName, "a"), + SstInfo(102, kDefaultColumnFamilyName, "b"), + SstInfo(103, kDefaultColumnFamilyName, "c"), + SstInfo(107, kDefaultColumnFamilyName, "d"), + SstInfo(110, kDefaultColumnFamilyName, "e")}; + std::vector file_metas; + CreateDummyTableFiles(existing_files, &file_metas); + + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + std::vector> added_files; + for (size_t i = 3; i != 5; ++i) { + added_files.emplace_back(0, file_metas[i]); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, std::vector>()); + + added_files.clear(); + for (uint64_t file_num = 120; file_num < 130; ++file_num) { + std::string smallest_ukey = "a"; + std::string largest_ukey = "b"; + InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue); + InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue); + FileMetaData meta = FileMetaData( + file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, + largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + 
kDisableUserTimestamp, kDisableUserTimestamp); + added_files.emplace_back(0, meta); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, std::vector>()); + log_writer_.reset(); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + std::string manifest_path; + VerifyManifest(&manifest_path); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, + /*read_only=*/false, &db_id, + &has_missing_table_file); + ASSERT_OK(s); + ASSERT_TRUE(has_missing_table_file); + for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + const std::vector& files = vstorage->LevelFiles(0); + if (cfd->GetName() == kDefaultColumnFamilyName) { + ASSERT_EQ(2, files.size()); + for (const auto* fmeta : files) { + if (fmeta->fd.GetNumber() != 107 && fmeta->fd.GetNumber() != 110) { + ASSERT_FALSE(true); + } + } + } else { + ASSERT_TRUE(files.empty()); + } + } +} + +TEST_F(VersionSetTestMissingFiles, NoFileMissing) { + std::vector existing_files = { + SstInfo(100, kDefaultColumnFamilyName, "a"), + SstInfo(102, kDefaultColumnFamilyName, "b"), + SstInfo(103, kDefaultColumnFamilyName, "c"), + SstInfo(107, kDefaultColumnFamilyName, "d"), + SstInfo(110, kDefaultColumnFamilyName, "e")}; + std::vector file_metas; + CreateDummyTableFiles(existing_files, &file_metas); + + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + std::vector> added_files; + for (const auto& meta : file_metas) { + added_files.emplace_back(0, meta); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, std::vector>()); + std::vector> deleted_files; + deleted_files.emplace_back(/*level=*/0, 100); + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, std::vector>(), deleted_files); + log_writer_.reset(); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + std::string 
manifest_path; + VerifyManifest(&manifest_path); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, + /*read_only=*/false, &db_id, + &has_missing_table_file); + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + const std::vector& files = vstorage->LevelFiles(0); + if (cfd->GetName() == kDefaultColumnFamilyName) { + ASSERT_EQ(existing_files.size() - deleted_files.size(), files.size()); + bool has_deleted_file = false; + for (const auto* fmeta : files) { + if (fmeta->fd.GetNumber() == 100) { + has_deleted_file = true; + break; + } + } + ASSERT_FALSE(has_deleted_file); + } else { + ASSERT_TRUE(files.empty()); + } + } +} + +TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { + db_options_.allow_2pc = true; + NewDB(); + + SstInfo sst(100, kDefaultColumnFamilyName, "a"); + std::vector file_metas; + CreateDummyTableFiles({sst}, &file_metas); + + constexpr WalNumber kMinWalNumberToKeep2PC = 10; + VersionEdit edit; + edit.AddFile(0, file_metas[0]); + edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC); + + for (int i = 0; i < 3; i++) { + CreateNewManifest(); + ReopenDB(); + ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,204 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wal_edit.h" + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +void WalAddition::EncodeTo(std::string* dst) const { + PutVarint64(dst, number_); + + if (metadata_.HasSyncedSize()) { + PutVarint32(dst, static_cast(WalAdditionTag::kSyncedSize)); + PutVarint64(dst, metadata_.GetSyncedSizeInBytes()); + } + + PutVarint32(dst, static_cast(WalAdditionTag::kTerminate)); +} + +Status WalAddition::DecodeFrom(Slice* src) { + constexpr char class_name[] = "WalAddition"; + + if (!GetVarint64(src, &number_)) { + return Status::Corruption(class_name, "Error decoding WAL log number"); + } + + while (true) { + uint32_t tag_value = 0; + if (!GetVarint32(src, &tag_value)) { + return Status::Corruption(class_name, "Error decoding tag"); + } + WalAdditionTag tag = static_cast(tag_value); + switch (tag) { + case WalAdditionTag::kSyncedSize: { + uint64_t size = 0; + if (!GetVarint64(src, &size)) { + return Status::Corruption(class_name, "Error decoding WAL file size"); + } + metadata_.SetSyncedSizeInBytes(size); + break; + } + // TODO: process future tags such as checksum. 
+ case WalAdditionTag::kTerminate: + return Status::OK(); + default: { + std::stringstream ss; + ss << "Unknown tag " << tag_value; + return Status::Corruption(class_name, ss.str()); + } + } + } +} + +JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) { + jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes" + << wal.GetMetadata().GetSyncedSizeInBytes(); + return jw; +} + +std::ostream& operator<<(std::ostream& os, const WalAddition& wal) { + os << "log_number: " << wal.GetLogNumber() + << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes(); + return os; +} + +std::string WalAddition::DebugString() const { + std::ostringstream oss; + oss << *this; + return oss.str(); +} + +void WalDeletion::EncodeTo(std::string* dst) const { + PutVarint64(dst, number_); +} + +Status WalDeletion::DecodeFrom(Slice* src) { + constexpr char class_name[] = "WalDeletion"; + + if (!GetVarint64(src, &number_)) { + return Status::Corruption(class_name, "Error decoding WAL log number"); + } + + return Status::OK(); +} + +JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal) { + jw << "LogNumber" << wal.GetLogNumber(); + return jw; +} + +std::ostream& operator<<(std::ostream& os, const WalDeletion& wal) { + os << "log_number: " << wal.GetLogNumber(); + return os; +} + +std::string WalDeletion::DebugString() const { + std::ostringstream oss; + oss << *this; + return oss.str(); +} + +Status WalSet::AddWal(const WalAddition& wal) { + if (wal.GetLogNumber() < min_wal_number_to_keep_) { + // The WAL has been obsolete, ignore it. + return Status::OK(); + } + + auto it = wals_.lower_bound(wal.GetLogNumber()); + bool existing = it != wals_.end() && it->first == wal.GetLogNumber(); + if (existing && !wal.GetMetadata().HasSyncedSize()) { + std::stringstream ss; + ss << "WAL " << wal.GetLogNumber() << " is created more than once"; + return Status::Corruption("WalSet::AddWal", ss.str()); + } + // If the WAL has synced size, it must >= the previous size. 
+ if (wal.GetMetadata().HasSyncedSize() && existing && + it->second.HasSyncedSize() && + wal.GetMetadata().GetSyncedSizeInBytes() < + it->second.GetSyncedSizeInBytes()) { + std::stringstream ss; + ss << "WAL " << wal.GetLogNumber() + << " must not have smaller synced size than previous one"; + return Status::Corruption("WalSet::AddWal", ss.str()); + } + if (existing) { + it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes()); + } else { + wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()}); + } + return Status::OK(); +} + +Status WalSet::AddWals(const WalAdditions& wals) { + Status s; + for (const WalAddition& wal : wals) { + s = AddWal(wal); + if (!s.ok()) { + break; + } + } + return s; +} + +Status WalSet::DeleteWalsBefore(WalNumber wal) { + if (wal > min_wal_number_to_keep_) { + min_wal_number_to_keep_ = wal; + wals_.erase(wals_.begin(), wals_.lower_bound(wal)); + } + return Status::OK(); +} + +void WalSet::Reset() { + wals_.clear(); + min_wal_number_to_keep_ = 0; +} + +Status WalSet::CheckWals( + Env* env, + const std::unordered_map& logs_on_disk) const { + assert(env != nullptr); + + Status s; + for (const auto& wal : wals_) { + const uint64_t log_number = wal.first; + const WalMetadata& wal_meta = wal.second; + + if (!wal_meta.HasSyncedSize()) { + // The WAL and WAL directory is not even synced, + // so the WAL's inode may not be persisted, + // then the WAL might not show up when listing WAL directory. 
+ continue; + } + + if (logs_on_disk.find(log_number) == logs_on_disk.end()) { + std::stringstream ss; + ss << "Missing WAL with log number: " << log_number << "."; + s = Status::Corruption(ss.str()); + break; + } + + uint64_t log_file_size = 0; + s = env->GetFileSize(logs_on_disk.at(log_number), &log_file_size); + if (!s.ok()) { + break; + } + if (log_file_size < wal_meta.GetSyncedSizeInBytes()) { + std::stringstream ss; + ss << "Size mismatch: WAL (log number: " << log_number + << ") in MANIFEST is " << wal_meta.GetSyncedSizeInBytes() + << " bytes , but actually is " << log_file_size << " bytes on disk."; + s = Status::Corruption(ss.str()); + break; + } + } + + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// WAL related classes used in VersionEdit and VersionSet. +// Modifications to WalAddition and WalDeletion may need to update +// VersionEdit and its related tests. + +#pragma once + +#include +#include +#include +#include +#include + +#include "logging/event_logger.h" +#include "port/port.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +using WalNumber = uint64_t; + +// Metadata of a WAL. 
+class WalMetadata { + public: + WalMetadata() = default; + + explicit WalMetadata(uint64_t synced_size_bytes) + : synced_size_bytes_(synced_size_bytes) {} + + bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; } + + void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; } + + uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; } + + private: + // The size of WAL is unknown, used when the WAL is not synced yet or is + // empty. + constexpr static uint64_t kUnknownWalSize = port::kMaxUint64; + + // Size of the most recently synced WAL in bytes. + uint64_t synced_size_bytes_ = kUnknownWalSize; +}; + +// These tags are persisted to MANIFEST, so it's part of the user API. +enum class WalAdditionTag : uint32_t { + // Indicates that there are no more tags. + kTerminate = 1, + // Synced Size in bytes. + kSyncedSize = 2, + // Add tags in the future, such as checksum? +}; + +// Records the event of adding a WAL in VersionEdit. +class WalAddition { + public: + WalAddition() : number_(0), metadata_() {} + + explicit WalAddition(WalNumber number) : number_(number), metadata_() {} + + WalAddition(WalNumber number, WalMetadata meta) + : number_(number), metadata_(std::move(meta)) {} + + WalNumber GetLogNumber() const { return number_; } + + const WalMetadata& GetMetadata() const { return metadata_; } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* src); + + std::string DebugString() const; + + private: + WalNumber number_; + WalMetadata metadata_; +}; + +std::ostream& operator<<(std::ostream& os, const WalAddition& wal); +JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal); + +using WalAdditions = std::vector; + +// Records the event of deleting WALs before the specified log number. 
+class WalDeletion { + public: + WalDeletion() : number_(kEmpty) {} + + explicit WalDeletion(WalNumber number) : number_(number) {} + + WalNumber GetLogNumber() const { return number_; } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* src); + + std::string DebugString() const; + + bool IsEmpty() const { return number_ == kEmpty; } + + void Reset() { number_ = kEmpty; } + + private: + static constexpr WalNumber kEmpty = 0; + + WalNumber number_; +}; + +std::ostream& operator<<(std::ostream& os, const WalDeletion& wal); +JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal); + +// Used in VersionSet to keep the current set of WALs. +// +// When a WAL is synced or becomes obsoleted, +// a VersionEdit is logged to MANIFEST and +// the WAL is added to or deleted from WalSet. +// +// Not thread safe, needs external synchronization such as holding DB mutex. +class WalSet { + public: + // Add WAL(s). + // If the WAL is closed, + // then there must be an existing unclosed WAL, + // otherwise, return Status::Corruption. + // Can happen when applying a VersionEdit or recovering from MANIFEST. + Status AddWal(const WalAddition& wal); + Status AddWals(const WalAdditions& wals); + + // Delete WALs with log number smaller than the specified wal number. + // Can happen when applying a VersionEdit or recovering from MANIFEST. + Status DeleteWalsBefore(WalNumber wal); + + // Resets the internal state. + void Reset(); + + // WALs with number less than MinWalNumberToKeep should not exist in WalSet. + WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; } + + const std::map& GetWals() const { return wals_; } + + // Checks whether there are missing or corrupted WALs. + // Returns Status::OK if there is no missing nor corrupted WAL, + // otherwise returns Status::Corruption. + // logs_on_disk is a map from log number to the log filename. 
+ // Note that logs_on_disk may contain logs that is obsolete but + // haven't been deleted from disk. + Status CheckWals( + Env* env, + const std::unordered_map& logs_on_disk) const; + + private: + std::map wals_; + // WAL number < min_wal_number_to_keep_ should not exist in wals_. + // It's monotonically increasing, in-memory only, not written to MANIFEST. + WalNumber min_wal_number_to_keep_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,214 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wal_edit.h" + +#include "db/db_test_util.h" +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(WalSet, AddDeleteReset) { + WalSet wals; + ASSERT_TRUE(wals.GetWals().empty()); + + // Create WAL 1 - 10. + for (WalNumber log_number = 1; log_number <= 10; log_number++) { + wals.AddWal(WalAddition(log_number)); + } + ASSERT_EQ(wals.GetWals().size(), 10); + + // Delete WAL 1 - 5. 
+ wals.DeleteWalsBefore(6); + ASSERT_EQ(wals.GetWals().size(), 5); + + WalNumber expected_log_number = 6; + for (auto it : wals.GetWals()) { + WalNumber log_number = it.first; + ASSERT_EQ(log_number, expected_log_number++); + } + + wals.Reset(); + ASSERT_TRUE(wals.GetWals().empty()); +} + +TEST(WalSet, Overwrite) { + constexpr WalNumber kNumber = 100; + constexpr uint64_t kBytes = 200; + WalSet wals; + wals.AddWal(WalAddition(kNumber)); + ASSERT_FALSE(wals.GetWals().at(kNumber).HasSyncedSize()); + wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes))); + ASSERT_TRUE(wals.GetWals().at(kNumber).HasSyncedSize()); + ASSERT_EQ(wals.GetWals().at(kNumber).GetSyncedSizeInBytes(), kBytes); +} + +TEST(WalSet, SmallerSyncedSize) { + constexpr WalNumber kNumber = 100; + constexpr uint64_t kBytes = 100; + WalSet wals; + ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes)))); + Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0))); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + s.ToString().find( + "WAL 100 must not have smaller synced size than previous one") != + std::string::npos); +} + +TEST(WalSet, CreateTwice) { + constexpr WalNumber kNumber = 100; + WalSet wals; + ASSERT_OK(wals.AddWal(WalAddition(kNumber))); + Status s = wals.AddWal(WalAddition(kNumber)); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") != + std::string::npos); +} + +TEST(WalSet, DeleteAllWals) { + constexpr WalNumber kMaxWalNumber = 10; + WalSet wals; + for (WalNumber i = 1; i <= kMaxWalNumber; i++) { + wals.AddWal(WalAddition(i)); + } + ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1)); +} + +TEST(WalSet, AddObsoleteWal) { + constexpr WalNumber kNumber = 100; + WalSet wals; + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); + ASSERT_OK(wals.AddWal(WalAddition(kNumber))); + ASSERT_TRUE(wals.GetWals().empty()); +} + +TEST(WalSet, MinWalNumberToKeep) { + constexpr WalNumber kNumber = 100; + WalSet wals; + 
ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0); + ASSERT_OK(wals.DeleteWalsBefore(kNumber)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1); +} + +class WalSetTest : public DBTestBase { + public: + WalSetTest() : DBTestBase("WalSetTest", /* env_do_fsync */ true) {} + + void SetUp() override { + test_dir_ = test::PerThreadDBPath("wal_set_test"); + ASSERT_OK(env_->CreateDir(test_dir_)); + } + + void TearDown() override { + EXPECT_OK(DestroyDir(env_, test_dir_)); + logs_on_disk_.clear(); + wals_.Reset(); + } + + void CreateWalOnDisk(WalNumber number, const std::string& fname, + uint64_t size_bytes) { + std::unique_ptr f; + std::string fpath = Path(fname); + ASSERT_OK(env_->NewWritableFile(fpath, &f, EnvOptions())); + std::string content(size_bytes, '0'); + ASSERT_OK(f->Append(content)); + ASSERT_OK(f->Close()); + + logs_on_disk_[number] = fpath; + } + + void AddWalToWalSet(WalNumber number, uint64_t size_bytes) { + // Create WAL. + ASSERT_OK(wals_.AddWal(WalAddition(number))); + // Close WAL. + WalMetadata wal(size_bytes); + ASSERT_OK(wals_.AddWal(WalAddition(number, wal))); + } + + Status CheckWals() const { return wals_.CheckWals(env_, logs_on_disk_); } + + private: + std::string test_dir_; + std::unordered_map logs_on_disk_; + WalSet wals_; + + std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; } +}; + +TEST_F(WalSetTest, CheckEmptyWals) { ASSERT_OK(CheckWals()); } + +TEST_F(WalSetTest, CheckWals) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100; + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, size); + // log 0 - 5 are obsolete. 
+ if (number > 5) { + AddWalToWalSet(number, size); + } + } + ASSERT_OK(CheckWals()); +} + +TEST_F(WalSetTest, CheckMissingWals) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100; + AddWalToWalSet(number, size); + // logs with even number are missing from disk. + if (number % 2) { + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, size); + } + } + + Status s = CheckWals(); + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + // The first log with even number is missing. + std::stringstream expected_err; + expected_err << "Missing WAL with log number: " << 2; + ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos) + << s.ToString(); +} + +TEST_F(WalSetTest, CheckWalsWithShrinkedSize) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100 + 1; + AddWalToWalSet(number, size); + // logs with even number have shrinked size. + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, (number % 2) ? size : size - 1); + } + + Status s = CheckWals(); + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + // The first log with even number has wrong size. 
+ std::stringstream expected_err; + expected_err << "Size mismatch: WAL (log number: " << 2 << ")"; + ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos) + << s.ToString(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -37,7 +37,7 @@ #ifndef ROCKSDB_LITE Status WalManager::DeleteFile(const std::string& fname, uint64_t number) { - auto s = env_->DeleteFile(db_options_.wal_dir + "/" + fname); + auto s = env_->DeleteFile(wal_dir_ + "/" + fname); if (s.ok()) { MutexLock l(&read_first_record_cache_mutex_); read_first_record_cache_.erase(number); @@ -52,7 +52,7 @@ Status s; // list wal files in main db dir. VectorLogPtr logs; - s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); + s = GetSortedWalsOfType(wal_dir_, logs, kAliveLogFile); if (!s.ok()) { return s; } @@ -65,7 +65,7 @@ files.clear(); // list wal files in archive dir. - std::string archivedir = ArchivalDirectory(db_options_.wal_dir); + std::string archivedir = ArchivalDirectory(wal_dir_); Status exists = env_->FileExists(archivedir); if (exists.ok()) { s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); @@ -120,8 +120,8 @@ return s; } iter->reset(new TransactionLogIteratorImpl( - db_options_.wal_dir, &db_options_, read_options, file_options_, seq, - std::move(wal_files), version_set, seq_per_batch_)); + wal_dir_, &db_options_, read_options, file_options_, seq, + std::move(wal_files), version_set, seq_per_batch_, io_tracer_)); return (*iter)->status(); } @@ -134,14 +134,14 @@ // b. 
get sorted non-empty archived logs // c. delete what should be deleted void WalManager::PurgeObsoleteWALFiles() { - bool const ttl_enabled = db_options_.wal_ttl_seconds > 0; - bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0; + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } - int64_t current_time; - Status s = env_->GetCurrentTime(¤t_time); + int64_t current_time = 0; + Status s = db_options_.clock->GetCurrentTime(¤t_time); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s", s.ToString().c_str()); @@ -150,7 +150,7 @@ } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) - ? db_options_.wal_ttl_seconds / 2 + ? db_options_.WAL_ttl_seconds / 2 : kDefaultIntervalToDeleteObsoleteWAL; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { @@ -159,7 +159,7 @@ purge_wal_files_last_run_ = now_seconds; - std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); + std::string archival_dir = ArchivalDirectory(wal_dir_); std::vector files; s = env_->GetChildren(archival_dir, &files); if (!s.ok()) { @@ -171,11 +171,10 @@ size_t log_files_num = 0; uint64_t log_file_size = 0; - for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = archival_dir + "/" + f; if (ttl_enabled) { uint64_t file_m_time; @@ -186,7 +185,7 @@ s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { s = DeleteDBFile(&db_options_, file_path, archival_dir, false, /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { @@ -235,17 +234,21 @@ return; } - size_t const files_keep_num = - 
static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); + size_t const files_keep_num = static_cast( + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size); if (log_files_num <= files_keep_num) { return; } size_t files_del_num = log_files_num - files_keep_num; VectorLogPtr archived_logs; - GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); - - if (files_del_num > archived_logs.size()) { + s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to get archived WALs from: %s: %s", + archival_dir.c_str(), s.ToString().c_str()); + files_del_num = 0; + } else if (files_del_num > archived_logs.size()) { ROCKS_LOG_WARN(db_options_.info_log, "Trying to delete more archived log files than " "exist. Deleting all"); @@ -254,8 +257,7 @@ for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); - s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, - db_options_.wal_dir, false, + s = DeleteDBFile(&db_options_, wal_dir_ + "/" + file_path, wal_dir_, false, /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", @@ -269,7 +271,7 @@ } void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { - auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); + auto archived_log_name = ArchivedLogFileName(wal_dir_, number); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); @@ -292,7 +294,7 @@ for (const auto& f : all_files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { SequenceNumber sequence; Status s = ReadFirstRecord(log_type, number, &sequence); if (!s.ok()) { @@ 
-334,10 +336,8 @@ std::sort( log_files.begin(), log_files.end(), [](const std::unique_ptr& a, const std::unique_ptr& b) { - LogFileImpl* a_impl = - static_cast_with_check(a.get()); - LogFileImpl* b_impl = - static_cast_with_check(b.get()); + LogFileImpl* a_impl = static_cast_with_check(a.get()); + LogFileImpl* b_impl = static_cast_with_check(b.get()); return *a_impl < *b_impl; }); return status; @@ -387,7 +387,7 @@ } Status s; if (type == kAliveLogFile) { - std::string fname = LogFileName(db_options_.wal_dir, number); + std::string fname = LogFileName(wal_dir_, number); s = ReadFirstLine(fname, number, sequence); if (!s.ok() && env_->FileExists(fname).ok()) { // return any error that is not caused by non-existing file @@ -397,8 +397,7 @@ if (type == kArchivedLogFile || !s.ok()) { // check if the file got moved to archive. - std::string archived_file = - ArchivedLogFileName(db_options_.wal_dir, number); + std::string archived_file = ArchivedLogFileName(wal_dir_, number); s = ReadFirstLine(archived_file, number, sequence); // maybe the file was deleted from archive dir. If that's the case, return // Status::OK(). The caller with identify this as empty file because @@ -428,7 +427,7 @@ Status s; uint64_t size_bytes; - s = env_->GetFileSize(LogFileName(db_options_.wal_dir, number), &size_bytes); + s = env_->GetFileSize(LogFileName(wal_dir_, number), &size_bytes); if (!s.ok()) { return s; @@ -469,7 +468,7 @@ fs_->OptimizeForLogRead(file_options_), &file, nullptr); std::unique_ptr file_reader( - new SequentialFileReader(std::move(file), fname)); + new SequentialFileReader(std::move(file), fname, io_tracer_)); if (!status.ok()) { return status; @@ -494,14 +493,19 @@ // TODO read record's till the first no corrupt entry? 
} else { WriteBatch batch; - WriteBatchInternal::SetContents(&batch, record); - *sequence = WriteBatchInternal::Sequence(&batch); - return Status::OK(); + // We can overwrite an existing non-OK Status since it'd only reach here + // with `paranoid_checks == false`. + status = WriteBatchInternal::SetContents(&batch, record); + if (status.ok()) { + *sequence = WriteBatchInternal::Sequence(&batch); + return status; + } } } - // ReadRecord returns false on EOF, which means that the log file is empty. we - // return status.ok() in that case and set sequence number to 0 + // ReadRecord might have returned false on EOF, which means that the log file + // is empty. Or, a failure may have occurred while processing the first entry. + // In any case, return status and set sequence number to 0. *sequence = 0; return status; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -36,14 +36,18 @@ class WalManager { public: WalManager(const ImmutableDBOptions& db_options, - const FileOptions& file_options, const bool seq_per_batch = false) + const FileOptions& file_options, + const std::shared_ptr& io_tracer, + const bool seq_per_batch = false) : db_options_(db_options), file_options_(file_options), env_(db_options.env), - fs_(db_options.fs.get()), + fs_(db_options.fs, io_tracer), purge_wal_files_last_run_(0), seq_per_batch_(seq_per_batch), - wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {} + wal_dir_(db_options_.GetWalDir()), + wal_in_db_path_(db_options_.IsWalDirSameAsDBPath()), + io_tracer_(io_tracer) {} Status GetSortedWalFiles(VectorLogPtr& files); @@ -91,7 +95,7 @@ const ImmutableDBOptions& db_options_; const FileOptions file_options_; Env* env_; - FileSystem* fs_; + const FileSystemPtr fs_; // ------- 
WalManager state ------- // cache for ReadFirstRecord() calls @@ -103,11 +107,15 @@ bool seq_per_batch_; + const std::string& wal_dir_; + bool wal_in_db_path_; // obsolete files will be deleted every this seconds if ttl deletion is // enabled and archive size_limit is disabled. static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; + + std::shared_ptr io_tracer_; }; #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,20 +5,21 @@ #ifndef ROCKSDB_LITE +#include "db/wal_manager.h" + #include #include -#include "rocksdb/cache.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/write_buffer_manager.h" - #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" -#include "db/wal_manager.h" #include "env/mock_env.h" #include "file/writable_file_writer.h" +#include "rocksdb/cache.h" +#include "rocksdb/file_system.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -31,13 +32,12 @@ class WalManagerTest : public testing::Test { public: WalManagerTest() - : env_(new MockEnv(Env::Default())), - dbname_(test::PerThreadDBPath("wal_manager_test")), + : dbname_(test::PerThreadDBPath("wal_manager_test")), db_options_(), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), current_log_number_(0) { - DestroyDB(dbname_, Options()); + env_.reset(MockEnv::Create(Env::Default())), DestroyDB(dbname_, Options()); } void Init() { @@ -47,19 +47,22 @@ std::numeric_limits::max()); db_options_.wal_dir = dbname_; db_options_.env = 
env_.get(); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - db_options_.fs = fs_; + db_options_.fs = env_->GetFileSystem(); + db_options_.clock = env_->GetSystemClock().get(); - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - /*block_cache_tracer=*/nullptr)); + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); - wal_manager_.reset(new WalManager(db_options_, env_options_)); + wal_manager_.reset( + new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); } void Reopen() { - wal_manager_.reset(new WalManager(db_options_, env_options_)); + wal_manager_.reset( + new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); } // NOT thread safe @@ -67,9 +70,10 @@ assert(current_log_writer_.get() != nullptr); uint64_t seq = versions_->LastSequence() + 1; WriteBatch batch; - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK( + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch))); versions_->SetLastAllocatedSequence(seq); versions_->SetLastPublishedSequence(seq); versions_->SetLastSequence(seq); @@ -79,10 +83,10 @@ void RollTheLog(bool /*archived*/) { current_log_number_++; std::string fname = ArchivedLogFileName(dbname_, current_log_number_); - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer, + nullptr)); current_log_writer_.reset(new 
log::Writer(std::move(file_writer), 0, false)); } @@ -113,7 +117,6 @@ WriteBufferManager write_buffer_manager_; std::unique_ptr versions_; std::unique_ptr wal_manager_; - std::shared_ptr fs_; std::unique_ptr current_log_writer_; uint64_t current_log_number_; @@ -122,8 +125,9 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { Init(); std::string path = dbname_ + "/000001.log"; - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); + std::unique_ptr file; + ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file, + nullptr)); SequenceNumber s; ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s)); @@ -133,14 +137,14 @@ wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s)); ASSERT_EQ(s, 0U); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), path, EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), path, FileOptions())); log::Writer writer(std::move(file_writer), 1, db_options_.recycle_log_file_num > 0); WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); WriteBatchInternal::SetSequence(&batch, 10); - writer.AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. 
// Waiting for lei to finish with db_test @@ -165,14 +169,14 @@ uint64_t GetLogDirSize(std::string dir_path, Env* env) { uint64_t dir_size = 0; std::vector files; - env->GetChildren(dir_path, &files); + EXPECT_OK(env->GetChildren(dir_path, &files)); for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = dir_path + "/" + f; uint64_t file_size; - env->GetFileSize(file_path, &file_size); + EXPECT_OK(env->GetFileSize(file_path, &file_size)); dir_size += file_size; } } @@ -182,9 +186,9 @@ Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; std::vector file_numbers; - env->GetChildren(path, &files); uint64_t number; FileType type; + EXPECT_OK(env->GetChildren(path, &files)); for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { @@ -207,13 +211,14 @@ EXPECT_OK(iter->status()); iter->Next(); } + EXPECT_OK(iter->status()); return count; } } // namespace TEST_F(WalManagerTest, WALArchivalSizeLimit) { - db_options_.wal_ttl_seconds = 0; - db_options_.wal_size_limit_mb = 1000; + db_options_.WAL_ttl_seconds = 0; + db_options_.WAL_size_limit_MB = 1000; Init(); // TEST : Create WalManager with huge size limit and no ttl. @@ -221,7 +226,7 @@ // Count the archived log files that survived. // Assert that all of them did. // Change size limit. Re-open WalManager. - // Assert that archive is not greater than wal_size_limit_mb after + // Assert that archive is not greater than WAL_size_limit_MB after // PurgeObsoleteWALFiles() // Set ttl and time_to_check_ to small values. Re-open db. // Assert that there are no archived logs left. 
@@ -230,27 +235,27 @@ CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_EQ(log_files.size(), 20U); - db_options_.wal_size_limit_mb = 8; + db_options_.WAL_size_limit_MB = 8; Reopen(); wal_manager_->PurgeObsoleteWALFiles(); uint64_t archive_size = GetLogDirSize(archive_dir, env_.get()); - ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024); + ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024); - db_options_.wal_ttl_seconds = 1; - env_->FakeSleepForMicroseconds(2 * 1000 * 1000); + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(2 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } TEST_F(WalManagerTest, WALArchivalTtl) { - db_options_.wal_ttl_seconds = 1000; + db_options_.WAL_ttl_seconds = 1000; Init(); // TEST : Create WalManager with a ttl and no size limit. 
@@ -263,15 +268,15 @@ CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_GT(log_files.size(), 0U); - db_options_.wal_ttl_seconds = 1; - env_->FakeSleepForMicroseconds(3 * 1000 * 1000); + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(3 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch.cc 2025-05-19 16:14:27.000000000 +0000 @@ -46,6 +46,7 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" @@ -53,13 +54,14 @@ #include "db/write_batch_internal.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" +#include "port/lang.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "util/autovector.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/duplicate_detector.h" #include "util/string_util.h" -#include "util/util.h" namespace ROCKSDB_NAMESPACE { @@ -132,110 +134,16 @@ return Status::OK(); } - Status MarkRollback(const Slice&) override { - content_flags |= ContentFlags::HAS_ROLLBACK; - return Status::OK(); - } -}; - -class TimestampAssigner : public WriteBatch::Handler { - public: - explicit TimestampAssigner(const Slice& ts) - : timestamp_(ts), timestamps_(kEmptyTimestampList) {} - explicit TimestampAssigner(const std::vector& ts_list) - : 
timestamps_(ts_list) { - SanityCheck(); - } - ~TimestampAssigner() override {} - - Status PutCF(uint32_t, const Slice& key, const Slice&) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status DeleteCF(uint32_t, const Slice& key) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status SingleDeleteCF(uint32_t, const Slice& key) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status DeleteRangeCF(uint32_t, const Slice& begin_key, - const Slice& end_key) override { - AssignTimestamp(begin_key); - AssignTimestamp(end_key); - ++idx_; - return Status::OK(); - } - - Status MergeCF(uint32_t, const Slice& key, const Slice&) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override { - // TODO (yanqin): support blob db in the future. - return Status::OK(); - } - - Status MarkBeginPrepare(bool) override { - // TODO (yanqin): support in the future. - return Status::OK(); - } - - Status MarkEndPrepare(const Slice&) override { - // TODO (yanqin): support in the future. - return Status::OK(); - } - - Status MarkCommit(const Slice&) override { - // TODO (yanqin): support in the future. + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_COMMIT; return Status::OK(); } Status MarkRollback(const Slice&) override { - // TODO (yanqin): support in the future. + content_flags |= ContentFlags::HAS_ROLLBACK; return Status::OK(); } - - private: - void SanityCheck() const { - assert(!timestamps_.empty()); -#ifndef NDEBUG - const size_t ts_sz = timestamps_[0].size(); - for (size_t i = 1; i != timestamps_.size(); ++i) { - assert(ts_sz == timestamps_[i].size()); - } -#endif // !NDEBUG - } - - void AssignTimestamp(const Slice& key) { - assert(timestamps_.empty() || idx_ < timestamps_.size()); - const Slice& ts = timestamps_.empty() ? 
timestamp_ : timestamps_[idx_]; - size_t ts_sz = ts.size(); - char* ptr = const_cast(key.data() + key.size() - ts_sz); - memcpy(ptr, ts.data(), ts_sz); - } - - static const std::vector kEmptyTimestampList; - const Slice timestamp_; - const std::vector& timestamps_; - size_t idx_ = 0; - - // No copy or move. - TimestampAssigner(const TimestampAssigner&) = delete; - TimestampAssigner(TimestampAssigner&&) = delete; - TimestampAssigner& operator=(const TimestampAssigner&) = delete; - TimestampAssigner&& operator=(TimestampAssigner&&) = delete; }; -const std::vector TimestampAssigner::kEmptyTimestampList; } // anon namespace @@ -244,42 +152,49 @@ }; WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes) - : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) { + : content_flags_(0), max_bytes_(max_bytes), rep_() { rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ? reserved_bytes : WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader); } -WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz) - : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) { - rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ? - reserved_bytes : WriteBatchInternal::kHeader); +WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, + size_t protection_bytes_per_key) + : content_flags_(0), max_bytes_(max_bytes), rep_() { + // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per + // entry. + assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8); + if (protection_bytes_per_key != 0) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + } + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) + ? 
reserved_bytes + : WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader); } WriteBatch::WriteBatch(const std::string& rep) - : content_flags_(ContentFlags::DEFERRED), - max_bytes_(0), - rep_(rep), - timestamp_size_(0) {} + : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {} WriteBatch::WriteBatch(std::string&& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), - rep_(std::move(rep)), - timestamp_size_(0) {} + rep_(std::move(rep)) {} WriteBatch::WriteBatch(const WriteBatch& src) : wal_term_point_(src.wal_term_point_), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(src.rep_), - timestamp_size_(src.timestamp_size_) { + rep_(src.rep_) { if (src.save_points_ != nullptr) { save_points_.reset(new SavePoints()); save_points_->stack = src.save_points_->stack; } + if (src.prot_info_ != nullptr) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + prot_info_->entries_ = src.prot_info_->entries_; + } } WriteBatch::WriteBatch(WriteBatch&& src) noexcept @@ -287,8 +202,8 @@ wal_term_point_(std::move(src.wal_term_point_)), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(std::move(src.rep_)), - timestamp_size_(src.timestamp_size_) {} + prot_info_(std::move(src.prot_info_)), + rep_(std::move(src.rep_)) {} WriteBatch& WriteBatch::operator=(const WriteBatch& src) { if (&src != this) { @@ -331,6 +246,9 @@ } } + if (prot_info_ != nullptr) { + prot_info_->entries_.clear(); + } wal_term_point_.clear(); } @@ -340,7 +258,8 @@ auto rv = content_flags_.load(std::memory_order_relaxed); if ((rv & ContentFlags::DEFERRED) != 0) { BatchContentClassifier classifier; - Iterate(&classifier); + // Should we handle status here? 
+ Iterate(&classifier).PermitUncheckedError(); rv = classifier.content_flags; // this method is conceptually const, because it is performing a lazy @@ -358,6 +277,13 @@ wal_term_point_.content_flags = content_flags_; } +size_t WriteBatch::GetProtectionBytesPerKey() const { + if (prot_info_ != nullptr) { + return prot_info_->GetBytesPerKey(); + } + return 0; +} + bool WriteBatch::HasPut() const { return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; } @@ -495,6 +421,11 @@ return Status::Corruption("bad EndPrepare XID"); } break; + case kTypeCommitXIDAndTimestamp: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad commit timestamp"); + } + FALLTHROUGH_INTENDED; case kTypeCommitXID: if (!GetLengthPrefixedSlice(input, xid)) { return Status::Corruption("bad Commit XID"); @@ -639,7 +570,8 @@ case kTypeBeginPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (!handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -658,7 +590,8 @@ case kTypeBeginPersistedPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -671,7 +604,8 @@ case kTypeBeginUnprepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); - handler->MarkBeginPrepare(true /* unprepared */); + s = handler->MarkBeginPrepare(true /* unprepared */); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -690,23 +624,37 @@ case kTypeEndPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED 
| ContentFlags::HAS_END_PREPARE)); - handler->MarkEndPrepare(xid); + s = handler->MarkEndPrepare(xid); + assert(s.ok()); empty_batch = true; break; case kTypeCommitXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); - handler->MarkCommit(xid); + s = handler->MarkCommit(xid); + assert(s.ok()); empty_batch = true; break; + case kTypeCommitXIDAndTimestamp: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + // key stores the commit timestamp. + assert(!key.empty()); + s = handler->MarkCommitWithTimestamp(xid, key); + if (LIKELY(s.ok())) { + empty_batch = true; + } + break; case kTypeRollbackXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); - handler->MarkRollback(xid); + s = handler->MarkRollback(xid); + assert(s.ok()); empty_batch = true; break; case kTypeNoop: - handler->MarkNoop(empty_batch); + s = handler->MarkNoop(empty_batch); + assert(s.ok()); empty_batch = true; break; default: @@ -728,7 +676,7 @@ return b->is_latest_persistent_state_; } -void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) { +void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) { b->is_latest_persistent_state_ = true; } @@ -769,18 +717,22 @@ b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - if (0 == b->timestamp_size_) { - PutLengthPrefixedSlice(&b->rep_, key); - } else { - PutVarint32(&b->rep_, - static_cast(key.size() + b->timestamp_size_)); - b->rep_.append(key.data(), key.size()); - b->rep_.append(b->timestamp_size_, '\0'); - } + PutLengthPrefixedSlice(&b->rep_, key); PutLengthPrefixedSlice(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // Technically the optype could've been 
`kTypeColumnFamilyValue` with the + // CF ID encoded in the `WriteBatch`. That distinction is unimportant + // however since we verify CF ID is correct, as well as all other fields + // (a missing/extra encoded CF ID would corrupt another field). It is + // convenient to consolidate on `kTypeValue` here as that is what will be + // inserted into memtable. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -825,15 +777,18 @@ b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - if (0 == b->timestamp_size_) { - PutLengthPrefixedSliceParts(&b->rep_, key); - } else { - PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_); - } + PutLengthPrefixedSliceParts(&b->rep_, key); PutLengthPrefixedSliceParts(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -889,6 +844,19 @@ return Status::OK(); } +Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b, + const Slice& xid, + const Slice& commit_ts) { + assert(!commit_ts.empty()); + b->rep_.push_back(static_cast(kTypeCommitXIDAndTimestamp)); + PutLengthPrefixedSlice(&b->rep_, commit_ts); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); + return Status::OK(); +} + Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) { b->rep_.push_back(static_cast(kTypeRollbackXID)); PutLengthPrefixedSlice(&b->rep_, xid); @@ -912,6 +880,14 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, "" /* value */, kTypeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -934,6 +910,16 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, + SliceParts(nullptr /* _parts */, 0 /* _num_parts */), + kTypeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -958,6 +944,14 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, "" /* value */, kTypeSingleDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -982,6 +976,17 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, + SliceParts(nullptr /* _parts */, + 0 /* _num_parts */) /* value */, + kTypeSingleDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1007,6 +1012,15 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + // In `DeleteRange()`, the end key is treated as the value. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(begin_key, end_key, kTypeRangeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1032,6 +1046,15 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(begin_key, end_key, kTypeRangeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1064,6 +1087,13 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeMerge) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1094,6 +1124,13 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeMerge) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1119,6 +1156,14 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_BLOB_INDEX, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, value, kTypeBlobIndex) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1157,6 +1202,9 @@ Clear(); } else { rep_.resize(savepoint.size); + if (prot_info_ != nullptr) { + prot_info_->entries_.resize(savepoint.count); + } WriteBatchInternal::SetCount(this, savepoint.count); content_flags_.store(savepoint.content_flags, std::memory_order_relaxed); } @@ -1175,13 +1223,17 @@ return Status::OK(); } -Status WriteBatch::AssignTimestamp(const Slice& ts) { - TimestampAssigner ts_assigner(ts); +Status WriteBatch::AssignTimestamp( + const Slice& ts, std::function checker) { + TimestampAssigner ts_assigner(prot_info_.get(), std::move(checker), ts); return Iterate(&ts_assigner); } -Status WriteBatch::AssignTimestamps(const std::vector& ts_list) { - TimestampAssigner ts_assigner(ts_list); +Status WriteBatch::AssignTimestamps( + const std::vector& ts_list, + std::function checker) { + SimpleListTimestampAssigner ts_assigner(prot_info_.get(), std::move(checker), + ts_list); return Iterate(&ts_assigner); } @@ -1198,6 +1250,8 @@ DBImpl* db_; const bool concurrent_memtable_writes_; bool post_info_created_; + const WriteBatch::ProtectionInfo* prot_info_; + size_t prot_info_idx_; bool* has_valid_writes_; // On some (!) 
platforms just default creating @@ -1260,6 +1314,16 @@ (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_); } + const ProtectionInfoKVOC64* NextProtectionInfo() { + const ProtectionInfoKVOC64* res = nullptr; + if (prot_info_ != nullptr) { + assert(prot_info_idx_ < prot_info_->entries_.size()); + res = &prot_info_->entries_[prot_info_idx_]; + ++prot_info_idx_; + } + return res; + } + protected: bool WriteBeforePrepare() const override { return write_before_prepare_; } bool WriteAfterCommit() const override { return write_after_commit_; } @@ -1272,6 +1336,7 @@ bool ignore_missing_column_families, uint64_t recovering_log_number, DB* db, bool concurrent_memtable_writes, + const WriteBatch::ProtectionInfo* prot_info, bool* has_valid_writes = nullptr, bool seq_per_batch = false, bool batch_per_txn = true, bool hint_per_batch = false) : sequence_(_sequence), @@ -1281,9 +1346,11 @@ ignore_missing_column_families_(ignore_missing_column_families), recovering_log_number_(recovering_log_number), log_number_ref_(0), - db_(static_cast_with_check(db)), + db_(static_cast_with_check(db)), concurrent_memtable_writes_(concurrent_memtable_writes), post_info_created_(false), + prot_info_(prot_info), + prot_info_idx_(0), has_valid_writes_(has_valid_writes), rebuilding_trx_(nullptr), rebuilding_trx_seq_(0), @@ -1341,6 +1408,10 @@ } void set_log_number_ref(uint64_t log) { log_number_ref_ = log; } + void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) { + prot_info_ = prot_info; + prot_info_idx_ = 0; + } SequenceNumber sequence() const { return sequence_; } @@ -1396,28 +1467,34 @@ } Status PutCFImpl(uint32_t column_family_id, const Slice& key, - const Slice& value, ValueType value_type) { + const Slice& value, ValueType value_type, + const ProtectionInfoKVOS64* kv_prot_info) { // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - 
return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } - Status ret_status; + assert(ret_status.ok()); MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); @@ -1425,23 +1502,17 @@ // any kind of transactions including the ones that use seq_per_batch assert(!seq_per_batch_ || !moptions->inplace_update_support); if (!moptions->inplace_update_support) { - bool mem_res = - mem->Add(sequence_, value_type, key, value, + ret_status = + mem->Add(sequence_, value_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? 
&GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); - } } else if (moptions->inplace_callback == nullptr) { assert(!concurrent_memtable_writes_); - mem->Update(sequence_, key, value); + ret_status = mem->Update(sequence_, key, value, kv_prot_info); } else { assert(!concurrent_memtable_writes_); - if (mem->UpdateCallback(sequence_, key, value)) { - } else { + ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info); + if (ret_status.IsNotFound()) { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; @@ -1455,223 +1526,354 @@ std::string merged_value; auto cf_handle = cf_mems_->GetColumnFamilyHandle(); - Status s = Status::NotSupported(); + Status get_status = Status::NotSupported(); if (db_ != nullptr && recovering_log_number_ == 0) { if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - s = db_->Get(ropts, cf_handle, key, &prev_value); + get_status = db_->Get(ropts, cf_handle, key, &prev_value); } - - char* prev_buffer = const_cast(prev_value.c_str()); - uint32_t prev_size = static_cast(prev_value.size()); - auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); - if (status == UpdateStatus::UPDATED_INPLACE) { - // prev_value is updated in-place with final value. - bool mem_res __attribute__((__unused__)); - mem_res = mem->Add( - sequence_, value_type, key, Slice(prev_buffer, prev_size)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); - } else if (status == UpdateStatus::UPDATED) { - // merged_value contains the final value. 
- bool mem_res __attribute__((__unused__)); - mem_res = - mem->Add(sequence_, value_type, key, Slice(merged_value)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + // Intentionally overwrites the `NotFound` in `ret_status`. + if (!get_status.ok() && !get_status.IsNotFound()) { + ret_status = get_status; + } else { + ret_status = Status::OK(); + } + if (ret_status.ok()) { + UpdateStatus update_status; + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = static_cast(prev_value.size()); + if (get_status.ok()) { + update_status = moptions->inplace_callback(prev_buffer, &prev_size, + value, &merged_value); + } else { + update_status = moptions->inplace_callback( + nullptr /* existing_value */, nullptr /* existing_value_size */, + value, &merged_value); + } + if (update_status == UpdateStatus::UPDATED_INPLACE) { + assert(get_status.ok()); + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, + Slice(prev_buffer, prev_size)); + // prev_value is updated in-place with final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + &updated_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } else if (update_status == UpdateStatus::UPDATED) { + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, merged_value); + // merged_value contains the final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(merged_value), &updated_kv_prot_info); + } else { + // merged_value contains the final value. 
+ ret_status = + mem->Add(sequence_, value_type, key, Slice(merged_value), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } } } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - } - // Since all Puts are logged in transaction logs (if enabled), always bump - // sequence number. Even if the update eventually fails and does not result - // in memtable add/update. - MaybeAdvanceSeq(); - CheckMemtableFull(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); + } return ret_status; } Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - return PutCFImpl(column_family_id, key, value, kTypeValue); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + return PutCFImpl(column_family_id, key, value, kTypeValue, + &mem_kv_prot_info); + } + return PutCFImpl(column_family_id, key, value, kTypeValue, + nullptr /* kv_prot_info */); } Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, - const Slice& value, ValueType delete_type) { + const Slice& value, ValueType delete_type, + const ProtectionInfoKVOS64* kv_prot_info) { Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); - bool mem_res = - mem->Add(sequence_, delete_type, key, value, + ret_status = + mem->Add(sequence_, delete_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? &GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { + if (UNLIKELY(ret_status.IsTryAgain())) { assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } - auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion); + ColumnFamilyData* cfd = cf_mems_->current(); + assert(!cfd || cfd->user_comparator()); + const size_t ts_sz = (cfd && cfd->user_comparator()) + ? cfd->user_comparator()->timestamp_size() + : 0; + const ValueType delete_type = + (0 == ts_sz) ? 
kTypeDeletion : kTypeDeletionWithTimestamp; + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type); + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); } return ret_status; } Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, + key); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, - key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } + assert(ret_status.ok()); - auto ret_status = - DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. 
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); } return ret_status; } Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, const Slice& end_key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - // TODO(myabandeh): when transactional DeleteRange support is added, - // check if end_key must also be added. - batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } + assert(ret_status.ok()); + if (db_ != nullptr) { auto cf_handle = cf_mems_->GetColumnFamilyHandle(); if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - auto* cfd = reinterpret_cast(cf_handle)->cfd(); + auto* cfd = + static_cast_with_check(cf_handle)->cfd(); if (!cfd->is_delete_range_supported()) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); return Status::NotSupported( std::string("DeleteRange not supported for table type ") + cfd->ioptions()->table_factory->Name() + " in CF " + cfd->GetName()); } + int cmp = cfd->user_comparator()->Compare(begin_key, end_key); + if (cmp > 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); + // It's an empty range where endpoints appear mistaken. Don't bother + // applying it to the DB, and return an error to the user. + return Status::InvalidArgument("end key comes before start key"); + } else if (cmp == 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); + // It's an empty range. Don't bother applying it to the DB. 
+ return Status::OK(); + } } - auto ret_status = - DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); } return ret_status; } Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, - value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, + column_family_id, key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } + assert(ret_status.ok()); - Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); + if (moptions->merge_operator == nullptr) { + return Status::InvalidArgument( + "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`"); + } bool perform_merge = false; assert(!concurrent_memtable_writes_ || moptions->max_successive_merges == 0); @@ -1709,65 +1911,97 @@ if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - db_->Get(read_options, cf_handle, key, &get_value); - Slice get_value_slice = Slice(get_value); - - // 2) Apply this merge - auto merge_operator = moptions->merge_operator; - assert(merge_operator); - - std::string new_value; - - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator, key, &get_value_slice, {value}, &new_value, - 
moptions->info_log, moptions->statistics, Env::Default()); - - if (!merge_status.ok()) { - // Failed to merge! - // Store the delta in memtable + Status get_status = db_->Get(read_options, cf_handle, key, &get_value); + if (!get_status.ok()) { + // Failed to read a key we know exists. Store the delta in memtable. perform_merge = false; } else { - // 3) Add value to memtable - assert(!concurrent_memtable_writes_); - bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = moptions->merge_operator; + assert(merge_operator); + + std::string new_value; + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, &get_value_slice, {value}, &new_value, + moptions->info_log, moptions->statistics, + SystemClock::Default().get()); + + if (!merge_status.ok()) { + // Failed to merge! 
+ // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + assert(!concurrent_memtable_writes_); + if (kv_prot_info != nullptr) { + auto merged_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + merged_kv_prot_info.UpdateV(value, new_value); + merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue); + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + &merged_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + nullptr /* kv_prot_info */); + } } } } if (!perform_merge) { - // Add merge operator to memtable - bool mem_res = - mem->Add(sequence_, kTypeMerge, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + assert(ret_status.ok()); + // Add merge operand to memtable + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = + mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem)); + } else { + ret_status = mem->Add( + sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */, + concurrent_memtable_writes_, get_post_process_info(mem)); } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. 
So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, + key, value); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - // Same as PutCF except for value type. - return PutCFImpl(column_family_id, key, value, kTypeBlobIndex); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + // Same as PutCF except for value type. + return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + &mem_kv_prot_info); + } else { + return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + nullptr /* kv_prot_info */); + } } void CheckMemtableFull() { @@ -1799,8 +2033,8 @@ const MemTable* const mem = cfd->mem(); assert(mem); - if (mem->ApproximateMemoryUsageFast() + - imm->ApproximateMemoryUsageExcludingLast() >= + if (mem->MemoryAllocatedBytes() + + imm->MemoryAllocatedBytesExcludingLast() >= size_to_maintain && imm->MarkTrimHistoryNeeded()) { trim_history_scheduler_->ScheduleWork(cfd); @@ -1885,6 +2119,8 @@ Status s; if (recovering_log_number_ != 0) { + // We must hold db mutex in recovery. + db_->mutex()->AssertHeld(); // in recovery when we encounter a commit marker // we lookup this transaction in our set of rebuilt transactions // and commit. 
@@ -1927,6 +2163,76 @@ return s; } + Status MarkCommitWithTimestamp(const Slice& name, + const Slice& commit_ts) override { + assert(db_); + + Status s; + + if (recovering_log_number_ != 0) { + // In recovery, db mutex must be held. + db_->mutex()->AssertHeld(); + // in recovery when we encounter a commit marker + // we lookup this transaction in our set of rebuilt transactions + // and commit. + auto trx = db_->GetRecoveredTransaction(name.ToString()); + // the log containing the prepared section may have + // been released in the last incarnation because the + // data was flushed to L0 + if (trx) { + // at this point individual CF lognumbers will prevent + // duplicate re-insertion of values. + assert(0 == log_number_ref_); + if (write_after_commit_) { + // write_after_commit_ can only have one batch in trx. + assert(trx->batches_.size() == 1); + const auto& batch_info = trx->batches_.begin()->second; + // all inserts must reference this trx log number + log_number_ref_ = batch_info.log_number_; + const auto checker = [this](uint32_t cf, size_t& ts_sz) { + assert(db_); + VersionSet* const vset = db_->GetVersionSet(); + assert(vset); + ColumnFamilySet* const cf_set = vset->GetColumnFamilySet(); + assert(cf_set); + ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf); + assert(cfd); + const auto* const ucmp = cfd->user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() == 0) { + ts_sz = 0; + } else if (ucmp->timestamp_size() != ts_sz) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + }; + s = batch_info.batch_->AssignTimestamp(commit_ts, checker); + if (s.ok()) { + s = batch_info.batch_->Iterate(this); + log_number_ref_ = 0; + } + } + // else the values are already inserted before the commit + + if (s.ok()) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + if (has_valid_writes_) { + *has_valid_writes_ = true; + } + } + } else { + // When writes are not delayed until commit, there is no connection + // 
between a memtable write and the WAL that supports it. So the commit + // need not reference any log as the only log to which it depends. + assert(!write_after_commit_ || log_number_ref_ > 0); + } + constexpr bool batch_boundary = true; + MaybeAdvanceSeq(batch_boundary); + + return s; + } + Status MarkRollback(const Slice& name) override { assert(db_); @@ -1973,8 +2279,8 @@ MemTableInserter inserter( sequence, memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, recovery_log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn); + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); for (auto w : write_group) { if (w->CallbackFailed()) { continue; @@ -1987,6 +2293,7 @@ } SetSequence(w->batch, inserter.sequence()); inserter.set_log_number_ref(w->log_ref); + inserter.set_prot_info(w->batch->prot_info_.get()); w->status = w->batch->Iterate(&inserter); if (!w->status.ok()) { return w->status; @@ -2008,13 +2315,15 @@ (void)batch_cnt; #endif assert(writer->ShouldWriteToMemtable()); - MemTableInserter inserter( - sequence, memtables, flush_scheduler, trim_history_scheduler, - ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn, hint_per_batch); + MemTableInserter inserter(sequence, memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); SetSequence(writer->batch, sequence); inserter.set_log_number_ref(writer->log_ref); + inserter.set_prot_info(writer->batch->prot_info_.get()); Status s = writer->batch->Iterate(&inserter); assert(!seq_per_batch || batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); @@ -2034,8 +2343,8 @@ 
MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, has_valid_writes, - seq_per_batch, batch_per_txn); + concurrent_memtable_writes, batch->prot_info_.get(), + has_valid_writes, seq_per_batch, batch_per_txn); Status s = batch->Iterate(&inserter); if (next_seq != nullptr) { *next_seq = inserter.sequence(); @@ -2048,6 +2357,7 @@ Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { assert(contents.size() >= WriteBatchInternal::kHeader); + assert(b->prot_info_ == nullptr); b->rep_.assign(contents.data(), contents.size()); b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed); return Status::OK(); @@ -2055,6 +2365,8 @@ Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src, const bool wal_only) { + assert(dst->Count() == 0 || + (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr)); size_t src_len; int src_count; uint32_t src_flags; @@ -2071,6 +2383,13 @@ src_flags = src->content_flags_.load(std::memory_order_relaxed); } + if (dst->prot_info_ != nullptr) { + std::copy(src->prot_info_->entries_.begin(), + src->prot_info_->entries_.begin() + src_count, + std::back_inserter(dst->prot_info_->entries_)); + } else if (src->prot_info_ != nullptr) { + dst->prot_info_.reset(new WriteBatch::ProtectionInfo(*src->prot_info_)); + } SetCount(dst, Count(dst) + src_count); assert(src->rep_.size() >= WriteBatchInternal::kHeader); dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_internal.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_internal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_internal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_internal.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,8 +8,11 @@ // found in the LICENSE 
file. See the AUTHORS file for names of contributors. #pragma once +#include #include + #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/trim_history_scheduler.h" #include "db/write_thread.h" #include "rocksdb/db.h" @@ -17,6 +20,7 @@ #include "rocksdb/types.h" #include "rocksdb/write_batch.h" #include "util/autovector.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -61,6 +65,14 @@ MemTable* mem_; }; +struct WriteBatch::ProtectionInfo { + // `WriteBatch` usually doesn't contain a huge number of keys so protecting + // with a fixed, non-configurable eight bytes per key may work well enough. + autovector entries_; + + size_t GetBytesPerKey() const { return 8; } +}; + // WriteBatchInternal provides static methods for manipulating a // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { @@ -112,6 +124,9 @@ static Status MarkCommit(WriteBatch* batch, const Slice& xid); + static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid, + const Slice& commit_ts); + static Status InsertNoop(WriteBatch* batch); // Return the number of entries in the batch. @@ -204,7 +219,7 @@ // This write batch includes the latest state that should be persisted. Such // state meant to be used only during recovery. 
- static void SetAsLastestPersistentState(WriteBatch* b); + static void SetAsLatestPersistentState(WriteBatch* b); static bool IsLatestPersistentState(const WriteBatch* b); }; @@ -232,6 +247,9 @@ if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) { batch_->rep_.resize(savepoint_.size); WriteBatchInternal::SetCount(batch_, savepoint_.count); + if (batch_->prot_info_ != nullptr) { + batch_->prot_info_->entries_.resize(savepoint_.count); + } batch_->content_flags_.store(savepoint_.content_flags, std::memory_order_relaxed); return Status::MemoryLimit(); @@ -247,4 +265,165 @@ #endif }; +template +class TimestampAssignerBase : public WriteBatch::Handler { + public: + explicit TimestampAssignerBase( + WriteBatch::ProtectionInfo* prot_info, + std::function&& checker) + : prot_info_(prot_info), checker_(std::move(checker)) {} + + ~TimestampAssignerBase() override {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + return AssignTimestamp(cf, key); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return AssignTimestamp(cf, key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return AssignTimestamp(cf, key); + } + + Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, + const Slice&) override { + return AssignTimestamp(cf, begin_key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + return AssignTimestamp(cf, key); + } + + Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override { + return AssignTimestamp(cf, key); + } + + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkNoop(bool 
/*empty_batch*/) override { return Status::OK(); } + + protected: + Status AssignTimestamp(uint32_t cf, const Slice& key) { + Status s = static_cast_with_check(this)->AssignTimestampImpl( + cf, key, idx_); + ++idx_; + return s; + } + + Status CheckTimestampSize(uint32_t cf, size_t& ts_sz) { + return checker_(cf, ts_sz); + } + + Status UpdateTimestampIfNeeded(size_t ts_sz, const Slice& key, + const Slice& ts) { + if (ts_sz > 0) { + assert(ts_sz == ts.size()); + UpdateProtectionInformationIfNeeded(key, ts); + UpdateTimestamp(key, ts); + } + return Status::OK(); + } + + void UpdateProtectionInformationIfNeeded(const Slice& key, const Slice& ts) { + if (prot_info_ != nullptr) { + const size_t ts_sz = ts.size(); + SliceParts old_key(&key, 1); + Slice key_no_ts(key.data(), key.size() - ts_sz); + std::array new_key_cmpts{{key_no_ts, ts}}; + SliceParts new_key(new_key_cmpts.data(), 2); + prot_info_->entries_[idx_].UpdateK(old_key, new_key); + } + } + + void UpdateTimestamp(const Slice& key, const Slice& ts) { + const size_t ts_sz = ts.size(); + char* ptr = const_cast(key.data() + key.size() - ts_sz); + assert(ptr); + memcpy(ptr, ts.data(), ts_sz); + } + + // No copy or move. 
+ TimestampAssignerBase(const TimestampAssignerBase&) = delete; + TimestampAssignerBase(TimestampAssignerBase&&) = delete; + TimestampAssignerBase& operator=(const TimestampAssignerBase&) = delete; + TimestampAssignerBase& operator=(TimestampAssignerBase&&) = delete; + + WriteBatch::ProtectionInfo* const prot_info_ = nullptr; + const std::function checker_{}; + size_t idx_ = 0; +}; + +class SimpleListTimestampAssigner + : public TimestampAssignerBase { + public: + explicit SimpleListTimestampAssigner( + WriteBatch::ProtectionInfo* prot_info, + std::function&& checker, + const std::vector& timestamps) + : TimestampAssignerBase(prot_info, + std::move(checker)), + timestamps_(timestamps) {} + + ~SimpleListTimestampAssigner() override {} + + private: + friend class TimestampAssignerBase; + + Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t idx) { + if (idx >= timestamps_.size()) { + return Status::InvalidArgument("Need more timestamps for the assignment"); + } + const Slice& ts = timestamps_[idx]; + size_t ts_sz = ts.size(); + const Status s = this->CheckTimestampSize(cf, ts_sz); + if (!s.ok()) { + return s; + } + return this->UpdateTimestampIfNeeded(ts_sz, key, ts); + } + + const std::vector& timestamps_; +}; + +class TimestampAssigner : public TimestampAssignerBase { + public: + explicit TimestampAssigner(WriteBatch::ProtectionInfo* prot_info, + std::function&& checker, + const Slice& ts) + : TimestampAssignerBase(prot_info, std::move(checker)), + timestamp_(ts) { + assert(!timestamp_.empty()); + } + ~TimestampAssigner() override {} + + private: + friend class TimestampAssignerBase; + + Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t /*idx*/) { + if (timestamp_.empty()) { + return Status::InvalidArgument("Timestamp is empty"); + } + size_t ts_sz = timestamp_.size(); + const Status s = this->CheckTimestampSize(cf, ts_sz); + if (!s.ok()) { + return s; + } + return this->UpdateTimestampIfNeeded(ts_sz, key, timestamp_); + } + + const 
Slice timestamp_; +}; + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,28 +7,35 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "rocksdb/db.h" - #include + #include "db/column_family.h" +#include "db/db_test_util.h" #include "db/memtable.h" #include "db/write_batch_internal.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -static std::string PrintContents(WriteBatch* b) { +static std::string PrintContents(WriteBatch* b, + bool merge_operator_supported = true) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + if (merge_operator_supported) { + options.merge_operator.reset(new TestPutOperator()); + } + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); @@ -59,10 +66,11 @@ if (iter == nullptr) { continue; } + EXPECT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; ikey.clear(); - EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey)); + 
EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); switch (ikey.type) { case kTypeValue: state.append("Put("); @@ -110,18 +118,21 @@ break; } state.append("@"); - state.append(NumberToString(ikey.sequence)); + state.append(ToString(ikey.sequence)); } + EXPECT_OK(iter->status()); } - EXPECT_EQ(b->HasPut(), put_count > 0); - EXPECT_EQ(b->HasDelete(), delete_count > 0); - EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); - EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); - EXPECT_EQ(b->HasMerge(), merge_count > 0); - if (!s.ok()) { + if (s.ok()) { + EXPECT_EQ(b->HasPut(), put_count > 0); + EXPECT_EQ(b->HasDelete(), delete_count > 0); + EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); + EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); + EXPECT_EQ(b->HasMerge(), merge_count > 0); + if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + } else { state.append(s.ToString()); - } else if (count != WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); } delete mem->Unref(); return state; @@ -138,10 +149,10 @@ TEST_F(WriteBatchTest, Multiple) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.DeleteRange(Slice("bar"), Slice("foo")); - batch.Put(Slice("baz"), Slice("boo")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); + ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo"))); + ASSERT_OK(batch.Put(Slice("baz"), Slice("boo"))); WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); @@ -156,12 +167,12 @@ TEST_F(WriteBatchTest, Corruption) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); WriteBatchInternal::SetSequence(&batch, 200); Slice contents = 
WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); + ASSERT_OK(WriteBatchInternal::SetContents( + &batch, Slice(contents.data(), contents.size() - 1))); ASSERT_EQ("Put(foo, bar)@200" "Corruption: bad WriteBatch Delete", PrintContents(&batch)); @@ -171,24 +182,24 @@ WriteBatch b1, b2; WriteBatchInternal::SetSequence(&b1, 200); WriteBatchInternal::SetSequence(&b2, 300); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("", PrintContents(&b1)); ASSERT_EQ(0u, b1.Count()); - b2.Put("a", "va"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("a", "va")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); ASSERT_EQ(1u, b1.Count()); b2.Clear(); - b2.Put("b", "vb"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("b", "vb")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@201", PrintContents(&b1)); ASSERT_EQ(2u, b1.Count()); - b2.Delete("foo"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Delete("foo")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@202" "Put(b, vb)@201" @@ -196,11 +207,11 @@ PrintContents(&b1)); ASSERT_EQ(4u, b1.Count()); b2.Clear(); - b2.Put("c", "cc"); - b2.Put("d", "dd"); + ASSERT_OK(b2.Put("c", "cc")); + ASSERT_OK(b2.Put("d", "dd")); b2.MarkWalTerminationPoint(); - b2.Put("e", "ee"); - WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true); + ASSERT_OK(b2.Put("e", "ee")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true)); ASSERT_EQ( "Put(a, va)@200" "Put(b, vb)@202" @@ -223,10 +234,10 @@ WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ("", PrintContents(&batch)); ASSERT_EQ(0u, batch.Count()); - batch.Put("a", "va"); + ASSERT_OK(batch.Put("a", "va")); ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); ASSERT_EQ(1u, batch.Count()); - 
batch.SingleDelete("a"); + ASSERT_OK(batch.SingleDelete("a")); ASSERT_EQ( "SingleDelete(a)@101" "Put(a, va)@100", @@ -307,6 +318,11 @@ seen += "MarkCommit(" + xid.ToString() + ")"; return Status::OK(); } + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override { + seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " + + ts.ToString(true) + ")"; + return Status::OK(); + } Status MarkRollback(const Slice& xid) override { seen += "MarkRollback(" + xid.ToString() + ")"; return Status::OK(); @@ -316,7 +332,7 @@ TEST_F(WriteBatchTest, PutNotImplemented) { WriteBatch batch; - batch.Put(Slice("k1"), Slice("v1")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); @@ -326,7 +342,7 @@ TEST_F(WriteBatchTest, DeleteNotImplemented) { WriteBatch batch; - batch.Delete(Slice("k2")); + ASSERT_OK(batch.Delete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); @@ -336,7 +352,7 @@ TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { WriteBatch batch; - batch.SingleDelete(Slice("k2")); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); @@ -346,7 +362,7 @@ TEST_F(WriteBatchTest, MergeNotImplemented) { WriteBatch batch; - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); @@ -354,16 +370,26 @@ ASSERT_OK(batch.Iterate(&handler)); } +TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) { + WriteBatch batch; + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ( + "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator " + "!= nullptr`", + PrintContents(&batch, false /* merge_operator_supported */)); +} + TEST_F(WriteBatchTest, Blob) { WriteBatch batch; - 
batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.Put(Slice("k3"), Slice("v3")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k2")); - batch.SingleDelete(Slice("k3")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.Put(Slice("k3"), Slice("v3"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k2"))); + ASSERT_OK(batch.SingleDelete(Slice("k3"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(6u, batch.Count()); ASSERT_EQ( "Merge(foo, bar)@5" @@ -375,7 +401,7 @@ PrintContents(&batch)); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -390,19 +416,19 @@ TEST_F(WriteBatchTest, PrepareCommit) { WriteBatch batch; - WriteBatchInternal::InsertNoop(&batch); - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); + ASSERT_OK(WriteBatchInternal::InsertNoop(&batch)); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); batch.SetSavePoint(); - WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"))); Status s = batch.RollbackToSavePoint(); ASSERT_EQ(s, Status::NotFound()); - WriteBatchInternal::MarkCommit(&batch, Slice("xid1")); - WriteBatchInternal::MarkRollback(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1"))); + ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1"))); ASSERT_EQ(2u, batch.Count()); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "MarkBeginPrepare(false)" "Put(k1, v1)" @@ -419,7 +445,7 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { // Insert key and 
value of 3GB and push total batch size to 12GB. static const size_t kKeyValueSize = 4u; - static const uint32_t kNumUpdates = uint32_t(3 << 30); + static const uint32_t kNumUpdates = uint32_t{3} << 30; std::string raw(kKeyValueSize, 'A'); WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u); char c = 'A'; @@ -430,7 +456,7 @@ raw[0] = c; raw[raw.length() - 1] = c; c++; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(kNumUpdates, batch.Count()); @@ -472,7 +498,7 @@ bool Continue() override { return num_seen < kNumUpdates; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(kNumUpdates, handler.num_seen); } @@ -486,7 +512,7 @@ for (char i = 0; i < 2; i++) { raw[0] = 'A' + i; raw[raw.length() - 1] = 'A' - i; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(2u, batch.Count()); @@ -523,7 +549,7 @@ bool Continue() override { return num_seen < 2; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(2, handler.num_seen); } @@ -558,14 +584,14 @@ bool Continue() override { return num_seen < 5; } } handler; - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k1")); - batch.SingleDelete(Slice("k2")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); - batch.Iterate(&handler); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k1"))); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -577,22 +603,22 @@ TEST_F(WriteBatchTest, PutGatherSlices) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("foo"), 
Slice("bar"))); { // Try a write where the key is one slice but the value is two Slice key_slice("baz"); Slice value_slices[2] = { Slice("header"), Slice("payload") }; - batch.Put(SliceParts(&key_slice, 1), - SliceParts(value_slices, 2)); + ASSERT_OK( + batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2))); } { // One where the key is composite but the value is a single slice Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; Slice value_slice("value"); - batch.Put(SliceParts(key_slices, 3), - SliceParts(&value_slice, 1)); + ASSERT_OK( + batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1))); } WriteBatchInternal::SetSequence(&batch, 100); @@ -608,31 +634,34 @@ public: explicit ColumnFamilyHandleImplDummy(int id) : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} + explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + id_(id), + ucmp_(ucmp) {} uint32_t GetID() const override { return id_; } - const Comparator* GetComparator() const override { - return BytewiseComparator(); - } + const Comparator* GetComparator() const override { return ucmp_; } private: uint32_t id_; + const Comparator* const ucmp_ = BytewiseComparator(); }; } // namespace anonymous TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { WriteBatch batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), 
Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -650,14 +679,14 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { WriteBatchWithIndex batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); std::unique_ptr iter; @@ -736,7 +765,7 @@ ASSERT_TRUE(!iter->Valid()); TestHandler handler; - batch.GetWriteBatch()->Iterate(&handler); + ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -755,12 +784,12 @@ WriteBatch batch; batch.SetSavePoint(); - batch.Put("A", "a"); - batch.Put("B", "b"); + 
ASSERT_OK(batch.Put("A", "a")); + ASSERT_OK(batch.Put("B", "b")); batch.SetSavePoint(); - batch.Put("C", "c"); - batch.Delete("A"); + ASSERT_OK(batch.Put("C", "c")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); batch.SetSavePoint(); @@ -779,8 +808,8 @@ "Put(B, b)@1", PrintContents(&batch)); - batch.Delete("A"); - batch.Put("B", "bb"); + ASSERT_OK(batch.Delete("A")); + ASSERT_OK(batch.Put("B", "bb")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ("", PrintContents(&batch)); @@ -789,12 +818,12 @@ ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch)); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); - batch.Put("A", "aaa"); + ASSERT_OK(batch.Put("A", "aaa")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -804,8 +833,8 @@ batch.SetSavePoint(); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -826,7 +855,7 @@ ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch2)); - batch2.Delete("A"); + ASSERT_OK(batch2.Delete("A")); batch2.SetSavePoint(); s = batch2.RollbackToSavePoint(); @@ -838,7 +867,7 @@ batch2.SetSavePoint(); - batch2.Delete("B"); + ASSERT_OK(batch2.Delete("B")); ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); batch2.SetSavePoint(); @@ -861,7 +890,7 @@ ASSERT_EQ("", PrintContents(&batch3)); batch3.SetSavePoint(); - batch3.Delete("A"); + ASSERT_OK(batch3.Delete("A")); s = batch3.PopSavePoint(); ASSERT_OK(s); @@ -880,6 +909,173 @@ ASSERT_TRUE(s.IsMemoryLimit()); } +namespace { +class TimestampChecker : public WriteBatch::Handler { + public: + explicit TimestampChecker( + std::unordered_map cf_to_ucmps, Slice ts) + : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {} + Status PutCF(uint32_t cf, const Slice& key, const Slice& /*value*/) override { + auto cf_iter = cf_to_ucmps_.find(cf); + if (cf_iter == 
cf_to_ucmps_.end()) { + return Status::Corruption(); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return Status::OK(); + } + if (key.size() < ts_sz) { + return Status::Corruption(); + } + Slice ts = ExtractTimestampFromUserKey(key, ts_sz); + if (ts.compare(timestamp_) != 0) { + return Status::Corruption(); + } + return Status::OK(); + } + + private: + std::unordered_map cf_to_ucmps_; + Slice timestamp_; +}; + +Status CheckTimestampsInWriteBatch( + WriteBatch& wb, Slice timestamp, + std::unordered_map cf_to_ucmps) { + TimestampChecker ts_checker(cf_to_ucmps, timestamp); + return wb.Iterate(&ts_checker); +} +} // namespace + +TEST_F(WriteBatchTest, AssignTimestamps) { + // We assume the last eight bytes of each key is reserved for timestamps. + // Therefore, we must make sure each key is longer than eight bytes. + constexpr size_t key_size = 16; + constexpr size_t num_of_keys = 10; + std::vector key_strs(num_of_keys, std::string(key_size, '\0')); + + ColumnFamilyHandleImplDummy cf0(0); + ColumnFamilyHandleImplDummy cf4(4, test::ComparatorWithU64Ts()); + ColumnFamilyHandleImplDummy cf5(5, test::ComparatorWithU64Ts()); + + const std::unordered_map cf_to_ucmps = { + {0, cf0.GetComparator()}, + {4, cf4.GetComparator()}, + {5, cf5.GetComparator()}}; + + WriteBatch batch; + // Write to the batch. We will assign timestamps later. 
+ for (const auto& key_str : key_strs) { + ASSERT_OK(batch.Put(&cf0, key_str, "value")); + ASSERT_OK(batch.Put(&cf4, key_str, "value")); + ASSERT_OK(batch.Put(&cf5, key_str, "value")); + } + + static constexpr size_t timestamp_size = sizeof(uint64_t); + const auto checker1 = [](uint32_t cf, size_t& ts_sz) { + if (cf == 4 || cf == 5) { + if (ts_sz != timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + } else if (cf == 0) { + ts_sz = 0; + return Status::OK(); + } else { + return Status::Corruption("Invalid cf"); + } + return Status::OK(); + }; + ASSERT_OK( + batch.AssignTimestamp(std::string(timestamp_size, '\xfe'), checker1)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps)); + + // We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to + // simulate the case in which a transaction enables indexing for some writes + // while disables indexing for other writes. A transaction uses a + // WriteBatchWithIndex object to buffer writes (we consider Write-committed + // policy only). If indexing is enabled, then writes go through + // WriteBatchWithIndex API populating a WBWI internal data structure, i.e. a + // mapping from cf to user comparators. If indexing is disabled, a transaction + // writes directly to the underlying raw WriteBatch. We will need to track the + // comparator information for the column families to which un-indexed writes + // are performed. When calling AssignTimestamp(s) API of WriteBatch, we need + // indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and timestamp_size to perform + // checking. 
+ std::unordered_map indexed_cf_to_ucmps = { + {0, cf0.GetComparator()}, {4, cf4.GetComparator()}}; + std::unordered_set non_indexed_cfs_with_ts = {cf5.GetID()}; + const auto checker2 = [&indexed_cf_to_ucmps, &non_indexed_cfs_with_ts]( + uint32_t cf, size_t& ts_sz) { + if (non_indexed_cfs_with_ts.count(cf) > 0) { + if (ts_sz != timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + } + auto cf_iter = indexed_cf_to_ucmps.find(cf); + if (cf_iter == indexed_cf_to_ucmps.end()) { + return Status::Corruption("Unknown cf"); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + if (ucmp->timestamp_size() == 0) { + ts_sz = 0; + } else if (ts_sz != ucmp->timestamp_size()) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + }; + ASSERT_OK( + batch.AssignTimestamp(std::string(timestamp_size, '\xef'), checker2)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xef'), cf_to_ucmps)); + + std::vector ts_strs; + for (size_t i = 0; i < 3 * key_strs.size(); ++i) { + if (0 == (i % 3)) { + ts_strs.emplace_back(); + } else { + ts_strs.emplace_back(std::string(timestamp_size, '\xee')); + } + } + std::vector ts_vec(ts_strs.size()); + for (size_t i = 0; i < ts_vec.size(); ++i) { + ts_vec[i] = ts_strs[i]; + } + const auto checker3 = [&cf_to_ucmps](uint32_t cf, size_t& ts_sz) { + auto cf_iter = cf_to_ucmps.find(cf); + if (cf_iter == cf_to_ucmps.end()) { + return Status::Corruption("Invalid cf"); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + if (ucmp->timestamp_size() != ts_sz) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + }; + ASSERT_OK(batch.AssignTimestamps(ts_vec, checker3)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xee'), cf_to_ucmps)); +} + +TEST_F(WriteBatchTest, CommitWithTimestamp) { + WriteBatch wb; + const std::string txn_name = 
"xid1"; + std::string ts; + constexpr uint64_t commit_ts = 23; + PutFixed64(&ts, commit_ts); + ASSERT_OK(WriteBatchInternal::MarkCommitWithTimestamp(&wb, txn_name, ts)); + TestHandler handler; + ASSERT_OK(wb.Iterate(&handler)); + ASSERT_EQ("MarkCommitWithTimestamp(" + txn_name + ", " + + Slice(ts).ToString(true) + ")", + handler.seen); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_callback_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_callback_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_callback_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_callback_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -84,13 +84,35 @@ bool AllowWriteBatching() override { return allow_batching_; } }; -TEST_F(WriteCallbackTest, WriteWithCallbackTest) { +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class WriteCallbackPTest + : public WriteCallbackTest, + public ::testing::WithParamInterface< + std::tuple> { + public: + WriteCallbackPTest() { + std::tie(unordered_write_, seq_per_batch_, two_queues_, allow_parallel_, + allow_batching_, enable_WAL_, enable_pipelined_write_) = + GetParam(); + } + + protected: + bool unordered_write_; + bool seq_per_batch_; + bool two_queues_; + bool allow_parallel_; + bool allow_batching_; + bool enable_WAL_; + bool enable_pipelined_write_; +}; + +TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { struct WriteOP { WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; } void Put(const string& key, const string& val) { kvs_.push_back(std::make_pair(key, val)); - write_batch_.Put(key, val); + ASSERT_OK(write_batch_.Put(key, val)); } void Clear() { @@ -124,254 +146,239 @@ {false, false, true, false, true}, }; - for (auto& unordered_write : {true, false}) { - for (auto& seq_per_batch : {true, false}) { - for (auto& two_queues : {true, false}) { - for 
(auto& allow_parallel : {true, false}) { - for (auto& allow_batching : {true, false}) { - for (auto& enable_WAL : {true, false}) { - for (auto& enable_pipelined_write : {true, false}) { - for (auto& write_group : write_scenarios) { - Options options; - options.create_if_missing = true; - options.unordered_write = unordered_write; - options.allow_concurrent_memtable_write = allow_parallel; - options.enable_pipelined_write = enable_pipelined_write; - options.two_write_queues = two_queues; - // Skip unsupported combinations - if (options.enable_pipelined_write && seq_per_batch) { - continue; - } - if (options.enable_pipelined_write && options.two_write_queues) { - continue; - } - if (options.unordered_write && - !options.allow_concurrent_memtable_write) { - continue; - } - if (options.unordered_write && options.enable_pipelined_write) { - continue; - } - - ReadOptions read_options; - DB* db; - DBImpl* db_impl; - - DestroyDB(dbname, options); - - DBOptions db_options(options); - ColumnFamilyOptions cf_options(options); - std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); - std::vector handles; - auto open_s = - DBImpl::Open(db_options, dbname, column_families, &handles, - &db, seq_per_batch, true /* batch_per_txn */); - ASSERT_OK(open_s); - assert(handles.size() == 1); - delete handles[0]; - - db_impl = dynamic_cast(db); - ASSERT_TRUE(db_impl); - - // Writers that have called JoinBatchGroup. - std::atomic threads_joining(0); - // Writers that have linked to the queue - std::atomic threads_linked(0); - // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point. 
- std::atomic threads_verified(0); - - std::atomic seq(db_impl->GetLatestSequenceNumber()); - ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WriteThread::JoinBatchGroup:Start", [&](void*) { - uint64_t cur_threads_joining = threads_joining.fetch_add(1); - // Wait for the last joined writer to link to the queue. - // In this way the writers link to the queue one by one. - // This allows us to confidently detect the first writer - // who increases threads_linked as the leader. - while (threads_linked.load() < cur_threads_joining) { - } - }); - - // Verification once writers call JoinBatchGroup. - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { - uint64_t cur_threads_linked = threads_linked.fetch_add(1); - bool is_leader = false; - bool is_last = false; - - // who am i - is_leader = (cur_threads_linked == 0); - is_last = (cur_threads_linked == write_group.size() - 1); - - // check my state - auto* writer = reinterpret_cast(arg); - - if (is_leader) { - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_GROUP_LEADER); - } else { - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_INIT); - } - - // (meta test) the first WriteOP should indeed be the first - // and the last should be the last (all others can be out of - // order) - if (is_leader) { - ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == - !write_group.front().callback_.should_fail_); - } else if (is_last) { - ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == - !write_group.back().callback_.should_fail_); - } - - threads_verified.fetch_add(1); - // Wait here until all verification in this sync-point - // callback finish for all writers. 
- while (threads_verified.load() < write_group.size()) { - } - }); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) { - // check my state - auto* writer = reinterpret_cast(arg); - - if (!allow_batching) { - // no batching so everyone should be a leader - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_GROUP_LEADER); - } else if (!allow_parallel) { - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_COMPLETED || - (enable_pipelined_write && - writer->state == - WriteThread::State:: - STATE_MEMTABLE_WRITER_LEADER)); - } - }); - - std::atomic thread_num(0); - std::atomic dummy_key(0); - - // Each write thread create a random write batch and write to DB - // with a write callback. - std::function write_with_callback_func = [&]() { - uint32_t i = thread_num.fetch_add(1); - Random rnd(i); - - // leaders gotta lead - while (i > 0 && threads_verified.load() < 1) { - } - - // loser has to lose - while (i == write_group.size() - 1 && - threads_verified.load() < write_group.size() - 1) { - } - - auto& write_op = write_group.at(i); - write_op.Clear(); - write_op.callback_.allow_batching_ = allow_batching; - - // insert some keys - for (uint32_t j = 0; j < rnd.Next() % 50; j++) { - // grab unique key - char my_key = dummy_key.fetch_add(1); - - string skey(5, my_key); - string sval(10, my_key); - write_op.Put(skey, sval); - - if (!write_op.callback_.should_fail_ && !seq_per_batch) { - seq.fetch_add(1); - } - } - if (!write_op.callback_.should_fail_ && seq_per_batch) { - seq.fetch_add(1); - } - - WriteOptions woptions; - woptions.disableWAL = !enable_WAL; - woptions.sync = enable_WAL; - Status s; - if (seq_per_batch) { - class PublishSeqCallback : public PreReleaseCallback { - public: - PublishSeqCallback(DBImpl* db_impl_in) - : db_impl_(db_impl_in) {} - Status Callback(SequenceNumber last_seq, bool /*not used*/, - uint64_t, size_t /*index*/, - size_t /*total*/) override { - 
db_impl_->SetLastPublishedSequence(last_seq); - return Status::OK(); - } - DBImpl* db_impl_; - } publish_seq_callback(db_impl); - // seq_per_batch requires a natural batch separator or Noop - WriteBatchInternal::InsertNoop(&write_op.write_batch_); - const size_t ONE_BATCH = 1; - s = db_impl->WriteImpl( - woptions, &write_op.write_batch_, &write_op.callback_, - nullptr, 0, false, nullptr, ONE_BATCH, - two_queues ? &publish_seq_callback : nullptr); - } else { - s = db_impl->WriteWithCallback( - woptions, &write_op.write_batch_, &write_op.callback_); - } - - if (write_op.callback_.should_fail_) { - ASSERT_TRUE(s.IsBusy()); - } else { - ASSERT_OK(s); - } - }; - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - // do all the writes - std::vector threads; - for (uint32_t i = 0; i < write_group.size(); i++) { - threads.emplace_back(write_with_callback_func); - } - for (auto& t : threads) { - t.join(); - } - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - - // check for keys - string value; - for (auto& w : write_group) { - ASSERT_TRUE(w.callback_.was_called_.load()); - for (auto& kvp : w.kvs_) { - if (w.callback_.should_fail_) { - ASSERT_TRUE( - db->Get(read_options, kvp.first, &value).IsNotFound()); - } else { - ASSERT_OK(db->Get(read_options, kvp.first, &value)); - ASSERT_EQ(value, kvp.second); - } - } - } - - ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence()); - - delete db; - DestroyDB(dbname, options); - } + for (auto& write_group : write_scenarios) { + Options options; + options.create_if_missing = true; + options.unordered_write = unordered_write_; + options.allow_concurrent_memtable_write = allow_parallel_; + options.enable_pipelined_write = enable_pipelined_write_; + options.two_write_queues = two_queues_; + // Skip unsupported combinations + if (options.enable_pipelined_write && seq_per_batch_) { + continue; + } + if (options.enable_pipelined_write && options.two_write_queues) { + continue; + } + if 
(options.unordered_write && !options.allow_concurrent_memtable_write) { + continue; + } + if (options.unordered_write && options.enable_pipelined_write) { + continue; + } + + ReadOptions read_options; + DB* db; + DBImpl* db_impl; + + DestroyDB(dbname, options); + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles, + &db, seq_per_batch_, true /* batch_per_txn */); + ASSERT_OK(open_s); + assert(handles.size() == 1); + delete handles[0]; + + db_impl = dynamic_cast(db); + ASSERT_TRUE(db_impl); + + // Writers that have called JoinBatchGroup. + std::atomic threads_joining(0); + // Writers that have linked to the queue + std::atomic threads_linked(0); + // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point. + std::atomic threads_verified(0); + + std::atomic seq(db_impl->GetLatestSequenceNumber()); + ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Start", [&](void*) { + uint64_t cur_threads_joining = threads_joining.fetch_add(1); + // Wait for the last joined writer to link to the queue. + // In this way the writers link to the queue one by one. + // This allows us to confidently detect the first writer + // who increases threads_linked as the leader. + while (threads_linked.load() < cur_threads_joining) { + } + }); + + // Verification once writers call JoinBatchGroup. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + uint64_t cur_threads_linked = threads_linked.fetch_add(1); + bool is_leader = false; + bool is_last = false; + + // who am i + is_leader = (cur_threads_linked == 0); + is_last = (cur_threads_linked == write_group.size() - 1); + + // check my state + auto* writer = reinterpret_cast(arg); + + if (is_leader) { + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_GROUP_LEADER); + } else { + ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT); + } + + // (meta test) the first WriteOP should indeed be the first + // and the last should be the last (all others can be out of + // order) + if (is_leader) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.front().callback_.should_fail_); + } else if (is_last) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.back().callback_.should_fail_); + } + + threads_verified.fetch_add(1); + // Wait here until all verification in this sync-point + // callback finish for all writers. + while (threads_verified.load() < write_group.size()) { + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) { + // check my state + auto* writer = reinterpret_cast(arg); + + if (!allow_batching_) { + // no batching so everyone should be a leader + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_GROUP_LEADER); + } else if (!allow_parallel_) { + ASSERT_TRUE(writer->state == WriteThread::State::STATE_COMPLETED || + (enable_pipelined_write_ && + writer->state == + WriteThread::State::STATE_MEMTABLE_WRITER_LEADER)); + } + }); + + std::atomic thread_num(0); + std::atomic dummy_key(0); + + // Each write thread create a random write batch and write to DB + // with a write callback. 
+ std::function write_with_callback_func = [&]() { + uint32_t i = thread_num.fetch_add(1); + Random rnd(i); + + // leaders gotta lead + while (i > 0 && threads_verified.load() < 1) { + } + + // loser has to lose + while (i == write_group.size() - 1 && + threads_verified.load() < write_group.size() - 1) { + } + + auto& write_op = write_group.at(i); + write_op.Clear(); + write_op.callback_.allow_batching_ = allow_batching_; + + // insert some keys + for (uint32_t j = 0; j < rnd.Next() % 50; j++) { + // grab unique key + char my_key = dummy_key.fetch_add(1); + + string skey(5, my_key); + string sval(10, my_key); + write_op.Put(skey, sval); + + if (!write_op.callback_.should_fail_ && !seq_per_batch_) { + seq.fetch_add(1); + } + } + if (!write_op.callback_.should_fail_ && seq_per_batch_) { + seq.fetch_add(1); + } + + WriteOptions woptions; + woptions.disableWAL = !enable_WAL_; + woptions.sync = enable_WAL_; + Status s; + if (seq_per_batch_) { + class PublishSeqCallback : public PreReleaseCallback { + public: + PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {} + Status Callback(SequenceNumber last_seq, bool /*not used*/, uint64_t, + size_t /*index*/, size_t /*total*/) override { + db_impl_->SetLastPublishedSequence(last_seq); + return Status::OK(); } + DBImpl* db_impl_; + } publish_seq_callback(db_impl); + // seq_per_batch_ requires a natural batch separator or Noop + ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_)); + const size_t ONE_BATCH = 1; + s = db_impl->WriteImpl(woptions, &write_op.write_batch_, + &write_op.callback_, nullptr, 0, false, nullptr, + ONE_BATCH, + two_queues_ ? 
&publish_seq_callback : nullptr); + } else { + s = db_impl->WriteWithCallback(woptions, &write_op.write_batch_, + &write_op.callback_); + } + + if (write_op.callback_.should_fail_) { + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + }; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // do all the writes + std::vector threads; + for (uint32_t i = 0; i < write_group.size(); i++) { + threads.emplace_back(write_with_callback_func); + } + for (auto& t : threads) { + t.join(); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // check for keys + string value; + for (auto& w : write_group) { + ASSERT_TRUE(w.callback_.was_called_.load()); + for (auto& kvp : w.kvs_) { + if (w.callback_.should_fail_) { + ASSERT_TRUE(db->Get(read_options, kvp.first, &value).IsNotFound()); + } else { + ASSERT_OK(db->Get(read_options, kvp.first, &value)); + ASSERT_EQ(value, kvp.second); } } } - } - } + + ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence()); + + delete db; + DestroyDB(dbname, options); } } +INSTANTIATE_TEST_CASE_P(WriteCallbackPTest, WriteCallbackPTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + TEST_F(WriteCallbackTest, WriteCallBackTest) { Options options; WriteOptions write_options; @@ -391,8 +398,8 @@ WriteBatch wb; - wb.Put("a", "value.a"); - wb.Delete("x"); + ASSERT_OK(wb.Put("a", "value.a")); + ASSERT_OK(wb.Delete("x")); // Test a simple Write s = db->Write(write_options, &wb); @@ -406,7 +413,7 @@ WriteCallbackTestWriteCallback1 callback1; WriteBatch wb2; - wb2.Put("a", "value.a2"); + ASSERT_OK(wb2.Put("a", "value.a2")); s = db_impl->WriteWithCallback(write_options, &wb2, &callback1); ASSERT_OK(s); @@ -420,7 +427,7 @@ WriteCallbackTestWriteCallback2 callback2; WriteBatch wb3; - wb3.Put("a", "value.a3"); 
+ ASSERT_OK(wb3.Put("a", "value.a3")); s = db_impl->WriteWithCallback(write_options, &wb3, &callback2); ASSERT_NOK(s); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,12 @@ #include "db/write_controller.h" +#include #include #include #include -#include "rocksdb/env.h" + +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -19,10 +21,14 @@ std::unique_ptr WriteController::GetDelayToken( uint64_t write_rate) { - total_delayed_++; - // Reset counters. - last_refill_time_ = 0; - bytes_left_ = 0; + if (0 == total_delayed_++) { + // Starting delay, so reset counters. + next_refill_time_ = 0; + credit_in_bytes_ = 0; + } + // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in + // next_refill_time_ will be based on an old rate. This rate will apply + // for subsequent additional debts and for the next refill. set_delayed_write_rate(write_rate); return std::unique_ptr(new DelayWriteToken(this)); } @@ -42,7 +48,7 @@ // If it turns out to be a performance issue, we can redesign the thread // synchronization model here. // The function trust caller will sleep micros returned. -uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) { +uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) { if (total_stopped_.load(std::memory_order_relaxed) > 0) { return 0; } @@ -50,64 +56,51 @@ return 0; } - const uint64_t kMicrosPerSecond = 1000000; - const uint64_t kRefillInterval = 1024U; - - if (bytes_left_ >= num_bytes) { - bytes_left_ -= num_bytes; + if (credit_in_bytes_ >= num_bytes) { + credit_in_bytes_ -= num_bytes; return 0; } // The frequency to get time inside DB mutex is less than one per refill // interval. 
- auto time_now = NowMicrosMonotonic(env); + auto time_now = NowMicrosMonotonic(clock); - uint64_t sleep_debt = 0; - uint64_t time_since_last_refill = 0; - if (last_refill_time_ != 0) { - if (last_refill_time_ > time_now) { - sleep_debt = last_refill_time_ - time_now; - } else { - time_since_last_refill = time_now - last_refill_time_; - bytes_left_ += - static_cast(static_cast(time_since_last_refill) / - kMicrosPerSecond * delayed_write_rate_); - if (time_since_last_refill >= kRefillInterval && - bytes_left_ > num_bytes) { - // If refill interval already passed and we have enough bytes - // return without extra sleeping. - last_refill_time_ = time_now; - bytes_left_ -= num_bytes; - return 0; - } + const uint64_t kMicrosPerSecond = 1000000; + // Refill every 1 ms + const uint64_t kMicrosPerRefill = 1000; + + if (next_refill_time_ == 0) { + // Start with an initial allotment of bytes for one interval + next_refill_time_ = time_now; + } + if (next_refill_time_ <= time_now) { + // Refill based on time interval plus any extra elapsed + uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill; + credit_in_bytes_ += static_cast( + 1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999); + next_refill_time_ = time_now + kMicrosPerRefill; + + if (credit_in_bytes_ >= num_bytes) { + // Avoid delay if possible, to reduce DB mutex release & re-aquire. + credit_in_bytes_ -= num_bytes; + return 0; } } - uint64_t single_refill_amount = - delayed_write_rate_ * kRefillInterval / kMicrosPerSecond; - if (bytes_left_ + single_refill_amount >= num_bytes) { - // Wait until a refill interval - // Never trigger expire for less than one refill interval to avoid to get - // time. - bytes_left_ = bytes_left_ + single_refill_amount - num_bytes; - last_refill_time_ = time_now + kRefillInterval; - return kRefillInterval + sleep_debt; - } - - // Need to refill more than one interval. Need to sleep longer. 
Check - // whether expiration will hit - - // Sleep just until `num_bytes` is allowed. - uint64_t sleep_amount = - static_cast(num_bytes / - static_cast(delayed_write_rate_) * - kMicrosPerSecond) + - sleep_debt; - last_refill_time_ = time_now + sleep_amount; - return sleep_amount; + // We need to delay to avoid exceeding write rate. + assert(num_bytes > credit_in_bytes_); + uint64_t bytes_over_budget = num_bytes - credit_in_bytes_; + uint64_t needed_delay = static_cast( + 1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond); + + credit_in_bytes_ = 0; + next_refill_time_ += needed_delay; + + // Minimum delay of refill interval, to reduce DB mutex contention. + return std::max(next_refill_time_ - time_now, kMicrosPerRefill); } -uint64_t WriteController::NowMicrosMonotonic(Env* env) { - return env->NowNanos() / std::milli::den; +uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) { + return clock->NowNanos() / std::milli::den; } StopWriteToken::~StopWriteToken() { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,7 @@ namespace ROCKSDB_NAMESPACE { -class Env; +class SystemClock; class WriteControllerToken; // WriteController is controlling write stalls in our write code-path. Write @@ -27,8 +27,8 @@ : total_stopped_(0), total_delayed_(0), total_compaction_pressure_(0), - bytes_left_(0), - last_refill_time_(0), + credit_in_bytes_(0), + next_refill_time_(0), low_pri_rate_limiter_( NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) { set_max_delayed_write_rate(_delayed_write_rate); @@ -57,7 +57,7 @@ // return how many microseconds the caller needs to sleep after the call // num_bytes: how many number of bytes to put into the DB. 
// Prerequisite: DB mutex held. - uint64_t GetDelay(Env* env, uint64_t num_bytes); + uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes); void set_delayed_write_rate(uint64_t write_rate) { // avoid divide 0 if (write_rate == 0) { @@ -85,7 +85,7 @@ RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); } private: - uint64_t NowMicrosMonotonic(Env* env); + uint64_t NowMicrosMonotonic(SystemClock* clock); friend class WriteControllerToken; friend class StopWriteToken; @@ -95,11 +95,14 @@ std::atomic total_stopped_; std::atomic total_delayed_; std::atomic total_compaction_pressure_; - uint64_t bytes_left_; - uint64_t last_refill_time_; - // write rate set when initialization or by `DBImpl::SetDBOptions` + + // Number of bytes allowed to write without delay + uint64_t credit_in_bytes_; + // Next time that we can add more credit of bytes + uint64_t next_refill_time_; + // Write rate set when initialization or by `DBImpl::SetDBOptions` uint64_t max_delayed_write_rate_; - // current write rate + // Current write rate (bytes / second) uint64_t delayed_write_rate_; std::unique_ptr low_pri_rate_limiter_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,128 +3,240 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#include - #include "db/write_controller.h" -#include "rocksdb/env.h" +#include +#include + +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - -class WriteControllerTest : public testing::Test {}; - -class TimeSetEnv : public EnvWrapper { +namespace { +class TimeSetClock : public SystemClockWrapper { public: - explicit TimeSetEnv() : EnvWrapper(nullptr) {} + explicit TimeSetClock() : SystemClockWrapper(nullptr) {} + const char* Name() const override { return "TimeSetClock"; } uint64_t now_micros_ = 6666; uint64_t NowNanos() override { return now_micros_ * std::milli::den; } }; +} // namespace +class WriteControllerTest : public testing::Test { + public: + WriteControllerTest() { clock_ = std::make_shared(); } + std::shared_ptr clock_; +}; + +// Make tests easier to read +#define MILLION *1000000u +#define MB MILLION +#define MBPS MILLION +#define SECS MILLION // in microseconds + +TEST_F(WriteControllerTest, BasicAPI) { + WriteController controller(40 MBPS); // also set max delayed rate + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + // set, get + controller.set_delayed_write_rate(20 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + { + // set with token, get + auto delay_token_0 = controller.GetDelayToken(10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + // test with delay + EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 2 SECS; // pay the "debt" + + auto delay_token_1 = controller.GetDelayToken(2 MBPS); + EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB)); + 
clock_->now_micros_ += 10 SECS; // pay the "debt" + + auto delay_token_2 = controller.GetDelayToken(1 MBPS); + EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 20 SECS; // pay the "debt" + + auto delay_token_3 = controller.GetDelayToken(20 MBPS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 1 SECS; // pay the "debt" + + // 60M is more than the max rate of 40M. Max rate will be used. + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + auto delay_token_4 = + controller.GetDelayToken(controller.delayed_write_rate() * 3); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_EQ(static_cast(0.5 SECS), + controller.GetDelay(clock_.get(), 20 MB)); + + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + + // Test stop tokens + { + auto stop_token_1 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + { + auto stop_token_2 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + // Stop tokens released + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + // pay the previous "debt" + clock_->now_micros_ += static_cast(0.5 SECS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB)); + } -TEST_F(WriteControllerTest, ChangeDelayRateTest) { - TimeSetEnv env; - WriteController controller(40000000u); // also set max delayed rate - controller.set_delayed_write_rate(10000000u); + // Delay tokens released + EXPECT_FALSE(controller.NeedsDelay()); +} + +TEST_F(WriteControllerTest, StartFilled) { + WriteController controller(10 MBPS); + + // Attempt to write two things that combined would be allowed within + // a single 
refill interval auto delay_token_0 = controller.GetDelayToken(controller.delayed_write_rate()); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_1 = controller.GetDelayToken(2000000u); - ASSERT_EQ(static_cast(10000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_2 = controller.GetDelayToken(1000000u); - ASSERT_EQ(static_cast(20000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_3 = controller.GetDelayToken(20000000u); - ASSERT_EQ(static_cast(1000000), - controller.GetDelay(&env, 20000000u)); - // This is more than max rate. Max delayed rate will be used. - auto delay_token_4 = - controller.GetDelayToken(controller.delayed_write_rate() * 3); - ASSERT_EQ(static_cast(500000), - controller.GetDelay(&env, 20000000u)); + + // Verify no delay because write rate has not been exceeded within + // refill interval. + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + + // Allow refill (kMicrosPerRefill) + clock_->now_micros_ += 1000; + + // Again + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + + // Control: something bigger that would exceed write rate within interval + uint64_t delay = controller.GetDelay(clock_.get(), 10 MB); + EXPECT_GT(1.0 * delay, 0.999 SECS); + EXPECT_LT(1.0 * delay, 1.001 SECS); +} + +TEST_F(WriteControllerTest, DebtAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Accumulate a time delay debt with no passage of time, like many column + // families delaying writes simultaneously. (Old versions of WriteController + // would reset the debt on every GetDelayToken.) 
+ uint64_t debt = 0; + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; + + // Now accumulate debt with some passage of time. + for (unsigned i = 0; i < tokens.size(); ++i) { + // Debt is accumulated in time, not in bytes, so this new write + // limit is not applied to prior requested delays, even it they are + // in progress. + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + uint64_t credit = debt / 2; + clock_->now_micros_ += credit; + debt -= credit; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; // consistent state + (void)debt; // appease clang-analyze + + // Verify paid down + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + + // Accumulate another debt, without accounting, and releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + // Big and small are delayed + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB)); + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + tokens[i].reset(); + } + // All tokens released. + // Verify that releasing all tokens pays down debt, even with no time passage. 
+ tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); } -TEST_F(WriteControllerTest, SanityTest) { - WriteController controller(10000000u); - auto stop_token_1 = controller.GetStopToken(); - auto stop_token_2 = controller.GetStopToken(); - - ASSERT_TRUE(controller.IsStopped()); - stop_token_1.reset(); - ASSERT_TRUE(controller.IsStopped()); - stop_token_2.reset(); - ASSERT_FALSE(controller.IsStopped()); - - TimeSetEnv env; - - auto delay_token_1 = controller.GetDelayToken(10000000u); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 - - auto delay_token_2 = controller.GetDelayToken(10000000u); - // Rate reset after changing the token. - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 - - // One refill: 10240 bytes allowed, 1000 used, 9240 left - ASSERT_EQ(static_cast(1124), controller.GetDelay(&env, 1000u)); - env.now_micros_ += 1124u; // sleep debt 0 - - delay_token_2.reset(); - // 1000 used, 8240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 100 - // 1000 used, 7240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 200 - // One refill: 10240 fileed, sleep credit generates 2000. 8000 used - // 7240 + 10240 + 2000 - 8000 = 11480 left - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 8000u)); - - env.now_micros_ += 200u; // sleep debt 824 - // 1000 used, 10480 left. - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 200u; // sleep debt 624 - // Out of bound sleep, still 10480 left - ASSERT_EQ(static_cast(3000624u), - controller.GetDelay(&env, 30000000u)); - - env.now_micros_ += 3000724u; // sleep credit 100 - // 6000 used, 4480 left. 
- ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 6000u)); - - env.now_micros_ += 200u; // sleep credit 300 - // One refill, credit 4480 balance + 3000 credit + 10240 refill - // Use 8000, 9720 left - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 8000u)); - - env.now_micros_ += 3024u; // sleep credit 2000 - - // 1720 left - ASSERT_EQ(static_cast(0u), controller.GetDelay(&env, 8000u)); - - // 1720 balance + 20000 credit = 20170 left - // Use 8000, 12170 left - ASSERT_EQ(static_cast(0u), controller.GetDelay(&env, 8000u)); - - // 4170 left - ASSERT_EQ(static_cast(0u), controller.GetDelay(&env, 8000u)); - - // Need a refill - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 9000u)); - - delay_token_1.reset(); - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 30000000u)); - delay_token_1.reset(); - ASSERT_FALSE(controller.IsStopped()); +// This may or may not be a "good" feature, but it's an old feature +TEST_F(WriteControllerTest, CreditAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Ensure started + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit + uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */; + clock_->now_micros_ += credit; + + // Spend some credit (burst of I/O) + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB)); + // In WriteController, credit is accumulated in bytes, not in time. + // After an "unnecessary" delay, all of our time credit will be + // translated to bytes on the next operation, in this case with + // setting 1 MBPS. So regardless of the rate at delay time, we just + // account for the bytes. 
+ credit -= 63 MB; + } + // Spend remaining credit + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit)); + // Verify + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit, no accounting + clock_->now_micros_ += 1000 SECS; + + // Spend a small amount, releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB)); + tokens[i].reset(); + } + + // All tokens released. + // Verify credit is wiped away on new delay. + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.cc 2025-05-19 16:14:27.000000000 +0000 @@ -208,6 +208,7 @@ } void WriteThread::SetState(Writer* w, uint8_t new_state) { + assert(w); auto state = w->state.load(std::memory_order_acquire); if (state == STATE_LOCKED_WAITING || !w->state.compare_exchange_strong(state, new_state)) { @@ -240,6 +241,7 @@ MutexLock lock(&stall_mu_); writers = newest_writer->load(std::memory_order_relaxed); if (writers == &write_stall_dummy_) { + TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w); stall_cv_.Wait(); // Load newest_writers_ again since it may have changed writers = newest_writer->load(std::memory_order_relaxed); @@ -344,7 +346,13 @@ prev->link_older = w->link_older; w->status = Status::Incomplete("Write stall"); SetState(w, STATE_COMPLETED); - if (prev->link_older) { + // Only update `link_newer` if it's already set. 
+ // `CreateMissingNewerLinks()` will update the nullptr `link_newer` later, + // which assumes the the first non-nullptr `link_newer` is the last + // nullptr link in the writer list. + // If `link_newer` is set here, `CreateMissingNewerLinks()` may stop + // updating the whole list when it sees the first non nullptr link. + if (prev->link_older && prev->link_older->link_newer) { prev->link_older->link_newer = prev; } w = prev->link_older; @@ -438,6 +446,7 @@ // (newest_writer) is inclusive. Iteration goes from old to new. Writer* w = leader; while (w != newest_writer) { + assert(w->link_newer); w = w->link_newer; if (w->sync && !leader->sync) { @@ -457,6 +466,11 @@ break; } + if (w->protection_bytes_per_key != leader->protection_bytes_per_key) { + // Do not mix writes with different levels of integrity protection. + break; + } + if (w->batch == nullptr) { // Do not include those writes with nullptr batch. Those are not writes, // those are something else. They want to be alone @@ -464,7 +478,7 @@ } if (w->callback != nullptr && !w->callback->AllowWriteBatching()) { - // dont batch writes that don't want to be batched + // don't batch writes that don't want to be batched break; } @@ -512,6 +526,7 @@ Writer* w = leader; while (w != newest_writer) { + assert(w->link_newer); w = w->link_newer; if (w->batch == nullptr) { @@ -568,6 +583,7 @@ if (w == last_writer) { break; } + assert(next); w = next; } // Note that leader has to exit last, since it owns the write group. @@ -599,6 +615,8 @@ } // else we're the last parallel worker and should perform exit duties. w->status = write_group->status; + // Callers of this function must ensure w->status is checked. 
+ write_group->status.PermitUncheckedError(); return true; } @@ -615,11 +633,17 @@ static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader"); void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, - Status status) { + Status& status) { Writer* leader = write_group.leader; Writer* last_writer = write_group.last_writer; assert(leader->link_older == nullptr); + // If status is non-ok already, then write_group.status won't have the chance + // of being propagated to caller. + if (!status.ok()) { + write_group.status.PermitUncheckedError(); + } + // Propagate memtable write error to the whole group. if (status.ok() && !write_group.status.ok()) { status = write_group.status; @@ -721,6 +745,7 @@ // leader now while (last_writer != leader) { + assert(last_writer); last_writer->status = status; // we need to read link_older before calling SetState, because as soon // as it is marked committed the other thread's Await may return and diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,11 +5,11 @@ #pragma once -#include -#include #include +#include #include #include +#include #include #include #include @@ -36,7 +36,7 @@ // non-parallel informs a follower that its writes have been committed // (-> STATE_COMPLETED), or when a leader that has chosen to perform // updates in parallel and needs this Writer to apply its batch (-> - // STATE_PARALLEL_FOLLOWER). + // STATE_PARALLEL_MEMTABLE_WRITER). 
STATE_INIT = 1, // The state used to inform a waiting Writer that it has become the @@ -119,6 +119,7 @@ bool disable_wal; bool disable_memtable; size_t batch_cnt; // if non-zero, number of sub-batches in the write batch + size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference @@ -128,7 +129,7 @@ WriteGroup* write_group; SequenceNumber sequence; // the sequence number to use for the first key Status status; - Status callback_status; // status returned by callback->Callback() + Status callback_status; // status returned by callback->Callback() std::aligned_storage::type state_mutex_bytes; std::aligned_storage::type state_cv_bytes; @@ -142,6 +143,7 @@ disable_wal(false), disable_memtable(false), batch_cnt(0), + protection_bytes_per_key(0), pre_release_callback(nullptr), log_used(0), log_ref(0), @@ -163,6 +165,7 @@ disable_wal(write_options.disableWAL), disable_memtable(_disable_memtable), batch_cnt(_batch_cnt), + protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), log_used(0), log_ref(_log_ref), @@ -179,6 +182,8 @@ StateMutex().~mutex(); StateCV().~condition_variable(); } + status.PermitUncheckedError(); + callback_status.PermitUncheckedError(); } bool CheckCallback(DB* db) { @@ -241,7 +246,7 @@ std::condition_variable& StateCV() { assert(made_waitable); return *static_cast( - static_cast(&state_cv_bytes)); + static_cast(&state_cv_bytes)); } }; @@ -268,7 +273,7 @@ // STATE_GROUP_LEADER. If w has been made part of a sequential batch // group and the leader has performed the write, returns STATE_DONE. // If w has been made part of a parallel batch group and is responsible - // for updating the memtable, returns STATE_PARALLEL_FOLLOWER. + // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER. 
// // The db mutex SHOULD NOT be held when calling this function, because // it will block. @@ -289,7 +294,7 @@ // // WriteGroup* write_group: the write group // Status status: Status of write operation - void ExitAsBatchGroupLeader(WriteGroup& write_group, Status status); + void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status); // Exit batch group on behalf of batch group leader. void ExitAsBatchGroupFollower(Writer* w); @@ -305,8 +310,8 @@ // the next leader if needed. void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group); - // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the - // non-leader members of this write batch group. Sets Writer::sequence + // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of + // the non-leader members of this write batch group. Sets Writer::sequence // before waking them up. // // WriteGroup* write_group: Extra state used to coordinate the parallel add diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -1,14 +1,17 @@ add_executable(db_stress${ARTIFACT_SUFFIX} - db_stress.cc - db_stress_tool.cc batched_ops_stress.cc cf_consistency_stress.cc + db_stress.cc db_stress_common.cc db_stress_driver.cc - db_stress_test_base.cc - db_stress_shared_state.cc db_stress_gflags.cc + db_stress_listener.cc + db_stress_shared_state.cc + db_stress_stat.cc + db_stress_test_base.cc db_stress_tool.cc + expected_state.cc + multi_ops_txns_stress.cc no_batched_ops_stress.cc) -target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB}) +target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) list(APPEND tool_deps db_stress) diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,6 +16,8 @@ BatchedOpsStressTest() {} virtual ~BatchedOpsStressTest() {} + bool IsStateTracked() const override { return false; } + // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ... // ("9"+K, "9"+V) in DB atomically i.e in a single batch. // Also refer BatchedOpsStressTest::TestGet @@ -31,7 +33,8 @@ std::string keys[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; std::string values[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; Slice value_slices[10]; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); @@ -66,7 +69,8 @@ std::unique_ptr& /* lock */) override { std::string keys[10] = {"9", "7", "5", "3", "1", "8", "6", "4", "2", "0"}; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); @@ -228,7 +232,8 @@ for (size_t i = 1; i < num_prefixes; i++) { if (values[i] != values[0]) { fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", - key_str[i].c_str(), StringToHex(values[0].ToString()).c_str(), + StringToHex(key_str[i]).c_str(), + StringToHex(values[0].ToString()).c_str(), StringToHex(values[i].ToString()).c_str()); // we continue after error rather than exiting so that we can // find more errors if any diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#include "file/file_util.h" namespace ROCKSDB_NAMESPACE { class CfConsistencyStressTest : public StressTest { @@ -17,6 +18,8 @@ ~CfConsistencyStressTest() override {} + bool IsStateTracked() const override { return false; } + Status TestPut(ThreadState* thread, WriteOptions& write_opts, const ReadOptions& /* read_opts */, const std::vector& rand_column_families, @@ -282,70 +285,6 @@ return column_families_[thread->rand.Next() % column_families_.size()]; } -#ifdef ROCKSDB_LITE - Status TestCheckpoint(ThreadState* /* thread */, - const std::vector& /* rand_column_families */, - const std::vector& /* rand_keys */) override { - assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestCheckpoint\n"); - std::terminate(); - } -#else - Status TestCheckpoint(ThreadState* thread, - const std::vector& /* rand_column_families */, - const std::vector& /* rand_keys */) override { - std::string checkpoint_dir = - FLAGS_db + "/.checkpoint" + ToString(thread->tid); - - // We need to clear DB including manifest files, so make a copy - Options opt_copy = options_; - opt_copy.env = db_stress_env->target(); - DestroyDB(checkpoint_dir, opt_copy); - - Checkpoint* checkpoint = nullptr; - Status s = Checkpoint::Create(db_, &checkpoint); - if (s.ok()) { - s = checkpoint->CreateCheckpoint(checkpoint_dir); - } - std::vector cf_handles; - DB* checkpoint_db = nullptr; - if (s.ok()) { - delete checkpoint; - checkpoint = nullptr; - Options options(options_); - options.listeners.clear(); - std::vector 
cf_descs; - // TODO(ajkr): `column_family_names_` is not safe to access here when - // `clear_column_family_one_in != 0`. But we can't easily switch to - // `ListColumnFamilies` to get names because it won't necessarily give - // the same order as `column_family_names_`. - if (FLAGS_clear_column_family_one_in == 0) { - for (const auto& name : column_family_names_) { - cf_descs.emplace_back(name, ColumnFamilyOptions(options)); - } - s = DB::OpenForReadOnly(DBOptions(options), checkpoint_dir, cf_descs, - &cf_handles, &checkpoint_db); - } - } - if (checkpoint_db != nullptr) { - for (auto cfh : cf_handles) { - delete cfh; - } - cf_handles.clear(); - delete checkpoint_db; - checkpoint_db = nullptr; - } - DestroyDB(checkpoint_dir, opt_copy); - if (!s.ok()) { - fprintf(stderr, "A checkpoint operation failed with: %s\n", - s.ToString().c_str()); - } - return s; - } -#endif // !ROCKSDB_LITE - void VerifyDb(ThreadState* thread) const override { ReadOptions options(FLAGS_verify_checksum, true); // We must set total_order_seek to true because we are doing a SeekToFirst diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,7 @@ return 1; } #else -#include +#include "rocksdb/db_stress_tool.h" int main(int argc, char** argv) { return ROCKSDB_NAMESPACE::db_stress_tool(argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc 2025-05-19 16:14:27.000000000 
+0000 @@ -10,9 +10,18 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" + #include -ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env = nullptr; +#include "util/file_checksum_helper.h" +#include "util/xxhash.h" + +ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr; +#ifndef NDEBUG +// If non-null, injects read error at a rate specified by the +// read_fault_one_in or write_fault_one_in flag +std::shared_ptr fault_fs_guard; +#endif // NDEBUG enum ROCKSDB_NAMESPACE::CompressionType compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e = @@ -21,7 +30,7 @@ ROCKSDB_NAMESPACE::kCRC32c; enum RepFactory FLAGS_rep_factory = kSkipList; std::vector sum_probs(100001); -int64_t zipf_sum_size = 100000; +constexpr int64_t zipf_sum_size = 100000; namespace ROCKSDB_NAMESPACE { @@ -151,8 +160,10 @@ snprintf(buf, 4, "%X", value[i]); tmp.append(buf); } - fprintf(stdout, "[CF %d] %" PRIi64 " == > (%" ROCKSDB_PRIszt ") %s\n", cf, - key, sz, tmp.c_str()); + auto key_str = Key(key); + Slice key_slice = key_str; + fprintf(stdout, "[CF %d] %s (%" PRIi64 ") == > (%" ROCKSDB_PRIszt ") %s\n", + cf, key_slice.ToString(true).c_str(), key, sz, tmp.c_str()); } // Note that if hot_key_alpha != 0, it generates the key based on Zipfian @@ -214,12 +225,129 @@ ((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult; assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t)); (void)max_sz; - *((uint32_t*)v) = rand; + PutUnaligned(reinterpret_cast(v), rand); for (size_t i = sizeof(uint32_t); i < value_sz; i++) { v[i] = (char)(rand ^ i); } v[value_sz] = '\0'; return value_sz; // the size of the value set. 
} + +uint32_t GetValueBase(Slice s) { + assert(s.size() >= sizeof(uint32_t)); + uint32_t res; + GetUnaligned(reinterpret_cast(s.data()), &res); + return res; +} + +std::string NowNanosStr() { + uint64_t t = db_stress_env->NowNanos(); + std::string ret; + PutFixed64(&ret, t); + return ret; +} + +std::string GenerateTimestampForRead() { return NowNanosStr(); } + +namespace { + +class MyXXH64Checksum : public FileChecksumGenerator { + public: + explicit MyXXH64Checksum(bool big) : big_(big) { + state_ = XXH64_createState(); + XXH64_reset(state_, 0); + } + + virtual ~MyXXH64Checksum() override { XXH64_freeState(state_); } + + void Update(const char* data, size_t n) override { + XXH64_update(state_, data, n); + } + + void Finalize() override { + assert(str_.empty()); + uint64_t digest = XXH64_digest(state_); + // Store as little endian raw bytes + PutFixed64(&str_, digest); + if (big_) { + // Throw in some more data for stress testing (448 bits total) + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + } + } + + std::string GetChecksum() const override { + assert(!str_.empty()); + return str_; + } + + const char* Name() const override { + return big_ ? 
"MyBigChecksum" : "MyXXH64Checksum"; + } + + private: + bool big_; + XXH64_state_t* state_; + std::string str_; +}; + +class DbStressChecksumGenFactory : public FileChecksumGenFactory { + std::string default_func_name_; + + std::unique_ptr CreateFromFuncName( + const std::string& func_name) { + std::unique_ptr rv; + if (func_name == "FileChecksumCrc32c") { + rv.reset(new FileChecksumGenCrc32c(FileChecksumGenContext())); + } else if (func_name == "MyXXH64Checksum") { + rv.reset(new MyXXH64Checksum(false /* big */)); + } else if (func_name == "MyBigChecksum") { + rv.reset(new MyXXH64Checksum(true /* big */)); + } else { + // Should be a recognized function when we get here + assert(false); + } + return rv; + } + + public: + explicit DbStressChecksumGenFactory(const std::string& default_func_name) + : default_func_name_(default_func_name) {} + + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + if (context.requested_checksum_func_name.empty()) { + return CreateFromFuncName(default_func_name_); + } else { + return CreateFromFuncName(context.requested_checksum_func_name); + } + } + + const char* Name() const override { return "FileChecksumGenCrc32cFactory"; } +}; + +} // namespace + +std::shared_ptr GetFileChecksumImpl( + const std::string& name) { + // Translate from friendly names to internal names + std::string internal_name; + if (name == "crc32c") { + internal_name = "FileChecksumCrc32c"; + } else if (name == "xxh64") { + internal_name = "MyXXH64Checksum"; + } else if (name == "big") { + internal_name = "MyBigChecksum"; + } else { + assert(name.empty() || name == "none"); + return nullptr; + } + return std::make_shared(internal_name); +} + } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h 2025-05-19 16:14:27.000000000 +0000 @@ -26,6 +26,7 @@ #include #include #include + #include #include #include @@ -58,6 +59,7 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/write_batch.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" @@ -66,12 +68,6 @@ #include "util/random.h" #include "util/string_util.h" #include "utilities/blob_db/blob_db.h" -// SyncPoint is not supported in Released Windows Mode. -#if !(defined NDEBUG) || !defined(OS_WIN) -#include "test_util/sync_point.h" -#endif // !(defined NDEBUG) || !defined(OS_WIN) -#include "test_util/testutil.h" - #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; @@ -91,6 +87,7 @@ DECLARE_bool(test_batches_snapshots); DECLARE_bool(atomic_flush); DECLARE_bool(test_cf_consistency); +DECLARE_bool(test_multi_ops_txns); DECLARE_int32(threads); DECLARE_int32(ttl); DECLARE_int32(value_size_mult); @@ -112,6 +109,7 @@ DECLARE_int32(open_files); DECLARE_int64(compressed_cache_size); DECLARE_int32(compaction_style); +DECLARE_int32(num_levels); DECLARE_int32(level0_file_num_compaction_trigger); DECLARE_int32(level0_slowdown_writes_trigger); DECLARE_int32(level0_stop_writes_trigger); @@ -128,36 +126,46 @@ DECLARE_int32(universal_max_merge_width); DECLARE_int32(universal_max_size_amplification_percent); DECLARE_int32(clear_column_family_one_in); -DECLARE_int32(get_live_files_and_wal_files_one_in); +DECLARE_int32(get_live_files_one_in); +DECLARE_int32(get_sorted_wal_files_one_in); +DECLARE_int32(get_current_wal_file_one_in); DECLARE_int32(set_options_one_in); DECLARE_int32(set_in_place_one_in); DECLARE_int64(cache_size); +DECLARE_int32(cache_numshardbits); 
DECLARE_bool(cache_index_and_filter_blocks); +DECLARE_int32(top_level_index_pinning); +DECLARE_int32(partition_pinning); +DECLARE_int32(unpartitioned_pinning); DECLARE_bool(use_clock_cache); DECLARE_uint64(subcompactions); DECLARE_uint64(periodic_compaction_seconds); DECLARE_uint64(compaction_ttl); DECLARE_bool(allow_concurrent_memtable_write); +DECLARE_double(experimental_mempurge_threshold); DECLARE_bool(enable_write_thread_adaptive_yield); DECLARE_int32(reopen); DECLARE_double(bloom_bits); DECLARE_bool(use_block_based_filter); +DECLARE_int32(ribbon_starting_level); DECLARE_bool(partition_filters); +DECLARE_bool(optimize_filters_for_memory); DECLARE_int32(index_type); DECLARE_string(db); DECLARE_string(secondaries_base); DECLARE_bool(test_secondary); -DECLARE_string(expected_values_path); +DECLARE_string(expected_values_dir); DECLARE_bool(verify_checksum); DECLARE_bool(mmap_read); DECLARE_bool(mmap_write); DECLARE_bool(use_direct_reads); DECLARE_bool(use_direct_io_for_flush_and_compaction); +DECLARE_bool(mock_direct_io); DECLARE_bool(statistics); DECLARE_bool(sync); DECLARE_bool(use_fsync); DECLARE_int32(kill_random_test); -DECLARE_string(kill_prefix_blacklist); +DECLARE_string(kill_exclude_prefixes); DECLARE_bool(disable_wal); DECLARE_uint64(recycle_log_file_num); DECLARE_int64(target_file_size_base); @@ -167,15 +175,19 @@ DECLARE_int32(range_deletion_width); DECLARE_uint64(rate_limiter_bytes_per_sec); DECLARE_bool(rate_limit_bg_reads); +DECLARE_uint64(sst_file_manager_bytes_per_sec); +DECLARE_uint64(sst_file_manager_bytes_per_truncate); DECLARE_bool(use_txn); DECLARE_uint64(txn_write_policy); DECLARE_bool(unordered_write); DECLARE_int32(backup_one_in); +DECLARE_uint64(backup_max_size); DECLARE_int32(checkpoint_one_in); DECLARE_int32(ingest_external_file_one_in); DECLARE_int32(ingest_external_file_width); DECLARE_int32(compact_files_one_in); DECLARE_int32(compact_range_one_in); +DECLARE_int32(mark_for_compaction_one_file_in); DECLARE_int32(flush_one_in); 
DECLARE_int32(pause_background_one_in); DECLARE_int32(compact_range_width); @@ -192,13 +204,17 @@ DECLARE_int32(nooverwritepercent); DECLARE_int32(iterpercent); DECLARE_uint64(num_iterations); +DECLARE_int32(customopspercent); DECLARE_string(compression_type); DECLARE_string(bottommost_compression_type); DECLARE_int32(compression_max_dict_bytes); DECLARE_int32(compression_zstd_max_train_bytes); +DECLARE_int32(compression_parallel_threads); +DECLARE_uint64(compression_max_dict_buffer_bytes); DECLARE_string(checksum_type); DECLARE_string(hdfs); DECLARE_string(env_uri); +DECLARE_string(fs_uri); DECLARE_uint64(ops_per_thread); DECLARE_uint64(log2_keys_per_lock); DECLARE_uint64(max_manifest_file_size); @@ -211,13 +227,17 @@ DECLARE_int32(sync_wal_one_in); DECLARE_bool(avoid_unnecessary_blocking_io); DECLARE_bool(write_dbid_to_manifest); +DECLARE_bool(avoid_flush_during_recovery); DECLARE_uint64(max_write_batch_group_size_bytes); DECLARE_bool(level_compaction_dynamic_level_bytes); DECLARE_int32(verify_checksum_one_in); DECLARE_int32(verify_db_one_in); DECLARE_int32(continuous_verification_interval); +DECLARE_int32(get_property_one_in); +DECLARE_string(file_checksum_impl); #ifndef ROCKSDB_LITE +// Options for StackableDB-based BlobDB DECLARE_bool(use_blob_db); DECLARE_uint64(blob_db_min_blob_size); DECLARE_uint64(blob_db_bytes_per_sync); @@ -225,14 +245,46 @@ DECLARE_bool(blob_db_enable_gc); DECLARE_double(blob_db_gc_cutoff); #endif // !ROCKSDB_LITE + +// Options for integrated BlobDB +DECLARE_bool(allow_setting_blob_options_dynamically); +DECLARE_bool(enable_blob_files); +DECLARE_uint64(min_blob_size); +DECLARE_uint64(blob_file_size); +DECLARE_string(blob_compression_type); +DECLARE_bool(enable_blob_garbage_collection); +DECLARE_double(blob_garbage_collection_age_cutoff); +DECLARE_double(blob_garbage_collection_force_threshold); +DECLARE_uint64(blob_compaction_readahead_size); + DECLARE_int32(approximate_size_one_in); +DECLARE_bool(sync_fault_injection); -const long KB = 
1024; -const int kRandomValueMaxFactor = 3; -const int kValueMaxLen = 100; +DECLARE_bool(best_efforts_recovery); +DECLARE_bool(skip_verifydb); +DECLARE_bool(enable_compaction_filter); +DECLARE_bool(paranoid_file_checks); +DECLARE_bool(fail_if_options_file_error); +DECLARE_uint64(batch_protection_bytes_per_key); + +DECLARE_uint64(user_timestamp_size); +DECLARE_string(secondary_cache_uri); +DECLARE_int32(secondary_cache_fault_one_in); + +DECLARE_int32(prepopulate_block_cache); + +constexpr long KB = 1024; +constexpr int kRandomValueMaxFactor = 3; +constexpr int kValueMaxLen = 100; // wrapped posix or hdfs environment -extern ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env; +extern ROCKSDB_NAMESPACE::Env* db_stress_env; +#ifndef NDEBUG +namespace ROCKSDB_NAMESPACE { +class FaultInjectionTestFS; +} // namespace ROCKSDB_NAMESPACE +extern std::shared_ptr fault_fs_guard; +#endif extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e; extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e; @@ -424,19 +476,10 @@ assert(size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t)); - // Pad with zeros to make it a multiple of 8. 
This function may be called - // with a prefix, in which case we return the first index that falls - // inside or outside that prefix, dependeing on whether the prefix is - // the start of upper bound of a scan - unsigned int pad = sizeof(uint64_t) - (size_key % sizeof(uint64_t)); - if (pad < sizeof(uint64_t)) { - big_endian_key.append(pad, '\0'); - size_key += pad; - } - std::string little_endian_key; little_endian_key.resize(size_key); - for (size_t start = 0; start < size_key; start += sizeof(uint64_t)) { + for (size_t start = 0; start + sizeof(uint64_t) <= size_key; + start += sizeof(uint64_t)) { size_t end = start + sizeof(uint64_t); for (size_t i = 0; i < sizeof(uint64_t); ++i) { little_endian_key[start + i] = big_endian_key[end - 1 - i]; @@ -455,17 +498,40 @@ uint64_t pfx = prefixes[i]; key += (pfx / key_gen_ctx.weights[i]) * key_gen_ctx.window + pfx % key_gen_ctx.weights[i]; + if (i < prefixes.size() - 1) { + // The encoding writes a `key_gen_ctx.weights[i] - 1` that counts for + // `key_gen_ctx.weights[i]` when there are more prefixes to come. So we + // need to add back the one here as we're at a non-last prefix. + ++key; + } } *key_p = key; return true; } +// Given a string prefix, map it to the first corresponding index in the +// expected values buffer. +inline bool GetFirstIntValInPrefix(std::string big_endian_prefix, + uint64_t* key_p) { + size_t size_key = big_endian_prefix.size(); + // Pad with zeros to make it a multiple of 8. 
This function may be called + // with a prefix, in which case we return the first index that falls + // inside or outside that prefix, dependeing on whether the prefix is + // the start of upper bound of a scan + unsigned int pad = sizeof(uint64_t) - (size_key % sizeof(uint64_t)); + if (pad < sizeof(uint64_t)) { + big_endian_prefix.append(pad, '\0'); + } + return GetIntVal(std::move(big_endian_prefix), key_p); +} + extern inline uint64_t GetPrefixKeyCount(const std::string& prefix, const std::string& ub) { uint64_t start = 0; uint64_t end = 0; - if (!GetIntVal(prefix, &start) || !GetIntVal(ub, &end)) { + if (!GetFirstIntValInPrefix(prefix, &start) || + !GetFirstIntValInPrefix(ub, &end)) { return 0; } @@ -501,11 +567,20 @@ uint64_t iteration); extern size_t GenerateValue(uint32_t rand, char* v, size_t max_sz); +extern uint32_t GetValueBase(Slice s); extern StressTest* CreateCfConsistencyStressTest(); extern StressTest* CreateBatchedOpsStressTest(); extern StressTest* CreateNonBatchedOpsStressTest(); +extern StressTest* CreateMultiOpsTxnsStressTest(); +extern void CheckAndSetOptionsForMultiOpsTxnStressTest(); extern void InitializeHotKeyGenerator(double alpha); extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key); + +extern std::string GenerateTimestampForRead(); +extern std::string NowNanosStr(); + +std::shared_ptr GetFileChecksumImpl( + const std::string& name); } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db_stress_tool/db_stress_common.h" +#include "db_stress_tool/db_stress_shared_state.h" +#include "rocksdb/compaction_filter.h" + +namespace ROCKSDB_NAMESPACE { + +// DbStressCompactionFilter is safe to use with db_stress as it does not perform +// any mutation. It only makes `kRemove` decisions for keys that are already +// non-existent according to the `SharedState`. +class DbStressCompactionFilter : public CompactionFilter { + public: + DbStressCompactionFilter(SharedState* state, int cf_id) + : state_(state), cf_id_(cf_id) {} + + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (state_ == nullptr) { + return Decision::kKeep; + } + if (key.empty() || ('0' <= key[0] && key[0] <= '9')) { + // It is likely leftover from a test_batches_snapshots run. Below this + // conditional, the test_batches_snapshots key format is not handled + // properly. Just keep it to be safe. + return Decision::kKeep; + } + uint64_t key_num = 0; + bool ok = GetIntVal(key.ToString(), &key_num); + assert(ok); + (void)ok; + port::Mutex* key_mutex = state_->GetMutexForKey(cf_id_, key_num); + if (!key_mutex->TryLock()) { + return Decision::kKeep; + } + // Reaching here means we acquired the lock. 
+ + bool key_exists = state_->Exists(cf_id_, key_num); + + key_mutex->Unlock(); + + if (!key_exists) { + return Decision::kRemove; + } + return Decision::kKeep; + } + + const char* Name() const override { return "DbStressCompactionFilter"; } + + private: + SharedState* const state_; + const int cf_id_; +}; + +class DbStressCompactionFilterFactory : public CompactionFilterFactory { + public: + DbStressCompactionFilterFactory() : state_(nullptr) {} + + void SetSharedState(SharedState* state) { + MutexLock state_mutex_guard(&state_mutex_); + state_ = state; + } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + MutexLock state_mutex_guard(&state_mutex_); + return std::unique_ptr( + new DbStressCompactionFilter(state_, context.column_family_id)); + } + + const char* Name() const override { + return "DbStressCompactionFilterFactory"; + } + + private: + port::Mutex state_mutex_; + SharedState* state_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,13 +10,14 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { void ThreadBody(void* v) { ThreadState* thread = reinterpret_cast(v); SharedState* shared = thread->shared; - if (shared->ShouldVerifyAtBeginning()) { + if (!FLAGS_skip_verifydb && shared->ShouldVerifyAtBeginning()) { thread->shared->GetStressTest()->VerifyDb(thread); } { @@ -42,7 +43,9 @@ } } - thread->shared->GetStressTest()->VerifyDb(thread); + if (!FLAGS_skip_verifydb) { + thread->shared->GetStressTest()->VerifyDb(thread); + } { MutexLock 
l(shared->GetMutex()); @@ -54,31 +57,42 @@ } bool RunStressTest(StressTest* stress) { + SystemClock* clock = db_stress_env->GetSystemClock().get(); stress->InitDb(); - SharedState shared(db_stress_env, stress); - if (FLAGS_read_only) { - stress->InitReadonlyDb(&shared); - } + stress->FinishInitDb(&shared); - uint32_t n = shared.GetNumThreads(); +#ifndef NDEBUG + if (FLAGS_sync_fault_injection) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } +#endif - uint64_t now = db_stress_env->NowMicros(); + uint32_t n = FLAGS_threads; + uint64_t now = clock->NowMicros(); fprintf(stdout, "%s Initializing worker threads\n", - db_stress_env->TimeToString(now / 1000000).c_str()); - std::vector threads(n); - for (uint32_t i = 0; i < n; i++) { - threads[i] = new ThreadState(i, &shared); - db_stress_env->StartThread(ThreadBody, threads[i]); - } + clock->TimeToString(now / 1000000).c_str()); + ThreadState bg_thread(0, &shared); - if (FLAGS_compaction_thread_pool_adjust_interval > 0) { - db_stress_env->StartThread(PoolSizeChangeThread, &bg_thread); - } ThreadState continuous_verification_thread(0, &shared); - if (FLAGS_continuous_verification_interval > 0) { - db_stress_env->StartThread(DbVerificationThread, - &continuous_verification_thread); + std::vector threads(n); + { + MutexLock l(shared.GetMutex()); + + for (uint32_t i = 0; i < n; i++) { + shared.IncThreads(); + threads[i] = new ThreadState(i, &shared); + db_stress_env->StartThread(ThreadBody, threads[i]); + } + if (FLAGS_compaction_thread_pool_adjust_interval > 0) { + shared.IncBgThreads(); + db_stress_env->StartThread(PoolSizeChangeThread, &bg_thread); + } + if (FLAGS_continuous_verification_interval > 0) { + shared.IncBgThreads(); + db_stress_env->StartThread(DbVerificationThread, + &continuous_verification_thread); + } } // Each thread goes through the following states: @@ -98,9 +112,9 @@ } } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); fprintf(stdout, "%s Starting database operations\n", - 
db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); shared.SetStart(); shared.GetCondVar()->SignalAll(); @@ -108,13 +122,16 @@ shared.GetCondVar()->Wait(); } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); if (FLAGS_test_batches_snapshots) { fprintf(stdout, "%s Limited verification already done during gets\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); + } else if (FLAGS_skip_verifydb) { + fprintf(stdout, "%s Verification skipped\n", + clock->TimeToString((uint64_t)now / 1000000).c_str()); } else { fprintf(stdout, "%s Starting verification\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } shared.SetStartVerify(); @@ -133,10 +150,11 @@ delete threads[i]; threads[i] = nullptr; } - now = db_stress_env->NowMicros(); - if (!FLAGS_test_batches_snapshots && !shared.HasVerificationFailedYet()) { + now = clock->NowMicros(); + if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots && + !shared.HasVerificationFailedYet()) { fprintf(stdout, "%s Verification successful\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); } stress->PrintStatistics(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,18 +15,24 @@ class DbStressEnvWrapper : public EnvWrapper { public: explicit DbStressEnvWrapper(Env* t) : EnvWrapper(t) {} + static const char* kClassName() { return "DbStressEnv"; } + const char* Name() const override { return kClassName(); } Status 
DeleteFile(const std::string& f) override { // We determine whether it is a manifest file by searching a strong, // so that there will be false positive if the directory path contains the // keyword but it is unlikely. - // Checkpoint directory needs to be exempted. + // Checkpoint, backup, and restore directories needs to be exempted. if (!if_preserve_all_manifests || f.find("MANIFEST-") == std::string::npos || - f.find("checkpoint") != std::string::npos) { + f.find("checkpoint") != std::string::npos || + f.find(".backup") != std::string::npos || + f.find(".restore") != std::string::npos) { return target()->DeleteFile(f); } - return Status::OK(); + // Rename the file instead of deletion to keep the history, and + // at the same time it is not visible to RocksDB. + return target()->RenameFile(f, f + "_renamed_"); } // If true, all manifest files will not be delted in DeleteFile(). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,7 +19,10 @@ return true; } -DEFINE_uint64(seed, 2341234, "Seed for PRNG"); +DEFINE_uint64(seed, 2341234, + "Seed for PRNG. When --nooverwritepercent is " + "nonzero and --expected_values_dir is nonempty, this value " + "must be fixed across invocations."); static const bool FLAGS_seed_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); @@ -87,6 +90,11 @@ "multiple column families are consistent. 
Setting this implies " "`atomic_flush=true` is set true if `disable_wal=false`.\n"); +DEFINE_bool(test_multi_ops_txns, false, + "If set, runs stress test dedicated to verifying multi-ops " + "transactions on a simple relational table with primary and " + "secondary index."); + DEFINE_int32(threads, 32, "Number of concurrent threads to run."); DEFINE_int32(ttl, -1, @@ -186,6 +194,9 @@ DEFINE_int32(compaction_style, ROCKSDB_NAMESPACE::Options().compaction_style, ""); +DEFINE_int32(num_levels, ROCKSDB_NAMESPACE::Options().num_levels, + "Number of levels in the DB"); + DEFINE_int32(level0_file_num_compaction_trigger, ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger, "Level0 compaction start trigger"); @@ -256,10 +267,21 @@ "it again. If N == 0, never drop/create column families. " "When test_batches_snapshots is true, this flag has no effect"); -DEFINE_int32(get_live_files_and_wal_files_one_in, 1000000, - "With a chance of 1/N, call GetLiveFiles, GetSortedWalFiles " - "and GetCurrentWalFile to verify if it returns correctly. If " - "N == 0, never call the three interfaces."); +DEFINE_int32(get_live_files_one_in, 1000000, + "With a chance of 1/N, call GetLiveFiles to verify if it returns " + "correctly. If N == 0, do not call the interface."); + +DEFINE_int32( + get_sorted_wal_files_one_in, 1000000, + "With a chance of 1/N, call GetSortedWalFiles to verify if it returns " + "correctly. (Note that this API may legitimately return an error.) If N == " + "0, do not call the interface."); + +DEFINE_int32( + get_current_wal_file_one_in, 1000000, + "With a chance of 1/N, call GetCurrentWalFile to verify if it returns " + "correctly. (Note that this API may legitimately return an error.) 
If N == " + "0, do not call the interface."); DEFINE_int32(set_options_one_in, 0, "With a chance of 1/N, change some random options"); @@ -270,9 +292,32 @@ DEFINE_int64(cache_size, 2LL * KB * KB * KB, "Number of bytes to use as a cache of uncompressed data."); +DEFINE_int32(cache_numshardbits, 6, + "Number of shards for the block cache" + " is 2 ** cache_numshardbits. Negative means use default settings." + " This is applied only if FLAGS_cache_size is non-negative."); + DEFINE_bool(cache_index_and_filter_blocks, false, "True if indexes/filters should be cached in block cache."); +DEFINE_int32( + top_level_index_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for top-level indexes into metadata partitions (see " + "`enum PinningTier` in table.h)"); + +DEFINE_int32( + partition_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for metadata partitions (see `enum PinningTier` in " + "table.h)"); + +DEFINE_int32( + unpartitioned_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for unpartitioned metadata blocks (see `enum PinningTier` " + "in table.h)"); + DEFINE_bool(use_clock_cache, false, "Replace default LRU block cache with clock cache."); @@ -289,37 +334,87 @@ DEFINE_bool(allow_concurrent_memtable_write, false, "Allow multi-writers to update mem tables in parallel."); +DEFINE_double(experimental_mempurge_threshold, 0.0, + "Maximum estimated useful payload that triggers a " + "mempurge process to collect memtable garbage bytes."); + DEFINE_bool(enable_write_thread_adaptive_yield, true, "Use a yielding spin loop for brief writer thread waits."); #ifndef ROCKSDB_LITE -// BlobDB Options -DEFINE_bool(use_blob_db, false, "Use BlobDB."); +// Options for StackableDB-based BlobDB +DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB."); -DEFINE_uint64(blob_db_min_blob_size, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, - "Smallest blob to 
store in a file. Blobs smaller than this " - "will be inlined with the key in the LSM tree."); - -DEFINE_uint64(blob_db_bytes_per_sync, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, - "Sync blob files once per every N bytes written."); +DEFINE_uint64( + blob_db_min_blob_size, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, + "[Stacked BlobDB] Smallest blob to store in a file. Blobs " + "smaller than this will be inlined with the key in the LSM tree."); + +DEFINE_uint64( + blob_db_bytes_per_sync, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, + "[Stacked BlobDB] Sync blob files once per every N bytes written."); DEFINE_uint64(blob_db_file_size, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size, - "Target size of each blob file."); + "[Stacked BlobDB] Target size of each blob file."); DEFINE_bool( blob_db_enable_gc, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection, - "Enable BlobDB garbage collection."); + "[Stacked BlobDB] Enable BlobDB garbage collection."); DEFINE_double( blob_db_gc_cutoff, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, - "Cutoff ratio for BlobDB garbage collection."); + "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); #endif // !ROCKSDB_LITE +// Options for integrated BlobDB +DEFINE_bool(allow_setting_blob_options_dynamically, false, + "[Integrated BlobDB] Allow setting blob options dynamically."); + +DEFINE_bool( + enable_blob_files, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files, + "[Integrated BlobDB] Enable writing large values to separate blob files."); + +DEFINE_uint64(min_blob_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size, + "[Integrated BlobDB] The size of the smallest value to be stored " + "separately in a blob file."); + +DEFINE_uint64(blob_file_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size, + "[Integrated BlobDB] The size limit for blob files."); + 
+DEFINE_string(blob_compression_type, "none", + "[Integrated BlobDB] The compression algorithm to use for large " + "values stored in blob files."); + +DEFINE_bool(enable_blob_garbage_collection, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .enable_blob_garbage_collection, + "[Integrated BlobDB] Enable blob garbage collection."); + +DEFINE_double(blob_garbage_collection_age_cutoff, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_age_cutoff, + "[Integrated BlobDB] The cutoff in terms of blob file age for " + "garbage collection."); + +DEFINE_double(blob_garbage_collection_force_threshold, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_force_threshold, + "[Integrated BlobDB] The threshold for the ratio of garbage in " + "the oldest blob files for forcing garbage collection."); + +DEFINE_uint64(blob_compaction_readahead_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_compaction_readahead_size, + "[Integrated BlobDB] Compaction readahead for blob files."); + static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); @@ -343,10 +438,21 @@ "use block based filter" "instead of full filter for block based table"); +DEFINE_int32( + ribbon_starting_level, 999, + "Use Bloom filter on levels below specified and Ribbon beginning on level " + "specified. Flush is considered level -1. 999 or more -> always Bloom. 0 " + "-> Ribbon except Bloom for flush. 
-1 -> always Ribbon."); + DEFINE_bool(partition_filters, false, "use partitioned filters " "for block-based table"); +DEFINE_bool( + optimize_filters_for_memory, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory, + "Minimize memory footprint of filters"); + DEFINE_int32( index_type, static_cast( @@ -361,12 +467,14 @@ DEFINE_bool(test_secondary, false, "Test secondary instance."); DEFINE_string( - expected_values_path, "", - "File where the array of expected uint32_t values will be stored. If " - "provided and non-empty, the DB state will be verified against these " - "values after recovery. --max_key and --column_family must be kept the " - "same across invocations of this program that use the same " - "--expected_values_path."); + expected_values_dir, "", + "Dir where files containing info about the latest/historical values will " + "be stored. If provided and non-empty, the DB state will be verified " + "against values from these files after recovery. --max_key and " + "--column_family must be kept the same across invocations of this program " + "that use the same --expected_values_dir. Currently historical values are " + "only tracked when --sync_fault_injection is set. 
See --seed and " + "--nooverwritepercent for further requirements."); DEFINE_bool(verify_checksum, false, "Verify checksum for every block read from storage"); @@ -384,6 +492,9 @@ ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction, "Use O_DIRECT for writing data"); +DEFINE_bool(mock_direct_io, false, + "Mock direct IO by not using O_DIRECT for direct IO read"); + DEFINE_bool(statistics, false, "Create database statistics"); DEFINE_bool(sync, false, "Sync all writes to disk"); @@ -395,12 +506,11 @@ "probability 1/this"); static const bool FLAGS_kill_random_test_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive); -extern int rocksdb_kill_odds; -DEFINE_string(kill_prefix_blacklist, "", +DEFINE_string(kill_exclude_prefixes, "", "If non-empty, kill points with prefix in the list given will be" " skipped. Items are comma-separated."); -extern std::vector rocksdb_kill_prefix_blacklist; +extern std::vector rocksdb_kill_exclude_prefixes; DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); @@ -430,6 +540,14 @@ DEFINE_bool(rate_limit_bg_reads, false, "Use options.rate_limiter on compaction reads"); +DEFINE_uint64(sst_file_manager_bytes_per_sec, 0, + "Set `Options::sst_file_manager` to delete at this rate. By " + "default the deletion rate is unbounded."); + +DEFINE_uint64(sst_file_manager_bytes_per_truncate, 0, + "Set `Options::sst_file_manager` to delete in chunks of this " + "many bytes. By default whole files will be deleted."); + DEFINE_bool(use_txn, false, "Use TransactionDB. Currently the default write policy is " "TxnDBWritePolicy::WRITE_PREPARED"); @@ -449,6 +567,10 @@ "every N operations on average. 
0 indicates CreateNewBackup() " "is disabled."); +DEFINE_uint64(backup_max_size, 100 * 1024 * 1024, + "If non-zero, skip checking backup/restore when DB size in " + "bytes exceeds this setting."); + DEFINE_int32(checkpoint_one_in, 0, "If non-zero, then CreateCheckpoint() will be called once for " "every N operations on average. 0 indicates CreateCheckpoint() " @@ -470,6 +592,12 @@ "If non-zero, then CompactRange() will be called once for every N " "operations on average. 0 indicates CompactRange() is disabled."); +DEFINE_int32(mark_for_compaction_one_file_in, 0, + "A `TablePropertiesCollectorFactory` will be registered, which " + "creates a `TablePropertiesCollector` with `NeedCompact()` " + "returning true once for every N files on average. 0 or negative " + "mean `NeedCompact()` always returns false."); + DEFINE_int32(flush_one_in, 0, "If non-zero, then Flush() will be called once for every N ops " "on average. 0 indicates calls to Flush() are disabled."); @@ -537,7 +665,8 @@ DEFINE_int32(nooverwritepercent, 60, "Ratio of keys without overwrite to total workload (expressed as " - " a percentage)"); + "a percentage). 
When --expected_values_dir is nonempty, must " + "keep this value constant across invocations."); static const bool FLAGS_nooverwritepercent_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_nooverwritepercent, &ValidateInt32Percent); @@ -551,6 +680,10 @@ static const bool FLAGS_num_iterations_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); +DEFINE_int32( + customopspercent, 0, + "Ratio of custom operations to total workload (expressed as a percentage)"); + DEFINE_string(compression_type, "snappy", "Algorithm to use to compress the database"); @@ -562,16 +695,31 @@ "Maximum size of training data passed to zstd's dictionary " "trainer."); +DEFINE_int32(compression_parallel_threads, 1, + "Number of threads for parallel compression."); + +DEFINE_uint64(compression_max_dict_buffer_bytes, 0, + "Buffering limit for SST file data to sample for dictionary " + "compression."); + DEFINE_string(bottommost_compression_type, "disable", "Algorithm to use to compress bottommost level of the database. " "\"disable\" means disabling the feature"); DEFINE_string(checksum_type, "kCRC32c", "Algorithm to use to checksum blocks"); -DEFINE_string(hdfs, "", "Name of hdfs environment"); +DEFINE_string(hdfs, "", + "Name of hdfs environment. Mutually exclusive with" + " --env_uri and --fs_uri."); + +DEFINE_string( + env_uri, "", + "URI for env lookup. Mutually exclusive with --hdfs and --fs_uri"); -DEFINE_string(env_uri, "", - "URI for env lookup. Mutually exclusive with --hdfs"); +DEFINE_string(fs_uri, "", + "URI for registry Filesystem lookup. Mutually exclusive" + " with --hdfs and --env_uri." 
+ " Creates a default environment with the specified filesystem."); DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); static const bool FLAGS_ops_per_thread_dummy __attribute__((__unused__)) = @@ -627,6 +775,10 @@ ROCKSDB_NAMESPACE::Options().write_dbid_to_manifest, "Write DB_ID to manifest"); +DEFINE_bool(avoid_flush_during_recovery, + ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery, + "Avoid flush during recovery"); + DEFINE_uint64(max_write_batch_group_size_bytes, ROCKSDB_NAMESPACE::Options().max_write_batch_group_size_bytes, "Max write batch group size"); @@ -652,4 +804,79 @@ DEFINE_int32(approximate_size_one_in, 64, "If non-zero, DB::GetApproximateSizes() will be called against" " random key ranges."); + +DEFINE_int32(read_fault_one_in, 1000, + "On non-zero, enables fault injection on read"); + +DEFINE_int32(get_property_one_in, 1000, + "If non-zero, then DB::GetProperty() will be called to get various" + " properties for every N ops on average. 0 indicates that" + " GetProperty() will be not be called."); + +DEFINE_bool(sync_fault_injection, false, + "If true, FaultInjectionTestFS will be used for write operations, " + "and unsynced data in DB will lost after crash. 
In such a case we " + "track DB changes in a trace file (\"*.trace\") in " + "--expected_values_dir for verifying there are no holes in the " + "recovered data."); + +DEFINE_bool(best_efforts_recovery, false, + "If true, use best efforts recovery."); +DEFINE_bool(skip_verifydb, false, "If true, skip VerifyDb() calls."); + +DEFINE_bool(enable_compaction_filter, false, + "If true, configures a compaction filter that returns a kRemove " + "decision for deleted keys."); + +DEFINE_bool(paranoid_file_checks, true, + "After writing every SST file, reopen it and read all the keys " + "and validate checksums"); + +DEFINE_bool(fail_if_options_file_error, false, + "Fail operations that fail to detect or properly persist options " + "file."); + +DEFINE_uint64(batch_protection_bytes_per_key, 0, + "If nonzero, enables integrity protection in `WriteBatch` at the " + "specified number of bytes per key. Currently the only supported " + "nonzero value is eight."); + +DEFINE_string(file_checksum_impl, "none", + "Name of an implementation for file_checksum_gen_factory, or " + "\"none\" for null."); + +DEFINE_int32(write_fault_one_in, 0, + "On non-zero, enables fault injection on write"); + +DEFINE_uint64(user_timestamp_size, 0, + "Number of bytes for a user-defined timestamp. 
Currently, only " + "8-byte is supported"); + +DEFINE_int32(open_metadata_write_fault_one_in, 0, + "On non-zero, enables fault injection on file metadata write " + "during DB reopen."); + +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a customized secondary cache object"); +DEFINE_int32(secondary_cache_fault_one_in, 0, + "On non-zero, enables fault injection in secondary cache inserts" + " and lookups"); +#endif // ROCKSDB_LITE +DEFINE_int32(open_write_fault_one_in, 0, + "On non-zero, enables fault injection on file writes " + "during DB reopen."); +DEFINE_int32(open_read_fault_one_in, 0, + "On non-zero, enables fault injection on file reads " + "during DB reopen."); +DEFINE_int32(injest_error_severity, 1, + "The severity of the injested IO Error. 1 is soft error (e.g. " + "retryable error), 2 is fatal error, and the default is " + "retryable error."); +DEFINE_int32(prepopulate_block_cache, + static_cast(ROCKSDB_NAMESPACE::BlockBasedTableOptions:: + PrepopulateBlockCache::kDisable), + "Options related to cache warming (see `enum " + "PrepopulateBlockCache` in table.h)"); + #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,148 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db_stress_tool/db_stress_listener.h" + +#include + +#include "rocksdb/file_system.h" +#include "util/coding_lean.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef GFLAGS +#ifndef ROCKSDB_LITE + +// TODO: consider using expected_values_dir instead, but this is more +// convenient for now. +UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name) + : path_(db_name + "/.unique_ids") { + // We expect such a small number of files generated during this test + // (thousands?), checking full 192-bit IDs for uniqueness is a very + // weak check. For a stronger check, we pick a specific 64-bit + // subsequence from the ID to check for uniqueness. All bits of the + // ID should be high quality, and 64 bits should be unique with + // very good probability for the quantities in this test. + offset_ = Random::GetTLSInstance()->Uniform(17); // 0 to 16 + + // Use default FileSystem to avoid fault injection, etc. + FileSystem& fs = *FileSystem::Default(); + IOOptions opts; + + Status st = fs.CreateDirIfMissing(db_name, opts, nullptr); + if (!st.ok()) { + fprintf(stderr, "Failed to create directory %s: %s\n", db_name.c_str(), + st.ToString().c_str()); + exit(1); + } + + { + std::unique_ptr reader; + Status s = + fs.NewSequentialFile(path_, FileOptions(), &reader, /*dbg*/ nullptr); + if (s.ok()) { + // Load from file + std::string id(24U, '\0'); + Slice result; + for (;;) { + s = reader->Read(id.size(), opts, &result, &id[0], /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error reading unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + if (result.size() < id.size()) { + // EOF + if (result.size() != 0) { + // Corrupt file. Not a DB bug but could happen if OS doesn't provide + // good guarantees on process crash. 
+ fprintf(stdout, "Warning: clearing corrupt unique id file\n"); + id_set_.clear(); + reader.reset(); + s = fs.DeleteFile(path_, opts, /*dbg*/ nullptr); + assert(s.ok()); + } + break; + } + VerifyNoWrite(id); + } + } else { + // Newly created is ok. + // But FileSystem doesn't tell us whether non-existence was the cause of + // the failure. (Issue #9021) + Status s2 = fs.FileExists(path_, opts, /*dbg*/ nullptr); + if (!s2.IsNotFound()) { + fprintf(stderr, "Error opening unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + } + } + fprintf(stdout, "(Re-)verified %zu unique IDs\n", id_set_.size()); + Status s = fs.ReopenWritableFile(path_, FileOptions(), &data_file_writer_, + /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error opening unique id file for append: %s\n", + s.ToString().c_str()); + assert(false); + } +} + +UniqueIdVerifier::~UniqueIdVerifier() { + data_file_writer_->Close(IOOptions(), /*dbg*/ nullptr); +} + +void UniqueIdVerifier::VerifyNoWrite(const std::string& id) { + assert(id.size() == 24); + bool is_new = id_set_.insert(DecodeFixed64(&id[offset_])).second; + if (!is_new) { + fprintf(stderr, + "Duplicate partial unique ID found (offset=%zu, count=%zu)\n", + offset_, id_set_.size()); + assert(false); + } +} + +void UniqueIdVerifier::Verify(const std::string& id) { + assert(id.size() == 24); + std::lock_guard lock(mutex_); + // If we accumulate more than ~4 million IDs, there would be > 1 in 1M + // natural chance of collision. Thus, simply stop checking at that point. 
+ if (id_set_.size() >= 4294967) { + return; + } + IOStatus s = + data_file_writer_->Append(Slice(id), IOOptions(), /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error writing to unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + s = data_file_writer_->Flush(IOOptions(), /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error flushing unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + VerifyNoWrite(id); +} + +void DbStressListener::VerifyTableFileUniqueId( + const TableProperties& new_file_properties, const std::string& file_path) { + // Verify unique ID + std::string id; + Status s = GetUniqueIdFromTableProperties(new_file_properties, &id); + if (!s.ok()) { + fprintf(stderr, "Error getting SST unique id for %s: %s\n", + file_path.c_str(), s.ToString().c_str()); + assert(false); + } + unique_ids_.Verify(id); +} + +#endif // !ROCKSDB_LITE +#endif // GFLAGS + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,12 +6,45 @@ #ifdef GFLAGS #pragma once +#include +#include + +#include "file/filename.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/listener.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/unique_id.h" #include "util/gflags_compat.h" +#include "util/random.h" DECLARE_int32(compact_files_one_in); namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +// Verify across process executions that all seen IDs are unique +class UniqueIdVerifier { + public: + explicit UniqueIdVerifier(const std::string& db_name); + ~UniqueIdVerifier(); + + void Verify(const std::string& id); + + private: + void 
VerifyNoWrite(const std::string& id); + + private: + std::mutex mutex_; + // IDs persisted to a hidden file inside DB dir + std::string path_; + std::unique_ptr data_file_writer_; + // Starting byte for which 8 bytes to check in memory within 24 byte ID + size_t offset_; + // Working copy of the set of 8 byte pieces + std::unordered_set id_set_; +}; + class DbStressListener : public EventListener { public: DbStressListener(const std::string& db_name, @@ -20,8 +53,12 @@ : db_name_(db_name), db_paths_(db_paths), column_families_(column_families), - num_pending_file_creations_(0) {} -#ifndef ROCKSDB_LITE + num_pending_file_creations_(0), + unique_ids_(db_name) {} + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "DBStressListener"; } + ~DbStressListener() override { assert(num_pending_file_creations_ == 0); } void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { assert(IsValidColumnFamilyName(info.cf_name)); @@ -64,15 +101,15 @@ void OnTableFileCreated(const TableFileCreationInfo& info) override { assert(info.db_name == db_name_); assert(IsValidColumnFamilyName(info.cf_name)); - if (info.file_size) { - VerifyFilePath(info.file_path); - } assert(info.job_id > 0 || FLAGS_compact_files_one_in > 0); - if (info.status.ok() && info.file_size > 0) { + if (info.status.ok()) { + assert(info.file_size > 0); + VerifyFilePath(info.file_path); assert(info.table_properties.data_size > 0 || info.table_properties.num_range_deletions > 0); assert(info.table_properties.raw_key_size > 0); assert(info.table_properties.num_entries > 0); + VerifyTableFileUniqueId(info.table_properties, info.file_path); } --num_pending_file_creations_; } @@ -86,9 +123,12 @@ RandomSleep(); } - void OnExternalFileIngested( - DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) override { + void OnExternalFileIngested(DB* /*db*/, + const ExternalFileIngestionInfo& info) override { RandomSleep(); + // Here we assume that each generated 
external file is ingested + // exactly once (or thrown away in case of crash) + VerifyTableFileUniqueId(info.table_properties, info.internal_file_path); } void OnBackgroundError(BackgroundErrorReason /* reason */, @@ -206,17 +246,23 @@ #endif // !NDEBUG } + // Unique id is verified using the TableProperties. file_path is only used + // for reporting. + void VerifyTableFileUniqueId(const TableProperties& new_file_properties, + const std::string& file_path); + void RandomSleep() { std::this_thread::sleep_for( std::chrono::microseconds(Random::GetTLSInstance()->Uniform(5000))); } -#endif // !ROCKSDB_LITE private: std::string db_name_; std::vector db_paths_; std::vector column_families_; std::atomic num_pending_file_creations_; + UniqueIdVerifier unique_ids_; }; +#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,5 +14,14 @@ namespace ROCKSDB_NAMESPACE { const uint32_t SharedState::UNKNOWN_SENTINEL = 0xfffffffe; const uint32_t SharedState::DELETION_SENTINEL = 0xffffffff; +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(OS_SOLARIS) +__thread bool SharedState::ignore_read_error; +#else +thread_local bool SharedState::ignore_read_error; +#endif // OS_SOLARIS +#else +bool SharedState::ignore_read_error; +#endif // ROCKSDB_SUPPORT_THREAD_LOCAL } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,11 @@ #pragma once #include "db_stress_tool/db_stress_stat.h" +#include "db_stress_tool/expected_state.h" +// SyncPoint is not supported in Released Windows Mode. +#if !(defined NDEBUG) || !defined(OS_WIN) +#include "test_util/sync_point.h" +#endif // !(defined NDEBUG) || !defined(OS_WIN) #include "util/gflags_compat.h" DECLARE_uint64(seed); @@ -19,11 +24,18 @@ DECLARE_int32(threads); DECLARE_int32(column_families); DECLARE_int32(nooverwritepercent); -DECLARE_string(expected_values_path); +DECLARE_string(expected_values_dir); DECLARE_int32(clear_column_family_one_in); DECLARE_bool(test_batches_snapshots); DECLARE_int32(compaction_thread_pool_adjust_interval); DECLARE_int32(continuous_verification_interval); +DECLARE_int32(read_fault_one_in); +DECLARE_int32(write_fault_one_in); +DECLARE_int32(open_metadata_write_fault_one_in); +DECLARE_int32(open_write_fault_one_in); +DECLARE_int32(open_read_fault_one_in); + +DECLARE_int32(injest_error_severity); namespace ROCKSDB_NAMESPACE { class StressTest; @@ -37,12 +49,26 @@ // indicates a key should definitely be deleted static const uint32_t DELETION_SENTINEL; - SharedState(Env* env, StressTest* stress_test) + // Errors when reading filter blocks are ignored, so we use a thread + // local variable updated via sync points to keep track of errors injected + // while reading filter blocks in order to ignore the Get/MultiGet result + // for those calls +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(OS_SOLARIS) + static __thread bool ignore_read_error; +#else + static thread_local bool ignore_read_error; +#endif // OS_SOLARIS +#else + static bool ignore_read_error; +#endif // ROCKSDB_SUPPORT_THREAD_LOCAL + + SharedState(Env* /*env*/, StressTest* stress_test) : cv_(&mu_), 
seed_(static_cast(FLAGS_seed)), max_key_(FLAGS_max_key), log2_keys_per_lock_(static_cast(FLAGS_log2_keys_per_lock)), - num_threads_(FLAGS_threads), + num_threads_(0), num_initialized_(0), num_populated_(0), vote_reopen_(0), @@ -56,7 +82,7 @@ verification_failure_(false), should_stop_test_(false), no_overwrite_ids_(FLAGS_column_families), - values_(nullptr), + expected_state_manager_(nullptr), printing_verification_results_(false) { // Pick random keys in each column family that will not experience // overwrite @@ -85,64 +111,38 @@ } delete[] permutation; - size_t expected_values_size = - sizeof(std::atomic) * FLAGS_column_families * max_key_; - bool values_init_needed = false; Status status; - if (!FLAGS_expected_values_path.empty()) { + // TODO: We should introduce a way to explicitly disable verification + // during shutdown. When that is disabled and FLAGS_expected_values_dir + // is empty (disabling verification at startup), we can skip tracking + // expected state. Only then should we permit bypassing the below feature + // compatibility checks. 
+ if (!FLAGS_expected_values_dir.empty()) { if (!std::atomic{}.is_lock_free()) { status = Status::InvalidArgument( - "Cannot use --expected_values_path on platforms without lock-free " + "Cannot use --expected_values_dir on platforms without lock-free " "std::atomic"); } if (status.ok() && FLAGS_clear_column_family_one_in > 0) { status = Status::InvalidArgument( - "Cannot use --expected_values_path on when " + "Cannot use --expected_values_dir on when " "--clear_column_family_one_in is greater than zero."); } - uint64_t size = 0; - if (status.ok()) { - status = env->GetFileSize(FLAGS_expected_values_path, &size); - } - std::unique_ptr wfile; - if (status.ok() && size == 0) { - const EnvOptions soptions; - status = - env->NewWritableFile(FLAGS_expected_values_path, &wfile, soptions); - } - if (status.ok() && size == 0) { - std::string buf(expected_values_size, '\0'); - status = wfile->Append(buf); - values_init_needed = true; - } - if (status.ok()) { - status = env->NewMemoryMappedFileBuffer(FLAGS_expected_values_path, - &expected_mmap_buffer_); - } - if (status.ok()) { - assert(expected_mmap_buffer_->GetLen() == expected_values_size); - values_ = static_cast*>( - expected_mmap_buffer_->GetBase()); - assert(values_ != nullptr); + } + if (status.ok()) { + if (FLAGS_expected_values_dir.empty()) { + expected_state_manager_.reset( + new AnonExpectedStateManager(FLAGS_max_key, FLAGS_column_families)); } else { - fprintf(stderr, "Failed opening shared file '%s' with error: %s\n", - FLAGS_expected_values_path.c_str(), status.ToString().c_str()); - assert(values_ == nullptr); + expected_state_manager_.reset(new FileExpectedStateManager( + FLAGS_max_key, FLAGS_column_families, FLAGS_expected_values_dir)); } + status = expected_state_manager_->Open(); } - if (values_ == nullptr) { - values_allocation_.reset( - new std::atomic[FLAGS_column_families * max_key_]); - values_ = &values_allocation_[0]; - values_init_needed = true; - } - assert(values_ != nullptr); - if 
(values_init_needed) { - for (int i = 0; i < FLAGS_column_families; ++i) { - for (int j = 0; j < max_key_; ++j) { - Delete(i, j, false /* pending */); - } - } + if (!status.ok()) { + fprintf(stderr, "Failed setting up expected state with error: %s\n", + status.ToString().c_str()); + exit(1); } if (FLAGS_test_batches_snapshots) { @@ -163,18 +163,24 @@ ptr.reset(new port::Mutex); } } - if (FLAGS_compaction_thread_pool_adjust_interval > 0) { - ++num_bg_threads_; - fprintf(stdout, "Starting compaction_thread_pool_adjust_thread\n"); - } - if (FLAGS_continuous_verification_interval > 0) { - ++num_bg_threads_; - fprintf(stdout, "Starting continuous_verification_thread\n"); +#ifndef NDEBUG + if (FLAGS_read_fault_one_in) { + SyncPoint::GetInstance()->SetCallBack("FaultInjectionIgnoreError", + IgnoreReadErrorCallback); + SyncPoint::GetInstance()->EnableProcessing(); + } +#endif // NDEBUG + } + + ~SharedState() { +#ifndef NDEBUG + if (FLAGS_read_fault_one_in) { + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); } +#endif } - ~SharedState() {} - port::Mutex* GetMutex() { return &mu_; } port::CondVar* GetCondVar() { return &cv_; } @@ -185,6 +191,8 @@ uint32_t GetNumThreads() const { return num_threads_; } + void IncThreads() { num_threads_++; } + void IncInitialized() { num_initialized_++; } void IncOperated() { num_populated_++; } @@ -217,89 +225,84 @@ bool ShouldStopTest() const { return should_stop_test_.load(); } + // Returns a lock covering `key` in `cf`. port::Mutex* GetMutexForKey(int cf, int64_t key) { return key_locks_[cf][key >> log2_keys_per_lock_].get(); } + // Acquires locks for all keys in `cf`. void LockColumnFamily(int cf) { for (auto& mutex : key_locks_[cf]) { mutex->Lock(); } } + // Releases locks for all keys in `cf`. 
void UnlockColumnFamily(int cf) { for (auto& mutex : key_locks_[cf]) { mutex->Unlock(); } } - std::atomic& Value(int cf, int64_t key) const { - return values_[cf * max_key_ + key]; + Status SaveAtAndAfter(DB* db) { + return expected_state_manager_->SaveAtAndAfter(db); } + bool HasHistory() { return expected_state_manager_->HasHistory(); } + + Status Restore(DB* db) { return expected_state_manager_->Restore(db); } + + // Requires external locking covering all keys in `cf`. void ClearColumnFamily(int cf) { - std::fill(&Value(cf, 0 /* key */), &Value(cf + 1, 0 /* key */), - DELETION_SENTINEL); + return expected_state_manager_->ClearColumnFamily(cf); } // @param pending True if the update may have started but is not yet // guaranteed finished. This is useful for crash-recovery testing when the // process may crash before updating the expected values array. + // + // Requires external locking covering `key` in `cf`. void Put(int cf, int64_t key, uint32_t value_base, bool pending) { - if (!pending) { - // prevent expected-value update from reordering before Write - std::atomic_thread_fence(std::memory_order_release); - } - Value(cf, key).store(pending ? UNKNOWN_SENTINEL : value_base, - std::memory_order_relaxed); - if (pending) { - // prevent Write from reordering before expected-value update - std::atomic_thread_fence(std::memory_order_release); - } + return expected_state_manager_->Put(cf, key, value_base, pending); } - uint32_t Get(int cf, int64_t key) const { return Value(cf, key); } + // Requires external locking covering `key` in `cf`. + uint32_t Get(int cf, int64_t key) const { + return expected_state_manager_->Get(cf, key); + } // @param pending See comment above Put() // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. 
bool Delete(int cf, int64_t key, bool pending) { - if (Value(cf, key) == DELETION_SENTINEL) { - return false; - } - Put(cf, key, DELETION_SENTINEL, pending); - return true; + return expected_state_manager_->Delete(cf, key, pending); } // @param pending See comment above Put() // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. bool SingleDelete(int cf, int64_t key, bool pending) { - return Delete(cf, key, pending); + return expected_state_manager_->Delete(cf, key, pending); } // @param pending See comment above Put() // Returns number of keys deleted by the call. + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) { - int covered = 0; - for (int64_t key = begin_key; key < end_key; ++key) { - if (Delete(cf, key, pending)) { - ++covered; - } - } - return covered; + return expected_state_manager_->DeleteRange(cf, begin_key, end_key, + pending); } bool AllowsOverwrite(int64_t key) { return no_overwrite_ids_.find(key) == no_overwrite_ids_.end(); } + // Requires external locking covering `key` in `cf`. bool Exists(int cf, int64_t key) { - // UNKNOWN_SENTINEL counts as exists. That assures a key for which overwrite - // is disallowed can't be accidentally added a second time, in which case - // SingleDelete wouldn't be able to properly delete the key. It does allow - // the case where a SingleDelete might be added which covers nothing, but - // that's not a correctness issue. 
- uint32_t expected_value = Value(cf, key).load(); - return expected_value != DELETION_SENTINEL; + return expected_state_manager_->Exists(cf, key); } uint32_t GetSeed() const { return seed_; } @@ -308,6 +311,8 @@ bool ShouldStopBgThread() { return should_stop_bg_thread_; } + void IncBgThreads() { ++num_bg_threads_; } + void IncBgThreadsFinished() { ++bg_thread_finished_; } bool BgThreadsFinished() const { @@ -315,7 +320,7 @@ } bool ShouldVerifyAtBeginning() const { - return expected_mmap_buffer_.get() != nullptr; + return !FLAGS_expected_values_dir.empty(); } bool PrintingVerificationResults() { @@ -329,12 +334,16 @@ } private: + static void IgnoreReadErrorCallback(void*) { + ignore_read_error = true; + } + port::Mutex mu_; port::CondVar cv_; const uint32_t seed_; const int64_t max_key_; const uint32_t log2_keys_per_lock_; - const int num_threads_; + int num_threads_; long num_initialized_; long num_populated_; long vote_reopen_; @@ -351,12 +360,10 @@ // Keys that should not be overwritten std::unordered_set no_overwrite_ids_; - std::atomic* values_; - std::unique_ptr[]> values_allocation_; + std::unique_ptr expected_state_manager_; // Has to make it owned by a smart ptr as port::Mutex is not copyable // and storing it in the container may require copying depending on the impl. 
std::vector>> key_locks_; - std::unique_ptr expected_mmap_buffer_; std::atomic printing_verification_results_; }; @@ -380,6 +387,8 @@ std::string value; // optional state of all keys in the db std::vector* key_vec; + + std::string timestamp; }; std::queue> snapshot_queue; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#include "db_stress_tool/db_stress_stat.h" + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr dbstats; +std::shared_ptr dbstats_secondaries; + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,9 +11,9 @@ #include "monitoring/histogram.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/snapshot.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "util/gflags_compat.h" #include "util/random.h" @@ -21,9 +21,10 @@ DECLARE_bool(progress_reports); namespace ROCKSDB_NAMESPACE { + // Database statistics -static std::shared_ptr dbstats; -static std::shared_ptr dbstats_secondaries; +extern std::shared_ptr 
dbstats; +extern std::shared_ptr dbstats_secondaries; class Stats { private: @@ -42,6 +43,7 @@ long range_deletions_; long covered_by_range_deletions_; long errors_; + long verified_errors_; long num_compact_files_succeed_; long num_compact_files_failed_; int next_report_; @@ -67,11 +69,12 @@ range_deletions_ = 0; covered_by_range_deletions_ = 0; errors_ = 0; + verified_errors_ = 0; bytes_ = 0; seconds_ = 0; num_compact_files_succeed_ = 0; num_compact_files_failed_ = 0; - start_ = Env::Default()->NowMicros(); + start_ = SystemClock::Default()->NowMicros(); last_op_finish_ = start_; finish_ = start_; } @@ -90,6 +93,7 @@ range_deletions_ += other.range_deletions_; covered_by_range_deletions_ = other.covered_by_range_deletions_; errors_ += other.errors_; + verified_errors_ += other.verified_errors_; bytes_ += other.bytes_; seconds_ += other.seconds_; num_compact_files_succeed_ += other.num_compact_files_succeed_; @@ -99,13 +103,13 @@ } void Stop() { - finish_ = Env::Default()->NowMicros(); + finish_ = SystemClock::Default()->NowMicros(); seconds_ = (finish_ - start_) * 1e-6; } void FinishedSingleOp() { if (FLAGS_histogram) { - auto now = Env::Default()->NowMicros(); + auto now = SystemClock::Default()->NowMicros(); auto micros = now - last_op_finish_; hist_.Add(micros); if (micros > 20000) { @@ -163,6 +167,8 @@ void AddErrors(long n) { errors_ += n; } + void AddVerifiedErrors(long n) { verified_errors_ += n; } + void AddNumCompactFilesSucceed(long n) { num_compact_files_succeed_ += n; } void AddNumCompactFilesFailed(long n) { num_compact_files_failed_ += n; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,65 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table.h" +#include "util/gflags_compat.h" +#include "util/random.h" + +DECLARE_int32(mark_for_compaction_one_file_in); + +namespace ROCKSDB_NAMESPACE { + +// A `DbStressTablePropertiesCollector` ignores what keys/values were added to +// the table, adds no properties to the table, and decides at random whether the +// table will be marked for compaction according to +// `FLAGS_mark_for_compaction_one_file_in`. +class DbStressTablePropertiesCollector : public TablePropertiesCollector { + public: + DbStressTablePropertiesCollector() + : need_compact_(Random::GetTLSInstance()->OneInOpt( + FLAGS_mark_for_compaction_one_file_in)) {} + + virtual Status AddUserKey(const Slice& /* key */, const Slice& /* value */, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + virtual Status Finish(UserCollectedProperties* /* properties */) override { + return Status::OK(); + } + + virtual UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + virtual const char* Name() const override { + return "DbStressTablePropertiesCollector"; + } + + virtual bool NeedCompact() const override { return need_compact_; } + + private: + const bool need_compact_; +}; + +// A `DbStressTablePropertiesCollectorFactory` creates +// `DbStressTablePropertiesCollectorFactory`s. 
+class DbStressTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new DbStressTablePropertiesCollector(); + } + + virtual const char* Name() const override { + return "DbStressTablePropertiesCollectorFactory"; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,26 +10,64 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#include "db_stress_tool/db_stress_compaction_filter.h" #include "db_stress_tool/db_stress_driver.h" +#include "db_stress_tool/db_stress_table_properties_collector.h" #include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/object_registry.h" +#include "util/cast_util.h" +#include "utilities/backupable/backupable_db_impl.h" +#include "utilities/fault_injection_fs.h" +#include "utilities/fault_injection_secondary_cache.h" namespace ROCKSDB_NAMESPACE { + +namespace { + +std::shared_ptr CreateFilterPolicy() { + if (FLAGS_bloom_bits < 0) { + return BlockBasedTableOptions().filter_policy; + } + const FilterPolicy* new_policy; + if (FLAGS_use_block_based_filter) { + if (FLAGS_ribbon_starting_level < 999) { + fprintf( + stderr, + "Cannot combine use_block_based_filter and ribbon_starting_level\n"); + exit(1); + } else { + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, true); + } + } else if (FLAGS_ribbon_starting_level 
>= 999) { + // Use Bloom API + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false); + } else { + new_policy = NewRibbonFilterPolicy( + FLAGS_bloom_bits, /* bloom_before_level */ FLAGS_ribbon_starting_level); + } + return std::shared_ptr(new_policy); +} + +} // namespace + StressTest::StressTest() - : cache_(NewCache(FLAGS_cache_size)), + : cache_(NewCache(FLAGS_cache_size, FLAGS_cache_numshardbits)), compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size)), - filter_policy_(FLAGS_bloom_bits >= 0 - ? FLAGS_use_block_based_filter - ? NewBloomFilterPolicy(FLAGS_bloom_bits, true) - : NewBloomFilterPolicy(FLAGS_bloom_bits, false) - : nullptr), + filter_policy_(CreateFilterPolicy()), db_(nullptr), #ifndef ROCKSDB_LITE txn_db_(nullptr), #endif + clock_(db_stress_env->GetSystemClock().get()), new_column_family_name_(1), num_times_reopened_(0), db_preload_finished_(false), - cmp_db_(nullptr) { + cmp_db_(nullptr), + is_db_stopped_(false) { if (FLAGS_destroy_db_initially) { std::vector files; db_stress_env->GetChildren(FLAGS_db, &files); @@ -40,6 +78,7 @@ } Options options; + options.env = db_stress_env; // Remove files without preserving manfiest files #ifndef ROCKSDB_LITE const Status s = !FLAGS_use_blob_db @@ -82,7 +121,9 @@ delete cmp_db_; } -std::shared_ptr StressTest::NewCache(size_t capacity) { +std::shared_ptr StressTest::NewCache(size_t capacity, + int32_t num_shard_bits) { + ConfigOptions config_options; if (capacity <= 0) { return nullptr; } @@ -94,8 +135,46 @@ } return cache; } else { - return NewLRUCache((size_t)capacity); + LRUCacheOptions opts; + opts.capacity = capacity; + opts.num_shard_bits = num_shard_bits; +#ifndef ROCKSDB_LITE + std::shared_ptr secondary_cache; + if (!FLAGS_secondary_cache_uri.empty()) { + Status s = SecondaryCache::CreateFromString( + config_options, FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf(stderr, + "No secondary cache registered matching string: %s status=%s\n", + 
FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + if (FLAGS_secondary_cache_fault_one_in > 0) { + secondary_cache = std::make_shared( + secondary_cache, static_cast(FLAGS_seed), + FLAGS_secondary_cache_fault_one_in); + } + opts.secondary_cache = secondary_cache; + } +#endif + return NewLRUCache(opts); + } +} + +std::vector StressTest::GetBlobCompressionTags() { + std::vector compression_tags{"kNoCompression"}; + + if (Snappy_Supported()) { + compression_tags.emplace_back("kSnappyCompression"); + } + if (LZ4_Supported()) { + compression_tags.emplace_back("kLZ4Compression"); } + if (ZSTD_Supported()) { + compression_tags.emplace_back("kZSTD"); + } + + return compression_tags; } bool StressTest::BuildOptionsTable() { @@ -176,6 +255,25 @@ {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, }; + if (FLAGS_allow_setting_blob_options_dynamically) { + options_tbl.emplace("enable_blob_files", + std::vector{"false", "true"}); + options_tbl.emplace("min_blob_size", + std::vector{"0", "8", "16"}); + options_tbl.emplace("blob_file_size", + std::vector{"1M", "16M", "256M", "1G"}); + options_tbl.emplace("blob_compression_type", GetBlobCompressionTags()); + options_tbl.emplace("enable_blob_garbage_collection", + std::vector{"false", "true"}); + options_tbl.emplace( + "blob_garbage_collection_age_cutoff", + std::vector{"0.0", "0.25", "0.5", "0.75", "1.0"}); + options_tbl.emplace("blob_garbage_collection_force_threshold", + std::vector{"0.5", "0.75", "1.0"}); + options_tbl.emplace("blob_compaction_readahead_size", + std::vector{"0", "1M", "4M"}); + } + options_table_ = std::move(options_tbl); for (const auto& iter : options_table_) { @@ -185,28 +283,64 @@ } void StressTest::InitDb() { - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock_->NowMicros(); fprintf(stdout, "%s Initializing db_stress\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); PrintEnv(); Open(); BuildOptionsTable(); 
} -void StressTest::InitReadonlyDb(SharedState* shared) { - uint64_t now = db_stress_env->NowMicros(); - fprintf(stdout, "%s Preloading db with %" PRIu64 " KVs\n", - db_stress_env->TimeToString(now / 1000000).c_str(), FLAGS_max_key); - PreloadDbAndReopenAsReadOnly(FLAGS_max_key, shared); +void StressTest::FinishInitDb(SharedState* shared) { + if (FLAGS_read_only) { + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Preloading db with %" PRIu64 " KVs\n", + clock_->TimeToString(now / 1000000).c_str(), FLAGS_max_key); + PreloadDbAndReopenAsReadOnly(FLAGS_max_key, shared); + } + + if (shared->HasHistory()) { + // The way it works right now is, if there's any history, that means the + // previous run mutating the DB had all its operations traced, in which case + // we should always be able to `Restore()` the expected values to match the + // `db_`'s current seqno. + Status s = shared->Restore(db_); + if (!s.ok()) { + fprintf(stderr, "Error restoring historical expected values: %s\n", + s.ToString().c_str()); + exit(1); + } + } + + if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) { + Status s = shared->SaveAtAndAfter(db_); + if (!s.ok()) { + fprintf(stderr, "Error enabling history tracing: %s\n", + s.ToString().c_str()); + exit(1); + } + } + + if (FLAGS_enable_compaction_filter) { + auto* compaction_filter_factory = + reinterpret_cast( + options_.compaction_filter_factory.get()); + assert(compaction_filter_factory); + // This must be called only after any potential `SharedState::Restore()` has + // completed in order for the `compaction_filter_factory` to operate on the + // correct latest values file. 
+ compaction_filter_factory->SetSharedState(shared); + fprintf(stdout, "Compaction filter factory: %s\n", + compaction_filter_factory->Name()); + } } bool StressTest::VerifySecondaries() { #ifndef ROCKSDB_LITE if (FLAGS_test_secondary) { - uint64_t now = db_stress_env->NowMicros(); - fprintf( - stdout, "%s Start to verify secondaries against primary\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Start to verify secondaries against primary\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } for (size_t k = 0; k != secondaries_.size(); ++k) { Status s = secondaries_[k]->TryCatchUpWithPrimary(); @@ -248,10 +382,9 @@ } } if (FLAGS_test_secondary) { - uint64_t now = db_stress_env->NowMicros(); - fprintf( - stdout, "%s Verification of secondaries succeeded\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Verification of secondaries succeeded\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } #endif // ROCKSDB_LITE return true; @@ -265,6 +398,11 @@ } ReadOptions ropt; ropt.snapshot = snap_state.snapshot; + Slice ts; + if (!snap_state.timestamp.empty()) { + ts = snap_state.timestamp; + ropt.timestamp = &ts; + } PinnableSlice exp_v(&snap_state.value); exp_v.PinSelf(); PinnableSlice v; @@ -316,9 +454,11 @@ void StressTest::VerificationAbort(SharedState* shared, std::string msg, int cf, int64_t key) const { + auto key_str = Key(key); + Slice key_slice = key_str; fprintf(stderr, - "Verification failed for column family %d key %" PRIi64 ": %s\n", cf, - key, msg.c_str()); + "Verification failed for column family %d key %s (%" PRIi64 "): %s\n", + cf, key_slice.ToString(true).c_str(), key, msg.c_str()); shared->SetVerificationFailure(); } @@ -368,6 +508,13 @@ } } else { if (!FLAGS_use_txn) { + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = 
NowNanosStr(); + ts = ts_str; + write_opts.timestamp = &ts; + } s = db_->Put(write_opts, cfh, key, v); } else { #ifndef ROCKSDB_LITE @@ -408,9 +555,9 @@ #endif db_preload_finished_.store(true); - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database in read-only\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); // Reopen as read-only, can ignore all options related to updates Open(); } else { @@ -453,6 +600,8 @@ } static std::atomic txn_id = {0}; TransactionOptions txn_options; + txn_options.lock_timeout = 600000; // 10 min + txn_options.deadlock_detect = true; *txn = txn_db_->BeginTransaction(write_opts, txn_options); auto istr = std::to_string(txn_id.fetch_add(1)); Status s = (*txn)->SetName("xid" + istr); @@ -493,13 +642,40 @@ write_opts.sync = true; } write_opts.disableWAL = FLAGS_disable_wal; - const int prefixBound = static_cast(FLAGS_readpercent) + - static_cast(FLAGS_prefixpercent); - const int writeBound = prefixBound + static_cast(FLAGS_writepercent); - const int delBound = writeBound + static_cast(FLAGS_delpercent); - const int delRangeBound = delBound + static_cast(FLAGS_delrangepercent); + const int prefix_bound = static_cast(FLAGS_readpercent) + + static_cast(FLAGS_prefixpercent); + const int write_bound = prefix_bound + static_cast(FLAGS_writepercent); + const int del_bound = write_bound + static_cast(FLAGS_delpercent); + const int delrange_bound = + del_bound + static_cast(FLAGS_delrangepercent); + const int iterate_bound = + delrange_bound + static_cast(FLAGS_iterpercent); + const uint64_t ops_per_open = FLAGS_ops_per_thread / (FLAGS_reopen + 1); +#ifndef NDEBUG + if (FLAGS_read_fault_one_in) { + fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(), + FLAGS_read_fault_one_in); + } + if (FLAGS_write_fault_one_in) { + IOStatus error_msg; + if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) { + 
error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + } else if (FLAGS_injest_error_severity == 2) { + // Ingest the fatal error + error_msg = IOStatus::IOError("Fatal IO Error"); + error_msg.SetDataLoss(true); + } + std::vector types = {FileType::kTableFile, + FileType::kDescriptorFile, + FileType::kCurrentFile}; + fault_fs_guard->SetRandomWriteError( + thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, + /*inject_for_all_file_types=*/false, types); + } +#endif // NDEBUG thread->stats.Start(); for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) { if (thread->shared->HasVerificationFailedYet() || @@ -591,13 +767,29 @@ } #ifndef ROCKSDB_LITE - // Every 1 in N verify the one of the following: 1) GetLiveFiles - // 2) GetSortedWalFiles 3) GetCurrentWalFile. Each time, randomly select - // one of them to run the test. - if (thread->rand.OneInOpt(FLAGS_get_live_files_and_wal_files_one_in)) { - Status status = VerifyGetLiveAndWalFiles(thread); + // Verify GetLiveFiles with a 1 in N chance. + if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in) && + !FLAGS_write_fault_one_in) { + Status status = VerifyGetLiveFiles(); + if (!status.ok()) { + VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status); + } + } + + // Verify GetSortedWalFiles with a 1 in N chance. + if (thread->rand.OneInOpt(FLAGS_get_sorted_wal_files_one_in)) { + Status status = VerifyGetSortedWalFiles(); + if (!status.ok()) { + VerificationAbort(shared, "VerifyGetSortedWalFiles status not OK", + status); + } + } + + // Verify GetCurrentWalFile with a 1 in N chance. 
+ if (thread->rand.OneInOpt(FLAGS_get_current_wal_file_one_in)) { + Status status = VerifyGetCurrentWalFile(); if (!status.ok()) { - VerificationAbort(shared, "VerifyGetLiveAndWalFiles status not OK", + VerificationAbort(shared, "VerifyGetCurrentWalFile status not OK", status); } } @@ -618,6 +810,10 @@ VerificationAbort(shared, "VerifyChecksum status not OK", status); } } + + if (thread->rand.OneInOpt(FLAGS_get_property_one_in)) { + TestGetProperty(thread); + } #endif std::vector rand_keys = GenerateKeys(rand_key); @@ -627,10 +823,23 @@ } if (thread->rand.OneInOpt(FLAGS_backup_one_in)) { - Status s = TestBackupRestore(thread, rand_column_families, rand_keys); - if (!s.ok()) { - VerificationAbort(shared, "Backup/restore gave inconsistent state", - s); + // Beyond a certain DB size threshold, this test becomes heavier than + // it's worth. + uint64_t total_size = 0; + if (FLAGS_backup_max_size > 0) { + std::vector files; + db_stress_env->GetChildrenFileAttributes(FLAGS_db, &files); + for (auto& file : files) { + total_size += file.size_bytes; + } + } + + if (total_size <= FLAGS_backup_max_size) { + Status s = TestBackupRestore(thread, rand_column_families, rand_keys); + if (!s.ok()) { + VerificationAbort(shared, "Backup/restore gave inconsistent state", + s); + } } } @@ -661,6 +870,20 @@ } } + // Assign timestamps if necessary. + std::string read_ts_str; + std::string write_ts_str; + Slice read_ts; + Slice write_ts; + if (ShouldAcquireMutexOnKey() && FLAGS_user_timestamp_size > 0) { + read_ts_str = GenerateTimestampForRead(); + read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } + int prob_op = thread->rand.Uniform(100); // Reset this in case we pick something other than a read op. 
We don't // want to use a stale value when deciding at the beginning of the loop @@ -683,7 +906,7 @@ } else { TestGet(thread, read_opts, rand_column_families, rand_keys); } - } else if (prob_op < prefixBound) { + } else if (prob_op < prefix_bound) { assert(static_cast(FLAGS_readpercent) <= prob_op); // OPERATION prefix scan // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are @@ -691,22 +914,22 @@ // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same // prefix TestPrefixScan(thread, read_opts, rand_column_families, rand_keys); - } else if (prob_op < writeBound) { - assert(prefixBound <= prob_op); + } else if (prob_op < write_bound) { + assert(prefix_bound <= prob_op); // OPERATION write TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys, value, lock); - } else if (prob_op < delBound) { - assert(writeBound <= prob_op); + } else if (prob_op < del_bound) { + assert(write_bound <= prob_op); // OPERATION delete TestDelete(thread, write_opts, rand_column_families, rand_keys, lock); - } else if (prob_op < delRangeBound) { - assert(delBound <= prob_op); + } else if (prob_op < delrange_bound) { + assert(del_bound <= prob_op); // OPERATION delete range TestDeleteRange(thread, write_opts, rand_column_families, rand_keys, lock); - } else { - assert(delRangeBound <= prob_op); + } else if (prob_op < iterate_bound) { + assert(delrange_bound <= prob_op); // OPERATION iterate int num_seeks = static_cast( std::min(static_cast(thread->rand.Uniform(4)), @@ -714,6 +937,9 @@ rand_keys = GenerateNKeys(thread, num_seeks, i); i += num_seeks - 1; TestIterate(thread, read_opts, rand_column_families, rand_keys); + } else { + assert(iterate_bound <= prob_op); + TestCustomOperations(thread, rand_column_families); } thread->stats.FinishedSingleOp(); #ifndef ROCKSDB_LITE @@ -751,8 +977,16 @@ std::vector boundaries; for (const LevelMetaData& lmd : cfmd.levels) { for (const SstFileMetaData& sfmd : lmd.files) { - 
boundaries.push_back(sfmd.smallestkey); - boundaries.push_back(sfmd.largestkey); + // If FLAGS_user_timestamp_size > 0, then both smallestkey and largestkey + // have timestamps. + const auto& skey = sfmd.smallestkey; + const auto& lkey = sfmd.largestkey; + assert(skey.size() >= FLAGS_user_timestamp_size); + assert(lkey.size() >= FLAGS_user_timestamp_size); + boundaries.push_back( + skey.substr(0, skey.size() - FLAGS_user_timestamp_size)); + boundaries.push_back( + lkey.substr(0, lkey.size() - FLAGS_user_timestamp_size)); } } if (boundaries.empty()) { @@ -902,6 +1136,7 @@ // iterators with the same set-up, and it doesn't hurt to check them // to be equal. ReadOptions cmp_ro; + cmp_ro.timestamp = readoptionscopy.timestamp; cmp_ro.snapshot = snapshot; cmp_ro.total_order_seek = true; ColumnFamilyHandle* cmp_cfh = @@ -976,28 +1211,23 @@ } #ifndef ROCKSDB_LITE -// Test the return status of GetLiveFiles, GetSortedWalFiles, and -// GetCurrentWalFile. Each time, randomly select one of them to run -// and return the status. -Status StressTest::VerifyGetLiveAndWalFiles(ThreadState* thread) { - int case_num = thread->rand.Uniform(3); - if (case_num == 0) { - std::vector live_file; - uint64_t manifest_size; - return db_->GetLiveFiles(live_file, &manifest_size); - } - - if (case_num == 1) { - VectorLogPtr log_ptr; - return db_->GetSortedWalFiles(log_ptr); - } - - if (case_num == 2) { - std::unique_ptr cur_wal_file; - return db_->GetCurrentWalFile(&cur_wal_file); - } - assert(false); - return Status::Corruption("Undefined case happens!"); +// Test the return status of GetLiveFiles. +Status StressTest::VerifyGetLiveFiles() const { + std::vector live_file; + uint64_t manifest_size = 0; + return db_->GetLiveFiles(live_file, &manifest_size); +} + +// Test the return status of GetSortedWalFiles. +Status StressTest::VerifyGetSortedWalFiles() const { + VectorLogPtr log_ptr; + return db_->GetSortedWalFiles(log_ptr); +} + +// Test the return status of GetCurrentWalFile. 
+Status StressTest::VerifyGetCurrentWalFile() const { + std::unique_ptr cur_wal_file; + return db_->GetCurrentWalFile(&cur_wal_file); } #endif // !ROCKSDB_LITE @@ -1026,21 +1256,25 @@ *diverged = true; return; } else if (op == kLastOpSeek && ro.iterate_lower_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_lower_bound, seek_key) >= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) >= 0 || (ro.iterate_upper_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts*/ false) >= 0))) { // Lower bound behavior is not well defined if it is larger than // seek key or upper bound. Disable the check for now. *diverged = true; return; } else if (op == kLastOpSeekForPrev && ro.iterate_upper_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_upper_bound, seek_key) <= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_upper_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) <= 0 || (ro.iterate_lower_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts=*/false) >= 0))) { // Uppder bound behavior is not well defined if it is smaller than // seek key or lower bound. Disable the check for now. 
*diverged = true; @@ -1109,9 +1343,13 @@ if ((iter->Valid() && iter->key() != cmp_iter->key()) || (!iter->Valid() && (ro.iterate_upper_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_upper_bound) < 0) && + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_upper_bound, + /*b_has_ts=*/false) < 0) && (ro.iterate_lower_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_lower_bound) > 0))) { + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_lower_bound, + /*b_has_ts=*/false) > 0))) { fprintf(stderr, "Iterator diverged from control iterator which" " has value %s %s\n", @@ -1169,35 +1407,137 @@ Status StressTest::TestBackupRestore( ThreadState* thread, const std::vector& rand_column_families, const std::vector& rand_keys) { - // Note the column families chosen by `rand_column_families` cannot be - // dropped while the locks for `rand_keys` are held. So we should not have - // to worry about accessing those column families throughout this function. 
- assert(rand_column_families.size() == rand_keys.size()); std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid); std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid); BackupableDBOptions backup_opts(backup_dir); + // For debugging, get info_log from live options + backup_opts.info_log = db_->GetDBOptions().info_log.get(); + if (thread->rand.OneIn(10)) { + backup_opts.share_table_files = false; + } else { + backup_opts.share_table_files = true; + if (thread->rand.OneIn(5)) { + backup_opts.share_files_with_checksum = false; + } else { + backup_opts.share_files_with_checksum = true; + if (thread->rand.OneIn(2)) { + // old + backup_opts.share_files_with_checksum_naming = + BackupableDBOptions::kLegacyCrc32cAndFileSize; + } else { + // new + backup_opts.share_files_with_checksum_naming = + BackupableDBOptions::kUseDbSessionId; + } + if (thread->rand.OneIn(2)) { + backup_opts.share_files_with_checksum_naming = + backup_opts.share_files_with_checksum_naming | + BackupableDBOptions::kFlagIncludeFileSize; + } + } + } BackupEngine* backup_engine = nullptr; + std::string from = "a backup/restore operation"; Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine); + if (!s.ok()) { + from = "BackupEngine::Open"; + } if (s.ok()) { - s = backup_engine->CreateNewBackup(db_); + if (thread->rand.OneIn(2)) { + TEST_FutureSchemaVersion2Options test_opts; + test_opts.crc32c_checksums = thread->rand.OneIn(2) == 0; + test_opts.file_sizes = thread->rand.OneIn(2) == 0; + TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts); + } + CreateBackupOptions create_opts; + if (FLAGS_disable_wal) { + // The verification can only work when latest value of `key` is backed up, + // which requires flushing in case of WAL disabled. + // + // Note this triggers a flush with a key lock held. Meanwhile, operations + // like flush/compaction may attempt to grab key locks like in + // `DbStressCompactionFilter`. 
The philosophy around preventing deadlock + // is the background operation key lock acquisition only tries but does + // not wait for the lock. So here in the foreground it is OK to hold the + // lock and wait on a background operation (flush). + create_opts.flush_before_backup = true; + } + s = backup_engine->CreateNewBackup(create_opts, db_); + if (!s.ok()) { + from = "BackupEngine::CreateNewBackup"; + } } if (s.ok()) { delete backup_engine; backup_engine = nullptr; s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine); + if (!s.ok()) { + from = "BackupEngine::Open (again)"; + } } + std::vector backup_info; + // If inplace_not_restore, we verify the backup by opening it as a + // read-only DB. If !inplace_not_restore, we restore it to a temporary + // directory for verification. + bool inplace_not_restore = thread->rand.OneIn(3); if (s.ok()) { - s = backup_engine->RestoreDBFromLatestBackup(restore_dir /* db_dir */, - restore_dir /* wal_dir */); + backup_engine->GetBackupInfo(&backup_info, + /*include_file_details*/ inplace_not_restore); + if (backup_info.empty()) { + s = Status::NotFound("no backups found"); + from = "BackupEngine::GetBackupInfo"; + } } - if (s.ok()) { - s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */); + if (s.ok() && thread->rand.OneIn(2)) { + s = backup_engine->VerifyBackup( + backup_info.front().backup_id, + thread->rand.OneIn(2) /* verify_with_checksum */); + if (!s.ok()) { + from = "BackupEngine::VerifyBackup"; + } + } + const bool allow_persistent = thread->tid == 0; // not too many + bool from_latest = false; + int count = static_cast(backup_info.size()); + if (s.ok() && !inplace_not_restore) { + if (count > 1) { + s = backup_engine->RestoreDBFromBackup( + RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id, + restore_dir /* db_dir */, restore_dir /* wal_dir */); + if (!s.ok()) { + from = "BackupEngine::RestoreDBFromBackup"; + } + } else { + from_latest = true; + s = 
backup_engine->RestoreDBFromLatestBackup(RestoreOptions(), + restore_dir /* db_dir */, + restore_dir /* wal_dir */); + if (!s.ok()) { + from = "BackupEngine::RestoreDBFromLatestBackup"; + } + } + } + if (s.ok() && !inplace_not_restore) { + // Purge early if restoring, to ensure the restored directory doesn't + // have some secret dependency on the backup directory. + uint32_t to_keep = 0; + if (allow_persistent) { + // allow one thread to keep up to 2 backups + to_keep = thread->rand.Uniform(3); + } + s = backup_engine->PurgeOldBackups(to_keep); + if (!s.ok()) { + from = "BackupEngine::PurgeOldBackups"; + } } DB* restored_db = nullptr; std::vector restored_cf_handles; - if (s.ok()) { + // Not yet implemented: opening restored BlobDB or TransactionDB + if (s.ok() && !FLAGS_use_txn && !FLAGS_use_blob_db) { Options restore_options(options_); restore_options.listeners.clear(); + // Avoid dangling/shared file descriptors, for reliable destroy + restore_options.sst_file_manager = nullptr; std::vector cf_descriptors; // TODO(ajkr): `column_family_names_` is not safe to access here when // `clear_column_family_one_in != 0`. 
But we can't easily switch to @@ -1207,35 +1547,61 @@ for (auto name : column_family_names_) { cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options)); } - s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, - &restored_cf_handles, &restored_db); + if (inplace_not_restore) { + BackupInfo& info = backup_info[thread->rand.Uniform(count)]; + restore_options.env = info.env_for_open.get(); + s = DB::OpenForReadOnly(DBOptions(restore_options), info.name_for_open, + cf_descriptors, &restored_cf_handles, + &restored_db); + if (!s.ok()) { + from = "DB::OpenForReadOnly in backup/restore"; + } + } else { + s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, + &restored_cf_handles, &restored_db); + if (!s.ok()) { + from = "DB::Open in backup/restore"; + } + } } - // for simplicity, currently only verifies existence/non-existence of a few - // keys - for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) { - std::string key_str = Key(rand_keys[i]); + // Note the column families chosen by `rand_column_families` cannot be + // dropped while the locks for `rand_keys` are held. So we should not have + // to worry about accessing those column families throughout this function. 
+ // + // For simplicity, currently only verifies existence/non-existence of a + // single key + for (size_t i = 0; restored_db && s.ok() && i < rand_column_families.size(); + ++i) { + std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string restored_value; + ReadOptions read_opts; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + read_opts.timestamp = &ts; + } Status get_status = restored_db->Get( - ReadOptions(), restored_cf_handles[rand_column_families[i]], key, + read_opts, restored_cf_handles[rand_column_families[i]], key, &restored_value); - bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[i]); + bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]); if (get_status.ok()) { - if (!exists) { + if (!exists && from_latest && ShouldAcquireMutexOnKey()) { s = Status::Corruption("key exists in restore but not in original db"); } } else if (get_status.IsNotFound()) { - if (exists) { + if (exists && from_latest && ShouldAcquireMutexOnKey()) { s = Status::Corruption("key exists in original db but not in restore"); } } else { s = get_status; + if (!s.ok()) { + from = "DB::Get in backup/restore"; + } } } - if (backup_engine != nullptr) { - delete backup_engine; - backup_engine = nullptr; - } if (restored_db != nullptr) { for (auto* cf_handle : restored_cf_handles) { restored_db->DestroyColumnFamilyHandle(cf_handle); @@ -1243,14 +1609,44 @@ delete restored_db; restored_db = nullptr; } + if (s.ok() && inplace_not_restore) { + // Purge late if inplace open read-only + uint32_t to_keep = 0; + if (allow_persistent) { + // allow one thread to keep up to 2 backups + to_keep = thread->rand.Uniform(3); + } + s = backup_engine->PurgeOldBackups(to_keep); + if (!s.ok()) { + from = "BackupEngine::PurgeOldBackups"; + } + } + if (backup_engine != nullptr) { + delete backup_engine; + backup_engine = nullptr; + } + if (s.ok()) { + // Preserve 
directories on failure, or allowed persistent backup + if (!allow_persistent) { + s = DestroyDir(db_stress_env, backup_dir); + if (!s.ok()) { + from = "Destroy backup dir"; + } + } + } + if (s.ok()) { + s = DestroyDir(db_stress_env, restore_dir); + if (!s.ok()) { + from = "Destroy restore dir"; + } + } if (!s.ok()) { - fprintf(stderr, "A backup/restore operation failed with: %s\n", + fprintf(stderr, "Failure in %s with: %s\n", from.c_str(), s.ToString().c_str()); } return s; } -#ifndef ROCKSDB_LITE Status StressTest::TestApproximateSize( ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, @@ -1292,33 +1688,52 @@ return db_->GetApproximateSizes( sao, column_families_[rand_column_families[0]], &range, 1, &result); } -#endif // ROCKSDB_LITE Status StressTest::TestCheckpoint(ThreadState* thread, const std::vector& rand_column_families, const std::vector& rand_keys) { - // Note the column families chosen by `rand_column_families` cannot be - // dropped while the locks for `rand_keys` are held. So we should not have - // to worry about accessing those column families throughout this function. - assert(rand_column_families.size() == rand_keys.size()); std::string checkpoint_dir = FLAGS_db + "/.checkpoint" + ToString(thread->tid); Options tmp_opts(options_); tmp_opts.listeners.clear(); - tmp_opts.env = db_stress_env->target(); + tmp_opts.env = db_stress_env; DestroyDB(checkpoint_dir, tmp_opts); + if (db_stress_env->FileExists(checkpoint_dir).ok()) { + // If the directory might still exist, try to delete the files one by one. + // Likely a trash file is still there. 
+ Status my_s = DestroyDir(db_stress_env, checkpoint_dir); + if (!my_s.ok()) { + fprintf(stderr, "Fail to destory directory before checkpoint: %s", + my_s.ToString().c_str()); + } + } + Checkpoint* checkpoint = nullptr; Status s = Checkpoint::Create(db_, &checkpoint); if (s.ok()) { s = checkpoint->CreateCheckpoint(checkpoint_dir); + if (!s.ok()) { + fprintf(stderr, "Fail to create checkpoint to %s\n", + checkpoint_dir.c_str()); + std::vector files; + Status my_s = db_stress_env->GetChildren(checkpoint_dir, &files); + if (my_s.ok()) { + for (const auto& f : files) { + fprintf(stderr, " %s\n", f.c_str()); + } + } else { + fprintf(stderr, "Fail to get files under the directory to %s\n", + my_s.ToString().c_str()); + } + } } + delete checkpoint; + checkpoint = nullptr; std::vector cf_handles; DB* checkpoint_db = nullptr; if (s.ok()) { - delete checkpoint; - checkpoint = nullptr; Options options(options_); options.listeners.clear(); std::vector cf_descs; @@ -1326,6 +1741,7 @@ // `clear_column_family_one_in != 0`. But we can't easily switch to // `ListColumnFamilies` to get names because it won't necessarily give // the same order as `column_family_names_`. + assert(FLAGS_clear_column_family_one_in == 0); if (FLAGS_clear_column_family_one_in == 0) { for (const auto& name : column_family_names_) { cf_descs.emplace_back(name, ColumnFamilyOptions(options)); @@ -1335,21 +1751,24 @@ } } if (checkpoint_db != nullptr) { + // Note the column families chosen by `rand_column_families` cannot be + // dropped while the locks for `rand_keys` are held. So we should not have + // to worry about accessing those column families throughout this function. 
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) { - std::string key_str = Key(rand_keys[i]); + std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string value; Status get_status = checkpoint_db->Get( ReadOptions(), cf_handles[rand_column_families[i]], key, &value); bool exists = - thread->shared->Exists(rand_column_families[i], rand_keys[i]); + thread->shared->Exists(rand_column_families[i], rand_keys[0]); if (get_status.ok()) { - if (!exists) { + if (!exists && ShouldAcquireMutexOnKey()) { s = Status::Corruption( "key exists in checkpoint but not in original db"); } } else if (get_status.IsNotFound()) { - if (exists) { + if (exists && ShouldAcquireMutexOnKey()) { s = Status::Corruption( "key exists in original db but not in checkpoint"); } @@ -1365,20 +1784,92 @@ checkpoint_db = nullptr; } - DestroyDB(checkpoint_dir, tmp_opts); - if (!s.ok()) { fprintf(stderr, "A checkpoint operation failed with: %s\n", s.ToString().c_str()); + } else { + DestroyDB(checkpoint_dir, tmp_opts); } return s; } +void StressTest::TestGetProperty(ThreadState* thread) const { + std::unordered_set levelPropertyNames = { + DB::Properties::kAggregatedTablePropertiesAtLevel, + DB::Properties::kCompressionRatioAtLevelPrefix, + DB::Properties::kNumFilesAtLevelPrefix, + }; + std::unordered_set unknownPropertyNames = { + DB::Properties::kEstimateOldestKeyTime, + DB::Properties::kOptionsStatistics, + DB::Properties:: + kLiveSstFilesSizeAtTemperature, // similar to levelPropertyNames, it + // requires a number suffix + }; + unknownPropertyNames.insert(levelPropertyNames.begin(), + levelPropertyNames.end()); + + std::string prop; + for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) { + bool res = db_->GetProperty(ppt_name_and_info.first, &prop); + if (unknownPropertyNames.find(ppt_name_and_info.first) == + unknownPropertyNames.end()) { + if (!res) { + fprintf(stderr, "Failed to get DB property: %s\n", + ppt_name_and_info.first.c_str()); + 
thread->shared->SetVerificationFailure(); + } + if (ppt_name_and_info.second.handle_int != nullptr) { + uint64_t prop_int; + if (!db_->GetIntProperty(ppt_name_and_info.first, &prop_int)) { + fprintf(stderr, "Failed to get Int property: %s\n", + ppt_name_and_info.first.c_str()); + thread->shared->SetVerificationFailure(); + } + } + if (ppt_name_and_info.second.handle_map != nullptr) { + std::map prop_map; + if (!db_->GetMapProperty(ppt_name_and_info.first, &prop_map)) { + fprintf(stderr, "Failed to get Map property: %s\n", + ppt_name_and_info.first.c_str()); + thread->shared->SetVerificationFailure(); + } + } + } + } + + ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data; + db_->GetColumnFamilyMetaData(&cf_meta_data); + int level_size = static_cast(cf_meta_data.levels.size()); + for (int level = 0; level < level_size; level++) { + for (const auto& ppt_name : levelPropertyNames) { + bool res = db_->GetProperty(ppt_name + std::to_string(level), &prop); + if (!res) { + fprintf(stderr, "Failed to get DB property: %s\n", + (ppt_name + std::to_string(level)).c_str()); + thread->shared->SetVerificationFailure(); + } + } + } + + // Test for an invalid property name + if (thread->rand.OneIn(100)) { + if (db_->GetProperty("rocksdb.invalid_property_name", &prop)) { + fprintf(stderr, "Failed to return false for invalid property name\n"); + thread->shared->SetVerificationFailure(); + } + } +} + void StressTest::TestCompactFiles(ThreadState* thread, ColumnFamilyHandle* column_family) { ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data; db_->GetColumnFamilyMetaData(column_family, &cf_meta_data); + if (cf_meta_data.levels.empty()) { + return; + } + // Randomly compact up to three consecutive files from a level const int kMaxRetry = 3; for (int attempt = 0; attempt < kMaxRetry; ++attempt) { @@ -1424,6 +1915,9 @@ Status StressTest::TestFlush(const std::vector& rand_column_families) { FlushOptions flush_opts; + if (FLAGS_atomic_flush) { + return db_->Flush(flush_opts, 
column_families_); + } std::vector cfhs; std::for_each(rand_column_families.begin(), rand_column_families.end(), [this, &cfhs](int k) { cfhs.push_back(column_families_[k]); }); @@ -1442,7 +1936,7 @@ // 1 chance in 625 of pausing full 16s.) int pwr2_micros = std::min(thread->rand.Uniform(25), thread->rand.Uniform(25)); - db_stress_env->SleepForMicroseconds(1 << pwr2_micros); + clock_->SleepForMicroseconds(1 << pwr2_micros); return db_->ContinueBackgroundWork(); } @@ -1451,8 +1945,9 @@ const std::string& keystr, uint64_t i) { Slice key = keystr; ColumnFamilyHandle* column_family = column_families_[rand_column_family]; + ReadOptions ropt; #ifndef ROCKSDB_LITE - auto db_impl = reinterpret_cast(db_->GetRootDB()); + auto db_impl = static_cast_with_check(db_->GetRootDB()); const bool ww_snapshot = thread->rand.OneIn(10); const Snapshot* snapshot = ww_snapshot ? db_impl->GetSnapshotForWriteConflictBoundary() @@ -1460,8 +1955,19 @@ #else const Snapshot* snapshot = db_->GetSnapshot(); #endif // !ROCKSDB_LITE - ReadOptions ropt; ropt.snapshot = snapshot; + + // Ideally, we want snapshot taking and timestamp generation to be atomic + // here, so that the snapshot corresponds to the timestamp. However, it is + // not possible with current GetSnapshot() API. + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ropt.timestamp = &ts; + } + std::string value_at; // When taking a snapshot, we also read a key from that snapshot. 
We // will later read the same key before releasing the snapshot and @@ -1483,10 +1989,14 @@ } } - ThreadState::SnapshotState snap_state = { - snapshot, rand_column_family, column_family->GetName(), - keystr, status_at, value_at, - key_vec}; + ThreadState::SnapshotState snap_state = {snapshot, + rand_column_family, + column_family->GetName(), + keystr, + status_at, + value_at, + key_vec, + ts_str}; uint64_t hold_for = FLAGS_snapshot_hold_ops; if (FLAGS_long_running_snapshots) { // Hold 10% of snapshots for 10x more @@ -1591,6 +2101,13 @@ ReadOptions ro; ro.snapshot = snapshot; ro.total_order_seek = true; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ro.timestamp = &ts; + } std::unique_ptr it(db_->NewIterator(ro, column_family)); for (it->Seek(start_key); it->Valid() && options_.comparator->Compare(it->key(), end_key) <= 0; @@ -1617,7 +2134,7 @@ fprintf(stdout, "TransactionDB : %s\n", FLAGS_use_txn ? "true" : "false"); #ifndef ROCKSDB_LITE - fprintf(stdout, "BlobDB : %s\n", + fprintf(stdout, "Stacked BlobDB : %s\n", FLAGS_use_blob_db ? 
"true" : "false"); #endif // !ROCKSDB_LITE fprintf(stdout, "Read only mode : %s\n", @@ -1634,7 +2151,7 @@ (unsigned long)FLAGS_ops_per_thread); std::string ttl_state("unused"); if (FLAGS_ttl > 0) { - ttl_state = NumberToString(FLAGS_ttl); + ttl_state = ToString(FLAGS_ttl); } fprintf(stdout, "Time to live(sec) : %s\n", ttl_state.c_str()); fprintf(stdout, "Read percentage : %d%%\n", FLAGS_readpercent); @@ -1645,6 +2162,7 @@ fprintf(stdout, "No overwrite percentage : %d%%\n", FLAGS_nooverwritepercent); fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); + fprintf(stdout, "Custom ops percentage : %d%%\n", FLAGS_customopspercent); fprintf(stdout, "DB-write-buffer-size : %" PRIu64 "\n", FLAGS_db_write_buffer_size); fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); @@ -1668,6 +2186,8 @@ bottommost_compression.c_str()); std::string checksum = ChecksumTypeToString(checksum_type_e); fprintf(stdout, "Checksum type : %s\n", checksum.c_str()); + fprintf(stdout, "File checksum impl : %s\n", + FLAGS_file_checksum_impl.c_str()); fprintf(stdout, "Bloom bits / key : %s\n", FormatDoubleParam(FLAGS_bloom_bits).c_str()); fprintf(stdout, "Max subcompactions : %" PRIu64 "\n", @@ -1690,13 +2210,16 @@ fprintf(stdout, "Memtablerep : %s\n", memtablerep); - fprintf(stdout, "Test kill odd : %d\n", rocksdb_kill_odds); - if (!rocksdb_kill_prefix_blacklist.empty()) { +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + fprintf(stdout, "Test kill odd : %d\n", kp->rocksdb_kill_odds); + if (!kp->rocksdb_kill_exclude_prefixes.empty()) { fprintf(stdout, "Skipping kill points prefixes:\n"); - for (auto& p : rocksdb_kill_prefix_blacklist) { + for (auto& p : kp->rocksdb_kill_exclude_prefixes) { fprintf(stdout, " %s\n", p.c_str()); } } +#endif fprintf(stdout, "Periodic Compaction Secs : %" PRIu64 "\n", FLAGS_periodic_compaction_seconds); fprintf(stdout, "Compaction TTL : %" PRIu64 "\n", @@ -1709,6 +2232,18 @@ FLAGS_max_write_batch_group_size_bytes); 
fprintf(stdout, "Use dynamic level : %d\n", static_cast(FLAGS_level_compaction_dynamic_level_bytes)); + fprintf(stdout, "Read fault one in : %d\n", FLAGS_read_fault_one_in); + fprintf(stdout, "Write fault one in : %d\n", FLAGS_write_fault_one_in); + fprintf(stdout, "Open metadata write fault one in:\n"); + fprintf(stdout, " %d\n", + FLAGS_open_metadata_write_fault_one_in); + fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection); + fprintf(stdout, "Best efforts recovery : %d\n", + static_cast(FLAGS_best_efforts_recovery)); + fprintf(stdout, "Fail if OPTIONS file error: %d\n", + static_cast(FLAGS_fail_if_options_file_error)); + fprintf(stdout, "User timestamp size bytes : %d\n", + static_cast(FLAGS_user_timestamp_size)); fprintf(stdout, "------------------------------------------------\n"); } @@ -1723,6 +2258,12 @@ block_based_options.block_cache = cache_; block_based_options.cache_index_and_filter_blocks = FLAGS_cache_index_and_filter_blocks; + block_based_options.metadata_cache_options.top_level_index_pinning = + static_cast(FLAGS_top_level_index_pinning); + block_based_options.metadata_cache_options.partition_pinning = + static_cast(FLAGS_partition_pinning); + block_based_options.metadata_cache_options.unpartitioned_pinning = + static_cast(FLAGS_unpartitioned_pinning); block_based_options.block_cache_compressed = compressed_cache_; block_based_options.checksum = checksum_type_e; block_based_options.block_size = FLAGS_block_size; @@ -1732,8 +2273,13 @@ static_cast(FLAGS_index_block_restart_interval); block_based_options.filter_policy = filter_policy_; block_based_options.partition_filters = FLAGS_partition_filters; + block_based_options.optimize_filters_for_memory = + FLAGS_optimize_filters_for_memory; block_based_options.index_type = static_cast(FLAGS_index_type); + block_based_options.prepopulate_block_cache = + static_cast( + FLAGS_prepopulate_block_cache); options_.table_factory.reset( NewBlockBasedTableFactory(block_based_options)); 
options_.db_write_buffer_size = FLAGS_db_write_buffer_size; @@ -1783,12 +2329,18 @@ options_.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes; options_.compression_opts.zstd_max_train_bytes = FLAGS_compression_zstd_max_train_bytes; + options_.compression_opts.parallel_threads = + FLAGS_compression_parallel_threads; + options_.compression_opts.max_dict_buffer_bytes = + FLAGS_compression_max_dict_buffer_bytes; options_.create_if_missing = true; options_.max_manifest_file_size = FLAGS_max_manifest_file_size; options_.inplace_update_support = FLAGS_in_place_update; options_.max_subcompactions = static_cast(FLAGS_subcompactions); options_.allow_concurrent_memtable_write = FLAGS_allow_concurrent_memtable_write; + options_.experimental_mempurge_threshold = + FLAGS_experimental_mempurge_threshold; options_.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds; options_.ttl = FLAGS_compaction_ttl; options_.enable_pipelined_write = FLAGS_enable_pipelined_write; @@ -1806,10 +2358,29 @@ options_.avoid_unnecessary_blocking_io = FLAGS_avoid_unnecessary_blocking_io; options_.write_dbid_to_manifest = FLAGS_write_dbid_to_manifest; + options_.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery; options_.max_write_batch_group_size_bytes = FLAGS_max_write_batch_group_size_bytes; options_.level_compaction_dynamic_level_bytes = FLAGS_level_compaction_dynamic_level_bytes; + options_.file_checksum_gen_factory = + GetFileChecksumImpl(FLAGS_file_checksum_impl); + options_.track_and_verify_wals_in_manifest = true; + + // Integrated BlobDB + options_.enable_blob_files = FLAGS_enable_blob_files; + options_.min_blob_size = FLAGS_min_blob_size; + options_.blob_file_size = FLAGS_blob_file_size; + options_.blob_compression_type = + StringToCompressionType(FLAGS_blob_compression_type.c_str()); + options_.enable_blob_garbage_collection = + FLAGS_enable_blob_garbage_collection; + options_.blob_garbage_collection_age_cutoff = + 
FLAGS_blob_garbage_collection_age_cutoff; + options_.blob_garbage_collection_force_threshold = + FLAGS_blob_garbage_collection_force_threshold; + options_.blob_compaction_readahead_size = + FLAGS_blob_compaction_readahead_size; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file not supported in lite mode\n"); @@ -1839,6 +2410,21 @@ options_.new_table_reader_for_compaction_inputs = true; } } + if (FLAGS_sst_file_manager_bytes_per_sec > 0 || + FLAGS_sst_file_manager_bytes_per_truncate > 0) { + Status status; + options_.sst_file_manager.reset(NewSstFileManager( + db_stress_env, options_.info_log, "" /* trash_dir */, + static_cast(FLAGS_sst_file_manager_bytes_per_sec), + true /* delete_existing_trash */, &status, + 0.25 /* max_trash_db_ratio */, + FLAGS_sst_file_manager_bytes_per_truncate)); + if (!status.ok()) { + fprintf(stderr, "SstFileManager creation failed: %s\n", + status.ToString().c_str()); + exit(1); + } + } if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) { fprintf(stderr, @@ -1874,10 +2460,47 @@ } else { options_.merge_operator = MergeOperators::CreatePutOperator(); } + if (FLAGS_enable_compaction_filter) { + options_.compaction_filter_factory = + std::make_shared(); + } + options_.table_properties_collector_factories.emplace_back( + std::make_shared()); + + options_.best_efforts_recovery = FLAGS_best_efforts_recovery; + options_.paranoid_file_checks = FLAGS_paranoid_file_checks; + options_.fail_if_options_file_error = FLAGS_fail_if_options_file_error; + + if ((options_.enable_blob_files || options_.enable_blob_garbage_collection || + FLAGS_allow_setting_blob_options_dynamically) && + FLAGS_best_efforts_recovery) { + fprintf(stderr, + "Integrated BlobDB is currently incompatible with best-effort " + "recovery\n"); + exit(1); + } + + fprintf(stdout, + "Integrated BlobDB: blob files enabled %d, min blob size %" PRIu64 + ", blob file size %" PRIu64 + ", blob compression type %s, blob GC enabled %d, cutoff %f, force " + "threshold %f, 
blob compaction readahead size %" PRIu64 "\n", + options_.enable_blob_files, options_.min_blob_size, + options_.blob_file_size, + CompressionTypeToString(options_.blob_compression_type).c_str(), + options_.enable_blob_garbage_collection, + options_.blob_garbage_collection_age_cutoff, + options_.blob_garbage_collection_force_threshold, + options_.blob_compaction_readahead_size); fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); Status s; + + if (FLAGS_user_timestamp_size > 0) { + CheckAndSetOptionsForUserTimestamp(); + } + if (FLAGS_ttl == -1) { std::vector existing_column_families; s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db, @@ -1927,36 +2550,130 @@ column_family_names_.push_back(name); } options_.listeners.clear(); +#ifndef ROCKSDB_LITE options_.listeners.emplace_back( new DbStressListener(FLAGS_db, options_.db_paths, cf_descriptors)); +#endif // !ROCKSDB_LITE options_.create_missing_column_families = true; if (!FLAGS_use_txn) { -#ifndef ROCKSDB_LITE - if (FLAGS_use_blob_db) { - blob_db::BlobDBOptions blob_db_options; - blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; - blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; - blob_db_options.blob_file_size = FLAGS_blob_db_file_size; - blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; - blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; - - blob_db::BlobDB* blob_db = nullptr; - s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, - cf_descriptors, &column_families_, &blob_db); - if (s.ok()) { - db_ = blob_db; - } - } else +#ifndef NDEBUG + // Determine whether we need to ingest file metadata write failures + // during DB reopen. If it does, enable it. + // Only ingest metadata error if it is reopening, as initial open + // failure doesn't need to be handled. + // TODO cover transaction DB is not covered in this fault test too. 
+ bool ingest_meta_error = false; + bool ingest_write_error = false; + bool ingest_read_error = false; + if ((FLAGS_open_metadata_write_fault_one_in || + FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) && + fault_fs_guard + ->FileExists(FLAGS_db + "/CURRENT", IOOptions(), nullptr) + .ok()) { + if (!FLAGS_sync) { + // When DB Stress is not sync mode, we expect all WAL writes to + // WAL is durable. Buffering unsynced writes will cause false + // positive in crash tests. Before we figure out a way to + // solve it, skip WAL from failure injection. + fault_fs_guard->SetSkipDirectWritableTypes({kWalFile}); + } + ingest_meta_error = FLAGS_open_metadata_write_fault_one_in; + ingest_write_error = FLAGS_open_write_fault_one_in; + ingest_read_error = FLAGS_open_read_fault_one_in; + if (ingest_meta_error) { + fault_fs_guard->EnableMetadataWriteErrorInjection(); + fault_fs_guard->SetRandomMetadataWriteError( + FLAGS_open_metadata_write_fault_one_in); + } + if (ingest_write_error) { + fault_fs_guard->SetFilesystemDirectWritable(false); + fault_fs_guard->EnableWriteErrorInjection(); + fault_fs_guard->SetRandomWriteError( + static_cast(FLAGS_seed), FLAGS_open_write_fault_one_in, + IOStatus::IOError("Injected Open Error"), + /*inject_for_all_file_types=*/true, /*types=*/{}); + } + if (ingest_read_error) { + fault_fs_guard->SetRandomReadError(FLAGS_open_read_fault_one_in); + } + } + while (true) { +#endif // NDEBUG +#ifndef ROCKSDB_LITE + // StackableDB-based BlobDB + if (FLAGS_use_blob_db) { + blob_db::BlobDBOptions blob_db_options; + blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; + blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; + blob_db_options.blob_file_size = FLAGS_blob_db_file_size; + blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; + blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; + + blob_db::BlobDB* blob_db = nullptr; + s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, + 
cf_descriptors, &column_families_, + &blob_db); + if (s.ok()) { + db_ = blob_db; + } + } else #endif // !ROCKSDB_LITE - { - if (db_preload_finished_.load() && FLAGS_read_only) { - s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); - } else { - s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); + { + if (db_preload_finished_.load() && FLAGS_read_only) { + s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, + cf_descriptors, &column_families_, &db_); + } else { + s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, + &column_families_, &db_); + } + } + +#ifndef NDEBUG + if (ingest_meta_error || ingest_write_error || ingest_read_error) { + fault_fs_guard->SetFilesystemDirectWritable(true); + fault_fs_guard->DisableMetadataWriteErrorInjection(); + fault_fs_guard->DisableWriteErrorInjection(); + fault_fs_guard->SetSkipDirectWritableTypes({}); + fault_fs_guard->SetRandomReadError(0); + if (s.ok()) { + // Ingested errors might happen in background compactions. We + // wait for all compactions to finish to make sure DB is in + // clean state before executing queries. + s = static_cast_with_check(db_->GetRootDB()) + ->TEST_WaitForCompact(true); + if (!s.ok()) { + for (auto cf : column_families_) { + delete cf; + } + column_families_.clear(); + delete db_; + db_ = nullptr; + } + } + if (!s.ok()) { + // After failure to opening a DB due to IO error, retry should + // successfully open the DB with correct data if no IO error shows + // up. 
+ ingest_meta_error = false; + ingest_write_error = false; + ingest_read_error = false; + + Random rand(static_cast(FLAGS_seed)); + if (rand.OneIn(2)) { + fault_fs_guard->DeleteFilesCreatedAfterLastDirSync(IOOptions(), + nullptr); + } + if (rand.OneIn(3)) { + fault_fs_guard->DropUnsyncedFileData(); + } else if (rand.OneIn(2)) { + fault_fs_guard->DropRandomUnsyncedFileData(&rand); + } + continue; + } } + break; } +#endif // NDEBUG } else { #ifndef ROCKSDB_LITE TransactionDBOptions txn_db_options; @@ -2000,7 +2717,7 @@ assert(!s.ok() || column_families_.size() == static_cast(FLAGS_column_families)); - if (FLAGS_test_secondary) { + if (s.ok() && FLAGS_test_secondary) { #ifndef ROCKSDB_LITE secondaries_.resize(FLAGS_threads); std::fill(secondaries_.begin(), secondaries_.end(), nullptr); @@ -2021,13 +2738,12 @@ break; } } - assert(s.ok()); #else fprintf(stderr, "Secondary is not supported in RocksDBLite\n"); exit(1); #endif } - if (FLAGS_continuous_verification_interval > 0 && !cmp_db_) { + if (s.ok() && FLAGS_continuous_verification_interval > 0 && !cmp_db_) { Options tmp_opts; // TODO(yanqin) support max_open_files != -1 for secondary instance. tmp_opts.max_open_files = -1; @@ -2077,7 +2793,7 @@ // the db via a callbac ii) they hold on to a snapshot and the upcoming // ::Close would complain about it. 
const bool write_prepared = FLAGS_use_txn && FLAGS_txn_write_policy != 0; - bool bg_canceled = false; + bool bg_canceled __attribute__((unused)) = false; if (write_prepared || thread->rand.OneIn(2)) { const bool wait = write_prepared || static_cast(thread->rand.OneIn(2)); @@ -2085,7 +2801,6 @@ bg_canceled = wait; } assert(!write_prepared || bg_canceled); - (void) bg_canceled; #else (void) thread; #endif @@ -2123,11 +2838,80 @@ secondaries_.clear(); num_times_reopened_++; - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database for the %dth time\n", - db_stress_env->TimeToString(now / 1000000).c_str(), - num_times_reopened_); + clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_); Open(); + + if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) { + Status s = thread->shared->SaveAtAndAfter(db_); + if (!s.ok()) { + fprintf(stderr, "Error enabling history tracing: %s\n", + s.ToString().c_str()); + exit(1); + } + } +} + +void StressTest::CheckAndSetOptionsForUserTimestamp() { + assert(FLAGS_user_timestamp_size > 0); + const Comparator* const cmp = test::ComparatorWithU64Ts(); + assert(cmp); + if (FLAGS_user_timestamp_size != cmp->timestamp_size()) { + fprintf(stderr, + "Only -user_timestamp_size=%d is supported in stress test.\n", + static_cast(cmp->timestamp_size())); + exit(1); + } + if (FLAGS_use_merge || FLAGS_use_full_merge_v1) { + fprintf(stderr, "Merge does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_delrangepercent > 0) { + fprintf(stderr, "DeleteRange does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_use_txn) { + fprintf(stderr, "TransactionDB does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_read_only) { + fprintf(stderr, "When opened as read-only, timestamp not supported.\n"); + exit(1); + } + if (FLAGS_test_secondary || FLAGS_secondary_catch_up_one_in > 0 || + FLAGS_continuous_verification_interval > 0) { + fprintf(stderr, 
"Secondary instance does not support timestamp.\n"); + exit(1); + } + if (FLAGS_checkpoint_one_in > 0) { + fprintf(stderr, + "-checkpoint_one_in=%d requires " + "DBImplReadOnly, which is not supported with timestamp\n", + FLAGS_checkpoint_one_in); + exit(1); + } +#ifndef ROCKSDB_LITE + if (FLAGS_enable_blob_files || FLAGS_use_blob_db) { + fprintf(stderr, "BlobDB not supported with timestamp.\n"); + exit(1); + } +#endif // !ROCKSDB_LITE + if (FLAGS_enable_compaction_filter) { + fprintf(stderr, "CompactionFilter not supported with timestamp.\n"); + exit(1); + } + if (FLAGS_test_cf_consistency || FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Due to per-key ts-seq ordering constraint, only the (default) " + "non-batched test is supported with timestamp.\n"); + exit(1); + } + if (FLAGS_ingest_external_file_one_in > 0) { + fprintf(stderr, "Bulk loading may not support timestamp yet.\n"); + exit(1); + } + options_.comparator = cmp; } } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include "db_stress_tool/db_stress_shared_state.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class Transaction; class TransactionDB; @@ -22,12 +23,16 @@ virtual ~StressTest(); - std::shared_ptr NewCache(size_t capacity); + std::shared_ptr NewCache(size_t capacity, int32_t num_shard_bits); + + static std::vector GetBlobCompressionTags(); bool BuildOptionsTable(); void InitDb(); - void InitReadonlyDb(SharedState*); + // The initialization work is split into two parts to avoid a circular + // dependency with `SharedState`. 
+ virtual void FinishInitDb(SharedState*); // Return false if verification fails. bool VerifySecondaries(); @@ -60,6 +65,9 @@ virtual bool ShouldAcquireMutexOnKey() const { return false; } + // Returns true if DB state is tracked by the stress test. + virtual bool IsStateTracked() const = 0; + virtual std::vector GenerateColumnFamilies( const int /* num_column_families */, int rand_column_family) const { return {rand_column_family}; @@ -184,13 +192,23 @@ Status MaybeReleaseSnapshots(ThreadState* thread, uint64_t i); #ifndef ROCKSDB_LITE - Status VerifyGetLiveAndWalFiles(ThreadState* thread); + Status VerifyGetLiveFiles() const; + Status VerifyGetSortedWalFiles() const; + Status VerifyGetCurrentWalFile() const; + void TestGetProperty(ThreadState* thread) const; + virtual Status TestApproximateSize( ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, const std::vector& rand_keys); #endif // !ROCKSDB_LITE + virtual Status TestCustomOperations( + ThreadState* /*thread*/, + const std::vector& /*rand_column_families*/) { + return Status::NotSupported("TestCustomOperations() must be overridden"); + } + void VerificationAbort(SharedState* shared, std::string msg, Status s) const; void VerificationAbort(SharedState* shared, std::string msg, int cf, @@ -202,6 +220,8 @@ void Reopen(ThreadState* thread); + void CheckAndSetOptionsForUserTimestamp(); + std::shared_ptr cache_; std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; @@ -210,6 +230,7 @@ TransactionDB* txn_db_; #endif Options options_; + SystemClock* clock_; std::vector column_families_; std::vector column_family_names_; std::atomic new_column_family_name_; @@ -225,6 +246,7 @@ // Fields used for continuous verification from another thread DB* cmp_db_; std::vector cmp_cfhs_; + bool is_db_stopped_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc 2025-05-19 16:14:27.000000000 +0000 @@ -23,11 +23,16 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_driver.h" +#include "rocksdb/convenience.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif namespace ROCKSDB_NAMESPACE { namespace { static std::shared_ptr env_guard; static std::shared_ptr env_wrapper_guard; +static std::shared_ptr fault_env_guard; } // namespace KeyGenContext key_gen_ctx; @@ -41,6 +46,11 @@ SanitizeDoubleParam(&FLAGS_memtable_prefix_bloom_size_ratio); SanitizeDoubleParam(&FLAGS_max_bytes_for_level_multiplier); +#ifndef NDEBUG + if (FLAGS_mock_direct_io) { + SetupSyncPointsToMockDirectIO(); + } +#endif if (FLAGS_statistics) { dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics(); if (FLAGS_test_secondary) { @@ -54,24 +64,64 @@ Env* raw_env; + int env_opts = + !FLAGS_hdfs.empty() + !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty(); + if (env_opts > 1) { + fprintf(stderr, + "Error: --hdfs, --env_uri and --fs_uri are mutually exclusive\n"); + exit(1); + } + if (!FLAGS_hdfs.empty()) { - if (!FLAGS_env_uri.empty()) { - fprintf(stderr, "Cannot specify both --hdfs and --env_uri.\n"); - exit(1); - } raw_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs); - } else if (!FLAGS_env_uri.empty()) { - Status s = Env::LoadEnv(FLAGS_env_uri, &raw_env, &env_guard); - if (raw_env == nullptr) { - fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str()); + } else { + Status s = Env::CreateFromUri(ConfigOptions(), FLAGS_env_uri, FLAGS_fs_uri, + &raw_env, &env_guard); + if (!s.ok()) { + fprintf(stderr, "Error Creating Env URI: %s: %s\n", FLAGS_env_uri.c_str(), + s.ToString().c_str()); exit(1); } - } else { - raw_env = Env::Default(); 
} + +#ifndef NDEBUG + if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection || + FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in || + FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) { + FaultInjectionTestFS* fs = + new FaultInjectionTestFS(raw_env->GetFileSystem()); + fault_fs_guard.reset(fs); + if (FLAGS_write_fault_one_in) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } else { + fault_fs_guard->SetFilesystemDirectWritable(true); + } + fault_env_guard = + std::make_shared(raw_env, fault_fs_guard); + raw_env = fault_env_guard.get(); + } + if (FLAGS_write_fault_one_in) { + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_guard->EnableWriteErrorInjection(); }); + SyncPoint::GetInstance()->EnableProcessing(); + } +#endif + env_wrapper_guard = std::make_shared(raw_env); db_stress_env = env_wrapper_guard.get(); +#ifndef NDEBUG + if (FLAGS_write_fault_one_in) { + // In the write injection case, we need to use the FS interface and returns + // the IOStatus with different error and flags. Therefore, + // DbStressEnvWrapper cannot be used which will swallow the FS + // implementations. We should directly use the raw_env which is the + // CompositeEnvWrapper of env and fault_fs. 
+ db_stress_env = raw_env; + } +#endif + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); // The number of background threads should be at least as much the @@ -92,17 +142,26 @@ "test_batches_snapshots test!\n"); exit(1); } - if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0) { + if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0 && + !FLAGS_memtable_whole_key_filtering) { fprintf(stderr, - "Error: please specify positive prefix_size in order to use " - "memtable_prefix_bloom_size_ratio\n"); + "Error: please specify positive prefix_size or enable whole key " + "filtering in order to use memtable_prefix_bloom_size_ratio\n"); exit(1); } if ((FLAGS_readpercent + FLAGS_prefixpercent + FLAGS_writepercent + - FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent) != 100) { - fprintf(stderr, - "Error: Read+Prefix+Write+Delete+DeleteRange+Iterate percents != " - "100!\n"); + FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent + + FLAGS_customopspercent) != 100) { + fprintf( + stderr, + "Error: " + "Read(-readpercent=%d)+Prefix(-prefixpercent=%d)+Write(-writepercent=%" + "d)+Delete(-delpercent=%d)+DeleteRange(-delrangepercent=%d)" + "+Iterate(-iterpercent=%d)+CustomOps(-customopspercent=%d) percents != " + "100!\n", + FLAGS_readpercent, FLAGS_prefixpercent, FLAGS_writepercent, + FLAGS_delpercent, FLAGS_delrangepercent, FLAGS_iterpercent, + FLAGS_customopspercent); exit(1); } if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) { @@ -195,9 +254,52 @@ "Must set -test_secondary=true if secondary_catch_up_one_in > 0.\n"); exit(1); } + if (FLAGS_best_efforts_recovery && !FLAGS_skip_verifydb && + !FLAGS_disable_wal) { + fprintf(stderr, + "With best-efforts recovery, either skip_verifydb or disable_wal " + "should be set to true.\n"); + exit(1); + } + if (FLAGS_skip_verifydb) { + if (FLAGS_verify_db_one_in > 0) { + fprintf(stderr, + "Must set -verify_db_one_in=0 if skip_verifydb is true.\n"); + exit(1); + 
} + if (FLAGS_continuous_verification_interval > 0) { + fprintf(stderr, + "Must set -continuous_verification_interval=0 if skip_verifydb " + "is true.\n"); + exit(1); + } + } + if (FLAGS_enable_compaction_filter && + (FLAGS_acquire_snapshot_one_in > 0 || FLAGS_compact_range_one_in > 0 || + FLAGS_iterpercent > 0 || FLAGS_test_batches_snapshots || + FLAGS_test_cf_consistency)) { + fprintf( + stderr, + "Error: acquire_snapshot_one_in, compact_range_one_in, iterpercent, " + "test_batches_snapshots must all be 0 when using compaction filter\n"); + exit(1); + } + if (FLAGS_batch_protection_bytes_per_key > 0 && + !FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Error: test_batches_snapshots must be enabled when " + "batch_protection_bytes_per_key > 0\n"); + exit(1); + } + if (FLAGS_test_multi_ops_txns) { + CheckAndSetOptionsForMultiOpsTxnStressTest(); + } - rocksdb_kill_odds = FLAGS_kill_random_test; - rocksdb_kill_prefix_blacklist = SplitString(FLAGS_kill_prefix_blacklist); +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + kp->rocksdb_kill_odds = FLAGS_kill_random_test; + kp->rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes); +#endif unsigned int levels = FLAGS_max_key_len; std::vector weights; @@ -224,7 +326,7 @@ } } else { uint64_t keys_per_level = key_gen_ctx.window / levels; - for (unsigned int level = 0; level < levels - 1; ++level) { + for (unsigned int level = 0; level + 1 < levels; ++level) { key_gen_ctx.weights.emplace_back(keys_per_level); } key_gen_ctx.weights.emplace_back(key_gen_ctx.window - @@ -236,6 +338,8 @@ stress.reset(CreateCfConsistencyStressTest()); } else if (FLAGS_test_batches_snapshots) { stress.reset(CreateBatchedOpsStressTest()); + } else if (FLAGS_test_multi_ops_txns) { + stress.reset(CreateMultiOpsTxnsStressTest()); } else { stress.reset(CreateNonBatchedOpsStressTest()); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,616 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#include "db_stress_tool/expected_state.h" + +#include "db_stress_tool/db_stress_common.h" +#include "db_stress_tool/db_stress_shared_state.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record_result.h" + +namespace ROCKSDB_NAMESPACE { + +ExpectedState::ExpectedState(size_t max_key, size_t num_column_families) + : max_key_(max_key), + num_column_families_(num_column_families), + values_(nullptr) {} + +void ExpectedState::ClearColumnFamily(int cf) { + std::fill(&Value(cf, 0 /* key */), &Value(cf + 1, 0 /* key */), + SharedState::DELETION_SENTINEL); +} + +void ExpectedState::Put(int cf, int64_t key, uint32_t value_base, + bool pending) { + if (!pending) { + // prevent expected-value update from reordering before Write + std::atomic_thread_fence(std::memory_order_release); + } + Value(cf, key).store(pending ? 
SharedState::UNKNOWN_SENTINEL : value_base, + std::memory_order_relaxed); + if (pending) { + // prevent Write from reordering before expected-value update + std::atomic_thread_fence(std::memory_order_release); + } +} + +uint32_t ExpectedState::Get(int cf, int64_t key) const { + return Value(cf, key); +} + +bool ExpectedState::Delete(int cf, int64_t key, bool pending) { + if (Value(cf, key) == SharedState::DELETION_SENTINEL) { + return false; + } + Put(cf, key, SharedState::DELETION_SENTINEL, pending); + return true; +} + +bool ExpectedState::SingleDelete(int cf, int64_t key, bool pending) { + return Delete(cf, key, pending); +} + +int ExpectedState::DeleteRange(int cf, int64_t begin_key, int64_t end_key, + bool pending) { + int covered = 0; + for (int64_t key = begin_key; key < end_key; ++key) { + if (Delete(cf, key, pending)) { + ++covered; + } + } + return covered; +} + +bool ExpectedState::Exists(int cf, int64_t key) { + // UNKNOWN_SENTINEL counts as exists. That assures a key for which overwrite + // is disallowed can't be accidentally added a second time, in which case + // SingleDelete wouldn't be able to properly delete the key. It does allow + // the case where a SingleDelete might be added which covers nothing, but + // that's not a correctness issue. 
+ uint32_t expected_value = Value(cf, key).load(); + return expected_value != SharedState::DELETION_SENTINEL; +} + +void ExpectedState::Reset() { + for (size_t i = 0; i < num_column_families_; ++i) { + for (size_t j = 0; j < max_key_; ++j) { + Delete(static_cast(i), j, false /* pending */); + } + } +} + +FileExpectedState::FileExpectedState(std::string expected_state_file_path, + size_t max_key, size_t num_column_families) + : ExpectedState(max_key, num_column_families), + expected_state_file_path_(expected_state_file_path) {} + +Status FileExpectedState::Open(bool create) { + size_t expected_values_size = GetValuesLen(); + + Env* default_env = Env::Default(); + + Status status; + if (create) { + std::unique_ptr wfile; + const EnvOptions soptions; + status = default_env->NewWritableFile(expected_state_file_path_, &wfile, + soptions); + if (status.ok()) { + std::string buf(expected_values_size, '\0'); + status = wfile->Append(buf); + } + } + if (status.ok()) { + status = default_env->NewMemoryMappedFileBuffer( + expected_state_file_path_, &expected_state_mmap_buffer_); + } + if (status.ok()) { + assert(expected_state_mmap_buffer_->GetLen() == expected_values_size); + values_ = static_cast*>( + expected_state_mmap_buffer_->GetBase()); + assert(values_ != nullptr); + if (create) { + Reset(); + } + } else { + assert(values_ == nullptr); + } + return status; +} + +AnonExpectedState::AnonExpectedState(size_t max_key, size_t num_column_families) + : ExpectedState(max_key, num_column_families) {} + +#ifndef NDEBUG +Status AnonExpectedState::Open(bool create) { +#else +Status AnonExpectedState::Open(bool /* create */) { +#endif + // AnonExpectedState only supports being freshly created. 
+ assert(create); + values_allocation_.reset( + new std::atomic[GetValuesLen() / + sizeof(std::atomic)]); + values_ = &values_allocation_[0]; + Reset(); + return Status::OK(); +} + +ExpectedStateManager::ExpectedStateManager(size_t max_key, + size_t num_column_families) + : max_key_(max_key), + num_column_families_(num_column_families), + latest_(nullptr) {} + +ExpectedStateManager::~ExpectedStateManager() {} + +const std::string FileExpectedStateManager::kLatestBasename = "LATEST"; +const std::string FileExpectedStateManager::kStateFilenameSuffix = ".state"; +const std::string FileExpectedStateManager::kTraceFilenameSuffix = ".trace"; +const std::string FileExpectedStateManager::kTempFilenamePrefix = "."; +const std::string FileExpectedStateManager::kTempFilenameSuffix = ".tmp"; + +FileExpectedStateManager::FileExpectedStateManager( + size_t max_key, size_t num_column_families, + std::string expected_state_dir_path) + : ExpectedStateManager(max_key, num_column_families), + expected_state_dir_path_(std::move(expected_state_dir_path)) { + assert(!expected_state_dir_path_.empty()); +} + +Status FileExpectedStateManager::Open() { + // Before doing anything, sync directory state with ours. That is, determine + // `saved_seqno_`, and create any necessary missing files. 
+ std::vector expected_state_dir_children; + Status s = Env::Default()->GetChildren(expected_state_dir_path_, + &expected_state_dir_children); + bool found_trace = false; + if (s.ok()) { + for (size_t i = 0; i < expected_state_dir_children.size(); ++i) { + const auto& filename = expected_state_dir_children[i]; + if (filename.size() >= kStateFilenameSuffix.size() && + filename.rfind(kStateFilenameSuffix) == + filename.size() - kStateFilenameSuffix.size() && + filename.rfind(kLatestBasename, 0) == std::string::npos) { + SequenceNumber found_seqno = ParseUint64( + filename.substr(0, filename.size() - kStateFilenameSuffix.size())); + if (saved_seqno_ == kMaxSequenceNumber || found_seqno > saved_seqno_) { + saved_seqno_ = found_seqno; + } + } + } + // Check if crash happened after creating state file but before creating + // trace file. + if (saved_seqno_ != kMaxSequenceNumber) { + std::string saved_seqno_trace_path = + GetPathForFilename(ToString(saved_seqno_) + kTraceFilenameSuffix); + Status exists_status = Env::Default()->FileExists(saved_seqno_trace_path); + if (exists_status.ok()) { + found_trace = true; + } else if (exists_status.IsNotFound()) { + found_trace = false; + } else { + s = exists_status; + } + } + } + if (s.ok() && saved_seqno_ != kMaxSequenceNumber && !found_trace) { + // Create an empty trace file so later logic does not need to distinguish + // missing vs. empty trace file. 
+ std::unique_ptr wfile; + const EnvOptions soptions; + std::string saved_seqno_trace_path = + GetPathForFilename(ToString(saved_seqno_) + kTraceFilenameSuffix); + s = Env::Default()->NewWritableFile(saved_seqno_trace_path, &wfile, + soptions); + } + + if (s.ok()) { + s = Clean(); + } + + std::string expected_state_file_path = + GetPathForFilename(kLatestBasename + kStateFilenameSuffix); + bool found = false; + if (s.ok()) { + Status exists_status = Env::Default()->FileExists(expected_state_file_path); + if (exists_status.ok()) { + found = true; + } else if (exists_status.IsNotFound()) { + found = false; + } else { + s = exists_status; + } + } + + if (!found) { + // Initialize the file in a temp path and then rename it. That way, in case + // this process is killed during setup, `Clean()` will take care of removing + // the incomplete expected values file. + std::string temp_expected_state_file_path = + GetTempPathForFilename(kLatestBasename + kStateFilenameSuffix); + FileExpectedState temp_expected_state(temp_expected_state_file_path, + max_key_, num_column_families_); + if (s.ok()) { + s = temp_expected_state.Open(true /* create */); + } + if (s.ok()) { + s = Env::Default()->RenameFile(temp_expected_state_file_path, + expected_state_file_path); + } + } + + if (s.ok()) { + latest_.reset(new FileExpectedState(std::move(expected_state_file_path), + max_key_, num_column_families_)); + s = latest_->Open(false /* create */); + } + return s; +} + +#ifndef ROCKSDB_LITE +Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { + SequenceNumber seqno = db->GetLatestSequenceNumber(); + + std::string state_filename = ToString(seqno) + kStateFilenameSuffix; + std::string state_file_temp_path = GetTempPathForFilename(state_filename); + std::string state_file_path = GetPathForFilename(state_filename); + + std::string latest_file_path = + GetPathForFilename(kLatestBasename + kStateFilenameSuffix); + + std::string trace_filename = ToString(seqno) + kTraceFilenameSuffix; + 
std::string trace_file_path = GetPathForFilename(trace_filename); + + // Populate a tempfile and then rename it to atomically create ".state" + // with contents from "LATEST.state" + Status s = + CopyFile(FileSystem::Default(), latest_file_path, state_file_temp_path, + 0 /* size */, false /* use_fsync */); + if (s.ok()) { + s = FileSystem::Default()->RenameFile(state_file_temp_path, state_file_path, + IOOptions(), nullptr /* dbg */); + } + SequenceNumber old_saved_seqno = 0; + if (s.ok()) { + old_saved_seqno = saved_seqno_; + saved_seqno_ = seqno; + } + + // If there is a crash now, i.e., after ".state" was created but before + // ".trace" is created, it will be treated as if ".trace" were + // present but empty. + + // Create ".trace" directly. It is initially empty so no need for + // tempfile. + std::unique_ptr trace_writer; + if (s.ok()) { + EnvOptions soptions; + // Disable buffering so traces will not get stuck in application buffer. + soptions.writable_file_max_buffer_size = 0; + s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path, + &trace_writer); + } + if (s.ok()) { + TraceOptions trace_opts; + trace_opts.filter |= kTraceFilterGet; + trace_opts.filter |= kTraceFilterMultiGet; + trace_opts.filter |= kTraceFilterIteratorSeek; + trace_opts.filter |= kTraceFilterIteratorSeekForPrev; + trace_opts.preserve_write_order = true; + s = db->StartTrace(trace_opts, std::move(trace_writer)); + } + + // Delete old state/trace files. Deletion order does not matter since we only + // delete after successfully saving new files, so old files will never be used + // again, even if we crash. 
+ if (s.ok() && old_saved_seqno != kMaxSequenceNumber && + old_saved_seqno != saved_seqno_) { + s = Env::Default()->DeleteFile( + GetPathForFilename(ToString(old_saved_seqno) + kStateFilenameSuffix)); + } + if (s.ok() && old_saved_seqno != kMaxSequenceNumber && + old_saved_seqno != saved_seqno_) { + s = Env::Default()->DeleteFile( + GetPathForFilename(ToString(old_saved_seqno) + kTraceFilenameSuffix)); + } + return s; +} +#else // ROCKSDB_LITE +Status FileExpectedStateManager::SaveAtAndAfter(DB* /* db */) { + return Status::NotSupported(); +} +#endif // ROCKSDB_LITE + +bool FileExpectedStateManager::HasHistory() { + return saved_seqno_ != kMaxSequenceNumber; +} + +#ifndef ROCKSDB_LITE + +namespace { + +// An `ExpectedStateTraceRecordHandler` applies a configurable number of +// write operation trace records to the configured expected state. It is used in +// `FileExpectedStateManager::Restore()` to sync the expected state with the +// DB's post-recovery state. +class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, + public WriteBatch::Handler { + public: + ExpectedStateTraceRecordHandler(uint64_t max_write_ops, ExpectedState* state) + : max_write_ops_(max_write_ops), state_(state) {} + + ~ExpectedStateTraceRecordHandler() { assert(IsDone()); } + + // True if we have already reached the limit on write operations to apply. + bool IsDone() { return num_write_ops_ == max_write_ops_; } + + Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr* /* result */) override { + if (IsDone()) { + return Status::OK(); + } + WriteBatch batch(record.GetWriteBatchRep().ToString()); + return batch.Iterate(this); + } + + // Ignore reads. + Status Handle(const GetQueryTraceRecord& /* record */, + std::unique_ptr* /* result */) override { + return Status::OK(); + } + + // Ignore reads. + Status Handle(const IteratorSeekQueryTraceRecord& /* record */, + std::unique_ptr* /* result */) override { + return Status::OK(); + } + + // Ignore reads. 
+ Status Handle(const MultiGetQueryTraceRecord& /* record */, + std::unique_ptr* /* result */) override { + return Status::OK(); + } + + // Below are the WriteBatch::Handler overrides. We could use a separate + // object, but it's convenient and works to share state with the + // `TraceRecord::Handler`. + + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + uint64_t key_id; + if (!GetIntVal(key.ToString(), &key_id)) { + return Status::Corruption("unable to parse key", key.ToString()); + } + uint32_t value_id = GetValueBase(value); + + state_->Put(column_family_id, static_cast(key_id), value_id, + false /* pending */); + ++num_write_ops_; + return Status::OK(); + } + + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + uint64_t key_id; + if (!GetIntVal(key.ToString(), &key_id)) { + return Status::Corruption("unable to parse key", key.ToString()); + } + + state_->Delete(column_family_id, static_cast(key_id), + false /* pending */); + ++num_write_ops_; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + return DeleteCF(column_family_id, key); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + uint64_t begin_key_id, end_key_id; + if (!GetIntVal(begin_key.ToString(), &begin_key_id)) { + return Status::Corruption("unable to parse begin key", + begin_key.ToString()); + } + if (!GetIntVal(end_key.ToString(), &end_key_id)) { + return Status::Corruption("unable to parse end key", end_key.ToString()); + } + + state_->DeleteRange(column_family_id, static_cast(begin_key_id), + static_cast(end_key_id), false /* pending */); + ++num_write_ops_; + return Status::OK(); + } + + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + return PutCF(column_family_id, key, value); + } + + private: + uint64_t num_write_ops_ = 0; + uint64_t max_write_ops_; + 
ExpectedState* state_; +}; + +} // anonymous namespace + +Status FileExpectedStateManager::Restore(DB* db) { + assert(HasHistory()); + SequenceNumber seqno = db->GetLatestSequenceNumber(); + if (seqno < saved_seqno_) { + return Status::Corruption("DB is older than any restorable expected state"); + } + + std::string state_filename = ToString(saved_seqno_) + kStateFilenameSuffix; + std::string state_file_path = GetPathForFilename(state_filename); + + std::string latest_file_temp_path = + GetTempPathForFilename(kLatestBasename + kStateFilenameSuffix); + std::string latest_file_path = + GetPathForFilename(kLatestBasename + kStateFilenameSuffix); + + std::string trace_filename = ToString(saved_seqno_) + kTraceFilenameSuffix; + std::string trace_file_path = GetPathForFilename(trace_filename); + + std::unique_ptr trace_reader; + Status s = NewFileTraceReader(Env::Default(), EnvOptions(), trace_file_path, + &trace_reader); + + if (s.ok()) { + // We are going to replay on top of "`seqno`.state" to create a new + // "LATEST.state". Start off by creating a tempfile so we can later make the + // new "LATEST.state" appear atomically using `RenameFile()`. + s = CopyFile(FileSystem::Default(), state_file_path, latest_file_temp_path, + 0 /* size */, false /* use_fsync */); + } + + { + std::unique_ptr replayer; + std::unique_ptr state; + std::unique_ptr handler; + if (s.ok()) { + state.reset(new FileExpectedState(latest_file_temp_path, max_key_, + num_column_families_)); + s = state->Open(false /* create */); + } + if (s.ok()) { + handler.reset(new ExpectedStateTraceRecordHandler(seqno - saved_seqno_, + state.get())); + // TODO(ajkr): An API limitation requires we provide `handles` although + // they will be unused since we only use the replayer for reading records. + // Just give a default CFH for now to satisfy the requirement. 
+ s = db->NewDefaultReplayer({db->DefaultColumnFamily()} /* handles */, + std::move(trace_reader), &replayer); + } + + if (s.ok()) { + s = replayer->Prepare(); + } + for (;;) { + std::unique_ptr record; + s = replayer->Next(&record); + if (!s.ok()) { + break; + } + std::unique_ptr res; + record->Accept(handler.get(), &res); + } + if (s.IsCorruption() && handler->IsDone()) { + // There could be a corruption reading the tail record of the trace due to + // `db_stress` crashing while writing it. It shouldn't matter as long as + // we already found all the write ops we need to catch up the expected + // state. + s = Status::OK(); + } + if (s.IsIncomplete()) { + // OK because `Status::Incomplete` is expected upon finishing all the + // trace records. + s = Status::OK(); + } + } + + if (s.ok()) { + s = FileSystem::Default()->RenameFile(latest_file_temp_path, + latest_file_path, IOOptions(), + nullptr /* dbg */); + } + if (s.ok()) { + latest_.reset(new FileExpectedState(latest_file_path, max_key_, + num_column_families_)); + s = latest_->Open(false /* create */); + } + + // Delete old state/trace files. We must delete the state file first. + // Otherwise, a crash-recovery immediately after deleting the trace file could + // lead to `Restore()` unable to replay to `seqno`. + if (s.ok()) { + s = Env::Default()->DeleteFile(state_file_path); + } + if (s.ok()) { + saved_seqno_ = kMaxSequenceNumber; + s = Env::Default()->DeleteFile(trace_file_path); + } + return s; +} +#else // ROCKSDB_LITE +Status FileExpectedStateManager::Restore(DB* /* db */) { + return Status::NotSupported(); +} +#endif // ROCKSDB_LITE + +Status FileExpectedStateManager::Clean() { + std::vector expected_state_dir_children; + Status s = Env::Default()->GetChildren(expected_state_dir_path_, + &expected_state_dir_children); + // An incomplete `Open()` or incomplete `SaveAtAndAfter()` could have left + // behind invalid temporary files. 
An incomplete `SaveAtAndAfter()` could have + // also left behind stale state/trace files. An incomplete `Restore()` could + // have left behind stale trace files. + for (size_t i = 0; s.ok() && i < expected_state_dir_children.size(); ++i) { + const auto& filename = expected_state_dir_children[i]; + if (filename.rfind(kTempFilenamePrefix, 0 /* pos */) == 0 && + filename.size() >= kTempFilenameSuffix.size() && + filename.rfind(kTempFilenameSuffix) == + filename.size() - kTempFilenameSuffix.size()) { + // Delete all temp files. + s = Env::Default()->DeleteFile(GetPathForFilename(filename)); + } else if (filename.size() >= kStateFilenameSuffix.size() && + filename.rfind(kStateFilenameSuffix) == + filename.size() - kStateFilenameSuffix.size() && + filename.rfind(kLatestBasename, 0) == std::string::npos && + ParseUint64(filename.substr( + 0, filename.size() - kStateFilenameSuffix.size())) < + saved_seqno_) { + assert(saved_seqno_ != kMaxSequenceNumber); + // Delete stale state files. + s = Env::Default()->DeleteFile(GetPathForFilename(filename)); + } else if (filename.size() >= kTraceFilenameSuffix.size() && + filename.rfind(kTraceFilenameSuffix) == + filename.size() - kTraceFilenameSuffix.size() && + ParseUint64(filename.substr( + 0, filename.size() - kTraceFilenameSuffix.size())) < + saved_seqno_) { + // Delete stale trace files. + s = Env::Default()->DeleteFile(GetPathForFilename(filename)); + } + } + return s; +} + +std::string FileExpectedStateManager::GetTempPathForFilename( + const std::string& filename) { + assert(!expected_state_dir_path_.empty()); + std::string expected_state_dir_path_slash = + expected_state_dir_path_.back() == '/' ? 
expected_state_dir_path_ + : expected_state_dir_path_ + "/"; + return expected_state_dir_path_slash + kTempFilenamePrefix + filename + + kTempFilenameSuffix; +} + +std::string FileExpectedStateManager::GetPathForFilename( + const std::string& filename) { + assert(!expected_state_dir_path_.empty()); + std::string expected_state_dir_path_slash = + expected_state_dir_path_.back() == '/' ? expected_state_dir_path_ + : expected_state_dir_path_ + "/"; + return expected_state_dir_path_slash + filename; +} + +AnonExpectedStateManager::AnonExpectedStateManager(size_t max_key, + size_t num_column_families) + : ExpectedStateManager(max_key, num_column_families) {} + +Status AnonExpectedStateManager::Open() { + latest_.reset(new AnonExpectedState(max_key_, num_column_families_)); + return latest_->Open(true /* create */); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,287 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#pragma once + +#include + +#include +#include + +#include "db/dbformat.h" +#include "file/file_util.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// An `ExpectedState` provides read/write access to expected values for every +// key. 
+class ExpectedState { + public: + explicit ExpectedState(size_t max_key, size_t num_column_families); + + virtual ~ExpectedState() {} + + // Requires external locking preventing concurrent execution with any other + // member function. + virtual Status Open(bool create) = 0; + + // Requires external locking covering all keys in `cf`. + void ClearColumnFamily(int cf); + + // @param pending True if the update may have started but is not yet + // guaranteed finished. This is useful for crash-recovery testing when the + // process may crash before updating the expected values array. + // + // Requires external locking covering `key` in `cf`. + void Put(int cf, int64_t key, uint32_t value_base, bool pending); + + // Requires external locking covering `key` in `cf`. + uint32_t Get(int cf, int64_t key) const; + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. + bool Delete(int cf, int64_t key, bool pending); + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. + bool SingleDelete(int cf, int64_t key, bool pending); + + // @param pending See comment above Put() + // Returns number of keys deleted by the call. + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. + int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending); + + // Requires external locking covering `key` in `cf`. + bool Exists(int cf, int64_t key); + + private: + // Requires external locking covering `key` in `cf`. 
+ std::atomic& Value(int cf, int64_t key) const { + return values_[cf * max_key_ + key]; + } + + const size_t max_key_; + const size_t num_column_families_; + + protected: + size_t GetValuesLen() const { + return sizeof(std::atomic) * num_column_families_ * max_key_; + } + + // Requires external locking preventing concurrent execution with any other + // member function. + void Reset(); + + std::atomic* values_; +}; + +// A `FileExpectedState` implements `ExpectedState` backed by a file. +class FileExpectedState : public ExpectedState { + public: + explicit FileExpectedState(std::string expected_state_file_path, + size_t max_key, size_t num_column_families); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open(bool create) override; + + private: + const std::string expected_state_file_path_; + std::unique_ptr expected_state_mmap_buffer_; +}; + +// An `AnonExpectedState` implements `ExpectedState` backed by a memory +// allocation. +class AnonExpectedState : public ExpectedState { + public: + explicit AnonExpectedState(size_t max_key, size_t num_column_families); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open(bool create) override; + + private: + std::unique_ptr[]> values_allocation_; +}; + +// An `ExpectedStateManager` manages data about the expected state of the +// database. It exposes operations for reading and modifying the latest +// expected state. +class ExpectedStateManager { + public: + explicit ExpectedStateManager(size_t max_key, size_t num_column_families); + + virtual ~ExpectedStateManager(); + + // Requires external locking preventing concurrent execution with any other + // member function. + virtual Status Open() = 0; + + // Saves expected values for the current state of `db` and begins tracking + // changes. 
Following a successful `SaveAtAndAfter()`, `Restore()` can be + // called on the same DB, as long as its state does not roll back to before + // its current state. + // + // Requires external locking preventing concurrent execution with any other + // member function. Furthermore, `db` must not be mutated while this function + // is executing. + virtual Status SaveAtAndAfter(DB* db) = 0; + + // Returns true if at least one state of historical expected values can be + // restored. + // + // Requires external locking preventing concurrent execution with any other + // member function. + virtual bool HasHistory() = 0; + + // Restores expected values according to the current state of `db`. See + // `SaveAtAndAfter()` for conditions where this can be called. + // + // Requires external locking preventing concurrent execution with any other + // member function. Furthermore, `db` must not be mutated while this function + // is executing. + virtual Status Restore(DB* db) = 0; + + // Requires external locking covering all keys in `cf`. + void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); } + + // @param pending True if the update may have started but is not yet + // guaranteed finished. This is useful for crash-recovery testing when the + // process may crash before updating the expected values array. + // + // Requires external locking covering `key` in `cf`. + void Put(int cf, int64_t key, uint32_t value_base, bool pending) { + return latest_->Put(cf, key, value_base, pending); + } + + // Requires external locking covering `key` in `cf`. + uint32_t Get(int cf, int64_t key) const { return latest_->Get(cf, key); } + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. 
+ bool Delete(int cf, int64_t key, bool pending) { + return latest_->Delete(cf, key, pending); + } + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. + bool SingleDelete(int cf, int64_t key, bool pending) { + return latest_->SingleDelete(cf, key, pending); + } + + // @param pending See comment above Put() + // Returns number of keys deleted by the call. + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. + int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) { + return latest_->DeleteRange(cf, begin_key, end_key, pending); + } + + // Requires external locking covering `key` in `cf`. + bool Exists(int cf, int64_t key) { return latest_->Exists(cf, key); } + + protected: + const size_t max_key_; + const size_t num_column_families_; + std::unique_ptr latest_; +}; + +// A `FileExpectedStateManager` implements an `ExpectedStateManager` backed by +// a directory of files containing data about the expected state of the +// database. +class FileExpectedStateManager : public ExpectedStateManager { + public: + explicit FileExpectedStateManager(size_t max_key, size_t num_column_families, + std::string expected_state_dir_path); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open() override; + + // See `ExpectedStateManager::SaveAtAndAfter()` API doc. + // + // This implementation makes a copy of "LATEST.state" into + // ".state", and starts a trace in ".trace". + // Due to using external files, a following `Restore()` can happen even + // from a different process. + Status SaveAtAndAfter(DB* db) override; + + // See `ExpectedStateManager::HasHistory()` API doc. + bool HasHistory() override; + + // See `ExpectedStateManager::Restore()` API doc. + // + // Say `db->GetLatestSequenceNumber()` was `a` last time `SaveAtAndAfter()` + // was called and now it is `b`. 
Then this function replays `b - a` write + // operations from "`a`.trace" onto "`a`.state", and then copies the resulting + // file into "LATEST.state". + Status Restore(DB* db) override; + + private: + // Requires external locking preventing concurrent execution with any other + // member function. + Status Clean(); + + std::string GetTempPathForFilename(const std::string& filename); + std::string GetPathForFilename(const std::string& filename); + + static const std::string kLatestBasename; + static const std::string kStateFilenameSuffix; + static const std::string kTraceFilenameSuffix; + static const std::string kTempFilenamePrefix; + static const std::string kTempFilenameSuffix; + + const std::string expected_state_dir_path_; + SequenceNumber saved_seqno_ = kMaxSequenceNumber; +}; + +// An `AnonExpectedStateManager` implements an `ExpectedStateManager` backed by +// a memory allocation containing data about the expected state of the database. +class AnonExpectedStateManager : public ExpectedStateManager { + public: + explicit AnonExpectedStateManager(size_t max_key, size_t num_column_families); + + // See `ExpectedStateManager::SaveAtAndAfter()` API doc. + // + // This implementation returns `Status::NotSupported` since we do not + // currently have a need to keep history of expected state within a process. + Status SaveAtAndAfter(DB* /* db */) override { + return Status::NotSupported(); + } + + // See `ExpectedStateManager::HasHistory()` API doc. + bool HasHistory() override { return false; } + + // See `ExpectedStateManager::Restore()` API doc. + // + // This implementation returns `Status::NotSupported` since we do not + // currently have a need to keep history of expected state within a process. + Status Restore(DB* /* db */) override { return Status::NotSupported(); } + + // Requires external locking preventing concurrent execution with any other + // member function. 
+ Status Open() override; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1037 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef GFLAGS +#include "db_stress_tool/multi_ops_txns_stress.h" + +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/defer.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif // NDEBUG + +namespace ROCKSDB_NAMESPACE { + +// TODO: move these to gflags. 
+static constexpr uint32_t kInitNumC = 1000; +#ifndef ROCKSDB_LITE +static constexpr uint32_t kInitialCARatio = 3; +#endif // ROCKSDB_LITE +static constexpr bool kDoPreload = true; + +std::string MultiOpsTxnsStressTest::Record::EncodePrimaryKey(uint32_t a) { + char buf[8]; + EncodeFixed32(buf, kPrimaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, a); + std::reverse(buf + 4, buf + 8); + return std::string(buf, sizeof(buf)); +} + +std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c) { + char buf[8]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c); + std::reverse(buf + 4, buf + 8); + return std::string(buf, sizeof(buf)); +} + +std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c, + uint32_t a) { + char buf[12]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c); + EncodeFixed32(buf + 8, a); + std::reverse(buf + 4, buf + 8); + std::reverse(buf + 8, buf + 12); + return std::string(buf, sizeof(buf)); +} + +std::tuple +MultiOpsTxnsStressTest::Record::DecodePrimaryIndexValue( + Slice primary_index_value) { + if (primary_index_value.size() != 8) { + return std::tuple{Status::Corruption(""), 0, 0}; + } + uint32_t b = 0; + uint32_t c = 0; + if (!GetFixed32(&primary_index_value, &b) || + !GetFixed32(&primary_index_value, &c)) { + assert(false); + return std::tuple{Status::Corruption(""), 0, 0}; + } + return std::tuple{Status::OK(), b, c}; +} + +std::pair +MultiOpsTxnsStressTest::Record::DecodeSecondaryIndexValue( + Slice secondary_index_value) { + if (secondary_index_value.size() != 4) { + return std::make_pair(Status::Corruption(""), 0); + } + uint32_t crc = 0; + bool result __attribute__((unused)) = + GetFixed32(&secondary_index_value, &crc); + assert(result); + return std::make_pair(Status::OK(), crc); +} + +std::pair +MultiOpsTxnsStressTest::Record::EncodePrimaryIndexEntry() const { + std::string primary_index_key 
= EncodePrimaryKey(); + std::string primary_index_value = EncodePrimaryIndexValue(); + return std::make_pair(primary_index_key, primary_index_value); +} + +std::string MultiOpsTxnsStressTest::Record::EncodePrimaryKey() const { + return EncodePrimaryKey(a_); +} + +std::string MultiOpsTxnsStressTest::Record::EncodePrimaryIndexValue() const { + char buf[8]; + EncodeFixed32(buf, b_); + EncodeFixed32(buf + 4, c_); + return std::string(buf, sizeof(buf)); +} + +std::pair +MultiOpsTxnsStressTest::Record::EncodeSecondaryIndexEntry() const { + std::string secondary_index_key; + char buf[12]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c_); + EncodeFixed32(buf + 8, a_); + std::reverse(buf + 4, buf + 8); + std::reverse(buf + 8, buf + 12); + secondary_index_key.assign(buf, sizeof(buf)); + + // Secondary index value is always 4-byte crc32 of the secondary key + std::string secondary_index_value; + uint32_t crc = crc32c::Value(buf, sizeof(buf)); + PutFixed32(&secondary_index_value, crc); + return std::make_pair(secondary_index_key, secondary_index_value); +} + +std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey() const { + char buf[12]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c_); + EncodeFixed32(buf + 8, a_); + std::reverse(buf + 4, buf + 8); + std::reverse(buf + 8, buf + 12); + return std::string(buf, sizeof(buf)); +} + +Status MultiOpsTxnsStressTest::Record::DecodePrimaryIndexEntry( + Slice primary_index_key, Slice primary_index_value) { + if (primary_index_key.size() != 8) { + assert(false); + return Status::Corruption("Primary index key length is not 8"); + } + + const char* const index_id_buf = primary_index_key.data(); + uint32_t index_id = + static_cast(static_cast(index_id_buf[0])) << 24; + index_id += static_cast(static_cast(index_id_buf[1])) + << 16; + index_id += static_cast(static_cast(index_id_buf[2])) + << 8; + index_id += + 
static_cast(static_cast(index_id_buf[3])); + primary_index_key.remove_prefix(sizeof(uint32_t)); + if (index_id != kPrimaryIndexId) { + std::ostringstream oss; + oss << "Unexpected primary index id: " << index_id; + return Status::Corruption(oss.str()); + } + + const char* const buf = primary_index_key.data(); + a_ = static_cast(static_cast(buf[0])) << 24; + a_ += static_cast(static_cast(buf[1])) << 16; + a_ += static_cast(static_cast(buf[2])) << 8; + a_ += static_cast(static_cast(buf[3])); + + if (primary_index_value.size() != 8) { + return Status::Corruption("Primary index value length is not 8"); + } + GetFixed32(&primary_index_value, &b_); + GetFixed32(&primary_index_value, &c_); + return Status::OK(); +} + +Status MultiOpsTxnsStressTest::Record::DecodeSecondaryIndexEntry( + Slice secondary_index_key, Slice secondary_index_value) { + if (secondary_index_key.size() != 12) { + return Status::Corruption("Secondary index key length is not 12"); + } + uint32_t crc = + crc32c::Value(secondary_index_key.data(), secondary_index_key.size()); + + const char* const index_id_buf = secondary_index_key.data(); + uint32_t index_id = + static_cast(static_cast(index_id_buf[0])) << 24; + index_id += static_cast(static_cast(index_id_buf[1])) + << 16; + index_id += static_cast(static_cast(index_id_buf[2])) + << 8; + index_id += + static_cast(static_cast(index_id_buf[3])); + secondary_index_key.remove_prefix(sizeof(uint32_t)); + if (index_id != kSecondaryIndexId) { + std::ostringstream oss; + oss << "Unexpected secondary index id: " << index_id; + return Status::Corruption(oss.str()); + } + + const char* const buf = secondary_index_key.data(); + assert(secondary_index_key.size() == 8); + c_ = static_cast(static_cast(buf[0])) << 24; + c_ += static_cast(static_cast(buf[1])) << 16; + c_ += static_cast(static_cast(buf[2])) << 8; + c_ += static_cast(static_cast(buf[3])); + + a_ = static_cast(static_cast(buf[4])) << 24; + a_ += static_cast(static_cast(buf[5])) << 16; + a_ += 
static_cast(static_cast(buf[6])) << 8; + a_ += static_cast(static_cast(buf[7])); + + if (secondary_index_value.size() != 4) { + return Status::Corruption("Secondary index value length is not 4"); + } + uint32_t val = 0; + GetFixed32(&secondary_index_value, &val); + if (val != crc) { + std::ostringstream oss; + oss << "Secondary index key checksum mismatch, stored: " << val + << ", recomputed: " << crc; + return Status::Corruption(oss.str()); + } + return Status::OK(); +} + +void MultiOpsTxnsStressTest::FinishInitDb(SharedState* shared) { + if (FLAGS_enable_compaction_filter) { + // TODO (yanqin) enable compaction filter + } + if (kDoPreload) { + ReopenAndPreloadDb(shared); + } +} + +void MultiOpsTxnsStressTest::ReopenAndPreloadDb(SharedState* shared) { + (void)shared; +#ifndef ROCKSDB_LITE + std::vector cf_descs; + for (const auto* handle : column_families_) { + cf_descs.emplace_back(handle->GetName(), ColumnFamilyOptions(options_)); + } + CancelAllBackgroundWork(db_, /*wait=*/true); + for (auto* handle : column_families_) { + delete handle; + } + column_families_.clear(); + delete db_; + db_ = nullptr; + txn_db_ = nullptr; + + TransactionDBOptions txn_db_opts; + txn_db_opts.skip_concurrency_control = true; // speed-up preloading + Status s = TransactionDB::Open(options_, txn_db_opts, FLAGS_db, cf_descs, + &column_families_, &txn_db_); + if (s.ok()) { + db_ = txn_db_; + } else { + fprintf(stderr, "Failed to open db: %s\n", s.ToString().c_str()); + exit(1); + } + + PreloadDb(shared, kInitNumC); + + // Reopen + CancelAllBackgroundWork(db_, /*wait=*/true); + for (auto* handle : column_families_) { + delete handle; + } + column_families_.clear(); + s = db_->Close(); + if (!s.ok()) { + fprintf(stderr, "Error during closing db: %s\n", s.ToString().c_str()); + exit(1); + } + delete db_; + db_ = nullptr; + txn_db_ = nullptr; + + Open(); +#endif // !ROCKSDB_LITE +} + +// Used for point-lookup transaction +Status MultiOpsTxnsStressTest::TestGet( + ThreadState* thread, const 
ReadOptions& read_opts, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/) { + uint32_t a = ChooseA(thread); + return PointLookupTxn(thread, read_opts, a); +} + +// Not used. +std::vector MultiOpsTxnsStressTest::TestMultiGet( + ThreadState* /*thread*/, const ReadOptions& /*read_opts*/, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/) { + return std::vector{Status::NotSupported()}; +} + +Status MultiOpsTxnsStressTest::TestPrefixScan( + ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + (void)thread; + (void)read_opts; + (void)rand_column_families; + (void)rand_keys; + return Status::OK(); +} + +// Given a key K, this creates an iterator which scans to K and then +// does a random sequence of Next/Prev operations. +Status MultiOpsTxnsStressTest::TestIterate( + ThreadState* thread, const ReadOptions& read_opts, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/) { + uint32_t c = thread->rand.Next() % kInitNumC; + return RangeScanTxn(thread, read_opts, c); +} + +// Not intended for use. +Status MultiOpsTxnsStressTest::TestPut(ThreadState* /*thread*/, + WriteOptions& /*write_opts*/, + const ReadOptions& /*read_opts*/, + const std::vector& /*cf_ids*/, + const std::vector& /*keys*/, + char (&value)[100], + std::unique_ptr& /*lock*/) { + (void)value; + return Status::NotSupported(); +} + +// Not intended for use. +Status MultiOpsTxnsStressTest::TestDelete( + ThreadState* /*thread*/, WriteOptions& /*write_opts*/, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/, + std::unique_ptr& /*lock*/) { + return Status::NotSupported(); +} + +// Not intended for use. 
+Status MultiOpsTxnsStressTest::TestDeleteRange( + ThreadState* /*thread*/, WriteOptions& /*write_opts*/, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/, + std::unique_ptr& /*lock*/) { + return Status::NotSupported(); +} + +void MultiOpsTxnsStressTest::TestIngestExternalFile( + ThreadState* thread, const std::vector& rand_column_families, + const std::vector& /*rand_keys*/, + std::unique_ptr& /*lock*/) { + // TODO (yanqin) + (void)thread; + (void)rand_column_families; +} + +void MultiOpsTxnsStressTest::TestCompactRange( + ThreadState* thread, int64_t /*rand_key*/, const Slice& /*start_key*/, + ColumnFamilyHandle* column_family) { + // TODO (yanqin). + // May use GetRangeHash() for validation before and after DB::CompactRange() + // completes. + (void)thread; + (void)column_family; +} + +Status MultiOpsTxnsStressTest::TestBackupRestore( + ThreadState* thread, const std::vector& rand_column_families, + const std::vector& /*rand_keys*/) { + // TODO (yanqin) + (void)thread; + (void)rand_column_families; + return Status::OK(); +} + +Status MultiOpsTxnsStressTest::TestCheckpoint( + ThreadState* thread, const std::vector& rand_column_families, + const std::vector& /*rand_keys*/) { + // TODO (yanqin) + (void)thread; + (void)rand_column_families; + return Status::OK(); +} + +#ifndef ROCKSDB_LITE +Status MultiOpsTxnsStressTest::TestApproximateSize( + ThreadState* thread, uint64_t iteration, + const std::vector& rand_column_families, + const std::vector& /*rand_keys*/) { + // TODO (yanqin) + (void)thread; + (void)iteration; + (void)rand_column_families; + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +Status MultiOpsTxnsStressTest::TestCustomOperations( + ThreadState* thread, const std::vector& rand_column_families) { + (void)rand_column_families; + // Randomly choose from 0, 1, and 2. + // TODO (yanqin) allow user to configure probability of each operation. 
+ uint32_t rand = thread->rand.Uniform(3); + Status s; + if (0 == rand) { + // Update primary key. + uint32_t old_a = ChooseA(thread); + uint32_t new_a = GenerateNextA(); + s = PrimaryKeyUpdateTxn(thread, old_a, new_a); + } else if (1 == rand) { + // Update secondary key. + uint32_t old_c = thread->rand.Next() % kInitNumC; + int count = 0; + uint32_t new_c = 0; + do { + ++count; + new_c = thread->rand.Next() % kInitNumC; + } while (count < 100 && new_c == old_c); + if (count >= 100) { + // If we reach here, it means our random number generator has a serious + // problem, or kInitNumC is chosen poorly. + std::terminate(); + } + s = SecondaryKeyUpdateTxn(thread, old_c, new_c); + } else if (2 == rand) { + // Update primary index value. + uint32_t a = ChooseA(thread); + s = UpdatePrimaryIndexValueTxn(thread, a, /*b_delta=*/1); + } else { + // Should never reach here. + assert(false); + } + return s; +} + +Status MultiOpsTxnsStressTest::PrimaryKeyUpdateTxn(ThreadState* thread, + uint32_t old_a, + uint32_t new_a) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)old_a; + (void)new_a; + return Status::NotSupported(); +#else + std::string old_pk = Record::EncodePrimaryKey(old_a); + std::string new_pk = Record::EncodePrimaryKey(new_a); + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + txn->SetSnapshotOnNextOperation(/*notifier=*/nullptr); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + // Two gets, one for existing pk, one for locking potential new pk. + thread->stats.AddGets(/*ngets=*/2, /*nfounds=*/1); + thread->stats.AddDeletes(1); + thread->stats.AddBytesForWrites( + /*nwrites=*/2, + Record::kPrimaryIndexEntrySize + Record::kSecondaryIndexEntrySize); + thread->stats.AddSingleDeletes(1); + return; + } + if (s.IsNotFound()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0); + } else if (s.IsBusy()) { + // ignore. 
+ } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + + ReadOptions ropts; + std::string value; + s = txn->GetForUpdate(ropts, old_pk, &value); + if (!s.ok()) { + return s; + } + std::string empty_value; + s = txn->GetForUpdate(ropts, new_pk, &empty_value); + if (s.ok()) { + assert(!empty_value.empty()); + s = Status::Busy(); + return s; + } + + auto result = Record::DecodePrimaryIndexValue(value); + s = std::get<0>(result); + if (!s.ok()) { + return s; + } + uint32_t b = std::get<1>(result); + uint32_t c = std::get<2>(result); + + ColumnFamilyHandle* cf = db_->DefaultColumnFamily(); + s = txn->Delete(cf, old_pk, /*assume_tracked=*/true); + if (!s.ok()) { + return s; + } + s = txn->Put(cf, new_pk, value, /*assume_tracked=*/true); + if (!s.ok()) { + return s; + } + + auto* wb = txn->GetWriteBatch(); + assert(wb); + + std::string old_sk = Record::EncodeSecondaryKey(c, old_a); + s = wb->SingleDelete(old_sk); + if (!s.ok()) { + return s; + } + + Record record(new_a, b, c); + std::string new_sk; + std::string new_crc; + std::tie(new_sk, new_crc) = record.EncodeSecondaryIndexEntry(); + s = wb->Put(new_sk, new_crc); + if (!s.ok()) { + return s; + } + + s = CommitTxn(txn); + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::SecondaryKeyUpdateTxn(ThreadState* thread, + uint32_t old_c, + uint32_t new_c) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)old_c; + (void)new_c; + return Status::NotSupported(); +#else + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + Iterator* it = nullptr; + long iterations = 0; + const Defer cleanup([&s, thread, &it, txn, this, &iterations]() { + delete it; + if (s.ok()) { + thread->stats.AddIterations(iterations); + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); + thread->stats.AddSingleDeletes(1); + thread->stats.AddBytesForWrites( + 
/*nwrites=*/2, + Record::kPrimaryIndexEntrySize + Record::kSecondaryIndexEntrySize); + return; + } else if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() || + s.IsMergeInProgress()) { + // ww-conflict detected, or + // lock cannot be acquired, or + // memtable history is not large enough for conflict checking, or + // Merge operation cannot be resolved. + // TODO (yanqin) add stats for other cases? + } else if (s.IsNotFound()) { + // ignore. + } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + + // TODO (yanqin) try SetSnapshotOnNextOperation(). We currently need to take + // a snapshot here because we will later verify that point lookup in the + // primary index using GetForUpdate() returns the same value for 'c' as the + // iterator. The iterator does not need a snapshot though, because it will be + // assigned the current latest (published) sequence in the db, which will be + // no smaller than the snapshot created here. The GetForUpdate will perform + // ww conflict checking to ensure GetForUpdate() (using the snapshot) sees + // the same data as this iterator. + txn->SetSnapshot(); + std::string old_sk_prefix = Record::EncodeSecondaryKey(old_c); + std::string iter_ub_str = Record::EncodeSecondaryKey(old_c + 1); + Slice iter_ub = iter_ub_str; + ReadOptions ropts; + if (thread->rand.OneIn(2)) { + ropts.snapshot = txn->GetSnapshot(); + } + ropts.total_order_seek = true; + ropts.iterate_upper_bound = &iter_ub; + it = txn->GetIterator(ropts); + + assert(it); + it->Seek(old_sk_prefix); + if (!it->Valid()) { + s = Status::NotFound(); + return s; + } + auto* wb = txn->GetWriteBatch(); + assert(wb); + + do { + ++iterations; + Record record; + s = record.DecodeSecondaryIndexEntry(it->key(), it->value()); + if (!s.ok()) { + VerificationAbort(thread->shared, "Cannot decode secondary key", s); + break; + } + // At this point, record.b is not known yet, thus we need to access + // primary index. 
+ std::string pk = Record::EncodePrimaryKey(record.a_value()); + std::string value; + ReadOptions read_opts; + read_opts.snapshot = txn->GetSnapshot(); + s = txn->GetForUpdate(read_opts, pk, &value); + if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() || + s.IsMergeInProgress()) { + // Write conflict, or cannot acquire lock, or memtable size is not large + // enough, or merge cannot be resolved. + break; + } else if (!s.ok()) { + // We can also fail verification here. + VerificationAbort(thread->shared, "pk should exist, but does not", s); + break; + } + auto result = Record::DecodePrimaryIndexValue(value); + s = std::get<0>(result); + if (!s.ok()) { + VerificationAbort(thread->shared, "Cannot decode primary index value", s); + break; + } + uint32_t b = std::get<1>(result); + uint32_t c = std::get<2>(result); + if (c != old_c) { + std::ostringstream oss; + oss << "c in primary index does not match secondary index: " << c + << " != " << old_c; + s = Status::Corruption(); + VerificationAbort(thread->shared, oss.str(), s); + break; + } + Record new_rec(record.a_value(), b, new_c); + std::string new_primary_index_value = new_rec.EncodePrimaryIndexValue(); + ColumnFamilyHandle* cf = db_->DefaultColumnFamily(); + s = txn->Put(cf, pk, new_primary_index_value, /*assume_tracked=*/true); + if (!s.ok()) { + break; + } + std::string old_sk = it->key().ToString(/*hex=*/false); + std::string new_sk; + std::string new_crc; + std::tie(new_sk, new_crc) = new_rec.EncodeSecondaryIndexEntry(); + s = wb->SingleDelete(old_sk); + if (!s.ok()) { + break; + } + s = wb->Put(new_sk, new_crc); + if (!s.ok()) { + break; + } + + it->Next(); + } while (it->Valid()); + + if (!s.ok()) { + return s; + } + + s = CommitTxn(txn); + + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::UpdatePrimaryIndexValueTxn(ThreadState* thread, + uint32_t a, + uint32_t b_delta) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)a; + (void)b_delta; + return Status::NotSupported(); +#else + 
std::string pk_str = Record::EncodePrimaryKey(a); + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); + thread->stats.AddBytesForWrites( + /*nwrites=*/1, /*nbytes=*/Record::kPrimaryIndexEntrySize); + return; + } + if (s.IsNotFound()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0); + } else if (s.IsInvalidArgument()) { + // ignored. + } else if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() || + s.IsMergeInProgress()) { + // ignored. + } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + ReadOptions ropts; + std::string value; + s = txn->GetForUpdate(ropts, pk_str, &value); + if (!s.ok()) { + return s; + } + auto result = Record::DecodePrimaryIndexValue(value); + if (!std::get<0>(result).ok()) { + return s; + } + uint32_t b = std::get<1>(result) + b_delta; + uint32_t c = std::get<2>(result); + Record record(a, b, c); + std::string primary_index_value = record.EncodePrimaryIndexValue(); + ColumnFamilyHandle* cf = db_->DefaultColumnFamily(); + s = txn->Put(cf, pk_str, primary_index_value, /*assume_tracked=*/true); + if (!s.ok()) { + return s; + } + s = CommitTxn(txn); + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::PointLookupTxn(ThreadState* thread, + ReadOptions ropts, uint32_t a) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)ropts; + (void)a; + return Status::NotSupported(); +#else + std::string pk_str = Record::EncodePrimaryKey(a); + // pk may or may not exist + PinnableSlice value; + + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + 
thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); + return; + } else if (s.IsNotFound()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0); + } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + + s = txn->Get(ropts, db_->DefaultColumnFamily(), pk_str, &value); + if (s.ok()) { + s = txn->Commit(); + } + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::RangeScanTxn(ThreadState* thread, + ReadOptions ropts, uint32_t c) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)ropts; + (void)c; + return Status::NotSupported(); +#else + std::string sk = Record::EncodeSecondaryKey(c); + + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + thread->stats.AddIterations(1); + return; + } + thread->stats.AddErrors(1); + RollbackTxn(txn).PermitUncheckedError(); + }); + std::unique_ptr iter(txn->GetIterator(ropts)); + iter->Seek(sk); + if (iter->status().ok()) { + s = txn->Commit(); + } else { + s = iter->status(); + } + // TODO (yanqin) more Seek/SeekForPrev/Next/Prev/SeekToFirst/SeekToLast + return s; +#endif // !ROCKSDB_LITE +} + +void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { + if (thread->shared->HasVerificationFailedYet()) { + return; + } + const Snapshot* const snapshot = db_->GetSnapshot(); + assert(snapshot); + ManagedSnapshot snapshot_guard(db_, snapshot); + + // TODO (yanqin) with a probability, we can use either forward or backward + // iterator in subsequent checks. We can also use more advanced features in + // range scan. For now, let's just use simple forward iteration with + // total_order_seek = true. + + // First, iterate primary index. 
+ size_t primary_index_entries_count = 0; + { + char buf[4]; + EncodeFixed32(buf, Record::kPrimaryIndexId + 1); + std::reverse(buf, buf + sizeof(buf)); + std::string iter_ub_str(buf, sizeof(buf)); + Slice iter_ub = iter_ub_str; + + ReadOptions ropts; + ropts.snapshot = snapshot; + ropts.total_order_seek = true; + ropts.iterate_upper_bound = &iter_ub; + + std::unique_ptr it(db_->NewIterator(ropts)); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + ++primary_index_entries_count; + } + } + + // Second, iterate secondary index. + size_t secondary_index_entries_count = 0; + { + char buf[4]; + EncodeFixed32(buf, Record::kSecondaryIndexId); + std::reverse(buf, buf + sizeof(buf)); + const std::string start_key(buf, sizeof(buf)); + + ReadOptions ropts; + ropts.snapshot = snapshot; + ropts.total_order_seek = true; + + std::unique_ptr it(db_->NewIterator(ropts)); + for (it->Seek(start_key); it->Valid(); it->Next()) { + ++secondary_index_entries_count; + Record record; + Status s = record.DecodeSecondaryIndexEntry(it->key(), it->value()); + if (!s.ok()) { + VerificationAbort(thread->shared, "Cannot decode secondary index entry", + s); + return; + } + // After decoding secondary index entry, we know a and c. Crc is verified + // in decoding phase. + // + // Form a primary key and search in the primary index. + std::string pk = Record::EncodePrimaryKey(record.a_value()); + std::string value; + s = db_->Get(ropts, pk, &value); + if (!s.ok()) { + std::ostringstream oss; + oss << "Error searching pk " << Slice(pk).ToString(true) << ". " + << s.ToString(); + VerificationAbort(thread->shared, oss.str(), s); + return; + } + auto result = Record::DecodePrimaryIndexValue(value); + s = std::get<0>(result); + if (!s.ok()) { + std::ostringstream oss; + oss << "Error decoding primary index value " + << Slice(value).ToString(true) << ". 
" << s.ToString(); + VerificationAbort(thread->shared, oss.str(), s); + } + uint32_t c_in_primary = std::get<2>(result); + if (c_in_primary != record.c_value()) { + std::ostringstream oss; + oss << "Pk/sk mismatch. pk: (c=" << c_in_primary + << "), sk: (c=" << record.c_value() << ")"; + VerificationAbort(thread->shared, oss.str(), s); + } + } + } + + if (secondary_index_entries_count != primary_index_entries_count) { + std::ostringstream oss; + oss << "Pk/sk mismatch: primary index has " << primary_index_entries_count + << " entries. Secondary index has " << secondary_index_entries_count + << " entries."; + VerificationAbort(thread->shared, oss.str(), Status::OK()); + } +} + +uint32_t MultiOpsTxnsStressTest::ChooseA(ThreadState* thread) { + uint32_t rnd = thread->rand.Uniform(5); + uint32_t next_a_low = next_a_.load(std::memory_order_relaxed); + assert(next_a_low != 0); + if (rnd == 0) { + return next_a_low - 1; + } + + uint32_t result = 0; + result = thread->rand.Next() % next_a_low; + if (thread->rand.OneIn(3)) { + return result; + } + uint32_t next_a_high = next_a_.load(std::memory_order_relaxed); + // A higher chance that this a still exists. + return next_a_low + (next_a_high - next_a_low) / 2; +} + +uint32_t MultiOpsTxnsStressTest::GenerateNextA() { + return next_a_.fetch_add(1, std::memory_order_relaxed); +} + +void MultiOpsTxnsStressTest::PreloadDb(SharedState* shared, size_t num_c) { +#ifdef ROCKSDB_LITE + (void)shared; + (void)num_c; +#else + // TODO (yanqin) maybe parallelize. Currently execute in single thread. 
+ WriteOptions wopts; + wopts.disableWAL = true; + wopts.sync = false; + Random rnd(shared->GetSeed()); + assert(txn_db_); + for (uint32_t c = 0; c < static_cast(num_c); ++c) { + for (uint32_t a = c * kInitialCARatio; a < ((c + 1) * kInitialCARatio); + ++a) { + Record record(a, /*_b=*/rnd.Next(), c); + WriteBatch wb; + const auto primary_index_entry = record.EncodePrimaryIndexEntry(); + Status s = wb.Put(primary_index_entry.first, primary_index_entry.second); + assert(s.ok()); + const auto secondary_index_entry = record.EncodeSecondaryIndexEntry(); + s = wb.Put(secondary_index_entry.first, secondary_index_entry.second); + assert(s.ok()); + s = txn_db_->Write(wopts, &wb); + assert(s.ok()); + + // TODO (yanqin): make the following check optional, especially when data + // size is large. + Record tmp_rec; + tmp_rec.SetB(record.b_value()); + s = tmp_rec.DecodeSecondaryIndexEntry(secondary_index_entry.first, + secondary_index_entry.second); + assert(s.ok()); + assert(tmp_rec == record); + } + } + Status s = db_->Flush(FlushOptions()); + assert(s.ok()); + next_a_.store(static_cast((num_c + 1) * kInitialCARatio)); + fprintf(stdout, "DB preloaded with %d entries\n", + static_cast(num_c * kInitialCARatio)); +#endif // !ROCKSDB_LITE +} + +StressTest* CreateMultiOpsTxnsStressTest() { + return new MultiOpsTxnsStressTest(); +} + +void CheckAndSetOptionsForMultiOpsTxnStressTest() { +#ifndef ROCKSDB_LITE + if (FLAGS_test_batches_snapshots || FLAGS_test_cf_consistency) { + fprintf(stderr, + "-test_multi_ops_txns is not compatible with " + "-test_bathces_snapshots and -test_cf_consistency\n"); + exit(1); + } + if (!FLAGS_use_txn) { + fprintf(stderr, "-use_txn must be true if -test_multi_ops_txns\n"); + exit(1); + } + if (FLAGS_clear_column_family_one_in > 0) { + fprintf(stderr, + "-test_multi_ops_txns is not compatible with clearing column " + "families\n"); + exit(1); + } + if (FLAGS_column_families > 1) { + // TODO (yanqin) support separating primary index and secondary index in 
+ // different column families. + fprintf(stderr, + "-test_multi_ops_txns currently does not use more than one column " + "family\n"); + exit(1); + } + if (FLAGS_writepercent > 0 || FLAGS_delpercent > 0 || + FLAGS_delrangepercent > 0) { + fprintf(stderr, + "-test_multi_ops_txns requires that -writepercent, -delpercent and " + "-delrangepercent be 0\n"); + exit(1); + } +#else + fprintf(stderr, "-test_multi_ops_txns not supported in ROCKSDB_LITE mode\n"); + exit(1); +#endif // !ROCKSDB_LITE +} +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,302 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef GFLAGS +#include "db_stress_tool/db_stress_common.h" + +namespace ROCKSDB_NAMESPACE { + +// This file defines MultiOpsTxnsStress so that we can stress test RocksDB +// transactions on a simple, emulated relational table. +// +// The record format is similar to the example found at +// https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format. 
+// +// The table is created by +// ``` +// create table t1 ( +// a int primary key, +// b int, +// c int, +// key(c), +// ) +// ``` +// +// (For simplicity, we use uint32_t for int here.) +// +// For this table, there is a primary index using `a`, as well as a secondary +// index using `c` and `a`. +// +// Primary key format: +// | index id | M(a) | +// Primary index value: +// | b | c | +// M(a) represents the big-endian format of a. +// +// Secondary key format: +// | index id | M(c) | M(a) | +// Secondary index value: +// | crc32 | +// Similarly to M(a), M(c) is the big-endian format of c. +// +// The in-memory representation of a record is defined in class +// MultiOpsTxnsStress:Record that includes a number of helper methods to +// encode/decode primary index keys, primary index values, secondary index keys, +// secondary index values, etc. +// +// Sometimes primary index and secondary index reside on different column +// families, but sometimes they colocate in the same column family. Current +// implementation puts them in the same (default) column family, and this is +// subject to future change if we find it interesting to test the other case. +// +// Class MultiOpsTxnsStressTest has the following transactions for testing. +// +// 1. Primary key update +// UPDATE t1 SET a = 3 WHERE a = 2; +// ``` +// tx->GetForUpdate(primary key a=2) +// tx->GetForUpdate(primary key a=3) +// tx->Delete(primary key a=2) +// tx->Put(primary key a=3, value) +// tx->batch->SingleDelete(secondary key a=2) +// tx->batch->Put(secondary key a=3, value) +// tx->Prepare() +// Tx->Commit() +// ``` +// +// 2. Secondary key update +// UPDATE t1 SET c = 3 WHERE c = 2; +// ``` +// iter->Seek(secondary key) +// // Get corresponding primary key value(s) from iterator +// tx->GetForUpdate(primary key) +// tx->Put(primary key, value c=3) +// tx->batch->SingleDelete(secondary key c=2) +// tx->batch->Put(secondary key c=3) +// tx->Prepare() +// tx->Commit() +// ``` +// +// 3. 
Primary index value update +// UPDATE t1 SET b = b + 1 WHERE a = 2; +// ``` +// tx->GetForUpdate(primary key a=2) +// tx->Put(primary key a=2, value b=b+1) +// tx->Prepare() +// tx->Commit() +// ``` +// +// 4. Point lookup +// SELECT * FROM t1 WHERE a = 3; +// ``` +// tx->Get(primary key a=3) +// tx->Commit() +// ``` +// +// 5. Range scan +// SELECT * FROM t1 WHERE c = 2; +// ``` +// it = tx->GetIterator() +// it->Seek(secondary key c=2) +// tx->Commit() +// ``` + +class MultiOpsTxnsStressTest : public StressTest { + public: + class Record { + public: + static constexpr uint32_t kPrimaryIndexId = 1; + static constexpr uint32_t kSecondaryIndexId = 2; + + static constexpr size_t kPrimaryIndexEntrySize = 8 + 8; + static constexpr size_t kSecondaryIndexEntrySize = 12 + 4; + + static_assert(kPrimaryIndexId < kSecondaryIndexId, + "kPrimaryIndexId must be smaller than kSecondaryIndexId"); + + static_assert(sizeof(kPrimaryIndexId) == sizeof(uint32_t), + "kPrimaryIndexId must be 4 bytes"); + static_assert(sizeof(kSecondaryIndexId) == sizeof(uint32_t), + "kSecondaryIndexId must be 4 bytes"); + + // Used for generating search key to probe primary index. + static std::string EncodePrimaryKey(uint32_t a); + // Used for generating search prefix to probe secondary index. + static std::string EncodeSecondaryKey(uint32_t c); + // Used for generating search key to probe secondary index. 
+ static std::string EncodeSecondaryKey(uint32_t c, uint32_t a); + + static std::tuple DecodePrimaryIndexValue( + Slice primary_index_value); + + static std::pair DecodeSecondaryIndexValue( + Slice secondary_index_value); + + Record() = default; + Record(uint32_t _a, uint32_t _b, uint32_t _c) : a_(_a), b_(_b), c_(_c) {} + + bool operator==(const Record& other) const { + return a_ == other.a_ && b_ == other.b_ && c_ == other.c_; + } + + bool operator!=(const Record& other) const { return !(*this == other); } + + std::pair EncodePrimaryIndexEntry() const; + + std::string EncodePrimaryKey() const; + + std::string EncodePrimaryIndexValue() const; + + std::pair EncodeSecondaryIndexEntry() const; + + std::string EncodeSecondaryKey() const; + + Status DecodePrimaryIndexEntry(Slice primary_index_key, + Slice primary_index_value); + + Status DecodeSecondaryIndexEntry(Slice secondary_index_key, + Slice secondary_index_value); + + uint32_t a_value() const { return a_; } + uint32_t b_value() const { return b_; } + uint32_t c_value() const { return c_; } + + void SetA(uint32_t _a) { a_ = _a; } + void SetB(uint32_t _b) { b_ = _b; } + void SetC(uint32_t _c) { c_ = _c; } + + std::string ToString() const { + std::string ret("("); + ret.append(std::to_string(a_)); + ret.append(","); + ret.append(std::to_string(b_)); + ret.append(","); + ret.append(std::to_string(c_)); + ret.append(")"); + return ret; + } + + private: + friend class InvariantChecker; + + uint32_t a_{0}; + uint32_t b_{0}; + uint32_t c_{0}; + }; + + MultiOpsTxnsStressTest() {} + + ~MultiOpsTxnsStressTest() override {} + + void FinishInitDb(SharedState*) override; + + void ReopenAndPreloadDb(SharedState* shared); + + bool IsStateTracked() const override { return false; } + + Status TestGet(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + std::vector TestMultiGet( + ThreadState* thread, const ReadOptions& read_opts, + const 
std::vector& rand_column_families, + const std::vector& rand_keys) override; + + Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + // Given a key K, this creates an iterator which scans to K and then + // does a random sequence of Next/Prev operations. + Status TestIterate(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + Status TestPut(ThreadState* thread, WriteOptions& write_opts, + const ReadOptions& read_opts, const std::vector& cf_ids, + const std::vector& keys, char (&value)[100], + std::unique_ptr& lock) override; + + Status TestDelete(ThreadState* thread, WriteOptions& write_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& lock) override; + + Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& lock) override; + + void TestIngestExternalFile(ThreadState* thread, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& lock) override; + + void TestCompactRange(ThreadState* thread, int64_t rand_key, + const Slice& start_key, + ColumnFamilyHandle* column_family) override; + + Status TestBackupRestore(ThreadState* thread, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + Status TestCheckpoint(ThreadState* thread, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + +#ifndef ROCKSDB_LITE + Status TestApproximateSize(ThreadState* thread, uint64_t iteration, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; +#endif // !ROCKSDB_LITE + + Status TestCustomOperations( + ThreadState* thread, + const std::vector& rand_column_families) override; + + Status 
PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a, + uint32_t new_a); + + Status SecondaryKeyUpdateTxn(ThreadState* thread, uint32_t old_c, + uint32_t new_c); + + Status UpdatePrimaryIndexValueTxn(ThreadState* thread, uint32_t a, + uint32_t b_delta); + + Status PointLookupTxn(ThreadState* thread, ReadOptions ropts, uint32_t a); + + Status RangeScanTxn(ThreadState* thread, ReadOptions ropts, uint32_t c); + + void VerifyDb(ThreadState* thread) const override; + + protected: + uint32_t ChooseA(ThreadState* thread); + + uint32_t GenerateNextA(); + + private: + void PreloadDb(SharedState* shared, size_t num_c); + + // TODO (yanqin) encapsulate the selection of keys a separate class. + std::atomic next_a_{0}; +}; + +class InvariantChecker { + public: + static_assert(sizeof(MultiOpsTxnsStressTest::Record().a_) == sizeof(uint32_t), + "MultiOpsTxnsStressTest::Record::a_ must be 4 bytes"); + static_assert(sizeof(MultiOpsTxnsStressTest::Record().b_) == sizeof(uint32_t), + "MultiOpsTxnsStressTest::Record::b_ must be 4 bytes"); + static_assert(sizeof(MultiOpsTxnsStressTest::Record().c_) == sizeof(uint32_t), + "MultiOpsTxnsStressTest::Record::c_ must be 4 bytes"); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,9 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif // NDEBUG namespace ROCKSDB_NAMESPACE { class NonBatchedOpsStressTest : public StressTest { @@ -19,6 +22,13 @@ void VerifyDb(ThreadState* thread) const override { ReadOptions options(FLAGS_verify_checksum, 
true); + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + options.timestamp = &ts; + } auto shared = thread->shared; const int64_t max_key = shared->GetMaxKey(); const int64_t keys_per_thread = max_key / shared->GetNumThreads(); @@ -33,8 +43,8 @@ if (thread->shared->HasVerificationFailedYet()) { break; } - if (!thread->rand.OneIn(2)) { - // Use iterator to verify this range + if (thread->rand.OneIn(3)) { + // 1/3 chance use iterator to verify this range Slice prefix; std::string seek_key = Key(start); std::unique_ptr iter( @@ -79,8 +89,8 @@ from_db.data(), from_db.length()); } } - } else { - // Use Get to verify this range + } else if (thread->rand.OneIn(2)) { + // 1/3 chance use Get to verify this range for (auto i = start; i < end; i++) { if (thread->shared->HasVerificationFailedYet()) { break; @@ -96,6 +106,38 @@ from_db.data(), from_db.length()); } } + } else { + // 1/3 chance use MultiGet to verify this range + for (auto i = start; i < end;) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + // Keep the batch size to some reasonable value + size_t batch_size = thread->rand.Uniform(128) + 1; + batch_size = std::min(batch_size, end - i); + std::vector keystrs(batch_size); + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + for (size_t j = 0; j < batch_size; ++j) { + keystrs[j] = Key(i + j); + keys[j] = Slice(keystrs[j].data(), keystrs[j].length()); + } + db_->MultiGet(options, column_families_[cf], batch_size, keys.data(), + values.data(), statuses.data()); + for (size_t j = 0; j < batch_size; ++j) { + Status s = statuses[j]; + std::string from_db = values[j].ToString(); + VerifyValue(static_cast(cf), i + j, options, shared, from_db, + s, true); + if (from_db.length()) { + PrintKeyValue(static_cast(cf), static_cast(i + j), + from_db.data(), from_db.length()); + } + } + + i += batch_size; + } } } } @@ -137,6 +179,8 @@ 
bool ShouldAcquireMutexOnKey() const override { return true; } + bool IsStateTracked() const override { return true; } + Status TestGet(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, const std::vector& rand_keys) override { @@ -144,18 +188,52 @@ std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string from_db; + int error_count = 0; + +#ifndef NDEBUG + if (fault_fs_guard) { + fault_fs_guard->EnableErrorInjection(); + SharedState::ignore_read_error = false; + } +#endif // NDEBUG Status s = db_->Get(read_opts, cfh, key, &from_db); +#ifndef NDEBUG + if (fault_fs_guard) { + error_count = fault_fs_guard->GetAndResetErrorCount(); + } +#endif // NDEBUG if (s.ok()) { +#ifndef NDEBUG + if (fault_fs_guard) { + if (error_count && !SharedState::ignore_read_error) { + // Grab mutex so multiple thread don't try to print the + // stack trace at the same time + MutexLock l(thread->shared->GetMutex()); + fprintf(stderr, "Didn't get expected error from Get\n"); + fprintf(stderr, "Callstack that injected the fault\n"); + fault_fs_guard->PrintFaultBacktrace(); + std::terminate(); + } + } +#endif // NDEBUG // found case thread->stats.AddGets(1, 1); } else if (s.IsNotFound()) { // not found case thread->stats.AddGets(1, 0); } else { - // errors case - fprintf(stderr, "TestGet error: %s\n", s.ToString().c_str()); - thread->stats.AddErrors(1); + if (error_count == 0) { + // errors case + thread->stats.AddErrors(1); + } else { + thread->stats.AddVerifiedErrors(1); + } + } +#ifndef NDEBUG + if (fault_fs_guard) { + fault_fs_guard->DisableErrorInjection(); } +#endif // NDEBUG return s; } @@ -171,6 +249,15 @@ std::vector values(num_keys); std::vector statuses(num_keys); ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; + int error_count = 0; + // Do a consistency check between Get and MultiGet. 
Don't do it too + // often as it will slow db_stress down + bool do_consistency_check = thread->rand.OneIn(4); + + ReadOptions readoptionscopy = read_opts; + if (do_consistency_check) { + readoptionscopy.snapshot = db_->GetSnapshot(); + } // To appease clang analyzer const bool use_txn = FLAGS_use_txn; @@ -231,18 +318,98 @@ } if (!use_txn) { - db_->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), +#ifndef NDEBUG + if (fault_fs_guard) { + fault_fs_guard->EnableErrorInjection(); + SharedState::ignore_read_error = false; + } +#endif // NDEBUG + db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); +#ifndef NDEBUG + if (fault_fs_guard) { + error_count = fault_fs_guard->GetAndResetErrorCount(); + } +#endif // NDEBUG } else { #ifndef ROCKSDB_LITE - txn->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), + txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); - RollbackTxn(txn); #endif } - for (const auto& s : statuses) { - if (s.ok()) { +#ifndef NDEBUG + if (fault_fs_guard && error_count && !SharedState::ignore_read_error) { + int stat_nok = 0; + for (const auto& s : statuses) { + if (!s.ok() && !s.IsNotFound()) { + stat_nok++; + } + } + + if (stat_nok < error_count) { + // Grab mutex so multiple thread don't try to print the + // stack trace at the same time + MutexLock l(thread->shared->GetMutex()); + fprintf(stderr, "Didn't get expected error from MultiGet. 
\n"); + fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n", num_keys, + error_count, stat_nok); + fprintf(stderr, "Callstack that injected the fault\n"); + fault_fs_guard->PrintFaultBacktrace(); + std::terminate(); + } + } + if (fault_fs_guard) { + fault_fs_guard->DisableErrorInjection(); + } +#endif // NDEBUG + + for (size_t i = 0; i < statuses.size(); ++i) { + Status s = statuses[i]; + bool is_consistent = true; + // Only do the consistency check if no error was injected and MultiGet + // didn't return an unexpected error + if (do_consistency_check && !error_count && (s.ok() || s.IsNotFound())) { + Status tmp_s; + std::string value; + + if (use_txn) { +#ifndef ROCKSDB_LITE + tmp_s = txn->Get(readoptionscopy, cfh, keys[i], &value); +#endif // ROCKSDB_LITE + } else { + tmp_s = db_->Get(readoptionscopy, cfh, keys[i], &value); + } + if (!tmp_s.ok() && !tmp_s.IsNotFound()) { + fprintf(stderr, "Get error: %s\n", s.ToString().c_str()); + is_consistent = false; + } else if (!s.ok() && tmp_s.ok()) { + fprintf(stderr, "MultiGet returned different results with key %s\n", + keys[i].ToString(true).c_str()); + fprintf(stderr, "Get returned ok, MultiGet returned not found\n"); + is_consistent = false; + } else if (s.ok() && tmp_s.IsNotFound()) { + fprintf(stderr, "MultiGet returned different results with key %s\n", + keys[i].ToString(true).c_str()); + fprintf(stderr, "MultiGet returned ok, Get returned not found\n"); + is_consistent = false; + } else if (s.ok() && value != values[i].ToString()) { + fprintf(stderr, "MultiGet returned different results with key %s\n", + keys[i].ToString(true).c_str()); + fprintf(stderr, "MultiGet returned value %s\n", + values[i].ToString(true).c_str()); + fprintf(stderr, "Get returned value %s\n", value.c_str()); + is_consistent = false; + } + } + + if (!is_consistent) { + fprintf(stderr, "TestMultiGet error: is_consistent is false\n"); + thread->stats.AddErrors(1); + // Fail fast to preserve the DB state + 
thread->shared->SetVerificationFailure(); + break; + } else if (s.ok()) { // found case thread->stats.AddGets(1, 1); } else if (s.IsNotFound()) { @@ -252,11 +419,24 @@ // With txn this is sometimes expected. thread->stats.AddGets(1, 1); } else { - // errors case - fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str()); - thread->stats.AddErrors(1); + if (error_count == 0) { + // errors case + fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + thread->stats.AddVerifiedErrors(1); + } } } + + if (readoptionscopy.snapshot) { + db_->ReleaseSnapshot(readoptionscopy.snapshot); + } + if (use_txn) { +#ifndef ROCKSDB_LITE + RollbackTxn(txn); +#endif + } return statuses; } @@ -308,6 +488,8 @@ int64_t max_key = shared->GetMaxKey(); int64_t rand_key = rand_keys[0]; int rand_column_family = rand_column_families[0]; + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && (FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) { lock.reset(); @@ -315,6 +497,11 @@ rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); @@ -369,8 +556,18 @@ } shared->Put(rand_column_family, rand_key, value_base, false /* pending */); if (!s.ok()) { - fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); + std::terminate(); + } + } else { + fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); + 
std::terminate(); + } } thread->stats.AddBytesForWrites(1, sz); PrintKeyValue(rand_column_family, static_cast(rand_key), value, @@ -390,6 +587,8 @@ // OPERATION delete // If the chosen key does not allow overwrite and it does not exist, // choose another key. + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && !shared->Exists(rand_column_family, rand_key)) { lock.reset(); @@ -397,6 +596,11 @@ rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); @@ -425,8 +629,19 @@ shared->Delete(rand_column_family, rand_key, false /* pending */); thread->stats.AddDeletes(1); if (!s.ok()) { - fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && + s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } + } else { + fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } } } else { shared->SingleDelete(rand_column_family, rand_key, true /* pending */); @@ -447,8 +662,19 @@ shared->SingleDelete(rand_column_family, rand_key, false /* pending */); thread->stats.AddSingleDeletes(1); if (!s.ok()) { - fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && + s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); + 
std::terminate(); + } + } else { + fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } } } return s; @@ -494,8 +720,18 @@ Slice end_key = end_keystr; Status s = db_->DeleteRange(write_opts, cfh, key, end_key); if (!s.ok()) { - fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); + std::terminate(); + } + } else { + fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); + std::terminate(); + } } int covered = shared->DeleteRange(rand_column_family, rand_key, rand_key + FLAGS_range_deletion_width, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/defs.bzl mariadb-10.11.13/storage/rocksdb/rocksdb/defs.bzl --- mariadb-10.11.11/storage/rocksdb/rocksdb/defs.bzl 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/defs.bzl 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,7 @@ # defs.bzl - Definitions for Facebook-specific buck build integration # in TARGETS +load("@fbcode_macros//build_defs:coverage.bzl", "coverage") load("@fbcode_macros//build_defs:cpp_binary.bzl", "cpp_binary") load("@fbcode_macros//build_defs:custom_unittest.bzl", "custom_unittest") @@ -35,8 +36,21 @@ external_deps = rocksdb_external_deps, ) + binary_path = "$(location :{})".format(test_bin) + + base_path = native.package_name() + tags = [] + if coverage.is_coverage_enabled(base_path): + # This tag instructs testpilot to use + # the lower-memory coverage runner + # (e.g. 
it tells testpilot that the binary + # is actually instrumented with coverage info) + tags = ["coverage"] + custom_unittest( name = test_name, - command = [TEST_RUNNER, "$(location :{})".format(test_bin)], + command = [TEST_RUNNER, binary_path], type = ttype, + env = {"BUCK_BASE_BINARY": binary_path}, + tags = tags, ) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile 2025-05-19 16:14:27.000000000 +0000 @@ -1,2 +1,4 @@ source 'https://rubygems.org' -gem 'github-pages', '~> 104' +gem 'github-pages', '~> 209' + +gem "webrick", "~> 1.7" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile.lock mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile.lock --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile.lock 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile.lock 2025-05-19 16:14:27.000000000 +0000 @@ -1,146 +1,267 @@ GEM remote: https://rubygems.org/ specs: - activesupport (4.2.7) - i18n (~> 0.7) - json (~> 1.7, >= 1.7.7) + activesupport (6.0.3.4) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) - addressable (2.4.0) + zeitwerk (~> 2.2, >= 2.2.2) + addressable (2.8.0) + public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) coffee-script-source execjs - coffee-script-source (1.12.2) + coffee-script-source (1.11.1) colorator (1.1.0) - concurrent-ruby (1.0.5) - ethon (0.11.0) + commonmarker (0.17.13) + ruby-enum (~> 0.5) + concurrent-ruby (1.1.7) + dnsruby (1.61.5) + simpleidn (~> 0.1) + em-websocket (0.5.2) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0.6.0) + ethon (0.12.0) ffi (>= 1.3.0) + eventmachine (1.2.7) execjs (2.7.0) - faraday (0.15.2) + faraday (1.3.0) + faraday-net_http (~> 1.0) multipart-post (>= 1.2, < 3) 
- ffi (1.9.25) + ruby2_keywords + faraday-net_http (1.0.0) + ffi (1.14.2) forwardable-extended (2.6.0) - gemoji (2.1.0) - github-pages (104) - activesupport (= 4.2.7) - github-pages-health-check (= 1.2.0) - jekyll (>= 3.8.4) - jekyll-avatar (= 0.4.2) - jekyll-coffeescript (= 1.0.1) - jekyll-feed (= 0.8.0) - jekyll-gist (= 1.4.0) - jekyll-github-metadata (= 2.2.0) - jekyll-mentions (= 1.2.0) + gemoji (3.0.1) + github-pages (209) + github-pages-health-check (= 1.16.1) + jekyll (= 3.9.0) + jekyll-avatar (= 0.7.0) + jekyll-coffeescript (= 1.1.1) + jekyll-commonmark-ghpages (= 0.1.6) + jekyll-default-layout (= 0.1.4) + jekyll-feed (= 0.15.1) + jekyll-gist (= 1.5.0) + jekyll-github-metadata (= 2.13.0) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) - jekyll-redirect-from (= 0.11.0) - jekyll-sass-converter (= 1.3.0) - jekyll-seo-tag (= 2.1.0) - jekyll-sitemap (= 0.12.0) - jekyll-swiss (= 0.4.0) - jemoji (= 0.7.0) - kramdown (= 1.11.1) - liquid (= 3.0.6) - listen (= 3.0.6) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.2) + jekyll-sass-converter (= 1.5.2) + jekyll-seo-tag (= 2.6.1) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.1.1) + jekyll-theme-cayman (= 0.1.1) + jekyll-theme-dinky (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) + jekyll-theme-leap-day (= 0.1.1) + jekyll-theme-merlot (= 0.1.1) + jekyll-theme-midnight (= 0.1.1) + jekyll-theme-minimal (= 0.1.1) + jekyll-theme-modernist (= 0.1.1) + jekyll-theme-primer (= 0.5.4) + jekyll-theme-slate (= 0.1.1) + jekyll-theme-tactile (= 0.1.1) + jekyll-theme-time-machine (= 0.1.1) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.1) + kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.3) mercenary (~> 0.3) - minima (= 2.0.0) - rouge (= 1.11.1) + minima (= 2.5.1) + nokogiri (>= 1.10.4, < 2.0) + rouge (= 3.23.0) terminal-table (~> 1.4) - 
github-pages-health-check (1.2.0) + github-pages-health-check (1.16.1) addressable (~> 2.3) - net-dns (~> 0.8) + dnsruby (~> 1.60) octokit (~> 4.0) - public_suffix (~> 1.4) - typhoeus (~> 0.7) - html-pipeline (2.4.2) + public_suffix (~> 3.0) + typhoeus (~> 1.3) + html-pipeline (2.14.0) activesupport (>= 2) - nokogiri (~> 1.8.2) - i18n (0.7.0) - jekyll (3.8.4) + nokogiri (>= 1.4) + http_parser.rb (0.6.0) + i18n (0.9.5) + concurrent-ruby (~> 1.0) + jekyll (3.9.0) addressable (~> 2.4) colorator (~> 1.0) + em-websocket (~> 0.5) + i18n (~> 0.7) jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 1.1) - kramdown (~> 1.3) - liquid (~> 3.0) + jekyll-watch (~> 2.0) + kramdown (>= 1.17, < 3) + liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) - rouge (~> 1.7) + rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.4.2) - jekyll (~> 3.0) - jekyll-coffeescript (1.0.1) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) + jekyll-coffeescript (1.1.1) coffee-script (~> 2.2) - jekyll-feed (0.8.0) - jekyll (~> 3.3) - jekyll-gist (1.4.0) + coffee-script-source (~> 1.11.1) + jekyll-commonmark (1.3.1) + commonmarker (~> 0.14) + jekyll (>= 3.7, < 5.0) + jekyll-commonmark-ghpages (0.1.6) + commonmarker (~> 0.17.6) + jekyll-commonmark (~> 1.2) + rouge (>= 2.0, < 4.0) + jekyll-default-layout (0.1.4) + jekyll (~> 3.0) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) + jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.2.0) - jekyll (~> 3.1) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.2.0) - activesupport (~> 4.0) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) - jekyll (~> 3.0) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) jekyll-paginate (1.1.0) - jekyll-redirect-from (0.11.0) - jekyll (>= 2.0) - jekyll-sass-converter (1.3.0) - sass (~> 3.2) - jekyll-seo-tag (2.1.0) - jekyll (~> 3.3) - jekyll-sitemap (0.12.0) - jekyll (~> 3.3) - jekyll-swiss (0.4.0) - jekyll-watch 
(1.5.0) - listen (~> 3.0, < 3.1) - jemoji (0.7.0) - activesupport (~> 4.0) - gemoji (~> 2.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.2) + addressable (~> 2.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) + jekyll-sass-converter (1.5.2) + sass (~> 3.4) + jekyll-seo-tag (2.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-theme-architect (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-cayman (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-dinky (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-leap-day (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-merlot (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-midnight (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-minimal (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-modernist (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-primer (0.5.4) + jekyll (> 3.5, < 5.0) + jekyll-github-metadata (~> 2.9) + jekyll-seo-tag (~> 2.0) + jekyll-theme-slate (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-tactile (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-time-machine (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + jemoji (0.12.0) + gemoji (~> 3.0) html-pipeline (~> 2.2) - jekyll (>= 3.0) - json (1.8.3) - kramdown (1.11.1) - liquid (3.0.6) - listen (3.0.6) - rb-fsevent (>= 0.9.3) - rb-inotify (>= 0.9.7) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.1) + rexml + kramdown-parser-gfm (1.1.0) + 
kramdown (~> 2.0) + liquid (4.0.3) + listen (3.4.0) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.3.0) - minima (2.0.0) - minitest (5.9.1) - multipart-post (2.0.0) - net-dns (0.8.0) - nokogiri (~> 1.8.2) - mini_portile2 (~> 2.3.0) - octokit (4.4.1) - sawyer (~> 0.7.0, >= 0.5.3) - pathutil (0.14.0) + mini_portile2 (2.6.1) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) + jekyll-feed (~> 0.9) + jekyll-seo-tag (~> 2.1) + minitest (5.14.3) + multipart-post (2.1.1) + nokogiri (1.12.5) + mini_portile2 (~> 2.6.1) + racc (~> 1.4) + octokit (4.20.0) + faraday (>= 0.9) + sawyer (~> 0.8.0, >= 0.5.3) + pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (1.5.3) - rb-fsevent (0.9.8) - rb-inotify (0.9.7) - ffi (>= 0.5.0) - rouge (1.11.1) - safe_yaml (1.0.4) - sass (3.4.22) - sawyer (0.7.0) - addressable (>= 2.3.5, < 2.5) - faraday (~> 0.8, < 0.10) - terminal-table (1.7.3) - unicode-display_width (~> 1.1.1) - thread_safe (0.3.5) - typhoeus (0.8.0) - ethon (>= 0.8.0) - tzinfo (1.2.2) + public_suffix (3.1.1) + racc (1.5.2) + rb-fsevent (0.10.4) + rb-inotify (0.10.1) + ffi (~> 1.0) + rexml (3.2.5) + rouge (3.23.0) + ruby-enum (0.8.0) + i18n + ruby2_keywords (0.0.2) + rubyzip (2.3.0) + safe_yaml (1.0.5) + sass (3.7.4) + sass-listen (~> 4.0.0) + sass-listen (4.0.0) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + sawyer (0.8.2) + addressable (>= 2.3.5) + faraday (> 0.8, < 2.0) + simpleidn (0.1.1) + unf (~> 0.1.4) + terminal-table (1.8.0) + unicode-display_width (~> 1.1, >= 1.1.1) + thread_safe (0.3.6) + typhoeus (1.4.0) + ethon (>= 0.9.0) + tzinfo (1.2.9) thread_safe (~> 0.1) - unicode-display_width (1.1.1) + unf (0.1.4) + unf_ext + unf_ext (0.0.7.7) + unicode-display_width (1.7.0) + webrick (1.7.0) + zeitwerk (2.4.2) PLATFORMS ruby DEPENDENCIES - github-pages (~> 104) + github-pages (~> 209) + webrick (~> 1.7) BUNDLED WITH - 1.13.1 + 2.2.3 diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_config.yml mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_config.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_config.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_config.yml 2025-05-19 16:14:27.000000000 +0000 @@ -81,5 +81,5 @@ redcarpet: extensions: [with_toc_data] -gems: +plugins: - jekyll-redirect-from diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/authors.yml mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/authors.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/authors.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/authors.yml 2025-05-19 16:14:27.000000000 +0000 @@ -68,3 +68,6 @@ fgwu: full_name: Fenggang Wu fbid: 100002297362180 + +ltamasi: + full_name: Levi Tamasi diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/nav.yml mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/nav.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/nav.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/nav.yml 2025-05-19 16:14:27.000000000 +0000 @@ -7,11 +7,11 @@ category: external - title: API (C++) - href: https://github.com/facebook/rocksdb/tree/master/include/rocksdb + href: https://github.com/facebook/rocksdb/tree/main/include/rocksdb category: external - title: API (Java) - href: https://github.com/facebook/rocksdb/tree/master/java/src/main/java/org/rocksdb + href: https://github.com/facebook/rocksdb/tree/main/java/src/main/java/org/rocksdb category: external - title: Support diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_docs/getting-started.md mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_docs/getting-started.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_docs/getting-started.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_docs/getting-started.md 2025-05-19 
16:14:27.000000000 +0000 @@ -73,6 +73,6 @@ Here are some specific details about the RocksDB implementation: -- [Architecture Guide](https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide) -- [Format of an immutable Table file](https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format) -- [Format of a log file](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format) +- [RocksDB Overview](https://github.com/facebook/rocksdb/wiki/RocksDB-Overview) +- [Immutable BlockBased Table file format](https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format) +- [Log file format](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_includes/doc.html mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_includes/doc.html --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_includes/doc.html 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_includes/doc.html 2025-05-19 16:14:27.000000000 +0000 @@ -18,7 +18,7 @@ {% else %} {{ content }} -

Edit on GitHub

+

Edit on GitHub

{% endif %} {% include doc_paging.html %} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ The usual Mapnik workflow is to load the map data into a SQL-based database and then define map layers with SQL statements. To render a tile, Mapnik needs to execute a couple of SQL queries. The benefit of this approach is that you don't need to reload your database when you change your map style. You can just change your SQL query and Mapnik picks it up. In our model, we decided to precompute the features we need for each tile. We need to know the map style before we create the database. However, when rendering the map tile, we only fetch the features that we need to render. -We haven't open sourced the RocksDB Mapnik plugin or the database loading pipeline. However, the spatial indexing is available in RocksDB under a name [SpatialDB](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/utilities/spatial_db.h). The API is focused on map rendering use-case, but we hope that it can also be used for other spatial-based applications. +We haven't open sourced the RocksDB Mapnik plugin or the database loading pipeline. However, the spatial indexing is available in RocksDB under a name [SpatialDB](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/utilities/spatial_db.h). The API is focused on map rendering use-case, but we hope that it can also be used for other spatial-based applications. Let's take a tour of the API. 
When you create a spatial database, you specify the spatial indexes that need to be built. Each spatial index is defined by a bounding box and granularity. For map rendering, we create a spatial index for each zoom levels. Higher zoom levels have more granularity. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -138,7 +138,7 @@ ## The API -The GetThreadList API is defined in [include/rocksdb/env.h](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/env.h#L317-L318), which is an Env +The GetThreadList API is defined in [include/rocksdb/env.h](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/env.h#L317-L318), which is an Env function: ```c++ @@ -151,7 +151,7 @@ The `GetThreadList()` API simply returns a vector of `ThreadStatus`, each describes the current status of a thread. The `ThreadStatus` structure, defined in -[include/rocksdb/thread_status.h](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/thread_status.h), contains the following information: +[include/rocksdb/thread_status.h](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/thread_status.h), contains the following information: ```c++ // An unique ID for the thread. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -9,14 +9,14 @@ ## 4.8.0 (5/2/2016) -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-change-1)Public API Change +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-change-1)Public API Change * Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes. * Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see [https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F) * Expose estimate of per-level compression ratio via DB property: "rocksdb.compression-ratio-at-levelN". * Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status. -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#new-features-2)New Features +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#new-features-2)New Features * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size. 
@@ -24,25 +24,25 @@ -## [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#470-482016)4.7.0 (4/8/2016) +## [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#470-482016)4.7.0 (4/8/2016) -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-change-2)Public API Change +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-change-2)Public API Change * rename options compaction_measure_io_stats to report_bg_io_stats and include flush too. * Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See [https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File](https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File) for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options.
-## [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#460-3102016)4.6.0 (3/10/2016) +## [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#460-3102016)4.6.0 (3/10/2016) -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-changes-1)Public API Changes +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-changes-1)Public API Changes * Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier * Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly. * Tickers [NUMBER_DB_NEXT, NUMBER_DB_PREV, NUMBER_DB_NEXT_FOUND, NUMBER_DB_PREV_FOUND, ITER_BYTES_READ] are not updated immediately. The are updated when the Iterator is deleted. * Add monotonically increasing counter (DB property "rocksdb.current-super-version-number") that increments upon any change to the LSM tree. -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#new-features-3)New Features +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#new-features-3)New Features * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. 
* Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -34,4 +34,4 @@ } ``` -You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string in the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/master/examples/simple_example.cc) demonstrates that with more examples. +You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string in the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/main/examples/simple_example.cc) demonstrates that with more examples. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -21,6 +21,6 @@ ### Success story: MyRocks -Some applications that use RocksDB, already have other machinsims in place to provide reliability. MySQL for example uses 2PC (two-phase commit) to write to both binlog as well as the storage engine such as InnoDB and MyRocks. 
The group commit logic in MySQL allows the 1st phase (Prepare) to be run in parallel but after a commit group is formed performs the 2nd phase (Commit) in a serial manner. This makes low commit latency in the storage engine essential for acheiving high throughput. The commit in MyRocks includes writing to the RocksDB WAL, which as explaiend above, by default incures the latency of flushing the WAL new appends to the OS buffer. +Some applications that use RocksDB, already have other machinsims in place to provide reliability. MySQL for example uses 2PC (two-phase commit) to write to both binlog as well as the storage engine such as InnoDB and MyRocks. The group commit logic in MySQL allows the 1st phase (Prepare) to be run in parallel but after a commit group is formed performs the 2nd phase (Commit) in a serial manner. This makes low commit latency in the storage engine essential for achieving high throughput. The commit in MyRocks includes writing to the RocksDB WAL, which as explaiend above, by default incures the latency of flushing the WAL new appends to the OS buffer. Since binlog helps in recovering from some failure scenarios, MySQL can provide reliability without however needing a storage WAL flush after each individual commit. MyRocks benefits from this property, disables automatic WAL flush in RocksDB, and manually calls `::FlushWAL` when requested by MySQL. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -15,17 +15,17 @@ ### Overview Experts share their wisdom as rules comprising of conditions and suggestions in the INI format (refer -[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)). +[rules.ini](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rules.ini)). Users provide the Rocksdb configuration that they want to improve upon (as the familiar Rocksdb OPTIONS file — -[example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)) +[example](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini)) and the path of the file which contains Rocksdb logs and statistics. -The [Advisor](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser_example.py) +The [Advisor](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser_example.py) creates appropriate DataSource objects (for Rocksdb -[logs](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_log_parser.py), -[options](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_options_parser.py), -[statistics](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_stats_fetcher.py) etc.) -and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser.py). 
+[logs](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_log_parser.py), +[options](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_options_parser.py), +[statistics](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_stats_fetcher.py) etc.) +and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser.py). The Rules uses rules from experts to parse data-sources and trigger appropriate rules. The Advisor's output gives information about which rules were triggered, why they were triggered and what each of them suggests. Each suggestion @@ -55,4 +55,4 @@ ### Read more -For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/master/tools/advisor/README.md). +For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/main/tools/advisor/README.md). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +--- +title: (Call For Contribution) Make Universal Compaction More Incremental +layout: post +author: sdong +category: blog +--- + +### Motivation + +Universal Compaction is an important compaction style, but few changes were made after we made the structure multi-leveled. Yet the major restriction of always compacting full sorted run is not relaxed. Compared to Leveled Compaction, where we usually only compile several SST files together, in universal compaction, we frequently compact GBs of data. Two issues with this gap: 1. 
it makes it harder to unify universal and leveled compaction; 2. periodically data is fully compacted, and in the mean time space is doubled. To ease the problem, we can break the restriction and do similar as leveled compaction, and bring it closer to unified compaction. + +We call for help for making following improvements. + + +### How Universal Compaction Works + +In universal, whole levels are compacted together to satisfy two conditions (See [wiki page](https://github.com/facebook/rocksdb/wiki/Universal-Compaction) for more details): + +1. total size / bottommost level size > a threshold, or +2. total number of sorted runs (non-0 levels + L0 files) is within a threshold + +1 is to limit extra space overhead used for dead data and 2 is for read performance. + +If 1 is triggered, likely a full compaction will be triggered. If 2 is triggered, RocksDB compact some sorted runs to bring the number down. It does it by using a simple heuristic so that less writes needed for that purpose over time: it starts from compacting smaller files, but if total size to compact is similar to or larger than size of the next level, it will take that level together, as soon on (whether it is the best heuristic is another question and we’ve never seriously looked at it). + +### How We Can Improve? + +Let’s start from condition 1. Here we do full compaction but is not necessary. A simple optimization would be to compact so that just enough files are merged into the bottommost level (Lmax) to satisfy condition 1. It would work if we only need to pick some files from Lmax-1, or if it is cheaper over time, we can pick some files from other levels too. + +Then condition 2. If we finish condition 1, there might be holes in some ranges in older levels. These holes might make it possible that only by compacting some sub ranges, we can fix the LSM-tree for condition 2. RocksDB can take single files into consideration and apply more sophisticated heuristic. 
+ +This new approach makes universal compaction closer to leveled compaction. The operation for 1 is closer to how Leveled compaction triggeres Lmax-1 to Lmax compaction. And 2 can potentially be implemented as something similar to level picking in Leveled Compaction. In fact, all those file picking can co-existing in one single compaction style and there isn’t fundamental conflicts to that. + +### Limitation + +There are two limitations: + +* Periodic automatic full compaction is unpleasant but at the same time is pleasant in another way. Some users might uses it to reason that everything is periodically collapsed so dead data is gone and old data is rewritten. We need to make sure periodic compaction works to continue with that. +* L0 to the first non-L0 level compaction is the first time data is partitioned in LSM-tree so that incremental compaction by range is possible. We might need to do more of these compactions in order to make incremental possible, which will increase compaction slightly. +* Compacting subset of a level would introduce some extra overhead for unaligned files, just as in leveled compaction. More SST boundary cutting heuristic can reduce this overhead but it will be there. + +But I believe the benefits would outweight the limitations. Reducing temporary space doubling and moving towards to unified compaction would be important achievements. + +### Interested in Help? + +Compaction is the core of LSM-tree, but its improvements are far overdue. If you are a user of universal compaction and would be able to benefit from those improvements, we will be happy to work with you on speeding up the project and bring them to RocksDB sooner. Feel free to communicate with us in [this issue](https://github.com/facebook/rocksdb/issues/8181). 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,101 @@ +--- +title: Integrated BlobDB +layout: post +author: ltamasi +category: blog +--- +## Background + +BlobDB is essentially RocksDB for large-value use cases. The basic idea, which was proposed in the [WiscKey paper](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf), is key-value separation: by storing large values in dedicated blob files and storing only small pointers to them in the LSM tree, we avoid copying the values over and over again during compaction, thus reducing write amplification. Historically, BlobDB supported only FIFO and TTL based use cases that can tolerate some data loss. In addition, it was incompatible with many widely used RocksDB features, and required users to adopt a custom API. In 2020, we decided to rearchitect BlobDB from the ground up, taking the lessons learned from WiscKey and the original BlobDB but also drawing inspiration and incorporating ideas from other similar systems. Our goals were to eliminate the above limitations and to create a new integrated version that enables customers to use the well-known RocksDB API, has feature parity with the core of RocksDB, and offers better performance. 
This new implementation is now available and provides the following improvements over the original: + +* **API.** In contrast with the legacy BlobDB implementation, which had its own `StackableDB`-based interface (`rocksdb::blob_db::BlobDB`), the new version can be used via the well-known `rocksdb::DB` API, and can be configured simply by using a few column family options. +* **Consistency.** With the integrated BlobDB implementation, RocksDB’s consistency guarantees and various write options (like using the WAL or synchronous writes) now apply to blobs as well. Moreover, the new BlobDB keeps track of blob files in the RocksDB MANIFEST. +* **Write performance.** When using the old BlobDB, blobs are extracted and immediately written to blob files by the BlobDB layer *in the application thread*. This has multiple drawbacks from a performance perspective: first, it requires synchronization; second, it means that expensive operations like compression are performed in the application thread; and finally, it involves flushing the blob file after each blob. The new code takes a completely different approach by *offloading blob file building to RocksDB’s background jobs*, i.e. flushes and compactions. This means that similarly to SSTs, any given blob file is now written by a single background thread, eliminating the need for locking, flushing, or performing compression in the foreground. Note that this approach is also a better fit for network-based file systems where small writes might be expensive and opens up the possibility of file format optimizations that involve buffering (like dictionary compression). +* **Read performance.** The old code relies on each read (i.e. `Get`, `MultiGet`, or iterator) taking a snapshot and uses those snapshots when deciding which obsolete blob files can be removed. 
The new BlobDB improves this by generalizing RocksDB’s Version concept, which historically referred to the set of live SST files at a given point in time, to include the set of live blob files as well. This has performance benefits like [making the read path mostly lock-free by utilizing thread-local storage](https://rocksdb.org/blog/2014/06/27/avoid-expensive-locks-in-get.html). We have also introduced a blob file cache that can be utilized to keep frequently accessed blob files open. +* **Garbage collection.** Key-value separation means that if a key pointing to a blob gets overwritten or deleted, the blob becomes unreferenced garbage. To be able to reclaim this space, BlobDB now has garbage collection capabilities. GC is integrated into the compaction process and works by relocating valid blobs residing in old blob files as they are encountered during compaction. Blob files can be marked obsolete (and eventually deleted in one shot) once they contain nothing but garbage. This is more efficient than the method used by WiscKey, which involves performing a `Get` operation to find out whether a blob is still referenced followed by a `Put` to update the reference, which in turn results in garbage collection competing and potentially conflicting with the application’s writes. +* **Feature parity with the RocksDB core.** The new BlobDB supports way more features than the original and is near feature parity with vanilla RocksDB. In particular, we support all basic read/write APIs (with the exception of `Merge`, which is coming soon), recovery, compression, atomic flush, column families, compaction filters, checkpoints, backup/restore, transactions, per-file checksums, and the SST file manager. In addition, the new BlobDB’s options can be dynamically adjusted using the `SetOptions` interface. 
+ +## API + +The new BlobDB can be configured (on a per-column family basis if needed) simply by using the following options: + +* `enable_blob_files`: set it to `true` to enable key-value separation. +* `min_blob_size`: values at or above this threshold will be written to blob files during flush or compaction. +* `blob_file_size`: the size limit for blob files. +* `blob_compression_type`: the compression type to use for blob files. All blobs in the same file are compressed using the same algorithm. +* `enable_blob_garbage_collection`: set this to `true` to make BlobDB actively relocate valid blobs from the oldest blob files as they are encountered during compaction. +* `blob_garbage_collection_age_cutoff`: the threshold that the GC logic uses to determine which blob files should be considered “old.†For example, the default value of 0.25 signals to RocksDB that blobs residing in the oldest 25% of blob files should be relocated by GC. This parameter can be tuned to adjust the trade-off between write amplification and space amplification. + +The above options are all dynamically adjustable via the `SetOptions` API; changing them will affect subsequent flushes and compactions but not ones that are already in progress. + +In terms of compaction styles, we recommend using leveled compaction with BlobDB. The rationale behind universal compaction in general is to provide lower write amplification at the expense of higher read amplification; however, as we will see later in the Performance section, BlobDB can provide very low write amp and good read performance with leveled compaction. Therefore, there is really no reason to take the hit in read performance that comes with universal compaction. + +In addition to the above, consider tuning the following non-BlobDB specific options: + +* `write_buffer_size`: this is the memtable size. You might want to increase it for large-value workloads to ensure that SST and blob files contain a decent number of keys. 
+* `target_file_size_base`: the target size of SST files. Note that even when using BlobDB, it is important to have an LSM tree with a “nice†shape and multiple levels and files per level to prevent heavy compactions. Since BlobDB extracts and writes large values to blob files, it makes sense to make this parameter significantly smaller than the memtable size. One guideline is to set `blob_file_size` to the same value as `write_buffer_size` (adjusted for compression if needed) and make `target_file_size_base` proportionally smaller based on the ratio of key size to value size. +* `max_bytes_for_level_base`: consider setting this to a multiple (e.g. 8x or 10x) of `target_file_size_base`. + +As mentioned above, the new BlobDB now also supports compaction filters. Key-value separation actually enables an optimization here: if the compaction filter of an application can make a decision about a key-value solely based on the key, it is unnecessary to read the value from the blob file. Applications can take advantage of this optimization by implementing the new `FilterBlobByKey` method of the `CompactionFilter` interface. This method gets called by RocksDB first whenever it encounters a key-value where the value is stored in a blob file. If this method returns a “final†decision like `kKeep`, `kRemove`, `kChangeValue`, or `kRemoveAndSkipUntil`, RocksDB will honor that decision; on the other hand, if the method returns `kUndetermined`, RocksDB will read the blob from the blob file and call `FilterV2` with the value in the usual fashion. + +## Performance + +We tested the performance of the new BlobDB for six different value sizes between 1 KB and 1 MB using a customized version of our [standard benchmark suite](https://github.com/facebook/rocksdb/wiki/Performance-Benchmarks) on a box with an 18-core Skylake DE CPU (running at 1.6 GHz, with hyperthreading enabled), 64 GB RAM, a 512 GB boot SSD, and two 1.88 TB M.2 SSDs in a RAID0 configuration for data. 
The RocksDB version used was equivalent to 6.18.1, with some benchmarking and statistics related enhancements. Leveled and universal compaction without key-value separation were used as reference points. Note that for simplicity, we use “leveled compaction†and “universal compaction†as shorthand for leveled and universal compaction without key-value separation, respectively, and “BlobDB†for BlobDB with leveled compaction. + +Our benchmarks cycled through six different workloads: two write-only ones (initial load and overwrite), two read/write ones (point lookup/write mix and range scan/write mix), and finally two read-only ones (point lookups and range scans). The first two phases performed a fixed amount of work (see below), while the final four were run for a fixed amount of time, namely 30 minutes each. Each phase other than the first one started with the database state left behind by the previous one. Here’s a brief description of the workloads: + +* **Initial load**: this workload has two distinct stages, a single-threaded random write stage during which compactions are disabled (so all data is flushed to L0, where it remains for the rest of the stage), followed by a full manual compaction. The random writes are performed with load-optimized settings, namely using the vector memtable implementation and with concurrent memtable writes and WAL disabled. This stage was used to populate the database with 1 TB worth of raw values, e.g. 2^30 (~1 billion) 1 KB values or 2^20 (~1 million) 1 MB values. +* **Overwrite**: this is a multi-threaded random write workload using the usual skiplist memtable, with compactions, WAL, and concurrent memtable writes enabled. In our tests, 16 writer threads were used. The total number of writes was set to the same number as in the initial load stage and split up evenly between the writer threads. 
For instance, for the 1 MB value size, we had 2^20 writes divided up between the 16 threads, resulting in each thread performing 2^16 write operations. At the end of this phase, a “wait for compactions†step was added to prevent this workload from exhibiting artificially low write amp or conversely, the next phase showing inflated write amp. +* **Point lookup/write mix**: a single writer thread performing random writes while N (in our case, 16) threads perform random point lookups. WAL is enabled and all writes are synced. +* **Range scan/write mix**: similar to the above, with one writer thread and N reader threads (where N was again set to 16 in our tests). The reader threads perform random range scans, with 10 `Next` calls per `Seek`. Again, WAL is enabled, and sync writes are used. +* **Point lookups (read-only)**: N=16 threads perform random point lookups. +* **Range scans (read-only)**: N=16 threads execute random range scans, with 10 `Next`s per `Seek` like above. + +With that out of the way, let’s see how the new BlobDB performs against traditional leveled and universal compaction. In the next few sections, we’ll be looking at write amplification as well as read and write performance. We’ll also briefly compare the write performance of the new BlobDB with the legacy implementation. + +### Write amplification + +Reducing write amp is the original motivation for key-value separation. Here, we follow RocksDB’s definition of write amplification (as used in compaction statistics and the info log). That is, we define write amp as the total amount of data written by flushes and compactions divided by the amount of data written by flushes, where “data written†includes SST files and blob files as well (if applicable). The following charts show that BlobDB significantly reduces write amplification for all of our (non-read only) workloads. 
+ +For the initial load, where due to the nature of the workload both leveled and universal already have a low write amp factor of 1.6, BlobDB has a write amp close to the theoretical minimum of 1.0, namely in the 1.0..1.02 range, depending on value size. How is this possible? Well, the trick is that when key-value separation is used, the full compaction step only has to sort the keys but not the values. This results in a write amp that is about **36% lower** than the already low write amp you get with either leveled or universal. + +In the case of the overwrite workload, BlobDB had a write amp between 1.4 and 1.7 depending on value size. This is around **75-78% lower** than the write amp of leveled compaction (6.1 to 6.8) and **70-77% lower** than universal (5.7 to 6.2); for this workload, there wasn’t a huge difference between the performance of leveled and universal. + +When it comes to the point lookup/write mix workload, BlobDB had a write amp between 1.4 and 1.8. This is **83-88% lower** than the write amp of leveled compaction, which had values between 10.8 and 12.5. Universal fared much better than leveled under this workload, and had write amp in the 2.2..6.6 range; however, BlobDB still provided significant gains for all value sizes we tested: namely, write amp was **18-77% lower** than that of universal, depending on value size. + +As for the range scan/write mix workload, BlobDB again had a write amp between 1.4 and 1.8, while leveled had values between 13.6 and 14.9, and universal was between 2.8 and 5.0. In other words, BlobDB’s write amp was **88-90% lower** than that of leveled, and **46-70% lower** than that of universal. + +![Write amplification](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Write performance + +In terms of write performance, there are other factors to consider besides write amplification. 
The following charts show some interesting metrics for the two write-only workloads (initial load and overwrite). As discussed earlier, these two workloads perform a fixed amount of work; the two charts in the top row show how long it took BlobDB, leveled, and universal to complete that work. Note that each bar is broken down into two, corresponding to the two stages of each workload (random write and full compaction for initial load, and random write and waiting for compactions for overwrite). + +For initial load, note that the random write stage takes the same amount of time regardless of which algorithm is used. This is not surprising considering the fact that compactions are disabled during this stage and thus RocksDB is simply writing L0 files (and in BlobDB’s case, blob files) as fast as it can. The second stage, on the other hand, is very different: as mentioned above, BlobDB essentially only needs to read, sort, and rewrite the keys during compaction, which can be done much much faster (with 1 MB values, more than a hundred times faster) than doing the same for large key-values. Due to this, initial load completed **2.3x to 4.7x faster** overall when using BlobDB. + +As for the overwrite workload, BlobDB performs much better during both stages. The two charts in the bottom row help explain why. In the case of both leveled and universal compaction, compactions can’t keep up with the write rate, which eventually leads to back pressure in the form of write stalls. As shown in the chart below, both leveled and universal stall between ~40% and ~70% of the time; on the other hand, BlobDB is stall-free except for the largest value size tested (1 MB). This naturally leads to higher throughput, namely **2.1x to 3.5x higher** throughput compared to leveled, and **1.6x to 3.0x higher** throughput compared to universal. 
The overwrite time chart also shows that the catch-up stage that waits for all compactions to finish is much shorter (and in fact, at larger value sizes, negligible) with BlobDB. + +![Write performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Read/write and read-only performance + +The charts below show the read performance (in terms of operations per second) of BlobDB versus leveled and universal compaction under the two read/write workloads and the two read-only workloads. BlobDB meets or exceeds the read performance of leveled compaction, except for workloads involving range scans at the two smallest value sizes tested (1 KB and 4 KB). It also provides better (in some cases, much better) read performance than universal across the board. In particular, BlobDB provides up **1.4x higher** read performance than leveled (for larger values), and up to **5.6x higher** than universal. + +![Read-write and read-only performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Comparing the two BlobDB implementations + +To compare the write performance of the new BlobDB with the legacy implementation, we ran two versions of the first (single-threaded random write) stage of the initial load benchmark using 1 KB values: one with WAL disabled, and one with WAL enabled. The new implementation completed the load **4.6x faster** than the old one without WAL, and **2.3x faster** with WAL. + +![Comparing the two BlobDB implementations](/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Future work + +There are a few remaining features that are not yet supported by the new BlobDB. 
The most important one is `Merge` (and the related `GetMergeOperands` API); in addition, we don’t currently support the `EventListener` interface, the `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` APIs, secondary instances, and ingestion of blob files. We will continue to work on closing this gap. + +We also have further plans when it comes to performance. These include optimizing garbage collection, introducing a dedicated cache for blobs, improving iterator and `MultiGet` performance, and evolving the blob file format amongst others. + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,17 @@ +--- +title: Online Validation +layout: post +author: sdong +category: blog +--- +To prevent or mitigate data corruption in RocksDB when some software or hardware issues happen, we keep adding online consistency checks and improving existing ones. + +We improved ColumnFamilyOptions::force_consistency_checks and enabled it by default. The option does some basic consistency checks on the LSM-tree, e.g., files in one level are not overlapping. The DB will be frozen from new writes if a violation is detected. Previously, the feature’s check was too limited and didn’t always freeze the DB in a timely manner. Last year, we made the checking stricter so that it can [catch much more corrupted LSM-tree structures](https://github.com/facebook/rocksdb/pull/6901). We also fixed several issues where the checking failure was swallowed without freezing the DB. After making force_consistency_checks more reliable, we changed the default value to be on. 
+ +ColumnFamilyOptions::paranoid_file_checks does some more expensive extra checking when generating a new SST file. Last year, we expanded the coverage of this feature: after every SST file is generated, the file is read back key by key and two things are checked: (1) the keys are in comparator order (also available and enabled by default during file write via ColumnFamilyOptions::check_flush_compaction_key_order); (2) the hash of all the KVs is the same as calculated when we add KVs into it. These checks detect certain corruptions so we can prevent the corrupt files from being applied to the DB. We suggest users turn it on at least in shadow environments, and consider running it in production too if you can afford the overheads. + +A recently added feature checks the count of entries added into the memtable while flushing it into an SST file. This feature provides some online coverage for memtable corruption, caused by either a software bug or a hardware issue. This feature will be released in the coming release (6.21) and will be on by default. In the future, we will check more memtable counters, e.g. number of puts or number of deletes. + +We also improved the reporting of online validation errors to improve debuggability. For example, failure to parse a corrupt key now reports details about the corrupt key. Since we did not want to expose key data in logs, error messages, etc., by default, this reporting is opt-in via DBOptions::allow_data_in_errors. + +More online checking features are planned and some are more sophisticated, including key/value checksums and sample-based query validation. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,195 @@ +--- +title: RocksDB Secondary Cache +layout: post +author: anand1976 +category: blog +--- +## Introduction + +The RocksDB team is implementing support for a block cache on non-volatile media, such as a local flash device or NVM/SCM. It can be viewed as an extension of RocksDB’s current volatile block cache (LRUCache or ClockCache). The non-volatile block cache acts as a second tier cache that contains blocks evicted from the volatile cache. Those blocks are then promoted to the volatile cache as they become hotter due to access. + +This feature is meant for cases where the DB is located on remote storage or cloud storage. The non-volatile cache is officially referred to in RocksDB as the SecondaryCache. By maintaining a SecondaryCache that’s an order of magnitude larger than DRAM, fewer reads would be required from remote storage, thus reducing read latency as well as network bandwidth consumption. + +From the user point of view, the local flash cache will support the following requirements - + +1. Provide a pointer to a secondary cache when opening a DB +2. Be able to share the secondary cache across DBs in the same process +3. Have multiple secondary caches on a host +4. 
Support persisting the cache across process restarts and reboots by ensuring repeatability of the cache key + +![Architecture](/static/images/rocksdb-secondary-cache/arch_diagram.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Design + +When designing the API for a SecondaryCache, we had a choice between making it visible to the RocksDB code (table reader) or hiding it behind the RocksDB block cache. There are several advantages of hiding it behind the block cache - + +* Allows flexibility in insertion of blocks into the secondary cache. A block can be inserted on eviction from the RAM tier, or it could be eagerly inserted. +* It makes the rest of the RocksDB code less complex by providing a uniform interface regardless of whether a secondary cache is configured or not +* Makes parallel reads, peeking in the cache for prefetching, failure handling etc. easier +* Makes it easier to extend to compressed data if needed, and allows other persistent media, such as PM, to be added as an additional tier + + +We decided to make the secondary cache transparent to the rest of RocksDB code by hiding it behind the block cache. A key issue that we needed to address was the allocation and ownership of memory of the cached items - insertion into the secondary cache may require that memory be allocated by the same. This means that parts of the cached object that can be transferred to the secondary cache needs to be copied out (referred to as **unpacking**), and on a lookup the data stored in the secondary cache needs to be provided to the object constructor (referred to as **packing**). For RocksDB cached objects such as data blocks, index and filter blocks, and compression dictionaries, unpacking involves copying out the raw uncompressed BlockContents of the block, and packing involves constructing the corresponding block/index/filter/dictionary object using the raw uncompressed data. 
+ +Another alternative we considered was the existing PersistentCache interface. However, we decided to not pursue it and eventually deprecate it for the following reasons - +* It is exposed directly to the table reader code, which makes it more difficult to implement different policies such as inclusive/exclusive cache, as well as extending it to more sophisticated admission control policies +* The interface does not allow for custom memory allocation and object packing/unpacking, so new APIs would have to be defined anyway +* The current PersistentCache implementation is very simple and does not have any admission control policies + +## API + +The interface between RocksDB’s block cache and the secondary cache is designed to allow pluggable implementations. For FB internal usage, we plan to use Cachelib with a wrapper to provide the plug-in implementation and use folly and other fbcode libraries, which cannot be used directly by RocksDB, to efficiently implement the cache operations. The following diagrams show the flow of insertion and lookup of a block. + +![Insert flow](/static/images/rocksdb-secondary-cache/insert_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Lookup flow](/static/images/rocksdb-secondary-cache/lookup_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +An item in the secondary cache is referenced by a SecondaryCacheHandle. The handle may not be immediately ready or have a valid value. The caller can call IsReady() to determine if its ready, and can call Wait() in order to block until it becomes ready. The caller must call Value() after it becomes ready to determine if the item was successfully read. Value() must return nullptr on failure. 
+ +``` +class SecondaryCacheHandle { + public: + virtual ~SecondaryCacheHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; +``` + +The user of the secondary cache (for example, BlockBasedTableReader indirectly through LRUCache) must implement the callbacks defined in CacheItemHelper, in order to facilitate the unpacking/packing of objects for saving to and restoring from the secondary cache. The CreateCallback must be implemented to construct a cacheable object from the raw data in secondary cache. + +``` + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. 
+ struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + // typedef std::function + // CreateCallback; + using CreateCallback = std::function; +``` + +The secondary cache provider must provide a concrete implementation of the SecondaryCache abstract class. + +``` +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +class SecondaryCache { + public: + virtual ~SecondaryCache() {} + + virtual std::string Name() = 0; + + static const std::string Type() { return "SecondaryCache"; } + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. 
The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready. This would be used + // by MultiGet, for example, to read multiple data blocks in parallel + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const = 0; +}; +``` + +A SecondaryCache is configured by the user by providing a pointer to it in LRUCacheOptions - +``` +struct LRUCacheOptions { + ... + // A SecondaryCache instance to use as an additional cache tier + std::shared_ptr secondary_cache; + ... +}; +``` + +## Current Status + +The initial RocksDB support for the secondary cache has been merged into the main branch, and will be available in the 6.21 release. This includes providing a way for the user to configure a secondary cache when instantiating RocksDB’s LRU cache (volatile block cache), spilling blocks evicted from the LRU cache to the flash cache, promoting a block read from the SecondaryCache to the LRU cache, and updating tools such as cache_bench and db_bench to specify a flash cache. The relevant PRs are [#8271](https://github.com/facebook/rocksdb/pull/8271), [#8191](https://github.com/facebook/rocksdb/pull/8191), and [#8312](https://github.com/facebook/rocksdb/pull/8312). + +We prototyped an end-to-end solution, with the above PRs as well as a Cachelib-based implementation of the SecondaryCache. We ran a mixgraph benchmark to simulate a realistic read/write workload. The results showed a 15% gain with the local flash cache over no local cache, and a ~25-30% reduction in network reads with a corresponding decrease in cache misses. 
+ +![Throughput](/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Hit Rate](/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Future Work + +In the short term, we plan to do the following in order to fully integrate the SecondaryCache with RocksDB - + +1. Use DB session ID as the cache key prefix to ensure uniqueness and repeatability +2. Optimize flash cache usage of MultiGet and iterator workloads +3. Stress testing +4. More benchmarking + +Longer term, we plan to deploy this in production at Facebook. + +## Call to Action + +We are hoping for a community contribution of a secondary cache implementation, which would make this feature usable by the broader RocksDB userbase. If you are interested in contributing, please reach out to us in [this issue](https://github.com/facebook/rocksdb/issues/8347). + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,157 @@ +--- +title: Preset Dictionary Compression +layout: post +author: ajkr +category: blog +--- + +## Summary + +Compression algorithms relying on an adaptive dictionary, such as LZ4, zstd, and zlib, struggle to achieve good compression ratios on small inputs when using the basic compress API. +With the basic compress API, the compressor starts with an empty dictionary. +With small inputs, not much content gets added to the dictionary during the compression. 
+Combined, these factors suggest the dictionary will never have enough contents to achieve great compression ratios. + +RocksDB groups key-value pairs into data blocks before storing them in files. +For use cases that are heavy on random accesses, smaller data block size is sometimes desirable for reducing I/O and CPU spent reading blocks. +However, as explained above, smaller data block size comes with the downside of worse compression ratio when using the basic compress API. + +Fortunately, zstd and other libraries offer advanced compress APIs that preset the dictionary. +A preset dictionary makes it possible for the compressor to start from a useful state instead of from an empty one, making compression immediately effective. + +RocksDB now optionally takes advantage of these dictionary presetting APIs. +The challenges in integrating this feature into the storage engine were more substantial than apparent on the surface. +First, we need to target a preset dictionary to the relevant data. +Second, preset dictionaries need to be trained from data samples, which need to be gathered. +Third, preset dictionaries need to be persisted since they are needed at decompression time. +Fourth, overhead in accessing the preset dictionary must be minimized to prevent regression in critical code paths. +Fifth, we need easy-to-use measurement to evaluate candidate use cases and production impact. + +In production, we have deployed dictionary presetting to save space in multiple RocksDB use cases with data block size 8KB or smaller. +We have measured meaningful benefit to compression ratio in use cases with data block size up to 16KB. +We have also measured a use case that can save both CPU and space by reducing data block size and turning on dictionary presetting at the same time. + +## Feature design +#### Targeting + +Over time we have considered a few possibilities for the scope of a dictionary. 
+ +- Subcompaction +- SST file +- Column family + +The original choice was subcompaction scope. +This enabled an approach with minimal buffering overhead because we could collect samples while generating the first output SST file. +The dictionary could then be trained and applied to subsequent SST files in the same subcompaction. + +However, we found a large use case where the proximity of data in the keyspace was more correlated with its similarity than we had predicted. +In particular, the approach of training a dictionary on an adjacent file yielded substantially worse ratios than training the dictionary on the same file it would be used to compress. +In response to this finding, we changed the preset dictionary scope to per SST file. + +With this change in approach, we had to face the problem we had hoped to avoid: how can we compress all of an SST file's data blocks with the same preset dictionary while that dictionary can only be trained after many data blocks have been sampled? +The solutions we considered both involved a new overhead. +We could read the input more than once and introduce I/O overhead, or we could buffer the uncompressed output file data blocks until a dictionary is trained, introducing memory overhead. +We chose to take the hit on memory overhead. + +Another approach that we considered was associating multiple dictionaries with a column family. +For example, in MyRocks there could be a dictionary trained on data from each large table. +When compressing a data block, we would look at the table to which its data belongs and pick the corresponding dictionary. +However, this approach would introduce many challenges. +RocksDB would need to be aware of the key schema to know where are the table boundaries. +RocksDB would also need to periodically update the dictionaries to account for changes in data pattern. +It would need somewhere to store dictionaries at column family scope. 
+Overall, we thought these challenges were too difficult to pursue the approach. + +#### Training + +![](/static/images/dictcmp/dictcmp_raw_sampled.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+Raw samples mode (`zstd_max_train_bytes == 0`) +

+ +As mentioned earlier, the approach we took is to build the dictionary from buffered uncompressed data blocks. +The first row of data blocks in these diagrams illustrate this buffering. +The second row illustrates training samples selected from the buffered blocks. +In raw samples mode (above), the final dictionary is simply the concatenation of these samples. +Whereas, in zstd training mode (below), these samples will be passed to the trainer to produce the final dictionary. + +![](/static/images/dictcmp/dictcmp_zstd_trained.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+zstd training mode (`zstd_max_train_bytes > 0`) +

+ +#### Compression path + +Once the preset dictionary is generated by the above process, we apply it to the buffered data blocks and write them to the output file. +Thereafter, newly generated data blocks are immediately compressed and written out. + +One optimization here is available to zstd v0.7.0+ users. +Instead of deserializing the dictionary on each compress invocation, we can do that work once and reuse it. +A `ZSTD_CDict` holds this digested dictionary state and is passed to the compress API. + +#### Persistence + +When an SST file's data blocks are compressed using a preset dictionary, that dictionary is stored inside the file for later use in decompression. + +![](/static/images/dictcmp/dictcmp_sst_blocks.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+SST file layout with the preset dictionary in its own (uncompressed) block +

+ +#### Decompression path + +To decompress, we need to provide both the data block and the dictionary used to compress it. +Since dictionaries are just blocks in a file, we access them through block cache. +However this additional load on block cache can be problematic. +It can be alleviated by pinning the dictionaries to avoid going through the LRU locks. + +An optimization analogous to the digested dictionary exists for certain zstd users (see User API section for details). +When enabled, the block cache stores the digested dictionary state for decompression (`ZSTD_DDict`) instead of the block contents. +In some cases we have seen decompression CPU decrease overall when enabling dictionary thanks to this optimization. + +#### Measurement + +Typically our first step in evaluating a candidate use case is an offline analysis of the data. +This gives us a quick idea whether presetting dictionary will be beneficial without any code, config, or data changes. +Our `sst_dump` tool reports what size SST files would have been using specified compression libraries and options. +We can select random SST files and compare the size with vs. without dictionary. + +When that goes well, the next step is to see how it works in a live DB, like a production shadow or canary. +There we can observe how it affects application/system metrics. + +Even after dictionary is enabled, there is the question of how much space was finally saved. +We provide a way to A/B test size with vs. without dictionary while running in production. +This feature picks a sample of data blocks to compress in multiple ways -- one of the outputs is stored, while the other outputs are thrown away after counting their size. +Due to API limitations, the stored output always has to be the dictionary-compressed one, so this feature can only be used after enabling dictionary. +The size with and without dictionary are stored in the SST file as table properties. 
+These properties can be aggregated across all SST files in a DB (and across all DBs in a tier) to learn the final space saving. + +## User API + +RocksDB allows presetting compression dictionary for users of LZ4, zstd, and zlib. +The most advanced capabilities are available to zstd v1.1.4+ users who statically link (see below). +Newer versions of zstd (v1.3.6+) have internal changes to the dictionary trainer and digested dictionary management, which significantly improve memory and CPU efficiency. + +Run-time settings: + +- `CompressionOptions::max_dict_bytes`: Limit on per-SST file dictionary size. Increasing this causes dictionaries to consume more space and memory for the possibility of better data block compression. A typical value we use is 16KB. +- (**zstd only**) `CompressionOptions::zstd_max_train_bytes`: Limit on training data passed to zstd dictionary trainer. Larger values cause the training to consume more CPU (and take longer) while generating more effective dictionaries. The starting point guidance we received from zstd team is to set it to 100x `CompressionOptions::max_dict_bytes`. +- `CompressionOptions::max_dict_buffer_bytes`: Limit on data buffering from which training samples are gathered. By default we buffer up to the target file size per ongoing background job. If this amount of memory is concerning, this option can constrain the buffering with the downside that training samples will cover a smaller portion of the SST file. Work is ongoing to charge this memory usage to block cache so it will not need to be accounted for separately. +- `BlockBasedTableOptions::cache_index_and_filter_blocks`: Controls whether metadata blocks including dictionary are accessed through block cache or held in table reader memory (yes, its name is outdated). +- `BlockBasedTableOptions::metadata_cache_options`: Controls what metadata blocks are pinned in block cache. Pinning avoids LRU contention at the risk of cold blocks holding memory. 
+- `ColumnFamilyOptions::sample_for_compression`: Controls frequency of measuring extra compressions on data blocks using various libraries with default settings (i.e., without preset dictionary). + +Compile-time setting: + +- (**zstd only**) `EXTRA_CXXFLAGS=-DZSTD_STATIC_LINKING_ONLY`: Hold digested dictionaries in block cache to save repetitive deserialization overhead. This saves a lot of CPU for read-heavy workloads. This compiler flag is necessary because one of the digested dictionary APIs we use is marked as experimental. We still use it in production, however. + +Function: + +- `DB::GetPropertiesOfAllTables()`: The properties `kSlowCompressionEstimatedDataSize` and `kFastCompressionEstimatedDataSize` estimate what the data block size (`kDataSize`) would have been if the corresponding compression library had been used. These properties are only present when `ColumnFamilyOptions::sample_for_compression` causes one or more samples to be measured, and they become more accurate with higher sampling frequency. + +Tool: + +- `sst_dump --command=recompress`: Offline analysis tool that reports what the SST file size would have been using the specified compression library and options. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,281 @@ +--- +title: Ribbon Filter +layout: post +author: pdillinger +category: blog +--- + +## Summary +Since version 6.15 last year, RocksDB supports Ribbon filters, a new +alternative to Bloom filters that save space, especially memory, at +the cost of more CPU usage, mostly in constructing the filters in the +background. 
Most applications with long-lived data (many hours or +longer) will likely benefit from adopting a Ribbon+Bloom hybrid filter +policy. Here we explain why and how. + +[Ribbon filter on RocksDB wiki](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter) + +[Ribbon filter paper](https://arxiv.org/abs/2103.02515) + +## Problem & background +Bloom filters play a critical role in optimizing point queries and +some range queries in LSM-tree storage systems like RocksDB. Very +large DBs can use 10% or more of their RAM memory for (Bloom) filters, +so that (average case) read performance can be very good despite high +(worst case) read amplification, [which is useful for lowering write +and/or space +amplification](http://smalldatum.blogspot.com/2015/11/read-write-space-amplification-pick-2_23.html). +Although the `format_version=5` Bloom filter in RocksDB is extremely +fast, all Bloom filters use around 50% more space than is +theoretically possible for a hashed structure configured for the same +false positive (FP) rate and number of keys added. What would it take +to save that significant share of “wasted†filter memory, and when +does it make sense to use such a Bloom alternative? + +A number of alternatives to Bloom filters were known, especially for +static filters (not modified after construction), but all the +previously known structures were unsatisfying for SSTs because of some +combination of +* Not enough space savings for CPU increase. For example, [Xor + filters](https://arxiv.org/abs/1912.08258) use 3-4x more CPU than + Bloom but only save 15-20% of + space. [GOV](https://arxiv.org/pdf/1603.04330.pdf) can save around + 30% space but requires around 10x more CPU than Bloom. +* Inconsistent space savings. 
[Cuckoo + filters](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) + and Xor+ filters offer significant space savings for very low FP + rates (high bits per key) but little or no savings for higher FP + rates (low bits per key). ([Higher FP rates are considered best for + largest levels of + LSM.](https://stratos.seas.harvard.edu/files/stratos/files/monkeykeyvaluestore.pdf)) + [Spatially-coupled Xor + filters](https://arxiv.org/pdf/2001.10500.pdf) require very large + number of keys per filter for large space savings. +* Inflexible configuration. No published alternatives offered the same + continuous configurability of Bloom filters, where any FP rate and + any fractional bits per key could be chosen. This flexibility + improves memory efficiency with the `optimize_filters_for_memory` + option that minimizes internal fragmentation on filters. + +## Ribbon filter development and implementation +The Ribbon filter came about when I developed a faster, simpler, and +more adaptable algorithm for constructing a little-known [Xor-based +structure from Dietzfelbinger and +Walzer](https://arxiv.org/pdf/1907.04750.pdf). It has very good space +usage for required CPU time (~30% space savings for 3-4x CPU) and, +with some engineering, Bloom-like configurability. The complications +were managable for use in RocksDB: +* Ribbon space efficiency does not naturally scale to very large + number of keys in a single filter (whole SST file or partition), but + with the current 128-bit Ribbon implementation in RocksDB, even 100 + million keys in one filter saves 27% space vs. Bloom rather than 30% + for 100,000 keys in a filter. +* More temporary memory is required during construction, ~230 bits per + key for 128-bit Ribbon vs. ~75 bits per key for Bloom filter. A + quick calculation shows that if you are saving 3 bits per key on the + generated filter, you only need about 50 generated filters in memory + to offset this temporary memory usage. 
(Thousands of filters in + memory is typical.) Starting in RocksDB version 6.27, this temporary + memory can be accounted for under block cache using + `BlockBasedTableOptions::reserve_table_builder_memory`. +* Ribbon filter queries use relatively more CPU for lower FP rates + (but still O(1) relative to number of keys added to filter). This + should be OK because lower FP rates are only appropriate when then + cost of a false positive is very high (worth extra query time) or + memory is not so constrained (can use Bloom instead). + +Future: data in [the paper](https://arxiv.org/abs/2103.02515) suggests +that 32-bit Balanced Ribbon (new name: [Bump-Once +Ribbon](https://arxiv.org/pdf/2109.01892.pdf)) would improve all of +these issues and be better all around (except for code complexity). + +## Ribbon vs. Bloom in RocksDB configuration +Different applications and hardware configurations have different +constraints, but we can use hardware costs to examine and better +understand the trade-off between Bloom and Ribbon. + +### Same FP rate, RAM vs. CPU hardware cost +Under ideal conditions where we can adjust our hardware to suit the +application, in terms of dollars, how much does it cost to construct, +query, and keep in memory a Bloom filter vs. a Ribbon filter? The +Ribbon filter costs more for CPU but less for RAM. Importantly, the +RAM cost directly depends on how long the filter is kept in memory, +which in RocksDB is essentially the lifetime of the filter. +(Temporary RAM during construction is so short-lived that it is +ignored.) Using some consumer hardware and electricity prices and a +predicted balance between construction and queries, we can compute a +“break even†duration in memory. To minimize cost, filters with a +lifetime shorter than this should be Bloom and filters with a lifetime +longer than this should be Ribbon. 
(Python code) + +``` +# Commodity prices based roughly on consumer prices and rough guesses +# Upfront cost of a CPU per hardware thread +upfront_dollars_per_cpu_thread = 30.0 + +# CPU average power usage per hardware thread +watts_per_cpu_thread = 3.5 + +# Upfront cost of a GB of RAM +upfront_dollars_per_gb_ram = 8.0 + +# RAM average power usage per GB +# https://www.crucial.com/support/articles-faq-memory/how-much-power-does-memory-use +watts_per_gb_ram = 0.375 + +# Estimated price of power per kilowatt-hour, including overheads like conversion losses and cooling +dollars_per_kwh = 0.35 + +# Assume 3 year hardware lifetime +hours_per_lifetime = 3 * 365 * 24 +seconds_per_lifetime = hours_per_lifetime * 60 * 60 + +# Number of filter queries per key added in filter construction is heavily dependent on workload. +# When replication is in layer above RocksDB, it will be low, likely < 1. When replication is in +# storage layer below RocksDB, it will likely be > 1. Using a rough and general guesstimate. +key_query_per_construct = 1.0 + +#================================== +# Bloom & Ribbon filter performance +typical_bloom_bits_per_key = 10.0 +typical_ribbon_bits_per_key = 7.0 + +# Speeds here are sensitive to many variables, especially query speed because it +# is so dependent on memory latency. Using this benchmark here: +# for IMPL in 2 3; do +# ./filter_bench -impl=$IMPL -quick -m_keys_total_max=200 -use_full_block_reader +# done +# and "Random filter" queries. 
+nanoseconds_per_construct_bloom_key = 32.0 +nanoseconds_per_construct_ribbon_key = 140.0 + +nanoseconds_per_query_bloom_key = 500.0 +nanoseconds_per_query_ribbon_key = 600.0 + +#================================== +# Some constants +kwh_per_watt_lifetime = hours_per_lifetime / 1000.0 +bits_per_gb = 8 * 1024 * 1024 * 1024 + +#================================== +# Crunching the numbers +# on CPU for constructing filters +dollars_per_cpu_thread_lifetime = upfront_dollars_per_cpu_thread + watts_per_cpu_thread * kwh_per_watt_lifetime * dollars_per_kwh +dollars_per_cpu_thread_second = dollars_per_cpu_thread_lifetime / seconds_per_lifetime + +dollars_per_construct_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_bloom_key / 10**9 +dollars_per_construct_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_ribbon_key / 10**9 + +dollars_per_query_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_query_bloom_key / 10**9 +dollars_per_query_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_query_ribbon_key / 10**9 + +dollars_per_bloom_key_cpu = dollars_per_construct_bloom_key + key_query_per_construct * dollars_per_query_bloom_key +dollars_per_ribbon_key_cpu = dollars_per_construct_ribbon_key + key_query_per_construct * dollars_per_query_ribbon_key + +# on holding filters in RAM +dollars_per_gb_ram_lifetime = upfront_dollars_per_gb_ram + watts_per_gb_ram * kwh_per_watt_lifetime * dollars_per_kwh +dollars_per_gb_ram_second = dollars_per_gb_ram_lifetime / seconds_per_lifetime + +dollars_per_bloom_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_bloom_bits_per_key +dollars_per_ribbon_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_ribbon_bits_per_key + +#================================== +# How many seconds does it take for the added cost of constructing a ribbon filter instead +# of bloom to be offset by the added cost of holding the bloom filter in memory? 
+break_even_seconds = (dollars_per_ribbon_key_cpu - dollars_per_bloom_key_cpu) / (dollars_per_bloom_key_in_ram_second - dollars_per_ribbon_key_in_ram_second) +print(break_even_seconds) +# -> 3235.1647730256936 +``` + +So roughly speaking, filters that live in memory for more than an hour +should be Ribbon, and filters that live less than an hour should be +Bloom. This is very interesting, but how long do filters live in +RocksDB? + +First let's consider the average case. Write-heavy RocksDB loads are +often backed by flash storage, which has some specified write +endurance for its intended lifetime. This can be expressed as *device +writes per day* (DWPD), and supported DWPD is typically < 10.0 even +for high end devices (excluding NVRAM). Roughly speaking, the DB would +need to be writing at a rate of 20+ DWPD for data to have an average +lifetime of less than one hour. Thus, unless you are prematurely +burning out your flash or massively under-utilizing available storage, +using the Ribbon filter has the better cost profile *on average*. + +### Predictable lifetime +But we can do even better than optimizing for the average case. LSM +levels give us very strong data lifetime hints. Data in L0 might live +for minutes or a small number of hours. Data in Lmax might live for +days or weeks. So even if Ribbon filters weren't the best choice on +average for a workload, they almost certainly make sense for the +larger, longer-lived levels of the LSM. As of RocksDB 6.24, you can +specify a minimum LSM level for Ribbon filters with +`NewRibbonFilterPolicy`, and earlier levels will use Bloom filters. + +### Resident filter memory +The above analysis assumes that nearly all filters for all live SST +files are resident in memory. This is true if using +`cache_index_and_filter_blocks=0` and `max_open_files=-1` (defaults), +but `cache_index_and_filter_blocks=1` is popular. 
In that case, +if you use `optimize_filters_for_hits=1` and non-partitioned filters +(a popular MyRocks configuration), it is also likely that nearly all +live filters are in memory. However, if you don't use +`optimize_filters_for_hits` and use partitioned filters, then +cold data (by age or by key range) can lead to only a portion of +filters being resident in memory. In that case, benefit from Ribbon +filter is not as clear, though because Ribbon filters are smaller, +they are more efficient to read into memory. + +RocksDB version 6.21 and later include a rough feature to determine +block cache usage for data blocks, filter blocks, index blocks, etc. +Data like this is periodically dumped to LOG file +(`stats_dump_period_sec`): + +``` +Block cache entry stats(count,size,portion): DataBlock(441761,6.82 GB,75.765%) FilterBlock(3002,1.27 GB,14.1387%) IndexBlock(17777,887.75 MB,9.63267%) Misc(1,0.00 KB,0%) +Block cache LRUCache@0x7fdd08104290#7004432 capacity: 9.00 GB collections: 2573 last_copies: 10 last_secs: 0.143248 secs_since: 0 +``` + +This indicates that at this moment in time, the block cache object +identified by `LRUCache@0x7fdd08104290#7004432` (potentially used +by multiple DBs) uses roughly 14% of its 9GB, about 1.27 GB, on filter +blocks. This same data is available through `DB::GetMapProperty` with +`DB::Properties::kBlockCacheEntryStats`, and (with some effort) can +be compared to total size of all filters (not necessarily in memory) +using `rocksdb.filter.size` from +`DB::Properties::kAggregatedTableProperties`. + +### Sanity checking lifetime +Can we be sure that using filters even makes sense for such long-lived +data? We can apply [the current 5 minute rule for caching SSD data in +RAM](http://renata.borovica-gajic.com/data/adms2017_5minuterule.pdf). A +4KB filter page holds data for roughly 4K keys. 
If we assume at least +one negative (useful) filter query in its lifetime per added key, it +can satisfy the 5 minute rule with a lifetime of up to about two +weeks. Thus, the lifetime threshold for “no filter†is about 300x +higher than the lifetime threshold for Ribbon filter. + +### What to do with saved memory +The default way to improve overall RocksDB performance with more +available memory is to use more space for caching, which improves +latency, CPU load, read IOs, etc. With +`cache_index_and_filter_blocks=1`, savings in filters will +automatically make room for caching more data blocks in block +cache. With `cache_index_and_filter_blocks=0`, consider increasing +block cache size. + +Using the space savings to lower filter FP rates is also an option, +but there is less evidence for this commonly improving existing +*optimized* configurations. + +## Generic recommendation +If using `NewBloomFilterPolicy(bpk)` for a large persistent DB using +compression, try using `NewRibbonFilterPolicy(bpk)` instead, which +will generate Ribbon filters during compaction and Bloom filters +for flush, both with the same FP rate as the old setting. Once new SST +files are generated under the new policy, this should free up some +memory for more caching without much effect on burst or sustained +write speed. Both kinds of filters can be read under either policy, so +there's always an option to adjust settings or gracefully roll back to +using Bloom filter only (keeping in mind that SST files must be +replaced to see effect of that change). 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_top-level/support.md mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_top-level/support.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_top-level/support.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_top-level/support.md 2025-05-19 16:14:27.000000000 +0000 @@ -19,4 +19,4 @@ ### FAQ -Check out a list of [commonly asked questions](/docs/support/faq) about RocksDB. +Check out a list of [commonly asked questions](https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ) about RocksDB. Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png and 
/srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png and 
/srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png differ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,464 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "env/composite_env_wrapper.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// The CompositeEnvWrapper class provides an interface that is compatible +// with the old monolithic Env API, and an implementation that wraps around +// the new Env that provides threading and other OS related functionality, and +// the new FileSystem API that provides storage functionality. By +// providing the old Env interface, it allows the rest of RocksDB code to +// be agnostic of whether the underlying Env implementation is a monolithic +// Env or an Env + FileSystem. In the former case, the user will specify +// Options::env only, whereas in the latter case, the user will specify +// Options::env and Options::file_system. 
+ +class CompositeSequentialFileWrapper : public SequentialFile { + public: + explicit CompositeSequentialFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(n, io_opts, result, scratch, &dbg); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomAccessFileWrapper : public RandomAccessFile { + public: + explicit CompositeRandomAccessFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + IOOptions io_opts; + IODebugContext dbg; + std::vector fs_reqs; + Status status; + + fs_reqs.resize(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].offset = reqs[i].offset; + fs_reqs[i].len = reqs[i].len; + fs_reqs[i].scratch = reqs[i].scratch; + fs_reqs[i].status = IOStatus::OK(); + } + status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].result = fs_reqs[i].result; + reqs[i].status = fs_reqs[i].status; + } + return status; + } + Status Prefetch(uint64_t offset, 
size_t n) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Prefetch(offset, n, io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((FSRandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + std::unique_ptr target_; +}; + +class CompositeWritableFileWrapper : public WritableFile { + public: + explicit CompositeWritableFileWrapper(std::unique_ptr& t) + : target_(std::move(t)) {} + + Status Append(const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, &dbg); + } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, verification_info, &dbg); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, &dbg); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, verification_info, + &dbg); + } + Status Truncate(uint64_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Truncate(size, io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return 
target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->GetFileSize(io_opts, &dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->RangeSync(offset, nbytes, io_opts, &dbg); + } + + void PrepareWrite(size_t offset, size_t len) override { + IOOptions io_opts; + IODebugContext dbg; + target_->PrepareWrite(offset, len, io_opts, &dbg); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Allocate(offset, len, io_opts, &dbg); + } + + std::unique_ptr* target() { return &target_; } + + private: + std::unique_ptr target_; +}; + +class 
CompositeRandomRWFileWrapper : public RandomRWFile { + public: + explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Write(offset, data, io_opts, &dbg); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeDirectoryWrapper : public Directory { + public: + explicit CompositeDirectoryWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; +} // namespace + +Status CompositeEnv::NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + 
r->reset(new CompositeSequentialFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeRandomAccessFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewWritableFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file, + &dbg); + if (status.ok()) { + result->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReuseWritableFile(fname, old_fname, + FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); + if (status.ok()) { + result->reset(new CompositeRandomRWFileWrapper(file)); + } + return status; +} + +Status 
CompositeEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + IOOptions io_opts; + IODebugContext dbg; + std::unique_ptr dir; + Status status; + status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); + if (status.ok()) { + result->reset(new CompositeDirectoryWrapper(dir)); + } + return status; +} + +namespace { +static std::unordered_map + composite_env_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + {0, OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kDontSerialize | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto target = static_cast(addr); + return Env::CreateFromString(opts, value, &(target->env), + &(target->guard)); + }, + nullptr, nullptr}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map + composite_fs_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"file_system", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map + composite_clock_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"clock", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +} // namespace + +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs) { + return std::unique_ptr(new CompositeEnvWrapper(Env::Default(), fs)); +} + +CompositeEnvWrapper::CompositeEnvWrapper(Env* env, + const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), target_(env) { + RegisterOptions("", &target_, &composite_env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info); +} + +CompositeEnvWrapper::CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), 
target_(env) { + RegisterOptions("", &target_, &composite_env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info); +} + +Status CompositeEnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + if (file_system_ == nullptr) { + file_system_ = target_.env->GetFileSystem(); + } + if (system_clock_ == nullptr) { + system_clock_ = target_.env->GetSystemClock(); + } + return Env::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string CompositeEnvWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto options = CompositeEnv::SerializeOptions(config_options, header); + if (target_.env != nullptr && target_.env != Env::Default()) { + options.append("target="); + options.append(target_.env->ToString(config_options)); + } + return options; +} +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env_wrapper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,1111 +7,366 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" -namespace ROCKSDB_NAMESPACE { - -// The CompositeEnvWrapper class provides an interface that is compatible -// with the old monolithic Env API, and an implementation that wraps around -// the new Env that provides threading and other OS related functionality, and -// the new FileSystem API that provides storage functionality. By -// providing the old Env interface, it allows the rest of RocksDB code to -// be agnostic of whether the underlying Env implementation is a monolithic -// Env or an Env + FileSystem. 
In the former case, the user will specify -// Options::env only, whereas in the latter case, the user will specify -// Options::env and Options::file_system. - -inline IOStatus status_to_io_status(Status&& status) { - if (status.ok()) { - // Fast path - return IOStatus::OK(); - } else { - const char* state = status.getState(); - if (state) { - return IOStatus(status.code(), status.subcode(), - Slice(state, strlen(status.getState()) + 1), - Slice()); - } else { - return IOStatus(status.code(), status.subcode()); - } - } -} - -class CompositeSequentialFileWrapper : public SequentialFile { - public: - explicit CompositeSequentialFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(size_t n, Slice* result, char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(n, io_opts, result, scratch, &dbg); - } - Status Skip(uint64_t n) override { return target_->Skip(n); } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeRandomAccessFileWrapper : public RandomAccessFile { - public: - explicit CompositeRandomAccessFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { - IOOptions io_opts; - 
IODebugContext dbg; - std::vector fs_reqs; - Status status; - - fs_reqs.resize(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].offset = reqs[i].offset; - fs_reqs[i].len = reqs[i].len; - fs_reqs[i].scratch = reqs[i].scratch; - fs_reqs[i].status = IOStatus::OK(); - } - status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); - for (size_t i = 0; i < num_reqs; ++i) { - reqs[i].result = fs_reqs[i].result; - reqs[i].status = fs_reqs[i].status; - } - return status; - } - Status Prefetch(uint64_t offset, size_t n) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Prefetch(offset, n, io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((FSRandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - private: - std::unique_ptr target_; -}; - -class CompositeWritableFileWrapper : public WritableFile { - public: - explicit CompositeWritableFileWrapper(std::unique_ptr& t) - : target_(std::move(t)) {} - - Status Append(const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Append(data, io_opts, &dbg); - } - Status PositionedAppend(const Slice& data, uint64_t offset) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedAppend(data, offset, io_opts, &dbg); - } - Status Truncate(uint64_t size) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Truncate(size, io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - Status Flush() override { 
- IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->GetFileSize(io_opts, &dbg); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } - - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->RangeSync(offset, nbytes, io_opts, &dbg); - } - - void PrepareWrite(size_t offset, size_t len) override { - IOOptions io_opts; - IODebugContext dbg; - target_->PrepareWrite(offset, len, io_opts, &dbg); - } - - Status Allocate(uint64_t offset, uint64_t len) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Allocate(offset, len, io_opts, &dbg); - } - - std::unique_ptr* target() { return &target_; } - - 
private: - std::unique_ptr target_; -}; - -class CompositeRandomRWFileWrapper : public RandomRWFile { - public: - explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status Write(uint64_t offset, const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Write(offset, data, io_opts, &dbg); - } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status Flush() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeDirectoryWrapper : public Directory { - public: - explicit CompositeDirectoryWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif - private: - std::unique_ptr target_; -}; +namespace ROCKSDB_NAMESPACE { -class CompositeEnvWrapper : public Env { +class CompositeEnv : public Env { public: // Initialize a CompositeEnvWrapper that delegates all 
thread/time related // calls to env, and all file operations to fs - explicit CompositeEnvWrapper(Env* env, FileSystem* fs) - : env_target_(env), fs_env_target_(fs) {} - ~CompositeEnvWrapper() {} + explicit CompositeEnv(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : Env(fs, clock) {} - // Return the target to which this Env forwards all calls - Env* env_target() const { return env_target_; } - - FileSystem* fs_env_target() const { return fs_env_target_; } + Status RegisterDbPaths(const std::vector& paths) override { + return file_system_->RegisterDbPaths(paths); + } + Status UnregisterDbPaths(const std::vector& paths) override { + return file_system_->UnregisterDbPaths(paths); + } // The following text is boilerplate that forwards all methods to target() Status NewSequentialFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - fs_env_target_->NewSequentialFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeSequentialFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = fs_env_target_->NewRandomAccessFile(f, FileOptions(options), &file, - &dbg); - if (status.ok()) { - r->reset(new CompositeRandomAccessFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - fs_env_target_->NewWritableFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status 
ReopenWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = fs_env_target_->ReopenWritableFile(fname, FileOptions(options), - &file, &dbg); - if (status.ok()) { - result->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = fs_env_target_->ReuseWritableFile( - fname, old_fname, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewRandomRWFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = fs_env_target_->NewRandomRWFile(fname, FileOptions(options), &file, - &dbg); - if (status.ok()) { - result->reset(new CompositeRandomRWFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override { - return fs_env_target_->NewMemoryMappedFileBuffer(fname, result); + return file_system_->NewMemoryMappedFileBuffer(fname, result); } + Status NewDirectory(const std::string& name, - std::unique_ptr* result) override { - IOOptions io_opts; - IODebugContext dbg; - std::unique_ptr dir; - Status status; - status = fs_env_target_->NewDirectory(name, io_opts, &dir, &dbg); - if (status.ok()) { - result->reset(new CompositeDirectoryWrapper(dir)); - } - return status; - } + std::unique_ptr* result) override; + Status FileExists(const std::string& f) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->FileExists(f, 
io_opts, &dbg); + return file_system_->FileExists(f, io_opts, &dbg); } Status GetChildren(const std::string& dir, std::vector* r) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetChildren(dir, io_opts, r, &dbg); + return file_system_->GetChildren(dir, io_opts, r, &dbg); } Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetChildrenFileAttributes(dir, io_opts, result, - &dbg); + return file_system_->GetChildrenFileAttributes(dir, io_opts, result, &dbg); } Status DeleteFile(const std::string& f) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->DeleteFile(f, io_opts, &dbg); + return file_system_->DeleteFile(f, io_opts, &dbg); } Status Truncate(const std::string& fname, size_t size) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->Truncate(fname, size, io_opts, &dbg); + return file_system_->Truncate(fname, size, io_opts, &dbg); } Status CreateDir(const std::string& d) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->CreateDir(d, io_opts, &dbg); + return file_system_->CreateDir(d, io_opts, &dbg); } Status CreateDirIfMissing(const std::string& d) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->CreateDirIfMissing(d, io_opts, &dbg); + return file_system_->CreateDirIfMissing(d, io_opts, &dbg); } Status DeleteDir(const std::string& d) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->DeleteDir(d, io_opts, &dbg); + return file_system_->DeleteDir(d, io_opts, &dbg); } Status GetFileSize(const std::string& f, uint64_t* s) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetFileSize(f, io_opts, s, &dbg); + return file_system_->GetFileSize(f, io_opts, s, &dbg); } Status GetFileModificationTime(const std::string& fname, uint64_t* file_mtime) override { IOOptions io_opts; IODebugContext dbg; - 
return fs_env_target_->GetFileModificationTime(fname, io_opts, file_mtime, - &dbg); + return file_system_->GetFileModificationTime(fname, io_opts, file_mtime, + &dbg); } Status RenameFile(const std::string& s, const std::string& t) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->RenameFile(s, t, io_opts, &dbg); + return file_system_->RenameFile(s, t, io_opts, &dbg); } Status LinkFile(const std::string& s, const std::string& t) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->LinkFile(s, t, io_opts, &dbg); + return file_system_->LinkFile(s, t, io_opts, &dbg); } Status NumFileLinks(const std::string& fname, uint64_t* count) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->NumFileLinks(fname, io_opts, count, &dbg); + return file_system_->NumFileLinks(fname, io_opts, count, &dbg); } Status AreFilesSame(const std::string& first, const std::string& second, bool* res) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->AreFilesSame(first, second, io_opts, res, &dbg); + return file_system_->AreFilesSame(first, second, io_opts, res, &dbg); } Status LockFile(const std::string& f, FileLock** l) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->LockFile(f, io_opts, l, &dbg); + return file_system_->LockFile(f, io_opts, l, &dbg); } Status UnlockFile(FileLock* l) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->UnlockFile(l, io_opts, &dbg); + return file_system_->UnlockFile(l, io_opts, &dbg); } Status GetAbsolutePath(const std::string& db_path, std::string* output_path) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetAbsolutePath(db_path, io_opts, output_path, &dbg); + return file_system_->GetAbsolutePath(db_path, io_opts, output_path, &dbg); } -#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) - Status LoadLibrary(const std::string& lib_name, - const std::string& search_path, - 
std::shared_ptr* result) override { - return env_target_->LoadLibrary(lib_name, search_path, result); - } -#endif - - void Schedule(void (*f)(void* arg), void* a, Priority pri, - void* tag = nullptr, void (*u)(void* arg) = nullptr) override { - return env_target_->Schedule(f, a, pri, tag, u); - } - - int UnSchedule(void* tag, Priority pri) override { - return env_target_->UnSchedule(tag, pri); - } - - void StartThread(void (*f)(void*), void* a) override { - return env_target_->StartThread(f, a); - } - void WaitForJoin() override { return env_target_->WaitForJoin(); } - unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { - return env_target_->GetThreadPoolQueueLen(pri); - } - Status GetTestDirectory(std::string* path) override { - return env_target_->GetTestDirectory(path); - } Status NewLogger(const std::string& fname, std::shared_ptr* result) override { - return env_target_->NewLogger(fname, result); - } - uint64_t NowMicros() override { return env_target_->NowMicros(); } - uint64_t NowNanos() override { return env_target_->NowNanos(); } - uint64_t NowCPUNanos() override { return env_target_->NowCPUNanos(); } - - void SleepForMicroseconds(int micros) override { - env_target_->SleepForMicroseconds(micros); - } - Status GetHostName(char* name, uint64_t len) override { - return env_target_->GetHostName(name, len); - } - Status GetCurrentTime(int64_t* unix_time) override { - return env_target_->GetCurrentTime(unix_time); - } - void SetBackgroundThreads(int num, Priority pri) override { - return env_target_->SetBackgroundThreads(num, pri); - } - int GetBackgroundThreads(Priority pri) override { - return env_target_->GetBackgroundThreads(pri); - } - - Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { - return env_target_->SetAllowNonOwnerAccess(allow_non_owner_access); - } - - void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - return env_target_->IncBackgroundThreadsIfNeeded(num, pri); - } - - void 
LowerThreadPoolIOPriority(Priority pool = LOW) override { - env_target_->LowerThreadPoolIOPriority(pool); - } - - void LowerThreadPoolCPUPriority(Priority pool = LOW) override { - env_target_->LowerThreadPoolCPUPriority(pool); - } - - std::string TimeToString(uint64_t time) override { - return env_target_->TimeToString(time); - } - - Status GetThreadList(std::vector* thread_list) override { - return env_target_->GetThreadList(thread_list); + IOOptions io_opts; + IODebugContext dbg; + return file_system_->NewLogger(fname, io_opts, result, &dbg); } - ThreadStatusUpdater* GetThreadStatusUpdater() const override { - return env_target_->GetThreadStatusUpdater(); + Status IsDirectory(const std::string& path, bool* is_dir) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->IsDirectory(path, io_opts, is_dir, &dbg); } - uint64_t GetThreadID() const override { return env_target_->GetThreadID(); } - - std::string GenerateUniqueId() override { - return env_target_->GenerateUniqueId(); + Status GetTestDirectory(std::string* path) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetTestDirectory(io_opts, path, &dbg); } EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { - return fs_env_target_->OptimizeForLogRead(FileOptions(env_options)); + return file_system_->OptimizeForLogRead(FileOptions(env_options)); } + EnvOptions OptimizeForManifestRead( const EnvOptions& env_options) const override { - return fs_env_target_->OptimizeForManifestRead( - FileOptions(env_options)); + return file_system_->OptimizeForManifestRead(FileOptions(env_options)); } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const override { - return fs_env_target_->OptimizeForLogWrite(FileOptions(env_options), - db_options); + return file_system_->OptimizeForLogWrite(FileOptions(env_options), + db_options); } + EnvOptions OptimizeForManifestWrite( const EnvOptions& env_options) const 
override { - return fs_env_target_->OptimizeForManifestWrite( - FileOptions(env_options)); + return file_system_->OptimizeForManifestWrite(FileOptions(env_options)); } + EnvOptions OptimizeForCompactionTableWrite( const EnvOptions& env_options, const ImmutableDBOptions& immutable_ops) const override { - return fs_env_target_->OptimizeForCompactionTableWrite( - FileOptions(env_options), - immutable_ops); + return file_system_->OptimizeForCompactionTableWrite( + FileOptions(env_options), immutable_ops); } EnvOptions OptimizeForCompactionTableRead( const EnvOptions& env_options, const ImmutableDBOptions& db_options) const override { - return fs_env_target_->OptimizeForCompactionTableRead( - FileOptions(env_options), - db_options); + return file_system_->OptimizeForCompactionTableRead( + FileOptions(env_options), db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForBlobFileRead(FileOptions(env_options), + db_options); } + // This seems to clash with a macro on Windows, so #undef it here +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetFreeSpace(path, io_opts, diskfree, &dbg); + return file_system_->GetFreeSpace(path, io_opts, diskfree, &dbg); } + uint64_t NowMicros() override { return system_clock_->NowMicros(); } + uint64_t NowNanos() override { return system_clock_->NowNanos(); } - private: - Env* env_target_; - FileSystem* fs_env_target_; -}; + uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); } -class LegacySequentialFileWrapper : public FSSequentialFile { - public: - explicit LegacySequentialFileWrapper( - std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) 
override { - return status_to_io_status(target_->Read(n, result, scratch)); - } - IOStatus Skip(uint64_t n) override { - return status_to_io_status(target_->Skip(n)); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } - IOStatus PositionedRead(uint64_t offset, size_t n, - const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->PositionedRead(offset, n, result, scratch)); + void SleepForMicroseconds(int micros) override { + system_clock_->SleepForMicroseconds(micros); } - SequentialFile* target() { return target_.get(); } - private: - std::unique_ptr target_; -}; - -class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { - public: - explicit LegacyRandomAccessFileWrapper( - std::unique_ptr&& target) - : target_(std::move(target)) {} - - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); - } - IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - std::vector reqs; - Status status; - - reqs.reserve(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - ReadRequest req; - - req.offset = fs_reqs[i].offset; - req.len = fs_reqs[i].len; - req.scratch = fs_reqs[i].scratch; - req.status = Status::OK(); - - reqs.emplace_back(req); - } - status = target_->MultiRead(reqs.data(), num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].result = reqs[i].result; - fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); - } - return 
status_to_io_status(std::move(status)); - ; - } - IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Prefetch(offset, n)); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((RandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); + Status GetCurrentTime(int64_t* unix_time) override { + return system_clock_->GetCurrentTime(unix_time); } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); + std::string TimeToString(uint64_t time) override { + return system_clock_->TimeToString(time); } - - private: - std::unique_ptr target_; }; -class LegacyWritableFileWrapper : public FSWritableFile { +class CompositeEnvWrapper : public CompositeEnv { public: - explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Append(const Slice& data, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Append(data)); - } - IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->PositionedAppend(data, offset)); - } - IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(size)); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - 
return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return target_->GetFileSize(); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnvWrapper(Env* env) + : CompositeEnvWrapper(env, env->GetFileSystem(), env->GetSystemClock()) {} + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& sc); + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const 
std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& fs, + const std::shared_ptr& sc); + + static const char* kClassName() { return "CompositeEnv"; } + const char* Name() const override { return kClassName(); } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return CompositeEnv::IsInstanceOf(name); + } } + const Customizable* Inner() const override { return target_.env; } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE - IOStatus RangeSync(uint64_t offset, uint64_t nbytes, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RangeSync(offset, nbytes)); - } + // Return the target to which this Env forwards all calls + Env* env_target() const { return target_.env; } - void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - target_->PrepareWrite(offset, len); +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_.env->LoadLibrary(lib_name, search_path, result); } +#endif - IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Allocate(offset, len)); + void Schedule(void 
(*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return target_.env->Schedule(f, a, pri, tag, u); } - WritableFile* target() { return target_.get(); } - - private: - std::unique_ptr target_; -}; - -class LegacyRandomRWFileWrapper : public FSRandomRWFile { - public: - explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus Write(uint64_t offset, const Slice& data, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Write(offset, data)); - } - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); + int UnSchedule(void* tag, Priority pri) override { + return target_.env->UnSchedule(tag, pri); } - private: - std::unique_ptr target_; -}; - -class LegacyDirectoryWrapper : public FSDirectory { - public: - explicit LegacyDirectoryWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); + 
void StartThread(void (*f)(void*), void* a) override { + return target_.env->StartThread(f, a); } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); + void WaitForJoin() override { return target_.env->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return target_.env->GetThreadPoolQueueLen(pri); } - private: - std::unique_ptr target_; -}; - -class LegacyFileSystemWrapper : public FileSystem { - public: - // Initialize an EnvWrapper that delegates all calls to *t - explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} - ~LegacyFileSystemWrapper() override {} - - const char* Name() const override { return "Legacy File System"; } - - // Return the target to which this Env forwards all calls - Env* target() const { return target_; } - - // The following text is boilerplate that forwards all methods to target() - IOStatus NewSequentialFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewSequentialFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacySequentialFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomAccessFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomAccessFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewWritableFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } 
- IOStatus ReopenWritableFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->ReopenWritableFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomRWFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomRWFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyRandomRWFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) override { - return status_to_io_status( - target_->NewMemoryMappedFileBuffer(fname, result)); - } - IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr dir; - Status s = target_->NewDirectory(name, &dir); - if (s.ok()) { - result->reset(new LegacyDirectoryWrapper(std::move(dir))); - } - return status_to_io_status(std::move(s)); - } - IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->FileExists(f)); - } - IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, - std::vector* r, - IODebugContext* /*dbg*/) override { - 
return status_to_io_status(target_->GetChildren(dir, r)); - } - IOStatus GetChildrenFileAttributes(const std::string& dir, - const IOOptions& /*options*/, - std::vector* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); - } - IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteFile(f)); - } - IOStatus Truncate(const std::string& fname, size_t size, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(fname, size)); - } - IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDir(d)); - } - IOStatus CreateDirIfMissing(const std::string& d, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDirIfMissing(d)); - } - IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteDir(d)); + Status GetHostName(char* name, uint64_t len) override { + return target_.env->GetHostName(name, len); } - IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, - uint64_t* s, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFileSize(f, s)); + void SetBackgroundThreads(int num, Priority pri) override { + return target_.env->SetBackgroundThreads(num, pri); } - - IOStatus GetFileModificationTime(const std::string& fname, - const IOOptions& /*options*/, - uint64_t* file_mtime, - IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->GetFileModificationTime(fname, file_mtime)); + int GetBackgroundThreads(Priority pri) override { + return target_.env->GetBackgroundThreads(pri); } - IOStatus GetAbsolutePath(const std::string& db_path, - 
const IOOptions& /*options*/, - std::string* output_path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access); } - IOStatus RenameFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RenameFile(s, t)); + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return target_.env->IncBackgroundThreadsIfNeeded(num, pri); } - IOStatus LinkFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LinkFile(s, t)); + void LowerThreadPoolIOPriority(Priority pool) override { + target_.env->LowerThreadPoolIOPriority(pool); } - IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, - uint64_t* count, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NumFileLinks(fname, count)); + void LowerThreadPoolCPUPriority(Priority pool) override { + target_.env->LowerThreadPoolCPUPriority(pool); } - IOStatus AreFilesSame(const std::string& first, const std::string& second, - const IOOptions& /*options*/, bool* res, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->AreFilesSame(first, second, res)); + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return target_.env->LowerThreadPoolCPUPriority(pool, pri); } - IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, - FileLock** l, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LockFile(f, l)); + Status GetThreadList(std::vector* thread_list) override { + return target_.env->GetThreadList(thread_list); } - IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, - 
IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->UnlockFile(l)); + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_.env->GetThreadStatusUpdater(); } - IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetTestDirectory(path)); - } - IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, - std::shared_ptr* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NewLogger(fname, result)); - } + uint64_t GetThreadID() const override { return target_.env->GetThreadID(); } - FileOptions OptimizeForLogRead( - const FileOptions& file_options) const override { - return target_->OptimizeForLogRead(file_options); - } - FileOptions OptimizeForManifestRead( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestRead(file_options); - } - FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { - return target_->OptimizeForLogWrite(file_options, db_options); - } - FileOptions OptimizeForManifestWrite( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestWrite(file_options); - } - FileOptions OptimizeForCompactionTableWrite( - const FileOptions& file_options, - const ImmutableDBOptions& immutable_ops) const override { - return target_->OptimizeForCompactionTableWrite(file_options, - immutable_ops); - } - FileOptions OptimizeForCompactionTableRead( - const FileOptions& file_options, - const ImmutableDBOptions& db_options) const override { - return target_->OptimizeForCompactionTableRead(file_options, db_options); - } - IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, - uint64_t* diskfree, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + std::string GenerateUniqueId() 
override { + return target_.env->GenerateUniqueId(); } private: - Env* target_; + EnvWrapper::Target target_; }; - -inline std::unique_ptr NewLegacySequentialFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacySequentialFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyRandomAccessFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacyRandomAccessFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyWritableFileWrapper( - std::unique_ptr&& file) { - return std::unique_ptr( - new LegacyWritableFileWrapper(std::move(file))); -} - } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/emulated_clock.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/emulated_clock.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/emulated_clock.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/emulated_clock.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { +// A SystemClock that can "mock" sleep and counts its operations. 
+class EmulatedSystemClock : public SystemClockWrapper { + private: + // Something to return when mocking current time + const int64_t maybe_starting_time_; + std::atomic sleep_counter_{0}; + std::atomic cpu_counter_{0}; + std::atomic addon_microseconds_{0}; + // Do not modify in the env of a running DB (could cause deadlock) + std::atomic time_elapse_only_sleep_; + bool no_slowdown_; + + public: + explicit EmulatedSystemClock(const std::shared_ptr& base, + bool time_elapse_only_sleep = false); + + static const char* kClassName() { return "TimeEmulatedSystemClock"; } + const char* Name() const override { return kClassName(); } + + virtual void SleepForMicroseconds(int micros) override { + sleep_counter_++; + if (no_slowdown_ || time_elapse_only_sleep_) { + addon_microseconds_.fetch_add(micros); + } + if (!no_slowdown_) { + SystemClockWrapper::SleepForMicroseconds(micros); + } + } + + void MockSleepForMicroseconds(int64_t micros) { + sleep_counter_++; + assert(no_slowdown_); + addon_microseconds_.fetch_add(micros); + } + + void MockSleepForSeconds(int64_t seconds) { + sleep_counter_++; + assert(no_slowdown_); + addon_microseconds_.fetch_add(seconds * 1000000); + } + + void SetTimeElapseOnlySleep(bool enabled) { + // We cannot set these before destroying the last DB because they might + // cause a deadlock or similar without the appropriate options set in + // the DB. 
+ time_elapse_only_sleep_ = enabled; + no_slowdown_ = enabled; + } + + bool IsTimeElapseOnlySleep() const { return time_elapse_only_sleep_.load(); } + void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; } + bool IsMockSleepEnabled() const { return no_slowdown_; } + + int GetSleepCounter() const { return sleep_counter_.load(); } + + virtual Status GetCurrentTime(int64_t* unix_time) override { + Status s; + if (time_elapse_only_sleep_) { + *unix_time = maybe_starting_time_; + } else { + s = SystemClockWrapper::GetCurrentTime(unix_time); + } + if (s.ok()) { + // mock microseconds elapsed to seconds of time + *unix_time += addon_microseconds_.load() / 1000000; + } + return s; + } + + virtual uint64_t CPUNanos() override { + cpu_counter_++; + return SystemClockWrapper::CPUNanos(); + } + + virtual uint64_t CPUMicros() override { + cpu_counter_++; + return SystemClockWrapper::CPUMicros(); + } + + virtual uint64_t NowNanos() override { + return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowNanos()) + + addon_microseconds_.load() * 1000; + } + + virtual uint64_t NowMicros() override { + return (time_elapse_only_sleep_ ? 
0 : SystemClockWrapper::NowMicros()) + + addon_microseconds_.load(); + } + + int GetCpuCounter() const { return cpu_counter_.load(); } + + void ResetCounters() { + cpu_counter_.store(0); + sleep_counter_.store(0); + } +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,17 +10,625 @@ #include "rocksdb/env.h" #include + #include "env/composite_env_wrapper.h" +#include "env/emulated_clock.h" +#include "env/mock_env.h" +#include "env/unique_id_gen.h" #include "logging/env_logger.h" #include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" -#include "port/sys_time.h" +#include "rocksdb/convenience.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "util/autovector.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +namespace { +#ifndef ROCKSDB_LITE +static int RegisterBuiltinEnvs(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory(MockEnv::kClassName(), [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(MockEnv::Create(Env::Default())); + return guard->get(); + }); + library.AddFactory( + CompositeEnvWrapper::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CompositeEnvWrapper(Env::Default())); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +static void RegisterSystemEnvs() { +#ifndef ROCKSDB_LITE + static std::once_flag loaded; + 
std::call_once(loaded, [&]() { + RegisterBuiltinEnvs(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE +} + +class LegacySystemClock : public SystemClock { + private: + Env* env_; + + public: + explicit LegacySystemClock(Env* env) : env_(env) {} + const char* Name() const override { return "LegacySystemClock"; } + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + uint64_t NowMicros() override { return env_->NowMicros(); } + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + uint64_t NowNanos() override { return env_->NowNanos(); } + + uint64_t CPUMicros() override { return CPUNanos() / 1000; } + uint64_t CPUNanos() override { return env_->NowCPUNanos(); } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + void SleepForMicroseconds(int micros) override { + env_->SleepForMicroseconds(micros); + } + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + Status GetCurrentTime(int64_t* unix_time) override { + return env_->GetCurrentTime(unix_time); + } + // Converts seconds-since-Jan-01-1970 to a printable string + std::string TimeToString(uint64_t time) override { + return env_->TimeToString(time); + } + +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& /*config_options*/, + const std::string& /*prefix*/) const override { + // We do not want the LegacySystemClock to appear in the serialized output. + // This clock is an internal class for those who do not implement one and + // would be part of the Env. 
As such, do not serialize it here. + return ""; + } +#endif // ROCKSDB_LITE +}; + +class LegacySequentialFileWrapper : public FSSequentialFile { + public: + explicit LegacySequentialFileWrapper( + std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Read(n, result, scratch)); + } + IOStatus Skip(uint64_t n) override { + return status_to_io_status(target_->Skip(n)); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + IOStatus PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->PositionedRead(offset, n, result, scratch)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { + public: + explicit LegacyRandomAccessFileWrapper( + std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + + IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + std::vector reqs; + Status status; + + reqs.reserve(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest req; + + req.offset = fs_reqs[i].offset; + req.len = fs_reqs[i].len; + req.scratch = fs_reqs[i].scratch; + req.status = Status::OK(); + + reqs.emplace_back(req); + } + status 
= target_->MultiRead(reqs.data(), num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].result = reqs[i].result; + fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); + } + return status_to_io_status(std::move(status)); + } + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Prefetch(offset, n)); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((RandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomRWFileWrapper : public FSRandomRWFile { + public: + explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Write(offset, data)); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* 
/*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + + private: + std::unique_ptr target_; +}; + +class LegacyWritableFileWrapper : public FSWritableFile { + public: + explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(size)); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus 
Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return target_->GetFileSize(); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RangeSync(offset, nbytes)); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + target_->PrepareWrite(offset, len); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Allocate(offset, len)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyDirectoryWrapper : public FSDirectory { + public: + explicit LegacyDirectoryWrapper(std::unique_ptr&& target) + : 
target_(std::move(target)) {} + + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; + +class LegacyFileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} + ~LegacyFileSystemWrapper() override {} + + static const char* kClassName() { return "LegacyFileSystem"; } + const char* Name() const override { return kClassName(); } + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewSequentialFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacySequentialFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomAccessFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewWritableFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReopenWritableFile(const std::string& fname, + 
const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReopenWritableFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomRWFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyRandomRWFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return status_to_io_status( + target_->NewMemoryMappedFileBuffer(fname, result)); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr dir; + Status s = target_->NewDirectory(name, &dir); + if (s.ok()) { + result->reset(new LegacyDirectoryWrapper(std::move(dir))); + } + return status_to_io_status(std::move(s)); + } + IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->FileExists(f)); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, + std::vector* r, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildren(dir, r)); + 
} + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteFile(f)); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(fname, size)); + } + IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDir(d)); + } + IOStatus CreateDirIfMissing(const std::string& d, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDirIfMissing(d)); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteDir(d)); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFileSize(f, s)); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->GetFileModificationTime(fname, file_mtime)); + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RenameFile(s, t)); + } 
+ + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LinkFile(s, t)); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, + uint64_t* count, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NumFileLinks(fname, count)); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& /*options*/, bool* res, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->AreFilesSame(first, second, res)); + } + + IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, + FileLock** l, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LockFile(f, l)); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->UnlockFile(l)); + } + + IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetTestDirectory(path)); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NewLogger(fname, result)); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeEnvOptions(opts); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions 
OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, + uint64_t* diskfree, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->IsDirectory(path, is_dir)); + } + +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& /*config_options*/, + const std::string& /*prefix*/) const override { + // We do not want the LegacyFileSystem to appear in the serialized output. + // This clock is an internal class for those who do not implement one and + // would be part of the Env. As such, do not serialize it here. 
+ return ""; + } +#endif // ROCKSDB_LITE + private: + Env* target_; +}; +} // end anonymous namespace + +Env::Env() : thread_status_updater_(nullptr) { + file_system_ = std::make_shared(this); + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs) + : thread_status_updater_(nullptr), file_system_(fs) { + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} Env::~Env() { } @@ -31,47 +639,99 @@ } Status Env::LoadEnv(const std::string& value, Env** result) { - Env* env = *result; - Status s; -#ifndef ROCKSDB_LITE - s = ObjectRegistry::NewInstance()->NewStaticObject(value, &env); -#else - s = Status::NotSupported("Cannot load environment in LITE mode: ", value); -#endif - if (s.ok()) { - *result = env; + return CreateFromString(ConfigOptions(), value, result); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result) { + Env* base = Env::Default(); + if (value.empty() || base->IsInstanceOf(value)) { + *result = base; + return Status::OK(); + } else { + RegisterSystemEnvs(); + Env* env = *result; + Status s = LoadStaticObject(config_options, value, nullptr, &env); + if (s.ok()) { + *result = env; + } + return s; } - return s; } Status Env::LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard) { + return CreateFromString(ConfigOptions(), value, result, guard); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard) { assert(result); - Status s; -#ifndef ROCKSDB_LITE - Env* env = nullptr; - std::unique_ptr uniq_guard; - std::string err_msg; assert(guard != nullptr); - env = ObjectRegistry::NewInstance()->NewObject(value, &uniq_guard, - &err_msg); - if (!env) { - s = Status::NotFound(std::string("Cannot load ") + Env::Type() + ": " + - value); - env = 
Env::Default(); - } - if (s.ok() && uniq_guard) { - guard->reset(uniq_guard.release()); - *result = guard->get(); - } else { - *result = env; + std::unique_ptr uniq; + + Env* env = *result; + std::string id; + std::unordered_map opt_map; + + Status status = + Customizable::GetOptionsMap(config_options, env, value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; } + Env* base = Env::Default(); + if (id.empty() || base->IsInstanceOf(id)) { + env = base; + status = Status::OK(); + } else { + RegisterSystemEnvs(); +#ifndef ROCKSDB_LITE + std::string errmsg; + env = config_options.registry->NewObject(id, &uniq, &errmsg); + if (!env) { + status = Status::NotSupported( + std::string("Cannot load environment[") + id + "]: ", errmsg); + } #else - (void)result; - (void)guard; - s = Status::NotSupported("Cannot load environment in LITE mode: ", value); + status = + Status::NotSupported("Cannot load environment in LITE mode", value); #endif - return s; + } + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, env, opt_map); + } + if (status.ok()) { + guard->reset(uniq.release()); + *result = env; + } + return status; +} + +Status Env::CreateFromUri(const ConfigOptions& config_options, + const std::string& env_uri, const std::string& fs_uri, + Env** result, std::shared_ptr* guard) { + *result = config_options.env; + if (env_uri.empty() && fs_uri.empty()) { + // Neither specified. Use the default + guard->reset(); + return Status::OK(); + } else if (!env_uri.empty() && !fs_uri.empty()) { + // Both specified. Cannot choose. Return Invalid + return Status::InvalidArgument("cannot specify both fs_uri and env_uri"); + } else if (fs_uri.empty()) { // Only have an ENV URI. 
Create an Env from it + return CreateFromString(config_options, env_uri, result, guard); + } else { + std::shared_ptr fs; + Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs); + if (s.ok()) { + guard->reset(new CompositeEnvWrapper(*result, fs)); + *result = guard->get(); + } + return s; + } } std::string Env::PriorityToString(Env::Priority priority) { @@ -132,6 +792,56 @@ return Status::OK(); } +Status Env::GetHostNameString(std::string* result) { + std::array hostname_buf{}; + Status s = GetHostName(hostname_buf.data(), hostname_buf.size()); + if (s.ok()) { + hostname_buf[hostname_buf.size() - 1] = '\0'; + result->assign(hostname_buf.data()); + } + return s; +} + +std::string Env::GenerateUniqueId() { + std::string result; + bool success = port::GenerateRfcUuid(&result); + if (!success) { + // Fall back on our own way of generating a unique ID and adapt it to + // RFC 4122 variant 1 version 4 (a random ID). + // https://en.wikipedia.org/wiki/Universally_unique_identifier + // We already tried GenerateRfcUuid so no need to try it again in + // GenerateRawUniqueId + constexpr bool exclude_port_uuid = true; + uint64_t upper, lower; + GenerateRawUniqueId(&upper, &lower, exclude_port_uuid); + + // Set 4-bit version to 4 + upper = (upper & (~uint64_t{0xf000})) | 0x4000; + // Set unary-encoded variant to 1 (0b10) + lower = (lower & (~(uint64_t{3} << 62))) | (uint64_t{2} << 62); + + // Use 36 character format of RFC 4122 + result.resize(36U); + char* buf = &result[0]; + PutBaseChars<16>(&buf, 8, upper >> 32, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper >> 16, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, lower >> 48, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 12, lower, /*!uppercase*/ false); + assert(buf == &result[36]); + + // Verify variant 1 version 4 + assert(result[14] == '4'); + assert(result[19] 
== '8' || result[19] == '9' || result[19] == 'a' || + result[19] == 'b'); + } + return result; +} + SequentialFile::~SequentialFile() { } @@ -200,6 +910,14 @@ kInfoLogLevelNames[log_level], format); Logv(new_format, ap); } + + if (log_level >= InfoLogLevel::WARN_LEVEL && + log_level != InfoLogLevel::HEADER_LEVEL) { + // Log messages with severity of warning or higher should be rare and are + // sometimes followed by an unclean crash. We want to be sure important + // messages are not lost in an application buffer when that happens. + Flush(); + } } static void Logv(const InfoLogLevel log_level, Logger *info_log, const char *format, va_list ap) { @@ -361,30 +1079,74 @@ Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, bool should_sync) { - std::unique_ptr file; - EnvOptions soptions; - Status s = env->NewWritableFile(fname, &file, soptions); - if (!s.ok()) { - return s; - } - s = file->Append(data); - if (s.ok() && should_sync) { - s = file->Sync(); - } - if (!s.ok()) { - env->DeleteFile(fname); - } - return s; + const auto& fs = env->GetFileSystem(); + return WriteStringToFile(fs.get(), data, fname, should_sync); } Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { - LegacyFileSystemWrapper lfsw(env); - return ReadFileToString(&lfsw, fname, data); + const auto& fs = env->GetFileSystem(); + return ReadFileToString(fs.get(), fname, data); +} + +namespace { +static std::unordered_map env_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + {0, OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kDontSerialize | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + EnvWrapper::Target* target = static_cast(addr); + return Env::CreateFromString(opts, value, &(target->env), + &(target->guard)); + }, + nullptr, nullptr}}, +#endif // ROCKSDB_LITE +}; +} // namespace + +EnvWrapper::EnvWrapper(Env* t) : 
target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(std::unique_ptr&& t) : target_(std::move(t)) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(const std::shared_ptr& t) : target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); } EnvWrapper::~EnvWrapper() { } +Status EnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + return Env::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const { + auto parent = Env::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_.env == nullptr || + target_.env == Env::Default()) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_.env->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + namespace { // anonymous namespace void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) { @@ -445,6 +1207,12 @@ optimized_env_options.use_direct_reads = db_options.use_direct_reads; return optimized_env_options; } +EnvOptions Env::OptimizeForBlobFileRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} EnvOptions::EnvOptions(const DBOptions& options) { AssignEnvOptions(this, options); @@ -457,19 +1225,103 @@ Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result) { - EnvOptions options; + FileOptions options; // TODO: Tune the buffer size. 
options.writable_file_max_buffer_size = 1024 * 1024; - std::unique_ptr writable_file; - const auto status = env->NewWritableFile(fname, &writable_file, options); + std::unique_ptr writable_file; + const auto status = env->GetFileSystem()->NewWritableFile( + fname, options, &writable_file, nullptr); if (!status.ok()) { return status; } - *result = std::make_shared( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, options, - env); + *result = std::make_shared(std::move(writable_file), fname, + options, env); return Status::OK(); } +const std::shared_ptr& Env::GetFileSystem() const { + return file_system_; +} + +const std::shared_ptr& Env::GetSystemClock() const { + return system_clock_; +} +namespace { +static std::unordered_map sc_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +#endif // ROCKSDB_LITE +}; + +} // namespace +SystemClockWrapper::SystemClockWrapper(const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", &target_, &sc_wrapper_type_info); +} + +Status SystemClockWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = SystemClock::Default(); + } + return SystemClock::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string SystemClockWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = SystemClock::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(SystemClock::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + 
result.append("target=").append(target_->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinSystemClocks(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + EmulatedSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EmulatedSystemClock(SystemClock::Default())); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status SystemClock::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + auto clock = SystemClock::Default(); + if (clock->IsInstanceOf(value)) { + *result = clock; + return Status::OK(); + } else { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinSystemClocks(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(config_options, value, nullptr, + result); + } +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_basic_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,95 +4,122 @@ // // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include #include #include #include -#include #include "env/mock_env.h" +#include "file/file_util.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { +namespace { +using CreateEnvFunc = Env*(); -// Normalizes trivial differences across Envs such that these test cases can -// run on all Envs. 
-class NormalizingEnvWrapper : public EnvWrapper { - public: - explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} +// These functions are used to create the various environments under which this +// test can execute. These functions are used to allow the test cases to be +// created without the Env being initialized, thereby eliminating a potential +// static initialization fiasco/race condition when attempting to get a +// custom/configured env prior to main being invoked. + +static Env* GetDefaultEnv() { return Env::Default(); } + +static Env* GetMockEnv() { + static std::unique_ptr mock_env(MockEnv::Create(Env::Default())); + return mock_env.get(); +} +#ifndef ROCKSDB_LITE +static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) { + ConfigOptions config_opts; + config_opts.invoke_prepare_options = false; + + std::shared_ptr provider; + EXPECT_OK(EncryptionProvider::CreateFromString(config_opts, provider_id, + &provider)); + return NewEncryptedEnv(base, provider); +} + +static Env* GetCtrEncryptedEnv() { + static std::unique_ptr ctr_encrypt_env( + NewTestEncryptedEnv(Env::Default(), "CTR://test")); + return ctr_encrypt_env.get(); +} + +static Env* GetMemoryEnv() { + static std::unique_ptr mem_env(NewMemEnv(Env::Default())); + return mem_env.get(); +} - // Removes . and .. from directory listing - Status GetChildren(const std::string& dir, - std::vector* result) override { - Status status = EnvWrapper::GetChildren(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const std::string& s) { - return s == "." 
|| s == ".."; - }), - result->end()); +static Env* GetTestEnv() { + static std::shared_ptr env_guard; + static Env* custom_env = nullptr; + if (custom_env == nullptr) { + const char* uri = getenv("TEST_ENV_URI"); + if (uri != nullptr) { + EXPECT_OK(Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env, + &env_guard)); } - return status; } + EXPECT_NE(custom_env, nullptr); + return custom_env; +} - // Removes . and .. from directory listing - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - Status status = EnvWrapper::GetChildrenFileAttributes(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const FileAttributes& fa) { - return fa.name == "." || fa.name == ".."; - }), - result->end()); +static Env* GetTestFS() { + static std::shared_ptr fs_env_guard; + static Env* fs_env = nullptr; + if (fs_env == nullptr) { + const char* uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + EXPECT_OK( + Env::CreateFromUri(ConfigOptions(), uri, "", &fs_env, &fs_env_guard)); } - return status; } -}; + EXPECT_NE(fs_env, nullptr); + return fs_env; +} +#endif // ROCKSDB_LITE -class EnvBasicTestWithParam : public testing::Test, - public ::testing::WithParamInterface { +} // namespace +class EnvBasicTestWithParam + : public testing::Test, + public ::testing::WithParamInterface { public: Env* env_; const EnvOptions soptions_; std::string test_dir_; - EnvBasicTestWithParam() : env_(GetParam()) { + EnvBasicTestWithParam() : env_(GetParam()()) { test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); } - void SetUp() override { env_->CreateDirIfMissing(test_dir_); } + void SetUp() override { ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); } - void TearDown() override { - std::vector files; - env_->GetChildren(test_dir_, &files); - for (const auto& file : files) { - // don't know whether it's file or directory, try both. 
The tests must - // only create files or empty directories, so one must succeed, else the - // directory's corrupted. - Status s = env_->DeleteFile(test_dir_ + "/" + file); - if (!s.ok()) { - ASSERT_OK(env_->DeleteDir(test_dir_ + "/" + file)); - } - } - } + void TearDown() override { ASSERT_OK(DestroyDir(env_, test_dir_)); } }; class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; -static std::unique_ptr def_env(new NormalizingEnvWrapper(Env::Default())); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvBasicTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(&GetDefaultEnv)); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(&GetDefaultEnv)); -static std::unique_ptr mock_env(new MockEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, - ::testing::Values(mock_env.get())); + ::testing::Values(&GetMockEnv)); + #ifndef ROCKSDB_LITE -static std::unique_ptr mem_env(NewMemEnv(Env::Default())); +// next statements run env test against default encryption code. +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam, + ::testing::Values(&GetCtrEncryptedEnv)); +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam, + ::testing::Values(&GetCtrEncryptedEnv)); + INSTANTIATE_TEST_CASE_P(MemEnv, EnvBasicTestWithParam, - ::testing::Values(mem_env.get())); + ::testing::Values(&GetMemoryEnv)); namespace { @@ -101,20 +128,15 @@ // // The purpose of returning an empty vector (instead of nullptr) is that gtest // ValuesIn() will skip running tests when given an empty collection. 
-std::vector GetCustomEnvs() { - static Env* custom_env; - static bool init = false; - if (!init) { - init = true; - const char* uri = getenv("TEST_ENV_URI"); - if (uri != nullptr) { - Env::LoadEnv(uri, &custom_env); - } +std::vector GetCustomEnvs() { + std::vector res; + const char* uri = getenv("TEST_ENV_URI"); + if (uri != nullptr) { + res.push_back(&GetTestEnv); } - - std::vector res; - if (custom_env != nullptr) { - res.emplace_back(custom_env); + uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + res.push_back(&GetTestFS); } return res; } @@ -126,7 +148,6 @@ INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam, ::testing::ValuesIn(GetCustomEnvs())); - #endif // ROCKSDB_LITE TEST_P(EnvBasicTestWithParam, Basics) { @@ -190,19 +211,18 @@ soptions_) .ok()); ASSERT_TRUE(!seq_file); - ASSERT_TRUE(!env_->NewRandomAccessFile(test_dir_ + "/non_existent", - &rand_file, soptions_) - .ok()); + ASSERT_NOK(env_->NewRandomAccessFile(test_dir_ + "/non_existent", &rand_file, + soptions_)); ASSERT_TRUE(!rand_file); // Check that deleting works. 
- ASSERT_TRUE(!env_->DeleteFile(test_dir_ + "/non_existent").ok()); + ASSERT_NOK(env_->DeleteFile(test_dir_ + "/non_existent")); ASSERT_OK(env_->DeleteFile(test_dir_ + "/g")); ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g")); ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(0U, children.size()); - ASSERT_TRUE( - env_->GetChildren(test_dir_ + "/non_existent", &children).IsNotFound()); + Status s = env_->GetChildren(test_dir_ + "/non_existent", &children); + ASSERT_TRUE(s.IsNotFound()); } TEST_P(EnvBasicTestWithParam, ReadWrite) { @@ -298,7 +318,7 @@ ASSERT_OK(env_->CreateDir(test_dir_ + "/j")); ASSERT_OK(env_->FileExists(test_dir_ + "/j")); std::vector children; - env_->GetChildren(test_dir_, &children); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(1U, children.size()); // fail because file already exists ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok()); @@ -327,14 +347,14 @@ ASSERT_EQ(3U, children.size()); ASSERT_EQ(3U, childAttr.size()); for (auto each : children) { - env_->DeleteDir(test_dir_ + "/" + each); + env_->DeleteDir(test_dir_ + "/" + each).PermitUncheckedError(); } // necessary for default POSIX env // non-exist directory returns IOError ASSERT_OK(env_->DeleteDir(test_dir_)); - ASSERT_TRUE(!env_->FileExists(test_dir_).ok()); - ASSERT_TRUE(!env_->GetChildren(test_dir_, &children).ok()); - ASSERT_TRUE(!env_->GetChildrenFileAttributes(test_dir_, &childAttr).ok()); + ASSERT_NOK(env_->FileExists(test_dir_)); + ASSERT_NOK(env_->GetChildren(test_dir_, &children)); + ASSERT_NOK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); // if dir is a file, returns IOError ASSERT_OK(env_->CreateDir(test_dir_)); @@ -343,10 +363,36 @@ env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_)); ASSERT_OK(writable_file->Close()); writable_file.reset(); - ASSERT_TRUE(!env_->GetChildren(test_dir_ + "/file", &children).ok()); + ASSERT_NOK(env_->GetChildren(test_dir_ + "/file", &children)); ASSERT_EQ(0U, 
children.size()); } +TEST_P(EnvMoreTestWithParam, GetChildrenIgnoresDotAndDotDot) { + auto* env = Env::Default(); + ASSERT_OK(env->CreateDirIfMissing(test_dir_)); + + // Create a single file + std::string path = test_dir_; + const EnvOptions soptions; +#ifdef OS_WIN + path.append("\\test_file"); +#else + path.append("/test_file"); +#endif + std::string data("test data"); + std::unique_ptr file; + ASSERT_OK(env->NewWritableFile(path, &file, soptions)); + ASSERT_OK(file->Append("test data")); + + // get the children + std::vector result; + ASSERT_OK(env->GetChildren(test_dir_, &result)); + + // expect only one file named `test_data`, i.e. no `.` or `..` names + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result.at(0), "test_file"); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,28 +7,41 @@ #include "env/env_chroot.h" -#include -#include -#include -#include - -#include -#include -#include - -#include "rocksdb/status.h" +#include // errno +#include // realpath, free +#include // geteuid + +#include "env/composite_env_wrapper.h" +#include "env/fs_remap.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" // errnoStr namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map chroot_fs_type_info = { + {"chroot_dir", {0, OptionType::kString}}}; +} // namespace +ChrootFileSystem::ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir) + : RemapFileSystem(base), chroot_dir_(chroot_dir) { + RegisterOptions("chroot_dir", &chroot_dir_, &chroot_fs_type_info); +} -class ChrootEnv : public EnvWrapper { - public: - ChrootEnv(Env* 
base_env, const std::string& chroot_dir) - : EnvWrapper(base_env) { +Status ChrootFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystemWrapper::PrepareOptions(options); + if (!s.ok()) { + return s; + } else if (chroot_dir_.empty()) { + s = Status::InvalidArgument("ChRootFileSystem requires a chroot dir"); + } else { + s = target_->FileExists(chroot_dir_, IOOptions(), nullptr); + } + if (s.ok()) { #if defined(OS_AIX) char resolvedName[PATH_MAX]; - char* real_chroot_dir = realpath(chroot_dir.c_str(), resolvedName); + char* real_chroot_dir = realpath(chroot_dir_.c_str(), resolvedName); #else - char* real_chroot_dir = realpath(chroot_dir.c_str(), nullptr); + char* real_chroot_dir = realpath(chroot_dir_.c_str(), nullptr); #endif // chroot_dir must exist so realpath() returns non-nullptr. assert(real_chroot_dir != nullptr); @@ -37,231 +50,32 @@ free(real_chroot_dir); #endif } + return s; +} - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewSequentialFile(status_and_enc_path.second, result, - options); - } - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomAccessFile(status_and_enc_path.second, result, - options); - } - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewWritableFile(status_and_enc_path.second, result, - options); - } - - 
Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - auto status_and_old_enc_path = EncodePath(old_fname); - if (!status_and_old_enc_path.first.ok()) { - return status_and_old_enc_path.first; - } - return EnvWrapper::ReuseWritableFile(status_and_old_enc_path.second, - status_and_old_enc_path.second, result, - options); - } - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomRWFile(status_and_enc_path.second, result, - options); - } - - Status NewDirectory(const std::string& dir, - std::unique_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewDirectory(status_and_enc_path.second, result); - } - - Status FileExists(const std::string& fname) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::FileExists(status_and_enc_path.second); - } - - Status GetChildren(const std::string& dir, - std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetChildren(status_and_enc_path.second, result); - } - - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - 
return EnvWrapper::GetChildrenFileAttributes(status_and_enc_path.second, - result); - } - - Status DeleteFile(const std::string& fname) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteFile(status_and_enc_path.second); - } - - Status CreateDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDir(status_and_enc_path.second); - } - - Status CreateDirIfMissing(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDirIfMissing(status_and_enc_path.second); - } - - Status DeleteDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePath(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteDir(status_and_enc_path.second); - } - - Status GetFileSize(const std::string& fname, uint64_t* file_size) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileSize(status_and_enc_path.second, file_size); - } - - Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileModificationTime(status_and_enc_path.second, - file_mtime); - } - - Status RenameFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if (!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto 
status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::RenameFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LinkFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if (!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::LinkFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LockFile(const std::string& fname, FileLock** lock) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - // FileLock subclasses may store path (e.g., PosixFileLock stores it). We - // can skip stripping the chroot directory from this path because callers - // shouldn't use it. - return EnvWrapper::LockFile(status_and_enc_path.second, lock); - } - - Status GetTestDirectory(std::string* path) override { - // Adapted from PosixEnv's implementation since it doesn't provide a way to - // create directory in the chroot. 
- char buf[256]; - snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast(geteuid())); - *path = buf; - - // Directory may already exist, so ignore return - CreateDir(*path); - return Status::OK(); - } - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewLogger(status_and_enc_path.second, result); - } +IOStatus ChrootFileSystem::GetTestDirectory(const IOOptions& options, + std::string* path, + IODebugContext* dbg) { + // Adapted from PosixEnv's implementation since it doesn't provide a way to + // create directory in the chroot. + char buf[256]; + snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast(geteuid())); + *path = buf; - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override { - auto status_and_enc_path = EncodePath(db_path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetAbsolutePath(status_and_enc_path.second, output_path); - } + // Directory may already exist, so ignore return + return CreateDirIfMissing(*path, options, dbg); +} - private: // Returns status and expanded absolute path including the chroot directory. // Checks whether the provided path breaks out of the chroot. If it returns // non-OK status, the returned path should not be used. 
- std::pair EncodePath(const std::string& path) { - if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; - } - std::pair res; - res.second = chroot_dir_ + path; +std::pair ChrootFileSystem::EncodePath( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + std::pair res; + res.second = chroot_dir_ + path; #if defined(OS_AIX) char resolvedName[PATH_MAX]; char* normalized_path = realpath(res.second.c_str(), resolvedName); @@ -269,51 +83,64 @@ char* normalized_path = realpath(res.second.c_str(), nullptr); #endif if (normalized_path == nullptr) { - res.first = Status::NotFound(res.second, strerror(errno)); + res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); } else if (strlen(normalized_path) < chroot_dir_.size() || strncmp(normalized_path, chroot_dir_.c_str(), chroot_dir_.size()) != 0) { - res.first = Status::IOError(res.second, - "Attempted to access path outside chroot"); + res.first = IOStatus::IOError(res.second, + "Attempted to access path outside chroot"); } else { - res.first = Status::OK(); + res.first = IOStatus::OK(); } #if !defined(OS_AIX) free(normalized_path); #endif return res; - } +} // Similar to EncodePath() except assumes the basename in the path hasn't been // created yet. 
- std::pair EncodePathWithNewBasename( - const std::string& path) { - if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; - } - // Basename may be followed by trailing slashes - size_t final_idx = path.find_last_not_of('/'); - if (final_idx == std::string::npos) { - // It's only slashes so no basename to extract - return EncodePath(path); - } +std::pair ChrootFileSystem::EncodePathWithNewBasename( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + // Basename may be followed by trailing slashes + size_t final_idx = path.find_last_not_of('/'); + if (final_idx == std::string::npos) { + // It's only slashes so no basename to extract + return EncodePath(path); + } + + // Pull off the basename temporarily since realname(3) (used by + // EncodePath()) requires a path that exists + size_t base_sep = path.rfind('/', final_idx); + auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1)); + status_and_enc_path.second.append(path.substr(base_sep + 1)); + return status_and_enc_path; +} - // Pull off the basename temporarily since realname(3) (used by - // EncodePath()) requires a path that exists - size_t base_sep = path.rfind('/', final_idx); - auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1)); - status_and_enc_path.second.append(path.substr(base_sep + 1)); - return status_and_enc_path; +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir) { + auto chroot_fs = std::make_shared(base, chroot_dir); + Status s = chroot_fs->PrepareOptions(ConfigOptions()); + if (s.ok()) { + return chroot_fs; + } else { + return nullptr; } - - std::string chroot_dir_; -}; +} Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) { if (!base_env->FileExists(chroot_dir).ok()) { return nullptr; } - return new ChrootEnv(base_env, chroot_dir); + auto chroot_fs = 
NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir); + if (chroot_fs != nullptr) { + return new CompositeEnvWrapper(base_env, chroot_fs); + } else { + return nullptr; + } } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,13 +9,46 @@ #include -#include "rocksdb/env.h" +#include "env/fs_remap.h" +#include "rocksdb/file_system.h" namespace ROCKSDB_NAMESPACE { +class ChrootFileSystem : public RemapFileSystem { + public: + ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir); + + static const char* kClassName() { return "ChrootFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + Status PrepareOptions(const ConfigOptions& options) override; + + protected: + // Returns status and expanded absolute path including the chroot directory. + // Checks whether the provided path breaks out of the chroot. If it returns + // non-OK status, the returned path should not be used. + std::pair EncodePath(const std::string& path) override; + + // Similar to EncodePath() except assumes the basename in the path hasn't been + // created yet. + std::pair EncodePathWithNewBasename( + const std::string& path) override; + + private: + std::string chroot_dir_; +}; // Returns an Env that translates paths such that the root directory appears to // be chroot_dir. chroot_dir should refer to an existing directory. +// +// This class has not been fully analyzed for providing strong security +// guarantees. 
Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir); +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,35 +5,33 @@ #ifndef ROCKSDB_LITE +#include "rocksdb/env_encryption.h" + #include #include #include #include -#include "rocksdb/env_encryption.h" +#include "env/composite_env_wrapper.h" +#include "env/env_encryption_ctr.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/convenience.h" +#include "rocksdb/io_status.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" #include "util/aligned_buffer.h" #include "util/coding.h" #include "util/random.h" +#include "util/string_util.h" #endif - namespace ROCKSDB_NAMESPACE { - #ifndef ROCKSDB_LITE - -class EncryptedSequentialFile : public SequentialFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - uint64_t offset_; - size_t prefixLength_; - - public: - // Default ctor. Given underlying sequential file is supposed to be at - // offset == prefixLength. - EncryptedSequentialFile(SequentialFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), offset_(prefixLength), prefixLength_(prefixLength) { - } +std::shared_ptr EncryptionProvider::NewCTRProvider( + const std::shared_ptr& cipher) { + return std::make_shared(cipher); +} // Read up to "n" bytes from the file. "scratch[0..n-1]" may be // written by this routine. Sets "*result" to the data that was @@ -43,76 +41,82 @@ // If an error was encountered, returns a non-OK status. 
// // REQUIRES: External synchronization - Status Read(size_t n, Slice* result, char* scratch) override { - assert(scratch); - Status status = file_->Read(n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset_, (char*)result->data(), result->size()); - offset_ += result->size(); // We've already ready data from disk, so update offset_ even if decryption fails. - return status; +IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + IOStatus io_s = file_->Read(n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset_, (char*)result->data(), result->size())); + } + if (io_s.ok()) { + offset_ += result->size(); // We've already ready data from disk, so update + // offset_ even if decryption fails. } + return io_s; +} - // Skip "n" bytes from the file. This is guaranteed to be no - // slower that reading the same data, but may be faster. - // - // If end of file is reached, skipping will stop at the end of the - // file, and Skip will return OK. - // - // REQUIRES: External synchronization - Status Skip(uint64_t n) override { - auto status = file_->Skip(n); - if (!status.ok()) { - return status; - } - offset_ += n; +// Skip "n" bytes from the file. This is guaranteed to be no +// slower that reading the same data, but may be faster. +// +// If end of file is reached, skipping will stop at the end of the +// file, and Skip will return OK. +// +// REQUIRES: External synchronization +IOStatus EncryptedSequentialFile::Skip(uint64_t n) { + auto status = file_->Skip(n); + if (!status.ok()) { return status; } + offset_ += n; + return status; +} - // Indicates the upper layers if the current SequentialFile implementation - // uses direct IO. 
- bool use_direct_io() const override { return file_->use_direct_io(); } +// Indicates the upper layers if the current SequentialFile implementation +// uses direct IO. +bool EncryptedSequentialFile::use_direct_io() const { + return file_->use_direct_io(); +} - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } +IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override { - assert(scratch); - offset += prefixLength_; // Skip prefix - auto status = file_->PositionedRead(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - offset_ = offset + result->size(); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); - return status; +IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + offset += prefixLength_; // Skip prefix + auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + 
} + offset_ = offset + result->size(); + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } -}; - -// A file abstraction for randomly reading the contents of a file. -class EncryptedRandomAccessFile : public RandomAccessFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; - - public: - EncryptedRandomAccessFile(RandomAccessFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) { } + return io_s; +} // Read up to "n" bytes from the file starting at "offset". // "scratch[0..n-1]" may be written by this routine. Sets "*result" @@ -124,23 +128,31 @@ // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - assert(scratch); - offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); - return status; +IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto io_s = file_->Read(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } + return io_s; +} // Readahead the file starting from offset by n bytes for caching. 
- Status Prefetch(uint64_t offset, size_t n) override { - //return Status::OK(); - return file_->Prefetch(offset + prefixLength_, n); - } +IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + // return Status::OK(); + return file_->Prefetch(offset + prefixLength_, n, options, dbg); +} // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). @@ -157,343 +169,603 @@ // a single varint. // // Note: these IDs are only valid for the duration of the process. - size_t GetUniqueId(char* id, size_t max_size) const override { - return file_->GetUniqueId(id, max_size); - }; +size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return file_->GetUniqueId(id, max_size); +}; - void Hint(AccessPattern pattern) override { file_->Hint(pattern); } +void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { + file_->Hint(pattern); +} // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. - bool use_direct_io() const override { return file_->use_direct_io(); } +bool EncryptedRandomAccessFile::use_direct_io() const { + return file_->use_direct_io(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. 
- Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } -}; +IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. -class EncryptedWritableFile : public WritableFileWrapper { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; - - public: - // Default ctor. Prefix is assumed to be written already. - EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : WritableFileWrapper(f), file_(f), stream_(s), prefixLength_(prefixLength) { } - - Status Append(const Slice& data) override { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - if (data.size() > 0) { - auto offset = file_->GetFileSize(); // size including prefix - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove - // so that the next two lines can be replaced with buf.Append(). 
- memmove(buf.BufferStart(), data.data(), data.size()); - buf.Size(data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); - if (!status.ok()) { - return status; - } - dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); +IOStatus EncryptedWritableFile::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToAppend(data); + if (data.size() > 0) { + auto offset = file_->GetFileSize(options, dbg); // size including prefix + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - status = file_->Append(dataToAppend); - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } - return status; + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } + return file_->Append(dataToAppend, options, dbg); +} - Status PositionedAppend(const Slice& data, uint64_t offset) override { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - offset += prefixLength_; - if (data.size() > 0) { - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - memmove(buf.BufferStart(), data.data(), data.size()); - buf.Size(data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); - if (!status.ok()) { - return status; - } - dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); +IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; 
+ Slice dataToAppend(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - status = file_->PositionedAppend(dataToAppend, offset); - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } - return status; + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } + return file_->PositionedAppend(dataToAppend, offset, options, dbg); +} - // Indicates the upper layers if the current WritableFile implementation - // uses direct IO. - bool use_direct_io() const override { return file_->use_direct_io(); } +// Indicates the upper layers if the current WritableFile implementation +// uses direct IO. +bool EncryptedWritableFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// true if Sync() and Fsync() are safe to call concurrently with Append() +// and Flush(). +bool EncryptedWritableFile::IsSyncThreadSafe() const { + return file_->IsSyncThreadSafe(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} - /* - * Get the size of valid data in the file. - */ - uint64_t GetFileSize() override { - return file_->GetFileSize() - prefixLength_; - } +/* + * Get the size of valid data in the file. 
+ */ +uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + return file_->GetFileSize(options, dbg) - prefixLength_; +} - // Truncate is necessary to trim the file to the correct size - // before closing. It is not always possible to keep track of the file - // size due to whole pages writes. The behavior is undefined if called - // with other writes to follow. - Status Truncate(uint64_t size) override { - return file_->Truncate(size + prefixLength_); - } +// Truncate is necessary to trim the file to the correct size +// before closing. It is not always possible to keep track of the file +// size due to whole pages writes. The behavior is undefined if called +// with other writes to follow. +IOStatus EncryptedWritableFile::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Truncate(size + prefixLength_, options, dbg); +} - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. - // This call has no effect on dirty pages in the cache. - Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } - - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. - // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - return file_->RangeSync(offset + prefixLength_, nbytes); - } - - // PrepareWrite performs any necessary preparation for a write - // before the write actually occurs. 
This allows for pre-allocation - // of space on devices where it can result in less file - // fragmentation and/or less waste from over-zealous filesystem - // pre-allocation. - void PrepareWrite(size_t offset, size_t len) override { - file_->PrepareWrite(offset + prefixLength_, len); - } - - // Pre-allocates space for a file. - Status Allocate(uint64_t offset, uint64_t len) override { - return file_->Allocate(offset + prefixLength_, len); - } -}; +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. +// This call has no effect on dirty pages in the cache. +IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} -// A file abstraction for random reading and writing. -class EncryptedRandomRWFile : public RandomRWFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; +// Sync a file range with disk. +// offset is the starting byte of the file range to be synchronized. +// nbytes specifies the length of the range to be synchronized. +// This asks the OS to initiate flushing the cached data to disk, +// without waiting for completion. +// Default implementation does nothing. +IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg); +} - public: - EncryptedRandomRWFile(RandomRWFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) {} +// PrepareWrite performs any necessary preparation for a write +// before the write actually occurs. 
This allows for pre-allocation +// of space on devices where it can result in less file +// fragmentation and/or less waste from over-zealous filesystem +// pre-allocation. +void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len, + const IOOptions& options, + IODebugContext* dbg) { + file_->PrepareWrite(offset + prefixLength_, len, options, dbg); +} - // Indicates if the class makes use of direct I/O - // If false you must pass aligned buffer to Write() - bool use_direct_io() const override { return file_->use_direct_io(); } +void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) { + // the size here doesn't need to include prefixLength_, as it's a + // configuration will be use for `PrepareWrite()`. + file_->SetPreallocationBlockSize(size); +} + +void EncryptedWritableFile::GetPreallocationStatus( + size_t* block_size, size_t* last_allocated_block) { + file_->GetPreallocationStatus(block_size, last_allocated_block); +} + +// Pre-allocates space for a file. +IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Allocate(offset + prefixLength_, len, options, dbg); +} + +IOStatus EncryptedWritableFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} + +// A file abstraction for random reading and writing. 
+ +// Indicates if the class makes use of direct I/O +// If false you must pass aligned buffer to Write() +bool EncryptedRandomRWFile::use_direct_io() const { + return file_->use_direct_io(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - Status Write(uint64_t offset, const Slice& data) override { - AlignedBuffer buf; - Status status; - Slice dataToWrite(data); - offset += prefixLength_; - if (data.size() > 0) { - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - memmove(buf.BufferStart(), data.data(), data.size()); - buf.Size(data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); - if (!status.ok()) { - return status; - } - dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); +IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToWrite(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - status = file_->Write(offset, dataToWrite); - return status; + if (!io_s.ok()) { + return io_s; + } + dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); } + return file_->Write(offset, dataToWrite, options, dbg); +} 
// Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - assert(scratch); - offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); +IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto status = file_->Read(offset, n, options, result, scratch, dbg); + if (!status.ok()) { return status; } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return status; +} - Status Flush() override { return file_->Flush(); } +IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} - Status Sync() override { return file_->Sync(); } +IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Fsync(options, dbg); +} - Status Fsync() override { return file_->Fsync(); } +IOStatus EncryptedRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} - Status Close() override { return file_->Close(); } +namespace { +static std::unordered_map encrypted_fs_type_info = + { + {"provider", + OptionTypeInfo::AsCustomSharedPtr( + 0 /* No offset, whole struct*/, OptionVerificationType::kByName, + OptionTypeFlags::kNone)}, }; +// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption +// to files 
stored on disk. +class EncryptedFileSystemImpl : public EncryptedFileSystem { + public: + const char* Name() const override { + return EncryptedFileSystem::kClassName(); + } + // Returns the raw encryption provider that should be used to write the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetWritableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No WriteProvider specified"); + } + } + + // Returns the raw encryption provider that should be used to read the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetReadableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No Provider specified"); + } + } + + // Creates a CipherStream for the underlying file/name using the options + // If a writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. 
+ template + IOStatus CreateWritableCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus status = GetWritableProvider(fname, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + status = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (status.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + status = underlying->Append(prefix, options.io_options, dbg); + } + if (!status.ok()) { + return status; + } + } + // Create cipher stream + status = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return status; + } + + template + IOStatus CreateWritableEncryptedFile(const std::string& fname, + std::unique_ptr& underlying, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + // Create cipher stream + std::unique_ptr stream; + size_t prefix_length; + IOStatus status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + } + return status; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. 
+ // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. + template + IOStatus CreateRandomWriteCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus io_s = GetWritableProvider(fname, &provider); + if (!io_s.ok()) { + return io_s; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + io_s = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (io_s.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + io_s = underlying->Write(0, prefix, options.io_options, dbg); + } + if (!io_s.ok()) { + return io_s; + } + } + // Create cipher stream + io_s = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return io_s; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. 
+ // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. + template + IOStatus CreateSequentialCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(*prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. 
+ template + IOStatus CreateRandomReadCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(0, *prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } -// EncryptedEnv implements an Env wrapper that adds encryption to files stored on disk. -class EncryptedEnv : public EnvWrapper { public: - EncryptedEnv(Env* base_env, EncryptionProvider *provider) - : EnvWrapper(base_env) { + EncryptedFileSystemImpl(const std::shared_ptr& base, + const std::shared_ptr& provider) + : EncryptedFileSystem(base) { provider_ = provider; + RegisterOptions("EncryptionProvider", &provider_, &encrypted_fs_type_info); + } + + Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) override { + return provider_->AddCipher(descriptor, cipher, len, for_write); } // NewSequentialFile opens a file for sequential reading. 
- Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewSequentialFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Read prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Read prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - status = underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); + uint64_t file_size; + status = FileSystemWrapper::GetFileSize(fname, options.io_options, + &file_size, dbg); + if (!status.ok()) { + return status; + } + if (!file_size) { + *result = std::move(underlying); + return status; } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateSequentialCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + result->reset(new EncryptedSequentialFile( + std::move(underlying), std::move(stream), prefix_length)); } - (*result) = std::unique_ptr(new EncryptedSequentialFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return status; } // NewRandomAccessFile opens a file for random read 
access. - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewRandomAccessFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::NewRandomAccessFile(fname, options, + &underlying, dbg); if (!status.ok()) { return status; } - // Read prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Read prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); - } - // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedRandomAccessFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } } - (*result) = std::unique_ptr(new EncryptedRandomAccessFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return status; } // NewWritableFile opens a file for sequential writing. 
- Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewWritableFile(fname, &underlying, options); - if (!status.ok()) { - return status; - } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } - // Create cipher stream - std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Create an object that writes to a new file with the specified @@ -503,86 +775,42 @@ // returns non-OK. // // The returned file will only be accessed by one thread at a time. 
- Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::ReopenWritableFile(fname, &underlying, options); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } - // Create cipher stream - std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; - } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Reuse an existing file by renaming it and opening it as writable. 
- Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::ReuseWritableFile( + fname, old_fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } - // Create cipher stream - std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; - } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Open `fname` for random read and write, if file doesn't exist the file @@ -590,102 +818,137 @@ // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. 
- Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads || options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Check file exists - bool isNewFile = !FileExists(fname).ok(); + bool isNewFile = !FileExists(fname, options.io_options, dbg).ok(); // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewRandomRWFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Read or Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - if (!isNewFile) { - // File already exists, read prefix - status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); - } else { - // File is new, initialize & write prefix - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Write(0, prefixSlice); - if (!status.ok()) { - return status; - } - } - } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length = 0; + if (!isNewFile) { + // File already exists, read prefix + status = 
CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + } else { + status = CreateRandomWriteCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + } + if (status.ok()) { + if (stream) { + result->reset(new EncryptedRandomRWFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } } - (*result) = std::unique_ptr(new EncryptedRandomRWFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return status; } - // Store in *result the attributes of the children of the specified directory. - // In case the implementation lists the directory prior to iterating the files - // and files are concurrently deleted, the deleted files will be omitted from + // Store in *result the attributes of the children of the specified + // directory. + // In case the implementation lists the directory prior to iterating the + // files + // and files are concurrently deleted, the deleted files will be omitted + // from // result. // The name attributes are relative to "dir". // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. - // NotFound if "dir" does not exist, the calling process does not have + // NotFound if "dir" does not exist, the calling process does not + // have // permission to access "dir", or if "dir" is invalid. 
// IOError if an IO Error was encountered - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, dbg); if (!status.ok()) { return status; } - size_t prefixLength = provider_->GetPrefixLength(); - for (auto it = std::begin(*result); it!=std::end(*result); ++it) { - assert(it->size_bytes >= prefixLength); - it->size_bytes -= prefixLength; + for (auto it = std::begin(*result); it != std::end(*result); ++it) { + // assert(it->size_bytes >= prefixLength); + // breaks env_basic_test when called on directory containing + // directories + // which makes subtraction of prefixLength worrisome since + // FileAttributes does not identify directories + EncryptionProvider* provider; + status = GetReadableProvider(it->name, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + it->size_bytes -= provider->GetPrefixLength(); + } } - return Status::OK(); + return IOStatus::OK(); } // Store the size of fname in *file_size. 
- Status GetFileSize(const std::string& fname, uint64_t* file_size) override { - auto status = EnvWrapper::GetFileSize(fname, file_size); - if (!status.ok()) { + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetFileSize(fname, options, file_size, dbg); + if (!status.ok() || !(*file_size)) { return status; } - size_t prefixLength = provider_->GetPrefixLength(); - assert(*file_size >= prefixLength); - *file_size -= prefixLength; - return Status::OK(); + EncryptionProvider* provider; + status = GetReadableProvider(fname, &provider); + if (provider != nullptr && status.ok()) { + size_t prefixLength = provider->GetPrefixLength(); + assert(*file_size >= prefixLength); + *file_size -= prefixLength; + } + return status; } private: - EncryptionProvider *provider_; + std::shared_ptr provider_; }; +} // namespace + +Status NewEncryptedFileSystemImpl( + const std::shared_ptr& base, + const std::shared_ptr& provider, + std::unique_ptr* result) { + result->reset(new EncryptedFileSystemImpl(base, provider)); + return Status::OK(); +} +std::shared_ptr NewEncryptedFS( + const std::shared_ptr& base, + const std::shared_ptr& provider) { + std::unique_ptr efs; + Status s = NewEncryptedFileSystemImpl(base, provider, &efs); + if (s.ok()) { + s = efs->PrepareOptions(ConfigOptions()); + } + if (s.ok()) { + std::shared_ptr result(efs.release()); + return result; + } else { + return nullptr; + } +} // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. -Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { - return new EncryptedEnv(base_env, provider); +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr& provider) { + return new CompositeEnvWrapper( + base_env, NewEncryptedFS(base_env->GetFileSystem(), provider)); } // Encrypt one or more (partial) blocks of data at the file offset. 
@@ -786,38 +1049,71 @@ } } -// Encrypt a block of data. -// Length of data is equal to BlockSize(). -Status ROT13BlockCipher::Encrypt(char *data) { - for (size_t i = 0; i < blockSize_; ++i) { +namespace { +static std::unordered_map + rot13_block_cipher_type_info = { + {"block_size", + {0 /* No offset, whole struct*/, OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; +// Implements a BlockCipher using ROT13. +// +// Note: This is a sample implementation of BlockCipher, +// it is NOT considered safe and should NOT be used in production. +class ROT13BlockCipher : public BlockCipher { + private: + size_t blockSize_; + + public: + explicit ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) { + RegisterOptions("ROT13BlockCipherOptions", &blockSize_, + &rot13_block_cipher_type_info); + } + + static const char* kClassName() { return "ROT13"; } + const char* Name() const override { return kClassName(); } + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return blockSize_; } + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + Status Encrypt(char* data) override { + for (size_t i = 0; i < blockSize_; ++i) { data[i] += 13; + } + return Status::OK(); } - return Status::OK(); -} -// Decrypt a block of data. -// Length of data is equal to BlockSize(). -Status ROT13BlockCipher::Decrypt(char *data) { - return Encrypt(data); -} + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + Status Decrypt(char* data) override { return Encrypt(data); } +}; +static const std::unordered_map + ctr_encryption_provider_type_info = { + {"cipher", + OptionTypeInfo::AsCustomSharedPtr( + 0 /* No offset, whole struct*/, OptionVerificationType::kByName, + OptionTypeFlags::kNone)}, +}; +} // anonymous namespace // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. 
void CTRCipherStream::AllocateScratch(std::string& scratch) { - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); scratch.reserve(blockSize); } // Encrypt a block of data at the given block index. // Length of data is equal to BlockSize(); -Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scratch) { - +Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) { // Create nonce + counter - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); // Encrypt nonce+counter - auto status = cipher_.Encrypt(scratch); + auto status = cipher_->Encrypt(scratch); if (!status.ok()) { return status; } @@ -831,22 +1127,44 @@ // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); -Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { +Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) { // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } +CTREncryptionProvider::CTREncryptionProvider( + const std::shared_ptr& c) + : cipher_(c) { + RegisterOptions("Cipher", &cipher_, &ctr_encryption_provider_type_info); +} + // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of // the page size. 
-size_t CTREncryptionProvider::GetPrefixLength() { +size_t CTREncryptionProvider::GetPrefixLength() const { return defaultPrefixLength; } +Status CTREncryptionProvider::AddCipher(const std::string& /*descriptor*/, + const char* cipher, size_t len, + bool /*for_write*/) { + if (cipher_) { + return Status::NotSupported("Cannot add keys to CTREncryptionProvider"); + } else if (strcmp(ROT13BlockCipher::kClassName(), cipher) == 0) { + cipher_.reset(new ROT13BlockCipher(len)); + return Status::OK(); + } else { + return BlockCipher::CreateFromString(ConfigOptions(), std::string(cipher), + &cipher_); + } +} + // decodeCTRParameters decodes the initial counter & IV from the given // (plain text) prefix. -static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t &initialCounter, Slice &iv) { +static void decodeCTRParameters(const char* prefix, size_t blockSize, + uint64_t& initialCounter, Slice& iv) { // First block contains 64-bit initial counter initialCounter = DecodeFixed64(prefix); // Second block contains IV @@ -857,25 +1175,35 @@ // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, - size_t prefixLength) { + size_t prefixLength) const { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } // Create & seed rnd. - Random rnd((uint32_t)Env::Default()->NowMicros()); + Random rnd((uint32_t)SystemClock::Default()->NowMicros()); // Fill entire prefix block with random values. for (size_t i = 0; i < prefixLength; i++) { prefix[i] = rnd.Uniform(256) & 0xFF; } // Take random data to extract initial counter & IV - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); uint64_t initialCounter; Slice prefixIV; decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV); // Now populate the rest of the prefix, starting from the third block. 
- PopulateSecretPrefixPart(prefix + (2 * blockSize), prefixLength - (2 * blockSize), blockSize); + PopulateSecretPrefixPart(prefix + (2 * blockSize), + prefixLength - (2 * blockSize), blockSize); - // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial counter & IV unencrypted) + // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial + // counter & IV unencrypted) CTRCipherStream cipherStream(cipher_, prefixIV.data(), initialCounter); - auto status = cipherStream.Encrypt(0, prefix + (2 * blockSize), prefixLength - (2 * blockSize)); + Status status; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = cipherStream.Encrypt(0, prefix + (2 * blockSize), + prefixLength - (2 * blockSize)); + } if (!status.ok()) { return status; } @@ -886,9 +1214,8 @@ // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. -size_t CTREncryptionProvider::PopulateSecretPrefixPart(char* /*prefix*/, - size_t /*prefixLength*/, - size_t /*blockSize*/) { +size_t CTREncryptionProvider::PopulateSecretPrefixPart( + char* /*prefix*/, size_t /*prefixLength*/, size_t /*blockSize*/) const { // Nothing to do here, put in custom data in override when needed. return 0; } @@ -896,8 +1223,11 @@ Status CTREncryptionProvider::CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, std::unique_ptr* result) { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } // Read plain text part of prefix. 
- auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); uint64_t initialCounter; Slice iv; decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv); @@ -910,19 +1240,26 @@ ": read attempt would read beyond file bounds"); } - // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted) + // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 + // with initial counter & IV are unencrypted) CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter); - auto status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), prefix.size() - (2 * blockSize)); + Status status; + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), + prefix.size() - (2 * blockSize)); + } if (!status.ok()) { return status; } // Create cipher stream - return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); + return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, + prefix, result); } -// CreateCipherStreamFromPrefix creates a block access cipher stream for a file given -// given name and options. The given prefix is already decrypted. +// CreateCipherStreamFromPrefix creates a block access cipher stream for a file +// given given name and options. The given prefix is already decrypted. 
Status CTREncryptionProvider::CreateCipherStreamFromPrefix( const std::string& /*fname*/, const EnvOptions& /*options*/, uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, @@ -932,6 +1269,72 @@ return Status::OK(); } +namespace { +static void RegisterEncryptionBuiltins() { + static std::once_flag once; + std::call_once(once, [&]() { + auto lib = ObjectRegistry::Default()->AddLibrary("encryption"); + // Match "CTR" or "CTR://test" + lib->AddFactory( + ObjectLibrary::PatternEntry(CTREncryptionProvider::kClassName(), true) + .AddSuffix("://test"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + if (EndsWith(uri, "://test")) { + std::shared_ptr cipher = + std::make_shared(32); + guard->reset(new CTREncryptionProvider(cipher)); + } else { + guard->reset(new CTREncryptionProvider()); + } + return guard->get(); + }); + + lib->AddFactory( + "1://test", [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /*errmsg*/) { + std::shared_ptr cipher = + std::make_shared(32); + guard->reset(new CTREncryptionProvider(cipher)); + return guard->get(); + }); + + // Match "ROT13" or "ROT13:[0-9]+" + lib->AddFactory( + ObjectLibrary::PatternEntry(ROT13BlockCipher::kClassName(), true) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + size_t colon = uri.find(':'); + if (colon != std::string::npos) { + size_t block_size = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new ROT13BlockCipher(block_size)); + } else { + guard->reset(new ROT13BlockCipher(32)); + } + + return guard->get(); + }); + }); +} +} // namespace + +Status BlockCipher::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + RegisterEncryptionBuiltins(); + return LoadSharedObject(config_options, value, nullptr, result); +} + +Status EncryptionProvider::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + 
std::shared_ptr* result) { + RegisterEncryptionBuiltins(); + return LoadSharedObject(config_options, value, nullptr, + result); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption_ctr.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption_ctr.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption_ctr.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption_ctr.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,116 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(ROCKSDB_LITE) + +#include "rocksdb/env_encryption.h" + +namespace ROCKSDB_NAMESPACE { +// CTRCipherStream implements BlockAccessCipherStream using an +// Counter operations mode. +// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation +// +// Note: This is a possible implementation of BlockAccessCipherStream, +// it is considered suitable for use. +class CTRCipherStream final : public BlockAccessCipherStream { + private: + std::shared_ptr cipher_; + std::string iv_; + uint64_t initialCounter_; + + public: + CTRCipherStream(const std::shared_ptr& c, const char* iv, + uint64_t initialCounter) + : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){}; + virtual ~CTRCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return cipher_->BlockSize(); } + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + void AllocateScratch(std::string&) override; + + // Encrypt a block of data at the given block index. 
+ // Length of data is equal to BlockSize(); + Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override; +}; + +// This encryption provider uses a CTR cipher stream, with a given block cipher +// and IV. +// +// Note: This is a possible implementation of EncryptionProvider, +// it is considered suitable for use, provided a safe BlockCipher is used. +class CTREncryptionProvider : public EncryptionProvider { + private: + std::shared_ptr cipher_; + + protected: + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. This size is to ensure the first real data byte + // is placed at largest known alignment point for direct io. + const static size_t defaultPrefixLength = 4096; + + public: + explicit CTREncryptionProvider( + const std::shared_ptr& c = nullptr); + virtual ~CTREncryptionProvider() {} + + static const char* kClassName() { return "CTR"; } + const char* Name() const override { return kClassName(); } + + // GetPrefixLength returns the length of the prefix that is added to every + // file + // and used for storing encryption options. + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. + size_t GetPrefixLength() const override; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. + Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) const override; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. 
+ Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) override; + + Status AddCipher(const std::string& descriptor, const char* /*cipher*/, + size_t /*len*/, bool /*for_write*/) override; + protected: + + // PopulateSecretPrefixPart initializes the data into a new prefix block + // that will be encrypted. This function will store the data in plain text. + // It will be encrypted later (before written to disk). + // Returns the amount of space (starting from the start of the prefix) + // that has been initialized. + virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, + size_t blockSize) const; + + // CreateCipherStreamFromPrefix creates a block access cipher stream for a + // file given + // given name and options. The given prefix is already decrypted. + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr* result); +}; + +Status NewEncryptedFileSystemImpl( + const std::shared_ptr& base_fs, + const std::shared_ptr& provider, + std::unique_ptr* fs); + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_hdfs.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_hdfs.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_hdfs.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_hdfs.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #define ROCKSDB_HDFS_FILE_C #include -#include #include #include #include @@ -38,10 +37,10 @@ // Log error message static Status IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? Status::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? 
Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? Status::PathNotFound(context, errnoStr(err_number).c_str()) + : Status::IOError(context, errnoStr(err_number).c_str()); } // assume that there is one global logger for now. It is not thread-safe, @@ -124,8 +123,9 @@ Status s; ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); - ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset, - (void*)scratch, (tSize)n); + tSize bytes_read = + hdfsPread(fileSys_, hfile_, offset, static_cast(scratch), + static_cast(n)); ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); *result = Slice(scratch, (bytes_read < 0) ? 0 : bytes_read); @@ -213,6 +213,8 @@ } } + using WritableFile::Append; + // If the file was successfully created, then this returns true. // Otherwise returns false. bool isValid() { @@ -609,6 +611,18 @@ return Status::OK(); } +Status HdfsEnv::IsDirectory(const std::string& path, bool* is_dir) { + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, path.c_str()); + if (pFileInfo != nullptr) { + if (is_dir != nullptr) { + *is_dir = (pFileInfo->mKind == kObjectKindDirectory); + } + hdfsFreeFileInfo(pFileInfo, 1); + return Status::OK(); + } + return IOError(path, errno); +} + // The factory method for creating an HDFS Env Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname) { *hdfs_env = new HdfsEnv(fsname); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,10 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors + +#include "port/lang.h" +#if !defined(OS_WIN) + #include #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION #include @@ -13,9 +17,6 @@ #include #include -#if defined(OS_LINUX) -#include -#endif #if defined(ROCKSDB_IOURING_PRESENT) #include #endif @@ -24,13 +25,10 @@ #include #include #include -#include #include #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include -#include #endif #include #include @@ -39,9 +37,11 @@ #include #endif #include +#include + #include // Get nano time includes -#if defined(OS_LINUX) || defined(OS_FREEBSD) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) #elif defined(__MACH__) #include #include @@ -55,13 +55,14 @@ #include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" +#include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" @@ -126,23 +127,105 @@ }; #endif // !ROCKSDB_NO_DYNAMIC_EXTENSION -class PosixEnv : public CompositeEnvWrapper { +class PosixClock : public SystemClock { public: - PosixEnv(); + static const char* kClassName() { return "PosixClock"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } - ~PosixEnv() override { - for (const auto tid : threads_to_join_) { - pthread_join(tid, nullptr); - } - for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { - thread_pools_[pool_id].JoinAllThreads(); + uint64_t NowMicros() override { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + uint64_t NowNanos() override { +#if defined(OS_LINUX) || 
defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif defined(OS_SOLARIS) + return gethrtime(); +#elif defined(__MACH__) + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#else + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif + } + + uint64_t CPUMicros() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return (static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000; +#endif + return 0; + } + + uint64_t CPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + + void SleepForMicroseconds(int micros) override { usleep(micros); } + + Status GetCurrentTime(int64_t* unix_time) override { + time_t ret = time(nullptr); + if (ret == (time_t)-1) { + return IOError("GetCurrentTime", "", errno); } - // Delete the thread_status_updater_ only when the current Env is not - // Env::Default(). This is to avoid the free-after-use error when - // Env::Default() is destructed while some other child threads are - // still trying to update thread status. 
- if (this != Env::Default()) { - delete thread_status_updater_; + *unix_time = (int64_t)ret; + return Status::OK(); + } + + std::string TimeToString(uint64_t secondsSince1970) override { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + dummy.resize(maxsize); + char* p = &dummy[0]; + localtime_r(&seconds, &t); + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + return dummy; + } +}; + +class PosixEnv : public CompositeEnv { + public: + static const char* kClassName() { return "PosixEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } + + ~PosixEnv() override { + if (this == Env::Default()) { + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].JoinAllThreads(); + } + // Do not delete the thread_status_updater_ in order to avoid the + // free after use when Env::Default() is destructed while some other + // child threads are still trying to update thread status. All + // PosixEnv instances use the same thread_status_updater_, so never + // explicitly delete it. 
} } @@ -163,7 +246,6 @@ // provided by the search path Status LoadLibrary(const std::string& name, const std::string& path, std::shared_ptr* result) override { - Status status; assert(result != nullptr); if (name.empty()) { void* hndl = dlopen(NULL, RTLD_NOW); @@ -220,109 +302,32 @@ unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; - Status GetTestDirectory(std::string* result) override { - const char* env = getenv("TEST_TMPDIR"); - if (env && env[0] != '\0') { - *result = env; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid())); - *result = buf; - } - // Directory may already exist - CreateDir(*result); - return Status::OK(); - } - Status GetThreadList(std::vector* thread_list) override { assert(thread_status_updater_); return thread_status_updater_->GetThreadList(thread_list); } - static uint64_t gettid(pthread_t tid) { + uint64_t GetThreadID() const override { uint64_t thread_id = 0; +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 30) + thread_id = ::gettid(); +#else // __GLIBC_PREREQ(2, 30) + pthread_t tid = pthread_self(); memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); - return thread_id; - } - - static uint64_t gettid() { +#endif // __GLIBC_PREREQ(2, 30) +#else // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) pthread_t tid = pthread_self(); - return gettid(tid); - } - - uint64_t GetThreadID() const override { return gettid(pthread_self()); } - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override { - FILE* f; - { - IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), - "w" -#ifdef __GLIBC_PREREQ -#if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC -#endif -#endif - ); - } - if (f == nullptr) { - result->reset(); - return IOError("when fopen a file for new logger", fname, errno); - } else { - int fd = fileno(f); -#ifdef ROCKSDB_FALLOCATE_PRESENT - fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024); 
-#endif - SetFD_CLOEXEC(fd, nullptr); - result->reset(new PosixLogger(f, &PosixEnv::gettid, this)); - return Status::OK(); - } - } - - uint64_t NowMicros() override { - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - uint64_t NowNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif defined(OS_SOLARIS) - return gethrtime(); -#elif defined(__MACH__) - clock_serv_t cclock; - mach_timespec_t ts; - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &ts); - mach_port_deallocate(mach_task_self(), cclock); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#else - return std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); -#endif - } - - uint64_t NowCPUNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) || \ - (defined(__MACH__) && defined(__MAC_10_12)) - struct timespec ts; - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#endif - return 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + return thread_id; } - void SleepForMicroseconds(int micros) override { usleep(micros); } - Status GetHostName(char* name, uint64_t len) override { int ret = gethostname(name, static_cast(len)); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) { - return Status::InvalidArgument(strerror(errno)); + return Status::InvalidArgument(errnoStr(errno).c_str()); } else { return IOError("GetHostName", name, errno); } @@ -330,15 +335,6 @@ return Status::OK(); } - Status GetCurrentTime(int64_t* unix_time) override { - time_t ret = time(nullptr); - if (ret == (time_t) -1) { - return IOError("GetCurrentTime", "", errno); 
- } - *unix_time = (int64_t) ret; - return Status::OK(); - } - ThreadStatusUpdater* GetThreadStatusUpdater() const override { return Env::GetThreadStatusUpdater(); } @@ -367,7 +363,7 @@ thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); } - void LowerThreadPoolIOPriority(Priority pool = LOW) override { + void LowerThreadPoolIOPriority(Priority pool) override { assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); #ifdef OS_LINUX thread_pools_[pool].LowerIOPriority(); @@ -376,48 +372,46 @@ #endif } - void LowerThreadPoolCPUPriority(Priority pool = LOW) override { + void LowerThreadPoolCPUPriority(Priority pool) override { assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); -#ifdef OS_LINUX - thread_pools_[pool].LowerCPUPriority(); -#else - (void)pool; -#endif + thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow); } - std::string TimeToString(uint64_t secondsSince1970) override { - const time_t seconds = (time_t)secondsSince1970; - struct tm t; - int maxsize = 64; - std::string dummy; - dummy.reserve(maxsize); - dummy.resize(maxsize); - char* p = &dummy[0]; - localtime_r(&seconds, &t); - snprintf(p, maxsize, - "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec); - return dummy; + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); + thread_pools_[pool].LowerCPUPriority(pri); + return Status::OK(); } private: - std::vector thread_pools_; - pthread_mutex_t mu_; - std::vector threads_to_join_; + friend Env* Env::Default(); + // Constructs the default Env, a singleton + PosixEnv(); + + // The below 4 members are only used by the default PosixEnv instance. 
+ // Non-default instances simply maintain references to the backing + // members in te default instance + std::vector thread_pools_storage_; + pthread_mutex_t mu_storage_; + std::vector threads_to_join_storage_; + bool allow_non_owner_access_storage_; + + std::vector& thread_pools_; + pthread_mutex_t& mu_; + std::vector& threads_to_join_; // If true, allow non owner read access for db files. Otherwise, non-owner // has no access to db files. - bool allow_non_owner_access_; + bool& allow_non_owner_access_; }; PosixEnv::PosixEnv() - : CompositeEnvWrapper(this, FileSystem::Default().get()), - thread_pools_(Priority::TOTAL), - allow_non_owner_access_(true) { + : CompositeEnv(FileSystem::Default(), SystemClock::Default()), + thread_pools_storage_(Priority::TOTAL), + allow_non_owner_access_storage_(true), + thread_pools_(thread_pools_storage_), + mu_(mu_storage_), + threads_to_join_(threads_to_join_storage_), + allow_non_owner_access_(allow_non_owner_access_storage_) { ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( @@ -476,31 +470,6 @@ } // namespace -std::string Env::GenerateUniqueId() { - std::string uuid_file = "/proc/sys/kernel/random/uuid"; - - Status s = FileExists(uuid_file); - if (s.ok()) { - std::string uuid; - s = ReadFileToString(this, uuid_file, &uuid); - if (s.ok()) { - return uuid; - } - } - // Could not read uuid_file - generate uuid using "nanos-random" - Random64 r(time(nullptr)); - uint64_t random_uuid_portion = - r.Uniform(std::numeric_limits::max()); - uint64_t nanos_uuid_portion = NowNanos(); - char uuid2[200]; - snprintf(uuid2, - 200, - "%lx-%lx", - (unsigned long)nanos_uuid_portion, - (unsigned long)random_uuid_portion); - return uuid2; -} - // // Default Posix Env // @@ -518,10 +487,19 @@ ThreadLocalPtr::InitSingletons(); CompressionContextCache::InitSingleton(); INIT_SYNC_POINT_SINGLETONS(); + // ~PosixEnv 
must be called on exit static PosixEnv default_env; - static CompositeEnvWrapper composite_env(&default_env, - FileSystem::Default().get()); - return &composite_env; + return &default_env; } +// +// Default Posix SystemClock +// +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr default_clock = + std::make_shared(); + return default_clock; +} } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,12 +11,17 @@ #include #endif +#if defined(ROCKSDB_IOURING_PRESENT) +#include +#include +#endif + #include -#include -#include #include #include +#include +#include #ifdef OS_LINUX #include @@ -30,17 +35,36 @@ #include #endif +#include "db/db_impl/db_impl.h" +#include "env/emulated_clock.h" #include "env/env_chroot.h" +#include "env/env_encryption_ctr.h" +#include "env/fs_readonly.h" +#include "env/mock_env.h" +#include "env/unique_id_gen.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "port/malloc.h" #include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/crc32c.h" #include "util/mutexlock.h" +#include "util/random.h" #include "util/string_util.h" +#include "utilities/env_timed.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -86,6 +110,11 @@ Env* env_; bool 
direct_io_; EnvPosixTest() : env_(Env::Default()), direct_io_(false) {} + ~EnvPosixTest() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } }; class EnvPosixTestWithParam @@ -183,7 +212,7 @@ if (::stat(filename.c_str(), &sb) == 0) { ASSERT_EQ(sb.st_mode & 0777, 0644); } - env_->DeleteFile(filename); + ASSERT_OK(env_->DeleteFile(filename)); } env_->SetAllowNonOwnerAccess(false); @@ -196,10 +225,88 @@ if (::stat(filename.c_str(), &sb) == 0) { ASSERT_EQ(sb.st_mode & 0777, 0600); } - env_->DeleteFile(filename); + ASSERT_OK(env_->DeleteFile(filename)); } } } + +TEST_F(EnvPosixTest, LowerThreadPoolCpuPriority) { + std::atomic from_priority(CpuPriority::kNormal); + std::atomic to_priority(CpuPriority::kNormal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::BGThread::BeforeSetCpuPriority", [&](void* pri) { + from_priority.store(*reinterpret_cast(pri)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::BGThread::AfterSetCpuPriority", [&](void* pri) { + to_priority.store(*reinterpret_cast(pri)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + env_->SetBackgroundThreads(1, Env::BOTTOM); + env_->SetBackgroundThreads(1, Env::HIGH); + + auto RunTask = [&](Env::Priority pool) { + std::atomic called(false); + env_->Schedule(&SetBool, &called, pool); + for (int i = 0; i < kDelayMicros; i++) { + if (called.load()) { + break; + } + Env::Default()->SleepForMicroseconds(1); + } + ASSERT_TRUE(called.load()); + }; + + { + // Same priority, no-op. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, + CpuPriority::kNormal) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kNormal); + } + + { + // Higher priority, no-op. 
+ env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kHigh) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kNormal); + } + + { + // Lower priority from kNormal -> kLow. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kLow) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kLow); + } + + { + // Lower priority from kLow -> kIdle. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kIdle) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kLow); + ASSERT_EQ(to_priority, CpuPriority::kIdle); + } + + { + // Lower priority from kNormal -> kIdle for another pool. + env_->LowerThreadPoolCPUPriority(Env::Priority::HIGH, CpuPriority::kIdle) + .PermitUncheckedError(); + RunTask(Env::Priority::HIGH); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kIdle); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { @@ -212,7 +319,7 @@ ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); Random rnd(301); - test::RandomString(&rnd, kFileBytes, &expected_data); + expected_data = rnd.RandomString(kFileBytes); ASSERT_OK(wfile->Append(expected_data)); } @@ -325,6 +432,7 @@ // run in any order. The purpose of the test is unclear. #ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RunMany) { + env_->SetBackgroundThreads(1, Env::LOW); std::atomic last_id(0); struct CB { @@ -831,7 +939,7 @@ } else { // mkdtemp failed: diagnose it, but don't give up. fprintf(stderr, "mkdtemp(%s/...) 
failed: %s\n", d.c_str(), - strerror(errno)); + errnoStr(errno).c_str()); } } @@ -929,7 +1037,7 @@ ASSERT_EQ(unique_id2, unique_id3); // Delete the file - env_->DeleteFile(fname); + ASSERT_OK(env_->DeleteFile(fname)); } } #endif // !defined(OS_WIN) @@ -956,7 +1064,8 @@ int err_number = 0; if (alloc_status != 0) { err_number = errno; - fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); + fprintf(stderr, "Warning: fallocate() fails, %s\n", + errnoStr(err_number).c_str()); } close(fd); ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); @@ -1044,7 +1153,7 @@ // Collect and check whether the IDs are unique. std::unordered_set ids; - for (const std::string fname : fnames) { + for (const std::string& fname : fnames) { std::unique_ptr file; std::string unique_id; ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); @@ -1058,7 +1167,7 @@ } // Delete the files - for (const std::string fname : fnames) { + for (const std::string& fname : fnames) { ASSERT_OK(env_->DeleteFile(fname)); } @@ -1066,7 +1175,9 @@ } } -TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { +// TODO: Disable the flaky test, it's a known issue that ext4 may return same +// key after file deletion. The issue is tracked in #7405, #7470. +TEST_P(EnvPosixTestWithParam, DISABLED_RandomAccessUniqueIDDeletes) { if (env_ == Env::Default()) { EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; @@ -1180,6 +1291,213 @@ } } +TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { + // In this test we don't do aligned read, so it doesn't work for + // direct I/O case. + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. 
+ { + std::unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // More attempts to simulate more partial result sequences. + for (uint32_t attempt = 0; attempt < 25; attempt++) { + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. + const int num_reads = rnd.Uniform(512) + 1; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_result", [&](void* arg) { + if (attempt > 5) { + // Improve partial result rates in second half of the run to + // cover the case of repeated partial results. + int odd = (attempt < 15) ? num_reads / 2 : 4; + // No failure in first several attempts. + size_t& bytes_read = *static_cast(arg); + if (rnd.OneIn(odd)) { + bytes_read = 0; + } else if (rnd.OneIn(odd / 2)) { + bytes_read = static_cast( + rnd.Uniform(static_cast(bytes_read))); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Generate (offset, len) pairs + std::set start_offsets; + for (int i = 0; i < num_reads; i++) { + int rnd_off; + // No repeat offsets. + while (start_offsets.find(rnd_off = rnd.Uniform(81920)) != start_offsets.end()) {} + start_offsets.insert(rnd_off); + } + std::vector offsets; + std::vector lens; + // std::set already sorted the offsets. 
+ for (int so: start_offsets) { + offsets.push_back(so); + } + for (size_t i = 0; i + 1 < offsets.size(); i++) { + lens.push_back(static_cast(rnd.Uniform(static_cast(offsets[i + 1] - offsets[i])) + 1)); + } + lens.push_back(static_cast(rnd.Uniform(static_cast(kTotalSize - offsets.back())) + 1)); + ASSERT_EQ(num_reads, lens.size()); + + // Create requests + std::vector scratches; + scratches.reserve(num_reads); + std::vector reqs(num_reads); + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches.emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast(scratches.back().data()); + } + + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + + // Validate results + for (int i = 0; i < num_reads; ++i) { + ASSERT_OK(reqs[i].status); + ASSERT_EQ(Slice(expected_data.data() + offsets[i], lens[i]).ToString(true), + reqs[i].result.ToString(true)); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +#if defined(ROCKSDB_IOURING_PRESENT) +void GenerateFilesAndRequest(Env* env, const std::string& fname, + std::vector* ret_reqs, + std::vector* scratches) { + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. + { + std::unique_ptr wfile; + ASSERT_OK(env->NewWritableFile(fname, &wfile, EnvOptions())); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. 
+ const int num_reads = 3; + std::vector offsets = {10000, 20000, 30000}; + std::vector lens = {3000, 200, 100}; + + // Create requests + scratches->reserve(num_reads); + std::vector& reqs = *ret_reqs; + reqs.resize(num_reads); + for (int i = 0; i < num_reads; ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches->emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast(scratches->back().data()); + } +} + +TEST_F(EnvPosixTest, MultiReadIOUringError) { + // In this test we don't do aligned read, so we can't do direct I/O. + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_wait_cqe_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", + [&](void* arg) { + if (!io_uring_wait_cqe_called) { + io_uring_wait_cqe_called = true; + ssize_t& ret = *(static_cast(arg)); + ret = 1; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_wait_cqe_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(EnvPosixTest, MultiReadIOUringError2) { + // In this test we don't do aligned read, so we can't do direct I/O. 
+ EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_submit_and_wait_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + [&](void* arg) { + io_uring_submit_and_wait_called = true; + ssize_t* ret = static_cast(arg); + (*ret)--; + }); + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + [&](void* arg) { + struct io_uring* iu = static_cast(arg); + struct io_uring_cqe* cqe; + assert(io_uring_wait_cqe(iu, &cqe) == 0); + io_uring_cqe_seen(iu, cqe); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_submit_and_wait_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // ROCKSDB_IOURING_PRESENT + // Only works in linux platforms #ifdef OS_WIN TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) { @@ -1398,7 +1716,7 @@ auto data = NewAligned(kStrSize, 'A'); Slice str(data.get(), kStrSize); srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize); - srcfile->Append(str); + ASSERT_OK(srcfile->Append(str)); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 1UL); @@ -1407,7 +1725,7 @@ auto buf_ptr = NewAligned(block_size, ' '); Slice buf(buf_ptr.get(), block_size); srcfile->PrepareWrite(srcfile->GetFileSize(), block_size); - srcfile->Append(buf); + ASSERT_OK(srcfile->Append(buf)); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); 
ASSERT_EQ(last_allocated_block, 2UL); } @@ -1417,7 +1735,7 @@ auto buf_ptr = NewAligned(block_size * 5, ' '); Slice buf = Slice(buf_ptr.get(), block_size * 5); srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); - srcfile->Append(buf); + ASSERT_OK(srcfile->Append(buf)); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 7UL); } @@ -1433,9 +1751,10 @@ const int kNumChildren = 10; std::string data; + std::string test_base_dir = test::PerThreadDBPath(env_, "env_test_chr_attr"); + env_->CreateDir(test_base_dir).PermitUncheckedError(); for (int i = 0; i < kNumChildren; ++i) { - const std::string path = - test::TmpDir(env_) + "/" + "testfile_" + std::to_string(i); + const std::string path = test_base_dir + "/testfile_" + std::to_string(i); std::unique_ptr file; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) if (soptions.use_direct_writes) { @@ -1449,15 +1768,15 @@ ASSERT_OK(env_->NewWritableFile(path, &file, soptions)); auto buf_ptr = NewAligned(data.size(), 'T'); Slice buf(buf_ptr.get(), data.size()); - file->Append(buf); + ASSERT_OK(file->Append(buf)); data.append(std::string(4096, 'T')); } std::vector file_attrs; - ASSERT_OK(env_->GetChildrenFileAttributes(test::TmpDir(env_), &file_attrs)); + ASSERT_OK(env_->GetChildrenFileAttributes(test_base_dir, &file_attrs)); for (int i = 0; i < kNumChildren; ++i) { const std::string name = "testfile_" + std::to_string(i); - const std::string path = test::TmpDir(env_) + "/" + name; + const std::string path = test_base_dir + "/" + name; auto file_attrs_iter = std::find_if( file_attrs.begin(), file_attrs.end(), @@ -1490,12 +1809,26 @@ return Status::OK(); } + Status Append( + const Slice& /*data*/, + const DataVerificationInfo& /* verification_info */) override { + inc(1); + return Status::OK(); + } + Status PositionedAppend(const Slice& /*data*/, uint64_t /*offset*/) override { inc(2); 
return Status::OK(); } + Status PositionedAppend( + const Slice& /*data*/, uint64_t /*offset*/, + const DataVerificationInfo& /* verification_info */) override { + inc(2); + return Status::OK(); + } + Status Truncate(uint64_t /*size*/) override { inc(3); return Status::OK(); @@ -1600,13 +1933,13 @@ { Base b(&step); Wrapper w(&b); - w.Append(Slice()); - w.PositionedAppend(Slice(), 0); - w.Truncate(0); - w.Close(); - w.Flush(); - w.Sync(); - w.Fsync(); + ASSERT_OK(w.Append(Slice())); + ASSERT_OK(w.PositionedAppend(Slice(), 0)); + ASSERT_OK(w.Truncate(0)); + ASSERT_OK(w.Close()); + ASSERT_OK(w.Flush()); + ASSERT_OK(w.Sync()); + ASSERT_OK(w.Fsync()); w.IsSyncThreadSafe(); w.use_direct_io(); w.GetRequiredBufferAlignment(); @@ -1618,10 +1951,10 @@ w.SetPreallocationBlockSize(0); w.GetPreallocationStatus(nullptr, nullptr); w.GetUniqueId(nullptr, 0); - w.InvalidateCache(0, 0); - w.RangeSync(0, 0); + ASSERT_OK(w.InvalidateCache(0, 0)); + ASSERT_OK(w.RangeSync(0, 0)); w.PrepareWrite(0, 0); - w.Allocate(0, 0); + ASSERT_OK(w.Allocate(0, 0)); } EXPECT_EQ(24, step); @@ -1630,7 +1963,7 @@ TEST_P(EnvPosixTestWithParam, PosixRandomRWFile) { const std::string path = test::PerThreadDBPath(env_, "random_rw_file"); - env_->DeleteFile(path); + env_->DeleteFile(path).PermitUncheckedError(); std::unique_ptr file; @@ -1680,7 +2013,7 @@ ASSERT_EQ(read_res.ToString(), "XXXQ"); // Close file and reopen it - file->Close(); + ASSERT_OK(file->Close()); ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions())); ASSERT_OK(file->Read(0, 9, &read_res, buf)); @@ -1697,7 +2030,7 @@ ASSERT_EQ(read_res.ToString(), "ABXXTTTTTT"); // Clean up - env_->DeleteFile(path); + ASSERT_OK(env_->DeleteFile(path)); } class RandomRWFileWithMirrorString { @@ -1757,7 +2090,7 @@ TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) { const std::string path = test::PerThreadDBPath(env_, "random_rw_file_rand"); - env_->DeleteFile(path); + env_->DeleteFile(path).PermitUncheckedError(); std::unique_ptr file; @@ 
-1779,7 +2112,7 @@ std::string buf; for (int i = 0; i < 10000; i++) { // Genrate random data - test::RandomString(&rnd, 10, &buf); + buf = rnd.RandomString(10); // Pick random offset for write size_t write_off = rnd.Next() % 1000; @@ -1798,35 +2131,36 @@ } // clean up - env_->DeleteFile(path); + ASSERT_OK(env_->DeleteFile(path)); } class TestEnv : public EnvWrapper { public: explicit TestEnv() : EnvWrapper(Env::Default()), close_count(0) { } - - class TestLogger : public Logger { - public: - using Logger::Logv; - TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } - ~TestLogger() override { - if (!closed_) { - CloseHelper(); + const char* Name() const override { return "TestEnv"; } + class TestLogger : public Logger { + public: + using Logger::Logv; + explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + Status s = CloseHelper(); + s.PermitUncheckedError(); + } } - } - void Logv(const char* /*format*/, va_list /*ap*/) override{}; + void Logv(const char* /*format*/, va_list /*ap*/) override {} - protected: - Status CloseImpl() override { return CloseHelper(); } + protected: + Status CloseImpl() override { return CloseHelper(); } - private: - Status CloseHelper() { - env->CloseCountInc();; - return Status::OK(); - } - TestEnv* env; - }; + private: + Status CloseHelper() { + env->CloseCountInc(); + return Status::OK(); + } + TestEnv* env; + }; void CloseCountInc() { close_count++; } @@ -1842,7 +2176,13 @@ int close_count; }; -class EnvTest : public testing::Test {}; +class EnvTest : public testing::Test { + public: + EnvTest() : test_directory_(test::PerThreadDBPath("env_test")) {} + + protected: + const std::string test_directory_; +}; TEST_F(EnvTest, Close) { TestEnv* env = new TestEnv(); @@ -1850,23 +2190,43 @@ Status s; s = env->NewLogger("", &logger); - ASSERT_EQ(s, Status::OK()); - logger.get()->Close(); + ASSERT_OK(s); + ASSERT_OK(logger.get()->Close()); ASSERT_EQ(env->GetCloseCount(), 1); // 
Call Close() again. CloseHelper() should not be called again - logger.get()->Close(); + ASSERT_OK(logger.get()->Close()); ASSERT_EQ(env->GetCloseCount(), 1); logger.reset(); ASSERT_EQ(env->GetCloseCount(), 1); s = env->NewLogger("", &logger); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); logger.reset(); ASSERT_EQ(env->GetCloseCount(), 2); delete env; } +class LogvWithInfoLogLevelLogger : public Logger { + public: + using Logger::Logv; + void Logv(const InfoLogLevel /* log_level */, const char* /* format */, + va_list /* ap */) override {} +}; + +TEST_F(EnvTest, LogvWithInfoLogLevel) { + // Verifies the log functions work on a `Logger` that only overrides the + // `Logv()` overload including `InfoLogLevel`. + const std::string kSampleMessage("sample log message"); + LogvWithInfoLogLevelLogger logger; + ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); +} + INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(Env::Default(), false))); @@ -1877,19 +2237,845 @@ #endif // !defined(ROCKSDB_LITE) #if !defined(ROCKSDB_LITE) && !defined(OS_WIN) -static std::unique_ptr chroot_env( - NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); -INSTANTIATE_TEST_CASE_P( - ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, - ::testing::Values(std::pair(chroot_env.get(), false))); -INSTANTIATE_TEST_CASE_P( - ChrootEnvWithDirectIO, EnvPosixTestWithParam, - ::testing::Values(std::pair(chroot_env.get(), true))); +static Env* GetChrootEnv() { + static std::unique_ptr chroot_env( + NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); + return chroot_env.get(); +} +INSTANTIATE_TEST_CASE_P(ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, + 
::testing::Values(std::pair(GetChrootEnv(), + false))); +INSTANTIATE_TEST_CASE_P(ChrootEnvWithDirectIO, EnvPosixTestWithParam, + ::testing::Values(std::pair(GetChrootEnv(), + true))); #endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) +class EnvFSTestWithParam + : public ::testing::Test, + public ::testing::WithParamInterface> { + public: + EnvFSTestWithParam() { + bool env_non_null = std::get<0>(GetParam()); + bool env_default = std::get<1>(GetParam()); + bool fs_default = std::get<2>(GetParam()); + + env_ = env_non_null ? (env_default ? Env::Default() : nullptr) : nullptr; + fs_ = fs_default + ? FileSystem::Default() + : std::make_shared(FileSystem::Default()); + if (env_non_null && env_default && !fs_default) { + env_ptr_ = NewCompositeEnv(fs_); + } + if (env_non_null && !env_default && fs_default) { + env_ptr_ = std::unique_ptr(new FaultInjectionTestEnv(Env::Default())); + fs_.reset(); + } + if (env_non_null && !env_default && !fs_default) { + env_ptr_.reset(new FaultInjectionTestEnv(Env::Default())); + composite_env_ptr_.reset(new CompositeEnvWrapper(env_ptr_.get(), fs_)); + env_ = composite_env_ptr_.get(); + } else { + env_ = env_ptr_.get(); + } + + dbname1_ = test::PerThreadDBPath("env_fs_test1"); + dbname2_ = test::PerThreadDBPath("env_fs_test2"); + } + + ~EnvFSTestWithParam() = default; + + Env* env_; + std::unique_ptr env_ptr_; + std::unique_ptr composite_env_ptr_; + std::shared_ptr fs_; + std::string dbname1_; + std::string dbname2_; +}; + +TEST_P(EnvFSTestWithParam, OptionsTest) { + Options opts; + opts.env = env_; + opts.create_if_missing = true; + std::string dbname = dbname1_; + + if (env_) { + if (fs_) { + ASSERT_EQ(fs_.get(), env_->GetFileSystem().get()); + } else { + ASSERT_NE(FileSystem::Default().get(), env_->GetFileSystem().get()); + } + } + for (int i = 0; i < 2; ++i) { + DB* db; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + + WriteOptions wo; + ASSERT_OK(db->Put(wo, "a", "a")); + ASSERT_OK(db->Flush(FlushOptions())); + 
ASSERT_OK(db->Put(wo, "b", "b")); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::string val; + ASSERT_OK(db->Get(ReadOptions(), "a", &val)); + ASSERT_EQ("a", val); + ASSERT_OK(db->Get(ReadOptions(), "b", &val)); + ASSERT_EQ("b", val); + + ASSERT_OK(db->Close()); + delete db; + ASSERT_OK(DestroyDB(dbname, opts)); + + dbname = dbname2_; + } +} + +// The parameters are as follows - +// 1. True means Options::env is non-null, false means null +// 2. True means use Env::Default, false means custom +// 3. True means use FileSystem::Default, false means custom +INSTANTIATE_TEST_CASE_P( + EnvFSTest, EnvFSTestWithParam, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); +// This test ensures that default Env and those allocated by +// NewCompositeEnv() all share the same threadpool +TEST_F(EnvTest, MultipleCompositeEnv) { + std::shared_ptr fs1 = + std::make_shared(FileSystem::Default()); + std::shared_ptr fs2 = + std::make_shared(FileSystem::Default()); + std::unique_ptr env1 = NewCompositeEnv(fs1); + std::unique_ptr env2 = NewCompositeEnv(fs2); + Env::Default()->SetBackgroundThreads(8, Env::HIGH); + Env::Default()->SetBackgroundThreads(16, Env::LOW); + ASSERT_EQ(env1->GetBackgroundThreads(Env::LOW), 16); + ASSERT_EQ(env1->GetBackgroundThreads(Env::HIGH), 8); + ASSERT_EQ(env2->GetBackgroundThreads(Env::LOW), 16); + ASSERT_EQ(env2->GetBackgroundThreads(Env::HIGH), 8); +} + +TEST_F(EnvTest, IsDirectory) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + ASSERT_OK(s); + const std::string test_sub_dir = test_directory_ + "sub1"; + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(Env::Default()->CreateDirIfMissing(test_sub_dir)); + bool is_dir = false; + ASSERT_OK(Env::Default()->IsDirectory(test_sub_dir, &is_dir)); + ASSERT_TRUE(is_dir); + { + std::unique_ptr wfile; + s = Env::Default()->GetFileSystem()->NewWritableFile( + 
test_file_path, FileOptions(), &wfile, /*dbg=*/nullptr); + ASSERT_OK(s); + std::unique_ptr fwriter; + fwriter.reset(new WritableFileWriter(std::move(wfile), test_file_path, + FileOptions(), + SystemClock::Default().get())); + constexpr char buf[] = "test"; + s = fwriter->Append(buf); + ASSERT_OK(s); + } + ASSERT_OK(Env::Default()->IsDirectory(test_file_path, &is_dir)); + ASSERT_FALSE(is_dir); +} + +TEST_F(EnvTest, EnvWriteVerificationTest) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(s); + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + std::unique_ptr file; + s = fault_fs_env->NewWritableFile(test_file_path, &file, EnvOptions()); + ASSERT_OK(s); + + DataVerificationInfo v_info; + std::string test_data = "test"; + std::string checksum; + uint32_t v_crc32c = crc32c::Extend(0, test_data.c_str(), test_data.size()); + PutFixed32(&checksum, v_crc32c); + v_info.checksum = Slice(checksum); + s = file->Append(Slice(test_data), v_info); + ASSERT_OK(s); +} + +class CreateEnvTest : public testing::Test { + public: + CreateEnvTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = false; + } + ConfigOptions config_options_; +}; + +#ifndef ROCKSDB_LITE +TEST_F(CreateEnvTest, LoadCTRProvider) { + config_options_.invoke_prepare_options = false; + std::string CTR = CTREncryptionProvider::kClassName(); + std::shared_ptr provider; + // Test a provider with no cipher + ASSERT_OK( + EncryptionProvider::CreateFromString(config_options_, CTR, &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_NOK(provider->PrepareOptions(config_options_)); + ASSERT_NOK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + auto cipher = 
provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_EQ(cipher->get(), nullptr); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, + CTR + "://test", &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_OK(provider->PrepareOptions(config_options_)); + ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + cipher = provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "1://test", + &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_OK(provider->PrepareOptions(config_options_)); + ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + cipher = provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString( + config_options_, "id=" + CTR + "; cipher=ROT13", &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + cipher = provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); +} + +TEST_F(CreateEnvTest, LoadROT13Cipher) { + std::shared_ptr cipher; + // Test a provider with no cipher + ASSERT_OK(BlockCipher::CreateFromString(config_options_, "ROT13", &cipher)); + ASSERT_NE(cipher, nullptr); + ASSERT_STREQ(cipher->Name(), "ROT13"); +} +#endif // ROCKSDB_LITE + +TEST_F(CreateEnvTest, CreateDefaultSystemClock) { + std::shared_ptr clock, copy; + ASSERT_OK(SystemClock::CreateFromString(config_options_, + SystemClock::kDefaultName(), &clock)); + ASSERT_NE(clock, nullptr); + ASSERT_EQ(clock, SystemClock::Default()); +#ifndef 
ROCKSDB_LITE + std::string opts_str = clock->ToString(config_options_); + std::string mismatch; + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(clock->AreEquivalent(config_options_, copy.get(), &mismatch)); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +TEST_F(CreateEnvTest, CreateMockSystemClock) { + std::shared_ptr mock, copy; + + config_options_.registry->AddLibrary("test")->AddFactory( + MockSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSystemClock(nullptr)); + return guard->get(); + }); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, EmulatedSystemClock::kClassName(), &mock)); + ASSERT_NE(mock, nullptr); + ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName()); + ASSERT_EQ(mock->Inner(), SystemClock::Default().get()); + std::string opts_str = mock->ToString(config_options_); + std::string mismatch; + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch)); + + std::string id = std::string("id=") + EmulatedSystemClock::kClassName() + + ";target=" + MockSystemClock::kClassName(); + + ASSERT_OK(SystemClock::CreateFromString(config_options_, id, &mock)); + ASSERT_NE(mock, nullptr); + ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName()); + ASSERT_NE(mock->Inner(), nullptr); + ASSERT_STREQ(mock->Inner()->Name(), MockSystemClock::kClassName()); + ASSERT_EQ(mock->Inner()->Inner(), SystemClock::Default().get()); + opts_str = mock->ToString(config_options_); + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, EmulatedSystemClock::kClassName(), &mock)); +} + +TEST_F(CreateEnvTest, CreateReadOnlyFileSystem) { + std::shared_ptr fs, copy; + + 
ASSERT_OK(FileSystem::CreateFromString( + config_options_, ReadOnlyFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + std::string("id=") + ReadOnlyFileSystem::kClassName() + + "; target=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + opts_str = fs->ToString(config_options_); + ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CreateEnvTest, CreateTimedFileSystem) { + std::shared_ptr fs, copy; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, + TimedFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + std::string("id=") + TimedFileSystem::kClassName() + + "; target=" + ReadOnlyFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + opts_str = fs->ToString(config_options_); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + 
ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} +#ifndef OS_WIN +TEST_F(CreateEnvTest, CreateChrootFileSystem) { + std::shared_ptr fs, copy; + auto tmp_dir = test::TmpDir(Env::Default()); + // The Chroot FileSystem has a required "chroot_dir" option. + ASSERT_NOK(FileSystem::CreateFromString(config_options_, + ChrootFileSystem::kClassName(), &fs)); + + // ChrootFileSystem fails with an invalid directory + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, + std::string("chroot_dir=/No/Such/Directory; id=") + + ChrootFileSystem::kClassName(), + &fs)); + std::string chroot_opts = std::string("chroot_dir=") + tmp_dir + + std::string("; id=") + + ChrootFileSystem::kClassName(); + + // Create a valid ChrootFileSystem with an inner Default + ASSERT_OK(FileSystem::CreateFromString(config_options_, chroot_opts, &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + // Create a valid ChrootFileSystem with an inner TimedFileSystem + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + chroot_opts + "; target=" + TimedFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, 
©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + // Create a TimedFileSystem with an inner ChrootFileSystem + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + "target={" + chroot_opts + "}; id=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), ChrootFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} +#endif // OS_WIN + +TEST_F(CreateEnvTest, CreateEncryptedFileSystem) { + std::shared_ptr fs, copy; + + std::string base_opts = + std::string("provider=1://test; id=") + EncryptedFileSystem::kClassName(); + // The EncryptedFileSystem requires a "provider" option. + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, EncryptedFileSystem::kClassName(), &fs)); + + ASSERT_OK(FileSystem::CreateFromString(config_options_, base_opts, &fs)); + + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(FileSystem::CreateFromString( + config_options_, base_opts + "; target=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + 
ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +#endif // ROCKSDB_LITE + +namespace { + +constexpr size_t kThreads = 8; +constexpr size_t kIdsPerThread = 1000; + +// This is a mini-stress test to check for duplicates in functions like +// GenerateUniqueId() +template > +struct NoDuplicateMiniStressTest { + std::unordered_set ids; + std::mutex mutex; + Env* env; + + NoDuplicateMiniStressTest() { env = Env::Default(); } + + virtual ~NoDuplicateMiniStressTest() {} + + void Run() { + std::array threads; + for (size_t i = 0; i < kThreads; ++i) { + threads[i] = std::thread([&]() { ThreadFn(); }); + } + for (auto& thread : threads) { + thread.join(); + } + // All must be unique + ASSERT_EQ(ids.size(), kThreads * kIdsPerThread); + } + + void ThreadFn() { + std::array my_ids; + // Generate in parallel threads as fast as possible + for (size_t i = 0; i < kIdsPerThread; ++i) { + my_ids[i] = Generate(); + } + // Now collate + std::lock_guard lock(mutex); + for (auto& id : my_ids) { + ids.insert(id); + } + } + + virtual IdType Generate() = 0; +}; + +void VerifyRfcUuids(const std::unordered_set& uuids) { + if (uuids.empty()) { + return; + } +} + +using uint64_pair_t = std::pair; +struct HashUint64Pair { + std::size_t operator()( + std::pair const& u) const noexcept { + // Assume suitable distribution already + return static_cast(u.first ^ u.second); + } +}; + +} // namespace + +TEST_F(EnvTest, GenerateUniqueId) { + struct MyStressTest : public NoDuplicateMiniStressTest { + std::string Generate() override { return env->GenerateUniqueId(); } + }; + + MyStressTest t; + t.Run(); + + // Basically verify RFC-4122 format + for (auto& uuid : t.ids) { + ASSERT_EQ(36U, uuid.size()); + ASSERT_EQ('-', uuid[8]); + ASSERT_EQ('-', uuid[13]); + ASSERT_EQ('-', uuid[18]); + ASSERT_EQ('-', uuid[23]); + } +} + +TEST_F(EnvTest, GenerateDbSessionId) { + struct MyStressTest : public 
NoDuplicateMiniStressTest { + std::string Generate() override { return DBImpl::GenerateDbSessionId(env); } + }; + + MyStressTest t; + t.Run(); + + // Basically verify session ID + for (auto& id : t.ids) { + ASSERT_EQ(20U, id.size()); + } +} + +constexpr bool kRequirePortGenerateRfcUuid = +#if defined(OS_LINUX) || defined(OS_ANDROID) || defined(OS_WIN) + true; +#else + false; +#endif + +TEST_F(EnvTest, PortGenerateRfcUuid) { + if (!kRequirePortGenerateRfcUuid) { + ROCKSDB_GTEST_SKIP("Not supported/expected on this platform"); + return; + } + struct MyStressTest : public NoDuplicateMiniStressTest { + std::string Generate() override { + std::string u; + assert(port::GenerateRfcUuid(&u)); + return u; + } + }; + + MyStressTest t; + t.Run(); + + // Extra verification on versions and variants + VerifyRfcUuids(t.ids); +} + +// Test the atomic, linear generation of GenerateRawUuid +TEST_F(EnvTest, GenerateRawUniqueId) { + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + GenerateRawUniqueId(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +// Test that each entropy source ("track") is at least adequate +TEST_F(EnvTest, GenerateRawUniqueIdTrackPortUuidOnly) { + if (!kRequirePortGenerateRfcUuid) { + ROCKSDB_GTEST_SKIP("Not supported/expected on this platform"); + return; + } + + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, false, true, true); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, GenerateRawUniqueIdTrackEnvDetailsOnly) { + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, true, false, true); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, GenerateRawUniqueIdTrackRandomDeviceOnly) { + struct 
MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, true, true, false); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) { + // Must be thread safe and usable as a static + static SemiStructuredUniqueIdGen gen; + + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + gen.GenerateNext(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, FailureToCreateLockFile) { + auto env = Env::Default(); + auto fs = env->GetFileSystem(); + std::string dir = test::PerThreadDBPath(env, "lockdir"); + std::string file = dir + "/lockfile"; + + // Ensure directory doesn't exist + ASSERT_OK(DestroyDir(env, dir)); + + // Make sure that we can acquire a file lock after the first attempt fails + FileLock* lock = nullptr; + ASSERT_NOK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr)); + ASSERT_FALSE(lock); + + ASSERT_OK(fs->CreateDir(dir, IOOptions(), /*dbg*/ nullptr)); + ASSERT_OK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr)); + ASSERT_OK(fs->UnlockFile(lock, IOOptions(), /*dbg*/ nullptr)); + + // Clean up + ASSERT_OK(DestroyDir(env, dir)); +} + +TEST_F(EnvTest, CreateDefaultEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + + std::shared_ptr guard; + Env* env = nullptr; + ASSERT_OK(Env::CreateFromString(options, "", &env)); + ASSERT_EQ(env, Env::Default()); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env)); + ASSERT_EQ(env, Env::Default()); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, "", &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, nullptr); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, 
nullptr); + +#ifndef ROCKSDB_LITE + std::string opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_EQ(env, Env::Default()); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, nullptr); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +namespace { +class WrappedEnv : public EnvWrapper { + public: + explicit WrappedEnv(Env* t) : EnvWrapper(t) {} + explicit WrappedEnv(const std::shared_ptr& t) : EnvWrapper(t) {} + static const char* kClassName() { return "WrappedEnv"; } + const char* Name() const override { return kClassName(); } + static void Register(ObjectLibrary& lib, const std::string& /*arg*/) { + lib.AddFactory( + WrappedEnv::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new WrappedEnv(nullptr)); + return guard->get(); + }); + } +}; +} // namespace +TEST_F(EnvTest, CreateMockEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + std::shared_ptr guard, copy; + std::string opt_str; + + Env* env = nullptr; + ASSERT_NOK(Env::CreateFromString(options, MockEnv::kClassName(), &env)); + ASSERT_OK( + Env::CreateFromString(options, MockEnv::kClassName(), &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + std::string mismatch; + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + guard.reset(MockEnv::Create(Env::Default(), SystemClock::Default())); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + std::unique_ptr wrapped_env(new WrappedEnv(Env::Default())); + guard.reset(MockEnv::Create(wrapped_env.get(), SystemClock::Default())); + opt_str = guard->ToString(options); + 
ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + opt_str = copy->ToString(options); +} + +TEST_F(EnvTest, CreateWrappedEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + Env* env = nullptr; + std::shared_ptr guard, copy; + std::string opt_str; + std::string mismatch; + + ASSERT_NOK(Env::CreateFromString(options, WrappedEnv::kClassName(), &env)); + ASSERT_OK( + Env::CreateFromString(options, WrappedEnv::kClassName(), &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_FALSE(guard->AreEquivalent(options, Env::Default(), &mismatch)); + + opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + guard.reset(new WrappedEnv(std::make_shared(Env::Default()))); + ASSERT_NE(guard.get(), env); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + guard.reset(new WrappedEnv(std::make_shared( + std::make_shared(Env::Default())))); + ASSERT_NE(guard.get(), env); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); +} + +TEST_F(EnvTest, CreateCompositeEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + std::shared_ptr guard, copy; + Env* env = nullptr; + std::string mismatch, opt_str; + + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + std::unique_ptr base(NewCompositeEnv(FileSystem::Default())); + std::unique_ptr wrapped(new WrappedEnv(Env::Default())); + std::shared_ptr timed_fs = + std::make_shared(FileSystem::Default()); + std::shared_ptr clock = + 
std::make_shared(SystemClock::Default()); + + opt_str = base->ToString(options); + ASSERT_NOK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_EQ(env->GetFileSystem(), FileSystem::Default()); + ASSERT_EQ(env->GetSystemClock(), SystemClock::Default()); + + base = NewCompositeEnv(timed_fs); + opt_str = base->ToString(options); + ASSERT_NOK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_NE(env->GetFileSystem(), FileSystem::Default()); + ASSERT_EQ(env->GetSystemClock(), SystemClock::Default()); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), clock)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs, clock)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); +} +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,20 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "env/composite_env_wrapper.h" #include "rocksdb/file_system.h" + +#include "env/composite_env_wrapper.h" +#include "env/env_chroot.h" +#include "env/env_encryption_ctr.h" +#include "env/fs_readonly.h" +#include "env/mock_env.h" #include "options/db_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" +#include "utilities/env_timed.h" namespace ROCKSDB_NAMESPACE { @@ -16,14 +26,85 @@ Status FileSystem::Load(const std::string& value, std::shared_ptr* result) { - Status s; + return CreateFromString(ConfigOptions(), value, result); +} + #ifndef ROCKSDB_LITE - s = ObjectRegistry::NewInstance()->NewSharedObject(value, result); -#else - (void)result; - s = Status::NotSupported("Cannot load FileSystem in LITE mode: ", value); -#endif - return s; +static int RegisterBuiltinFileSystems(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + TimedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TimedFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory( + ReadOnlyFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new ReadOnlyFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory( + EncryptedFileSystem::kClassName(), + [](const 
std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + Status s = NewEncryptedFileSystemImpl(nullptr, nullptr, guard); + if (!s.ok()) { + *errmsg = s.ToString(); + } + return guard->get(); + }); + library.AddFactory( + MockFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new MockFileSystem(SystemClock::Default())); + return guard->get(); + }); +#ifndef OS_WIN + library.AddFactory( + ChrootFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new ChrootFileSystem(nullptr, "")); + return guard->get(); + }); +#endif // OS_WIN + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status FileSystem::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + auto default_fs = FileSystem::Default(); + if (default_fs->IsInstanceOf(value)) { + *result = default_fs; + return Status::OK(); + } else { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinFileSystems(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(config_options, value, nullptr, result); + } +} + +IOStatus FileSystem::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) { + IOStatus s = RenameFile(old_fname, fname, opts.io_options, dbg); + if (!s.ok()) { + return s; + } + return NewWritableFile(fname, opts, result, dbg); } FileOptions FileSystem::OptimizeForLogRead( @@ -71,12 +152,39 @@ return optimized_file_options; } -Status ReadFileToString(FileSystem* fs, const std::string& fname, - std::string* data) { +FileOptions FileSystem::OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + 
FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + +IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, bool should_sync) { + std::unique_ptr file; + EnvOptions soptions; + IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr); + if (!s.ok()) { + return s; + } + s = file->Append(data, IOOptions(), nullptr); + if (s.ok() && should_sync) { + s = file->Sync(IOOptions(), nullptr); + } + if (!s.ok()) { + fs->DeleteFile(fname, IOOptions(), nullptr); + } + return s; +} + +IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data) { FileOptions soptions; data->clear(); std::unique_ptr file; - Status s = fs->NewSequentialFile(fname, soptions, &file, nullptr); + IOStatus s = status_to_io_status( + fs->NewSequentialFile(fname, soptions, &file, nullptr)); if (!s.ok()) { return s; } @@ -98,13 +206,58 @@ return s; } -#ifdef OS_WIN -std::shared_ptr FileSystem::Default() { - static LegacyFileSystemWrapper default_fs(Env::Default()); - static std::shared_ptr default_fs_ptr( - &default_fs, [](LegacyFileSystemWrapper*) {}); - return default_fs_ptr; +namespace { +static std::unordered_map fs_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +#endif // ROCKSDB_LITE +}; +} // namespace +FileSystemWrapper::FileSystemWrapper(const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", &target_, &fs_wrapper_type_info); +} + +Status FileSystemWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = FileSystem::Default(); + } + return FileSystem::PrepareOptions(options); } -#endif +#ifndef ROCKSDB_LITE +std::string FileSystemWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = 
FileSystem::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(FileSystem::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + +DirFsyncOptions::DirFsyncOptions() { reason = kDefault; } + +DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) { + reason = kFileRenamed; + renamed_new_name = file_renamed_new_name; +} + +DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) { + assert(fsync_reason != kFileRenamed); + reason = fsync_reason; +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,519 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "env/file_system_tracer.h" + +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus FileSystemTracingWrapper::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, 
IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& file_opts, std::unique_ptr* result, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewDirectory( + const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewDirectory(name, io_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, 
s.ToString(), + name.substr(name.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir, + const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetChildren(dir, io_opts, r, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dir.substr(dir.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteFile(fname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDirIfMissing( + const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDirIfMissing(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), 
TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetFileSize(fname, options, file_size, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, elapsed, + s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname, + size_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(fname, size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1), size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::Read(size_t n, + const IOOptions& options, + Slice* 
result, char* scratch, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + offset); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::PositionedRead( + uint64_t offset, size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, 
result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, + size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t latency = elapsed; + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + for (size_t i = 0; i < num_reqs; i++) { + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, latency, + reqs[i].status.ToString(), file_name_, reqs[i].len, reqs[i].offset); + io_tracer_->WriteIOOp(io_record, dbg); + } + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Prefetch(offset, n, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << 
IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + static_cast(offset)); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSWritableFileTracingWrapper::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Append(data, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSWritableFileTracingWrapper::PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->PositionedAppend(data, offset, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, size, + 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + 
return s; +} + +IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + uint64_t file_size = target()->GetFileSize(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, "OK", file_name_, file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return file_size; +} + +IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + static_cast(offset)); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Write(offset, data, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), 
TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Flush(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Sync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, 
s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Fsync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,447 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "trace_replay/io_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +// FileSystemTracingWrapper is a wrapper class above FileSystem that forwards +// the call to the underlying storage system. It then invokes IOTracer to record +// file operations and other contextual information in a binary format for +// tracing. It overrides methods we are interested in tracing and extends +// FileSystemWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FileSystemTracingWrapper : public FileSystemWrapper { + public: + FileSystemTracingWrapper(const std::shared_ptr& t, + const std::shared_ptr& io_tracer) + : FileSystemWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()) {} + + ~FileSystemTracingWrapper() override {} + + static const char* kClassName() { return "FileSystemTracing"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& 
dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; +}; + +// The FileSystemPtr is a wrapper class that takes pointer to storage systems +// (such as posix filesystems). It overloads operator -> and returns a pointer +// of either FileSystem or FileSystemTracingWrapper based on whether tracing is +// enabled or not. It is added to bypass FileSystemTracingWrapper when tracing +// is disabled. +class FileSystemPtr { + public: + FileSystemPtr(std::shared_ptr fs, + const std::shared_ptr& io_tracer) + : fs_(fs), io_tracer_(io_tracer) { + fs_tracer_ = std::make_shared(fs_, io_tracer_); + } + + std::shared_ptr operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_; + } else { + return fs_; + } + } + + /* Returns the underlying File System pointer */ + FileSystem* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else { + return fs_.get(); + } + } + + private: + std::shared_ptr fs_; + std::shared_ptr io_tracer_; + std::shared_ptr fs_tracer_; +}; + +// FSSequentialFileTracingWrapper is a wrapper class above FSSequentialFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSSequentialFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSSequentialFileTracingWrapper : public FSSequentialFileOwnerWrapper { + public: + FSSequentialFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSSequentialFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSSequentialFileTracingWrapper() override {} + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + std::string file_name_; +}; + +// The FSSequentialFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSSequentialFile or FSSequentialFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSSequentialFileTracingWrapper when tracing is disabled. 
+class FSSequentialFilePtr { + public: + FSSequentialFilePtr() = delete; + FSSequentialFilePtr(std::unique_ptr&& fs, + const std::shared_ptr& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSSequentialFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSSequentialFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr io_tracer_; + FSSequentialFileTracingWrapper fs_tracer_; +}; + +// FSRandomAccessFileTracingWrapper is a wrapper class above FSRandomAccessFile +// that forwards the call to the underlying storage system. It then invokes +// IOTracer to record file operations and other contextual information in a +// binary format for tracing. It overrides methods we are interested in tracing +// and extends FSRandomAccessFileWrapper, which forwards all methods that are +// not explicitly overridden. 
+class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper { + public: + FSRandomAccessFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSRandomAccessFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSRandomAccessFileTracingWrapper() override {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSRandomAccessFile or FSRandomAccessFileTracingWrapper +// based on whether tracing is enabled or not. It is added to bypass +// FSRandomAccessFileTracingWrapper when tracing is disabled. 
+class FSRandomAccessFilePtr { + public: + FSRandomAccessFilePtr(std::unique_ptr&& fs, + const std::shared_ptr& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSRandomAccessFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSRandomAccessFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr io_tracer_; + FSRandomAccessFileTracingWrapper fs_tracer_; +}; + +// FSWritableFileTracingWrapper is a wrapper class above FSWritableFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSWritableFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSWritableFileTracingWrapper : public FSWritableFileOwnerWrapper { + public: + FSWritableFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSWritableFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSWritableFileTracingWrapper() override {} + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* dbg) override { + return Append(data, options, dbg); + } + + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, options, dbg); + } + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSWritableFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSWritableFile or FSWritableFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSWritableFileTracingWrapper when tracing is disabled. 
+class FSWritableFilePtr { + public: + FSWritableFilePtr(std::unique_ptr&& fs, + const std::shared_ptr& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer) { + fs_tracer_.reset(new FSWritableFileTracingWrapper( + std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */)); + } + + FSWritableFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else { + return fs_tracer_->target(); + } + } + + FSWritableFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else if (fs_tracer_) { + return fs_tracer_->target(); + } else { + return nullptr; + } + } + + void reset() { + fs_tracer_.reset(); + io_tracer_ = nullptr; + } + + private: + std::shared_ptr io_tracer_; + std::unique_ptr fs_tracer_; +}; + +// FSRandomRWFileTracingWrapper is a wrapper class above FSRandomRWFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSRandomRWFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSRandomRWFileTracingWrapper : public FSRandomRWFileOwnerWrapper { + public: + FSRandomRWFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSRandomRWFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSRandomRWFileTracingWrapper() override {} + + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSRandomRWFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSRandomRWFile or FSRandomRWFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSRandomRWFileTracingWrapper when tracing is disabled. 
+class FSRandomRWFilePtr { + public: + FSRandomRWFilePtr(std::unique_ptr&& fs, + std::shared_ptr io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSRandomRWFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSRandomRWFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr io_tracer_; + FSRandomRWFileTracingWrapper fs_tracer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,16 +6,15 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors + +#if !defined(OS_WIN) + #include #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION #include #endif #include #include - -#if defined(OS_LINUX) -#include -#endif #include #include #include @@ -26,13 +25,13 @@ #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include #include #endif #include #include #include #include + #include // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) @@ -47,14 +46,15 @@ #include #include +#include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/utilities/object_registry.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" @@ -73,6 +73,8 @@ #define EXT4_SUPER_MAGIC 0xEF53 #endif +extern "C" bool RocksDbIOUringEnable() __attribute__((__weak__)); + namespace ROCKSDB_NAMESPACE { namespace { @@ -81,9 +83,16 @@ return allow_non_owner_access ? 0644 : 0600; } +static uint64_t gettid() { return Env::Default()->GetThreadID(); } + // list of pathnames that are locked -static std::set lockedFiles; -static port::Mutex mutex_lockedFiles; +// Only used for error message. 
+struct LockHoldingInfo { + int64_t acquire_time; + uint64_t acquiring_thread; +}; +static std::map locked_files; +static port::Mutex mutex_locked_files; static int LockOrUnlock(int fd, bool lock) { errno = 0; @@ -100,8 +109,18 @@ class PosixFileLock : public FileLock { public: - int fd_; + int fd_ = /*invalid*/ -1; std::string filename; + + void Clear() { + fd_ = -1; + filename.clear(); + } + + virtual ~PosixFileLock() override { + // Check for destruction without UnlockFile + assert(fd_ == -1); + } }; int cloexec_flags(int flags, const EnvOptions* options) { @@ -112,6 +131,8 @@ if (options == nullptr || options->set_fd_cloexec) { flags |= O_CLOEXEC; } +#else + (void)options; #endif return flags; } @@ -120,7 +141,9 @@ public: PosixFileSystem(); - const char* Name() const override { return "Posix File System"; } + static const char* kClassName() { return "PosixFileSystem"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } ~PosixFileSystem() override {} @@ -146,6 +169,7 @@ #endif // !ROCKSDB_LITE #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; + TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags); #endif } @@ -178,7 +202,9 @@ errno); } } - result->reset(new PosixSequentialFile(fname, file, fd, options)); + result->reset(new PosixSequentialFile( + fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd), + options)); return IOStatus::OK(); } @@ -187,7 +213,7 @@ std::unique_ptr* result, IODebugContext* /*dbg*/) override { result->reset(); - IOStatus s; + IOStatus s = IOStatus::OK(); int fd; int flags = cloexec_flags(O_RDONLY, &options); @@ -207,11 +233,12 @@ fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); } while (fd < 0 && errno == EINTR); if (fd < 0) { - return IOError("While open a file for random read", fname, errno); + s = IOError("While open a file for random read", fname, errno); + return 
s; } SetFD_CLOEXEC(fd, &options); - if (options.use_mmap_reads && sizeof(void*) >= 8) { + if (options.use_mmap_reads) { // Use of mmap for random reads has been removed because it // kills performance when storage is fast. // Use mmap when virtual address-space is plentiful. @@ -227,6 +254,8 @@ s = IOError("while mmap file for read", fname, errno); close(fd); } + } else { + close(fd); } } else { if (options.use_direct_reads && !options.use_mmap_reads) { @@ -237,19 +266,20 @@ } #endif } - result->reset(new PosixRandomAccessFile(fname, fd, options + result->reset(new PosixRandomAccessFile( + fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd), + options #if defined(ROCKSDB_IOURING_PRESENT) - , - thread_local_io_urings_.get() + , + !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get() #endif - )); + )); } return s; } virtual IOStatus OpenWritableFile(const std::string& fname, - const FileOptions& options, - bool reopen, + const FileOptions& options, bool reopen, std::unique_ptr* result, IODebugContext* /*dbg*/) { result->reset(); @@ -295,14 +325,7 @@ SetFD_CLOEXEC(fd, &options); if (options.use_mmap_writes) { - if (!checkedDiskForMmap_) { - // this will be executed once in the program's lifetime. - // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
- if (!SupportsFastAllocate(fname)) { - forceMmapOff_ = true; - } - checkedDiskForMmap_ = true; - } + MaybeForceDisableMmap(fd); } if (options.use_mmap_writes && !forceMmapOff_) { result->reset(new PosixMmapFile(fname, fd, page_size_, options)); @@ -323,12 +346,18 @@ } } #endif - result->reset(new PosixWritableFile(fname, fd, options)); + result->reset(new PosixWritableFile( + fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), + options)); } else { // disable mmap writes EnvOptions no_mmap_writes_options = options; no_mmap_writes_options.use_mmap_writes = false; - result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); + result->reset( + new PosixWritableFile(fname, fd, + GetLogicalBlockSizeForWriteIfNeeded( + no_mmap_writes_options, fname, fd), + no_mmap_writes_options)); } return s; } @@ -395,14 +424,7 @@ } if (options.use_mmap_writes) { - if (!checkedDiskForMmap_) { - // this will be executed once in the program's lifetime. - // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
- if (!SupportsFastAllocate(fname)) { - forceMmapOff_ = true; - } - checkedDiskForMmap_ = true; - } + MaybeForceDisableMmap(fd); } if (options.use_mmap_writes && !forceMmapOff_) { result->reset(new PosixMmapFile(fname, fd, page_size_, options)); @@ -423,12 +445,18 @@ } } #endif - result->reset(new PosixWritableFile(fname, fd, options)); + result->reset(new PosixWritableFile( + fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), + options)); } else { // disable mmap writes FileOptions no_mmap_writes_options = options; no_mmap_writes_options.use_mmap_writes = false; - result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); + result->reset( + new PosixWritableFile(fname, fd, + GetLogicalBlockSizeForWriteIfNeeded( + no_mmap_writes_options, fname, fd), + no_mmap_writes_options)); } return s; } @@ -519,10 +547,45 @@ return IOStatus::OK(); } - IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*opts*/, - std::shared_ptr* /*ptr*/, + IOStatus NewLogger(const std::string& fname, const IOOptions& /*opts*/, + std::shared_ptr* result, IODebugContext* /*dbg*/) override { - return IOStatus::NotSupported(); + FILE* f = nullptr; + int fd; + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), + cloexec_flags(O_WRONLY | O_CREAT | O_TRUNC, nullptr), + GetDBFileMode(allow_non_owner_access_)); + if (fd != -1) { + f = fdopen(fd, + "w" +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 7) + "e" // glibc extension to enable O_CLOEXEC +#endif +#endif + ); + } + } + if (fd == -1) { + result->reset(); + return status_to_io_status( + IOError("when open a file for new logger", fname, errno)); + } + if (f == nullptr) { + close(fd); + result->reset(); + return status_to_io_status( + IOError("when fdopen a file for new logger", fname, errno)); + } else { +#ifdef ROCKSDB_FALLOCATE_PRESENT + fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024); +#endif + SetFD_CLOEXEC(fd, nullptr); + result->reset(new PosixLogger(f, &gettid, Env::Default())); + 
return IOStatus::OK(); + } } IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/, @@ -543,7 +606,8 @@ return IOStatus::NotFound(); default: assert(err == EIO || err == ENOMEM); - return IOStatus::IOError("Unexpected error(" + ToString(err) + + return IOStatus::IOError("Unexpected error(" + + ROCKSDB_NAMESPACE::ToString(err) + ") accessing file `" + fname + "' "); } } @@ -552,6 +616,7 @@ std::vector* result, IODebugContext* /*dbg*/) override { result->clear(); + DIR* d = opendir(dir.c_str()); if (d == nullptr) { switch (errno) { @@ -563,11 +628,36 @@ return IOError("While opendir", dir, errno); } } + + // reset errno before calling readdir() + errno = 0; struct dirent* entry; while ((entry = readdir(d)) != nullptr) { - result->push_back(entry->d_name); + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + entry->d_type == DT_DIR && + (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0); + if (!ignore) { + result->push_back(entry->d_name); + } + errno = 0; // reset errno if readdir() success + } + + // always attempt to close the dir + const auto pre_close_errno = errno; // errno may be modified by closedir + const int close_result = closedir(d); + + if (pre_close_errno != 0) { + // error occurred during readdir + return IOError("While readdir", dir, pre_close_errno); + } + + if (close_result != 0) { + // error occurred during closedir + return IOError("While closedir", dir, errno); } - closedir(d); + return IOStatus::OK(); } @@ -582,50 +672,46 @@ IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (mkdir(name.c_str(), 0755) != 0) { - result = IOError("While mkdir", name, errno); + return IOError("While mkdir", name, errno); } - return result; + return IOStatus::OK(); } IOStatus CreateDirIfMissing(const std::string& name, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus 
result; if (mkdir(name.c_str(), 0755) != 0) { if (errno != EEXIST) { - result = IOError("While mkdir if missing", name, errno); + return IOError("While mkdir if missing", name, errno); } else if (!DirExists(name)) { // Check that name is actually a // directory. // Message is taken from mkdir - result = - IOStatus::IOError("`" + name + "' exists but is not a directory"); + return IOStatus::IOError("`" + name + + "' exists but is not a directory"); } } - return result; + return IOStatus::OK(); } IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (rmdir(name.c_str()) != 0) { - result = IOError("file rmdir", name, errno); + return IOError("file rmdir", name, errno); } - return result; + return IOStatus::OK(); } IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/, uint64_t* size, IODebugContext* /*dbg*/) override { - IOStatus s; struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { *size = 0; - s = IOError("while stat a file for size", fname, errno); + return IOError("while stat a file for size", fname, errno); } else { *size = sbuf.st_size; } - return s; + return IOStatus::OK(); } IOStatus GetFileModificationTime(const std::string& fname, @@ -643,24 +729,24 @@ IOStatus RenameFile(const std::string& src, const std::string& target, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (rename(src.c_str(), target.c_str()) != 0) { - result = IOError("While renaming a file to " + target, src, errno); + return IOError("While renaming a file to " + target, src, errno); } - return result; + return IOStatus::OK(); } IOStatus LinkFile(const std::string& src, const std::string& target, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (link(src.c_str(), target.c_str()) != 0) { - if (errno == EXDEV) { - return IOStatus::NotSupported("No cross FS links allowed"); + if (errno == EXDEV || errno == ENOTSUP) { + return 
IOStatus::NotSupported(errno == EXDEV + ? "No cross FS links allowed" + : "Links not supported by FS"); } - result = IOError("while link file to " + target, src, errno); + return IOError("while link file to " + target, src, errno); } - return result; + return IOStatus::OK(); } IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/, @@ -697,11 +783,19 @@ IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/, FileLock** lock, IODebugContext* /*dbg*/) override { *lock = nullptr; - IOStatus result; - mutex_lockedFiles.Lock(); - // If it already exists in the lockedFiles set, then it is already locked, - // and fail this lock attempt. Otherwise, insert it into lockedFiles. + LockHoldingInfo lhi; + int64_t current_time = 0; + // Ignore status code as the time is only used for error message. + SystemClock::Default() + ->GetCurrentTime(¤t_time) + .PermitUncheckedError(); + lhi.acquire_time = current_time; + lhi.acquiring_thread = Env::Default()->GetThreadID(); + + mutex_locked_files.Lock(); + // If it already exists in the locked_files set, then it is already locked, + // and fail this lock attempt. Otherwise, insert it into locked_files. // This check is needed because fcntl() does not detect lock conflict // if the fcntl is issued by the same thread that earlier acquired // this lock. @@ -709,12 +803,22 @@ // Otherwise, we will open a new file descriptor. 
Locks are associated with // a process, not a file descriptor and when *any* file descriptor is // closed, all locks the process holds for that *file* are released - if (lockedFiles.insert(fname).second == false) { - mutex_lockedFiles.Unlock(); + const auto it_success = locked_files.insert({fname, lhi}); + if (it_success.second == false) { + LockHoldingInfo prev_info = it_success.first->second; + mutex_locked_files.Unlock(); errno = ENOLCK; - return IOError("lock ", fname, errno); + // Note that the thread ID printed is the same one as the one in + // posix logger, but posix logger prints it hex format. + return IOError( + "lock hold by current process, acquire time " + + ROCKSDB_NAMESPACE::ToString(prev_info.acquire_time) + + " acquiring thread " + + ROCKSDB_NAMESPACE::ToString(prev_info.acquiring_thread), + fname, errno); } + IOStatus result = IOStatus::OK(); int fd; int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr); @@ -725,9 +829,6 @@ if (fd < 0) { result = IOError("while open a file for lock", fname, errno); } else if (LockOrUnlock(fd, true) == -1) { - // if there is an error in locking, then remove the pathname from - // lockedfiles - lockedFiles.erase(fname); result = IOError("While lock file", fname, errno); close(fd); } else { @@ -737,8 +838,14 @@ my_lock->filename = fname; *lock = my_lock; } + if (!result.ok()) { + // If there is an error in locking, then remove the pathname from + // locked_files. (If we got this far, it did not exist in locked_files + // before this call.) + locked_files.erase(fname); + } - mutex_lockedFiles.Unlock(); + mutex_locked_files.Unlock(); return result; } @@ -746,18 +853,19 @@ IODebugContext* /*dbg*/) override { PosixFileLock* my_lock = reinterpret_cast(lock); IOStatus result; - mutex_lockedFiles.Lock(); + mutex_locked_files.Lock(); // If we are unlocking, then verify that we had locked it earlier, - // it should already exist in lockedFiles. Remove it from lockedFiles. 
- if (lockedFiles.erase(my_lock->filename) != 1) { + // it should already exist in locked_files. Remove it from locked_files. + if (locked_files.erase(my_lock->filename) != 1) { errno = ENOLCK; result = IOError("unlock", my_lock->filename, errno); } else if (LockOrUnlock(my_lock->fd_, false) == -1) { result = IOError("unlock", my_lock->filename, errno); } close(my_lock->fd_); + my_lock->Clear(); delete my_lock; - mutex_lockedFiles.Unlock(); + mutex_locked_files.Unlock(); return result; } @@ -772,7 +880,7 @@ char the_path[256]; char* ret = getcwd(the_path, 256); if (ret == nullptr) { - return IOStatus::IOError(strerror(errno)); + return IOStatus::IOError(errnoStr(errno).c_str()); } *output_path = ret; @@ -792,7 +900,7 @@ // Directory may already exist { IOOptions opts; - CreateDir(*result, opts, nullptr); + return CreateDirIfMissing(*result, opts, nullptr); } return IOStatus::OK(); } @@ -806,12 +914,46 @@ return IOError("While doing statvfs", fname, errno); } - *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + // sbuf.bfree is total free space available to root + // sbuf.bavail is total free space available to unprivileged user + // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id + if (geteuid()) { + // non-zero user is unprivileged, or -1 if error. 
take more conservative + // size + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail); + } else { + // root user can access all disk space + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + } return IOStatus::OK(); } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + // First open + int fd = -1; + int flags = cloexec_flags(O_RDONLY, nullptr); + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(path.c_str(), flags); + } + if (fd < 0) { + return IOError("While open for IsDirectory()", path, errno); + } + IOStatus io_s; + struct stat sbuf; + if (fstat(fd, &sbuf) < 0) { + io_s = IOError("While doing stat for IsDirectory()", path, errno); + } + close(fd); + if (io_s.ok() && nullptr != is_dir) { + *is_dir = S_ISDIR(sbuf.st_mode); + } + return io_s; + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { + const DBOptions& db_options) const override { FileOptions optimized = file_options; optimized.use_mmap_writes = false; optimized.use_direct_writes = false; @@ -833,10 +975,17 @@ optimized.fallocate_with_keep_size = true; return optimized; } - +#ifdef OS_LINUX + Status RegisterDbPaths(const std::vector& paths) override { + return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths); + } + Status UnregisterDbPaths(const std::vector& paths) override { + logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths); + return Status::OK(); + } +#endif private: - bool checkedDiskForMmap_; - bool forceMmapOff_; // do we override Env options? + bool forceMmapOff_ = false; // do we override Env options? // Returns true iff the named directory exists and is a directory. 
virtual bool DirExists(const std::string& dname) { @@ -847,10 +996,10 @@ return false; // stat() failed return false } - bool SupportsFastAllocate(const std::string& path) { + bool SupportsFastAllocate(int fd) { #ifdef ROCKSDB_FALLOCATE_PRESENT struct statfs s; - if (statfs(path.c_str(), &s)) { + if (fstatfs(fd, &s)) { return false; } switch (s.f_type) { @@ -864,11 +1013,36 @@ return false; } #else - (void)path; + (void)fd; return false; #endif } + void MaybeForceDisableMmap(int fd) { + static std::once_flag s_check_disk_for_mmap_once; + assert(this == FileSystem::Default().get()); + std::call_once( + s_check_disk_for_mmap_once, + [this](int fdesc) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. + if (!SupportsFastAllocate(fdesc)) { + forceMmapOff_ = true; + } + }, + fd); + } + +#ifdef ROCKSDB_IOURING_PRESENT + bool IsIOUringEnabled() { + if (RocksDbIOUringEnable && RocksDbIOUringEnable()) { + return true; + } else { + return false; + } + } +#endif // ROCKSDB_IOURING_PRESENT + #if defined(ROCKSDB_IOURING_PRESENT) // io_uring instance std::unique_ptr thread_local_io_urings_; @@ -879,11 +1053,50 @@ // If true, allow non owner read access for db files. Otherwise, non-owner // has no access to db files. bool allow_non_owner_access_; + +#ifdef OS_LINUX + static LogicalBlockSizeCache logical_block_size_cache_; +#endif + static size_t GetLogicalBlockSize(const std::string& fname, int fd); + // In non-direct IO mode, this directly returns kDefaultPageSize. + // Otherwise call GetLogicalBlockSize. 
+ static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); + static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); }; +#ifdef OS_LINUX +LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_; +#endif + +size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) { +#ifdef OS_LINUX + return logical_block_size_cache_.GetLogicalBlockSize(fname, fd); +#else + (void)fname; + return PosixHelper::GetLogicalBlockSizeOfFd(fd); +#endif +} + +size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_reads + ? PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + +size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_writes + ? PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + PosixFileSystem::PosixFileSystem() - : checkedDiskForMmap_(false), - forceMmapOff_(false), + : forceMmapOff_(false), page_size_(getpagesize()), allow_non_owner_access_(true) { #if defined(ROCKSDB_IOURING_PRESENT) @@ -910,4 +1123,17 @@ return default_fs_ptr; } +#ifndef ROCKSDB_LITE +static FactoryFunc posix_filesystem_reg = + ObjectLibrary::Default()->AddFactory( + ObjectLibrary::PatternEntry("posix").AddSeparator("://", false), + [](const std::string& /* uri */, std::unique_ptr* f, + std::string* /* errmsg */) { + f->reset(new PosixFileSystem()); + return f->get(); + }); +#endif + } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_readonly.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_readonly.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_readonly.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_readonly.h 2025-05-19 
16:14:27.000000000 +0000 @@ -0,0 +1,107 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// A FileSystem wrapper that only allows read-only operation. +// +// This class has not been fully analyzed for providing strong security +// guarantees. +class ReadOnlyFileSystem : public FileSystemWrapper { + static inline IOStatus FailReadOnly() { + IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem"); + assert(s.GetRetryable() == false); + return s; + } + + public: + explicit ReadOnlyFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { return "ReadOnlyFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewWritableFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus ReuseWritableFile(const std::string& /*fname*/, + const std::string& /*old_fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewDirectory(const std::string& /*dir*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus DeleteFile(const std::string& /*fname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus 
CreateDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override { + // Allow if dir already exists + bool is_dir = false; + IOStatus s = IsDirectory(dirname, options, &is_dir, dbg); + if (s.ok() && is_dir) { + return s; + } else { + return FailReadOnly(); + } + } + IOStatus DeleteDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/, + FileLock** /*lock*/, IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/, + std::shared_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,306 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "env/fs_remap.h" + +namespace ROCKSDB_NAMESPACE { + +RemapFileSystem::RemapFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + +std::pair RemapFileSystem::EncodePathWithNewBasename( + const std::string& path) { + // No difference by default + return EncodePath(path); +} + +Status RemapFileSystem::RegisterDbPaths(const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::RegisterDbPaths(encoded_paths); +} + +Status RemapFileSystem::UnregisterDbPaths( + const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::UnregisterDbPaths(encoded_paths); +} + +IOStatus RemapFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second, + options, 
result, dbg); +} + +IOStatus RemapFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + auto status_and_old_enc_path = EncodePath(old_fname); + if (!status_and_old_enc_path.first.ok()) { + return status_and_old_enc_path.first; + } + return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second, + status_and_old_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::NewDirectory(const std::string& dir, + const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewDirectory(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if 
(!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::FileExists(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::GetChildren(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildren(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildrenFileAttributes( + status_and_enc_path.second, options, result, dbg); +} + +IOStatus RemapFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second, + options, dbg); +} + +IOStatus 
RemapFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options, + file_size, dbg); +} + +IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second, + options, file_mtime, dbg); +} + +IOStatus RemapFileSystem::IsDirectory(const std::string& path, + const IOOptions& options, bool* is_dir, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options, + is_dir, dbg); +} + +IOStatus RemapFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::RenameFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, 
options, + dbg); +} + +IOStatus RemapFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::LinkFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + // FileLock subclasses may store path (e.g., PosixFileLock stores it). We + // can skip stripping the chroot directory from this path because callers + // shouldn't use it. 
+ return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock, + dbg); +} + +IOStatus RemapFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewLogger(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(db_path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options, + output_path, dbg); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,139 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// An abstract FileSystem wrapper that creates a view of an existing +// FileSystem by remapping names in some way. +// +// This class has not been fully analyzed for providing strong security +// guarantees. 
+class RemapFileSystem : public FileSystemWrapper { + public: + explicit RemapFileSystem(const std::shared_ptr& base); + + protected: + // Returns status and mapped-to path in the wrapped filesystem. + // If it returns non-OK status, the returned path should not be used. + virtual std::pair EncodePath( + const std::string& path) = 0; + + // Similar to EncodePath() except used in cases in which it is OK for + // no file or directory on 'path' to already exist, such as if the + // operation would create one. However, the parent of 'path' is expected + // to exist for the operation to succeed. + // Default implementation: call EncodePath + virtual std::pair EncodePathWithNewBasename( + const std::string& path); + + public: + // Left abstract: + // const char* Name() const override { ... } + static const char* kClassName() { return "RemapFileSystem"; } + bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return FileSystemWrapper::IsInstanceOf(id); + } + } + + Status RegisterDbPaths(const std::vector& paths) override; + + Status UnregisterDbPaths(const std::vector& paths) override; + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& dir, const IOOptions& options, + 
std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LinkFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) override; + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; +}; + +} // namespace 
ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -27,11 +27,11 @@ #include #ifdef OS_LINUX #include -#include #include #endif #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/slice.h" #include "test_util/sync_point.h" #include "util/autovector.h" @@ -45,6 +45,35 @@ namespace ROCKSDB_NAMESPACE { +std::string IOErrorMsg(const std::string& context, + const std::string& file_name) { + if (file_name.empty()) { + return context; + } + return context + ": " + file_name; +} + +// file_name can be left empty if it is not unkown. +IOStatus IOError(const std::string& context, const std::string& file_name, + int err_number) { + switch (err_number) { + case ENOSPC: { + IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + s.SetRetryable(true); + return s; + } + case ESTALE: + return IOStatus::IOError(IOStatus::kStaleFile); + case ENOENT: + return IOStatus::PathNotFound(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + default: + return IOStatus::IOError(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + } +} + // A wrapper for fadvise, if the platform doesn't support fadvise, // it will simply return 0. int Fadvise(int fd, off_t offset, size_t len, int advice) { @@ -112,75 +141,6 @@ return true; } -size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { -#ifdef OS_LINUX - struct stat buf; - int result = fstat(fd, &buf); - if (result == -1) { - return kDefaultPageSize; - } - if (major(buf.st_dev) == 0) { - // Unnamed devices (e.g. non-device mounts), reserved as null device number. 
- // These don't have an entry in /sys/dev/block/. Return a sensible default. - return kDefaultPageSize; - } - - // Reading queue/logical_block_size does not require special permissions. - const int kBufferSize = 100; - char path[kBufferSize]; - char real_path[PATH_MAX + 1]; - snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), - minor(buf.st_dev)); - if (realpath(path, real_path) == nullptr) { - return kDefaultPageSize; - } - std::string device_dir(real_path); - if (!device_dir.empty() && device_dir.back() == '/') { - device_dir.pop_back(); - } - // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda - // and nvme0n1 have it. - // $ ls -al '/sys/dev/block/8:3' - // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> - // ../../block/sda/sda3 - // $ ls -al '/sys/dev/block/259:4' - // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> - // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 - size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); - if (parent_end == std::string::npos) { - return kDefaultPageSize; - } - size_t parent_begin = device_dir.rfind('/', parent_end - 1); - if (parent_begin == std::string::npos) { - return kDefaultPageSize; - } - std::string parent = - device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); - std::string child = device_dir.substr(parent_end + 1, std::string::npos); - if (parent != "block" && - (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { - device_dir = device_dir.substr(0, parent_end); - } - std::string fname = device_dir + "/queue/logical_block_size"; - FILE* fp; - size_t size = 0; - fp = fopen(fname.c_str(), "r"); - if (fp != nullptr) { - char* line = nullptr; - size_t len = 0; - if (getline(&line, &len, fp) != -1) { - sscanf(line, "%zu", &size); - } - free(line); - fclose(fp); - } - if (size != 0 && (size & (size - 1)) == 0) { - return size; - } -#endif - return kDefaultPageSize; 
-} - #ifdef ROCKSDB_RANGESYNC_PRESENT #if !defined(ZFS_SUPER_MAGIC) @@ -190,11 +150,11 @@ #endif bool IsSyncFileRangeSupported(int fd) { - // The approach taken in this function is to build a blacklist of cases where - // we know `sync_file_range` definitely will not work properly despite passing - // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or - // if any of the checks fail in unexpected ways, we allow `sync_file_range` to - // be used. This way should minimize risk of impacting existing use cases. + // This function tracks and checks for cases where we know `sync_file_range` + // definitely will not work properly despite passing the compile-time check + // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks + // fail in unexpected ways, we allow `sync_file_range` to be used. This way + // should minimize risk of impacting existing use cases. struct statfs buf; int ret = fstatfs(fd, &buf); assert(ret == 0); @@ -216,7 +176,7 @@ // ("Function not implemented"). return false; } - // None of the cases on the blacklist matched, so allow `sync_file_range` use. + // None of the known cases matched, so allow `sync_file_range` use. 
return true; } @@ -229,30 +189,31 @@ /* * DirectIOHelper */ -#ifndef NDEBUG namespace { bool IsSectorAligned(const size_t off, size_t sector_size) { - return off % sector_size == 0; + assert((sector_size & (sector_size - 1)) == 0); + return (off & (sector_size - 1)) == 0; } +#ifndef NDEBUG bool IsSectorAligned(const void* ptr, size_t sector_size) { return uintptr_t(ptr) % sector_size == 0; } - -} // namespace #endif +} // namespace /* * PosixSequentialFile */ PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file, - int fd, const EnvOptions& options) + int fd, size_t logical_block_size, + const EnvOptions& options) : filename_(fname), file_(file), fd_(fd), use_direct_io_(options.use_direct_reads), - logical_sector_size_(GetLogicalBufferSize(fd_)) { + logical_sector_size_(logical_block_size) { assert(!options.use_direct_reads || !options.use_mmap_reads); } @@ -273,6 +234,7 @@ IOStatus s; size_t r = 0; do { + clearerr(file_); r = fread_unlocked(scratch, 1, n, file_); } while (r == 0 && ferror(file_) && errno == EINTR); *result = Slice(scratch, r); @@ -314,7 +276,7 @@ ptr += r; offset += r; left -= r; - if (r % static_cast(GetRequiredBufferAlignment()) != 0) { + if (!IsSectorAligned(r, GetRequiredBufferAlignment())) { // Bytes reads don't fill sectors. Should only happen at the end // of the file. 
break; @@ -409,13 +371,178 @@ return static_cast(rid - id); } #endif + +#ifdef OS_LINUX +std::string RemoveTrailingSlash(const std::string& path) { + std::string p = path; + if (p.size() > 1 && p.back() == '/') { + p.pop_back(); + } + return p; +} + +Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize( + const std::vector& directories) { + std::vector dirs; + dirs.reserve(directories.size()); + for (auto& d : directories) { + dirs.emplace_back(RemoveTrailingSlash(d)); + } + + std::map dir_sizes; + { + ReadLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + if (cache_.find(dir) == cache_.end()) { + dir_sizes.emplace(dir, 0); + } + } + } + + Status s; + for (auto& dir_size : dir_sizes) { + s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second); + if (!s.ok()) { + return s; + } + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto& v = cache_[dir]; + v.ref++; + auto dir_size = dir_sizes.find(dir); + if (dir_size != dir_sizes.end()) { + v.size = dir_size->second; + } + } + return s; +} + +void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize( + const std::vector& directories) { + std::vector dirs; + dirs.reserve(directories.size()); + for (auto& dir : directories) { + dirs.emplace_back(RemoveTrailingSlash(dir)); + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto it = cache_.find(dir); + if (it != cache_.end() && !(--(it->second.ref))) { + cache_.erase(it); + } + } +} + +size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname, + int fd) { + std::string dir = fname.substr(0, fname.find_last_of("/")); + if (dir.empty()) { + dir = "/"; + } + { + ReadLock lock(&cache_mutex_); + auto it = cache_.find(dir); + if (it != cache_.end()) { + return it->second.size; + } + } + return get_logical_block_size_of_fd_(fd); +} +#endif + +Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory, + size_t* size) { + int fd = open(directory.c_str(), 
O_DIRECTORY | O_RDONLY); + if (fd == -1) { + close(fd); + return Status::IOError("Cannot open directory " + directory); + } + *size = PosixHelper::GetLogicalBlockSizeOfFd(fd); + close(fd); + return Status::OK(); +} + +size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) { +#ifdef OS_LINUX + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return kDefaultPageSize; + } + if (major(buf.st_dev) == 0) { + // Unnamed devices (e.g. non-device mounts), reserved as null device number. + // These don't have an entry in /sys/dev/block/. Return a sensible default. + return kDefaultPageSize; + } + + // Reading queue/logical_block_size does not require special permissions. + const int kBufferSize = 100; + char path[kBufferSize]; + char real_path[PATH_MAX + 1]; + snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), + minor(buf.st_dev)); + if (realpath(path, real_path) == nullptr) { + return kDefaultPageSize; + } + std::string device_dir(real_path); + if (!device_dir.empty() && device_dir.back() == '/') { + device_dir.pop_back(); + } + // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda + // and nvme0n1 have it. + // $ ls -al '/sys/dev/block/8:3' + // lrwxrwxrwx. 
1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> + // ../../block/sda/sda3 + // $ ls -al '/sys/dev/block/259:4' + // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> + // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 + size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); + if (parent_end == std::string::npos) { + return kDefaultPageSize; + } + size_t parent_begin = device_dir.rfind('/', parent_end - 1); + if (parent_begin == std::string::npos) { + return kDefaultPageSize; + } + std::string parent = + device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); + std::string child = device_dir.substr(parent_end + 1, std::string::npos); + if (parent != "block" && + (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { + device_dir = device_dir.substr(0, parent_end); + } + std::string fname = device_dir + "/queue/logical_block_size"; + FILE* fp; + size_t size = 0; + fp = fopen(fname.c_str(), "r"); + if (fp != nullptr) { + char* line = nullptr; + size_t len = 0; + if (getline(&line, &len, fp) != -1) { + sscanf(line, "%zu", &size); + } + free(line); + fclose(fp); + } + if (size != 0 && (size & (size - 1)) == 0) { + return size; + } +#endif + (void)fd; + return kDefaultPageSize; +} + /* * PosixRandomAccessFile * * pread() based random-access */ PosixRandomAccessFile::PosixRandomAccessFile( - const std::string& fname, int fd, const EnvOptions& options + const std::string& fname, int fd, size_t logical_block_size, + const EnvOptions& options #if defined(ROCKSDB_IOURING_PRESENT) , ThreadLocalPtr* thread_local_io_urings @@ -424,14 +551,14 @@ : filename_(fname), fd_(fd), use_direct_io_(options.use_direct_reads), - logical_sector_size_(GetLogicalBufferSize(fd_)) + logical_sector_size_(logical_block_size) #if defined(ROCKSDB_IOURING_PRESENT) , thread_local_io_urings_(thread_local_io_urings) #endif { assert(!options.use_direct_reads || !options.use_mmap_reads); - 
assert(!options.use_mmap_reads || sizeof(void*) < 8); + assert(!options.use_mmap_reads); } PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } @@ -481,6 +608,14 @@ size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { + if (use_direct_io()) { + for (size_t i = 0; i < num_reqs; i++) { + assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment())); + } + } + #if defined(ROCKSDB_IOURING_PRESENT) struct io_uring* iu = nullptr; if (thread_local_io_urings_) { @@ -499,6 +634,8 @@ return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); } + IOStatus ios = IOStatus::OK(); + struct WrappedReadRequest { FSReadRequest* req; struct iovec iov; @@ -508,6 +645,7 @@ autovector req_wraps; autovector incomplete_rq_list; + std::unordered_set wrap_cache; for (size_t i = 0; i < num_reqs; i++) { req_wraps.emplace_back(&reqs[i]); @@ -540,26 +678,71 @@ sqe, fd_, &rep_to_submit->iov, 1, rep_to_submit->req->offset + rep_to_submit->finished_len); io_uring_sqe_set_data(sqe, rep_to_submit); + wrap_cache.emplace(rep_to_submit); } incomplete_rq_list.clear(); ssize_t ret = io_uring_submit_and_wait(iu, static_cast(this_reqs)); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + &ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + iu); + if (static_cast(ret) != this_reqs) { fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); + // If error happens and we submitted fewer than expected, it is an + // exception case and we don't retry here. We should still consume + // what is is submitted in the ring. 
+ for (ssize_t i = 0; i < ret; i++) { + struct io_uring_cqe* cqe = nullptr; + io_uring_wait_cqe(iu, &cqe); + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + } + return IOStatus::IOError("io_uring_submit_and_wait() requested " + + ToString(this_reqs) + " but returned " + + ToString(ret)); } - assert(static_cast(ret) == this_reqs); for (size_t i = 0; i < this_reqs; i++) { - struct io_uring_cqe* cqe; + struct io_uring_cqe* cqe = nullptr; WrappedReadRequest* req_wrap; // We could use the peek variant here, but this seems safer in terms // of our initial wait not reaping all completions ret = io_uring_wait_cqe(iu, &cqe); - assert(!ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret); + if (ret) { + ios = IOStatus::IOError("io_uring_wait_cqe() returns " + ToString(ret)); + + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + continue; + } req_wrap = static_cast(io_uring_cqe_get_data(cqe)); + // Reset cqe data to catch any stray reuse of it + static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + // Check that we got a valid unique cqe data + auto wrap_check = wrap_cache.find(req_wrap); + if (wrap_check == wrap_cache.end()) { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "Bad cqe data from IO uring - %p\n", + req_wrap); + port::PrintStack(); + ios = IOStatus::IOError("io_uring_cqe_get_data() returned " + + ToString((uint64_t)req_wrap)); + continue; + } + wrap_cache.erase(wrap_check); + FSReadRequest* req = req_wrap->req; if (cqe->res < 0) { req->result = Slice(req->scratch, 0); @@ -576,13 +759,22 @@ // comment // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435 // Fall back to pread in this case. 
- Slice tmp_slice; - req->status = - Read(req->offset + req_wrap->finished_len, - req->len - req_wrap->finished_len, options, &tmp_slice, - req->scratch + req_wrap->finished_len, dbg); - req->result = - Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + if (use_direct_io() && + !IsSectorAligned(req_wrap->finished_len, + GetRequiredBufferAlignment())) { + // Bytes reads don't fill sectors. Should only happen at the end + // of the file. + req->result = Slice(req->scratch, req_wrap->finished_len); + req->status = IOStatus::OK(); + } else { + Slice tmp_slice; + req->status = + Read(req->offset + req_wrap->finished_len, + req->len - req_wrap->finished_len, options, &tmp_slice, + req->scratch + req_wrap->finished_len, dbg); + req->result = + Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + } } else if (bytes_read < req_wrap->iov.iov_len) { assert(bytes_read > 0); assert(bytes_read + req_wrap->finished_len < req->len); @@ -596,8 +788,9 @@ } io_uring_cqe_seen(iu, cqe); } + wrap_cache.clear(); } - return IOStatus::OK(); + return ios; #else return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); #endif @@ -750,7 +943,7 @@ * knows enough to skip zero suffixes. 
*/ IOStatus PosixMmapFile::UnmapCurrentRegion() { - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); if (base_ != nullptr) { int munmap_status = munmap(base_, limit_ - base_); if (munmap_status != 0) { @@ -773,7 +966,7 @@ IOStatus PosixMmapFile::MapNewRegion() { #ifdef ROCKSDB_FALLOCATE_PRESENT assert(base_ == nullptr); - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); // we can't fallocate with FALLOC_FL_KEEP_SIZE here if (allow_fallocate_) { IOSTATS_TIMER_GUARD(allocate_nanos); @@ -784,17 +977,17 @@ } if (alloc_status != 0) { return IOStatus::IOError("Error allocating space to file : " + filename_ + - "Error : " + strerror(alloc_status)); + "Error : " + errnoStr(alloc_status).c_str()); } } - TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:1"); void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, file_offset_); if (ptr == MAP_FAILED) { return IOStatus::IOError("MMap failed on " + filename_); } - TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:2"); base_ = reinterpret_cast(ptr); limit_ = base_ + map_size_; @@ -815,7 +1008,7 @@ size_t p1 = TruncateToPageBoundary(last_sync_ - base_); size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); last_sync_ = dst_; - TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Msync:0"); if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { return IOError("While msync", filename_, errno); } @@ -846,7 +1039,8 @@ PosixMmapFile::~PosixMmapFile() { if (fd_ >= 0) { - PosixMmapFile::Close(IOOptions(), nullptr); + IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); } } @@ -867,7 +1061,7 @@ if (!s.ok()) { return s; } - 
TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:0"); } size_t n = (left <= avail) ? left : avail; @@ -914,9 +1108,15 @@ IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync mmapped file", filename_, errno); } +#endif // HAVE_FULLFSYNC return Msync(); } @@ -926,9 +1126,15 @@ */ IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync mmaped file", filename_, errno); } +#endif // HAVE_FULLFSYNC return Msync(); } @@ -965,7 +1171,7 @@ IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Allocate:0"); int alloc_status = 0; if (allow_fallocate_) { alloc_status = @@ -988,13 +1194,14 @@ * Use posix write to write data to a file. 
*/ PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, + size_t logical_block_size, const EnvOptions& options) : FSWritableFile(options), filename_(fname), use_direct_io_(options.use_direct_writes), fd_(fd), filesize_(0), - logical_sector_size_(GetLogicalBufferSize(fd_)) { + logical_sector_size_(logical_block_size) { #ifdef ROCKSDB_FALLOCATE_PRESENT allow_fallocate_ = options.allow_fallocate; fallocate_with_keep_size_ = options.fallocate_with_keep_size; @@ -1007,7 +1214,8 @@ PosixWritableFile::~PosixWritableFile() { if (fd_ >= 0) { - PosixWritableFile::Close(IOOptions(), nullptr); + IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); } } @@ -1067,6 +1275,7 @@ size_t block_size; size_t last_allocated_block; GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); if (last_allocated_block > 0) { // trim the extra space preallocated at the end of the file // NOTE(ljin): we probably don't want to surface failure as an IOError, @@ -1123,17 +1332,29 @@ IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } @@ -1186,7 +1407,7 @@ IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= 
static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixWritableFile::Allocate:0"); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { @@ -1249,7 +1470,8 @@ PosixRandomRWFile::~PosixRandomRWFile() { if (fd_ >= 0) { - Close(IOOptions(), nullptr); + IOStatus s = Close(IOOptions(), nullptr); + s.PermitUncheckedError(); } } @@ -1305,17 +1527,29 @@ IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync random read/write file", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync random read/write file", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } @@ -1336,17 +1570,71 @@ /* * PosixDirectory */ +#if !defined(BTRFS_SUPER_MAGIC) +// The magic number for BTRFS is fixed, if it's not defined, define it here +#define BTRFS_SUPER_MAGIC 0x9123683E +#endif +PosixDirectory::PosixDirectory(int fd) : fd_(fd) { + is_btrfs_ = false; +#ifdef OS_LINUX + struct statfs buf; + int ret = fstatfs(fd, &buf); + is_btrfs_ = (ret == 0 && buf.f_type == static_cast( + BTRFS_SUPER_MAGIC)); +#endif +} PosixDirectory::~PosixDirectory() { close(fd_); } -IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/, - IODebugContext* /*dbg*/) { +IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) { + return FsyncWithDirOptions(opts, dbg, DirFsyncOptions()); +} + 
+IOStatus PosixDirectory::FsyncWithDirOptions( + const IOOptions& /*opts*/, IODebugContext* /*dbg*/, + const DirFsyncOptions& dir_fsync_options) { + IOStatus s = IOStatus::OK(); #ifndef OS_AIX + if (is_btrfs_) { + // skip dir fsync for new file creation, which is not needed for btrfs + if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) { + return s; + } + // skip dir fsync for renaming file, only need to sync new file + if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) { + std::string new_name = dir_fsync_options.renamed_new_name; + assert(!new_name.empty()); + int fd; + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(new_name.c_str(), O_RDONLY); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError("While open renaming file", new_name, errno); + } else if (fsync(fd) < 0) { + s = IOError("While fsync renaming file", new_name, errno); + } + if (close(fd) < 0) { + s = IOError("While closing file after fsync", new_name, errno); + } + return s; + } + // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted + } +#ifdef HAVE_FULLFSYNC + // btrfs is a Linux file system, while currently F_FULLFSYNC is available on + // Mac OS. 
+ assert(!is_btrfs_); + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) == -1) { - return IOError("While fsync", "a directory", errno); + s = IOError("While fsync", "a directory", errno); } -#endif - return IOStatus::OK(); +#endif // HAVE_FULLFSYNC +#endif // OS_AIX + return s; } } // namespace ROCKSDB_NAMESPACE #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,11 +14,15 @@ #endif #include #include +#include +#include #include +#include "port/port.h" #include "rocksdb/env.h" -#include "util/thread_local.h" #include "rocksdb/file_system.h" #include "rocksdb/io_status.h" +#include "util/mutexlock.h" +#include "util/thread_local.h" // For non linux platform, the following macros are used only as place // holder. @@ -27,43 +31,96 @@ #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ #define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ -#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ +#define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */ #endif namespace ROCKSDB_NAMESPACE { -static std::string IOErrorMsg(const std::string& context, - const std::string& file_name) { - if (file_name.empty()) { - return context; - } - return context + ": " + file_name; -} - +std::string IOErrorMsg(const std::string& context, + const std::string& file_name); // file_name can be left empty if it is not unkown. 
-static IOStatus IOError(const std::string& context, - const std::string& file_name, int err_number) { - switch (err_number) { - case ENOSPC: { - IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), - strerror(err_number)); - s.SetRetryable(true); - return s; - } - case ESTALE: - return IOStatus::IOError(IOStatus::kStaleFile); - case ENOENT: - return IOStatus::PathNotFound(IOErrorMsg(context, file_name), - strerror(err_number)); - default: - return IOStatus::IOError(IOErrorMsg(context, file_name), - strerror(err_number)); - } -} +IOStatus IOError(const std::string& context, const std::string& file_name, + int err_number); class PosixHelper { public: static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size); + static size_t GetLogicalBlockSizeOfFd(int fd); + static Status GetLogicalBlockSizeOfDirectory(const std::string& directory, + size_t* size); +}; + +#ifdef OS_LINUX +// Files under a specific directory have the same logical block size. +// This class caches the logical block size for the specified directories to +// save the CPU cost of computing the size. +// Safe for concurrent access from multiple threads without any external +// synchronization. +class LogicalBlockSizeCache { + public: + LogicalBlockSizeCache( + std::function get_logical_block_size_of_fd = + PosixHelper::GetLogicalBlockSizeOfFd, + std::function + get_logical_block_size_of_directory = + PosixHelper::GetLogicalBlockSizeOfDirectory) + : get_logical_block_size_of_fd_(get_logical_block_size_of_fd), + get_logical_block_size_of_directory_( + get_logical_block_size_of_directory) {} + + // Takes the following actions: + // 1. Increases reference count of the directories; + // 2. If the directory's logical block size is not cached, + // compute the buffer size and cache the result. + Status RefAndCacheLogicalBlockSize( + const std::vector& directories); + + // Takes the following actions: + // 1. Decreases reference count of the directories; + // 2. 
If the reference count of a directory reaches 0, remove the directory + // from the cache. + void UnrefAndTryRemoveCachedLogicalBlockSize( + const std::vector& directories); + + // Returns the logical block size for the file. + // + // If the file is under a cached directory, return the cached size. + // Otherwise, the size is computed. + size_t GetLogicalBlockSize(const std::string& fname, int fd); + + int GetRefCount(const std::string& dir) { + ReadLock lock(&cache_mutex_); + auto it = cache_.find(dir); + if (it == cache_.end()) { + return 0; + } + return it->second.ref; + } + + size_t Size() const { return cache_.size(); } + + bool Contains(const std::string& dir) { + ReadLock lock(&cache_mutex_); + return cache_.find(dir) != cache_.end(); + } + + private: + struct CacheValue { + CacheValue() : size(0), ref(0) {} + + // Logical block size of the directory. + size_t size; + // Reference count of the directory. + int ref; + }; + + std::function get_logical_block_size_of_fd_; + std::function + get_logical_block_size_of_directory_; + + std::map cache_; + port::RWMutex cache_mutex_; }; +#endif class PosixSequentialFile : public FSSequentialFile { private: @@ -75,6 +132,7 @@ public: PosixSequentialFile(const std::string& fname, FILE* file, int fd, + size_t logical_block_size, const EnvOptions& options); virtual ~PosixSequentialFile(); @@ -123,6 +181,7 @@ public: PosixRandomAccessFile(const std::string& fname, int fd, + size_t logical_block_size, const EnvOptions& options #if defined(ROCKSDB_IOURING_PRESENT) , @@ -172,6 +231,7 @@ public: explicit PosixWritableFile(const std::string& fname, int fd, + size_t logical_block_size, const EnvOptions& options); virtual ~PosixWritableFile(); @@ -182,9 +242,20 @@ virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Append(const Slice& data, const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + const 
DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset, const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; @@ -271,6 +342,11 @@ virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Append(const Slice& data, const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; @@ -315,12 +391,17 @@ class PosixDirectory : public FSDirectory { public: - explicit PosixDirectory(int fd) : fd_(fd) {} + explicit PosixDirectory(int fd); ~PosixDirectory(); virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus FsyncWithDirOptions( + const IOOptions&, IODebugContext*, + const DirFsyncOptions& dir_fsync_options) override; + private: int fd_; + bool is_btrfs_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix_test.cc 
1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,140 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/testharness.h" + +#ifdef ROCKSDB_LIB_IO_POSIX +#include "env/io_posix.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef OS_LINUX +class LogicalBlockSizeCacheTest : public testing::Test {}; + +// Tests the caching behavior. +TEST_F(LogicalBlockSizeCacheTest, Cache) { + int ncall = 0; + auto get_fd_block_size = [&](int fd) { + ncall++; + return fd; + }; + std::map dir_fds{ + {"/", 0}, + {"/db", 1}, + {"/db1", 2}, + {"/db2", 3}, + }; + auto get_dir_block_size = [&](const std::string& dir, size_t* size) { + ncall++; + *size = dir_fds[dir]; + return Status::OK(); + }; + LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size); + ASSERT_EQ(0, ncall); + ASSERT_EQ(0, cache.Size()); + + ASSERT_EQ(6, cache.GetLogicalBlockSize("/sst", 6)); + ASSERT_EQ(1, ncall); + ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(2, ncall); + ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(3, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/", "/db1/", "/db2"})); + ASSERT_EQ(3, cache.Size()); + ASSERT_TRUE(cache.Contains("/")); + ASSERT_TRUE(cache.Contains("/db1")); + ASSERT_TRUE(cache.Contains("/db2")); + ASSERT_EQ(6, ncall); + // Block size for / is cached. + ASSERT_EQ(0, cache.GetLogicalBlockSize("/sst", 6)); + ASSERT_EQ(6, ncall); + // No cached size for /db. + ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(7, ncall); + ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(8, ncall); + // Block size for /db1 is cached. 
+ ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst1", 4)); + ASSERT_EQ(8, ncall); + ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst2", 5)); + ASSERT_EQ(8, ncall); + // Block size for /db2 is cached. + ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst1", 6)); + ASSERT_EQ(8, ncall); + ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst2", 7)); + ASSERT_EQ(8, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(4, cache.Size()); + ASSERT_TRUE(cache.Contains("/")); + ASSERT_TRUE(cache.Contains("/db1")); + ASSERT_TRUE(cache.Contains("/db2")); + ASSERT_TRUE(cache.Contains("/db")); + + ASSERT_EQ(9, ncall); + // Block size for /db is cached. + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(9, ncall); + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(9, ncall); +} + +// Tests the reference counting behavior. +TEST_F(LogicalBlockSizeCacheTest, Ref) { + int ncall = 0; + auto get_fd_block_size = [&](int fd) { + ncall++; + return fd; + }; + std::map dir_fds{ + {"/db", 0}, + }; + auto get_dir_block_size = [&](const std::string& dir, size_t* size) { + ncall++; + *size = dir_fds[dir]; + return Status::OK(); + }; + LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size); + + ASSERT_EQ(0, ncall); + + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1)); + ASSERT_EQ(1, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(2, ncall); + ASSERT_EQ(1, cache.GetRefCount("/db")); + // Block size for /db is cached. Ref count = 1. + ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst1", 1)); + ASSERT_EQ(2, ncall); + + // Ref count = 2, but won't recompute the cached buffer size. + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(2, cache.GetRefCount("/db")); + ASSERT_EQ(2, ncall); + + // Ref count = 1. + cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"}); + ASSERT_EQ(1, cache.GetRefCount("/db")); + // Block size for /db is still cached. 
+ ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst2", 1)); + ASSERT_EQ(2, ncall); + + // Ref count = 0 and cached buffer size for /db is removed. + cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"}); + ASSERT_EQ(0, cache.Size()); + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1)); + ASSERT_EQ(3, ncall); +} +#endif + +} // namespace ROCKSDB_NAMESPACE +#endif + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,28 +8,94 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "env/mock_env.h" + #include #include + +#include "env/emulated_clock.h" +#include "file/filename.h" #include "port/sys_time.h" +#include "rocksdb/file_system.h" +#include "rocksdb/utilities/options_type.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" -#include "util/murmurhash.h" +#include "util/hash.h" #include "util/random.h" #include "util/rate_limiter.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +namespace { +int64_t MaybeCurrentTime(const std::shared_ptr& clock) { + int64_t time = 1337346000; // arbitrary fallback default + clock->GetCurrentTime(&time).PermitUncheckedError(); + return time; +} + +static std::unordered_map time_elapse_type_info = { +#ifndef ROCKSDB_LITE + {"time_elapse_only_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast(addr); + clock->SetTimeElapseOnlySleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& 
/*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast(addr); + *value = clock->IsTimeElapseOnlySleep() ? "true" : "false"; + return Status::OK(); + }, + nullptr}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map mock_sleep_type_info = { +#ifndef ROCKSDB_LITE + {"mock_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast(addr); + clock->SetMockSleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast(addr); + *value = clock->IsMockSleepEnabled() ? "true" : "false"; + return Status::OK(); + }, + nullptr}}, +#endif // ROCKSDB_LITE +}; +} // namespace + +EmulatedSystemClock::EmulatedSystemClock( + const std::shared_ptr& base, bool time_elapse_only_sleep) + : SystemClockWrapper(base), + maybe_starting_time_(MaybeCurrentTime(base)), + time_elapse_only_sleep_(time_elapse_only_sleep), + no_slowdown_(time_elapse_only_sleep) { + RegisterOptions("", this, &time_elapse_type_info); + RegisterOptions("", this, &mock_sleep_type_info); +} class MemFile { public: - explicit MemFile(Env* env, const std::string& fn, bool _is_lock_file = false) - : env_(env), + explicit MemFile(SystemClock* clock, const std::string& fn, + bool _is_lock_file = false) + : clock_(clock), fn_(fn), refs_(0), is_lock_file_(_is_lock_file), locked_(false), size_(0), modified_time_(Now()), - rnd_(static_cast( - MurmurHash(fn.data(), static_cast(fn.size()), 0))), + rnd_(Lower32of64(GetSliceNPHash64(fn))), fsynced_bytes_(0) {} // No copying allowed. 
MemFile(const MemFile&) = delete; @@ -77,7 +143,8 @@ uint64_t Size() const { return size_; } - void Truncate(size_t size) { + void Truncate(size_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); if (size < size_) { data_.resize(size); @@ -99,7 +166,17 @@ } } - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, IODebugContext* /*dbg*/) const { + { + IOStatus s; + TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s); + if (!s.ok()) { + // with sync point only + *result = Slice(); + return s; + } + } MutexLock lock(&mutex_); const uint64_t available = Size() - std::min(Size(), offset); size_t offset_ = static_cast(offset); @@ -108,7 +185,7 @@ } if (n == 0) { *result = Slice(); - return Status::OK(); + return IOStatus::OK(); } if (scratch) { memcpy(scratch, &(data_[offset_]), n); @@ -116,10 +193,11 @@ } else { *result = Slice(&(data_[offset_]), n); } - return Status::OK(); + return IOStatus::OK(); } - Status Write(uint64_t offset, const Slice& data) { + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); size_t offset_ = static_cast(offset); if (offset + data.size() > data_.size()) { @@ -128,20 +206,21 @@ data_.replace(offset_, data.size(), data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Append(const Slice& data) { + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); data_.append(data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Fsync() { + IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) { fsynced_bytes_ = size_.load(); - return Status::OK(); + return 
IOStatus::OK(); } uint64_t ModifiedTime() const { return modified_time_; } @@ -149,7 +228,7 @@ private: uint64_t Now() { int64_t unix_time = 0; - auto s = env_->GetCurrentTime(&unix_time); + auto s = clock_->GetCurrentTime(&unix_time); assert(s.ok()); return static_cast(unix_time); } @@ -157,7 +236,7 @@ // Private since only Unref() should be used to delete it. ~MemFile() { assert(refs_ == 0); } - Env* env_; + SystemClock* clock_; const std::string fn_; mutable port::Mutex mutex_; int refs_; @@ -176,111 +255,176 @@ namespace { -class MockSequentialFile : public SequentialFile { +class MockSequentialFile : public FSSequentialFile { public: - explicit MockSequentialFile(MemFile* file) : file_(file), pos_(0) { + explicit MockSequentialFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads), + pos_(0) { file_->Ref(); } ~MockSequentialFile() override { file_->Unref(); } - Status Read(size_t n, Slice* result, char* scratch) override { - Status s = file_->Read(pos_, n, result, scratch); + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + IOStatus s = file_->Read(pos_, n, options, result, + (use_mmap_read_) ? 
nullptr : scratch, dbg); if (s.ok()) { pos_ += result->size(); } return s; } - Status Skip(uint64_t n) override { + bool use_direct_io() const override { return use_direct_io_; } + IOStatus Skip(uint64_t n) override { if (pos_ > file_->Size()) { - return Status::IOError("pos_ > file_->Size()"); + return IOStatus::IOError("pos_ > file_->Size()"); } const uint64_t available = file_->Size() - pos_; if (n > available) { n = available; } pos_ += static_cast(n); - return Status::OK(); + return IOStatus::OK(); } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; size_t pos_; }; -class MockRandomAccessFile : public RandomAccessFile { +class MockRandomAccessFile : public FSRandomAccessFile { public: - explicit MockRandomAccessFile(MemFile* file) : file_(file) { file_->Ref(); } + explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads) { + file_->Ref(); + } ~MockRandomAccessFile() override { file_->Unref(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + bool use_direct_io() const override { return use_direct_io_; } + + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + if (use_mmap_read_) { + return file_->Read(offset, n, options, result, nullptr, dbg); + } else { + return file_->Read(offset, n, options, result, scratch, dbg); + } } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; }; -class MockRandomRWFile : public RandomRWFile { +class MockRandomRWFile : public FSRandomRWFile { public: explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); } ~MockRandomRWFile() override { 
file_->Unref(); } - Status Write(uint64_t offset, const Slice& data) override { - return file_->Write(offset, data); + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return file_->Write(offset, data, options, dbg); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return file_->Read(offset, n, options, result, scratch, dbg); } - Status Close() override { return file_->Fsync(); } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } private: MemFile* file_; }; -class MockWritableFile : public WritableFile { +class MockWritableFile : public FSWritableFile { public: - MockWritableFile(MemFile* file, RateLimiter* rate_limiter) - : file_(file), rate_limiter_(rate_limiter) { + MockWritableFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_writes), + rate_limiter_(opts.rate_limiter) { file_->Ref(); } ~MockWritableFile() override { file_->Unref(); } - Status Append(const Slice& data) override { + bool use_direct_io() const override { return false && use_direct_io_; } + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { size_t bytes_written = 0; while (bytes_written < data.size()) { auto bytes = RequestToken(data.size() - bytes_written); - Status s = 
file_->Append(Slice(data.data() + bytes_written, bytes)); + IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes), + options, dbg); if (!s.ok()) { return s; } bytes_written += bytes; } - return Status::OK(); + return IOStatus::OK(); + } + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t /*offset*/, + const IOOptions& options, + IODebugContext* dbg) override { + assert(use_direct_io_); + return Append(data, options, dbg); } - Status Truncate(uint64_t size) override { - file_->Truncate(static_cast(size)); - return Status::OK(); + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + file_->Truncate(static_cast(size), options, dbg); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); } - Status Close() override { return file_->Fsync(); } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - uint64_t GetFileSize() override { return file_->Size(); } + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return file_->Size(); + } private: inline size_t RequestToken(size_t bytes) { @@ -293,12 +437,16 @@ } MemFile* file_; + bool use_direct_io_; RateLimiter* rate_limiter_; }; -class MockEnvDirectory : public Directory { +class MockEnvDirectory : public FSDirectory { public: - Status Fsync() override { return Status::OK(); } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } }; class MockEnvFileLock : public FileLock { @@ -313,21 +461,26 @@ class TestMemLogger : public Logger { private: - std::unique_ptr 
file_; + std::unique_ptr file_; std::atomic_size_t log_size_; static const uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; + SystemClock* clock_; + IOOptions options_; + IODebugContext* dbg_; std::atomic flush_pending_; public: - TestMemLogger(std::unique_ptr f, Env* env, + TestMemLogger(std::unique_ptr f, SystemClock* clock, + const IOOptions& options, IODebugContext* dbg, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), file_(std::move(f)), log_size_(0), last_flush_micros_(0), - env_(env), + clock_(clock), + options_(options), + dbg_(dbg), flush_pending_(false) {} ~TestMemLogger() override {} @@ -335,7 +488,7 @@ if (flush_pending_) { flush_pending_ = false; } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } using Logger::Logv; @@ -393,9 +546,11 @@ assert(p <= limit); const size_t write_size = p - base; - file_->Append(Slice(base, write_size)); - flush_pending_ = true; - log_size_ += write_size; + Status s = file_->Append(Slice(base, write_size), options_, dbg_); + if (s.ok()) { + flush_pending_ = true; + log_size_ += write_size; + } uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { @@ -411,151 +566,235 @@ size_t GetLogFileSize() const override { return log_size_; } }; -} // Anonymous namespace +static std::unordered_map mock_fs_type_info = { +#ifndef ROCKSDB_LITE + {"supports_direct_io", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +} // namespace -MockEnv::MockEnv(Env* base_env) : EnvWrapper(base_env), fake_sleep_micros_(0) {} +MockFileSystem::MockFileSystem(const std::shared_ptr& clock, + bool supports_direct_io) + : system_clock_(clock), supports_direct_io_(supports_direct_io) { + clock_ = system_clock_.get(); + RegisterOptions("", &supports_direct_io_, &mock_fs_type_info); +} 
-MockEnv::~MockEnv() { - for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i) { +MockFileSystem::~MockFileSystem() { + for (auto i = file_map_.begin(); i != file_map_.end(); ++i) { i->second->Unref(); } } -// Partial implementation of the Env interface. -Status MockEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +Status MockFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystem::PrepareOptions(options); + if (s.ok() && system_clock_ == SystemClock::Default()) { + system_clock_ = options.env->GetSystemClock(); + clock_ = system_clock_.get(); + } + return s; +} + +IOStatus MockFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) { + *output_path = NormalizeMockPath(db_path); + if (output_path->at(0) != '/') { + return IOStatus::NotSupported("GetAbsolutePath"); + } else { + return IOStatus::OK(); + } +} + +std::string MockFileSystem::NormalizeMockPath(const std::string& path) { + std::string p = NormalizePath(path); + if (p.back() == kFilePathSeparator && p.size() > 1) { + p.pop_back(); + } + return p; +} + +// Partial implementation of the FileSystem interface. 
+IOStatus MockFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockSequentialFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockSequentialFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockRandomAccessFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockRandomAccessFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = 
NormalizePath(fname); +IOStatus MockFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& /*file_opts*/, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); } result->reset(new MockRandomRWFile(f)); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) { - auto s = RenameFile(old_fname, fname); +IOStatus MockFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto s = RenameFile(old_fname, fname, IOOptions(), dbg); if (!s.ok()) { return s; + } else { + result->reset(); + return NewWritableFile(fname, options, result, dbg); } - result->reset(); - return NewWritableFile(fname, result, options); } -Status MockEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { DeleteFileInternal(fn); } - MemFile* file = new MemFile(this, fn, false); + MemFile* file = new MemFile(clock_, fn, false); file->Ref(); file_map_[fn] = file; + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + 
result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } +} - result->reset(new MockWritableFile(file, env_options.rate_limiter)); - return Status::OK(); +IOStatus MockFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + MemFile* file = nullptr; + if (file_map_.find(fn) == file_map_.end()) { + file = new MemFile(clock_, fn, false); + // Only take a reference when we create the file objectt + file->Ref(); + file_map_[fn] = file; + } else { + file = file_map_[fn]; + } + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } } -Status MockEnv::NewDirectory(const std::string& /*name*/, - std::unique_ptr* result) { +IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/, + const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { result->reset(new MockEnvDirectory()); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::FileExists(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::FileExists(const std::string& fname, + const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { // File exists - return Status::OK(); + return IOStatus::OK(); } // Now also check if fn exists as a dir for (const auto& iter : file_map_) { const std::string& filename = iter.first; if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' && Slice(filename).starts_with(Slice(fn))) { - return Status::OK(); + return IOStatus::OK(); } } - return Status::NotFound(); + return IOStatus::NotFound(); } -Status MockEnv::GetChildren(const std::string& dir, - 
std::vector* result) { - auto d = NormalizePath(dir); +bool MockFileSystem::GetChildrenInternal(const std::string& dir, + std::vector* result) { + auto d = NormalizeMockPath(dir); bool found_dir = false; - { - MutexLock lock(&mutex_); - result->clear(); - for (const auto& iter : file_map_) { - const std::string& filename = iter.first; - - if (filename == d) { - found_dir = true; - } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && - Slice(filename).starts_with(Slice(d))) { - found_dir = true; - size_t next_slash = filename.find('/', d.size() + 1); - if (next_slash != std::string::npos) { - result->push_back( - filename.substr(d.size() + 1, next_slash - d.size() - 1)); - } else { - result->push_back(filename.substr(d.size() + 1)); - } + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename == d) { + found_dir = true; + } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + found_dir = true; + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back( + filename.substr(d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); } } } result->erase(std::unique(result->begin(), result->end()), result->end()); - return found_dir ? Status::OK() : Status::NotFound(); + return found_dir; } -void MockEnv::DeleteFileInternal(const std::string& fname) { - assert(fname == NormalizePath(fname)); +IOStatus MockFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + bool found_dir = GetChildrenInternal(dir, result); + return found_dir ? 
IOStatus::OK() : IOStatus::NotFound(dir); +} + +void MockFileSystem::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizeMockPath(fname)); const auto& pair = file_map_.find(fname); if (pair != file_map_.end()) { pair->second->Unref(); @@ -563,180 +802,222 @@ } } -Status MockEnv::DeleteFile(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } DeleteFileInternal(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::Truncate(const std::string& fname, size_t size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& options, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } - iter->second->Truncate(size); - return Status::OK(); + iter->second->Truncate(size, options, dbg); + return IOStatus::OK(); } -Status MockEnv::CreateDir(const std::string& dirname) { - auto dn = NormalizePath(dirname); +IOStatus MockFileSystem::CreateDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dn = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); if (file_map_.find(dn) == file_map_.end()) { - MemFile* file = new MemFile(this, dn, false); + MemFile* file = new MemFile(clock_, dn, false); file->Ref(); file_map_[dn] = file; } else { - return Status::IOError(); + return IOStatus::IOError(); } - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::CreateDirIfMissing(const std::string& dirname) { - 
CreateDir(dirname); - return Status::OK(); +IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + CreateDir(dirname, options, dbg).PermitUncheckedError(); + return IOStatus::OK(); } -Status MockEnv::DeleteDir(const std::string& dirname) { - return DeleteFile(dirname); +IOStatus MockFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dir = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dir) == file_map_.end()) { + return IOStatus::PathNotFound(dir); + } else { + std::vector children; + if (GetChildrenInternal(dir, &children)) { + for (const auto& child : children) { + DeleteFileInternal(child); + } + } + DeleteFileInternal(dir); + return IOStatus::OK(); + } } -Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_size, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *file_size = iter->second->Size(); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::GetFileModificationTime(const std::string& fname, - uint64_t* time) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* time, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *time = iter->second->ModifiedTime(); - return Status::OK(); + return IOStatus::OK(); } -Status 
MockEnv::RenameFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); - MutexLock lock(&mutex_); - if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); +bool MockFileSystem::RenameFileInternal(const std::string& src, + const std::string& dest) { + if (file_map_.find(src) == file_map_.end()) { + return false; + } else { + std::vector children; + if (GetChildrenInternal(src, &children)) { + for (const auto& child : children) { + RenameFileInternal(src + "/" + child, dest + "/" + child); + } + } + DeleteFileInternal(dest); + file_map_[dest] = file_map_[src]; + file_map_.erase(src); + return true; } +} - DeleteFileInternal(t); - file_map_[t] = file_map_[s]; - file_map_.erase(s); - return Status::OK(); +IOStatus MockFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); + MutexLock lock(&mutex_); + bool found = RenameFileInternal(s, t); + if (!found) { + return IOStatus::PathNotFound(s); + } else { + return IOStatus::OK(); + } } -Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); +IOStatus MockFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); MutexLock lock(&mutex_); if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); + return IOStatus::PathNotFound(s); } DeleteFileInternal(t); file_map_[t] = file_map_[s]; file_map_[t]->Ref(); // Otherwise it might get deleted when noone uses s - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - auto fn = NormalizePath(fname); 
+IOStatus MockFileSystem::NewLogger(const std::string& fname, + const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); MemFile* file = nullptr; if (iter == file_map_.end()) { - file = new MemFile(this, fn, false); + file = new MemFile(clock_, fn, false); file->Ref(); file_map_[fn] = file; } else { file = iter->second; } - std::unique_ptr f(new MockWritableFile(file, nullptr)); - result->reset(new TestMemLogger(std::move(f), this)); - return Status::OK(); + std::unique_ptr f(new MockWritableFile(file, FileOptions())); + result->reset(new TestMemLogger(std::move(f), clock_, io_opts, dbg)); + return IOStatus::OK(); } -Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::LockFile(const std::string& fname, + const IOOptions& /*options*/, + FileLock** flock, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fname, "Not a lock file."); + return IOStatus::InvalidArgument(fname, "Not a lock file."); } if (!file_map_[fn]->Lock()) { - return Status::IOError(fn, "Lock is already held."); + return IOStatus::IOError(fn, "lock is already held."); } } else { - auto* file = new MemFile(this, fn, true); + auto* file = new MemFile(clock_, fn, true); file->Ref(); file->Lock(); file_map_[fn] = file; } } *flock = new MockEnvFileLock(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::UnlockFile(FileLock* flock) { - std::string fn = - static_cast_with_check(flock)->FileName(); +IOStatus MockFileSystem::UnlockFile(FileLock* flock, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + std::string fn = static_cast_with_check(flock)->FileName(); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { 
if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fn, "Not a lock file."); + return IOStatus::InvalidArgument(fn, "Not a lock file."); } file_map_[fn]->Unlock(); } } delete flock; - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::GetTestDirectory(std::string* path) { +IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/, + std::string* path, + IODebugContext* /*dbg*/) { *path = "/test"; - return Status::OK(); -} - -Status MockEnv::GetCurrentTime(int64_t* unix_time) { - auto s = EnvWrapper::GetCurrentTime(unix_time); - if (s.ok()) { - *unix_time += fake_sleep_micros_.load() / (1000 * 1000); - } - return s; -} - -uint64_t MockEnv::NowMicros() { - return EnvWrapper::NowMicros() + fake_sleep_micros_.load(); + return IOStatus::OK(); } -uint64_t MockEnv::NowNanos() { - return EnvWrapper::NowNanos() + fake_sleep_micros_.load() * 1000; -} - -Status MockEnv::CorruptBuffer(const std::string& fname) { - auto fn = NormalizePath(fname); +Status MockFileSystem::CorruptBuffer(const std::string& fname) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { @@ -746,24 +1027,29 @@ return Status::OK(); } -std::string MockEnv::NormalizePath(const std::string path) { - std::string dst; - for (auto c : path) { - if (!dst.empty() && c == '/' && dst.back() == '/') { - continue; - } - dst.push_back(c); - } - return dst; +MockEnv::MockEnv(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& clock) + : CompositeEnvWrapper(env, fs, clock) {} + +MockEnv* MockEnv::Create(Env* env) { + auto clock = + std::make_shared(env->GetSystemClock(), true); + return MockEnv::Create(env, clock); +} + +MockEnv* MockEnv::Create(Env* env, const std::shared_ptr& clock) { + auto fs = std::make_shared(clock); + return new MockEnv(env, fs, clock); } -void MockEnv::FakeSleepForMicroseconds(int64_t micros) { - fake_sleep_micros_.fetch_add(micros); +Status 
MockEnv::CorruptBuffer(const std::string& fname) { + auto mock = static_cast_with_check(GetFileSystem().get()); + return mock->CorruptBuffer(fname); } #ifndef ROCKSDB_LITE // This is to maintain the behavior before swithcing from InMemoryEnv to MockEnv -Env* NewMemEnv(Env* base_env) { return new MockEnv(base_env); } +Env* NewMemEnv(Env* base_env) { return MockEnv::Create(base_env); } #else // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,103 +12,132 @@ #include #include #include + +#include "env/composite_env_wrapper.h" +#include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/status.h" -#include "port/port.h" -#include "util/mutexlock.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { - class MemFile; -class MockEnv : public EnvWrapper { +class MockFileSystem : public FileSystem { public: - explicit MockEnv(Env* base_env); - - virtual ~MockEnv(); - - // Partial implementation of the Env interface. 
- virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override; + explicit MockFileSystem(const std::shared_ptr& clock, + bool supports_direct_io = true); + ~MockFileSystem() override; + + static const char* kClassName() { return "MemoryFileSystem"; } + const char* Name() const override { return kClassName(); } + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override; + IOStatus GetChildren(const std::string& dir, const 
IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. 
+ IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override; + IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& /*options*/, bool* /*is_dir*/, + IODebugContext* /*dgb*/) override { + return IOStatus::NotSupported("IsDirectory"); + } - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) override; - - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - virtual Status FileExists(const std::string& fname) override; - - virtual Status GetChildren(const std::string& dir, - std::vector* result) override; + Status CorruptBuffer(const std::string& fname); + Status PrepareOptions(const ConfigOptions& options) override; + private: + bool RenameFileInternal(const std::string& src, const std::string& dest); void DeleteFileInternal(const std::string& fname); + bool GetChildrenInternal(const std::string& fname, + std::vector* results); - virtual Status DeleteFile(const std::string& fname) override; - - virtual Status Truncate(const std::string& fname, size_t size) override; - - virtual Status CreateDir(const std::string& dirname) override; - - virtual Status CreateDirIfMissing(const std::string& dirname) override; - - virtual Status DeleteDir(const std::string& dirname) override; + std::string NormalizeMockPath(const std::string& path); - virtual Status GetFileSize(const std::string& fname, - uint64_t* file_size) override; - - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* time) override; - - virtual Status RenameFile(const std::string& src, - const std::string& target) override; - - virtual Status LinkFile(const std::string& src, - const std::string& target) override; - - virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; - - virtual Status LockFile(const std::string& fname, FileLock** flock) 
override; - - virtual Status UnlockFile(FileLock* flock) override; + private: + // Map from filenames to MemFile objects, representing a simple file system. + port::Mutex mutex_; + std::map file_map_; // Protected by mutex_. + std::shared_ptr system_clock_; + SystemClock* clock_; + bool supports_direct_io_; +}; - virtual Status GetTestDirectory(std::string* path) override; +class MockEnv : public CompositeEnvWrapper { + public: + static MockEnv* Create(Env* base); + static MockEnv* Create(Env* base, const std::shared_ptr& clock); - // Results of these can be affected by FakeSleepForMicroseconds() - virtual Status GetCurrentTime(int64_t* unix_time) override; - virtual uint64_t NowMicros() override; - virtual uint64_t NowNanos() override; + static const char* kClassName() { return "MockEnv"; } + const char* Name() const override { return kClassName(); } Status CorruptBuffer(const std::string& fname); - - // Doesn't really sleep, just affects output of GetCurrentTime(), NowMicros() - // and NowNanos() - void FakeSleepForMicroseconds(int64_t micros); - private: - std::string NormalizePath(const std::string path); - - // Map from filenames to MemFile objects, representing a simple file system. - typedef std::map FileSystem; - port::Mutex mutex_; - FileSystem file_map_; // Protected by mutex_. 
- - std::atomic fake_sleep_micros_; + MockEnv(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& clock); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,9 +19,7 @@ MockEnv* env_; const EnvOptions soptions_; - MockEnvTest() - : env_(new MockEnv(Env::Default())) { - } + MockEnvTest() : env_(MockEnv::Create(Env::Default())) {} ~MockEnvTest() override { delete env_; } }; @@ -68,7 +66,7 @@ int64_t now = 0; auto s = env_->GetCurrentTime(&now); ASSERT_OK(s); - env_->FakeSleepForMicroseconds(3 * 1000 * 1000); + env_->SleepForMicroseconds(3 * 1000 * 1000); int64_t after_sleep = 0; s = env_->GetCurrentTime(&after_sleep); ASSERT_OK(s); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,164 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "env/unique_id_gen.h" + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/version.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +struct GenerateRawUniqueIdOpts { + Env* env = Env::Default(); + bool exclude_port_uuid = false; + bool exclude_env_details = false; + bool exclude_random_device = false; +}; + +// Each of these "tracks" below should be sufficient for generating 128 bits +// of entropy, after hashing the raw bytes. The tracks are separable for +// testing purposes, but in production we combine as many tracks as possible +// to ensure quality results even if some environments have degraded +// capabilities or quality in some APIs. +// +// This approach has not been validated for use in cryptography. The goal is +// generating globally unique values with high probability without coordination +// between instances. +// +// Linux performance: EntropyTrackRandomDevice is much faster than +// EntropyTrackEnvDetails, which is much faster than EntropyTrackPortUuid. 
+ +struct EntropyTrackPortUuid { + std::array uuid; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_port_uuid) { + return; + } + std::string s; + port::GenerateRfcUuid(&s); + if (s.size() >= uuid.size()) { + std::copy_n(s.begin(), uuid.size(), uuid.begin()); + } + } +}; + +struct EntropyTrackEnvDetails { + std::array hostname_buf; + int64_t process_id; + uint64_t thread_id; + int64_t unix_time; + uint64_t nano_time; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_env_details) { + return; + } + opts.env->GetHostName(hostname_buf.data(), hostname_buf.size()) + .PermitUncheckedError(); + process_id = port::GetProcessID(); + thread_id = opts.env->GetThreadID(); + opts.env->GetCurrentTime(&unix_time).PermitUncheckedError(); + nano_time = opts.env->NowNanos(); + } +}; + +struct EntropyTrackRandomDevice { + using RandType = std::random_device::result_type; + static constexpr size_t kNumRandVals = + /* generous bits */ 192U / (8U * sizeof(RandType)); + std::array rand_vals; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_random_device) { + return; + } + std::random_device r; + for (auto& val : rand_vals) { + val = r(); + } + } +}; + +struct Entropy { + uint64_t version_identifier; + EntropyTrackRandomDevice et1; + EntropyTrackEnvDetails et2; + EntropyTrackPortUuid et3; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + // If we change the format of what goes into the entropy inputs, it's + // conceivable there could be a physical collision in the hash input + // even though they are logically different. This value should change + // if there's a change to the "schema" here, including byte order. 
+ version_identifier = (uint64_t{ROCKSDB_MAJOR} << 32) + + (uint64_t{ROCKSDB_MINOR} << 16) + + uint64_t{ROCKSDB_PATCH}; + et1.Populate(opts); + et2.Populate(opts); + et3.Populate(opts); + } +}; + +void GenerateRawUniqueIdImpl(uint64_t* a, uint64_t* b, + const GenerateRawUniqueIdOpts& opts) { + Entropy e; + std::memset(&e, 0, sizeof(e)); + e.Populate(opts); + Hash2x64(reinterpret_cast(&e), sizeof(e), a, b); +} + +} // namespace + +void GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid) { + GenerateRawUniqueIdOpts opts; + opts.exclude_port_uuid = exclude_port_uuid; + assert(!opts.exclude_env_details); + assert(!opts.exclude_random_device); + GenerateRawUniqueIdImpl(a, b, opts); +} + +#ifndef NDEBUG +void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, + bool exclude_env_details, + bool exclude_random_device) { + GenerateRawUniqueIdOpts opts; + opts.exclude_port_uuid = exclude_port_uuid; + opts.exclude_env_details = exclude_env_details; + opts.exclude_random_device = exclude_random_device; + GenerateRawUniqueIdImpl(a, b, opts); +} +#endif + +void SemiStructuredUniqueIdGen::Reset() { + saved_process_id_ = port::GetProcessID(); + GenerateRawUniqueId(&base_upper_, &base_lower_); + counter_ = 0; +} + +void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) { + if (port::GetProcessID() == saved_process_id_) { + // Safe to increment the atomic for guaranteed uniqueness within this + // process lifetime. Xor slightly better than +. See + // https://github.com/pdillinger/unique_id + *lower = base_lower_ ^ counter_.fetch_add(1); + *upper = base_upper_; + } else { + // There must have been a fork() or something. Rather than attempting to + // update in a thread-safe way, simply fall back on GenerateRawUniqueId. 
+ GenerateRawUniqueId(upper, lower); + } +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,71 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// This file is for functions that generate unique identifiers by +// (at least in part) by extracting novel entropy or sources of uniqueness +// from the execution environment. (By contrast, random.h is for algorithmic +// pseudorandomness.) +// +// These functions could eventually migrate to public APIs, such as in Env. + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Generates a new 128-bit identifier that is universally unique +// (with high probability) for each call. The result is split into +// two 64-bit pieces. This function has NOT been validated for use in +// cryptography. +// +// This is used in generating DB session IDs and by Env::GenerateUniqueId +// (used for DB IDENTITY) if the platform does not provide a generator of +// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this +// function is used as a fallback for GenerateRfcUuid, because no need +// trying it again.) 
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b, + bool exclude_port_uuid = false); + +#ifndef NDEBUG +// A version of above with options for challenge testing +void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, + bool exclude_env_details, + bool exclude_random_device); +#endif + +// Generates globally unique ids with lower probability of any collisions +// vs. each unique id being independently random (GenerateRawUniqueId). +// We call this "semi-structured" because between different +// SemiStructuredUniqueIdGen objects, the IDs are separated by random +// intervals (unstructured), but within a single SemiStructuredUniqueIdGen +// object, the generated IDs are trivially related (structured). See +// https://github.com/pdillinger/unique_id for how this improves probability +// of no collision. In short, if we have n SemiStructuredUniqueIdGen +// objects each generating m IDs, the first collision is expected at +// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64, +// rather than n * m = 2^64 for fully random IDs. +class SemiStructuredUniqueIdGen { + public: + // Initializes with random starting state (from GenerateRawUniqueId) + SemiStructuredUniqueIdGen() { Reset(); } + // Re-initializes, but not thread safe + void Reset(); + + // Assuming no fork(), `lower` is guaranteed unique from one call + // to the next (thread safe). 
+ void GenerateNext(uint64_t* upper, uint64_t* lower); + + private: + uint64_t base_upper_; + uint64_t base_lower_; + std::atomic counter_; + int64_t saved_process_id_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/examples/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,45 @@ +add_executable(simple_example + simple_example.cc) +target_link_libraries(simple_example + ${ROCKSDB_LIB}) + +add_executable(column_families_example + column_families_example.cc) +target_link_libraries(column_families_example + ${ROCKSDB_LIB}) + +add_executable(compact_files_example + compact_files_example.cc) +target_link_libraries(compact_files_example + ${ROCKSDB_LIB}) + +add_executable(c_simple_example + c_simple_example.c) +target_link_libraries(c_simple_example + ${ROCKSDB_LIB}) + +add_executable(optimistic_transaction_example + optimistic_transaction_example.cc) +target_link_libraries(optimistic_transaction_example + ${ROCKSDB_LIB}) + +add_executable(transaction_example + transaction_example.cc) +target_link_libraries(transaction_example + ${ROCKSDB_LIB}) + +add_executable(compaction_filter_example + compaction_filter_example.cc) +target_link_libraries(compaction_filter_example + ${ROCKSDB_LIB}) + +add_executable(options_file_example + options_file_example.cc) +target_link_libraries(options_file_example + ${ROCKSDB_LIB}) + +add_executable(multi_processes_example + EXCLUDE_FROM_ALL + multi_processes_example.cc) +target_link_libraries(multi_processes_example + ${ROCKSDB_LIB}) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/examples/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/Makefile 2025-01-30 11:01:26.000000000 
+0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,8 @@ CXXFLAGS += -fno-rtti endif +CFLAGS += -Wstrict-prototypes + .PHONY: clean librocksdb all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/c_simple_example.c mariadb-10.11.13/storage/rocksdb/rocksdb/examples/c_simple_example.c --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/c_simple_example.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/c_simple_example.c 2025-05-19 16:14:27.000000000 +0000 @@ -10,18 +10,35 @@ #include "rocksdb/c.h" +#if defined(OS_WIN) +#include +#else #include // sysconf() - get CPU count +#endif -const char DBPath[] = "/tmp/rocksdb_simple_example"; -const char DBBackupPath[] = "/tmp/rocksdb_simple_example_backup"; +#if defined(OS_WIN) +const char DBPath[] = "C:\\Windows\\TEMP\\rocksdb_c_simple_example"; +const char DBBackupPath[] = + "C:\\Windows\\TEMP\\rocksdb_c_simple_example_backup"; +#else +const char DBPath[] = "/tmp/rocksdb_c_simple_example"; +const char DBBackupPath[] = "/tmp/rocksdb_c_simple_example_backup"; +#endif int main(int argc, char **argv) { rocksdb_t *db; rocksdb_backup_engine_t *be; rocksdb_options_t *options = rocksdb_options_create(); // Optimize RocksDB. This is the easiest way to - // get RocksDB to perform well - long cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores + // get RocksDB to perform well. 
+#if defined(OS_WIN) + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + long cpus = system_info.dwNumberOfProcessors; +#else + long cpus = sysconf(_SC_NPROCESSORS_ONLN); +#endif + // Set # of online cores rocksdb_options_increase_parallelism(options, (int)(cpus)); rocksdb_options_optimize_level_style_compaction(options, 0); // create the DB if it's not already present diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/column_families_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/column_families_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/column_families_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/column_families_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,23 @@ #include "rocksdb/slice.h" #include "rocksdb/options.h" -using namespace ROCKSDB_NAMESPACE; - +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_column_families_example"; +#else std::string kDBPath = "/tmp/rocksdb_column_families_example"; +#endif + +using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; +using ROCKSDB_NAMESPACE::ColumnFamilyHandle; +using ROCKSDB_NAMESPACE::ColumnFamilyOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DBOptions; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteOptions; int main() { // open DB @@ -28,14 +42,15 @@ assert(s.ok()); // close DB - delete cf; + s = db->DestroyColumnFamilyHandle(cf); + assert(s.ok()); delete db; // open DB with two column families std::vector column_families; // have to open default column family column_families.push_back(ColumnFamilyDescriptor( - kDefaultColumnFamilyName, ColumnFamilyOptions())); + ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions())); // open the new one, too 
column_families.push_back(ColumnFamilyDescriptor( "new_cf", ColumnFamilyOptions())); @@ -64,7 +79,8 @@ // close db for (auto handle : handles) { - delete handle; + s = db->DestroyColumnFamilyHandle(handle); + assert(s.ok()); } delete db; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compact_files_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compact_files_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compact_files_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compact_files_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,8 +12,22 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" -using namespace ROCKSDB_NAMESPACE; +using ROCKSDB_NAMESPACE::ColumnFamilyMetaData; +using ROCKSDB_NAMESPACE::CompactionOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::EventListener; +using ROCKSDB_NAMESPACE::FlushJobInfo; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteOptions; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_compact_files_example"; +#else std::string kDBPath = "/tmp/rocksdb_compact_files_example"; +#endif + struct CompactionTask; // This is an example interface of external-compaction algorithm. @@ -136,7 +150,7 @@ Options options; options.create_if_missing = true; // Disable RocksDB background compaction. - options.compaction_style = kCompactionStyleNone; + options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleNone; // Small slowdown and stop trigger for experimental purpose. 
options.level0_slowdown_writes_trigger = 3; options.level0_stop_writes_trigger = 5; @@ -144,7 +158,7 @@ options.listeners.emplace_back(new FullCompactor(options)); DB* db = nullptr; - DestroyDB(kDBPath, options); + ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options); Status s = DB::Open(options, kDBPath, &db); assert(s.ok()); assert(db); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include -#include -#include -#include +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" class MyMerge : public ROCKSDB_NAMESPACE::MergeOperator { public: @@ -54,22 +54,30 @@ mutable int merge_count_ = 0; }; +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksmergetest"; +std::string kRemoveDirCommand = "rmdir /Q /S "; +#else +std::string kDBPath = "/tmp/rocksmergetest"; +std::string kRemoveDirCommand = "rm -rf "; +#endif + int main() { ROCKSDB_NAMESPACE::DB* raw_db; ROCKSDB_NAMESPACE::Status status; MyFilter filter; - int ret = system("rm -rf /tmp/rocksmergetest"); + std::string rm_cmd = kRemoveDirCommand + kDBPath; + int ret = system(rm_cmd.c_str()); if (ret != 0) { - fprintf(stderr, "Error deleting /tmp/rocksmergetest, code: %d\n", ret); - return ret; + fprintf(stderr, "Error deleting %s, code: %d\n", kDBPath.c_str(), ret); } ROCKSDB_NAMESPACE::Options options; options.create_if_missing = true; options.merge_operator.reset(new MyMerge); options.compaction_filter = &filter; - status = 
ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/rocksmergetest", &raw_db); + status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &raw_db); assert(status.ok()); std::unique_ptr db(raw_db); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/multi_processes_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/multi_processes_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/multi_processes_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/multi_processes_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -23,6 +23,8 @@ #include #include +// TODO: port this example to other systems. It should be straightforward for +// POSIX-compliant systems. #if defined(OS_LINUX) #include #include @@ -30,7 +32,6 @@ #include #include #include -#endif // !OS_LINUX #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -136,9 +137,6 @@ static bool ShouldCloseDB() { return true; } -// TODO: port this example to other systems. It should be straightforward for -// POSIX-compliant systems. 
-#if defined(OS_LINUX) void CreateDB() { long my_pid = static_cast(getpid()); Options options; @@ -301,7 +299,7 @@ std::string value; db->Get(ropts, key, &value); } - fprintf(stdout, "[process %ld] Point lookup thread finished\n"); + fprintf(stdout, "[process %ld] Point lookup thread finished\n", my_pid); }); uint64_t curr_key = 0; @@ -389,7 +387,7 @@ } #else // OS_LINUX int main() { - fpritnf(stderr, "Not implemented.\n"); + fprintf(stderr, "Not implemented.\n"); return 0; } #endif // !OS_LINUX diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,9 +11,21 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" -using namespace ROCKSDB_NAMESPACE; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::OptimisticTransactionDB; +using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Snapshot; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::Transaction; +using ROCKSDB_NAMESPACE::WriteOptions; +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_transaction_example"; +#else std::string kDBPath = "/tmp/rocksdb_transaction_example"; +#endif int main() { // open DB diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/options_file_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/options_file_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/options_file_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/options_file_example.cc 2025-05-19 
16:14:27.000000000 +0000 @@ -18,9 +18,24 @@ #include "rocksdb/table.h" #include "rocksdb/utilities/options_util.h" -using namespace ROCKSDB_NAMESPACE; - +using ROCKSDB_NAMESPACE::BlockBasedTableOptions; +using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; +using ROCKSDB_NAMESPACE::ColumnFamilyHandle; +using ROCKSDB_NAMESPACE::ColumnFamilyOptions; +using ROCKSDB_NAMESPACE::CompactionFilter; +using ROCKSDB_NAMESPACE::ConfigOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DBOptions; +using ROCKSDB_NAMESPACE::NewLRUCache; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::Status; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_options_file_example"; +#else std::string kDBPath = "/tmp/rocksdb_options_file_example"; +#endif namespace { // A dummy compaction filter @@ -41,7 +56,8 @@ db_opt.create_if_missing = true; std::vector cf_descs; - cf_descs.push_back({kDefaultColumnFamilyName, ColumnFamilyOptions()}); + cf_descs.push_back( + {ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions()}); cf_descs.push_back({"new_cf", ColumnFamilyOptions()}); // initialize BlockBasedTableOptions @@ -59,7 +75,8 @@ // destroy and open DB DB* db; - Status s = DestroyDB(kDBPath, Options(db_opt, cf_descs[0].options)); + Status s = ROCKSDB_NAMESPACE::DestroyDB(kDBPath, + Options(db_opt, cf_descs[0].options)); assert(s.ok()); s = DB::Open(Options(db_opt, cf_descs[0].options), kDBPath, &db); assert(s.ok()); @@ -79,15 +96,17 @@ // Load the options file. 
DBOptions loaded_db_opt; std::vector loaded_cf_descs; - s = LoadLatestOptions(kDBPath, Env::Default(), &loaded_db_opt, + ConfigOptions config_options; + s = LoadLatestOptions(config_options, kDBPath, &loaded_db_opt, &loaded_cf_descs); assert(s.ok()); assert(loaded_db_opt.create_if_missing == db_opt.create_if_missing); // Initialize pointer options for each column family for (size_t i = 0; i < loaded_cf_descs.size(); ++i) { - auto* loaded_bbt_opt = reinterpret_cast( - loaded_cf_descs[0].options.table_factory->GetOptions()); + auto* loaded_bbt_opt = + loaded_cf_descs[0] + .options.table_factory->GetOptions(); // Expect the same as BlockBasedTableOptions will be loaded form file. assert(loaded_bbt_opt->block_size == bbt_opts.block_size); // However, block_cache needs to be manually initialized as documented diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/simple_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/simple_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/simple_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/simple_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,19 @@ #include "rocksdb/slice.h" #include "rocksdb/options.h" -using namespace ROCKSDB_NAMESPACE; - +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::PinnableSlice; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteOptions; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_simple_example"; +#else std::string kDBPath = "/tmp/rocksdb_simple_example"; +#endif int main() { DB* db; @@ -68,7 +78,7 @@ } PinnableSlice pinnable_val; - db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1", &pinnable_val); + s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1", &pinnable_val); assert(s.IsNotFound()); // Reset PinnableSlice after each use and 
before each reuse pinnable_val.Reset(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/transaction_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/transaction_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/transaction_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/transaction_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,9 +11,21 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" -using namespace ROCKSDB_NAMESPACE; - +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Snapshot; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::Transaction; +using ROCKSDB_NAMESPACE::TransactionDB; +using ROCKSDB_NAMESPACE::TransactionDBOptions; +using ROCKSDB_NAMESPACE::TransactionOptions; +using ROCKSDB_NAMESPACE::WriteOptions; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_transaction_example"; +#else std::string kDBPath = "/tmp/rocksdb_transaction_example"; +#endif int main() { // open DB @@ -179,7 +191,7 @@ // Cleanup delete txn_db; - DestroyDB(kDBPath, options); + ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options); return 0; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ #include "file/delete_scheduler.h" +#include #include #include @@ -14,17 +15,19 @@ #include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -DeleteScheduler::DeleteScheduler(Env* env, 
FileSystem* fs, +DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs, int64_t rate_bytes_per_sec, Logger* info_log, SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) - : env_(env), + : clock_(clock), fs_(fs), total_trash_size_(0), rate_bytes_per_sec_(rate_bytes_per_sec), @@ -32,13 +35,13 @@ bytes_max_delete_chunk_(bytes_max_delete_chunk), closing_(false), cv_(&mu_), + bg_thread_(nullptr), info_log_(info_log), sst_file_manager_(sst_file_manager), max_trash_db_ratio_(max_trash_db_ratio) { assert(sst_file_manager != nullptr); assert(max_trash_db_ratio >= 0); - bg_thread_.reset( - new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this)); + MaybeCreateBackgroundThread(); } DeleteScheduler::~DeleteScheduler() { @@ -50,47 +53,68 @@ if (bg_thread_) { bg_thread_->join(); } + for (const auto& it : bg_errors_) { + it.second.PermitUncheckedError(); + } } Status DeleteScheduler::DeleteFile(const std::string& file_path, const std::string& dir_to_sync, const bool force_bg) { - Status s; if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && total_trash_size_.load() > sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { // Rate limiting is disabled or trash size makes up more than // max_trash_db_ratio_ (default 25%) of the total DB size TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); - s = fs_->DeleteFile(file_path, IOOptions(), nullptr); + Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr); if (s.ok()) { - sst_file_manager_->OnDeleteFile(file_path); + s = sst_file_manager_->OnDeleteFile(file_path); + ROCKS_LOG_INFO(info_log_, + "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64 + ", total_trash_size %" PRIu64 " max_trash_db_ratio %lf", + file_path.c_str(), rate_bytes_per_sec_.load(), + total_trash_size_.load(), max_trash_db_ratio_.load()); + InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY); } return s; } // Move file to trash std::string 
trash_file; - s = MarkAsTrash(file_path, &trash_file); + Status s = MarkAsTrash(file_path, &trash_file); + ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(), + s.ToString().c_str()); if (!s.ok()) { ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s", file_path.c_str(), s.ToString().c_str()); s = fs_->DeleteFile(file_path, IOOptions(), nullptr); if (s.ok()) { - sst_file_manager_->OnDeleteFile(file_path); + s = sst_file_manager_->OnDeleteFile(file_path); + ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately", + trash_file.c_str()); + InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY); } return s; } // Update the total trash size uint64_t trash_file_size = 0; - fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); - total_trash_size_.fetch_add(trash_file_size); + IOStatus io_s = + fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); + if (io_s.ok()) { + total_trash_size_.fetch_add(trash_file_size); + } + //**TODO: What should we do if we failed to + // get the file size? // Add file to delete queue { InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_MARKED_TRASH); queue_.emplace(trash_file, dir_to_sync); pending_files_++; if (pending_files_ == 1) { @@ -131,7 +155,7 @@ std::string trash_file = path + "/" + current_file; if (sfm) { // We have an SstFileManager that will schedule the file delete - sfm->OnAddFile(trash_file); + s = sfm->OnAddFile(trash_file); file_delete = sfm->ScheduleFileDeletion(trash_file, path); } else { // Delete the file immediately @@ -154,17 +178,17 @@ return Status::InvalidArgument("file_path is corrupted"); } - Status s; if (DeleteScheduler::IsTrashFile(file_path)) { // This is already a trash file *trash_file = file_path; - return s; + return Status::OK(); } *trash_file = file_path + kTrashExtension; // TODO(tec) : Implement Env::RenameFileIfNotExist and remove // file_move_mu mutex. 
int cnt = 0; + Status s; InstrumentedMutexLock l(&file_move_mu_); while (true) { s = fs_->FileExists(*trash_file, IOOptions(), nullptr); @@ -182,7 +206,7 @@ cnt++; } if (s.ok()) { - sst_file_manager_->OnMoveFile(file_path, *trash_file); + s = sst_file_manager_->OnMoveFile(file_path, *trash_file); } return s; } @@ -201,22 +225,24 @@ } // Delete all files in queue_ - uint64_t start_time = env_->NowMicros(); + uint64_t start_time = clock_->NowMicros(); uint64_t total_deleted_bytes = 0; int64_t current_delete_rate = rate_bytes_per_sec_.load(); while (!queue_.empty() && !closing_) { if (current_delete_rate != rate_bytes_per_sec_.load()) { // User changed the delete rate current_delete_rate = rate_bytes_per_sec_.load(); - start_time = env_->NowMicros(); + start_time = clock_->NowMicros(); total_deleted_bytes = 0; + ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64, + current_delete_rate); } // Get new file to delete const FileAndDir& fad = queue_.front(); std::string path_in_trash = fad.fname; - // We dont need to hold the lock while deleting the file + // We don't need to hold the lock while deleting the file mu_.Unlock(); uint64_t deleted_bytes = 0; bool is_complete = true; @@ -233,19 +259,27 @@ bg_errors_[path_in_trash] = s; } - // Apply penlty if necessary - uint64_t total_penlty; + // Apply penalty if necessary + uint64_t total_penalty; if (current_delete_rate > 0) { // rate limiting is enabled - total_penlty = + total_penalty = ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate); - while (!closing_ && !cv_.TimedWait(start_time + total_penlty)) {} + ROCKS_LOG_INFO(info_log_, + "Rate limiting is enabled with penalty %" PRIu64 + " after deleting file %s", + total_penalty, path_in_trash.c_str()); + while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) { + } } else { // rate limiting is disabled - total_penlty = 0; + total_penalty = 0; + ROCKS_LOG_INFO(info_log_, + "Rate limiting is disabled after deleting file %s", + 
path_in_trash.c_str()); } TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait", - &total_penlty); + &total_penalty); if (is_complete) { pending_files_--; @@ -323,14 +357,18 @@ s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr); } if (s.ok()) { - s = dir_obj->Fsync(IOOptions(), nullptr); + s = dir_obj->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted)); TEST_SYNC_POINT_CALLBACK( "DeleteScheduler::DeleteTrashFile::AfterSyncDir", reinterpret_cast(const_cast(&dir_to_sync))); } } - *deleted_bytes = file_size; - sst_file_manager_->OnDeleteFile(path_in_trash); + if (s.ok()) { + *deleted_bytes = file_size; + s = sst_file_manager_->OnDeleteFile(path_in_trash); + } } } if (!s.ok()) { @@ -352,6 +390,17 @@ } } +void DeleteScheduler::MaybeCreateBackgroundThread() { + if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) { + bg_thread_.reset( + new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this)); + ROCKS_LOG_INFO(info_log_, + "Created background thread for deletion scheduler with " + "rate_bytes_per_sec: %" PRIi64, + rate_bytes_per_sec_.load()); + } +} + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,26 +15,28 @@ #include "monitoring/instrumented_mutex.h" #include "port/port.h" -#include "rocksdb/file_system.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { class Env; +class FileSystem; class Logger; class SstFileManagerImpl; +class SystemClock; // DeleteScheduler allows the DB to enforce a rate limit on file deletion, // Instead of deleteing files immediately, files are marked as trash -// and deleted 
in a background thread that apply sleep penlty between deletes +// and deleted in a background thread that apply sleep penalty between deletes // if they are happening in a rate faster than rate_bytes_per_sec, // // Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this // case DeleteScheduler will delete files immediately. class DeleteScheduler { public: - DeleteScheduler(Env* env, FileSystem* fs, int64_t rate_bytes_per_sec, - Logger* info_log, SstFileManagerImpl* sst_file_manager, + DeleteScheduler(SystemClock* clock, FileSystem* fs, + int64_t rate_bytes_per_sec, Logger* info_log, + SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~DeleteScheduler(); @@ -45,9 +47,10 @@ // Set delete rate limit in bytes per second void SetRateBytesPerSecond(int64_t bytes_per_sec) { rate_bytes_per_sec_.store(bytes_per_sec); + MaybeCreateBackgroundThread(); } - // Mark file as trash directory and schedule it's deletion. If force_bg is + // Mark file as trash directory and schedule its deletion. 
If force_bg is // set, it forces the file to always be deleted in the background thread, // except when rate limiting is disabled Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, @@ -77,11 +80,16 @@ static const std::string kTrashExtension; static bool IsTrashFile(const std::string& file_path); - // Check if there are any .trash filse in path, and schedule their deletion + // Check if there are any .trash files in path, and schedule their deletion // Or delete immediately if sst_file_manager is nullptr static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm, const std::string& path); + void SetStatisticsPtr(const std::shared_ptr& stats) { + InstrumentedMutexLock l(&mu_); + stats_ = stats; + } + private: Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash); @@ -91,14 +99,16 @@ void BackgroundEmptyTrash(); - Env* env_; + void MaybeCreateBackgroundThread(); + + SystemClock* clock_; FileSystem* fs_; // total size of trash files std::atomic total_trash_size_; // Maximum number of bytes that should be deleted per second std::atomic rate_bytes_per_sec_; - // Mutex to protect queue_, pending_files_, bg_errors_, closing_ + // Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_ InstrumentedMutex mu_; struct FileAndDir { @@ -134,6 +144,7 @@ // immediately std::atomic max_trash_db_ratio_; static const uint64_t kMicrosInSecond = 1000 * 1000LL; + std::shared_ptr stats_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,18 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root 
directory). +#include "file/delete_scheduler.h" + #include #include #include #include -#include "file/delete_scheduler.h" +#include "file/file_util.h" #include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" -#include "test_util/testutil.h" #include "util/string_util.h" #ifndef ROCKSDB_LITE @@ -32,6 +33,7 @@ ToString(i)); DestroyAndCreateDir(dummy_files_dirs_.back()); } + stats_ = ROCKSDB_NAMESPACE::CreateDBStatistics(); } ~DeleteSchedulerTest() override { @@ -39,12 +41,12 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); for (const auto& dummy_files_dir : dummy_files_dirs_) { - test::DestroyDir(env_, dummy_files_dir); + DestroyDir(env_, dummy_files_dir); } } void DestroyAndCreateDir(const std::string& dir) { - ASSERT_OK(test::DestroyDir(env_, dir)); + ASSERT_OK(DestroyDir(env_, dir)); EXPECT_OK(env_->CreateDir(dir)); } @@ -55,7 +57,7 @@ int normal_cnt = 0; for (auto& f : files_in_dir) { - if (!DeleteScheduler::IsTrashFile(f) && f != "." 
&& f != "..") { + if (!DeleteScheduler::IsTrashFile(f)) { normal_cnt++; } } @@ -85,20 +87,20 @@ std::string data(size, 'A'); EXPECT_OK(f->Append(data)); EXPECT_OK(f->Close()); - sst_file_mgr_->OnAddFile(file_path, false); + sst_file_mgr_->OnAddFile(file_path); return file_path; } void NewDeleteScheduler() { - // Tests in this file are for DeleteScheduler component and dont create any + // Tests in this file are for DeleteScheduler component and don't create any // DBs, so we need to set max_trash_db_ratio to 100% (instead of default // 25%) - std::shared_ptr - fs(std::make_shared(env_)); sst_file_mgr_.reset( - new SstFileManagerImpl(env_, fs, nullptr, rate_bytes_per_sec_, + new SstFileManagerImpl(env_->GetSystemClock(), env_->GetFileSystem(), + nullptr, rate_bytes_per_sec_, /* max_trash_db_ratio= */ 1.1, 128 * 1024)); delete_scheduler_ = sst_file_mgr_->delete_scheduler(); + sst_file_mgr_->SetStatisticsPtr(stats_); } Env* env_; @@ -106,6 +108,7 @@ int64_t rate_bytes_per_sec_; DeleteScheduler* delete_scheduler_; std::unique_ptr sst_file_mgr_; + std::shared_ptr stats_; }; // Test the basic functionality of DeleteScheduler (Rate Limiting). 
@@ -182,6 +185,8 @@ ASSERT_EQ(num_files, dir_synced); ASSERT_EQ(CountTrashFiles(), 0); + ASSERT_EQ(num_files, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } @@ -219,6 +224,9 @@ ASSERT_EQ(0, CountTrashFiles(i)); } + ASSERT_EQ(kNumFiles, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -301,12 +309,16 @@ ASSERT_EQ(CountNormalFiles(), 0); ASSERT_EQ(CountTrashFiles(), 0); + ASSERT_EQ(num_files * thread_cnt, + stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } // Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure -// that when DeleteScheduler delete a file it delete it immediately and dont +// that when DeleteScheduler delete a file it delete it immediately and don't // move it to trash TEST_F(DeleteSchedulerTest, DisableRateLimiting) { int bg_delete_file = 0; @@ -318,8 +330,9 @@ rate_bytes_per_sec_ = 0; NewDeleteScheduler(); + constexpr int num_files = 10; - for (int i = 0; i < 10; i++) { + for (int i = 0; i < num_files; i++) { // Every file we delete will be deleted immediately std::string dummy_file = NewDummyFile("dummy.data"); ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, "")); @@ -329,6 +342,9 @@ } ASSERT_EQ(bg_delete_file, 0); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(num_files, + stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -365,6 +381,8 @@ auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 0); + ASSERT_EQ(10, 
stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -406,7 +424,9 @@ delete_scheduler_->WaitForEmptyTrash(); auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 10); - + for (const auto& it : bg_errors) { + ASSERT_TRUE(it.second.IsPathNotFound()); + } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -439,9 +459,12 @@ auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 0); + ASSERT_EQ(10, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); } ASSERT_EQ(bg_delete_file, 50); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); } @@ -647,12 +670,14 @@ } for (std::string& file_name : generated_files) { - delete_scheduler_->DeleteFile(file_name, ""); + ASSERT_OK(delete_scheduler_->DeleteFile(file_name, "")); } // When we end up with 26 files in trash we will start // deleting new files immediately ASSERT_EQ(fg_delete_file, 74); + ASSERT_EQ(26, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(74, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -21,12 +21,14 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, +Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, + RandomAccessFileReader* reader, uint64_t 
offset, size_t n, bool for_compaction) { if (!enable_ || reader == nullptr) { return Status::OK(); } + TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start"); size_t alignment = reader->file()->GetRequiredBufferAlignment(); size_t offset_ = static_cast(offset); uint64_t rounddown_offset = Rounddown(offset_, alignment); @@ -86,18 +88,30 @@ } Slice result; - s = reader->Read(rounddown_offset + chunk_len, - static_cast(roundup_len - chunk_len), &result, - buffer_.BufferStart() + chunk_len, for_compaction); - if (s.ok()) { - buffer_offset_ = rounddown_offset; - buffer_.Size(static_cast(chunk_len) + result.size()); - } + size_t read_len = static_cast(roundup_len - chunk_len); + s = reader->Read(opts, rounddown_offset + chunk_len, read_len, &result, + buffer_.BufferStart() + chunk_len, nullptr, for_compaction); + if (!s.ok()) { + return s; + } + +#ifndef NDEBUG + if (result.size() < read_len) { + // Fake an IO error to force db_stress fault injection to ignore + // truncated read errors + IGNORE_STATUS_IF_ERROR(Status::IOError()); + } +#endif + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast(chunk_len) + result.size()); return s; } -bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, - Slice* result, bool for_compaction) { +bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, + Slice* result, Status* status, + bool for_compaction) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -106,21 +120,47 @@ } // If the buffer contains only a few of the requested bytes: - // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // If readahead is enabled: prefetch the remaining bytes + readahead bytes // and satisfy the request. // If readahead is not enabled: return false. 
+ TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache", + &readahead_size_); if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { if (readahead_size_ > 0) { - assert(file_reader_ != nullptr); + assert(reader != nullptr); assert(max_readahead_size_ >= readahead_size_); Status s; if (for_compaction) { - s = Prefetch(file_reader_, offset, std::max(n, readahead_size_), + s = Prefetch(opts, reader, offset, std::max(n, readahead_size_), for_compaction); } else { - s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + if (implicit_auto_readahead_) { + // Prefetch only if this read is sequential otherwise reset + // readahead_size_ to initial value. + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, n); + ResetValues(); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + num_file_reads_++; + if (num_file_reads_ <= kMinNumFileReadsToStartAutoReadahead) { + UpdateReadPattern(offset, n); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + } + s = Prefetch(opts, reader, offset, n + readahead_size_, for_compaction); } if (!s.ok()) { + if (status) { + *status = s; + } +#ifndef NDEBUG + IGNORE_STATUS_IF_ERROR(s); +#endif return false; } readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); @@ -128,7 +168,7 @@ return false; } } - + UpdateReadPattern(offset, n); uint64_t offset_in_buffer = offset - buffer_offset_; *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); return true; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,23 +8,33 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #pragma once +#include #include #include #include -#include "file/random_access_file_reader.h" + +#include "file/readahead_file_info.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "util/aligned_buffer.h" namespace ROCKSDB_NAMESPACE { +#define DEAFULT_DECREMENT 8 * 1024 + +struct IOOptions; +class RandomAccessFileReader; + // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: + static const int kMinNumFileReadsToStartAutoReadahead = 2; + static const size_t kInitAutoReadaheadSize = 8 * 1024; + // Constructor. // // All arguments are optional. - // file_reader : the file reader to use. Can be a nullptr. // readahead_size : the initial readahead size. // max_readahead_size : the maximum readahead size. // If max_readahead_size > readahead_size, the readahead size will be @@ -36,54 +46,113 @@ // for the minimum offset if track_min_offset = true. // track_min_offset : Track the minimum offset ever read and collect stats on // it. Used for adaptable readahead of the file footer/metadata. + // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after + // doing sequential scans for two times. // - // Automatic readhead is enabled for a file if file_reader, readahead_size, + // Automatic readhead is enabled for a file if readahead_size // and max_readahead_size are passed in. - // If file_reader is a nullptr, setting readadhead_size and max_readahead_size - // does not make any sense. So it does nothing. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. 
- FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, - size_t readadhead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false) + FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false) : buffer_offset_(0), - file_reader_(file_reader), - readahead_size_(readadhead_size), + readahead_size_(readahead_size), max_readahead_size_(max_readahead_size), min_offset_read_(port::kMaxSizet), enable_(enable), - track_min_offset_(track_min_offset) {} + track_min_offset_(track_min_offset), + implicit_auto_readahead_(implicit_auto_readahead), + prev_offset_(0), + prev_len_(0), + num_file_reads_(kMinNumFileReadsToStartAutoReadahead + 1) {} // Load data into the buffer from a file. // reader : the file reader. // offset : the file offset to start reading from. // n : the number of bytes to read. // for_compaction : if prefetch is done for compaction read. - Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n, - bool for_compaction = false); + Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader, + uint64_t offset, size_t n, bool for_compaction = false); - // Tries returning the data for a file raed from this buffer, if that data is + // Tries returning the data for a file read from this buffer if that data is // in the buffer. // It handles tracking the minimum read offset if track_min_offset = true. - // It also does the exponential readahead when readadhead_size is set as part + // It also does the exponential readahead when readahead_size is set as part // of the constructor. // - // offset : the file offset. - // n : the number of bytes. - // result : output buffer to put the data into. - // for_compaction : if cache read is done for compaction read. - bool TryReadFromCache(uint64_t offset, size_t n, Slice* result, + // opts : the IO options to use. 
+ // reader : the file reader. + // offset : the file offset. + // n : the number of bytes. + // result : output buffer to put the data into. + // s : output status. + // for_compaction : true if cache read is done for compaction read. + bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, Status* s, bool for_compaction = false); // The minimum `offset` ever passed to TryReadFromCache(). This will nly be // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } + // Called in case of implicit auto prefetching. + void UpdateReadPattern(const uint64_t& offset, const size_t& len, + bool is_adaptive_readahead = false) { + if (is_adaptive_readahead) { + // Since this block was eligible for prefetch but it was found in + // cache, so check and decrease the readahead_size by 8KB (default) + // if eligible. + DecreaseReadAheadIfEligible(offset, len); + } + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const size_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + // Called in case of implicit auto prefetching. 
+ void ResetValues() { + num_file_reads_ = 1; + readahead_size_ = kInitAutoReadaheadSize; + } + + void GetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) { + readahead_info->readahead_size = readahead_size_; + readahead_info->num_file_reads = num_file_reads_; + } + + void DecreaseReadAheadIfEligible(uint64_t offset, size_t size, + size_t value = DEAFULT_DECREMENT) { + // Decrease the readahead_size if + // - its enabled internally by RocksDB (implicit_auto_readahead_) and, + // - readahead_size is greater than 0 and, + // - this block would have called prefetch API if not found in cache for + // which conditions are: + // - few/no bytes are in buffer and, + // - block is sequential with the previous read and, + // - num_file_reads_ + 1 (including this read) > + // kMinNumFileReadsToStartAutoReadahead + if (implicit_auto_readahead_ && readahead_size_ > 0) { + if ((offset + size > buffer_offset_ + buffer_.CurrentSize()) && + IsBlockSequential(offset) && + (num_file_reads_ + 1 > kMinNumFileReadsToStartAutoReadahead)) { + size_t initial_auto_readahead_size = kInitAutoReadaheadSize; + readahead_size_ = + std::max(initial_auto_readahead_size, + (readahead_size_ >= value ? readahead_size_ - value : 0)); + } + } + } + private: AlignedBuffer buffer_; uint64_t buffer_offset_; - RandomAccessFileReader* file_reader_; size_t readahead_size_; + // FilePrefetchBuffer object won't be created from Iterator flow if + // max_readahead_size_ = 0. size_t max_readahead_size_; // The minimum `offset` ever passed to TryReadFromCache(). size_t min_offset_read_; @@ -93,5 +162,12 @@ // If true, track minimum `offset` ever passed to TryReadFromCache(), which // can be fetched from min_offset_read(). bool track_min_offset_; + + // implicit_auto_readahead is enabled by rocksdb internally after 2 + // sequential IOs. 
+ bool implicit_auto_readahead_; + uint64_t prev_offset_; + size_t prev_len_; + int64_t num_file_reads_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,33 +17,35 @@ namespace ROCKSDB_NAMESPACE { // Utility function to copy a file up to a specified length -Status CopyFile(FileSystem* fs, const std::string& source, - const std::string& destination, uint64_t size, bool use_fsync) { +IOStatus CopyFile(FileSystem* fs, const std::string& source, + const std::string& destination, uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer) { const FileOptions soptions; - Status s; + IOStatus io_s; std::unique_ptr src_reader; std::unique_ptr dest_writer; { std::unique_ptr srcfile; - s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr); - if (!s.ok()) { - return s; + io_s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr); + if (!io_s.ok()) { + return io_s; } std::unique_ptr destfile; - s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); - if (!s.ok()) { - return s; + io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); + if (!io_s.ok()) { + return io_s; } if (size == 0) { // default argument means copy everything - s = fs->GetFileSize(source, IOOptions(), &size, nullptr); - if (!s.ok()) { - return s; + io_s = fs->GetFileSize(source, IOOptions(), &size, nullptr); + if (!io_s.ok()) { + return io_s; } } - src_reader.reset(new SequentialFileReader(std::move(srcfile), source)); + src_reader.reset( + new SequentialFileReader(std::move(srcfile), source, io_tracer)); dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, soptions)); } @@ -52,16 +54,16 @@ Slice slice; while (size > 0) { 
size_t bytes_to_read = std::min(sizeof(buffer), static_cast(size)); - s = src_reader->Read(bytes_to_read, &slice, buffer); - if (!s.ok()) { - return s; + io_s = status_to_io_status(src_reader->Read(bytes_to_read, &slice, buffer)); + if (!io_s.ok()) { + return io_s; } if (slice.size() == 0) { - return Status::Corruption("file too small"); + return IOStatus::Corruption("file too small"); } - s = dest_writer->Append(slice); - if (!s.ok()) { - return s; + io_s = dest_writer->Append(slice); + if (!io_s.ok()) { + return io_s; } size -= slice.size(); } @@ -69,22 +71,22 @@ } // Utility function to create a file with the provided contents -Status CreateFile(FileSystem* fs, const std::string& destination, - const std::string& contents, bool use_fsync) { +IOStatus CreateFile(FileSystem* fs, const std::string& destination, + const std::string& contents, bool use_fsync) { const EnvOptions soptions; - Status s; + IOStatus io_s; std::unique_ptr dest_writer; std::unique_ptr destfile; - s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); - if (!s.ok()) { - return s; + io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); + if (!io_s.ok()) { + return io_s; } dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, soptions)); - s = dest_writer->Append(Slice(contents)); - if (!s.ok()) { - return s; + io_s = dest_writer->Append(Slice(contents)); + if (!io_s.ok()) { + return io_s; } return dest_writer->Sync(use_fsync); } @@ -110,15 +112,147 @@ #endif } -bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { - bool same = false; - assert(!db_options->db_paths.empty()); - Status s = db_options->env->AreFilesSame(db_options->wal_dir, - db_options->db_paths[0].path, &same); - if (s.IsNotSupported()) { - same = db_options->wal_dir == db_options->db_paths[0].path; +// requested_checksum_func_name brings the function name of the checksum +// generator in checksum_factory. 
Empty string is permitted, in which case the +// name of the generator created by the factory is unchecked. When +// `requested_checksum_func_name` is non-empty, however, the created generator's +// name must match it, otherwise an `InvalidArgument` error is returned. +IOStatus GenerateOneFileChecksum( + FileSystem* fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool allow_mmap_reads, + std::shared_ptr& io_tracer, RateLimiter* rate_limiter) { + if (checksum_factory == nullptr) { + return IOStatus::InvalidArgument("Checksum factory is invalid"); + } + assert(file_checksum != nullptr); + assert(file_checksum_func_name != nullptr); + + FileChecksumGenContext gen_context; + gen_context.requested_checksum_func_name = requested_checksum_func_name; + gen_context.file_name = file_path; + std::unique_ptr checksum_generator = + checksum_factory->CreateFileChecksumGenerator(gen_context); + if (checksum_generator == nullptr) { + std::string msg = + "Cannot get the file checksum generator based on the requested " + "checksum function name: " + + requested_checksum_func_name + + " from checksum factory: " + checksum_factory->Name(); + return IOStatus::InvalidArgument(msg); + } else { + // For backward compatibility and use in file ingestion clients where there + // is no stored checksum function name, `requested_checksum_func_name` can + // be empty. If we give the requested checksum function name, we expect it + // is the same name of the checksum generator. 
+ if (!requested_checksum_func_name.empty() && + checksum_generator->Name() != requested_checksum_func_name) { + std::string msg = "Expected file checksum generator named '" + + requested_checksum_func_name + + "', while the factory created one " + "named '" + + checksum_generator->Name() + "'"; + return IOStatus::InvalidArgument(msg); + } + } + + uint64_t size; + IOStatus io_s; + std::unique_ptr reader; + { + std::unique_ptr r_file; + io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr); + if (!io_s.ok()) { + return io_s; + } + io_s = fs->GetFileSize(file_path, IOOptions(), &size, nullptr); + if (!io_s.ok()) { + return io_s; + } + reader.reset(new RandomAccessFileReader(std::move(r_file), file_path, + nullptr /*Env*/, io_tracer, nullptr, + 0, nullptr, rate_limiter)); + } + + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + size_t default_max_read_ahead_size = 256 * 1024; + size_t readahead_size = (verify_checksums_readahead_size != 0) + ? 
verify_checksums_readahead_size + : default_max_read_ahead_size; + + FilePrefetchBuffer prefetch_buffer(readahead_size /* readahead_size */, + readahead_size /* max_readahead_size */, + !allow_mmap_reads /* enable */); + + Slice slice; + uint64_t offset = 0; + IOOptions opts; + while (size > 0) { + size_t bytes_to_read = + static_cast(std::min(uint64_t{readahead_size}, size)); + if (!prefetch_buffer.TryReadFromCache( + opts, reader.get(), offset, bytes_to_read, &slice, + nullptr /* status */, false /* for_compaction */)) { + return IOStatus::Corruption("file read failed"); + } + if (slice.size() == 0) { + return IOStatus::Corruption("file too small"); + } + checksum_generator->Update(slice.data(), slice.size()); + size -= slice.size(); + offset += slice.size(); + } + checksum_generator->Finalize(); + *file_checksum = checksum_generator->GetChecksum(); + *file_checksum_func_name = checksum_generator->Name(); + return IOStatus::OK(); +} + +Status DestroyDir(Env* env, const std::string& dir) { + Status s; + if (env->FileExists(dir).IsNotFound()) { + return s; + } + std::vector files_in_dir; + s = env->GetChildren(dir, &files_in_dir); + if (s.ok()) { + for (auto& file_in_dir : files_in_dir) { + std::string path = dir + "/" + file_in_dir; + bool is_dir = false; + s = env->IsDirectory(path, &is_dir); + if (s.ok()) { + if (is_dir) { + s = DestroyDir(env, path); + } else { + s = env->DeleteFile(path); + } + } else if (s.IsNotSupported()) { + s = Status::OK(); + } + if (!s.ok()) { + // IsDirectory, etc. 
might not report NotFound + if (s.IsNotFound() || env->FileExists(path).IsNotFound()) { + // Allow files to be deleted externally + s = Status::OK(); + } else { + break; + } + } + } + } + + if (s.ok()) { + s = env->DeleteDir(dir); + // DeleteDir might or might not report NotFound + if (!s.ok() && (s.IsNotFound() || env->FileExists(dir).IsNotFound())) { + // Allow to be deleted externally + s = Status::OK(); + } } - return same; + return s; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,24 +10,83 @@ #include "options/db_options.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/sst_file_writer.h" #include "rocksdb/status.h" +#include "rocksdb/system_clock.h" #include "rocksdb/types.h" +#include "trace_replay/io_tracer.h" namespace ROCKSDB_NAMESPACE { // use_fsync maps to options.use_fsync, which determines the way that // the file is synced after copying. 
-extern Status CopyFile(FileSystem* fs, const std::string& source, - const std::string& destination, uint64_t size, - bool use_fsync); +extern IOStatus CopyFile(FileSystem* fs, const std::string& source, + const std::string& destination, uint64_t size, + bool use_fsync, + const std::shared_ptr& io_tracer = nullptr); +inline IOStatus CopyFile(const std::shared_ptr& fs, + const std::string& source, + const std::string& destination, uint64_t size, + bool use_fsync, + const std::shared_ptr& io_tracer = nullptr) { + return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer); +} -extern Status CreateFile(FileSystem* fs, const std::string& destination, - const std::string& contents, bool use_fsync); +extern IOStatus CreateFile(FileSystem* fs, const std::string& destination, + const std::string& contents, bool use_fsync); + +inline IOStatus CreateFile(const std::shared_ptr& fs, + const std::string& destination, + const std::string& contents, bool use_fsync) { + return CreateFile(fs.get(), destination, contents, use_fsync); +} extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& path_to_sync, const bool force_bg, const bool force_fg); -extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options); +extern IOStatus GenerateOneFileChecksum( + FileSystem* fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool allow_mmap_reads, + std::shared_ptr& io_tracer, RateLimiter* rate_limiter = nullptr); + +inline IOStatus GenerateOneFileChecksum( + const std::shared_ptr& fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool 
allow_mmap_reads, + std::shared_ptr& io_tracer) { + return GenerateOneFileChecksum( + fs.get(), file_path, checksum_factory, requested_checksum_func_name, + file_checksum, file_checksum_func_name, verify_checksums_readahead_size, + allow_mmap_reads, io_tracer); +} + +inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, + SystemClock* clock, IOOptions& opts) { + if (ro.deadline.count()) { + std::chrono::microseconds now = + std::chrono::microseconds(clock->NowMicros()); + // Ensure there is atleast 1us available. We don't want to pass a value of + // 0 as that means no timeout + if (now >= ro.deadline) { + return IOStatus::TimedOut("Deadline exceeded"); + } + opts.timeout = ro.deadline - now; + } + + if (ro.io_timeout.count() && + (!opts.timeout.count() || ro.io_timeout < opts.timeout)) { + opts.timeout = ro.io_timeout; + } + return IOStatus::OK(); +} +// Test method to delete the input directory and all of its contents. +// This method is destructive and is meant for use only in tests!!! 
+Status DestroyDir(Env* env, const std::string& dir); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,6 @@ #include #include #include "file/writable_file_writer.h" -#include "logging/logging.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" @@ -21,9 +20,14 @@ namespace ROCKSDB_NAMESPACE { +const std::string kCurrentFileName = "CURRENT"; +const std::string kOptionsFileNamePrefix = "OPTIONS-"; +const std::string kTempFileNameSuffix = "dbtmp"; + static const std::string kRocksDbTFileExt = "sst"; static const std::string kLevelDbTFileExt = "ldb"; static const std::string kRocksDBBlobFileExt = "blob"; +static const std::string kArchivalDirName = "archive"; // Given a path, flatten the path name by replacing all chars not in // {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end. 
@@ -79,6 +83,11 @@ return MakeFileName(number, "log"); } +std::string BlobFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, kRocksDBBlobFileExt.c_str()); +} + std::string BlobFileName(const std::string& blobdirname, uint64_t number) { assert(number > 0); return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); @@ -92,11 +101,11 @@ } std::string ArchivalDirectory(const std::string& dir) { - return dir + "/" + ARCHIVAL_DIR; + return dir + "/" + kArchivalDirName; } std::string ArchivedLogFileName(const std::string& name, uint64_t number) { assert(number > 0); - return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log"); + return MakeFileName(name + "/" + kArchivalDirName, number, "log"); } std::string MakeTableFileName(const std::string& path, uint64_t number) { @@ -151,16 +160,20 @@ } } -std::string DescriptorFileName(const std::string& dbname, uint64_t number) { +std::string DescriptorFileName(uint64_t number) { assert(number > 0); char buf[100]; - snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + snprintf(buf, sizeof(buf), "MANIFEST-%06llu", static_cast(number)); - return dbname + buf; + return buf; +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + return dbname + "/" + DescriptorFileName(number); } std::string CurrentFileName(const std::string& dbname) { - return dbname + "/CURRENT"; + return dbname + "/" + kCurrentFileName; } std::string LockFileName(const std::string& dbname) { @@ -179,7 +192,8 @@ snprintf(buf, sizeof(buf), kInfoLogPrefix); prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1); } else { - size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf)); + size_t len = + GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf)); prefix = Slice(buf, len); } } @@ -208,11 +222,14 @@ return log_dir + "/" + info_log_prefix.buf + ".old." 
+ buf; } -std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { +std::string OptionsFileName(uint64_t file_num) { char buffer[256]; snprintf(buffer, sizeof(buffer), "%s%06" PRIu64, kOptionsFileNamePrefix.c_str(), file_num); - return dbname + "/" + buffer; + return buffer; +} +std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { + return dbname + "/" + OptionsFileName(file_num); } std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) { @@ -326,11 +343,12 @@ // Avoid strtoull() to keep filename format independent of the // current locale bool archive_dir_found = false; - if (rest.starts_with(ARCHIVAL_DIR)) { - if (rest.size() <= ARCHIVAL_DIR.size()) { + if (rest.starts_with(kArchivalDirName)) { + if (rest.size() <= kArchivalDirName.size()) { return false; } - rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also + rest.remove_prefix(kArchivalDirName.size() + + 1); // Add 1 to remove / also if (log_type) { *log_type = kArchivedLogFile; } @@ -347,7 +365,7 @@ Slice suffix = rest; if (suffix == Slice("log")) { - *type = kLogFile; + *type = kWalFile; if (log_type && !archive_dir_found) { *log_type = kAliveLogFile; } @@ -368,27 +386,34 @@ return true; } -Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number, - Directory* directory_to_fsync) { +IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, + uint64_t descriptor_number, + FSDirectory* directory_to_fsync) { // Remove leading "dbname/" and add newline to manifest file name std::string manifest = DescriptorFileName(dbname, descriptor_number); Slice contents = manifest; assert(contents.starts_with(dbname + "/")); contents.remove_prefix(dbname.size() + 1); std::string tmp = TempFileName(dbname, descriptor_number); - Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true); + IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true); + 
TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { - TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2); - s = env->RenameFile(tmp, CurrentFileName(dbname)); - TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2); + s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s); } if (s.ok()) { if (directory_to_fsync != nullptr) { - s = directory_to_fsync->Fsync(); + s = directory_to_fsync->FsyncWithDirOptions( + IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname))); } } else { - env->DeleteFile(tmp); + fs->DeleteFile(tmp, IOOptions(), nullptr) + .PermitUncheckedError(); // NOTE: PermitUncheckedError is acceptable + // here as we are already handling an error + // case, and this is just a best-attempt + // effort at some cleanup } return s; } @@ -404,30 +429,41 @@ assert(!id.empty()); // Reserve the filename dbname/000000.dbtmp for the temporary identity file std::string tmp = TempFileName(dbname, 0); + std::string identify_file_name = IdentityFileName(dbname); Status s = WriteStringToFile(env, id, tmp, true); if (s.ok()) { - s = env->RenameFile(tmp, IdentityFileName(dbname)); + s = env->RenameFile(tmp, identify_file_name); + } + std::unique_ptr dir_obj; + if (s.ok()) { + s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj, + nullptr); + } + if (s.ok()) { + s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + DirFsyncOptions(identify_file_name)); } if (!s.ok()) { - env->DeleteFile(tmp); + env->DeleteFile(tmp).PermitUncheckedError(); } return s; } -Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, - WritableFileWriter* file) { - TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2); - StopWatch sw(env, db_options->statistics.get(), 
MANIFEST_FILE_SYNC_MICROS); +IOStatus SyncManifest(const ImmutableDBOptions* db_options, + WritableFileWriter* file) { + TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2); + StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS); return file->Sync(db_options->use_fsync); } -Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, - const std::string& dbname, std::string* parent_dir, +Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, + std::string* parent_dir, std::vector* info_log_list) { assert(parent_dir != nullptr); assert(info_log_list != nullptr); uint64_t number = 0; - FileType type = kLogFile; + FileType type = kWalFile; if (!db_log_dir.empty()) { *parent_dir = db_log_dir; @@ -438,7 +474,7 @@ InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); std::vector file_names; - Status s = env->GetChildren(*parent_dir, &file_names); + Status s = fs->GetChildren(*parent_dir, IOOptions(), &file_names, nullptr); if (!s.ok()) { return s; @@ -453,4 +489,16 @@ return Status::OK(); } +std::string NormalizePath(const std::string& path) { + std::string dst; + for (auto c : path) { + if (!dst.empty() && (c == kFilePathSeparator || c == '/') && + (dst.back() == kFilePathSeparator || dst.back() == '/')) { + continue; + } + dst.push_back(c); + } + return dst; +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,7 @@ #include "options/db_options.h" #include "port/port.h" +#include "rocksdb/file_system.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -26,21 +27,14 @@ class Env; class Directory; +class SystemClock; class 
WritableFileWriter; -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile, // Either the current one, or an old one - kMetaDatabase, - kIdentityFile, - kOptionsFile, - kBlobFile -}; +#ifdef OS_WIN +constexpr char kFilePathSeparator = '\\'; +#else +constexpr char kFilePathSeparator = '/'; +#endif // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with @@ -49,13 +43,13 @@ extern std::string LogFileName(uint64_t number); +extern std::string BlobFileName(uint64_t number); + extern std::string BlobFileName(const std::string& bdirname, uint64_t number); extern std::string BlobFileName(const std::string& dbname, const std::string& blob_dir, uint64_t number); -static const std::string ARCHIVAL_DIR = "archive"; - extern std::string ArchivalDirectory(const std::string& dbname); // Return the name of the archived log file with the specified number @@ -93,6 +87,10 @@ extern std::string DescriptorFileName(const std::string& dbname, uint64_t number); +extern std::string DescriptorFileName(uint64_t number); + +extern const std::string kCurrentFileName; // = "CURRENT" + // Return the name of the current file. This file contains the name // of the current manifest file. The result will be prefixed with // "dbname". @@ -126,13 +124,14 @@ const std::string& db_path = "", const std::string& log_dir = ""); -static const std::string kOptionsFileNamePrefix = "OPTIONS-"; -static const std::string kTempFileNameSuffix = "dbtmp"; +extern const std::string kOptionsFileNamePrefix; // = "OPTIONS-" +extern const std::string kTempFileNameSuffix; // = "dbtmp" // Return a options file name given the "dbname" and file number. // Format: OPTIONS-[number].dbtmp extern std::string OptionsFileName(const std::string& dbname, uint64_t file_num); +extern std::string OptionsFileName(uint64_t file_num); // Return a temp options file name given the "dbname" and file number. 
// Format: OPTIONS-[number] @@ -162,24 +161,27 @@ // Make the CURRENT file point to the descriptor file with the // specified number. -extern Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number, - Directory* directory_to_fsync); +extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, + uint64_t descriptor_number, + FSDirectory* directory_to_fsync); // Make the IDENTITY file for the db extern Status SetIdentityFile(Env* env, const std::string& dbname, const std::string& db_id = {}); // Sync manifest file `file`. -extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, - WritableFileWriter* file); +extern IOStatus SyncManifest(const ImmutableDBOptions* db_options, + WritableFileWriter* file); // Return list of file names of info logs in `file_names`. // The list only contains file name. The parent directory name is stored // in `parent_dir`. // `db_log_dir` should be the one as in options.db_log_dir -extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, +extern Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, std::string* parent_dir, std::vector* file_names); + +extern std::string NormalizePath(const std::string& path); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,68 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "file/line_file_reader.h" + +#include + +#include "monitoring/iostats_context_imp.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus LineFileReader::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new LineFileReader(std::move(file), fname)); + } + return io_s; +} + +bool LineFileReader::ReadLine(std::string* out) { + assert(out); + if (!io_status_.ok()) { + // Status should be checked (or permit unchecked) any time we return false. + io_status_.MustCheck(); + return false; + } + out->clear(); + for (;;) { + // Look for line delimiter + const char* found = static_cast( + std::memchr(buf_begin_, '\n', buf_end_ - buf_begin_)); + if (found) { + size_t len = found - buf_begin_; + out->append(buf_begin_, len); + buf_begin_ += len + /*delim*/ 1; + ++line_number_; + return true; + } + if (at_eof_) { + io_status_.MustCheck(); + return false; + } + // else flush and reload buffer + out->append(buf_begin_, buf_end_ - buf_begin_); + Slice result; + io_status_ = sfr_.Read(buf_.size(), &result, buf_.data()); + IOSTATS_ADD(bytes_read, result.size()); + if (!io_status_.ok()) { + io_status_.MustCheck(); + return false; + } + if (result.size() != buf_.size()) { + // The obscure way of indicating EOF + at_eof_ = true; + } + buf_begin_ = result.data(); + buf_end_ = result.data() + result.size(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include + +#include "file/sequence_file_reader.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper on top of Env::SequentialFile for reading text lines from a file. +// Lines are delimited by '\n'. The last line may or may not include a +// trailing newline. Uses SequentialFileReader internally. +class LineFileReader { + private: + std::array buf_; + SequentialFileReader sfr_; + IOStatus io_status_; + const char* buf_begin_ = buf_.data(); + const char* buf_end_ = buf_.data(); + size_t line_number_ = 0; + bool at_eof_ = false; + + public: + // See SequentialFileReader constructors + template + explicit LineFileReader(Args&&... args) + : sfr_(std::forward(args)...) {} + + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); + + LineFileReader(const LineFileReader&) = delete; + LineFileReader& operator=(const LineFileReader&) = delete; + + // Reads another line from the file, returning true on success and saving + // the line to `out`, without delimiter, or returning false on failure. You + // must check GetStatus() to determine whether the failure was just + // end-of-file (OK status) or an I/O error (another status). + bool ReadLine(std::string* out); + + // Returns the number of the line most recently returned from ReadLine. + // Return value is unspecified if ReadLine has returned false due to + // I/O error. After ReadLine returns false due to end-of-file, return + // value is the last returned line number, or equivalently the total + // number of lines returned. + size_t GetLineNumber() const { return line_number_; } + + // Returns any error encountered during read. 
The error is considered + // permanent and no retry or recovery is attempted with the same + // LineFileReader. + const IOStatus& GetStatus() const { return io_status_; } +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/prefetch_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/prefetch_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/prefetch_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/prefetch_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1004 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class MockFS; + +class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + MockRandomAccessFile(std::unique_ptr& file, + bool support_prefetch, std::atomic_int& prefetch_count) + : FSRandomAccessFileOwnerWrapper(std::move(file)), + support_prefetch_(support_prefetch), + prefetch_count_(prefetch_count) {} + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { + if (support_prefetch_) { + prefetch_count_.fetch_add(1); + return target()->Prefetch(offset, n, options, dbg); + } else { + return IOStatus::NotSupported("Prefetch not supported"); + } + } + + private: + const bool support_prefetch_; + std::atomic_int& prefetch_count_; +}; + +class MockFS : public FileSystemWrapper { + public: + explicit MockFS(const std::shared_ptr& wrapped, + bool support_prefetch) + : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {} + + static const char* kClassName() { return "MockFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus 
NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s; + s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + result->reset( + new MockRandomAccessFile(file, support_prefetch_, prefetch_count_)); + return s; + } + + void ClearPrefetchCount() { prefetch_count_ = 0; } + + bool IsPrefetchCalled() { return prefetch_count_ > 0; } + + int GetPrefetchCount() { + return prefetch_count_.load(std::memory_order_relaxed); + } + + private: + const bool support_prefetch_; + std::atomic_int prefetch_count_{0}; +}; + +class PrefetchTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + PrefetchTest() : DBTestBase("prefetch_test", true) {} +}; + +INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + +std::string BuildKey(int num, std::string postfix = "") { + return "my_key_" + std::to_string(num) + postfix; +} + +TEST_P(PrefetchTest, Basic) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + const int kNumKeys = 1100; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + 
+ Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + // create first key range + WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key")); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + // create second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key")); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + // delete second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Delete(BuildKey(i, "key2"))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + // compact database + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + // commenting out the line below causes the example to work correctly + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + if (support_prefetch && !use_direct_io) { + // If underline file system supports prefetch, and directIO is not enabled + // make sure prefetch() is called and FilePrefetchBuffer is not used. + ASSERT_TRUE(fs->IsPrefetchCalled()); + fs->ClearPrefetchCount(); + ASSERT_EQ(0, buff_prefetch_count); + } else { + // If underline file system doesn't support prefetch, or directIO is + // enabled, make sure prefetch() is not called and FilePrefetchBuffer is + // used. + ASSERT_FALSE(fs->IsPrefetchCalled()); + ASSERT_GT(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + + // count the keys + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + } + + // Make sure prefetch is called only if file system support prefetch. 
+ if (support_prefetch && !use_direct_io) { + ASSERT_TRUE(fs->IsPrefetchCalled()); + fs->ClearPrefetchCount(); + ASSERT_EQ(0, buff_prefetch_count); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + ASSERT_GT(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + Close(); +} + +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + options.disable_auto_compactions = true; + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.max_auto_readahead_size = 0; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + // DB open will create table readers unless we reduce the table cache + // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table + // cache is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 10 so table cache capacity will become 0. 
This will + // prevent file open during DB open and force the file to be opened during + // Iteration. + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + Random rnd(309); + int key_count = 0; + const int num_keys_per_level = 100; + // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299]. + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + for (int i = 0; i < num_keys_per_level; ++i) { + ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + Close(); + std::vector buff_prefectch_level_count = {0, 0, 0}; + TryReopen(options); + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + switch (level) { + case 0: + // max_auto_readahead_size is set 0 so data and index blocks are not + // prefetched. + ASSERT_OK(db_->SetOptions( + {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}})); + break; + case 1: + // max_auto_readahead_size is set less than + // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains + // equal to max_auto_readahead_size. 
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=4096;}"}})); + break; + case 2: + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=65536;}"}})); + break; + default: + assert(false); + } + + for (int i = 0; i < num_keys_per_level; ++i) { + iter->Seek(Key(key_count++)); + iter->Next(); + } + + buff_prefectch_level_count[level] = buff_prefetch_count; + if (support_prefetch && !use_direct_io) { + if (level == 0) { + ASSERT_FALSE(fs->IsPrefetchCalled()); + } else { + ASSERT_TRUE(fs->IsPrefetchCalled()); + } + fs->ClearPrefetchCount(); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + if (level == 0) { + ASSERT_EQ(buff_prefetch_count, 0); + } else { + ASSERT_GT(buff_prefetch_count, 0); + } + buff_prefetch_count = 0; + } + } + } + + if (!support_prefetch) { + ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_P(PrefetchTest, PrefetchWhenReseek) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). + */ + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch Data + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); // Prefetch Data + iter->Seek(BuildKey(1019)); + // Missed 2 blocks but they are already in buffer so no reset. + iter->Seek(BuildKey(103)); // Already in buffer. 
+ iter->Seek(BuildKey(1033)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 3); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from non sequential data blocks within same partitioned + * index. buff_prefetch_count will be 0 in that case. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1048)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reesek keys from Single Data Block. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1)); + iter->Seek(BuildKey(10)); + iter->Seek(BuildKey(100)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from sequential data blocks to set implicit auto readahead + * and prefetch data but after that iterate over different (non sequential) + * data blocks which won't prefetch any data further. So buff_prefetch_count + * will be 1 for the first one. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1008)); + iter->Seek( + BuildKey(996)); // Reseek won't prefetch any data and + // readahead_size will be initiallized to 8*1024. 
+ iter->Seek(BuildKey(992)); + iter->Seek(BuildKey(989)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + + // Read sequentially to confirm readahead_size is reset to initial value (2 + // more data blocks) + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); // Prefetch Data + iter->Seek(BuildKey(1022)); + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 2); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + { + /* Reseek keys from sequential partitioned index block. Since partitioned + * index fetch are sequential, buff_prefetch_count will be 1. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1167)); + iter->Seek(BuildKey(1334)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1499)); + iter->Seek(BuildKey(1667)); + iter->Seek(BuildKey(1847)); + iter->Seek(BuildKey(1999)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek over different keys from different blocks. buff_prefetch_count is + * set 0. 
+ */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + int i = 0; + int j = 1000; + do { + iter->Seek(BuildKey(i)); + if (!iter->Valid()) { + break; + } + i = i + 100; + iter->Seek(BuildKey(j)); + j = j + 100; + } while (i < 1000 && j < kNumKeys && iter->Valid()); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* Iterates sequentially over all keys. It will prefetch the buffer.*/ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + } + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 13); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 13); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} + +TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + 
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). 
+ */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + // Warm up the cache + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + // After caching, blocks will be read from cache (Sequential blocks) + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch data (not in cache). + // Missed one sequential block but next is in already in buffer so readahead + // will not be reset. + iter->Seek(BuildKey(1011)); + // Prefetch data but blocks are in cache so no prefetch and reset. + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1022)); + // Prefetch data with readahead_size = 4 blocks. + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1037)); + + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} + +class PrefetchTest1 + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + PrefetchTest1() : DBTestBase("prefetch_test1", true) {} +}; + +INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest1, DBIterLevelReadAhead) { + const int kNumKeys = 1000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + bool is_adaptive_readahead = std::get<1>(GetParam()); + 
Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (std::get<0>(GetParam())) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (std::get<0>(GetParam()) && + (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int j = 0; j < 5; j++) { + for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + int buff_prefetch_count = 0; + int readahead_carry_over_count = 0; + int num_sst_files = NumTableFilesAtLevel(2); + size_t current_readahead_size = 0; + + // Test - Iterate over the keys sequentially. + { + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + // The callback checks, since reads are sequential, readahead_size doesn't + // start from 8KB when iterator moves to next file and its called + // num_sst_files-1 times (excluding for first file). 
+ SyncPoint::GetInstance()->SetCallBack( + "BlockPrefetcher::SetReadaheadState", [&](void* arg) { + readahead_carry_over_count++; + size_t readahead_size = *reinterpret_cast(arg); + if (readahead_carry_over_count) { + ASSERT_GT(readahead_size, 8 * 1024); + // ASSERT_GE(readahead_size, current_readahead_size); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { + current_readahead_size = *reinterpret_cast(arg); + ASSERT_GT(current_readahead_size, 0); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ro; + if (is_adaptive_readahead) { + ro.adaptive_readahead = true; + } + auto iter = std::unique_ptr(db_->NewIterator(ro)); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + + ASSERT_GT(buff_prefetch_count, 0); + buff_prefetch_count = 0; + // For index and data blocks. + if (is_adaptive_readahead) { + ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1)); + } else { + ASSERT_EQ(readahead_carry_over_count, 0); + } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + Close(); +} +#endif //! 
ROCKSDB_LITE + +class PrefetchTest2 : public DBTestBase, + public ::testing::WithParamInterface { + public: + PrefetchTest2() : DBTestBase("prefetch_test2", true) {} +}; + +INSTANTIATE_TEST_CASE_P(PrefetchTest2, PrefetchTest2, ::testing::Bool()); + +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest2, NonSequentialReads) { + const int kNumKeys = 1000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (GetParam()) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int j = 0; j < 5; j++) { + for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + int buff_prefetch_count = 0; + int set_readahead = 0; + size_t readahead_size = 0; + + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack( + "BlockPrefetcher::SetReadaheadState", + [&](void* /*arg*/) { set_readahead++; }); + SyncPoint::GetInstance()->SetCallBack( + 
"FilePrefetchBuffer::TryReadFromCache", + [&](void* arg) { readahead_size = *reinterpret_cast(arg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + + { + // Iterate until prefetch is done. + ReadOptions ro; + ro.adaptive_readahead = true; + auto iter = std::unique_ptr(db_->NewIterator(ro)); + iter->SeekToFirst(); + while (iter->Valid() && buff_prefetch_count == 0) { + iter->Next(); + } + ASSERT_EQ(readahead_size, 8 * 1024); + ASSERT_EQ(buff_prefetch_count, 1); + ASSERT_EQ(set_readahead, 0); + buff_prefetch_count = 0; + + // Move to last file and check readahead size fallbacks to 8KB. So next + // readahead size after prefetch should be 8 * 1024; + iter->Seek(BuildKey(4004)); + while (iter->Valid() && buff_prefetch_count == 0) { + iter->Next(); + } + ASSERT_EQ(readahead_size, 8 * 1024); + ASSERT_EQ(set_readahead, 0); + ASSERT_EQ(buff_prefetch_count, 1); + } + Close(); +} +#endif //! ROCKSDB_LITE + +TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) { + const int kNumKeys = 2000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (GetParam()) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test 
+ return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + int buff_prefetch_count = 0; + size_t current_readahead_size = 0; + size_t expected_current_readahead_size = 8 * 1024; + size_t decrease_readahead_size = 8 * 1024; + + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { + current_readahead_size = *reinterpret_cast(arg); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + ReadOptions ro; + ro.adaptive_readahead = true; + { + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data + * more initially (2 more data blocks). + */ + auto iter = std::unique_ptr(db_->NewIterator(ro)); + // Warm up the cache + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + buff_prefetch_count = 0; + } + { + // After caching, blocks will be read from cache (Sequential blocks) + auto iter = std::unique_ptr(db_->NewIterator(ro)); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch data (not in cache). + ASSERT_EQ(current_readahead_size, expected_current_readahead_size); + + // Missed one sequential block but 1011 is already in buffer so + // readahead will not be reset. 
+ iter->Seek(BuildKey(1011)); + ASSERT_EQ(current_readahead_size, expected_current_readahead_size); + + // Eligible to Prefetch data (not in buffer) but block is in cache so no + // prefetch will happen and will result in decrease in readahead_size. + // readahead_size will be 8 * 1024 + iter->Seek(BuildKey(1015)); + expected_current_readahead_size -= decrease_readahead_size; + + // 1016 is the same block as 1015. So no change in readahead_size. + iter->Seek(BuildKey(1016)); + + // Prefetch data (not in buffer) but found in cache. So decrease + // readahead_size. Since it will 0 after decrementing so readahead_size will + // be set to initial value. + iter->Seek(BuildKey(1019)); + expected_current_readahead_size = std::max( + decrease_readahead_size, + (expected_current_readahead_size >= decrease_readahead_size + ? (expected_current_readahead_size - decrease_readahead_size) + : 0)); + + // Prefetch next sequential data. + iter->Seek(BuildKey(1022)); + ASSERT_EQ(current_readahead_size, expected_current_readahead_size); + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,20 +12,130 @@ #include #include +#include "file/file_util.h" #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "table/format.h" #include "test_util/sync_point.h" #include "util/random.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status 
RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, - char* scratch, bool for_compaction) const { - Status s; +inline void IOStatsAddBytesByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, value); + break; + default: + break; + } +} + +inline void IOStatsAddCountByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, value); + break; + default: + break; + } +} + +inline void StatisticAddBytesByTemperature(Statistics* stats, + Temperature file_temperature, + size_t value) { + if (stats == nullptr || file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + RecordTick(stats, HOT_FILE_READ_BYTES, value); + break; + case Temperature::kWarm: + RecordTick(stats, WARM_FILE_READ_BYTES, value); + break; + case Temperature::kCold: + RecordTick(stats, COLD_FILE_READ_BYTES, value); + break; + default: + break; + } +} + +inline void StatisticAddCountByTemperature(Statistics* stats, + Temperature file_temperature, + size_t value) { + if (stats == nullptr || file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + RecordTick(stats, 
HOT_FILE_READ_COUNT, value); + break; + case Temperature::kWarm: + RecordTick(stats, WARM_FILE_READ_COUNT, value); + break; + case Temperature::kCold: + RecordTick(stats, COLD_FILE_READ_COUNT, value); + break; + default: + break; + } +} + +IOStatus RandomAccessFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new RandomAccessFileReader(std::move(file), fname)); + } + return io_s; +} + +IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, + size_t n, Slice* result, char* scratch, + AlignedBuf* aligned_buf, + bool for_compaction) const { + (void)aligned_buf; + + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr); + + // To be paranoid: modify scratch a little bit, so in case underlying + // FileSystem doesn't fill the buffer but return success and `scratch` returns + // contains a previous block, returned value will not pass checksum. + if (n > 0 && scratch != nullptr) { + // This byte might not change anything for direct I/O case, but it's OK. + scratch[0]++; + } + + IOStatus io_s; uint64_t elapsed = 0; { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -53,32 +163,47 @@ } Slice tmp; - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; uint64_t orig_offset = 0; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); orig_offset = aligned_offset + buf.CurrentSize(); } + { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, - IOOptions(), &tmp, buf.Destination(), nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + // Only user reads are expected to specify a timeout. And user reads + // are not subjected to rate_limiter and should go through only + // one iteration of this loop, so we don't need to check and adjust + // the opts.timeout before calling file_->Read + assert(!opts.timeout.count() || allowed == read_size); + io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts, + &tmp, buf.Destination(), nullptr); } if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, - s); + io_s); + if (!io_s.ok()) { + NotifyOnIOError(io_s, FileOperationType::kRead, file_name(), + tmp.size(), orig_offset); + } } buf.Size(buf.CurrentSize() + tmp.size()); - if (!s.ok() || tmp.size() < allowed) { + if (!io_s.ok() || tmp.size() < allowed) { break; } } size_t res_len = 0; - if (s.ok() && offset_advance < buf.CurrentSize()) { - res_len = buf.Read(scratch, offset_advance, - std::min(buf.CurrentSize() - offset_advance, n)); + if (io_s.ok() && offset_advance < buf.CurrentSize()) { + res_len = std::min(buf.CurrentSize() - offset_advance, n); + if (aligned_buf == nullptr) { + buf.Read(scratch, offset_advance, res_len); + } else { + scratch = buf.BufferStart() + offset_advance; + 
aligned_buf->reset(buf.Release()); + } } *result = Slice(scratch, res_len); #endif // !ROCKSDB_LITE @@ -103,24 +228,34 @@ Slice tmp_result; #ifndef ROCKSDB_LITE - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); } #endif + { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->Read(offset + pos, allowed, IOOptions(), &tmp_result, - scratch + pos, nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + // Only user reads are expected to specify a timeout. And user reads + // are not subjected to rate_limiter and should go through only + // one iteration of this loop, so we don't need to check and adjust + // the opts.timeout before calling file_->Read + assert(!opts.timeout.count() || allowed == n); + io_s = file_->Read(offset + pos, allowed, opts, &tmp_result, + scratch + pos, nullptr); } #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, - finish_ts, s); + finish_ts, io_s); + + if (!io_s.ok()) { + NotifyOnIOError(io_s, FileOperationType::kRead, file_name(), + tmp_result.size(), offset + pos); + } } #endif - if (res_scratch == nullptr) { // we can't simply use `scratch` because reads of mmap'd files return // data in a different buffer. @@ -130,53 +265,194 @@ assert(tmp_result.data() == res_scratch + pos); } pos += tmp_result.size(); - if (!s.ok() || tmp_result.size() < allowed) { + if (!io_s.ok() || tmp_result.size() < allowed) { break; } } - *result = Slice(res_scratch, s.ok() ? pos : 0); + *result = Slice(res_scratch, io_s.ok() ? 
pos : 0); } - IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + IOSTATS_ADD(bytes_read, result->size()); + IOStatsAddBytesByTemperature(file_temperature_, result->size()); + IOStatsAddCountByTemperature(file_temperature_, 1); + StatisticAddBytesByTemperature(stats_, file_temperature_, result->size()); + StatisticAddCountByTemperature(stats_, file_temperature_, 1); SetPerfLevel(prev_perf_level); } if (stats_ != nullptr && file_read_hist_ != nullptr) { file_read_hist_->Add(elapsed); } - return s; + return io_s; +} + +size_t End(const FSReadRequest& r) { + return static_cast(r.offset) + r.len; +} + +FSReadRequest Align(const FSReadRequest& r, size_t alignment) { + FSReadRequest req; + req.offset = static_cast( + TruncateToPageBoundary(alignment, static_cast(r.offset))); + req.len = Roundup(End(r), alignment) - req.offset; + req.scratch = nullptr; + return req; } -Status RandomAccessFileReader::MultiRead(FSReadRequest* read_reqs, - size_t num_reqs) const { - Status s; +bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) { + size_t dest_offset = static_cast(dest->offset); + size_t src_offset = static_cast(src.offset); + size_t dest_end = End(*dest); + size_t src_end = End(src); + if (std::max(dest_offset, src_offset) > std::min(dest_end, src_end)) { + return false; + } + dest->offset = static_cast(std::min(dest_offset, src_offset)); + dest->len = std::max(dest_end, src_end) - dest->offset; + return true; +} + +IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, + FSReadRequest* read_reqs, + size_t num_reqs, + AlignedBuf* aligned_buf) const { + (void)aligned_buf; // suppress warning of unused variable in LITE mode + assert(num_reqs > 0); + +#ifndef NDEBUG + for (size_t i = 0; i < num_reqs - 1; ++i) { + assert(read_reqs[i].offset <= read_reqs[i + 1].offset); + } +#endif // !NDEBUG + + // To be paranoid modify scratch a little bit, so in case underlying + // FileSystem doesn't fill the buffer but return succee and `scratch` returns + // contains 
a previous block, returned value will not pass checksum. + // This byte might not change anything for direct I/O case, but it's OK. + for (size_t i = 0; i < num_reqs; i++) { + FSReadRequest& r = read_reqs[i]; + if (r.len > 0 && r.scratch != nullptr) { + r.scratch[0]++; + } + } + + IOStatus io_s; uint64_t elapsed = 0; - assert(!use_direct_io()); { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); IOSTATS_TIMER_GUARD(read_nanos); + FSReadRequest* fs_reqs = read_reqs; + size_t num_fs_reqs = num_reqs; +#ifndef ROCKSDB_LITE + std::vector aligned_reqs; + if (use_direct_io()) { + // num_reqs is the max possible size, + // this can reduce std::vecector's internal resize operations. + aligned_reqs.reserve(num_reqs); + // Align and merge the read requests. + size_t alignment = file_->GetRequiredBufferAlignment(); + for (size_t i = 0; i < num_reqs; i++) { + const auto& r = Align(read_reqs[i], alignment); + if (i == 0) { + // head + aligned_reqs.push_back(r); + + } else if (!TryMerge(&aligned_reqs.back(), r)) { + // head + n + aligned_reqs.push_back(r); + + } else { + // unused + r.status.PermitUncheckedError(); + } + } + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs", + &aligned_reqs); + + // Allocate aligned buffer and let scratch buffers point to it. 
+ size_t total_len = 0; + for (const auto& r : aligned_reqs) { + total_len += r.len; + } + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(total_len); + char* scratch = buf.BufferStart(); + for (auto& r : aligned_reqs) { + r.scratch = scratch; + scratch += r.len; + } + + aligned_buf->reset(buf.Release()); + fs_reqs = aligned_reqs.data(); + num_fs_reqs = aligned_reqs.size(); + } +#endif // ROCKSDB_LITE + #ifndef ROCKSDB_LITE - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); } #endif // ROCKSDB_LITE + { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->MultiRead(read_reqs, num_reqs, IOOptions(), nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr); } + +#ifndef ROCKSDB_LITE + if (use_direct_io()) { + // Populate results in the unaligned read requests. + size_t aligned_i = 0; + for (size_t i = 0; i < num_reqs; i++) { + auto& r = read_reqs[i]; + if (static_cast(r.offset) > End(aligned_reqs[aligned_i])) { + aligned_i++; + } + const auto& fs_r = fs_reqs[aligned_i]; + r.status = fs_r.status; + if (r.status.ok()) { + uint64_t offset = r.offset - fs_r.offset; + if (fs_r.result.size() <= offset) { + // No byte in the read range is returned. 
+ r.result = Slice(); + } else { + size_t len = std::min( + r.len, static_cast(fs_r.result.size() - offset)); + r.result = Slice(fs_r.scratch + offset, len); + } + } else { + r.result = Slice(); + } + } + } +#endif // ROCKSDB_LITE + for (size_t i = 0; i < num_reqs; ++i) { #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(), start_ts, finish_ts, read_reqs[i].status); } + if (!read_reqs[i].status.ok()) { + NotifyOnIOError(read_reqs[i].status, FileOperationType::kRead, + file_name(), read_reqs[i].result.size(), + read_reqs[i].offset); + } + #endif // ROCKSDB_LITE - IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size()); + IOSTATS_ADD(bytes_read, read_reqs[i].result.size()); + IOStatsAddBytesByTemperature(file_temperature_, + read_reqs[i].result.size()); + IOStatsAddCountByTemperature(file_temperature_, 1); + StatisticAddBytesByTemperature(stats_, file_temperature_, + read_reqs[i].result.size()); + StatisticAddCountByTemperature(stats_, file_temperature_, 1); } SetPerfLevel(prev_perf_level); } @@ -184,6 +460,15 @@ file_read_hist_->Add(elapsed); } - return s; + return io_s; +} + +IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, + IOOptions& opts) { + if (clock_ != nullptr) { + return PrepareIOFromReadOptions(ro, clock_, opts); + } else { + return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts); + } } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,18 +11,34 @@ #include #include 
#include + +#include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/listener.h" +#include "rocksdb/options.h" #include "rocksdb/rate_limiter.h" #include "util/aligned_buffer.h" namespace ROCKSDB_NAMESPACE { class Statistics; class HistogramImpl; +class SystemClock; + +using AlignedBuf = std::unique_ptr; + +// Align the request r according to alignment and return the aligned result. +FSReadRequest Align(const FSReadRequest& r, size_t alignment); -// RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. It is +// Try to merge src to dest if they have overlap. +// +// Each request represents an inclusive interval [offset, offset + len]. +// If the intervals have overlap, update offset and len to represent the +// merged interval, and return true. +// Otherwise, do nothing and return false. +bool TryMerge(FSReadRequest* dest, const FSReadRequest& src); + +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is // responsible for: // - Handling Buffered and Direct reads appropriately. // - Rate limiting compaction reads. 
@@ -31,47 +47,69 @@ class RandomAccessFileReader { private: #ifndef ROCKSDB_LITE - void NotifyOnFileReadFinish(uint64_t offset, size_t length, - const FileOperationInfo::TimePoint& start_ts, - const FileOperationInfo::TimePoint& finish_ts, - const Status& status) const { - FileOperationInfo info(file_name_, start_ts, finish_ts); + void NotifyOnFileReadFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts, + finish_ts, status); info.offset = offset; info.length = length; - info.status = status; for (auto& listener : listeners_) { listener->OnFileReadFinish(info); } + info.status.PermitUncheckedError(); } + + void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation, + const std::string& file_path, size_t length, + uint64_t offset) const { + if (listeners_.empty()) { + return; + } + IOErrorInfo io_error_info(io_status, operation, file_path, length, offset); + + for (auto& listener : listeners_) { + listener->OnIOError(io_error_info); + } + io_status.PermitUncheckedError(); + } + #endif // ROCKSDB_LITE bool ShouldNotifyListeners() const { return !listeners_.empty(); } - std::unique_ptr file_; + FSRandomAccessFilePtr file_; std::string file_name_; - Env* env_; + SystemClock* clock_; Statistics* stats_; uint32_t hist_type_; HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; std::vector> listeners_; + Temperature file_temperature_; public: explicit RandomAccessFileReader( - std::unique_ptr&& raf, std::string _file_name, - Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, + std::unique_ptr&& raf, const std::string& _file_name, + SystemClock* clock = nullptr, + const std::shared_ptr& io_tracer = nullptr, + Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = 
nullptr, - const std::vector>& listeners = {}) - : file_(std::move(raf)), + const std::vector>& listeners = {}, + Temperature file_temperature = Temperature::kUnknown) + : file_(std::move(raf), io_tracer, _file_name), file_name_(std::move(_file_name)), - env_(env), + clock_(clock), stats_(stats), hist_type_(hist_type), file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), - listeners_() { + listeners_(), + file_temperature_(file_temperature) { #ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), [this](const std::shared_ptr& e) { @@ -84,37 +122,45 @@ #endif } - RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT { - *this = std::move(o); - } - - RandomAccessFileReader& operator=(RandomAccessFileReader&& o) - ROCKSDB_NOEXCEPT { - file_ = std::move(o.file_); - env_ = std::move(o.env_); - stats_ = std::move(o.stats_); - hist_type_ = std::move(o.hist_type_); - file_read_hist_ = std::move(o.file_read_hist_); - rate_limiter_ = std::move(o.rate_limiter_); - return *this; - } - + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); RandomAccessFileReader(const RandomAccessFileReader&) = delete; RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, - bool for_compaction = false) const; + // In non-direct IO mode, + // 1. if using mmap, result is stored in a buffer other than scratch; + // 2. if not using mmap, result is stored in the buffer starting from scratch. + // + // In direct IO mode, an aligned buffer is allocated internally. + // 1. If aligned_buf is null, then results are copied to the buffer + // starting from scratch; + // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns + // the internally allocated buffer on return, and the result refers to a + // region in aligned_buf. 
+ IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, + char* scratch, AlignedBuf* aligned_buf, + bool for_compaction = false) const; + + // REQUIRES: + // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing. + // In non-direct IO mode, aligned_buf should be null; + // In direct IO mode, aligned_buf stores the aligned buffer allocated inside + // MultiRead, the result Slices in reqs refer to aligned_buf. + IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs, + size_t num_reqs, AlignedBuf* aligned_buf) const; - Status MultiRead(FSReadRequest* reqs, size_t num_reqs) const; - - Status Prefetch(uint64_t offset, size_t n) const { + IOStatus Prefetch(uint64_t offset, size_t n) const { return file_->Prefetch(offset, n, IOOptions(), nullptr); } FSRandomAccessFile* file() { return file_.get(); } - std::string file_name() const { return file_name_; } + const std::string& file_name() const { return file_name_; } bool use_direct_io() const { return file_->use_direct_io(); } + + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,483 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "file/random_access_file_reader.h" + +#include + +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/file_system.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFileReaderTest : public testing::Test { + public: + void SetUp() override { + SetupSyncPointsToMockDirectIO(); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + test_dir_ = test::PerThreadDBPath("random_access_file_reader_test"); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + } + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + void Write(const std::string& fname, const std::string& content) { + std::unique_ptr f; + ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void Read(const std::string& fname, const FileOptions& opts, + std::unique_ptr* reader) { + std::string fpath = Path(fname); + std::unique_ptr f; + ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), fpath, + env_->GetSystemClock().get())); + } + + void AssertResult(const std::string& content, + const std::vector& reqs) { + for (const auto& r : reqs) { + ASSERT_OK(r.status); + ASSERT_EQ(r.len, r.result.size()); + ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString()); + } + } + + private: + Env* env_; + std::shared_ptr fs_; + std::string test_dir_; + + std::string Path(const std::string& fname) { + return test_dir_ + "/" + fname; + } +}; + +// Skip the following tests in lite mode since direct I/O is unsupported. 
+#ifndef ROCKSDB_LITE + +TEST_F(RandomAccessFileReaderTest, ReadDirectIO) { + std::string fname = "read-direct-io"; + Random rand(0); + std::string content = rand.RandomString(kDefaultPageSize); + Write(fname, content); + + FileOptions opts; + opts.use_direct_reads = true; + std::unique_ptr r; + Read(fname, opts, &r); + ASSERT_TRUE(r->use_direct_io()); + + const size_t page_size = r->file()->GetRequiredBufferAlignment(); + size_t offset = page_size / 2; + size_t len = page_size / 3; + Slice result; + AlignedBuf buf; + for (bool for_compaction : {true, false}) { + ASSERT_OK(r->Read(IOOptions(), offset, len, &result, nullptr, &buf, + for_compaction)); + ASSERT_EQ(result.ToString(), content.substr(offset, len)); + } +} + +TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { + std::vector aligned_reqs; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* reqs) { + // Copy reqs, since it's allocated on stack inside MultiRead, which will + // be deallocated after MultiRead returns. + aligned_reqs = *reinterpret_cast*>(reqs); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Creates a file with 3 pages. + std::string fname = "multi-read-direct-io"; + Random rand(0); + std::string content = rand.RandomString(3 * kDefaultPageSize); + Write(fname, content); + + FileOptions opts; + opts.use_direct_reads = true; + std::unique_ptr r; + Read(fname, opts, &r); + ASSERT_TRUE(r->use_direct_io()); + + const size_t page_size = r->file()->GetRequiredBufferAlignment(); + + { + // Reads 2 blocks in the 1st page. + // The results should be SharedSlices of the same underlying buffer. 
+ // + // Illustration (each x is a 1/4 page) + // First page: xxxx + // 1st block: x + // 2nd block: xx + FSReadRequest r0; + r0.offset = 0; + r0.len = page_size / 4; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = page_size / 2; + r1.len = page_size / 2; + r1.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the first page internally. + ASSERT_EQ(aligned_reqs.size(), 1); + const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 0); + ASSERT_EQ(aligned_r.len, page_size); + } + + { + // Reads 3 blocks: + // 1st block in the 1st page; + // 2nd block from the middle of the 1st page to the middle of the 2nd page; + // 3rd block in the 2nd page. + // The results should be SharedSlices of the same underlying buffer. + // + // Illustration (each x is a 1/4 page) + // 2 pages: xxxxxxxx + // 1st block: x + // 2nd block: xxxx + // 3rd block: x + FSReadRequest r0; + r0.offset = 0; + r0.len = page_size / 4; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = page_size / 2; + r1.len = page_size; + r1.scratch = nullptr; + + FSReadRequest r2; + r2.offset = 2 * page_size - page_size / 4; + r2.len = page_size / 4; + r2.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + reqs.push_back(std::move(r2)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the first two pages in one request internally. 
+ ASSERT_EQ(aligned_reqs.size(), 1); + const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 0); + ASSERT_EQ(aligned_r.len, 2 * page_size); + } + + { + // Reads 3 blocks: + // 1st block in the middle of the 1st page; + // 2nd block in the middle of the 2nd page; + // 3rd block in the middle of the 3rd page. + // The results should be SharedSlices of the same underlying buffer. + // + // Illustration (each x is a 1/4 page) + // 3 pages: xxxxxxxxxxxx + // 1st block: xx + // 2nd block: xx + // 3rd block: xx + FSReadRequest r0; + r0.offset = page_size / 4; + r0.len = page_size / 2; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = page_size + page_size / 4; + r1.len = page_size / 2; + r1.scratch = nullptr; + + FSReadRequest r2; + r2.offset = 2 * page_size + page_size / 4; + r2.len = page_size / 2; + r2.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + reqs.push_back(std::move(r2)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the first 3 pages in one request internally. + ASSERT_EQ(aligned_reqs.size(), 1); + const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 0); + ASSERT_EQ(aligned_r.len, 3 * page_size); + } + + { + // Reads 2 blocks: + // 1st block in the middle of the 1st page; + // 2nd block in the middle of the 3rd page. + // The results are two different buffers. 
+ // + // Illustration (each x is a 1/4 page) + // 3 pages: xxxxxxxxxxxx + // 1st block: xx + // 2nd block: xx + FSReadRequest r0; + r0.offset = page_size / 4; + r0.len = page_size / 2; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = 2 * page_size + page_size / 4; + r1.len = page_size / 2; + r1.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the 1st and 3rd pages in two requests internally. + ASSERT_EQ(aligned_reqs.size(), 2); + const FSReadRequest& aligned_r0 = aligned_reqs[0]; + const FSReadRequest& aligned_r1 = aligned_reqs[1]; + ASSERT_OK(aligned_r0.status); + ASSERT_EQ(aligned_r0.offset, 0); + ASSERT_EQ(aligned_r0.len, page_size); + ASSERT_OK(aligned_r1.status); + ASSERT_EQ(aligned_r1.offset, 2 * page_size); + ASSERT_EQ(aligned_r1.len, page_size); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +#endif // ROCKSDB_LITE + +TEST(FSReadRequest, Align) { + FSReadRequest r; + r.offset = 2000; + r.len = 2000; + r.scratch = nullptr; + ASSERT_OK(r.status); + + FSReadRequest aligned_r = Align(r, 1024); + ASSERT_OK(r.status); + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 1024); + ASSERT_EQ(aligned_r.len, 3072); +} + +TEST(FSReadRequest, TryMerge) { + // reverse means merging dest into src. 
+ for (bool reverse : {true, false}) { + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 15; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_FALSE(TryMerge(&dest, src)); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 10; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 20); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 5; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 15); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 5; + src.len = 5; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 5; + src.len = 1; + src.scratch = nullptr; + 
ASSERT_OK(src.status); + + if (reverse) std::swap(dest, src); + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 0; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) std::swap(dest, src); + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 0; + src.len = 5; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) std::swap(dest, src); + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,48 +17,13 @@ IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, std::unique_ptr* result, const FileOptions& options) { + TEST_SYNC_POINT_CALLBACK("NewWritableFile::FileOptions.temperature", + const_cast(&options.temperature)); IOStatus s = fs->NewWritableFile(fname, options, result, nullptr); - TEST_KILL_RANDOM("NewWritableFile:0", 
rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2); return s; } -bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result) { - const int kBufferSize = 8192; - char buffer[kBufferSize + 1]; - Slice input_slice; - - std::string line; - bool has_complete_line = false; - while (!has_complete_line) { - if (std::getline(*iss, line)) { - has_complete_line = !iss->eof(); - } else { - has_complete_line = false; - } - if (!has_complete_line) { - // if we're not sure whether we have a complete line, - // further read from the file. - if (*has_data) { - *result = seq_file_reader->Read(kBufferSize, &input_slice, buffer); - } - if (input_slice.size() == 0) { - // meaning we have read all the data - *has_data = false; - break; - } else { - iss->str(line + input_slice.ToString()); - // reset the internal state of iss so that we can keep reading it. - iss->clear(); - *has_data = (input_slice.size() == kBufferSize); - continue; - } - } - } - *output = line; - return *has_data || has_complete_line; -} - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size) { return off % sector_size == 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,10 +24,6 @@ std::unique_ptr* result, const FileOptions& options); -// Read a single line from a file. 
-bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result); - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size); #endif // NDEBUG diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_file_info.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_file_info.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_file_info.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_file_info.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// struct ReadaheadFileInfo contains readahead information that is passed from +// one file to another file per level during iterations. This information helps +// iterators to carry forward the internal automatic prefetching readahead value +// to next file during sequential reads instead of starting from the scratch. + +struct ReadaheadFileInfo { + struct ReadaheadInfo { + size_t readahead_size = 0; + int64_t num_file_reads = 0; + }; + + // Used by Data block iterators to update readahead info. + ReadaheadInfo data_block_readahead_info; + + // Used by Index block iterators to update readahead info. 
+ ReadaheadInfo index_block_readahead_info; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,15 +11,17 @@ #include #include + #include "file/read_write_util.h" +#include "rocksdb/file_system.h" #include "util/aligned_buffer.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { namespace { -class ReadaheadRandomAccessFile : public RandomAccessFile { +class ReadaheadRandomAccessFile : public FSRandomAccessFile { public: - ReadaheadRandomAccessFile(std::unique_ptr&& file, + ReadaheadRandomAccessFile(std::unique_ptr&& file, size_t readahead_size) : file_(std::move(file)), alignment_(file_->GetRequiredBufferAlignment()), @@ -35,11 +37,12 @@ ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete; - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { // Read-ahead only make sense if we have some slack left after reading if (n + alignment_ >= readahead_size_) { - return file_->Read(offset, n, result, scratch); + return file_->Read(offset, n, options, result, scratch, dbg); } std::unique_lock lk(lock_); @@ -53,14 +56,14 @@ (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { // We read exactly what we needed, or we hit end of file - return. 
*result = Slice(scratch, cached_len); - return Status::OK(); + return IOStatus::OK(); } size_t advanced_offset = static_cast(offset + cached_len); // In the case of cache hit advanced_offset is already aligned, means that // chunk_offset equals to advanced_offset size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); - Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + IOStatus s = ReadIntoBuffer(chunk_offset, readahead_size_, options, dbg); if (s.ok()) { // The data we need is now in cache, so we can safely read it size_t remaining_len; @@ -71,11 +74,12 @@ return s; } - Status Prefetch(uint64_t offset, size_t n) override { + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { if (n < readahead_size_) { // Don't allow smaller prefetches than the configured `readahead_size_`. // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. - return Status::OK(); + return IOStatus::OK(); } std::unique_lock lk(lock_); @@ -83,10 +87,11 @@ size_t offset_ = static_cast(offset); size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); if (prefetch_offset == buffer_offset_) { - return Status::OK(); + return IOStatus::OK(); } return ReadIntoBuffer(prefetch_offset, - Roundup(offset_ + n, alignment_) - prefetch_offset); + Roundup(offset_ + n, alignment_) - prefetch_offset, + options, dbg); } size_t GetUniqueId(char* id, size_t max_size) const override { @@ -95,7 +100,7 @@ void Hint(AccessPattern pattern) override { file_->Hint(pattern); } - Status InvalidateCache(size_t offset, size_t length) override { + IOStatus InvalidateCache(size_t offset, size_t length) override { std::unique_lock lk(lock_); buffer_.Clear(); return file_->InvalidateCache(offset, length); @@ -125,14 +130,16 @@ // Reads into buffer_ the next n bytes from file_ starting at offset. // Can actually read less if EOF was reached. // Returns the status of the read operastion on the file. 
- Status ReadIntoBuffer(uint64_t offset, size_t n) const { + IOStatus ReadIntoBuffer(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) const { if (n > buffer_.Capacity()) { n = buffer_.Capacity(); } assert(IsFileSectorAligned(offset, alignment_)); assert(IsFileSectorAligned(n, alignment_)); Slice result; - Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + IOStatus s = + file_->Read(offset, n, options, &result, buffer_.BufferStart(), dbg); if (s.ok()) { buffer_offset_ = offset; buffer_.Size(result.size()); @@ -141,7 +148,7 @@ return s; } - const std::unique_ptr file_; + const std::unique_ptr file_; const size_t alignment_; const size_t readahead_size_; @@ -153,9 +160,9 @@ }; } // namespace -std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size) { - std::unique_ptr result( +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( new ReadaheadRandomAccessFile(std::move(file), readahead_size)); return result; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include -#include "rocksdb/env.h" +#include + +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { +class FSRandomAccessFile; // This file provides the following main abstractions: // SequentialFileReader : wrapper over Env::SequentialFile // RandomAccessFileReader : wrapper over Env::RandomAccessFile @@ -22,6 +24,6 @@ // NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to // always prefetch additional data with every read. 
This is mainly used in // Compaction Table Readers. -std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size); +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -22,8 +22,20 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { - Status s; +IOStatus SequentialFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new SequentialFileReader(std::move(file), fname)); + } + return io_s; +} + +IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + IOStatus io_s; if (use_direct_io()) { #ifndef ROCKSDB_LITE size_t offset = offset_.fetch_add(n); @@ -35,28 +47,64 @@ AlignedBuffer buf; buf.Alignment(alignment); buf.AllocateNewBuffer(size); + Slice tmp; - s = file_->PositionedRead(aligned_offset, size, IOOptions(), &tmp, - buf.BufferStart(), nullptr); - if (s.ok() && offset_advance < tmp.size()) { + uint64_t orig_offset = 0; + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + orig_offset = aligned_offset + buf.CurrentSize(); + start_ts = FileOperationInfo::StartNow(); + } + io_s = file_->PositionedRead(aligned_offset, size, IOOptions(), &tmp, + buf.BufferStart(), nullptr); + if (io_s.ok() && offset_advance < tmp.size()) { 
buf.Size(tmp.size()); r = buf.Read(scratch, offset_advance, std::min(tmp.size() - offset_advance, n)); } *result = Slice(scratch, r); + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + io_s); + } #endif // !ROCKSDB_LITE } else { - s = file_->Read(n, IOOptions(), result, scratch, nullptr); + // To be paranoid, modify scratch a little bit, so in case underlying + // FileSystem doesn't fill the buffer but return succee and `scratch` + // returns contains a previous block, returned value will not pass + // checksum. + // It's hard to find useful byte for direct I/O case, so we skip it. + if (n > 0 && scratch != nullptr) { + scratch[0]++; + } + +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + + io_s = file_->Read(n, IOOptions(), result, scratch, nullptr); + +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + size_t offset = offset_.fetch_add(result->size()); + NotifyOnFileReadFinish(offset, result->size(), start_ts, finish_ts, io_s); + } +#endif } IOSTATS_ADD(bytes_read, result->size()); - return s; + return io_s; } -Status SequentialFileReader::Skip(uint64_t n) { +IOStatus SequentialFileReader::Skip(uint64_t n) { #ifndef ROCKSDB_LITE if (use_direct_io()) { offset_ += static_cast(n); - return Status::OK(); + return IOStatus::OK(); } #endif // !ROCKSDB_LITE return file_->Skip(n); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,8 @@ #pragma once #include #include + +#include 
"env/file_system_tracer.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" @@ -21,36 +23,82 @@ // cache disabled) reads appropriately, and also updates the IO stats. class SequentialFileReader { private: - std::unique_ptr file_; +#ifndef ROCKSDB_LITE + void NotifyOnFileReadFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts, + finish_ts, status); + info.offset = offset; + info.length = length; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + info.status.PermitUncheckedError(); + } + + void AddFileIOListeners( + const std::vector>& listeners) { + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + std::string file_name_; + FSSequentialFilePtr file_; std::atomic offset_{0}; // read offset + std::vector> listeners_{}; public: - explicit SequentialFileReader(std::unique_ptr&& _file, - const std::string& _file_name) - : file_(std::move(_file)), file_name_(_file_name) {} - - explicit SequentialFileReader(std::unique_ptr&& _file, - const std::string& _file_name, - size_t _readahead_size) - : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)), - file_name_(_file_name) {} - - SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { - *this = std::move(o); + explicit SequentialFileReader( + std::unique_ptr&& _file, const std::string& _file_name, + const std::shared_ptr& io_tracer = nullptr, + const std::vector>& listeners = {}) + : file_name_(_file_name), + file_(std::move(_file), io_tracer, _file_name), + listeners_() { +#ifndef ROCKSDB_LITE + AddFileIOListeners(listeners); 
+#else + (void)listeners; +#endif } - SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { - file_ = std::move(o.file_); - return *this; + explicit SequentialFileReader( + std::unique_ptr&& _file, const std::string& _file_name, + size_t _readahead_size, + const std::shared_ptr& io_tracer = nullptr, + const std::vector>& listeners = {}) + : file_name_(_file_name), + file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size), + io_tracer, _file_name), + listeners_() { +#ifndef ROCKSDB_LITE + AddFileIOListeners(listeners); +#else + (void)listeners; +#endif } + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); SequentialFileReader(const SequentialFileReader&) = delete; SequentialFileReader& operator=(const SequentialFileReader&) = delete; - Status Read(size_t n, Slice* result, char* scratch); + IOStatus Read(size_t n, Slice* result, char* scratch); - Status Skip(uint64_t n); + IOStatus Skip(uint64_t n); FSSequentialFile* file() { return file_.get(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,7 @@ #include #include "db/db_impl/db_impl.h" -#include "env/composite_env_wrapper.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" @@ -19,21 +19,21 @@ namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE -SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, - int64_t rate_bytes_per_sec, - double max_trash_db_ratio, - uint64_t bytes_max_delete_chunk) - : env_(env), 
+SstFileManagerImpl::SstFileManagerImpl( + const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, + double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) + : clock_(clock), fs_(fs), logger_(logger), total_files_size_(0), - in_progress_files_size_(0), compaction_buffer_size_(0), cur_compactions_reserved_size_(0), max_allowed_space_(0), - delete_scheduler_(env, fs_.get(), rate_bytes_per_sec, logger.get(), this, - max_trash_db_ratio, bytes_max_delete_chunk), + delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec, + logger.get(), this, max_trash_db_ratio, + bytes_max_delete_chunk), cv_(&mu_), closing_(false), bg_thread_(nullptr), @@ -43,6 +43,7 @@ SstFileManagerImpl::~SstFileManagerImpl() { Close(); + bg_err_.PermitUncheckedError(); } void SstFileManagerImpl::Close() { @@ -59,23 +60,24 @@ } } -Status SstFileManagerImpl::OnAddFile(const std::string& file_path, - bool compaction) { +Status SstFileManagerImpl::OnAddFile(const std::string& file_path) { uint64_t file_size; Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr); if (s.ok()) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); + OnAddFileImpl(file_path, file_size); } - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return s; } Status SstFileManagerImpl::OnAddFile(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + OnAddFileImpl(file_path, file_size); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return Status::OK(); } @@ -84,7 +86,8 @@ MutexLock l(&mu_); OnDeleteFileImpl(file_path); } - TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile", + 
const_cast(&file_path)); return Status::OK(); } @@ -98,19 +101,6 @@ } } cur_compactions_reserved_size_ -= size_added_by_compaction; - - auto new_files = c->edit()->GetNewFiles(); - for (auto& new_file : new_files) { - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - new_file.second.fd.GetNumber(), - new_file.second.fd.GetPathId()); - if (in_progress_files_.find(fn) != in_progress_files_.end()) { - auto tracked_file = tracked_files_.find(fn); - assert(tracked_file != tracked_files_.end()); - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(fn); - } - } } Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, @@ -121,7 +111,7 @@ if (file_size != nullptr) { *file_size = tracked_files_[old_path]; } - OnAddFileImpl(new_path, tracked_files_[old_path], false); + OnAddFileImpl(new_path, tracked_files_[old_path]); OnDeleteFileImpl(old_path); } TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile"); @@ -158,7 +148,7 @@ bool SstFileManagerImpl::EnoughRoomForCompaction( ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error) { + const Status& bg_error) { MutexLock l(&mu_); uint64_t size_added_by_compaction = 0; // First check if we even have the space to do the compaction @@ -183,12 +173,13 @@ // seen a NoSpace() error. 
This is tin order to contain a single potentially // misbehaving DB instance and prevent it from slowing down compactions of // other DB instances - if (CheckFreeSpace() && bg_error == Status::NoSpace()) { + if (bg_error.IsNoSpace() && CheckFreeSpace()) { auto fn = TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(), inputs[0][0]->fd.GetPathId()); uint64_t free_space = 0; - fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr); + Status s = fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr); + s.PermitUncheckedError(); // TODO: Check the status // needed_headroom is based on current size reserved by compactions, // minus any files created by running compactions as they would count // against the reserved size. If user didn't specify any compaction @@ -197,7 +188,6 @@ if (compaction_buffer_size_ == 0) { needed_headroom += reserved_disk_buffer_; } - needed_headroom -= in_progress_files_size_; if (free_space < needed_headroom + size_added_by_compaction) { // We hit the condition of not enough disk space ROCKS_LOG_ERROR(logger_, @@ -328,7 +318,7 @@ // error is also a NoSpace() non-fatal error, leave the instance in // the list Status err = cur_instance_->GetBGError(); - if (s.ok() && err == Status::NoSpace() && + if (s.ok() && err.subcode() == IOStatus::SubCode::kNoSpace && err.severity() < Status::Severity::kFatalError) { s = err; } @@ -346,7 +336,7 @@ if (!error_handler_list_.empty()) { // If there are more instances to be recovered, reschedule after 5 // seconds - int64_t wait_until = env_->NowMicros() + 5000000; + int64_t wait_until = clock_->NowMicros() + 5000000; cv_.TimedWait(wait_until); } @@ -438,24 +428,15 @@ } void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { auto tracked_file = tracked_files_.find(file_path); if (tracked_file != tracked_files_.end()) { // File was added before, we will just update the size - assert(!compaction); total_files_size_ 
-= tracked_file->second; total_files_size_ += file_size; cur_compactions_reserved_size_ -= file_size; } else { total_files_size_ += file_size; - if (compaction) { - // Keep track of the size of files created by in-progress compactions. - // When calculating whether there's enough headroom for new compactions, - // this will be subtracted from cur_compactions_reserved_size_. - // Otherwise, compactions will be double counted. - in_progress_files_size_ += file_size; - in_progress_files_.insert(file_path); - } } tracked_files_[file_path] = file_size; } @@ -464,16 +445,10 @@ auto tracked_file = tracked_files_.find(file_path); if (tracked_file == tracked_files_.end()) { // File is not tracked - assert(in_progress_files_.find(file_path) == in_progress_files_.end()); return; } total_files_size_ -= tracked_file->second; - // Check if it belonged to an in-progress compaction - if (in_progress_files_.find(file_path) != in_progress_files_.end()) { - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(file_path); - } tracked_files_.erase(tracked_file); } @@ -483,14 +458,7 @@ bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { - std::shared_ptr fs; - - if (env == Env::Default()) { - fs = FileSystem::Default(); - } else { - fs.reset(new LegacyFileSystemWrapper(env)); - } - + const auto& fs = env->GetFileSystem(); return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec, delete_existing_trash, status, max_trash_db_ratio, bytes_max_delete_chunk); @@ -503,22 +471,19 @@ bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { + const auto& clock = env->GetSystemClock(); SstFileManagerImpl* res = - new SstFileManagerImpl(env, fs, info_log, rate_bytes_per_sec, + new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec, max_trash_db_ratio, bytes_max_delete_chunk); // trash_dir is deprecated and not needed anymore, but if user passed it // 
we will still remove files in it. - Status s; + Status s = Status::OK(); if (delete_existing_trash && trash_dir != "") { std::vector files_in_trash; s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr); if (s.ok()) { for (const std::string& trash_file : files_in_trash) { - if (trash_file == "." || trash_file == "..") { - continue; - } - std::string path_in_trash = trash_dir + "/" + trash_file; res->OnAddFile(path_in_trash); Status file_delete = @@ -532,6 +497,9 @@ if (status) { *status = s; + } else { + // No one passed us a Status, so they must not care about the error... + s.PermitUncheckedError(); } return res; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,47 +12,45 @@ #include "port/port.h" #include "db/compaction/compaction.h" -#include "db/error_handler.h" #include "file/delete_scheduler.h" -#include "rocksdb/file_system.h" #include "rocksdb/sst_file_manager.h" namespace ROCKSDB_NAMESPACE { - -class Env; +class ErrorHandler; +class FileSystem; +class SystemClock; class Logger; -// SstFileManager is used to track SST files in the DB and control there -// deletion rate. -// All SstFileManager public functions are thread-safe. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. 
class SstFileManagerImpl : public SstFileManager { public: - explicit SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, + explicit SstFileManagerImpl(const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~SstFileManagerImpl(); - // DB will call OnAddFile whenever a new sst file is added. - Status OnAddFile(const std::string& file_path, bool compaction = false); + // DB will call OnAddFile whenever a new sst/blob file is added. + Status OnAddFile(const std::string& file_path); // Overload where size of the file is provided by the caller rather than // queried from the filesystem. This is an optimization. - Status OnAddFile(const std::string& file_path, uint64_t file_size, - bool compaction); + Status OnAddFile(const std::string& file_path, uint64_t file_size); - // DB will call OnDeleteFile whenever an sst file is deleted. + // DB will call OnDeleteFile whenever a sst/blob file is deleted. Status OnDeleteFile(const std::string& file_path); - // DB will call OnMoveFile whenever an sst file is move to a new path. + // DB will call OnMoveFile whenever a sst/blob file is move to a new path. Status OnMoveFile(const std::string& old_path, const std::string& new_path, uint64_t* file_size = nullptr); // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature, maximum allowed // space will be infinite (Default value). @@ -62,8 +60,8 @@ void SetCompactionBufferSize(uint64_t compaction_buffer_size) override; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. 
+ // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. bool IsMaxAllowedSpaceReached() override; @@ -77,7 +75,7 @@ // the full compaction size). bool EnoughRoomForCompaction(ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error); + const Status& bg_error); // Bookkeeping so total_file_sizes_ goes back to normal after compaction // finishes @@ -135,10 +133,14 @@ // once in the object's lifetime, and before the destructor void Close(); + void SetStatisticsPtr(const std::shared_ptr& stats) override { + stats_ = stats; + delete_scheduler_.SetStatisticsPtr(stats); + } + private: // REQUIRES: mutex locked - void OnAddFileImpl(const std::string& file_path, uint64_t file_size, - bool compaction); + void OnAddFileImpl(const std::string& file_path, uint64_t file_size); // REQUIRES: mutex locked void OnDeleteFileImpl(const std::string& file_path); @@ -147,15 +149,13 @@ return bg_err_.severity() == Status::Severity::kSoftError; } - Env* env_; + std::shared_ptr clock_; std::shared_ptr fs_; std::shared_ptr logger_; // Mutex to protect tracked_files_, total_files_size_ port::Mutex mu_; // The summation of the sizes of all files in tracked_files_ map uint64_t total_files_size_; - // The summation of all output files of in-progress compactions - uint64_t in_progress_files_size_; // Compactions should only execute if they can leave at least // this amount of buffer space for logs and flushes uint64_t compaction_buffer_size_; @@ -164,9 +164,7 @@ // A map containing all tracked files and there sizes // file_path => file_size std::unordered_map tracked_files_; - // A set of files belonging to in-progress compactions - std::unordered_set in_progress_files_; - // The maximum allowed space (in bytes) for sst files. + // The maximum allowed space (in bytes) for sst and blob files. uint64_t max_allowed_space_; // DeleteScheduler used to throttle file deletition. 
DeleteScheduler delete_scheduler_; @@ -186,10 +184,11 @@ // compactions to run full throttle. If disk space is below this trigger, // compactions will be gated by free disk space > input size uint64_t free_space_trigger_; - // List of database error handler instances tracked by this sst file manager + // List of database error handler instances tracked by this SstFileManager. std::list error_handler_list_; // Pointer to ErrorHandler instance that is currently processing recovery ErrorHandler* cur_instance_; + std::shared_ptr stats_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,19 +16,37 @@ #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" +#include "util/crc32c.h" #include "util/random.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status WritableFileWriter::Append(const Slice& data) { +IOStatus WritableFileWriter::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* writer, + IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewWritableFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + writer->reset(new WritableFileWriter(std::move(file), fname, file_opts)); + } + return io_s; +} + +IOStatus WritableFileWriter::Append(const Slice& data, + uint32_t crc32c_checksum) { const char* src = data.data(); size_t left = data.size(); - Status s; + IOStatus s; pending_sync_ = true; - TEST_KILL_RANDOM("WritableFileWriter::Append:0", - rocksdb_kill_odds * REDUCE_ODDS2); + 
TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2); + + // Calculate the checksum of appended data + UpdateFileChecksum(data); { IOSTATS_TIMER_GUARD(prepare_write_nanos); @@ -64,40 +82,88 @@ assert(buf_.CurrentSize() == 0); } - // We never write directly to disk with direct I/O on. - // or we simply use it for its original purpose to accumulate many small - // chunks - if (use_direct_io() || (buf_.Capacity() >= left)) { - while (left > 0) { - size_t appended = buf_.Append(src, left); - left -= appended; - src += appended; - - if (left > 0) { - s = Flush(); - if (!s.ok()) { - break; + if (perform_data_verification_ && buffered_data_with_checksum_ && + crc32c_checksum != 0) { + // Since we want to use the checksum of the input data, we cannot break it + // into several pieces. We will only write them in the buffer when buffer + // size is enough. Otherwise, we will directly write it down. + if (use_direct_io() || (buf_.Capacity() - buf_.CurrentSize()) >= left) { + if ((buf_.Capacity() - buf_.CurrentSize()) >= left) { + size_t appended = buf_.Append(src, left); + if (appended != left) { + s = IOStatus::Corruption("Write buffer append failure"); + } + buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine( + buffered_data_crc32c_checksum_, crc32c_checksum, appended); + } else { + while (left > 0) { + size_t appended = buf_.Append(src, left); + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, src, appended); + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(); + if (!s.ok()) { + break; + } + } } } + } else { + assert(buf_.CurrentSize() == 0); + buffered_data_crc32c_checksum_ = crc32c_checksum; + s = WriteBufferedWithChecksum(src, left); } } else { - // Writing directly to file bypassing the buffer - assert(buf_.CurrentSize() == 0); - s = WriteBuffered(src, left); + // In this case, either we do not need to do the data verification or + // caller does not provide the checksum of the data 
(crc32c_checksum = 0). + // + // We never write directly to disk with direct I/O on. + // or we simply use it for its original purpose to accumulate many small + // chunks + if (use_direct_io() || (buf_.Capacity() >= left)) { + while (left > 0) { + size_t appended = buf_.Append(src, left); + if (perform_data_verification_ && buffered_data_with_checksum_) { + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, src, appended); + } + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(); + if (!s.ok()) { + break; + } + } + } + } else { + // Writing directly to file bypassing the buffer + assert(buf_.CurrentSize() == 0); + if (perform_data_verification_ && buffered_data_with_checksum_) { + buffered_data_crc32c_checksum_ = crc32c::Value(src, left); + s = WriteBufferedWithChecksum(src, left); + } else { + s = WriteBuffered(src, left); + } + } } - TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Append:1"); if (s.ok()) { filesize_ += data.size(); - CalculateFileChecksum(data); } return s; } -Status WritableFileWriter::Pad(const size_t pad_bytes) { +IOStatus WritableFileWriter::Pad(const size_t pad_bytes) { assert(pad_bytes < kDefaultPageSize); size_t left = pad_bytes; size_t cap = buf_.Capacity() - buf_.CurrentSize(); + size_t pad_start = buf_.CurrentSize(); // Assume pad_bytes is small compared to buf_ capacity. 
So we always // use buf_ rather than write directly to file in certain cases like @@ -107,7 +173,7 @@ buf_.PadWith(append_bytes, 0); left -= append_bytes; if (left > 0) { - Status s = Flush(); + IOStatus s = Flush(); if (!s.ok()) { return s; } @@ -116,71 +182,158 @@ } pending_sync_ = true; filesize_ += pad_bytes; - return Status::OK(); + if (perform_data_verification_) { + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, + buf_.BufferStart() + pad_start, pad_bytes); + } + return IOStatus::OK(); } -Status WritableFileWriter::Close() { +IOStatus WritableFileWriter::Close() { // Do not quit immediately on failure the file MUST be closed - Status s; + IOStatus s; // Possible to close it twice now as we MUST close // in __dtor, simply flushing is not enough // Windows when pre-allocating does not fill with zeros // also with unbuffered access we also set the end of data. - if (!writable_file_) { + if (writable_file_.get() == nullptr) { return s; } s = Flush(); // flush cache to OS - Status interim; + IOStatus interim; // In direct I/O mode we write whole pages so // we need to let the file know where data ends. 
if (use_direct_io()) { - interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileTruncateFinish(start_ts, finish_ts, s); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kTruncate, file_name(), + filesize_); + } + } +#endif + } if (interim.ok()) { - interim = writable_file_->Fsync(IOOptions(), nullptr); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + interim = writable_file_->Fsync(IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileSyncFinish(start_ts, finish_ts, s, + FileOperationType::kFsync); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kFsync, file_name()); + } + } +#endif + } } if (!interim.ok() && s.ok()) { s = interim; } } - TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); - interim = writable_file_->Close(IOOptions(), nullptr); + TEST_KILL_RANDOM("WritableFileWriter::Close:0"); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + interim = writable_file_->Close(IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileCloseFinish(start_ts, finish_ts, s); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kClose, file_name()); + } + } +#endif + } if (!interim.ok() && s.ok()) { s = interim; } writable_file_.reset(); - 
TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Close:1"); + + if (s.ok() && checksum_generator_ != nullptr && !checksum_finalized_) { + checksum_generator_->Finalize(); + checksum_finalized_ = true; + } return s; } // write out the cached data to the OS cache or storage if direct I/O // enabled -Status WritableFileWriter::Flush() { - Status s; - TEST_KILL_RANDOM("WritableFileWriter::Flush:0", - rocksdb_kill_odds * REDUCE_ODDS2); +IOStatus WritableFileWriter::Flush() { + IOStatus s; + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2); if (buf_.CurrentSize() > 0) { if (use_direct_io()) { #ifndef ROCKSDB_LITE if (pending_sync_) { - s = WriteDirect(); + if (perform_data_verification_ && buffered_data_with_checksum_) { + s = WriteDirectWithChecksum(); + } else { + s = WriteDirect(); + } } #endif // !ROCKSDB_LITE } else { - s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + if (perform_data_verification_ && buffered_data_with_checksum_) { + s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize()); + } else { + s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + } } if (!s.ok()) { return s; } } - s = writable_file_->Flush(IOOptions(), nullptr); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + s = writable_file_->Flush(IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileFlushFinish(start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kFlush, file_name()); + } + } +#endif + } if (!s.ok()) { return s; @@ -216,71 +369,118 @@ return s; } +std::string WritableFileWriter::GetFileChecksum() { + if (checksum_generator_ != nullptr) { + assert(checksum_finalized_); + return checksum_generator_->GetChecksum(); + } else { + return 
kUnknownFileChecksum; + } +} + const char* WritableFileWriter::GetFileChecksumFuncName() const { - if (checksum_func_ != nullptr) { - return checksum_func_->Name(); + if (checksum_generator_ != nullptr) { + return checksum_generator_->Name(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } -Status WritableFileWriter::Sync(bool use_fsync) { - Status s = Flush(); +IOStatus WritableFileWriter::Sync(bool use_fsync) { + IOStatus s = Flush(); if (!s.ok()) { return s; } - TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:0"); if (!use_direct_io() && pending_sync_) { s = SyncInternal(use_fsync); if (!s.ok()) { return s; } } - TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:1"); pending_sync_ = false; - return Status::OK(); + return IOStatus::OK(); } -Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) { +IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) { if (!writable_file_->IsSyncThreadSafe()) { - return Status::NotSupported( + return IOStatus::NotSupported( "Can't WritableFileWriter::SyncWithoutFlush() because " "WritableFile::IsSyncThreadSafe() is false"); } TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); - Status s = SyncInternal(use_fsync); + IOStatus s = SyncInternal(use_fsync); TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); return s; } -Status WritableFileWriter::SyncInternal(bool use_fsync) { - Status s; +IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { + IOStatus s; IOSTATS_TIMER_GUARD(fsync_nanos); TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + 
} +#endif if (use_fsync) { s = writable_file_->Fsync(IOOptions(), nullptr); } else { s = writable_file_->Sync(IOOptions(), nullptr); } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileSyncFinish( + start_ts, finish_ts, s, + use_fsync ? FileOperationType::kFsync : FileOperationType::kSync); + if (!s.ok()) { + NotifyOnIOError( + s, (use_fsync ? FileOperationType::kFsync : FileOperationType::kSync), + file_name()); + } + } +#endif SetPerfLevel(prev_perf_level); return s; } -Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { +IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { IOSTATS_TIMER_GUARD(range_sync_nanos); TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); - return writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + IOStatus s = writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kRangeSync, file_name(), nbytes, + offset); + } + } +#endif + return s; } // This method writes to disk the specified data and makes use of the rate // limiter if available -Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { - Status s; +IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { + IOStatus s; assert(!use_direct_io()); const char* src = data; size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { size_t allowed; @@ -297,23 +497,48 @@ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); #ifndef ROCKSDB_LITE - FileOperationInfo::TimePoint 
start_ts; + FileOperationInfo::StartTimePoint start_ts; uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr); if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; } #endif { auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); - s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, allowed), IOOptions(), v_info, + nullptr); + } else { + s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + } + if (!s.ok()) { + // If writable_file_->Append() failed, then the data may or may not + // exist in the underlying memory buffer, OS page cache, remote file + // system's buffer, etc. If WritableFileWriter keeps the data in + // buf_, then a future Close() or write retry may send the data to + // the underlying file again. If the data does exist in the + // underlying buffer and gets written to the file eventually despite + // returning error, the file may end up with two duplicate pieces of + // data. Therefore, clear the buf_ at the WritableFileWriter layer + // and let caller determine error handling. 
+ buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + } SetPerfLevel(prev_perf_level); } #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kAppend, file_name(), allowed, + old_size); + } } #endif if (!s.ok()) { @@ -322,25 +547,117 @@ } IOSTATS_ADD(bytes_written, allowed); - TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); left -= allowed; src += allowed; } buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; return s; } -void WritableFileWriter::CalculateFileChecksum(const Slice& data) { - if (checksum_func_ != nullptr) { - if (is_first_checksum_) { - file_checksum_ = checksum_func_->Value(data.data(), data.size()); - is_first_checksum_ = false; - } else { - file_checksum_ = - checksum_func_->Extend(file_checksum_, data.data(), data.size()); +IOStatus WritableFileWriter::WriteBufferedWithChecksum(const char* data, + size_t size) { + IOStatus s; + assert(!use_direct_io()); + assert(perform_data_verification_ && buffered_data_with_checksum_); + const char* src = data; + size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + + // Check how much is allowed. Here, we loop until the rate limiter allows to + // write the entire buffer. 
+ // TODO: need to be improved since it sort of defeats the purpose of the rate + // limiter + size_t data_size = left; + if (rate_limiter_ != nullptr) { + while (data_size > 0) { + size_t tmp_size; + tmp_size = rate_limiter_->RequestToken( + data_size, buf_.Alignment(), writable_file_->GetIOPriority(), stats_, + RateLimiter::OpType::kWrite); + data_size -= tmp_size; } } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr); + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + old_size = next_write_offset_; + } +#endif + { + auto prev_perf_level = GetPerfLevel(); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + + EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, left), IOOptions(), v_info, + nullptr); + SetPerfLevel(prev_perf_level); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(old_size, left, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kAppend, file_name(), left, + old_size); + } + } +#endif + if (!s.ok()) { + // If writable_file_->Append() failed, then the data may or may not + // exist in the underlying memory buffer, OS page cache, remote file + // system's buffer, etc. If WritableFileWriter keeps the data in + // buf_, then a future Close() or write retry may send the data to + // the underlying file again. If the data does exist in the + // underlying buffer and gets written to the file eventually despite + // returning error, the file may end up with two duplicate pieces of + // data. Therefore, clear the buf_ at the WritableFileWriter layer + // and let caller determine error handling. 
+ buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + return s; + } + } + + IOSTATS_ADD(bytes_written, left); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); + + // Buffer write is successful, reset the buffer current size to 0 and reset + // the corresponding checksum value + buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + return s; +} + +void WritableFileWriter::UpdateFileChecksum(const Slice& data) { + if (checksum_generator_ != nullptr) { + checksum_generator_->Update(data.data(), data.size()); + } +} + +// Currently, crc32c checksum is used to calculate the checksum value of the +// content in the input buffer for handoff. In the future, the checksum might be +// calculated from the existing crc32c checksums of the in WAl and Manifest +// records, or even SST file blocks. +// TODO: effectively use the existing checksum of the data being writing to +// generate the crc32c checksum instead of a raw calculation. +void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, + size_t size, + char* buf) { + uint32_t v_crc32c = crc32c::Extend(0, data, size); + EncodeFixed32(buf, v_crc32c); } // This flushes the accumulated data in the buffer. We pad data with zeros if @@ -352,20 +669,20 @@ // only write on aligned // offsets. 
#ifndef ROCKSDB_LITE -Status WritableFileWriter::WriteDirect() { +IOStatus WritableFileWriter::WriteDirect() { assert(use_direct_io()); - Status s; + IOStatus s; const size_t alignment = buf_.Alignment(); assert((next_write_offset_ % alignment) == 0); // Calculate whole page final file advance if all writes succeed - size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize()); + const size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); // Calculate the leftover tail, we write it here padded with zeros BUT we - // will write - // it again in the future either on Close() OR when the current whole page - // fills out - size_t leftover_tail = buf_.CurrentSize() - file_advance; + // will write it again in the future either on Close() OR when the current + // whole page fills out. + const size_t leftover_tail = buf_.CurrentSize() - file_advance; // Round up and pad buf_.PadToAlignmentWith(0); @@ -373,6 +690,8 @@ const char* src = buf_.BufferStart(); uint64_t write_offset = next_write_offset_; size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { // Check how much is allowed @@ -388,16 +707,28 @@ { IOSTATS_TIMER_GUARD(write_nanos); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); } // direct writes must be positional - s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - IOOptions(), nullptr); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, size, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + IOOptions(), v_info, nullptr); + } else { + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, 
+ IOOptions(), nullptr); + } + if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(), + size, write_offset); + } } if (!s.ok()) { buf_.Size(file_advance + leftover_tail); @@ -420,6 +751,104 @@ // This is where we start writing next time which may or not be // the actual file size on disk. They match if the buffer size // is a multiple of whole pages otherwise filesize_ is leftover_tail + // behind + next_write_offset_ += file_advance; + } + return s; +} + +IOStatus WritableFileWriter::WriteDirectWithChecksum() { + assert(use_direct_io()); + assert(perform_data_verification_ && buffered_data_with_checksum_); + IOStatus s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + const size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write it again in the future either on Close() OR when the current + // whole page fills out. + const size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up, pad, and combine the checksum. 
+ size_t last_cur_size = buf_.CurrentSize(); + buf_.PadToAlignmentWith(0); + size_t padded_size = buf_.CurrentSize() - last_cur_size; + const char* padded_start = buf_.BufferStart() + last_cur_size; + uint32_t padded_checksum = crc32c::Value(padded_start, padded_size); + buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine( + buffered_data_crc32c_checksum_, padded_checksum, padded_size); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + + // Check how much is allowed. Here, we loop until the rate limiter allows to + // write the entire buffer. + // TODO: need to be improved since it sort of defeats the purpose of the rate + // limiter + size_t data_size = left; + if (rate_limiter_ != nullptr) { + while (data_size > 0) { + size_t size; + size = rate_limiter_->RequestToken(data_size, buf_.Alignment(), + writable_file_->GetIOPriority(), + stats_, RateLimiter::OpType::kWrite); + data_size -= size; + } + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + // direct writes must be positional + EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, left), write_offset, + IOOptions(), v_info, nullptr); + + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(write_offset, left, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(), + left, write_offset); + } + } + if (!s.ok()) { + // In this case, we do not change buffered_data_crc32c_checksum_ because + // it still aligns with the data in the buffer. 
+ buf_.Size(file_advance + leftover_tail); + buffered_data_crc32c_checksum_ = + crc32c::Value(buf_.BufferStart(), buf_.CurrentSize()); + return s; + } + } + + IOSTATS_ADD(bytes_written, left); + assert((next_write_offset_ % alignment) == 0); + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close(). Also the buffer checksum will + // recalculated accordingly. + buf_.RefitTail(file_advance, leftover_tail); + // Adjust the checksum value to align with the data in the buffer + buffered_data_crc32c_checksum_ = + crc32c::Value(buf_.BufferStart(), buf_.CurrentSize()); + // This is where we start writing next time which may or not be + // the actual file size on disk. They match if the buffer size + // is a multiple of whole pages otherwise filesize_ is leftover_tail // behind next_write_offset_ += file_advance; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,11 +10,13 @@ #pragma once #include #include + #include "db/version_edit.h" +#include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" #include "rocksdb/listener.h" #include "rocksdb/rate_limiter.h" #include "test_util/sync_point.h" @@ -22,6 +24,7 @@ namespace ROCKSDB_NAMESPACE { class Statistics; +class SystemClock; // WritableFileWriter is a wrapper on top of Env::WritableFile. 
It provides // facilities to: @@ -33,27 +36,107 @@ class WritableFileWriter { private: #ifndef ROCKSDB_LITE - void NotifyOnFileWriteFinish(uint64_t offset, size_t length, - const FileOperationInfo::TimePoint& start_ts, - const FileOperationInfo::TimePoint& finish_ts, - const Status& status) { - FileOperationInfo info(file_name_, start_ts, finish_ts); + void NotifyOnFileWriteFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kWrite, file_name_, start_ts, + finish_ts, io_status); info.offset = offset; info.length = length; - info.status = status; for (auto& listener : listeners_) { listener->OnFileWriteFinish(info); } + info.status.PermitUncheckedError(); + } + void NotifyOnFileFlushFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kFlush, file_name_, start_ts, + finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileFlushFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileSyncFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status, + FileOperationType type = FileOperationType::kSync) { + FileOperationInfo info(type, file_name_, start_ts, finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileSyncFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileRangeSyncFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kRangeSync, file_name_, start_ts, + finish_ts, io_status); + info.offset = offset; + 
info.length = length; + + for (auto& listener : listeners_) { + listener->OnFileRangeSyncFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileTruncateFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kTruncate, file_name_, start_ts, + finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileTruncateFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileCloseFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kClose, file_name_, start_ts, + finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileCloseFinish(info); + } + info.status.PermitUncheckedError(); + } + + void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation, + const std::string& file_path, size_t length = 0, + uint64_t offset = 0) { + if (listeners_.empty()) { + return; + } + IOErrorInfo io_error_info(io_status, operation, file_path, length, offset); + for (auto& listener : listeners_) { + listener->OnIOError(io_error_info); + } + io_error_info.io_status.PermitUncheckedError(); } #endif // ROCKSDB_LITE bool ShouldNotifyListeners() const { return !listeners_.empty(); } - void CalculateFileChecksum(const Slice& data); + void UpdateFileChecksum(const Slice& data); + void Crc32cHandoffChecksumCalculation(const char* data, size_t size, + char* buf); - std::unique_ptr writable_file_; std::string file_name_; - Env* env_; + FSWritableFilePtr writable_file_; + SystemClock* clock_; AlignedBuffer buf_; size_t max_buffer_size_; // Actually written data size can be used for truncate @@ -71,20 +154,25 @@ RateLimiter* rate_limiter_; Statistics* stats_; std::vector> listeners_; - FileChecksumFunc* checksum_func_; - std::string 
file_checksum_ = kUnknownFileChecksum; - bool is_first_checksum_ = true; + std::unique_ptr checksum_generator_; + bool checksum_finalized_; + bool perform_data_verification_; + uint32_t buffered_data_crc32c_checksum_; + bool buffered_data_with_checksum_; public: WritableFileWriter( std::unique_ptr&& file, const std::string& _file_name, - const FileOptions& options, Env* env = nullptr, + const FileOptions& options, SystemClock* clock = nullptr, + const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, const std::vector>& listeners = {}, - FileChecksumFunc* checksum_func = nullptr) - : writable_file_(std::move(file)), - file_name_(_file_name), - env_(env), + FileChecksumGenFactory* file_checksum_gen_factory = nullptr, + bool perform_data_verification = false, + bool buffered_data_with_checksum = false) + : file_name_(_file_name), + writable_file_(std::move(file), io_tracer, _file_name), + clock_(clock), buf_(), max_buffer_size_(options.writable_file_max_buffer_size), filesize_(0), @@ -97,7 +185,11 @@ rate_limiter_(options.rate_limiter), stats_(stats), listeners_(), - checksum_func_(checksum_func) { + checksum_generator_(nullptr), + checksum_finalized_(false), + perform_data_verification_(perform_data_verification), + buffered_data_crc32c_checksum_(0), + buffered_data_with_checksum_(buffered_data_with_checksum) { TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", reinterpret_cast(max_buffer_size_)); buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); @@ -112,34 +204,50 @@ #else // !ROCKSDB_LITE (void)listeners; #endif + if (file_checksum_gen_factory != nullptr) { + FileChecksumGenContext checksum_gen_context; + checksum_gen_context.file_name = _file_name; + checksum_generator_ = + file_checksum_gen_factory->CreateFileChecksumGenerator( + checksum_gen_context); + } } + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* writer, + IODebugContext* dbg); 
WritableFileWriter(const WritableFileWriter&) = delete; WritableFileWriter& operator=(const WritableFileWriter&) = delete; - ~WritableFileWriter() { Close(); } + ~WritableFileWriter() { + auto s = Close(); + s.PermitUncheckedError(); + } std::string file_name() const { return file_name_; } - Status Append(const Slice& data); + // When this Append API is called, if the crc32c_checksum is not provided, we + // will calculate the checksum internally. + IOStatus Append(const Slice& data, uint32_t crc32c_checksum = 0); - Status Pad(const size_t pad_bytes); + IOStatus Pad(const size_t pad_bytes); - Status Flush(); + IOStatus Flush(); - Status Close(); + IOStatus Close(); - Status Sync(bool use_fsync); + IOStatus Sync(bool use_fsync); // Sync only the data that was already Flush()ed. Safe to call concurrently // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), // returns NotSupported status. - Status SyncWithoutFlush(bool use_fsync); + IOStatus SyncWithoutFlush(bool use_fsync); uint64_t GetFileSize() const { return filesize_; } - Status InvalidateCache(size_t offset, size_t length) { + IOStatus InvalidateCache(size_t offset, size_t length) { return writable_file_->InvalidateCache(offset, length); } @@ -149,11 +257,12 @@ bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; } - void TEST_SetFileChecksumFunc(FileChecksumFunc* checksum_func) { - checksum_func_ = checksum_func; + void TEST_SetFileChecksumGenerator( + FileChecksumGenerator* checksum_generator) { + checksum_generator_.reset(checksum_generator); } - const std::string& GetFileChecksum() const { return file_checksum_; } + std::string GetFileChecksum(); const char* GetFileChecksumFuncName() const; @@ -161,11 +270,13 @@ // Used when os buffering is OFF and we are writing // DMA such as in Direct I/O mode #ifndef ROCKSDB_LITE - Status WriteDirect(); + IOStatus WriteDirect(); + IOStatus WriteDirectWithChecksum(); #endif // !ROCKSDB_LITE // Normal write - Status WriteBuffered(const char* data, 
size_t size); - Status RangeSync(uint64_t offset, uint64_t nbytes); - Status SyncInternal(bool use_fsync); + IOStatus WriteBuffered(const char* data, size_t size); + IOStatus WriteBufferedWithChecksum(const char* data, size_t size); + IOStatus RangeSync(uint64_t offset, uint64_t nbytes); + IOStatus SyncInternal(bool use_fsync); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/Makefile 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,61 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +ROOT_DIR = $(abspath $(shell pwd)/../) + +include $(ROOT_DIR)/make_config.mk + +PROTOBUF_CFLAGS = `pkg-config --cflags protobuf` +PROTOBUF_LDFLAGS = `pkg-config --libs protobuf` + +PROTOBUF_MUTATOR_CFLAGS = `pkg-config --cflags libprotobuf-mutator` +PROTOBUF_MUTATOR_LDFLAGS = `pkg-config --libs libprotobuf-mutator` + +ROCKSDB_INCLUDE_DIR = $(ROOT_DIR)/include +ROCKSDB_LIB_DIR = $(ROOT_DIR) + +PROTO_IN = $(ROOT_DIR)/fuzz/proto +PROTO_OUT = $(ROOT_DIR)/fuzz/proto/gen + +ifneq ($(FUZZ_ENV), ossfuzz) +CC = clang++ +CCFLAGS += -Wall -fsanitize=address,fuzzer +CFLAGS += $(PLATFORM_CXXFLAGS) $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(PROTOBUF_LDFLAGS) $(PROTOBUF_MUTATOR_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +else +# OSS-Fuzz sets various environment flags that are used for compilation. 
+# These environment flags depend on which type of sanitizer build is being +# used, however, an ASan build would set the environment flags as follows: +# CFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link" +# CXXFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link \ + -stdlib=libc++" +# LIB_FUZZING_ENGINE="-fsanitize=fuzzer" +CC = $(CXX) +CCFLAGS = $(CXXFLAGS) +CFLAGS += $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(LIB_FUZZING_ENGINE) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +endif + +.PHONY: gen_proto + +gen_proto: + mkdir -p $(PROTO_OUT) + protoc \ + --proto_path=$(PROTO_IN) \ + --cpp_out=$(PROTO_OUT) \ + $(PROTO_IN)/*.proto + +db_fuzzer: db_fuzzer.cc + $(CC) $(CCFLAGS) -o db_fuzzer db_fuzzer.cc $(CFLAGS) $(LDFLAGS) + +db_map_fuzzer: gen_proto db_map_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o db_map_fuzzer db_map_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) + +sst_file_writer_fuzzer: gen_proto sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o sst_file_writer_fuzzer sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/README.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,160 @@ +# Fuzzing RocksDB + +## Overview + +This directory contains [fuzz tests](https://en.wikipedia.org/wiki/Fuzzing) for RocksDB. 
+RocksDB testing infrastructure currently includes unit tests and [stress tests](https://github.com/facebook/rocksdb/wiki/Stress-test), +we hope fuzz testing can catch more bugs. + +## Prerequisite + +We use [LLVM libFuzzer](http://llvm.org/docs/LibFuzzer.html) as the fuzzying engine, +so make sure you have [clang](https://clang.llvm.org/get_started.html) as your compiler. + +Some tests rely on [structure aware fuzzing](https://github.com/google/fuzzing/blob/master/docs/structure-aware-fuzzing.md). +We use [protobuf](https://developers.google.com/protocol-buffers) to define structured input to the fuzzer, +and use [libprotobuf-mutator](https://github.com/google/libprotobuf-mutator) as the custom libFuzzer mutator. +So make sure you have protobuf and libprotobuf-mutator installed, and make sure `pkg-config` can find them. + +## Example + +This example shows you how to do structure aware fuzzing to `rocksdb::SstFileWriter`. + +After walking through the steps to create the fuzzer, we'll introduce a bug into `rocksdb::SstFileWriter::Put`, +then show that the fuzzer can catch the bug. + +### Design the test + +We want the fuzzing engine to automatically generate a list of database operations, +then we apply these operations to `SstFileWriter` in sequence, +finally, after the SST file is generated, we use `SstFileReader` to check the file's checksum. + +### Define input + +We define the database operations in protobuf, each operation has a type of operation and a key value pair, +see [proto/db_operation.proto](proto/db_operation.proto) for details. + +### Define tests with the input + +In [sst_file_writer_fuzzer.cc](sst_file_writer_fuzzer.cc), +we define the tests to be run on the generated input: + +``` +DEFINE_PROTO_FUZZER(DBOperations& input) { + // apply the operations to SstFileWriter and use SstFileReader to verify checksum. + // ... 
+} +``` + +`SstFileWriter` requires the keys of the operations to be unique and be in ascending order, +but the fuzzing engine generates the input randomly, so we need to process the generated input before +passing it to `DEFINE_PROTO_FUZZER`, this is accomplished by registering a post processor: + +``` +protobuf_mutator::libfuzzer::PostProcessorRegistration +``` + +### Compile and link the fuzzer + +In the rocksdb root directory, compile rocksdb library by `make static_lib`. + +Go to the `fuzz` directory, +run `make sst_file_writer_fuzzer` to generate the fuzzer, +it will compile rocksdb static library, generate protobuf, then compile and link `sst_file_writer_fuzzer`. + +### Introduce a bug + +Manually introduce a bug to `SstFileWriter::Put`: + +``` +diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc +index ab1ee7c4e..c7da9ffa0 100644 +--- a/table/sst_file_writer.cc ++++ b/table/sst_file_writer.cc +@@ -277,6 +277,11 @@ Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { + } + + Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { ++ if (user_key.starts_with("!")) { ++ if (value.ends_with("!")) { ++ return Status::Corruption("bomb"); ++ } ++ } + return rep_->Add(user_key, value, ValueType::kTypeValue); + } +``` + +The bug is that for `Put`, if `user_key` starts with `!` and `value` ends with `!`, then corrupt. + +### Run fuzz testing to catch the bug + +Run the fuzzer by `time ./sst_file_writer_fuzzer`. 
+ +Here is the output on my machine: + +``` +Corruption: bomb +==59680== ERROR: libFuzzer: deadly signal + #0 0x109487315 in __sanitizer_print_stack_trace+0x35 (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x4d315) + #1 0x108d63f18 in fuzzer::PrintStackTrace() FuzzerUtil.cpp:205 + #2 0x108d47613 in fuzzer::Fuzzer::CrashCallback() FuzzerLoop.cpp:232 + #3 0x7fff6af535fc in _sigtramp+0x1c (libsystem_platform.dylib:x86_64+0x35fc) + #4 0x7ffee720f3ef () + #5 0x7fff6ae29807 in abort+0x77 (libsystem_c.dylib:x86_64+0x7f807) + #6 0x108cf1c4c in TestOneProtoInput(DBOperations&)+0x113c (sst_file_writer_fuzzer:x86_64+0x100302c4c) + #7 0x108cf09be in LLVMFuzzerTestOneInput+0x16e (sst_file_writer_fuzzer:x86_64+0x1003019be) + #8 0x108d48ce0 in fuzzer::Fuzzer::ExecuteCallback(unsigned char const*, unsigned long) FuzzerLoop.cpp:556 + #9 0x108d48425 in fuzzer::Fuzzer::RunOne(unsigned char const*, unsigned long, bool, fuzzer::InputInfo*, bool*) FuzzerLoop.cpp:470 + #10 0x108d4a626 in fuzzer::Fuzzer::MutateAndTestOne() FuzzerLoop.cpp:698 + #11 0x108d4b325 in fuzzer::Fuzzer::Loop(std::__1::vector >&) FuzzerLoop.cpp:830 + #12 0x108d37fcd in fuzzer::FuzzerDriver(int*, char***, int (*)(unsigned char const*, unsigned long)) FuzzerDriver.cpp:829 + #13 0x108d652b2 in main FuzzerMain.cpp:19 + #14 0x7fff6ad5acc8 in start+0x0 (libdyld.dylib:x86_64+0x1acc8) + +NOTE: libFuzzer has rudimentary signal handlers. + Combine libFuzzer with AddressSanitizer or similar for better crash reports. 
+SUMMARY: libFuzzer: deadly signal +MS: 7 Custom-CustomCrossOver-InsertByte-Custom-ChangeBit-Custom-CustomCrossOver-; base unit: 90863b4d83c3f994bba0a417d0c2ee3b68f9e795 +0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x76,0x61,0x6c,0x75,0x65,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2b,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2e,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x5c,0x32,0x35,0x33,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa, +operations {\x0a key: \"!\"\x0a value: \"!\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"+\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \".\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"\\253\"\x0a type: PUT\x0a}\x0a +artifact_prefix='./'; Test unit written to ./crash-a1460be302d09b548e61787178d9edaa40aea467 +Base64: b3BlcmF0aW9ucyB7CiAga2V5OiAiISIKICB2YWx1ZTogIiEiCiAgdHlwZTogUFVUCn0Kb3BlcmF0aW9ucyB7CiAga2V5OiAiKyIKICB0eXBlOiBQVVQKfQpvcGVyYXRpb25zIHsKICBrZXk6ICIuIgogIHR5cGU6IFBVVAp9Cm9wZXJhdGlvbnMgewogIGtleTogIlwyNTMiCiAgdHlwZTogUFVUCn0K +./sst_file_writer_fuzzer 5.97s user 4.40s system 64% cpu 16.195 total +``` + +Within 6 seconds, it catches the bug. + +The input that triggers the bug is persisted in `./crash-a1460be302d09b548e61787178d9edaa40aea467`: + +``` +$ cat ./crash-a1460be302d09b548e61787178d9edaa40aea467 +operations { + key: "!" + value: "!" + type: PUT +} +operations { + key: "+" + type: PUT +} +operations { + key: "." 
+ type: PUT +} +operations { + key: "\253" + type: PUT +} +``` + +### Reproduce the crash to debug + +The above crash can be reproduced by `./sst_file_writer_fuzzer ./crash-a1460be302d09b548e61787178d9edaa40aea467`, +so you can debug the crash. + +## Future Work + +According to [OSS-Fuzz](https://github.com/google/oss-fuzz), +`as of June 2020, OSS-Fuzz has found over 20,000 bugs in 300 open source projects.` + +RocksDB can join OSS-Fuzz together with other open source projects such as sqlite. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,164 @@ +#include + +#include "rocksdb/db.h" + +enum OperationType { + kPut, + kGet, + kDelete, + kGetProperty, + kIterator, + kSnapshot, + kOpenClose, + kColumn, + kCompactRange, + kSeekForPrev, + OP_COUNT +}; + +constexpr char db_path[] = "/tmp/testdb"; + +// Fuzzes DB operations by doing interpretations on the data. Both the +// sequence of API calls to be called on the DB as well as the arguments +// to each of these APIs are interpreted by way of the data buffer. +// The operations that the fuzzer supports are given by the OperationType +// enum. The goal is to capture sanitizer bugs, so the code should be +// compiled with a given sanitizer (ASan, UBSan, MSan). 
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + ROCKSDB_NAMESPACE::DB* db; + ROCKSDB_NAMESPACE::Options options; + options.create_if_missing = true; + ROCKSDB_NAMESPACE::Status status = + ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); + if (!status.ok()) { + return 0; + } + FuzzedDataProvider fuzzed_data(data, size); + + // perform a sequence of calls on our db instance + int max_iter = static_cast(data[0]); + for (int i = 0; i < max_iter && i < size; i++) { + OperationType op = static_cast(data[i] % OP_COUNT); + + switch (op) { + case kPut: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string val = fuzzed_data.ConsumeRandomLengthString(); + db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, val); + break; + } + case kGet: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string value; + db->Get(ROCKSDB_NAMESPACE::ReadOptions(), key, &value); + break; + } + case kDelete: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + db->Delete(ROCKSDB_NAMESPACE::WriteOptions(), key); + break; + } + case kGetProperty: { + std::string prop; + std::string property_name = fuzzed_data.ConsumeRandomLengthString(); + db->GetProperty(property_name, &prop); + break; + } + case kIterator: { + ROCKSDB_NAMESPACE::Iterator* it = + db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + } + delete it; + break; + } + case kSnapshot: { + ROCKSDB_NAMESPACE::ReadOptions snapshot_options; + snapshot_options.snapshot = db->GetSnapshot(); + ROCKSDB_NAMESPACE::Iterator* it = db->NewIterator(snapshot_options); + db->ReleaseSnapshot(snapshot_options.snapshot); + delete it; + break; + } + case kOpenClose: { + db->Close(); + delete db; + status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); + if (!status.ok()) { + ROCKSDB_NAMESPACE::DestroyDB(db_path, options); + return 0; + } + + break; + } + case kColumn: { + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf; + 
ROCKSDB_NAMESPACE::Status s; + s = db->CreateColumnFamily(ROCKSDB_NAMESPACE::ColumnFamilyOptions(), + "new_cf", &cf); + s = db->DestroyColumnFamilyHandle(cf); + db->Close(); + delete db; + + // open DB with two column families + std::vector column_families; + // have to open default column family + column_families.push_back(ROCKSDB_NAMESPACE::ColumnFamilyDescriptor( + ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, + ROCKSDB_NAMESPACE::ColumnFamilyOptions())); + // open the new one, too + column_families.push_back(ROCKSDB_NAMESPACE::ColumnFamilyDescriptor( + "new_cf", ROCKSDB_NAMESPACE::ColumnFamilyOptions())); + std::vector handles; + s = ROCKSDB_NAMESPACE::DB::Open(ROCKSDB_NAMESPACE::DBOptions(), db_path, + column_families, &handles, &db); + + if (s.ok()) { + std::string key1 = fuzzed_data.ConsumeRandomLengthString(); + std::string val1 = fuzzed_data.ConsumeRandomLengthString(); + std::string key2 = fuzzed_data.ConsumeRandomLengthString(); + s = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), handles[1], key1, + val1); + std::string value; + s = db->Get(ROCKSDB_NAMESPACE::ReadOptions(), handles[1], key2, + &value); + s = db->DropColumnFamily(handles[1]); + for (auto handle : handles) { + s = db->DestroyColumnFamilyHandle(handle); + } + } else { + status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); + if (!status.ok()) { + // At this point there is no saving to do. 
So we exit + ROCKSDB_NAMESPACE::DestroyDB(db_path, ROCKSDB_NAMESPACE::Options()); + return 0; + } + } + break; + } + case kCompactRange: { + std::string slice_start = fuzzed_data.ConsumeRandomLengthString(); + std::string slice_end = fuzzed_data.ConsumeRandomLengthString(); + + ROCKSDB_NAMESPACE::Slice begin(slice_start); + ROCKSDB_NAMESPACE::Slice end(slice_end); + ROCKSDB_NAMESPACE::CompactRangeOptions options; + ROCKSDB_NAMESPACE::Status s = db->CompactRange(options, &begin, &end); + break; + } + case kSeekForPrev: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + auto iter = db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + iter->SeekForPrev(key); + delete iter; + break; + } + } + } + + // Cleanup DB + db->Close(); + delete db; + ROCKSDB_NAMESPACE::DestroyDB(db_path, options); + return 0; +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "util.h" + +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const ROCKSDB_NAMESPACE::Comparator* comparator = + ROCKSDB_NAMESPACE::BytewiseComparator(); + auto ops = input->mutable_operations(); + // Make sure begin <= end for DELETE_RANGE. 
+ for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + }}; + +// Execute randomly generated operations on both a DB and a std::map, +// then reopen the DB and make sure that iterating the DB produces the +// same key-value pairs as iterating through the std::map. +DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + const std::string kDbPath = "/tmp/db_map_fuzzer_test"; + auto fs = ROCKSDB_NAMESPACE::FileSystem::Default(); + if (fs->FileExists(kDbPath, ROCKSDB_NAMESPACE::IOOptions(), /*dbg=*/nullptr) + .ok()) { + std::cerr << "db path " << kDbPath << " already exists" << std::endl; + abort(); + } + + std::map kv; + ROCKSDB_NAMESPACE::DB* db = nullptr; + ROCKSDB_NAMESPACE::Options options; + options.create_if_missing = true; + CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db)); + + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK( + db->Put(ROCKSDB_NAMESPACE::WriteOptions(), op.key(), op.value())); + kv[op.key()] = op.value(); + break; + } + case OpType::MERGE: { + break; + } + case OpType::DELETE: { + CHECK_OK(db->Delete(ROCKSDB_NAMESPACE::WriteOptions(), op.key())); + kv.erase(op.key()); + break; + } + case OpType::DELETE_RANGE: { + // [op.key(), op.value()) corresponds to [begin, end). 
+ CHECK_OK(db->DeleteRange(ROCKSDB_NAMESPACE::WriteOptions(), + db->DefaultColumnFamily(), op.key(), + op.value())); + kv.erase(kv.lower_bound(op.key()), kv.lower_bound(op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()); + return; + } + } + } + CHECK_OK(db->Close()); + delete db; + db = nullptr; + + CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db)); + auto kv_it = kv.begin(); + ROCKSDB_NAMESPACE::Iterator* it = + db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next(), kv_it++) { + CHECK_TRUE(kv_it != kv.end()); + CHECK_EQ(it->key().ToString(), kv_it->first); + CHECK_EQ(it->value().ToString(), kv_it->second); + } + CHECK_TRUE(kv_it == kv.end()); + delete it; + + CHECK_OK(db->Close()); + delete db; + CHECK_OK(ROCKSDB_NAMESPACE::DestroyDB(kDbPath, options)); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,28 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Defines database operations. +// Each operation is a key-value pair and an operation type. + +syntax = "proto2"; + +enum OpType { + PUT = 0; + MERGE = 1; + DELETE = 2; + DELETE_RANGE = 3; +} + +message DBOperation { + required string key = 1; + // value is ignored for DELETE. + // [key, value] is the range for DELETE_RANGE. 
+ optional string value = 2; + required OpType type = 3; +} + +message DBOperations { + repeated DBOperation operations = 1; +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,185 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/file_system.h" +#include "rocksdb/sst_file_writer.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "table/table_reader.h" +#include "util.h" + +// Keys in SST file writer operations must be unique and in ascending order. +// For each DBOperation generated by the fuzzer, this function is called on +// it to deduplicate and sort the keys in the DBOperations. +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const Comparator* comparator = BytewiseComparator(); + auto ops = input->mutable_operations(); + + // Make sure begin <= end for DELETE_RANGE. 
+ for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + + std::sort(ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) < 0; + }); + + auto last = std::unique( + ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) == 0; + }); + ops->erase(last, ops->end()); + }}; + +TableReader* NewTableReader(const std::string& sst_file_path, + const Options& options, + const EnvOptions& env_options, + const ImmutableCFOptions& cf_ioptions) { + // This code block is similar to SstFileReader::Open. + + uint64_t file_size = 0; + std::unique_ptr file_reader; + std::unique_ptr table_reader; + const auto& fs = options.env->GetFileSystem(); + FileOptions fopts(env_options); + Status s = options.env->GetFileSize(sst_file_path, fopts.io_options, + &file_size, nullptr); + if (s.ok()) { + s = RandomAccessFileReader::Create(fs, sst_file_path, fopts, &file_reader, + nullptr); + } + if (s.ok()) { + TableReaderOptions t_opt(cf_ioptions, /*prefix_extractor=*/nullptr, + env_options, cf_ioptions.internal_comparator); + t_opt.largest_seqno = kMaxSequenceNumber; + s = options.table_factory->NewTableReader(t_opt, std::move(file_reader), + file_size, &table_reader, + /*prefetch=*/false); + } + if (!s.ok()) { + std::cerr << "Failed to create TableReader for " << sst_file_path << ": " + << s.ToString() << std::endl; + abort(); + } + return table_reader.release(); +} + +ValueType ToValueType(OpType op_type) { + switch (op_type) { + case OpType::PUT: + return ValueType::kTypeValue; + case OpType::MERGE: + return ValueType::kTypeMerge; + case OpType::DELETE: + return ValueType::kTypeDeletion; + case OpType::DELETE_RANGE: + return 
ValueType::kTypeRangeDeletion; + default: + std::cerr << "Unknown operation type " << static_cast(op_type) + << std::endl; + abort(); + } +} + +// Fuzzes DB operations as input, let SstFileWriter generate a SST file +// according to the operations, then let TableReader read and check all the +// key-value pairs from the generated SST file. +DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + std::string sstfile; + { + auto fs = FileSystem::Default(); + std::string dir; + IOOptions opt; + CHECK_OK(fs->GetTestDirectory(opt, &dir, nullptr)); + sstfile = dir + "/SstFileWriterFuzzer.sst"; + } + + Options options; + EnvOptions env_options(options); + ImmutableCFOptions cf_ioptions(options); + + // Generate sst file. + SstFileWriter writer(env_options, options); + CHECK_OK(writer.Open(sstfile)); + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK(writer.Put(op.key(), op.value())); + break; + } + case OpType::MERGE: { + CHECK_OK(writer.Merge(op.key(), op.value())); + break; + } + case OpType::DELETE: { + CHECK_OK(writer.Delete(op.key())); + break; + } + case OpType::DELETE_RANGE: { + CHECK_OK(writer.DeleteRange(op.key(), op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()) + << std::endl; + abort(); + } + } + } + ExternalSstFileInfo info; + CHECK_OK(writer.Finish(&info)); + + // Iterate and verify key-value pairs. 
+ std::unique_ptr table_reader( + NewTableReader(sstfile, options, env_options, cf_ioptions)); + ReadOptions roptions; + CHECK_OK(table_reader->VerifyChecksum(roptions, + TableReaderCaller::kUncategorized)); + std::unique_ptr it( + table_reader->NewIterator(roptions, /*prefix_extractor=*/nullptr, + /*arena=*/nullptr, /*skip_filters=*/true, + TableReaderCaller::kUncategorized)); + it->SeekToFirst(); + for (const DBOperation& op : input.operations()) { + if (op.type() == OpType::DELETE_RANGE) { + // InternalIterator cannot iterate over DELETE_RANGE entries. + continue; + } + CHECK_TRUE(it->Valid()); + ParsedInternalKey ikey; + CHECK_OK(ParseInternalKey(it->key(), &ikey, /*log_err_key=*/true)); + CHECK_EQ(ikey.user_key.ToString(), op.key()); + CHECK_EQ(ikey.sequence, 0); + CHECK_EQ(ikey.type, ToValueType(op.type())); + if (op.type() != OpType::DELETE) { + CHECK_EQ(op.value(), it->value().ToString()); + } + it->Next(); + } + CHECK_TRUE(!it->Valid()); + + // Delete sst file. + remove(sstfile.c_str()); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/util.h mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/util.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/util.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,23 @@ +#pragma once + +#define CHECK_OK(expression) \ + do { \ + auto status = (expression); \ + if (!status.ok()) { \ + std::cerr << status.ToString() << std::endl; \ + abort(); \ + } \ + } while (0) + +#define CHECK_EQ(a, b) \ + if (a != b) { \ + std::cerr << "(" << #a << "=" << a << ") != (" << #b << "=" << b << ")" \ + << std::endl; \ + abort(); \ + } + +#define CHECK_TRUE(cond) \ + if (!(cond)) { \ + std::cerr << "\"" << #cond << "\" is false" << std::endl; \ + abort(); \ + } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/hdfs/env_hdfs.h mariadb-10.11.13/storage/rocksdb/rocksdb/hdfs/env_hdfs.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/hdfs/env_hdfs.h 
2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/hdfs/env_hdfs.h 2025-05-19 16:14:27.000000000 +0000 @@ -48,6 +48,10 @@ posixEnv = Env::Default(); fileSys_ = connectToPath(fsname_); } + static const char* kClassName() { return "HdfsEnv"; } + const char* Name() const override { return kClassName(); } + static const char* kNickName() { return "hdfs"; } + const char* NickName() const override { return kNickName(); } virtual ~HdfsEnv() { fprintf(stderr, "Destroying HdfsEnv::Default()\n"); @@ -101,6 +105,8 @@ Status NewLogger(const std::string& fname, std::shared_ptr* result) override; + Status IsDirectory(const std::string& path, bool* is_dir) override; + void Schedule(void (*function)(void* arg), void* arg, Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = 0) override { @@ -160,10 +166,7 @@ return posixEnv->TimeToString(number); } - static uint64_t gettid() { - assert(sizeof(pthread_t) <= sizeof(uint64_t)); - return (uint64_t)pthread_self(); - } + static uint64_t gettid() { return Env::Default()->GetThreadID(); } uint64_t GetThreadID() const override { return HdfsEnv::gettid(); } @@ -207,8 +210,7 @@ std::string portStr = (rem == 0 ? 
remaining : remaining.substr(0, rem)); - tPort port; - port = atoi(portStr.c_str()); + tPort port = static_cast(atoi(portStr.c_str())); if (port == 0) { throw HdfsFatalException("Bad host-port for hdfs " + uri); } @@ -236,8 +238,6 @@ namespace ROCKSDB_NAMESPACE { -static const Status notsup; - class HdfsEnv : public Env { public: @@ -246,6 +246,10 @@ fprintf(stderr, "Please see hdfs/README for details\n"); abort(); } + static const char* kClassName() { return "HdfsEnv"; } + const char* Name() const override { return kClassName(); } + static const char* kNickName() { return "hdfs"; } + const char* NickName() const override { return kNickName(); } virtual ~HdfsEnv() { } @@ -258,75 +262,81 @@ const std::string& /*fname*/, std::unique_ptr* /*result*/, const EnvOptions& /*options*/) override { - return notsup; + return Status::NotSupported(); } virtual Status NewWritableFile(const std::string& /*fname*/, std::unique_ptr* /*result*/, const EnvOptions& /*options*/) override { - return notsup; + return Status::NotSupported(); } virtual Status NewDirectory(const std::string& /*name*/, std::unique_ptr* /*result*/) override { - return notsup; + return Status::NotSupported(); } virtual Status FileExists(const std::string& /*fname*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetChildren(const std::string& /*path*/, std::vector* /*result*/) override { - return notsup; + return Status::NotSupported(); } virtual Status DeleteFile(const std::string& /*fname*/) override { - return notsup; + return Status::NotSupported(); } virtual Status CreateDir(const std::string& /*name*/) override { - return notsup; + return Status::NotSupported(); } virtual Status CreateDirIfMissing(const std::string& /*name*/) override { - return notsup; + return Status::NotSupported(); } virtual Status DeleteDir(const std::string& /*name*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetFileSize(const std::string& /*fname*/, uint64_t* 
/*size*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetFileModificationTime(const std::string& /*fname*/, uint64_t* /*time*/) override { - return notsup; + return Status::NotSupported(); } virtual Status RenameFile(const std::string& /*src*/, const std::string& /*target*/) override { - return notsup; + return Status::NotSupported(); } virtual Status LinkFile(const std::string& /*src*/, const std::string& /*target*/) override { - return notsup; + return Status::NotSupported(); } virtual Status LockFile(const std::string& /*fname*/, FileLock** /*lock*/) override { - return notsup; + return Status::NotSupported(); } - virtual Status UnlockFile(FileLock* /*lock*/) override { return notsup; } + virtual Status UnlockFile(FileLock* /*lock*/) override { + return Status::NotSupported(); + } virtual Status NewLogger(const std::string& /*fname*/, std::shared_ptr* /*result*/) override { - return notsup; + return Status::NotSupported(); + } + + Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) override { + return Status::NotSupported(); } virtual void Schedule(void (* /*function*/)(void* arg), void* /*arg*/, @@ -346,7 +356,7 @@ } virtual Status GetTestDirectory(std::string* /*path*/) override { - return notsup; + return Status::NotSupported(); } virtual uint64_t NowMicros() override { return 0; } @@ -354,16 +364,16 @@ virtual void SleepForMicroseconds(int /*micros*/) override {} virtual Status GetHostName(char* /*name*/, uint64_t /*len*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetCurrentTime(int64_t* /*unix_time*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetAbsolutePath(const std::string& /*db_path*/, std::string* /*outputpath*/) override { - return notsup; + return Status::NotSupported(); } virtual void SetBackgroundThreads(int /*number*/, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include +#include "rocksdb/compression_type.h" #include "rocksdb/memtablerep.h" #include "rocksdb/universal_compaction.h" @@ -17,7 +18,6 @@ class Slice; class SliceTransform; -enum CompressionType : unsigned char; class TablePropertiesCollectorFactory; class TableFactory; struct Options; @@ -70,6 +70,10 @@ // Default: false; bool allow_compaction = false; + // When not 0, if the data in the file is older than this threshold, RocksDB + // will soon move the file to warm temperature. + uint64_t age_for_warm = 0; + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) : max_table_files_size(_max_table_files_size), @@ -101,9 +105,14 @@ // // When compression dictionary is disabled, we compress and write each block // before buffering data for the next one. When compression dictionary is - // enabled, we buffer all SST file data in-memory so we can sample it, as data + // enabled, we buffer SST file data in-memory so we can sample it, as data // can only be compressed and written after the dictionary has been finalized. - // So users of this feature may see increased memory usage. + // + // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This + // buffered memory is charged to the block cache when there is a block cache. + // If block cache insertion fails with `Status::Incomplete` (i.e., it is + // full), we finalize the dictionary with whatever data we have and then stop + // buffering. // // Default: 0. uint32_t max_dict_bytes; @@ -117,6 +126,21 @@ // Default: 0. 
uint32_t zstd_max_train_bytes; + // Number of threads for parallel compression. + // Parallel compression is enabled only if threads > 1. + // THE FEATURE IS STILL EXPERIMENTAL + // + // This option is valid only when BlockBasedTable is used. + // + // When parallel compression is enabled, SST size file sizes might be + // more inflated compared to the target size, because more data of unknown + // compressed size is in flight when compression is parallelized. To be + // reasonably accurate, this inflation is also estimated by using historical + // compression ratio and current bytes inflight. + // + // Default: 1. + uint32_t parallel_threads; + // When the compression options are set by the user, it will be set to "true". // For bottommost_compression_opts, to enable it, user must set enabled=true. // Otherwise, bottommost compression will use compression_opts as default @@ -128,21 +152,67 @@ // Default: false. bool enabled; + // Limit on data buffering when gathering samples to build a dictionary. Zero + // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), + // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. + // + // In compaction, the buffering is limited to the target file size (see + // `target_file_size_base` and `target_file_size_multiplier`) even if this + // setting permits more buffering. Since we cannot determine where the file + // should be cut until data blocks are compressed with dictionary, buffering + // more than the target file size could lead to selecting samples that belong + // to a later output SST. + // + // Limiting too strictly may harm dictionary effectiveness since it forces + // RocksDB to pick samples from the initial portion of the output SST, which + // may not be representative of the whole file. Configuring this limit below + // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can + // pass to the dictionary trainer. 
Configuring it below `max_dict_bytes` can + // restrict the size of the final dictionary. + // + // Default: 0 (unlimited) + uint64_t max_dict_buffer_bytes; + CompressionOptions() : window_bits(-14), level(kDefaultCompressionLevel), strategy(0), max_dict_bytes(0), zstd_max_train_bytes(0), - enabled(false) {} - CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes, - int _zstd_max_train_bytes, bool _enabled) + parallel_threads(1), + enabled(false), + max_dict_buffer_bytes(0) {} + CompressionOptions(int wbits, int _lev, int _strategy, + uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes, + uint32_t _parallel_threads, bool _enabled, + uint64_t _max_dict_buffer_bytes) : window_bits(wbits), level(_lev), strategy(_strategy), max_dict_bytes(_max_dict_bytes), zstd_max_train_bytes(_zstd_max_train_bytes), - enabled(_enabled) {} + parallel_threads(_parallel_threads), + enabled(_enabled), + max_dict_buffer_bytes(_max_dict_buffer_bytes) {} +}; + +// Temperature of a file. Used to pass to FileSystem for a different +// placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. +enum class Temperature : uint8_t { + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, +}; + +// The control option of how the cache tiers will be used. Currently rocksdb +// support block cahe (volatile tier), secondary cache (non-volatile tier). +// In the future, we may add more caching layers. +enum class CacheTier : uint8_t { + kVolatileTier = 0, + kNonVolatileBlockTier = 0x01, }; enum UpdateStatus { // Return status For inplace update callback @@ -183,17 +253,32 @@ // ignored. int max_write_buffer_number_to_maintain = 0; - // The total maximum size(bytes) of write buffers to maintain in memory - // including copies of buffers that have already been flushed. This parameter - // only affects trimming of flushed buffers and does not affect flushing. 
- // This controls the maximum amount of write history that will be available - // in memory for conflict checking when Transactions are used. The actual - // size of write history (flushed Memtables) might be higher than this limit - // if further trimming will reduce write history total size below this - // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB, - // and there are three flushed Memtables, with sizes of 32MB, 20MB, 20MB. - // Because trimming the next Memtable of size 20MB will reduce total memory - // usage to 52MB which is below the limit, RocksDB will stop trimming. + // The target number of write history bytes to hold in memory. Write history + // comprises the latest write buffers (memtables). To reach the target, write + // buffers that were most recently flushed to SST files may be retained in + // memory. + // + // This controls the target amount of write history that will be available + // in memory for conflict checking when Transactions are used. + // + // This target may be undershot when the CF first opens and has not recovered + // or received enough writes to reach the target. After reaching the target + // once, it is guaranteed to never undershoot again. That guarantee is + // implemented by retaining flushed write buffers in-memory until the oldest + // one can be trimmed without dropping below the target. + // + // Examples with `max_write_buffer_size_to_maintain` set to 32MB: + // + // - One mutable memtable of 64MB, one unflushed immutable memtable of 64MB, + // and zero flushed immutable memtables. Nothing trimmable exists. + // - One mutable memtable of 16MB, zero unflushed immutable memtables, and + // one flushed immutable memtable of 64MB. Trimming is disallowed because + // dropping the earliest (only) flushed immutable memtable would result in + // write history of 16MB < 32MB. + // - One mutable memtable of 24MB, one unflushed immutable memtable of 16MB, + // and one flushed immutable memtable of 16MB. 
The earliest (only) flushed + // immutable memtable is trimmed because without it we still have + // 16MB + 24MB = 40MB > 32MB of write history. // // When using an OptimisticTransactionDB: // If this value is too low, some transactions may fail at commit time due @@ -219,6 +304,7 @@ // achieve point-in-time consistency using snapshot or iterator (assuming // concurrent updates). Hence iterator and multi-get will return results // which are not consistent as of any point-in-time. + // Backward iteration on memtables will not work either. // If inplace_callback function is not set, // Put(key, new_value) will update inplace the existing_value iff // * key exists in current memtable @@ -241,45 +327,55 @@ // delta_value - Delta value to be merged with the existing_value. // Stored in transaction logs. // merged_value - Set when delta is applied on the previous value. - + // // Applicable only when inplace_update_support is true, // this callback function is called at the time of updating the memtable // as part of a Put operation, lets say Put(key, delta_value). It allows the // 'delta_value' specified as part of the Put operation to be merged with // an 'existing_value' of the key in the database. - + // // If the merged value is smaller in size that the 'existing_value', // then this function can update the 'existing_value' buffer inplace and // the corresponding 'existing_value'_size pointer, if it wishes to. // The callback should return UpdateStatus::UPDATED_INPLACE. // In this case. (In this case, the snapshot-semantics of the rocksdb // Iterator is not atomic anymore). - + // // If the merged value is larger in size than the 'existing_value' or the // application does not wish to modify the 'existing_value' buffer inplace, // then the merged value should be returned via *merge_value. It is set by // merging the 'existing_value' and the Put 'delta_value'. The callback should // return UpdateStatus::UPDATED in this case. 
This merged value will be added // to the memtable. - + // // If merging fails or the application does not wish to take any action, // then the callback should return UpdateStatus::UPDATE_FAILED. - + // // Please remember that the original call from the application is Put(key, // delta_value). So the transaction log (if enabled) will still contain (key, // delta_value). The 'merged_value' is not stored in the transaction log. // Hence the inplace_callback function should be consistent across db reopens. - + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + // // Default: nullptr UpdateStatus (*inplace_callback)(char* existing_value, uint32_t* existing_value_size, Slice delta_value, std::string* merged_value) = nullptr; - // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, - // create prefix bloom for memtable with the size of + // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic + // Bloom filter in memtable to optimize many queries that must go beyond + // the memtable. The size in bytes of the filter is // write_buffer_size * memtable_prefix_bloom_size_ratio. - // If it is larger than 0.25, it is sanitized to 0.25. + // * If prefix_extractor is set, the filter includes prefixes. + // * If memtable_whole_key_filtering, the filter includes whole keys. + // * If both, the filter includes both. + // * If neither, the feature is disabled. + // + // If this value is larger than 0.25, it is sanitized to 0.25. // // Default: 0 (disable) // @@ -338,7 +434,8 @@ // size of one block in arena memory allocation. // If <= 0, a proper value is automatically calculated (usually 1/8 of - // writer_buffer_size, rounded up to a multiple of 4KB). + // writer_buffer_size, rounded up to a multiple of 4KB, or 1MB which ever is + // smaller). 
// // There are two additional restriction of the specified size: // (1) size should be in the range of [4096, 2 << 30] and @@ -591,8 +688,8 @@ // the tables. // Default: empty vector -- no user-defined statistics collection will be // performed. - typedef std::vector> - TablePropertiesCollectorFactories; + using TablePropertiesCollectorFactories = + std::vector>; TablePropertiesCollectorFactories table_properties_collector_factories; // Maximum number of successive merge operations on a key in the memtable. @@ -624,18 +721,32 @@ // Default: false bool optimize_filters_for_hits = false; + // During flush or compaction, check whether keys inserted to output files + // are in order. + // + // Default: true + // + // Dynamically changeable through SetOptions() API + bool check_flush_compaction_key_order = true; + // After writing every SST file, reopen it and read all the keys. + // Checks the hash of all of the keys and values written versus the + // keys in the file and signals a corruption if they do not match // // Default: false // // Dynamically changeable through SetOptions() API bool paranoid_file_checks = false; - // In debug mode, RocksDB run consistency checks on the LSM every time the LSM - // change (Flush, Compaction, AddFile). These checks are disabled in release - // mode, use this option to enable them in release mode as well. - // Default: false - bool force_consistency_checks = false; + // In debug mode, RocksDB runs consistency checks on the LSM every time the + // LSM changes (Flush, Compaction, AddFile). When this option is true, these + // checks are also enabled in release mode. These checks were historically + // disabled in release mode, but are now enabled by default for proactive + // corruption detection. The CPU overhead is negligible for normal mixed + // operations but can slow down saturated writing. See + // Options::DisableExtraChecks(). 
+ // Default: true + bool force_consistency_checks = true; // Measure IO stats in compactions and flushes, if true. // @@ -644,10 +755,14 @@ // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; - // Files older than TTL will go through the compaction process. + // Files containing updates older than TTL will go through the compaction + // process. This usually happens in a cascading way so that those entries + // will be compacted to bottommost level/file. + // The feature is used to remove stale entries that have been deleted or + // updated from the file system. // Pre-req: This needs max_open_files to be set to -1. // In Level: Non-bottom-level files older than TTL will go through the - // compation process. + // compaction process. // In FIFO: Files older than TTL will be deleted. // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 // In FIFO, this option will have the same meaning as @@ -664,6 +779,9 @@ // Files older than this value will be picked up for compaction, and // re-written to the same level as they were before. + // One main use of the feature is to make sure a file goes through compaction + // filters periodically. Users can also use the feature to clear up SST + // files using old format. // // A file's age is computed by looking at file_creation_time or creation_time // table properties in order, if they have valid non-zero values; if not, the @@ -697,6 +815,100 @@ // data is left uncompressed (unless compression is also requested). uint64_t sample_for_compression = 0; + // EXPERIMENTAL + // The feature is still in development and is incomplete. + // If this option is set, when creating bottommost files, pass this + // temperature to FileSystem used. Should be no-op for default FileSystem + // and users need to plug in their own FileSystem to take advantage of it. 
+ Temperature bottommost_temperature = Temperature::kUnknown; + + // When set, large values (blobs) are written to separate blob files, and + // only pointers to them are stored in SST files. This can reduce write + // amplification for large-value use cases at the cost of introducing a level + // of indirection for reads. See also the options min_blob_size, + // blob_file_size, blob_compression_type, enable_blob_garbage_collection, + // blob_garbage_collection_age_cutoff, + // blob_garbage_collection_force_threshold, and blob_compaction_readahead_size + // below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_files = false; + + // The size of the smallest value to be stored separately in a blob file. + // Values which have an uncompressed size smaller than this threshold are + // stored alongside the keys in SST files in the usual fashion. A value of + // zero for this option means that all values are stored in blob files. Note + // that enable_blob_files has to be set in order for this option to have any + // effect. + // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + uint64_t min_blob_size = 0; + + // The size limit for blob files. When writing blob files, a new file is + // opened once this limit is reached. Note that enable_blob_files has to be + // set in order for this option to have any effect. + // + // Default: 256 MB + // + // Dynamically changeable through the SetOptions() API + uint64_t blob_file_size = 1ULL << 28; + + // The compression algorithm to use for large values stored in blob files. + // Note that enable_blob_files has to be set in order for this option to have + // any effect. + // + // Default: no compression + // + // Dynamically changeable through the SetOptions() API + CompressionType blob_compression_type = kNoCompression; + + // Enables garbage collection of blobs. Blob GC is performed as part of + // compaction. 
Valid blobs residing in blob files older than a cutoff get + // relocated to new files as they are encountered during compaction, which + // makes it possible to clean up blob files once they contain nothing but + // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff and + // blob_garbage_collection_force_threshold below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_garbage_collection = false; + + // The cutoff in terms of blob file age for garbage collection. Blobs in + // the oldest N blob files will be relocated when encountered during + // compaction, where N = garbage_collection_cutoff * number_of_blob_files. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 0.25 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_age_cutoff = 0.25; + + // If the ratio of garbage in the oldest blob files exceeds this threshold, + // targeted compactions are scheduled in order to force garbage collecting + // the blob files in question, assuming they are all eligible based on the + // value of blob_garbage_collection_age_cutoff above. This option is + // currently only supported with leveled compactions. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 1.0 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_force_threshold = 1.0; + + // Compaction readahead for blob files. 
+ // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + uint64_t blob_compaction_readahead_size = 0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/c.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/c.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/c.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/c.h 2025-05-19 16:14:27.000000000 +0000 @@ -71,8 +71,11 @@ typedef struct rocksdb_t rocksdb_t; typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_backupable_db_options_t rocksdb_backupable_db_options_t; typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; -typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t; +typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t rocksdb_compactionfiltercontext_t; @@ -136,7 +139,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, - unsigned char error_if_log_file_exist, char** errptr); + unsigned char error_if_wal_file_exists, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( const rocksdb_options_t* options, const char* name, @@ -145,6 +148,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* +rocksdb_backup_engine_open_opts(const 
rocksdb_backupable_db_options_t* options, + rocksdb_env_t* env, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); @@ -156,7 +163,7 @@ rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* -rocksdb_restore_options_create(); +rocksdb_restore_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( rocksdb_restore_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( @@ -171,6 +178,11 @@ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, const rocksdb_restore_options_t* restore_options, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr); + extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be); @@ -198,6 +210,100 @@ extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( rocksdb_backup_engine_t* be); +/* BackupableDBOptions */ + +extern ROCKSDB_LIBRARY_API rocksdb_backupable_db_options_t* +rocksdb_backupable_db_options_create(const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options); 
+ +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_callback_trigger_interval_size( + 
rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t*); + +/* Checkpoint */ + extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); @@ -214,13 +320,20 @@ const rocksdb_options_t* const* column_family_options, rocksdb_column_family_handle_t** column_family_handles, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families_with_ttl( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, const int* ttls, + char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only_column_families( const rocksdb_options_t* options, const char* name, int num_column_families, const char* const* column_family_names, const rocksdb_options_t* const* column_family_options, rocksdb_column_family_handle_t** column_family_handles, - unsigned char error_if_log_file_exist, char** errptr); + unsigned char error_if_wal_file_exists, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( const rocksdb_options_t* options, const char* name, @@ -241,6 +354,11 @@ const rocksdb_options_t* 
column_family_options, const char* column_family_name, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family_with_ttl( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, int ttl, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family( rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr); @@ -320,6 +438,21 @@ const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs); +// The value is only allocated (using malloc) and returned if it is found and +// value_found isn't NULL. In that case the user is responsible for freeing it. +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + +// The value is only allocated (using malloc) and returned if it is found and +// value_found isn't NULL. In that case the user is responsible for freeing it. 
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator( rocksdb_t* db, const rocksdb_readoptions_t* options); @@ -365,13 +498,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( rocksdb_t* db, rocksdb_column_family_handle_t* column_family, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, const char* start_key, @@ -406,6 +539,10 @@ rocksdb_t* db, const rocksdb_flushoptions_t* options, rocksdb_column_family_handle_t* column_family, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, + unsigned char sync, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr); @@ -451,7 +588,8 @@ /* Write batch */ -extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create( + void); extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( const char* rep, size_t size); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( @@ -495,9 +633,14 @@ extern 
ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete( + rocksdb_writebatch_t* b, const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf( rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev( rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); @@ -583,9 +726,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf( rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev( rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); @@ -670,7 +818,7 @@ /* Block based table options */ extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* -rocksdb_block_based_options_create(); +rocksdb_block_based_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( rocksdb_block_based_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( @@ -745,7 +893,7 @@ /* Cuckoo table options */ extern ROCKSDB_LIBRARY_API 
rocksdb_cuckoo_table_options_t* -rocksdb_cuckoo_options_create(); +rocksdb_cuckoo_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( rocksdb_cuckoo_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( @@ -769,8 +917,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr); -extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism( rocksdb_options_t* opt, int total_threads); extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup( @@ -783,12 +933,16 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( rocksdb_options_t*, rocksdb_compactionfilter_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( rocksdb_options_t*, rocksdb_compactionfilterfactory_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_compaction_readahead_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator( rocksdb_options_t*, rocksdb_comparator_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator( @@ -796,16 +950,24 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator( 
rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level( - rocksdb_options_t* opt, int* level_values, size_t num_levels); + rocksdb_options_t* opt, const int* level_values, size_t num_levels); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_create_if_missing( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_missing_column_families(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_create_missing_column_families(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_error_if_exists( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths); @@ -815,41 +977,98 @@ rocksdb_logger_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_write_buffer_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_db_write_buffer_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int 
rocksdb_options_get_max_open_files( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_file_opening_threads( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size( rocksdb_options_t* opt, uint64_t n); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( rocksdb_options_t*, int, int, int, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int, + int, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t*, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( rocksdb_options_t*, rocksdb_slicetransform_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( rocksdb_options_t*, int); +extern 
ROCKSDB_LIBRARY_API int rocksdb_options_get_num_levels( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_file_num_compaction_trigger(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_level0_stop_writes_trigger( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_mem_compaction_level( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_target_file_size_base(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_target_file_size_multiplier( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_level_compaction_dynamic_level_bytes(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_max_bytes_for_level_multiplier(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t*, int* level_values, size_t num_levels); @@ -858,9 +1077,56 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt); + +/* Blob Options Settings */ +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_min_blob_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_file_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( + rocksdb_options_t* opt); + +extern 
ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_compaction_readahead_size(rocksdb_options_t* opt); /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( @@ -868,122 +1134,222 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_write_buffer_number( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, int64_t); +extern ROCKSDB_LIBRARY_API int64_t +rocksdb_options_get_max_write_buffer_size_to_maintain(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_pipelined_write(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_unordered_write( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( 
rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_max_subcompactions(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_jobs( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_compactions( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_base_background_compactions( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_base_background_compactions( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_flushes( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_log_file_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_keep_log_file_num(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit( rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_soft_rate_limit( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_hard_rate_limit( rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_hard_rate_limit( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_pending_compaction_bytes_limit( rocksdb_options_t* opt, size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_pending_compaction_bytes_limit( rocksdb_options_t* opt, size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_rate_limit_delay_max_milliseconds(rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_rate_limit_delay_max_milliseconds(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_manifest_file_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_table_cache_numshardbits( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_remove_scan_count_limit(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_arena_block_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_use_fsync( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir( rocksdb_options_t*, const char*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*, const char*); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_WAL_ttl_seconds( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_reads( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_writes( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_direct_reads( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_use_direct_io_for_flush_and_compaction(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char 
+rocksdb_options_get_skip_log_error_on_recovery(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec( rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_persist_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_persist_period_sec(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_advise_random_on_open(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_bytes_per_sync(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_writable_file_max_buffer_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char 
+rocksdb_options_get_allow_concurrent_memtable_write(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_write_thread_adaptive_yield(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_sequential_skip_in_iterations(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_disable_auto_compactions(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_optimize_filters_for_hits(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_delete_obsolete_files_period_micros(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load( rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep( rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_size_ratio( rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_compaction_bytes(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep( rocksdb_options_t*, size_t, int32_t, int32_t); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_hash_link_list_rep( @@ -996,17 +1362,29 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_successive_merges(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality( rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_bloom_locality(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_inplace_update_support(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_report_bg_io_stats( + rocksdb_options_t*); enum { rocksdb_tolerate_corrupted_tail_records_recovery = 0, @@ -1016,6 +1394,8 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_recovery_mode( + rocksdb_options_t*); enum { rocksdb_no_compression = 0, @@ -1029,6 +1409,12 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compression( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_bottommost_compression( + 
rocksdb_options_t*); enum { rocksdb_level_compaction = 0, @@ -1037,6 +1423,8 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compaction_style( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_universal_compaction_options( rocksdb_options_t*, rocksdb_universal_compaction_options_t*); @@ -1046,11 +1434,21 @@ rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush( rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush( + rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( rocksdb_options_t* opt, rocksdb_cache_t* cache ); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( + rocksdb_options_t* opt); + /* RateLimiter */ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); @@ -1139,7 +1537,8 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); -extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( rocksdb_perfcontext_t* context); extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( @@ -1211,9 +1610,14 @@ rocksdb_filterpolicy_t*); extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* -rocksdb_filterpolicy_create_bloom(int bits_per_key); +rocksdb_filterpolicy_create_bloom(double bits_per_key); +extern 
ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom_full(double bits_per_key); extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* -rocksdb_filterpolicy_create_bloom_full(int bits_per_key); +rocksdb_filterpolicy_create_ribbon(double bloom_equivalent_bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon_hybrid(double bloom_equivalent_bits_per_key, + int bloom_before_level); /* Merge Operator */ @@ -1237,13 +1641,18 @@ /* Read options */ -extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_verify_checksums(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_fill_cache( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound( @@ -1252,80 +1661,155 @@ rocksdb_readoptions_t*, const char* key, size_t keylen); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_readoptions_get_read_tier( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing( + rocksdb_readoptions_t*); // The functionality that this option controlled has been removed. 
extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( rocksdb_readoptions_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_prefix_same_as_start(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_pin_data( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_max_skippable_internal_keys( rocksdb_readoptions_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_background_purge_on_iterator_cleanup( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout( + 
rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*); /* Write options */ -extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* -rocksdb_writeoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_sync( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL( rocksdb_writeoptions_t* opt, int disable); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_ignore_missing_column_families( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_low_pri( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t*); /* Compact range options */ extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* -rocksdb_compactoptions_create(); +rocksdb_compactoptions_create(void); extern ROCKSDB_LIBRARY_API void 
rocksdb_compactoptions_destroy( rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_exclusive_manual_compaction( rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_bottommost_level_compaction( rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level( rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_change_level(rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level( rocksdb_compactoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level( + rocksdb_compactoptions_t*); /* Flush options */ -extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* -rocksdb_flushoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( rocksdb_flushoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( rocksdb_flushoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait( + rocksdb_flushoptions_t*); + +/* Memory allocator */ + +extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t* +rocksdb_jemalloc_nodump_allocator_create(char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy( + rocksdb_memory_allocator_t*); /* Cache */ +extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t* +rocksdb_lru_cache_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void 
rocksdb_lru_cache_options_set_capacity( + rocksdb_lru_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*); + extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( + rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( rocksdb_cache_t* cache, size_t capacity); extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_capacity(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache); @@ -1337,12 +1821,24 @@ /* Env */ -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(); -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, 
int n); +extern ROCKSDB_LIBRARY_API int +rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); @@ -1352,7 +1848,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); -extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( rocksdb_envoptions_t* opt); @@ -1387,7 +1884,7 @@ rocksdb_sstfilewriter_t* writer); extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* -rocksdb_ingestexternalfileoptions_create(); +rocksdb_ingestexternalfileoptions_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_set_move_files( rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); @@ -1433,7 +1930,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t); extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* -rocksdb_slicetransform_create_noop(); +rocksdb_slicetransform_create_noop(void); extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( rocksdb_slicetransform_t*); @@ -1445,38 +1942,61 @@ }; extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t* -rocksdb_universal_compaction_options_create(); +rocksdb_universal_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_size_ratio( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int 
+rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* -rocksdb_fifo_compaction_options_create(); +rocksdb_fifo_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_set_max_table_files_size( rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts); extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count( const 
rocksdb_livefiles_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name( + const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name( const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level( @@ -1522,7 +2042,7 @@ const rocksdb_transactiondb_options_t* txn_db_options, const char* name, char** errptr); -rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( const rocksdb_options_t* options, const rocksdb_transactiondb_options_t* txn_db_options, const char* name, int num_column_families, const char* const* column_family_names, @@ -1535,6 +2055,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot( rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot); +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value( + rocksdb_transactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int( + rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val); + extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* write_options, @@ -1574,7 +2100,7 @@ const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); -char* rocksdb_transaction_get_for_update_cf( +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); @@ -1692,13 +2218,22 @@ const rocksdb_optimistictransaction_options_t* otxn_options, rocksdb_transaction_t* old_txn); +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* 
otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( rocksdb_optimistictransactiondb_t* otxn_db); +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr); + /* Transaction Options */ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* -rocksdb_transactiondb_options_create(); +rocksdb_transactiondb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( rocksdb_transactiondb_options_t* opt); @@ -1718,7 +2253,7 @@ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* -rocksdb_transaction_options_create(); +rocksdb_transaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( rocksdb_transaction_options_t* opt); @@ -1744,7 +2279,7 @@ rocksdb_transaction_options_t* opt, size_t size); extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* -rocksdb_optimistictransaction_options_create(); +rocksdb_optimistictransaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( rocksdb_optimistictransaction_options_t* opt); @@ -1753,6 +2288,13 @@ rocksdb_optimistictransaction_options_set_set_snapshot( rocksdb_optimistictransaction_options_t* opt, unsigned char v); +extern ROCKSDB_LIBRARY_API char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val); + // referring to convention (3), this should be used by client // to free memory that was malloc()ed extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); @@ -1770,7 
+2312,7 @@ const rocksdb_pinnableslice_t* t, size_t* vlen); extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* - rocksdb_memory_consumers_create(); +rocksdb_memory_consumers_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( rocksdb_memory_consumers_t* consumers, rocksdb_t* db); extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( @@ -1796,6 +2338,16 @@ rocksdb_approximate_memory_usage_get_cache_total( rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_dump_malloc_stats( + rocksdb_options_t*, unsigned char); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t*, + unsigned char); + +extern ROCKSDB_LIBRARY_API void rocksdb_cancel_all_background_work( + rocksdb_t* db, unsigned char wait); + #ifdef __cplusplus } /* end extern "C" */ #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,9 +22,11 @@ #pragma once -#include +#include +#include #include #include + #include "rocksdb/memory_allocator.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" @@ -33,6 +35,8 @@ namespace ROCKSDB_NAMESPACE { class Cache; +struct ConfigOptions; +class SecondaryCache; extern const bool kDefaultToAdaptiveMutex; @@ -58,10 +62,10 @@ // Percentage of cache reserved for high priority entries. // If greater than zero, the LRU list will be split into a high-pri - // list and a low-pri list. High-pri entries will be insert to the + // list and a low-pri list. High-pri entries will be inserted to the // tail of high-pri list, while low-pri entries will be first inserted to - // the low-pri list (the midpoint). 
This is refered to as - // midpoint insertion strategy to make entries never get hit in cache + // the low-pri list (the midpoint). This is referred to as + // midpoint insertion strategy to make entries that never get hit in cache // age out faster. // // See also @@ -86,6 +90,9 @@ CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy; + // A SecondaryCache instance to use a the non-volatile tier + std::shared_ptr secondary_cache; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, @@ -125,23 +132,104 @@ // more detail. // // Return nullptr if it is not supported. +// +// BROKEN: ClockCache is known to have bugs that could lead to crash or +// corruption, so should not be used until fixed. Use NewLRUCache instead. extern std::shared_ptr NewClockCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy); + class Cache { public: // Depending on implementation, cache entries with high priority could be less // likely to get evicted than low priority entries. enum class Priority { HIGH, LOW }; + // A set of callbacks to allow objects in the primary block cache to be + // be persisted in a secondary cache. The purpose of the secondary cache + // is to support other ways of caching the object, such as persistent or + // compressed data, that may require the object to be parsed and transformed + // in some way. Since the primary cache holds C++ objects and the secondary + // cache may only hold flat data that doesn't need relocation, these + // callbacks need to be provided by the user of the block + // cache to do the conversion. + // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers + // to callback functions for size, saving and deletion of the + // object. 
The callbacks are defined in C-style in order to make them + // stateless and not add to the cache metadata size. + // Saving multiple std::function objects will take up 32 bytes per + // function, even if its not bound to an object and does no capture. + // + // All the callbacks are C-style function pointers in order to simplify + // lifecycle management. Objects in the cache can outlive the parent DB, + // so anything required for these operations should be contained in the + // object itself. + // + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. 
+ struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + using CreateCallback = std::function; + Cache(std::shared_ptr allocator = nullptr) : memory_allocator_(std::move(allocator)) {} // No copying allowed Cache(const Cache&) = delete; Cache& operator=(const Cache&) = delete; + // Creates a new Cache based on the input value string and returns the result. + // Currently, this method can be used to create LRUCaches only + // @param config_options + // @param value The value might be: + // - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*102( + // - Name-value option pairs -- "capacity=1M; num_shard_bits=4; + // For the LRUCache, the values are defined in LRUCacheOptions. + // @param result The new Cache object + // @return OK if the cache was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + // Destroys all existing entries by calling the "deleter" // function that was passed via the Insert() function. // @@ -154,8 +242,8 @@ // The type of the Cache virtual const char* Name() const = 0; - // Insert a mapping from key->value into the cache and assign it - // the specified charge against the total cache capacity. 
+ // Insert a mapping from key->value into the volatile cache only + // and assign it // the specified charge against the total cache capacity. // If strict_capacity_limit is true and cache reaches its full capacity, // return Status::Incomplete. // @@ -168,10 +256,11 @@ // insert. In case of error value will be cleanup. // // When the inserted entry is no longer needed, the key and - // value will be passed to "deleter". + // value will be passed to "deleter" which must delete the value. + // (The Cache is responsible for copying and reclaiming space for + // the key.) virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, + DeleterFn deleter, Handle** handle = nullptr, Priority priority = Priority::LOW) = 0; // If the cache has no mapping for "key", returns nullptr. @@ -248,6 +337,12 @@ // returns the charge for the specific entry in the cache. virtual size_t GetCharge(Handle* handle) const = 0; + // Returns the deleter for the specified entry. This might seem useless + // as the Cache itself is responsible for calling the deleter, but + // the deleter can essentially verify that a cache entry is of an + // expected type from an expected code source. + virtual DeleterFn GetDeleter(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. @@ -257,11 +352,33 @@ // default implementation is noop } - // Apply callback to all entries in the cache - // If thread_safe is true, it will also lock the accesses. 
Otherwise, it will - // access the cache without the lock held - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // DEPRECATED version of above. (Default implementation uses above.) + virtual void ApplyToAllCacheEntries(void (*callback)(void* value, + size_t charge), + bool /*thread_safe*/) { + ApplyToAllEntries([callback](const Slice&, void* value, size_t charge, + DeleterFn) { callback(value, charge); }, + {}); + } // Remove all entries. // Prerequisite: no entry is referenced. @@ -271,6 +388,108 @@ MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + // EXPERIMENTAL + // The following APIs are experimental and might change in the future. + // The Insert and Lookup APIs below are intended to allow cached objects + // to be demoted/promoted between the primary block cache and a secondary + // cache. The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation more suitable + // for on disk storage. They rely on a per object CacheItemHelper to do + // the conversions. 
+ // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::Incomplete. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or promoted to the secondary cache. It, + // therefore, must outlive the cache. + // + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. + // + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". 
+ virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) { + if (!helper) { + return Status::InvalidArgument(); + } + return Insert(key, value, charge, helper->del_cb, handle, priority); + } + + // Lookup the key in the primary and secondary caches (if one is configured). + // The create_cb callback function object will be used to contruct the + // cached object. + // If none of the caches have the mapping for the key, returns nullptr. + // Else, returns a handle that corresponds to the mapping. + // + // This call may promote the object from the secondary cache (if one is + // configured, and has the given key) to the primary cache. + // + // The helper argument should be provided if the caller wants the lookup + // to include the secondary cache (if one is configured) and the object, + // if it exists, to be promoted to the primary cache. The helper may be + // saved and used later when the object is evicted. Therefore, it must + // outlive the cache. + // + // The handle returned may not be ready. The caller should call IsReady() + // to check if the item value is ready, and call Wait() or WaitAll() if + // its not ready. The caller should then call Value() to check if the + // item was successfully retrieved. If unsuccessful (perhaps due to an + // IO error), Value() will return nullptr. + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/, + const CreateCallback& /*create_cb*/, + Priority /*priority*/, bool /*wait*/, + Statistics* stats = nullptr) { + return Lookup(key, stats); + } + + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. 
+ virtual bool Release(Handle* handle, bool /*useful*/, bool force_erase) { + return Release(handle, force_erase); + } + + // Determines if the handle returned by Lookup() has a valid value yet. The + // call is not thread safe and should be called only by someone holding a + // reference to the handle. + virtual bool IsReady(Handle* /*handle*/) { return true; } + + // If the handle returned by Lookup() is not ready yet, wait till it + // becomes ready. + // Note: A ready handle doesn't necessarily mean it has a valid value. The + // user should call Value() and check for nullptr. + virtual void Wait(Handle* /*handle*/) {} + + // Wait for a vector of handles to become ready. As with Wait(), the user + // should check the Value() of each handle for nullptr. This call is not + // thread safe and should only be called by the caller holding a reference + // to each of the handles. + virtual void WaitAll(std::vector& /*handles*/) {} + private: std::shared_ptr memory_allocator_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,14 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +int cache_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,7 +30,7 @@ // // Note that unlike all of the preceding methods, this method is // not abstract and therefore clients should not override it. - typedef void (*CleanupFunction)(void* arg1, void* arg2); + using CleanupFunction = void (*)(void* arg1, void* arg2); void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); void DelegateCleanupsTo(Cleanable* other); // DoCleanup and also resets the pointers for reuse diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,26 +13,22 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class Slice; class SliceTransform; -// Context information of a compaction run -struct CompactionFilterContext { - // Does this compaction run include all data files - bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process - bool is_manual_compaction; -}; - -// CompactionFilter allows an application to modify/delete 
a key-value at -// the time of compaction. - -class CompactionFilter { +// CompactionFilter allows an application to modify/delete a key-value during +// table file creation. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilter : public Customizable { public: enum ValueType { kValue, @@ -45,35 +41,44 @@ kRemove, kChangeValue, kRemoveAndSkipUntil, + kChangeBlobIndex, // used internally by BlobDB. + kIOError, // used internally by BlobDB. + kUndetermined, }; enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; - // Context information of a compaction run + // Context information for a table file creation. struct Context { - // Does this compaction run include all data files + // Whether this table file is created as part of a compaction including all + // table files. bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process + // Whether this table file is created as part of a compaction requested by + // the client. bool is_manual_compaction; - // Which column family this compaction is for. + // The column family that will contain the created table file. uint32_t column_family_id; + // Reason this table file is being created. + TableFileCreationReason reason; }; virtual ~CompactionFilter() {} - - // The compaction process invokes this - // method for kv that is being compacted. A return value - // of false indicates that the kv should be preserved in the - // output of this compaction run and a return value of true - // indicates that this key-value should be removed from the - // output of the compaction. The application can inspect - // the existing value of the key and make decision based on it. 
- // - // Key-Values that are results of merge operation during compaction are not - // passed into this function. Currently, when you have a mix of Put()s and - // Merge()s on a same key, we only guarantee to process the merge operands - // through the compaction filters. Put()s might be processed, or might not. + static const char* Type() { return "CompactionFilter"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& name, + const CompactionFilter** result); + + // The table file creation process invokes this method before adding a kv to + // the table file. A return value of false indicates that the kv should be + // preserved in the new table file and a return value of true indicates + // that this key-value should be removed from the new table file. The + // application can inspect the existing value of the key and make decision + // based on it. + // + // Key-Values that are results of merge operation during table file creation + // are not passed into this function. Currently, when you have a mix of Put()s + // and Merge()s on a same key, we only guarantee to process the merge operands + // through the `CompactionFilter`s. Put()s might be processed, or might not. // // When the value is to be preserved, the application has the option // to modify the existing_value and pass it back through new_value. @@ -81,9 +86,10 @@ // // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a // DB* object) will not guarantee to preserve the state of the DB with - // CompactionFilter. Data seen from a snapshot might disppear after a - // compaction finishes. If you use snapshots, think twice about whether you - // want to use compaction filter and whether you are using it in a safe way. + // CompactionFilter. Data seen from a snapshot might disappear after a + // table file created with a `CompactionFilter` is installed. 
If you use + // snapshots, think twice about whether you want to use `CompactionFilter` and + // whether you are using it in a safe way. // // If multithreaded compaction is being used *and* a single CompactionFilter // instance was supplied via Options::compaction_filter, this method may be @@ -91,7 +97,7 @@ // that the call is thread-safe. // // If the CompactionFilter was created by a factory, then it will only ever - // be used by a single thread that is doing the compaction run, and this + // be used by a single thread that is doing the table file creation, and this // call does not need to be thread-safe. However, multiple filters may be // in existence and operating concurrently. virtual bool Filter(int /*level*/, const Slice& /*key*/, @@ -101,9 +107,9 @@ return false; } - // The compaction process invokes this method on every merge operand. If this - // method returns true, the merge operand will be ignored and not written out - // in the compaction output + // The table file creation process invokes this method on every merge operand. + // If this method returns true, the merge operand will be ignored and not + // written out in the new table file. // // Note: If you are using a TransactionDB, it is not recommended to implement // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB @@ -140,14 +146,16 @@ // snapshot - beware if you're using TransactionDB or // DB::GetSnapshot(). // - If value for a key was overwritten or merged into (multiple Put()s - // or Merge()s), and compaction filter skips this key with + // or Merge()s), and `CompactionFilter` skips this key with // kRemoveAndSkipUntil, it's possible that it will remove only // the new value, exposing the old value that was supposed to be // overwritten. // - Doesn't work with PlainTableFactory in prefix mode. - // - If you use kRemoveAndSkipUntil, consider also reducing - // compaction_readahead_size option. 
+ // - If you use kRemoveAndSkipUntil for table files created by + // compaction, consider also reducing compaction_readahead_size + // option. // + // Should never return kUndetermined. // Note: If you are using a TransactionDB, it is not recommended to filter // out or modify merge operands (ValueType::kMergeOperand). // If a merge operation is filtered out, TransactionDB may not realize there @@ -185,28 +193,62 @@ } // This function is deprecated. Snapshots will always be ignored for - // compaction filters, because we realized that not ignoring snapshots doesn't - // provide the gurantee we initially thought it would provide. Repeatable - // reads will not be guaranteed anyway. If you override the function and - // returns false, we will fail the compaction. + // `CompactionFilter`s, because we realized that not ignoring snapshots + // doesn't provide the guarantee we initially thought it would provide. + // Repeatable reads will not be guaranteed anyway. If you override the + // function and returns false, we will fail the table file creation. virtual bool IgnoreSnapshots() const { return true; } - // Returns a name that identifies this compaction filter. + // Returns a name that identifies this `CompactionFilter`. // The name will be printed to LOG file on start up for diagnosis. - virtual const char* Name() const = 0; + const char* Name() const override = 0; + + // Internal (BlobDB) use only. Do not override in application code. + virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; } + + // In the case of BlobDB, it may be possible to reach a decision with only + // the key without reading the actual value. Keys whose value_type is + // kBlobIndex will be checked by this method. + // Returning kUndetermined will cause FilterV2() to be called to make a + // decision as usual. 
+ virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/, + std::string* /*new_value*/, + std::string* /*skip_until*/) const { + return Decision::kUndetermined; + } }; -// Each compaction will create a new CompactionFilter allowing the -// application to know about different compactions -class CompactionFilterFactory { +// Each thread of work involving creating table files will create a new +// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This +// allows the application to know about the different ongoing threads of work +// and makes it unnecessary for `CompactionFilter` to provide thread-safety. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilterFactory : public Customizable { public: virtual ~CompactionFilterFactory() {} + static const char* Type() { return "CompactionFilterFactory"; } + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& name, + std::shared_ptr* result); + + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. + virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) = 0; - // Returns a name that identifies this compaction filter factory. - virtual const char* Name() const = 0; + // Returns a name that identifies this `CompactionFilter` factory. 
+ virtual const char* Name() const override = 0; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,23 +25,33 @@ // the number of compaction input records. uint64_t num_input_records; - // the number of compaction input files. + // the number of blobs read from blob files + uint64_t num_blobs_read; + // the number of compaction input files (table files) size_t num_input_files; - // the number of compaction input files at the output level. + // the number of compaction input files at the output level (table files) size_t num_input_files_at_output_level; // the number of compaction output records. uint64_t num_output_records; - // the number of compaction output files. + // the number of compaction output files (table files) size_t num_output_files; + // the number of compaction output files (blob files) + size_t num_output_files_blob; + // true if the compaction is a full compaction (all live SST files input) + bool is_full_compaction; // true if the compaction is a manual compaction bool is_manual_compaction; - // the size of the compaction input in bytes. + // the total size of table files in the compaction input uint64_t total_input_bytes; - // the size of the compaction output in bytes. + // the total size of blobs read from blob files + uint64_t total_blob_bytes_read; + // the total size of table files in the compaction output uint64_t total_output_bytes; + // the total size of blob files in the compaction output + uint64_t total_output_bytes_blob; // number of records being replaced by newer record associated with same key. 
// this could be a new value or a deletion entry for that key so this field diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/comparator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/comparator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/comparator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/comparator.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { @@ -20,7 +21,11 @@ // used as keys in an sstable or a database. A Comparator implementation // must be thread-safe since rocksdb may invoke its methods concurrently // from multiple threads. -class Comparator { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Comparator : public Customizable { public: Comparator() : timestamp_size_(0) {} @@ -35,13 +40,20 @@ return *this; } - virtual ~Comparator() {} + ~Comparator() override {} + static Status CreateFromString(const ConfigOptions& opts, + const std::string& id, + const Comparator** comp); static const char* Type() { return "Comparator"; } + // Three-way comparison. Returns value: // < 0 iff "a" < "b", // == 0 iff "a" == "b", // > 0 iff "a" > "b" + // Note that Compare(a, b) also compares timestamp if timestamp size is + // non-zero. For the same user key with different timestamps, larger (newer) + // timestamp comes first. virtual int Compare(const Slice& a, const Slice& b) const = 0; // Compares two slices for equality. The following invariant should always @@ -63,7 +75,7 @@ // // Names starting with "rocksdb." are reserved and should not be used // by any clients of this package. 
- virtual const char* Name() const = 0; + const char* Name() const override = 0; // Advanced functions: these are used to reduce the space requirements // for internal data structures like index blocks. @@ -97,15 +109,34 @@ inline size_t timestamp_size() const { return timestamp_size_; } - virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { - return Compare(a, b); + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); } + // For two events e1 and e2 whose timestamps are t1 and t2 respectively, + // Returns value: + // < 0 iff t1 < t2 + // == 0 iff t1 == t2 + // > 0 iff t1 > t2 + // Note that an all-zero byte array will be the smallest (oldest) timestamp + // of the same length, and a byte array with all bits 1 will be the largest. + // In the future, we can extend Comparator so that subclasses can specify + // both largest and smallest timestamps. virtual int CompareTimestamp(const Slice& /*ts1*/, const Slice& /*ts2*/) const { return 0; } + virtual int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, + const Slice& b, bool /*b_has_ts*/) const { + return Compare(a, b); + } + + virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return 0 == + CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + private: size_t timestamp_size_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. + +enum CompressionType : unsigned char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, + kZlibCompression = 0x2, + kBZip2Compression = 0x3, + kLZ4Compression = 0x4, + kLZ4HCCompression = 0x5, + kXpressCompression = 0x6, + kZSTD = 0x7, + + // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than + // 0.8.0 or consider a possibility of downgrading the service or copying + // the database files to another service running with an older version of + // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will + // eventually remove the option from the public API. + kZSTDNotFinalCompression = 0x40, + + // kDisableCompressionOption is used to disable some compression options. 
+ kDisableCompressionOption = 0xff, +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,11 +9,16 @@ #pragma once -#include "rocksdb/env.h" -#include "rocksdb/statistics.h" +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { +// This is NOT an extensible interface but a public interface for result of +// NewConcurrentTaskLimiter. Any derived classes must be RocksDB internal. class ConcurrentTaskLimiter { public: virtual ~ConcurrentTaskLimiter() {} @@ -33,7 +38,7 @@ virtual int32_t GetOutstandingTask() const = 0; }; -// Create a ConcurrentTaskLimiter that can be shared with mulitple CFs +// Create a ConcurrentTaskLimiter that can be shared with multiple CFs // across RocksDB instances to control concurrent tasks. // // @param name: Name of the limiter. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/configurable.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/configurable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/configurable.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/configurable.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,397 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class ObjectRegistry; +class OptionTypeInfo; +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; + +// Configurable is a base class used by the rocksdb that describes a +// standard way of configuring objects. A Configurable object can: +// -> Populate itself given: +// - One or more "name/value" pair strings +// - A string representing the set of name=value properties +// - A map of name/value properties. +// -> Convert itself into its string representation +// -> Dump itself to a Logger +// -> Compare itself to another Configurable object to see if the two objects +// have equivalent options settings +// +// If a derived class calls RegisterOptions to register (by name) how its +// options objects are to be processed, this functionality can typically be +// handled by this class without additional overrides. Otherwise, the derived +// class will need to implement the methods for handling the corresponding +// functionality. +class Configurable { + protected: + friend class ConfigurableHelper; + struct RegisteredOptions { + // The name of the options being registered + std::string name; + // Pointer to the object being registered + void* opt_ptr; +#ifndef ROCKSDB_LITE + // The map of options being registered + const std::unordered_map* type_map; +#endif + }; + + public: + virtual ~Configurable() {} + + // Returns the raw pointer of the named options that is used by this + // object, or nullptr if this function is not supported. + // Since the return value is a raw pointer, the object owns the + // pointer and the caller should not delete the pointer. 
+ // + // Note that changing the underlying options while the object + // is currently used by any open DB is undefined behavior. + // Developers should use DB::SetOption() instead to dynamically change + // options while the DB is open. + template + const T* GetOptions() const { + return GetOptions(T::kName()); + } + template + T* GetOptions() { + return GetOptions(T::kName()); + } + template + const T* GetOptions(const std::string& name) const { + return reinterpret_cast(GetOptionsPtr(name)); + } + template + T* GetOptions(const std::string& name) { + return reinterpret_cast(const_cast(GetOptionsPtr(name))); + } + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. + // If this method fails, an attempt is made to revert the object to original + // state. Note that the revert may not be the original state but may be an + // equivalent. For example, if the object contains an option that is a + // shared_ptr, the shared_ptr may not be the original one but a copy (e.g. not + // the Cache object that was passed in, but a Cache object of the same size). + // + // The acceptable values of the name/value pairs are documented with the + // specific class/instance. + // + // @param config_options Controls how the arguments are processed. + // @param opt_map Name/value pairs of the options to update + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all values in the map were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names in the opt_map were not valid + // for this object. If unused is specified, it will contain the + // collection of NotFound names. 
+ // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + // @see ConfigOptions for a description of the controls. + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map); + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Updates the named option to the input value, returning OK if successful. + // Note that ConfigureOption does not cause PrepareOptions to be invoked. + // @param config_options Controls how the name/value is processed. + // @param name The name of the option to update + // @param value The value to set for the named option + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @return NotSupported If the name is valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If the value cannot be successfully parsed. + Status ConfigureOption(const ConfigOptions& config_options, + const std::string& name, const std::string& value); +#endif // ROCKSDB_LITE + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. If this method fails, an attempt is made to revert the + // object to original state. Note that the revert may not be the original + // state but may be an equivalent. 
+ // @see ConfigureFromMap for more details + // @param config_options Controls how the arguments are processed. + // @param opt_str string containing the values to update. + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all specified values were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names were not valid for this object. + // If unused is specified, it will contain the collection of NotFound + // names. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + Status ConfigureFromString(const ConfigOptions& config_options, + const std::string& opts); + + // Fills in result with the serialized options for this object. + // This is the inverse of ConfigureFromString. + // @param config_options Controls how serialization happens. + // @param result The string representation of this object. + // @return OK If the options for this object were successfully serialized. + // @return InvalidArgument If one or more of the options could not be + // serialized. + Status GetOptionString(const ConfigOptions& config_options, + std::string* result) const; +#ifndef ROCKSDB_LITE + // Returns the serialized options for this object. + // This method is similar to GetOptionString with no errors. + // @param config_options Controls how serialization happens. + // @param prefix A string to prepend to every option. 
+ // @return The serialized representation of the options for this object + std::string ToString(const ConfigOptions& config_options) const { + return ToString(config_options, ""); + } + std::string ToString(const ConfigOptions& config_options, + const std::string& prefix) const; + + // Returns the list of option names associated with this configurable + // @param config_options Controls how the names are returned + // @param result The set of option names for this object. Note that + // options that are deprecated or aliases are not returned. + // @return OK on success. + Status GetOptionNames(const ConfigOptions& config_options, + std::unordered_set* result) const; + + // Returns the value of the option associated with the input name + // This method is the functional inverse of ConfigureOption + // @param config_options Controls how the value is returned + // @param name The name of the option to return a value for. + // @param value The returned value associated with the named option. + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @param InvalidArgument If the name is valid for this object but + // its value cannot be serialized. + virtual Status GetOption(const ConfigOptions& config_options, + const std::string& name, std::string* value) const; +#endif // ROCKSDB_LITE + + // Checks to see if this Configurable is equivalent to other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. 
+ virtual bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* name) const; + + // Returns a pretty-printed, human-readable version of the options. + // This method is typically used to dump the options to a log file. + // Classes should override this method + virtual std::string GetPrintableOptions() const { return ""; } + + // Validates that the settings are valid/consistent and performs any object + // initialization required by this object. This method may be called as part + // of Configure (if invoke_prepare_options is set), or may be invoked + // separately. + // + // Once an object has been prepared, non-mutable options can no longer be + // updated. + // + // Classes must override this method to provide any implementation-specific + // initialization, such as opening log files or setting up cache parameters. + // Implementations should be idempotent (e.g. don't re-open the log file or + // reconfigure the cache), as there is the potential this method can be called + // more than once. + // + // By default, this method will also prepare all nested (Inner and + // OptionType::kConfigurable) objects. + // + // @param config_options Controls how the object is prepared. Also contains + // a Logger and Env that can be used to initialize this object. + // @return OK If the object was successfully initialized. + // @return InvalidArgument If this object could not be successfully + // initialized. + virtual Status PrepareOptions(const ConfigOptions& config_options); + + // Checks to see if the settings are valid for this object. + // This method checks to see if the input DBOptions and ColumnFamilyOptions + // are valid for the settings of this object. For example, an Env might not + // support certain mmap modes or a TableFactory might require certain + // settings. + // + // By default, this method will also validate all nested (Inner and + // OptionType::kConfigurable) objects. 
+ // + // @param db_opts The DBOptions to validate + // @param cf_opts The ColumnFamilyOptions to validate + // @return OK if the options are valid + // @return InvalidArgument If the arguments are not valid for the options + // of the current object. + virtual Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const; + + // Splits the input opt_value into the ID field and the remaining options. + // The input opt_value can be in the form of "name" or "name=value + // [;name=value]". The first form uses the "name" as an id with no options The + // latter form converts the input into a map of name=value pairs and sets "id" + // to the "id" value from the map. + // @param opt_value The value to split into id and options + // @param id The id field from the opt_value + // @param options The remaining name/value pairs from the opt_value + // @param default_id If specified and there is no id field in the map, this + // value is returned as the ID + // @return OK if the value was converted to a map successfully and an ID was + // found. + // @return InvalidArgument if the value could not be converted to a map or + // there was or there is no id property in the map. + static Status GetOptionsMap( + const std::string& opt_value, const std::string& default_id, + std::string* id, std::unordered_map* options); + + protected: + // Returns the raw pointer for the associated named option. + // The name is typically the name of an option registered via the + // Classes may override this method to provide further specialization (such as + // returning a sub-option) + // + // The default implementation looks at the registered options. If the + // input name matches that of a registered option, the pointer registered + // with that name is returned. 
+ // e.g,, RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns + // "my_ptr" + virtual const void* GetOptionsPtr(const std::string& name) const; + + // Method for allowing options to be configured outside of the normal + // registered options framework. Classes may override this method if they + // wish to support non-standard options implementations (such as configuring + // themselves from constant or simple ":"-separated strings. + // + // The default implementation does nothing and returns OK + virtual Status ParseStringOptions(const ConfigOptions& config_options, + const std::string& opts_str); + + // Internal method to configure an object from a map of name-value options. + // This method uses the input config_options to drive the configuration of + // the options in opt_map. Any option name that cannot be found from the + // input set will be returned in "unused". + // + // Classes may override this method to extend the functionality if required. + // @param config_options Controls how the options are configured and errors + // handled. + // @param opts_map The set of options to configure + // @param unused Any options from opt_map that were not configured. + // @returns a Status based on the rules outlined in ConfigureFromMap + virtual Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Method that configures a the specific opt_name from opt_value. + // By default, this method calls opt_info.ParseOption with the + // input parameters. + // Classes may override this method to extend the functionality, or + // change the returned Status. 
+ virtual Status ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, void* opt_ptr); + + // Internal method to see if the single option name/info matches for this and + // that Classes may override this value to change its behavior. + // @param config_options Controls how the options are being matched + // @param opt_info The OptionTypeInfo registered for this option name + // that controls what field is matched (offset) and how (type). + // @param name The name associated with this opt_info. + // @param this_ptr The base pointer to compare to. This is the object + // registered for + // for this OptionTypeInfo. + // @param that_ptr The other pointer to compare to. This is the object + // registered for + // for this OptionTypeInfo. + // @param bad_name If the match fails, the name of the option that failed to + // match. + virtual bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& name, + const void* const this_ptr, + const void* const that_ptr, + std::string* bad_name) const; +#endif +#ifndef ROCKSDB_LITE + // Internal method to serialize options (ToString) + // Classes may override this value to change its behavior. + virtual std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const; +#endif // ROCKSDB_LITE + + // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) + virtual std::string GetOptionName(const std::string& long_name) const; + + // Registers the input name with the options and associated map. + // When classes register their options in this manner, most of the + // functionality (excluding unknown options and validate/prepare) is + // implemented by the base class. + // + // This method should be called in the class constructor to register the + // option set for this object. 
For example, to register the options + // associated with the BlockBasedTableFactory, the constructor calls this + // method passing in: + // - the name of the options ("BlockBasedTableOptions"); + // - the options object (the BlockBasedTableOptions object for this object; + // - the options type map for the BlockBasedTableOptions. + // This registration allows the Configurable class to process the option + // values associated with the BlockBasedTableOptions without further code in + // the derived class. + // + // @param name The name of this set of options (@see GetOptionsPtr) + // @param opt_ptr Pointer to the options to associate with this name + // @param opt_map Options map that controls how this option is configured. + template + void RegisterOptions( + T* opt_ptr, + const std::unordered_map* opt_map) { + RegisterOptions(T::kName(), opt_ptr, opt_map); + } + void RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* opt_map); + + private: + // Contains the collection of options (name, opt_ptr, opt_map) associated with + // this object. 
This collection is typically set in the constructor of the + // Configurable option via + std::vector options_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/convenience.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/convenience.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/convenience.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/convenience.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,15 +9,108 @@ #include #include +#include "rocksdb/compression_type.h" #include "rocksdb/db.h" -#include "rocksdb/options.h" +#include "rocksdb/status.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { +class Env; +class Logger; +class ObjectRegistry; + +struct ColumnFamilyOptions; +struct DBOptions; +struct Options; + +// ConfigOptions containing the parameters/controls for +// comparing objects and converting to/from strings. +// These settings control how the methods +// treat errors (e.g. ignore_unknown_objects), the format +// of the serialization (e.g. delimiter), and how to compare +// options (sanity_level). +struct ConfigOptions { + // Constructs a new ConfigOptions with a new object registry. + // This method should only be used when a DBOptions is not available, + // else registry settings may be lost + ConfigOptions(); + + // Constructs a new ConfigOptions using the settings from + // the input DBOptions. Currently constructs a new object registry. + explicit ConfigOptions(const DBOptions&); + + // This enum defines the RocksDB options sanity level. + enum SanityLevel : unsigned char { + kSanityLevelNone = 0x01, // Performs no sanity check at all. + // Performs minimum check to ensure the RocksDB instance can be + // opened without corrupting / mis-interpreting the data. + kSanityLevelLooselyCompatible = 0x02, + // Perform exact match sanity check. 
+ kSanityLevelExactMatch = 0xFF, + }; + + enum Depth { + kDepthDefault, // Traverse nested options that are not flagged as "shallow" + kDepthShallow, // Do not traverse into any nested options + kDepthDetailed, // Traverse nested options, overriding the options shallow + // setting + }; + + // When true, any unused options will be ignored and OK will be returned + bool ignore_unknown_options = false; + + // When true, any unsupported options will be ignored and OK will be returned + bool ignore_unsupported_options = true; + + // If the strings are escaped (old-style?) + bool input_strings_escaped = true; + + // Whether or not to invoke PrepareOptions after configure is called. + bool invoke_prepare_options = true; + + // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not. + // When "mutable_options_only=false", all options are evaluated. + // When "mutable_options_only="true", any option not marked as Mutable is + // either ignored (in the case of string/equals methods) or results in an + // error (in the case of Configure). + bool mutable_options_only = false; + + // The separator between options when converting to a string + std::string delimiter = ";"; + + // Controls how to traverse options during print/match stages + Depth depth = Depth::kDepthDefault; + + // Controls how options are serialized + // Controls how pedantic the comparison must be for equivalency + SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch; + // `file_readahead_size` is used for readahead for the option file. 
+ size_t file_readahead_size = 512 * 1024; + + // The environment to use for this option + Env* env = Env::Default(); + +#ifndef ROCKSDB_LITE + // The object registry to use for this options + std::shared_ptr registry; +#endif + + bool IsShallow() const { return depth == Depth::kDepthShallow; } + bool IsDetailed() const { return depth == Depth::kDepthDetailed; } + + bool IsCheckDisabled() const { + return sanity_level == SanityLevel::kSanityLevelNone; + } + + bool IsCheckEnabled(SanityLevel level) const { + return (level > SanityLevel::kSanityLevelNone && level <= sanity_level); + } +}; #ifndef ROCKSDB_LITE + // The following set of functions provide a way to construct RocksDB Options -// from a string or a string-to-string map. Here're the general rule of +// from a string or a string-to-string map. Here is the general rule of // setting option values from strings by type. Some RocksDB types are also // supported in these APIs. Please refer to the comment of the function itself // to find more information about how to config those RocksDB types. @@ -73,7 +166,7 @@ // ColumnFamilyOptions "new_options". // // Below are the instructions of how to config some non-primitive-typed -// options in ColumnFOptions: +// options in ColumnFamilyOptions: // // * table_factory: // table_factory can be configured using our custom nested-option syntax. @@ -115,7 +208,7 @@ // * {"memtable", "skip_list:5"} is equivalent to setting // memtable to SkipListFactory(5). // - PrefixHash: -// Pass "prfix_hash:" to config memtable +// Pass "prefix_hash:" to config memtable // to use PrefixHash, or simply "prefix_hash" to use the default // PrefixHash. // [Example]: @@ -134,13 +227,6 @@ // [Example]: // * {"memtable", "vector:1024"} is equivalent to setting memtable // to VectorRepFactory(1024). -// - HashCuckooRepFactory: -// Pass "cuckoo:" to use HashCuckooRepFactory with the -// specified write buffer size, or simply "cuckoo" to use the default -// HashCuckooRepFactory. 
-// [Example]: -// * {"memtable", "cuckoo:1024"} is equivalent to setting memtable -// to NewHashCuckooRepFactory(1024). // // * compression_opts: // Use "compression_opts" to config compression_opts. The value format @@ -153,6 +239,12 @@ // cf_opt.compression_opts.strategy = 6; // cf_opt.compression_opts.max_dict_bytes = 7; // +// The GetColumnFamilyOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param base_options the default options of the output "new_options". // @param opts_map an option name to value map for specifying how "new_options" // should be set. @@ -165,6 +257,17 @@ // instead of resulting in an unknown-option error. // @return Status::OK() on success. Otherwise, a non-ok status indicating // error will be returned, and "new_options" will be set to "base_options". +// @return Status::NotFound means the one (or more) of the option name in +// the opts_map is not valid for this option +// @return Status::NotSupported means we do not know how to parse one of the +// value for this option +// @return Status::InvalidArgument means the one of the option values is not +// valid for this option. +Status GetColumnFamilyOptionsFromMap( + const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options); Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, @@ -184,6 +287,12 @@ // - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to // passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec. // +// The GetDBOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. 
The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param base_options the default options of the output "new_options". // @param opts_map an option name to value map for specifying how "new_options" // should be set. @@ -196,6 +305,16 @@ // instead of resulting in an unknown-option error. // @return Status::OK() on success. Otherwise, a non-ok status indicating // error will be returned, and "new_options" will be set to "base_options". +// @return Status::NotFound means the one (or more) of the option name in +// the opts_map is not valid for this option +// @return Status::NotSupported means we do not know how to parse one of the +// value for this option +// @return Status::InvalidArgument means the one of the option values is not +// valid for this option. +Status GetDBOptionsFromMap( + const ConfigOptions& cfg_options, const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options); Status GetDBOptionsFromMap( const DBOptions& base_options, const std::unordered_map& opts_map, @@ -227,6 +346,12 @@ // - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is // equivalent to setting block_cache using NewLRUCache(1024 * 1024). // +// The GetBlockBasedTableOptionsFromMap(ConfigOptions, ...) should be used; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding +// options in the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param table_options the default options of the output "new_table_options". // @param opts_map an option name to value map for specifying how // "new_table_options" should be set. @@ -241,6 +366,11 @@ // error will be returned, and "new_table_options" will be set to // "table_options". 
Status GetBlockBasedTableOptionsFromMap( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options); +Status GetBlockBasedTableOptionsFromMap( const BlockBasedTableOptions& table_options, const std::unordered_map& opts_map, BlockBasedTableOptions* new_table_options, @@ -250,6 +380,12 @@ // map "opts_map" of option name to option value to construct the new // PlainTableOptions "new_table_options". // +// The GetPlainTableOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param table_options the default options of the output "new_table_options". // @param opts_map an option name to value map for specifying how // "new_table_options" should be set. @@ -264,12 +400,16 @@ // error will be returned, and "new_table_options" will be set to // "table_options". Status GetPlainTableOptionsFromMap( + const ConfigOptions& config_options, const PlainTableOptions& table_options, + const std::unordered_map& opts_map, + PlainTableOptions* new_table_options); +Status GetPlainTableOptionsFromMap( const PlainTableOptions& table_options, const std::unordered_map& opts_map, PlainTableOptions* new_table_options, bool input_strings_escaped = false, bool ignore_unknown_options = false); -// Take a string representation of option names and values, apply them into the +// Take a string representation of option names and values, apply them into the // base_options, and return the new options as a result. 
The string has the // following format: // "write_buffer_size=1024;max_write_buffer_number=2" @@ -277,22 +417,43 @@ // BlockBasedTableOptions as part of the string for block-based table factory: // "write_buffer_size=1024;block_based_table_factory={block_size=4k};" // "max_write_buffer_num=2" +// +// +// The GetColumnFamilyOptionsFromString(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options, const std::string& opts_str, ColumnFamilyOptions* new_options); +Status GetDBOptionsFromString(const ConfigOptions& config_options, + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); + Status GetDBOptionsFromString(const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options); +Status GetStringFromDBOptions(const ConfigOptions& config_options, + const DBOptions& db_options, + std::string* opts_str); + Status GetStringFromDBOptions(std::string* opts_str, const DBOptions& db_options, const std::string& delimiter = "; "); +Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options, + const ColumnFamilyOptions& cf_options, + std::string* opts_str); Status GetStringFromColumnFamilyOptions(std::string* opts_str, const ColumnFamilyOptions& cf_options, const std::string& delimiter = "; "); - Status GetStringFromCompressionType(std::string* compression_str, CompressionType compression_type); @@ -301,10 +462,18 @@ Status GetBlockBasedTableOptionsFromString( const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options); 
+Status GetBlockBasedTableOptionsFromString( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options); Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, const std::string& opts_str, PlainTableOptions* new_table_options); +Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, + const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options); Status GetMemTableRepFactoryFromString( const std::string& opts_str, @@ -312,6 +481,9 @@ Status GetOptionsFromString(const Options& base_options, const std::string& opts_str, Options* new_options); +Status GetOptionsFromString(const ConfigOptions& config_options, + const Options& base_options, + const std::string& opts_str, Options* new_options); Status StringToMap(const std::string& opts_str, std::unordered_map* opts_map); @@ -345,7 +517,6 @@ const EnvOptions& env_options, const ReadOptions& read_options, const std::string& file_path); - #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/customizable.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/customizable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/customizable.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/customizable.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,233 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/configurable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +/** + * Customizable a base class used by the rocksdb that describes a + * standard way of configuring and creating objects. Customizable objects + * are configurable objects that can be created from an ObjectRegistry. + * + * Customizable classes are used when there are multiple potential + * implementations of a class for use by RocksDB (e.g. Table, Cache, + * MergeOperator, etc). The abstract base class is expected to define a method + * declaring its type and a factory method for creating one of these, such as: + * static const char *Type() { return "Table"; } + * static Status CreateFromString(const ConfigOptions& options, + * const std::string& id, + * std::shared_ptr* result); + * The "Type" string is expected to be unique (no two base classes are the same + * type). This factory is expected, based on the options and id, create and + * return the appropriate derived type of the customizable class (e.g. + * BlockBasedTableFactory, PlainTableFactory, etc). For extension developers, + * helper classes and methods are provided for writing this factory. + * + * Instances of a Customizable class need to define: + * - A "static const char *kClassName()" method. This method defines the name + * of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the + * CheckedCast method. + * - The Name() of the object. This name is used when creating and saving + * instances of this class. Typically this name will be the same as + * kClassName(). + * + * Additionally, Customizable classes should register any options used to + * configure themselves with the Configurable subsystem. + * + * When a Customizable is being created, the "name" property specifies + * the name of the instance being created. 
+ * For custom objects, their configuration and name can be specified by: + * [prop]={name=X;option 1 = value1[; option2=value2...]} + * + * [prop].name=X + * [prop].option1 = value1 + * + * [prop].name=X + * X.option1 =value1 + */ +class Customizable : public Configurable { + public: + ~Customizable() override {} + + // Returns the name of this class of Customizable + virtual const char* Name() const = 0; + + // Returns an identifier for this Customizable. + // This could be its name or something more complex (like its URL/pattern). + // Used for pretty printing. + virtual std::string GetId() const { + std::string id = Name(); + return id; + } + + // This is typically determined by if the input name matches the + // name of this object. + // This method is typically used in conjunction with CheckedCast to find the + // derived class instance from its base. For example, if you have an Env + // and want the "Default" env, you would IsInstanceOf("Default") to get + // the default implementation. This method should be used when you need a + // specific derivative or implementation of a class. + // + // Intermediary caches (such as SharedCache) may wish to override this method + // to check for the intermediary name (SharedCache). Classes with multiple + // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override + // this method. + // + // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a". + // Wrapped classes that have an Inner "has-a" should not be returned. + // + // @param name The name of the instance to find. + // Returns true if the class is an instance of the input name. 
+ virtual bool IsInstanceOf(const std::string& name) const { + if (name.empty()) { + return false; + } else if (name == Name()) { + return true; + } else { + const char* nickname = NickName(); + if (nickname != nullptr && name == nickname) { + return true; + } else { + return false; + } + } + } + + const void* GetOptionsPtr(const std::string& name) const override { + const void* ptr = Configurable::GetOptionsPtr(name); + if (ptr != nullptr) { + return ptr; + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->GetOptionsPtr(name); + } else { + return nullptr; + } + } + } + + // Returns the named instance of the Customizable as a T*, or nullptr if not + // found. This method uses IsInstanceOf/Inner to find the appropriate class + // instance and then casts it to the expected return type. + template + const T* CheckedCast() const { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + template + T* CheckedCast() { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + auto inner = const_cast(Inner()); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + // Checks to see if this Customizable is equivalent to other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. 
+ // @see Configurable::AreEquivalent for more details + bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const override; +#ifndef ROCKSDB_LITE + // Gets the value of the option associated with the input name + // @see Configurable::GetOption for more details + Status GetOption(const ConfigOptions& config_options, const std::string& name, + std::string* value) const override; +#endif // ROCKSDB_LITE + // Helper method for getting for parsing the opt_value into the corresponding + // options for use in potentially creating a new Customizable object (this + // method is primarily a support method for LoadSharedObject et al for new + // Customizable objects). The opt_value may be either name-value pairs + // separated by ";" (a=b; c=d), or a simple name (a). In order to create a new + // Customizable, the ID is determined by: + // - If the value is a simple name (e.g. "BlockBasedTable"), the id is this + // name; + // - Otherwise, if there is a "id=value", the id is set to "value" + // - Otherwise, if the input customizable is not null, custom->GetId is used + // - Otherwise, an error is returned. + // + // If the opt_value is name-value pairs, these pairs will be returned in + // options (without the id pair). If the ID being returned matches the ID of + // the input custom object, then the options from the input object will also + // be added to the returned options. + // + // This method returns non-OK if the ID could not be found, or if the + // opt_value could not be parsed into name-value pairs. + static Status GetOptionsMap( + const ConfigOptions& config_options, const Customizable* custom, + const std::string& opt_value, std::string* id, + std::unordered_map* options); + + // Helper method to configure a new object with the supplied options. + // If the object is not null and invoke_prepare_options=true, the object + // will be configured and prepared. 
+ // Returns success if the object is properly configured and (optionally) + // prepared Returns InvalidArgument if the object is nullptr and there are + // options in the map Returns the result of the ConfigureFromMap or + // PrepareOptions + static Status ConfigureNewObject( + const ConfigOptions& config_options, Customizable* object, + const std::unordered_map& options); + + // Returns the inner class when a Customizable implements a has-a (wrapped) + // relationship. Derived classes that implement a has-a must override this + // method in order to get CheckedCast to function properly. + virtual const Customizable* Inner() const { return nullptr; } + + protected: + // Generates a ID specific for this instance of the customizable. + // The unique ID is of the form :#pid, where: + // - name is the Name() of this object; + // - addr is the memory address of this object; + // - pid is the process ID of this process ID for this process. + // Note that if obj1 and obj2 have the same unique IDs, they must be the + // same. However, if an object is deleted and recreated, it may have the + // same unique ID as a predecessor + // + // This method is useful for objects (especially ManagedObjects) that + // wish to generate an ID that is specific for this instance and wish to + // override the GetId() method. + std::string GenerateIndividualId() const; + + // Some classes have both a class name (e.g. PutOperator) and a nickname + // (e.g. put). Classes can override this method to return a + // nickname. Nicknames can be used by InstanceOf and object creation. + virtual const char* NickName() const { return ""; } + // Given a name (e.g. 
rocksdb.my.type.opt), returns the short name (opt) + std::string GetOptionName(const std::string& long_name) const override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& options, + const std::string& prefix) const override; +#endif // ROCKSDB_LITE +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,51 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This is a data structure specifically designed as a "Set" for a +// pretty small scale of Enum structure. For now, it can support up +// to 64 element, and it is expandable in the future. +template +class SmallEnumSet { + public: + SmallEnumSet() : state_(0) {} + + ~SmallEnumSet() {} + + // Return true if the input enum is included in the "Set" (i.e., changes the + // internal scalar state successfully), otherwise, it will return false. + bool Add(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t old_state = state_; + uint64_t tmp = 1; + state_ |= (tmp << value); + return old_state != state_; + } + + // Return true if the input enum is contained in the "Set". 
+ bool Contains(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t tmp = 1; + return state_ & (tmp << value); + } + + private: + uint64_t state_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/db.h 2025-05-19 16:14:27.000000000 +0000 @@ -39,25 +39,31 @@ namespace ROCKSDB_NAMESPACE { -struct Options; -struct DBOptions; struct ColumnFamilyOptions; -struct ReadOptions; -struct WriteOptions; -struct FlushOptions; struct CompactionOptions; struct CompactRangeOptions; -struct TableProperties; +struct DBOptions; struct ExternalSstFileInfo; -class WriteBatch; +struct FlushOptions; +struct Options; +struct ReadOptions; +struct TableProperties; +struct WriteOptions; +#ifdef ROCKSDB_LITE +class CompactionJobInfo; +#endif class Env; class EventListener; +class FileSystem; +#ifndef ROCKSDB_LITE +class Replayer; +#endif class StatsHistoryIterator; +#ifndef ROCKSDB_LITE +class TraceReader; class TraceWriter; -#ifdef ROCKSDB_LITE -class CompactionJobInfo; #endif -class FileSystem; +class WriteBatch; extern const std::string kDefaultColumnFamilyName; extern const std::string kPersistentStatsColumnFamilyName; @@ -111,10 +117,19 @@ RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} }; +// It is valid that files_checksums and files_checksum_func_names are both +// empty (no checksum information is provided for ingestion). Otherwise, +// their sizes should be the same as external_files. The file order should +// be the same in three vectors and guaranteed by the caller. +// Note that, we assume the temperatures of this batch of files to be +// ingested are the same. 
struct IngestExternalFileArg { ColumnFamilyHandle* column_family = nullptr; std::vector external_files; IngestExternalFileOptions options; + std::vector files_checksums; + std::vector files_checksum_func_names; + Temperature file_temperature = Temperature::kUnknown; }; struct GetMergeOperandsOptions { @@ -124,19 +139,25 @@ // A collections of table properties objects, where // key: is the table's file name. // value: the table properties object of the given table. -typedef std::unordered_map> - TablePropertiesCollection; +using TablePropertiesCollection = + std::unordered_map>; -// A DB is a persistent ordered map from keys to values. +// A DB is a persistent, versioned ordered map from keys to values. // A DB is safe for concurrent access from multiple threads without // any external synchronization. +// DB is an abstract base class with one primary implementation (DBImpl) +// and a number of wrapper implementations. class DB { public: - // Open the database with the specified "name". + // Open the database with the specified "name" for reads and writes. // Stores a pointer to a heap-allocated database in *dbptr and returns // OK on success. - // Stores nullptr in *dbptr and returns a non-OK status on error. - // Caller should delete *dbptr when it is no longer needed. + // Stores nullptr in *dbptr and returns a non-OK status on error, including + // if the DB is already open (read-write) by another DB object. (This + // guarantee depends on options.env->LockFile(), which might not provide + // this guarantee in a custom Env implementation.) + // + // Caller must delete *dbptr when it is no longer needed. static Status Open(const Options& options, const std::string& name, DB** dbptr); @@ -145,11 +166,17 @@ // If the db is opened in read only mode, then no compactions // will happen. 
// + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly(const Options& options, const std::string& name, DB** dbptr, - bool error_if_log_file_exist = false); + bool error_if_wal_file_exists = false); // Open the database for read only with column families. When opening DB with // read only, you can specify only a subset of column families in the @@ -157,13 +184,19 @@ // column family. The default column family name is 'default' and it's stored // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName // + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly( const DBOptions& db_options, const std::string& name, const std::vector& column_families, std::vector* handles, DB** dbptr, - bool error_if_log_file_exist = false); + bool error_if_wal_file_exists = false); // The following OpenAsSecondary functions create a secondary instance that // can dynamically tail the MANIFEST of a primary that must have already been @@ -197,11 +230,11 @@ // to open the primary instance. // The secondary_path argument points to a directory where the secondary // instance stores its info log. - // The column_families argument specifieds a list of column families to open. 
+ // The column_families argument specifies a list of column families to open. // If any of the column families does not exist, the function returns non-OK // status. // The handles is an out-arg corresponding to the opened database column - // familiy handles. + // family handles. // The dbptr is an out-arg corresponding to the opened secondary instance. // The pointer points to a heap-allocated database, and the caller should // delete it after use. Before deleting the dbptr, the user should also @@ -231,6 +264,16 @@ const std::vector& column_families, std::vector* handles, DB** dbptr); + // Open DB and run the compaction. + // It's a read-only operation, the result won't be installed to the DB, it + // will be output to the `output_directory`. The API should only be used with + // `options.CompactionService` to run compaction triggered by + // `CompactionService`. + static Status OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* output, + const CompactionServiceOptionsOverride& override_options); + virtual Status Resume() { return Status::NotSupported(); } // Close the DB by releasing resources, closing files etc. This should be @@ -242,9 +285,9 @@ // If the return status is Aborted(), closing fails because there is // unreleased snapshot in the system. In this case, users can release // the unreleased snapshots and try again and expect it to succeed. For - // other status, recalling Close() will be no-op. - // If the return status is NotSupported(), then the DB implementation does - // cleanup in the destructor + // other status, re-calling Close() will be no-op and return the original + // close status. 
If the return status is NotSupported(), then the DB + // implementation does cleanup in the destructor virtual Status Close() { return Status::NotSupported(); } // ListColumnFamilies will open the DB specified by argument name @@ -255,6 +298,7 @@ const std::string& name, std::vector* column_families); + // Abstract class ctor DB() {} // No copying allowed DB(const DB&) = delete; @@ -353,8 +397,15 @@ // Removes the database entries in the range ["begin_key", "end_key"), i.e., // including "begin_key" and excluding "end_key". Returns OK on success, and - // a non-OK status on error. It is not an error if no keys exist in the range - // ["begin_key", "end_key"). + // a non-OK status on error. It is not an error if the database does not + // contain any existing data in the range ["begin_key", "end_key"). + // + // If "end_key" comes before "start_key" according to the user's comparator, + // a `Status::InvalidArgument` is returned. + // + // WARNING: Do not use `Iterator::Refresh()` API on DBs where `DeleteRange()` + // has been used or will be used. This feature combination is neither + // supported nor programmatically prevented. // // This feature is now usable in production, with the following caveats: // 1) Accumulating many range tombstones in the memtable will degrade read @@ -388,6 +439,9 @@ // If the database contains an entry for "key" store the // corresponding value in *value and return OK. // + // If timestamp is enabled and a non-null timestamp pointer is passed in, + // timestamp is returned. + // // If there is no entry for "key" leave *value unchanged and return // a status for which Status::IsNotFound() returns true. // @@ -412,6 +466,32 @@ return Get(options, DefaultColumnFamily(), key, value); } + // Get() methods that return timestamp. Derived DB classes don't need to worry + // about this group of methods if they don't care about timestamp feature. 
+ virtual inline Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, std::string* timestamp) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(options, column_family, key, &pinnable_val, timestamp); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; + } + virtual Status Get(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, PinnableSlice* /*value*/, + std::string* /*timestamp*/) { + return Status::NotSupported( + "Get() that returns timestamp is not implemented."); + } + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp) { + return Get(options, DefaultColumnFamily(), key, value, timestamp); + } + // Returns all the merge operands corresponding to the key. If the // number of merge operands in DB is greater than // merge_operands_options.expected_max_number_of_operands @@ -428,6 +508,11 @@ GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) = 0; + // Consistent Get of many keys across column families without the need + // for an explicit snapshot. NOTE: the implementation of this MultiGet API + // does not have the performance benefits of the void-returning MultiGet + // functions. + // // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and // (*values)[i] will be set to some arbitrary value (often ""). 
Otherwise, @@ -451,6 +536,25 @@ keys, values); } + virtual std::vector MultiGet( + const ReadOptions& /*options*/, + const std::vector& /*column_family*/, + const std::vector& keys, std::vector* /*values*/, + std::vector* /*timestamps*/) { + return std::vector( + keys.size(), Status::NotSupported( + "MultiGet() returning timestamps not implemented.")); + } + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values, + std::vector* timestamps) { + return MultiGet( + options, + std::vector(keys.size(), DefaultColumnFamily()), + keys, values, timestamps); + } + // Overloaded MultiGet API that improves performance by batching operations // in the read path for greater efficiency. Currently, only the block based // table format with full filters are supported. Other table formats such @@ -492,6 +596,30 @@ } } + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_family); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + // Overloaded MultiGet API that improves performance by batching operations // in the read path for greater efficiency. Currently, only the block based // table format with full filters are supported. 
Other table formats such @@ -531,6 +659,28 @@ values++; } } + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } // If the key definitely does not exist in the database, then this method // returns false, else true. If the caller wants to obtain value when the key @@ -542,17 +692,33 @@ virtual bool KeyMayExist(const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, std::string* /*value*/, + std::string* /*timestamp*/, bool* value_found = nullptr) { if (value_found != nullptr) { *value_found = false; } return true; } + + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, column_family, key, value, + /*timestamp=*/nullptr, value_found); + } + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found = nullptr) { return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); } + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp, + bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp, + value_found); + } + // Return a heap-allocated iterator over the contents of the database. 
// The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). @@ -578,7 +744,7 @@ // snapshot is no longer needed. // // nullptr will be returned if the DB fails to take a snapshot or does - // not support snapshot. + // not support snapshot (eg: inplace_update_support enabled). virtual const Snapshot* GetSnapshot() = 0; // Release a previously acquired snapshot. The caller must not @@ -586,7 +752,9 @@ virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; #ifndef ROCKSDB_LITE - // Contains all valid property arguments for GetProperty(). + // Contains all valid property arguments for GetProperty() or + // GetMapProperty(). Each is a "string" property for retrieval with + // GetProperty() unless noted as a "map" property, for GetMapProperty(). // // NOTE: Property names cannot end in numbers since those are interpreted as // arguments, e.g., see kNumFilesAtLevelPrefix. @@ -611,34 +779,35 @@ // SST files. static const std::string kSSTables; - // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and - // "rocksdb.cf-file-histogram" together. See below for description - // of the two. + // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram" + // and "rocksdb.cf-file-histogram" as a "map" property. static const std::string kCFStats; // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with - // general columm family stats per-level over db's lifetime ("L"), + // general column family stats per-level over db's lifetime ("L"), // aggregated over db's lifetime ("Sum"), and aggregated over the // interval since the last retrieval ("Int"). - // It could also be used to return the stats in the format of the map. - // In this case there will a pair of string to array of double for - // each level as well as for "Sum". "Int" stats will not be affected - // when this form of stats are retrieved. 
static const std::string kCFStatsNoFileHistogram; // "rocksdb.cf-file-histogram" - print out how many file reads to every // level, as well as the histogram of latency of single requests. static const std::string kCFFileHistogram; - // "rocksdb.dbstats" - returns a multi-line string with general database - // stats, both cumulative (over the db's lifetime) and interval (since - // the last retrieval of kDBStats). + // "rocksdb.dbstats" - As a string property, returns a multi-line string + // with general database stats, both cumulative (over the db's + // lifetime) and interval (since the last retrieval of kDBStats). + // As a map property, returns cumulative stats only and does not + // update the baseline for the interval stats. static const std::string kDBStats; // "rocksdb.levelstats" - returns multi-line string containing the number // of files per level and total size of each level (MB). static const std::string kLevelStats; + // "rocksdb.block-cache-entry-stats" - returns a multi-line string or + // map with statistics on block cache usage. + static const std::string kBlockCacheEntryStats; + // "rocksdb.num-immutable-mem-table" - returns number of immutable // memtables that have not yet been flushed. static const std::string kNumImmutableMemTable; @@ -733,7 +902,8 @@ static const std::string kCurrentSuperVersionNumber; // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of - // live data in bytes. + // live data in bytes. For BlobDB, it also includes the exact value of + // live bytes in the blob files of the version. static const std::string kEstimateLiveDataSize; // "rocksdb.min-log-number-to-keep" - return the minimum log number of the @@ -754,6 +924,10 @@ // files belong to the latest LSM tree. 
static const std::string kLiveSstFilesSize; + // "rocksdb.live_sst_files_size_at_temperature" - returns total size (bytes) + // of SST files at all certain file temperature + static const std::string kLiveSstFilesSizeAtTemperature; + // "rocksdb.base-level" - returns number of level to which L0 data will be // compacted. static const std::string kBaseLevel; @@ -764,8 +938,10 @@ // based. static const std::string kEstimatePendingCompactionBytes; - // "rocksdb.aggregated-table-properties" - returns a string representation - // of the aggregated table properties of the target column family. + // "rocksdb.aggregated-table-properties" - returns a string or map + // representation of the aggregated table properties of the target + // column family. Only properties that make sense for aggregation + // are included. static const std::string kAggregatedTableProperties; // "rocksdb.aggregated-table-properties-at-level", same as the previous @@ -800,18 +976,39 @@ // "rocksdb.options-statistics" - returns multi-line string // of options.statistics static const std::string kOptionsStatistics; + + // "rocksdb.num-blob-files" - returns number of blob files in the current + // version. + static const std::string kNumBlobFiles; + + // "rocksdb.blob-stats" - return the total number and size of all blob + // files, and total amount of garbage (bytes) in the blob files in + // the current version. + static const std::string kBlobStats; + + // "rocksdb.total-blob-file-size" - returns the total size of all blob + // files over all versions. + static const std::string kTotalBlobFileSize; + + // "rocksdb.live-blob-file-size" - returns the total size of all blob + // files in the current version. + static const std::string kLiveBlobFileSize; }; #endif /* ROCKSDB_LITE */ - // DB implementations can export properties about their state via this method. 
- // If "property" is a valid property understood by this DB implementation (see - // Properties struct above for valid options), fills "*value" with its current - // value and returns true. Otherwise, returns false. + // DB implementations export properties about their state via this method. + // If "property" is a valid "string" property understood by this DB + // implementation (see Properties struct above for valid options), fills + // "*value" with its current value and returns true. Otherwise, returns + // false. virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { return GetProperty(DefaultColumnFamily(), property, value); } + + // Like GetProperty but for valid "map" properties. (Some properties can be + // accessed as either "string" properties or "map" properties.) virtual bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, std::map* value) = 0; @@ -856,6 +1053,11 @@ // "rocksdb.block-cache-capacity" // "rocksdb.block-cache-usage" // "rocksdb.block-cache-pinned-usage" + // + // Properties dedicated for BlobDB: + // "rocksdb.num-blob-files" + // "rocksdb.total-blob-file-size" + // "rocksdb.live-blob-file-size" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { @@ -883,32 +1085,36 @@ }; // For each i in [0,n-1], store in "sizes[i]", the approximate - // file system space used by keys in "[range[i].start .. range[i].limit)". + // file system space used by keys in "[range[i].start .. range[i].limit)" + // in a single column family. // // Note that the returned sizes measure file system space usage, so // if the user data compresses by a factor of ten, the returned // sizes will be one-tenth the size of the corresponding user data size. 
virtual Status GetApproximateSizes(const SizeApproximationOptions& options, ColumnFamilyHandle* column_family, - const Range* range, int n, + const Range* ranges, int n, uint64_t* sizes) = 0; // Simpler versions of the GetApproximateSizes() method above. - // The include_flags argumenbt must of type DB::SizeApproximationFlags + // The include_flags argument must of type DB::SizeApproximationFlags // and can not be NONE. - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { + virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { SizeApproximationOptions options; options.include_memtabtles = (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; options.include_files = (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; - GetApproximateSizes(options, column_family, range, n, sizes); + return GetApproximateSizes(options, column_family, ranges, n, sizes); } - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { - GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); + virtual Status GetApproximateSizes(const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { + return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, + include_flags); } // The method is similar to GetApproximateSizes, except it @@ -948,6 +1154,8 @@ // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only // be invoked by users who understand the underlying implementation. + // This call blocks until the operation completes successfully, fails, + // or is aborted (Status::Incomplete). See DisableManualCompaction. 
// // begin==nullptr is treated as a key before all keys in the database. // end==nullptr is treated as a key after all keys in the database. @@ -1002,9 +1210,9 @@ const std::unordered_map& new_options) = 0; // CompactFiles() inputs a list of files specified by file numbers and - // compacts them to the specified level. Note that the behavior is different - // from CompactRange() in that CompactFiles() performs the compaction job - // using the CURRENT thread. + // compacts them to the specified level. A small difference compared to + // CompactRange() is that CompactFiles() performs the compaction job + // using the CURRENT thread, so is not considered a "background" job. // // @see GetDataBaseMetaData // @see GetColumnFamilyMetaData @@ -1029,7 +1237,8 @@ // This function will wait until all currently running background processes // finish. After it returns, no background process will be run until - // ContinueBackgroundWork is called + // ContinueBackgroundWork is called, once for each preceding OK-returning + // call to PauseBackgroundWork. virtual Status PauseBackgroundWork() = 0; virtual Status ContinueBackgroundWork() = 0; @@ -1045,7 +1254,16 @@ virtual Status EnableAutoCompaction( const std::vector& column_family_handles) = 0; + // After this function call, CompactRange() or CompactFiles() will not + // run compactions and fail. Calling this function will tell outstanding + // manual compactions to abort and will wait for them to finish or abort + // before returning. virtual void DisableManualCompaction() = 0; + // Re-enable CompactRange() and ComapctFiles() that are disabled by + // DisableManualCompaction(). This function must be called as many times + // as DisableManualCompaction() has been called in order to re-enable + // manual compactions, and must not be called more times than + // DisableManualCompaction() has been called. virtual void EnableManualCompaction() = 0; // Number of levels used for this DB. 
@@ -1137,13 +1355,20 @@ // updated, false if user attempted to call if with seqnum <= current value. virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; -#ifndef ROCKSDB_LITE - // Prevent file deletions. Compactions will continue to occur, // but no obsolete files will be deleted. Calling this multiple // times have the same effect as calling it once. virtual Status DisableFileDeletions() = 0; + // Increase the full_history_ts of column family. The new ts_low value should + // be newer than current full_history_ts value. + virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) = 0; + + // Get current full_history_ts value. + virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) = 0; + // Allow compactions to delete obsolete files. // If force == true, the call to EnableFileDeletions() will guarantee that // file deletions are enabled after the call, even if DisableFileDeletions() @@ -1155,6 +1380,7 @@ // threads call EnableFileDeletions() virtual Status EnableFileDeletions(bool force = true) = 0; +#ifndef ROCKSDB_LITE // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are @@ -1216,6 +1442,14 @@ // Windows API macro interference #undef DeleteFile + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. + // // Delete the file name from the db directory and update the internal state to // reflect that. 
Supports deletion of sst and log files only. 'name' must be // path relative to the db directory. eg. 000001.sst, /archive/000003.log @@ -1226,6 +1460,20 @@ virtual void GetLiveFilesMetaData( std::vector* /*metadata*/) {} + // Return a list of all table and blob files checksum info. + // Note: This function might be of limited use because it cannot be + // synchronized with GetLiveFiles. + virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0; + + // EXPERIMENTAL: This function is not yet feature-complete. + // Get information about all live files that make up a DB, for making + // live copies (Checkpoint, backups, etc.) or other storage-related purposes. + // Use DisableFileDeletions() before and EnableFileDeletions() after to + // preserve the files for live copy. + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) = 0; + // Obtains the meta data of the specified column family of the DB. virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) {} @@ -1235,6 +1483,12 @@ GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); } + // Obtains the meta data of all column families for the DB. + // The returned map contains one entry for each column family indexed by the + // name of the column family. + virtual void GetAllColumnFamilyMetaData( + std::vector* /*metadata*/) {} + // IngestExternalFile() will load a list of external SST files (1) into the DB // Two primary modes are supported: // - Duplicate keys in the new files will overwrite exiting keys (default) @@ -1286,13 +1540,14 @@ // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in - // an existing DB. + // an existing DB using Checkpoint::ExportColumnFamily. // Option in import_options specifies whether the external files are copied or // moved (default is copy). 
When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a - // move, the call ensures that the specified files at external_file_path are - // deleted on successful return and files are not modified on any error - // return. + // move, the call makes a best effort to delete the specified files at + // external_file_path on successful return, logging any failure to delete + // rather than returning in Status. Files are not modified on any error + // return, and a best effort is made to remove any newly-created files. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. @@ -1302,6 +1557,14 @@ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of + // table files are checked. + virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { + return Status::NotSupported("File verification not supported"); + } + + // Verify the block checksums of files in db. The block checksums of table + // files are checked. virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } @@ -1415,10 +1678,18 @@ // Returns Status::OK if identity could be set properly virtual Status GetDbIdentity(std::string& identity) const = 0; + // Return a unique identifier for each DB object that is opened + // This DB session ID should be unique among all open DB instances on all + // hosts, and should be unique among re-openings of the same or other DBs. + // (Two open DBs have the same identity from other function GetDbIdentity when + // one is physically copied from the other.) 
+ virtual Status GetDbSessionId(std::string& session_id) const = 0; + // Returns default column family handle virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; #ifndef ROCKSDB_LITE + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) = 0; virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { @@ -1449,6 +1720,16 @@ return Status::NotSupported("EndTrace() is not implemented."); } + // IO Tracing operations. Use EndIOTrace() to stop tracing. + virtual Status StartIOTrace(const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartIOTrace() is not implemented."); + } + + virtual Status EndIOTrace() { + return Status::NotSupported("EndIOTrace() is not implemented."); + } + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. virtual Status StartBlockCacheTrace( const TraceOptions& /*options*/, @@ -1459,6 +1740,15 @@ virtual Status EndBlockCacheTrace() { return Status::NotSupported("EndBlockCacheTrace() is not implemented."); } + + // Create a default trace replayer. 
+ virtual Status NewDefaultReplayer( + const std::vector& /*handles*/, + std::unique_ptr&& /*reader*/, + std::unique_ptr* /*replayer*/) { + return Status::NotSupported("NewDefaultReplayer() is not implemented."); + } + #endif // ROCKSDB_LITE // Needed for StackableDB diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,12 +17,16 @@ #pragma once #include + #include #include #include #include #include #include + +#include "rocksdb/customizable.h" +#include "rocksdb/functor_wrapper.h" #include "rocksdb/status.h" #include "rocksdb/thread_status.h" @@ -30,11 +34,12 @@ // Windows API macro interference #undef DeleteFile #undef GetCurrentTime +#undef LoadLibrary #endif #if defined(__GNUC__) || defined(__clang__) #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ - __attribute__((__format__(__printf__, format_param, dots_param))) + __attribute__((__format__(__printf__, format_param, dots_param))) #else #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) #endif @@ -47,6 +52,7 @@ class RandomAccessFile; class SequentialFile; class Slice; +struct DataVerificationInfo; class WritableFile; class RandomRWFile; class MemoryMappedFileBuffer; @@ -57,9 +63,19 @@ class RateLimiter; class ThreadStatusUpdater; struct ThreadStatus; +class FileSystem; +class SystemClock; +struct ConfigOptions; const size_t kDefaultPageSize = 4 * 1024; +enum class CpuPriority { + kIdle = 0, + kLow = 1, + kNormal = 2, + kHigh = 3, +}; + // Options while opening a file to read/write struct EnvOptions { // Construct with default Options @@ -68,7 +84,8 @@ // Construct from Options explicit EnvOptions(const DBOptions& options); - // If true, then use mmap to read data + // If true, then use mmap to 
read data. + // Not recommended for 32-bit OS. bool use_mmap_reads = false; // If true, then use mmap to write data @@ -130,8 +147,12 @@ RateLimiter* rate_limiter = nullptr; }; -class Env { +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Env : public Customizable { public: + static const char* kDefaultName() { return "DefaultEnv"; } struct FileAttributes { // File name std::string name; @@ -140,22 +161,63 @@ uint64_t size_bytes; }; - Env() : thread_status_updater_(nullptr) {} + Env(); + // Construct an Env with a separate FileSystem and/or SystemClock + // implementation + explicit Env(const std::shared_ptr& fs); + Env(const std::shared_ptr& fs, + const std::shared_ptr& clock); // No copying allowed Env(const Env&) = delete; void operator=(const Env&) = delete; - virtual ~Env(); + ~Env() override; static const char* Type() { return "Environment"; } + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. + const char* Name() const override { return ""; } + // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result); // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard); + // Loads the environment specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // + // @param config_options Controls how the environment is loaded. 
+ // @param value the name and associated properties for the environment. + // @param result On success, the environment that was loaded. + // @param guard If specified and the loaded environment is not static, + // this value will contain the loaded environment (guard.get() == + // result). + // @return OK If the environment was successfully loaded (and optionally + // prepared) + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result); + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard); + + // Loads the environment specified by the env and fs uri. + // If both are specified, an error is returned. + // Otherwise, the environment is created by loading (via CreateFromString) + // the appropriate env/fs from the corresponding values. + static Status CreateFromUri(const ConfigOptions& options, + const std::string& env_uri, + const std::string& fs_uri, Env** result, + std::shared_ptr* guard); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. @@ -163,6 +225,15 @@ // The result of Default() belongs to rocksdb and must never be deleted. static Env* Default(); + // See FileSystem::RegisterDbPaths. + virtual Status RegisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // See FileSystem::UnregisterDbPaths. + virtual Status UnregisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // Create a brand new sequentially-readable file with the specified name. // On success, stores a pointer to the new file in *result and returns OK. // On failure stores nullptr in *result and returns non-OK. 
If the file does @@ -205,17 +276,18 @@ std::unique_ptr* result, const EnvOptions& options) = 0; - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. + // Create an object that writes to a file with the specified name. + // `WritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. // // The returned file will only be accessed by one thread at a time. virtual Status ReopenWritableFile(const std::string& /*fname*/, std::unique_ptr* /*result*/, const EnvOptions& /*options*/) { - return Status::NotSupported(); + return Status::NotSupported("Env::ReopenWritableFile() not supported."); } // Reuse an existing file by renaming it and opening it as writable. @@ -263,7 +335,8 @@ virtual Status FileExists(const std::string& fname) = 0; // Store in *result the names of the children of the specified directory. - // The names are relative to "dir". + // The names are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -276,7 +349,8 @@ // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. - // The name attributes are relative to "dir". + // The name attributes are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. 
// Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -301,6 +375,8 @@ virtual Status CreateDirIfMissing(const std::string& dirname) = 0; // Delete the specified directory. + // Many implementations of this function will only delete a directory if it is + // empty. virtual Status DeleteDir(const std::string& dirname) = 0; // Store the size of fname in *file_size. @@ -369,7 +445,13 @@ static std::string PriorityToString(Priority priority); // Priority for requesting bytes in rate limiter scheduler - enum IOPriority { IO_LOW = 0, IO_HIGH = 1, IO_TOTAL = 2 }; + enum IOPriority { + IO_LOW = 0, + IO_MID = 1, + IO_HIGH = 2, + IO_USER = 3, + IO_TOTAL = 4 + }; // Arrange to run "(*function)(arg)" once in a background thread, in // the thread pool specified by pri. By default, jobs go to the 'LOW' @@ -393,6 +475,21 @@ // When "function(arg)" returns, the thread will be destroyed. virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + // Start a new thread, invoking "function(args...)" within the new thread. + // When "function(args...)" returns, the thread will be destroyed. + template + void StartThreadTyped(FunctionT function, Args&&... args) { + using FWType = FunctorWrapper; + StartThread( + [](void* arg) { + auto* functor = static_cast(arg); + functor->invoke(); + delete functor; + }, + new FWType(std::function(function), + std::forward(args)...)); + } + // Wait for all threads started by StartThread to terminate. virtual void WaitForJoin() {} @@ -408,7 +505,7 @@ virtual Status GetTestDirectory(std::string* path) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. 
virtual Status NewLogger(const std::string& fname, std::shared_ptr* result); @@ -431,9 +528,15 @@ // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; - // Get the current host name. + // Get the current host name as a null terminated string iff the string + // length is < len. The hostname should otherwise be truncated to len. virtual Status GetHostName(char* name, uint64_t len) = 0; + // Get the current hostname from the given env as a std::string in result. + // The result may be truncated if the hostname is too + // long + virtual Status GetHostNameString(std::string* result); + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). // Only overwrites *unix_time on success. virtual Status GetCurrentTime(int64_t* unix_time) = 0; @@ -449,7 +552,7 @@ virtual int GetBackgroundThreads(Priority pri = LOW) = 0; virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) { - return Status::NotSupported("Not supported."); + return Status::NotSupported("Env::SetAllowNonOwnerAccess() not supported."); } // Enlarge number of background worker threads of a specific thread pool @@ -461,12 +564,22 @@ virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {} // Lower CPU priority for threads from the specified pool. + virtual Status LowerThreadPoolCPUPriority(Priority /*pool*/, + CpuPriority /*pri*/) { + return Status::NotSupported( + "Env::LowerThreadPoolCPUPriority(Priority, CpuPriority) not supported"); + } + + // Lower CPU priority for threads from the specified pool. virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {} // Converts seconds-since-Jan-01-1970 to a printable string virtual std::string TimeToString(uint64_t time) = 0; - // Generates a unique id that can be used to identify a db + // Generates a human-readable unique ID that can be used to identify a DB. 
+ // In built-in implementations, this is an RFC-4122 UUID string, but might + // not be in all implementations. Overriding is not recommended. + // NOTE: this has not be validated for use in cryptography virtual std::string GenerateUniqueId(); // OptimizeForLogWrite will create a new EnvOptions object that is a copy of @@ -504,9 +617,16 @@ const EnvOptions& env_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // blob files. + virtual EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + // Returns the status of all threads that belong to the current Env. virtual Status GetThreadList(std::vector* /*thread_list*/) { - return Status::NotSupported("Not supported."); + return Status::NotSupported("Env::GetThreadList() not supported."); } // Returns the pointer to ThreadStatusUpdater. This function will be @@ -525,17 +645,39 @@ // Get the amount of free disk space virtual Status GetFreeSpace(const std::string& /*path*/, uint64_t* /*diskfree*/) { - return Status::NotSupported(); + return Status::NotSupported("Env::GetFreeSpace() not supported."); + } + + // Check whether the specified path is a directory + virtual Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) { + return Status::NotSupported("Env::IsDirectory() not supported."); } virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {} + // Get the FileSystem implementation this Env was constructed with. It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetFileSystem() const; + + // Get the SystemClock implementation this Env was constructed with. 
It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetSystemClock() const; + // If you're adding methods here, remember to add them to EnvWrapper too. protected: // The pointer to an internal structure that will update the // status of each thread. ThreadStatusUpdater* thread_status_updater_; + + // Pointer to the underlying FileSystem implementation + std::shared_ptr file_system_; + + // Pointer to the underlying SystemClock implementation + std::shared_ptr system_clock_; + + private: + static const size_t kMaxHostNameLen = 256; }; // The factory function to construct a ThreadStatusUpdater. Any Env @@ -556,6 +698,10 @@ // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // REQUIRES: External synchronization virtual Status Read(size_t n, Slice* result, char* scratch) = 0; @@ -580,14 +726,16 @@ // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { - return Status::NotSupported("InvalidateCache not supported."); + return Status::NotSupported( + "SequentialFile::InvalidateCache not supported."); } // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/, Slice* /*result*/, char* /*scratch*/) { - return Status::NotSupported(); + return Status::NotSupported( + "SequentialFile::PositionedRead() not supported."); } // If you're adding methods here, remember to add them to @@ -599,7 +747,8 @@ // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. 
`result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -628,6 +777,10 @@ // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. virtual Status Read(uint64_t offset, size_t n, Slice* result, @@ -690,7 +843,8 @@ // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { - return Status::NotSupported("InvalidateCache not supported."); + return Status::NotSupported( + "RandomAccessFile::InvalidateCache not supported."); } // If you're adding methods here, remember to add them to @@ -722,10 +876,22 @@ virtual ~WritableFile(); // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual Status Append(const Slice& data) = 0; + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). 
+ virtual Status Append(const Slice& data, + const DataVerificationInfo& /* verification_info */) { + return Append(data); + } + // PositionedAppend data to the specified offset. The new EOF after append // must be larger than the previous EOF. This is to be used when writes are // not backed by OS buffers and hence has to always start from the start of @@ -748,7 +914,21 @@ // required is queried via GetRequiredBufferAlignment() virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) { - return Status::NotSupported(); + return Status::NotSupported( + "WritableFile::PositionedAppend() not supported."); + } + + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const DataVerificationInfo& /* verification_info */) { + return Status::NotSupported("PositionedAppend"); } // Truncate is necessary to trim the file to the correct size @@ -823,7 +1003,7 @@ // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { - return Status::NotSupported("InvalidateCache not supported."); + return Status::NotSupported("WritableFile::InvalidateCache not supported."); } // Sync a file range with disk. 
@@ -857,8 +1037,10 @@ if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; + // TODO: Don't ignore errors from allocate Allocate(block_size * last_preallocated_block_, - block_size * num_spanned_blocks); + block_size * num_spanned_blocks) + .PermitUncheckedError(); last_preallocated_block_ = new_last_preallocated_block; } } @@ -908,6 +1090,11 @@ // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const = 0; @@ -973,6 +1160,10 @@ }; // An interface for writing log messages. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. class Logger { public: size_t kDoNotSupportGetLogFileSize = (std::numeric_limits::max)(); @@ -996,11 +1187,17 @@ virtual void LogHeader(const char* format, va_list ap) { // Default implementation does a simple INFO level log write. // Please override as per the logger class requirement. - Logv(format, ap); + Logv(InfoLogLevel::INFO_LEVEL, format, ap); } // Write an entry to the log file with the specified format. - virtual void Logv(const char* format, va_list ap) = 0; + // + // Users who override the `Logv()` overload taking `InfoLogLevel` do not need + // to implement this, unless they explicitly invoke it in + // `Logv(InfoLogLevel, ...)`. + virtual void Logv(const char* /* format */, va_list /* ap */) { + assert(false); + } // Write an entry to the log file with the specified log level // and format. 
Any log with level under the internal log level @@ -1027,7 +1224,9 @@ InfoLogLevel log_level_; }; -// Identifies a locked file. +// Identifies a locked file. Except in custom Env/Filesystem implementations, +// the lifetime of a FileLock object should be managed only by LockFile() and +// UnlockFile(). class FileLock { public: FileLock() {} @@ -1147,232 +1346,297 @@ // functionality of another Env. class EnvWrapper : public Env { public: + // The Target struct allows an Env to be stored as a raw (Env*) or + // std::shared_ptr. By using this struct, the wrapping/calling + // class does not need to worry about the ownership/lifetime of the + // wrapped target env. If the guard is set, then the Env will point + // to the guard.get(). + struct Target { + Env* env; // The raw Env + std::shared_ptr guard; // The guarded Env + + // Creates a Target without assuming ownership of the target Env + explicit Target(Env* t) : env(t) {} + + // Creates a Target from the guarded env, assuming ownership + explicit Target(std::unique_ptr&& t) : guard(t.release()) { + env = guard.get(); + } + + // Creates a Target from the guarded env, assuming ownership + explicit Target(const std::shared_ptr& t) : guard(t) { + env = guard.get(); + } + + // Makes sure the raw Env is not nullptr + void Prepare() { + if (guard.get() != nullptr) { + env = guard.get(); + } else if (env == nullptr) { + env = Env::Default(); + } + } + }; + // Initialize an EnvWrapper that delegates all calls to *t - explicit EnvWrapper(Env* t) : target_(t) {} + explicit EnvWrapper(Env* t); + explicit EnvWrapper(std::unique_ptr&& t); + explicit EnvWrapper(const std::shared_ptr& t); ~EnvWrapper() override; // Return the target to which this Env forwards all calls - Env* target() const { return target_; } + Env* target() const { return target_.env; } + + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. 
+ const char* Name() const override { return target_.env->Name(); } // The following text is boilerplate that forwards all methods to target() + Status RegisterDbPaths(const std::vector& paths) override { + return target_.env->RegisterDbPaths(paths); + } + + Status UnregisterDbPaths(const std::vector& paths) override { + return target_.env->UnregisterDbPaths(paths); + } + Status NewSequentialFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { - return target_->NewSequentialFile(f, r, options); + return target_.env->NewSequentialFile(f, r, options); } Status NewRandomAccessFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { - return target_->NewRandomAccessFile(f, r, options); + return target_.env->NewRandomAccessFile(f, r, options); } Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { - return target_->NewWritableFile(f, r, options); + return target_.env->NewWritableFile(f, r, options); } Status ReopenWritableFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& options) override { - return target_->ReopenWritableFile(fname, result, options); + return target_.env->ReopenWritableFile(fname, result, options); } Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, std::unique_ptr* r, const EnvOptions& options) override { - return target_->ReuseWritableFile(fname, old_fname, r, options); + return target_.env->ReuseWritableFile(fname, old_fname, r, options); } Status NewRandomRWFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& options) override { - return target_->NewRandomRWFile(fname, result, options); + return target_.env->NewRandomRWFile(fname, result, options); } Status NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override { - return target_->NewMemoryMappedFileBuffer(fname, result); + return target_.env->NewMemoryMappedFileBuffer(fname, result); 
} Status NewDirectory(const std::string& name, std::unique_ptr* result) override { - return target_->NewDirectory(name, result); + return target_.env->NewDirectory(name, result); } Status FileExists(const std::string& f) override { - return target_->FileExists(f); + return target_.env->FileExists(f); } Status GetChildren(const std::string& dir, std::vector* r) override { - return target_->GetChildren(dir, r); + return target_.env->GetChildren(dir, r); } Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override { - return target_->GetChildrenFileAttributes(dir, result); + return target_.env->GetChildrenFileAttributes(dir, result); } Status DeleteFile(const std::string& f) override { - return target_->DeleteFile(f); + return target_.env->DeleteFile(f); } Status Truncate(const std::string& fname, size_t size) override { - return target_->Truncate(fname, size); + return target_.env->Truncate(fname, size); } Status CreateDir(const std::string& d) override { - return target_->CreateDir(d); + return target_.env->CreateDir(d); } Status CreateDirIfMissing(const std::string& d) override { - return target_->CreateDirIfMissing(d); + return target_.env->CreateDirIfMissing(d); } Status DeleteDir(const std::string& d) override { - return target_->DeleteDir(d); + return target_.env->DeleteDir(d); } Status GetFileSize(const std::string& f, uint64_t* s) override { - return target_->GetFileSize(f, s); + return target_.env->GetFileSize(f, s); } Status GetFileModificationTime(const std::string& fname, uint64_t* file_mtime) override { - return target_->GetFileModificationTime(fname, file_mtime); + return target_.env->GetFileModificationTime(fname, file_mtime); } Status RenameFile(const std::string& s, const std::string& t) override { - return target_->RenameFile(s, t); + return target_.env->RenameFile(s, t); } Status LinkFile(const std::string& s, const std::string& t) override { - return target_->LinkFile(s, t); + return target_.env->LinkFile(s, t); } 
Status NumFileLinks(const std::string& fname, uint64_t* count) override { - return target_->NumFileLinks(fname, count); + return target_.env->NumFileLinks(fname, count); } Status AreFilesSame(const std::string& first, const std::string& second, bool* res) override { - return target_->AreFilesSame(first, second, res); + return target_.env->AreFilesSame(first, second, res); } Status LockFile(const std::string& f, FileLock** l) override { - return target_->LockFile(f, l); + return target_.env->LockFile(f, l); } - Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } + Status UnlockFile(FileLock* l) override { return target_.env->UnlockFile(l); } + + Status IsDirectory(const std::string& path, bool* is_dir) override { + return target_.env->IsDirectory(path, is_dir); + } Status LoadLibrary(const std::string& lib_name, const std::string& search_path, std::shared_ptr* result) override { - return target_->LoadLibrary(lib_name, search_path, result); + return target_.env->LoadLibrary(lib_name, search_path, result); } void Schedule(void (*f)(void* arg), void* a, Priority pri, void* tag = nullptr, void (*u)(void* arg) = nullptr) override { - return target_->Schedule(f, a, pri, tag, u); + return target_.env->Schedule(f, a, pri, tag, u); } int UnSchedule(void* tag, Priority pri) override { - return target_->UnSchedule(tag, pri); + return target_.env->UnSchedule(tag, pri); } void StartThread(void (*f)(void*), void* a) override { - return target_->StartThread(f, a); + return target_.env->StartThread(f, a); } - void WaitForJoin() override { return target_->WaitForJoin(); } + void WaitForJoin() override { return target_.env->WaitForJoin(); } unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { - return target_->GetThreadPoolQueueLen(pri); + return target_.env->GetThreadPoolQueueLen(pri); } Status GetTestDirectory(std::string* path) override { - return target_->GetTestDirectory(path); + return target_.env->GetTestDirectory(path); } Status 
NewLogger(const std::string& fname, std::shared_ptr* result) override { - return target_->NewLogger(fname, result); + return target_.env->NewLogger(fname, result); } - uint64_t NowMicros() override { return target_->NowMicros(); } - uint64_t NowNanos() override { return target_->NowNanos(); } - uint64_t NowCPUNanos() override { return target_->NowCPUNanos(); } + uint64_t NowMicros() override { return target_.env->NowMicros(); } + uint64_t NowNanos() override { return target_.env->NowNanos(); } + uint64_t NowCPUNanos() override { return target_.env->NowCPUNanos(); } void SleepForMicroseconds(int micros) override { - target_->SleepForMicroseconds(micros); + target_.env->SleepForMicroseconds(micros); } Status GetHostName(char* name, uint64_t len) override { - return target_->GetHostName(name, len); + return target_.env->GetHostName(name, len); } Status GetCurrentTime(int64_t* unix_time) override { - return target_->GetCurrentTime(unix_time); + return target_.env->GetCurrentTime(unix_time); } Status GetAbsolutePath(const std::string& db_path, std::string* output_path) override { - return target_->GetAbsolutePath(db_path, output_path); + return target_.env->GetAbsolutePath(db_path, output_path); } void SetBackgroundThreads(int num, Priority pri) override { - return target_->SetBackgroundThreads(num, pri); + return target_.env->SetBackgroundThreads(num, pri); } int GetBackgroundThreads(Priority pri) override { - return target_->GetBackgroundThreads(pri); + return target_.env->GetBackgroundThreads(pri); } Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { - return target_->SetAllowNonOwnerAccess(allow_non_owner_access); + return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access); } void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - return target_->IncBackgroundThreadsIfNeeded(num, pri); + return target_.env->IncBackgroundThreadsIfNeeded(num, pri); + } + + void LowerThreadPoolIOPriority(Priority pool) override { + 
target_.env->LowerThreadPoolIOPriority(pool); } - void LowerThreadPoolIOPriority(Priority pool = LOW) override { - target_->LowerThreadPoolIOPriority(pool); + void LowerThreadPoolCPUPriority(Priority pool) override { + target_.env->LowerThreadPoolCPUPriority(pool); } - void LowerThreadPoolCPUPriority(Priority pool = LOW) override { - target_->LowerThreadPoolCPUPriority(pool); + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return target_.env->LowerThreadPoolCPUPriority(pool, pri); } std::string TimeToString(uint64_t time) override { - return target_->TimeToString(time); + return target_.env->TimeToString(time); } Status GetThreadList(std::vector* thread_list) override { - return target_->GetThreadList(thread_list); + return target_.env->GetThreadList(thread_list); } ThreadStatusUpdater* GetThreadStatusUpdater() const override { - return target_->GetThreadStatusUpdater(); + return target_.env->GetThreadStatusUpdater(); } - uint64_t GetThreadID() const override { return target_->GetThreadID(); } + uint64_t GetThreadID() const override { return target_.env->GetThreadID(); } std::string GenerateUniqueId() override { - return target_->GenerateUniqueId(); + return target_.env->GenerateUniqueId(); } EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { - return target_->OptimizeForLogRead(env_options); + return target_.env->OptimizeForLogRead(env_options); } EnvOptions OptimizeForManifestRead( const EnvOptions& env_options) const override { - return target_->OptimizeForManifestRead(env_options); + return target_.env->OptimizeForManifestRead(env_options); } EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const override { - return target_->OptimizeForLogWrite(env_options, db_options); + return target_.env->OptimizeForLogWrite(env_options, db_options); } EnvOptions OptimizeForManifestWrite( const EnvOptions& env_options) const override { - return 
target_->OptimizeForManifestWrite(env_options); + return target_.env->OptimizeForManifestWrite(env_options); } EnvOptions OptimizeForCompactionTableWrite( const EnvOptions& env_options, const ImmutableDBOptions& immutable_ops) const override { - return target_->OptimizeForCompactionTableWrite(env_options, immutable_ops); + return target_.env->OptimizeForCompactionTableWrite(env_options, + immutable_ops); } EnvOptions OptimizeForCompactionTableRead( const EnvOptions& env_options, const ImmutableDBOptions& db_options) const override { - return target_->OptimizeForCompactionTableRead(env_options, db_options); + return target_.env->OptimizeForCompactionTableRead(env_options, db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_.env->OptimizeForBlobFileRead(env_options, db_options); } Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { - return target_->GetFreeSpace(path, diskfree); + return target_.env->GetFreeSpace(path, diskfree); } void SanitizeEnvOptions(EnvOptions* env_opts) const override { - target_->SanitizeEnvOptions(env_opts); + target_.env->SanitizeEnvOptions(env_opts); } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE private: - Env* target_; + Target target_; }; class SequentialFileWrapper : public SequentialFile { @@ -1435,9 +1699,18 @@ explicit WritableFileWrapper(WritableFile* t) : target_(t) {} Status Append(const Slice& data) override { return target_->Append(data); } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + return target_->Append(data, verification_info); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { return target_->PositionedAppend(data, offset); } + Status 
PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + return target_->PositionedAppend(data, offset, verification_info); + } Status Truncate(uint64_t size) override { return target_->Truncate(size); } Status Close() override { return target_->Close(); } Status Flush() override { return target_->Flush(); } @@ -1586,4 +1859,8 @@ Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result); +// Creates a new Env based on Env::Default() but modified to use the specified +// FileSystem. +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs); + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,15 +9,24 @@ #include -#include "env.h" +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { class EncryptionProvider; +struct ConfigOptions; + // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. -Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider); +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr& provider); +std::shared_ptr NewEncryptedFS( + const std::shared_ptr& base_fs, + const std::shared_ptr& provider); // BlockAccessCipherStream is the base class for any cipher stream that // supports random access at block level (without requiring data from other @@ -53,10 +62,38 @@ }; // BlockCipher -class BlockCipher { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. 
This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class BlockCipher : public Customizable { public: virtual ~BlockCipher(){}; + // Creates a new BlockCipher from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "ROT13", a ROT13BlockCipher is created. + // + // @param config_options Options to control how this cipher is created + // and initialized. + // @param value The value might be: + // - ROT13 Create a ROT13 Cipher + // - ROT13:nn Create a ROT13 Cipher with block size of nn + // @param result The new cipher object + // @return OK if the cipher was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + static const char* Type() { return "BlockCipher"; } + // Short-cut method to create a ROT13 BlockCipher. + // This cipher is only suitable for test purposes and should not be used in + // production!!! + static std::shared_ptr NewROT13Cipher(size_t block_size); + // BlockSize returns the size of each block supported by this cipher stream. virtual size_t BlockSize() = 0; @@ -69,138 +106,360 @@ virtual Status Decrypt(char* data) = 0; }; -// Implements a BlockCipher using ROT13. -// -// Note: This is a sample implementation of BlockCipher, -// it is NOT considered safe and should NOT be used in production. -class ROT13BlockCipher : public BlockCipher { - private: - size_t blockSize_; - - public: - ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {} - virtual ~ROT13BlockCipher(){}; - - // BlockSize returns the size of each block supported by this cipher stream. 
- virtual size_t BlockSize() override { return blockSize_; } - - // Encrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Encrypt(char* data) override; - - // Decrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Decrypt(char* data) override; -}; - -// CTRCipherStream implements BlockAccessCipherStream using an -// Counter operations mode. -// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation -// -// Note: This is a possible implementation of BlockAccessCipherStream, -// it is considered suitable for use. -class CTRCipherStream final : public BlockAccessCipherStream { - private: - BlockCipher& cipher_; - std::string iv_; - uint64_t initialCounter_; - - public: - CTRCipherStream(BlockCipher& c, const char* iv, uint64_t initialCounter) - : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter){}; - virtual ~CTRCipherStream(){}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return cipher_.BlockSize(); } - - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) override; - - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; - - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; -}; - // The encryption provider is used to create a cipher stream for a specific // file. The returned cipher stream will be used for actual // encryption/decryption actions. -class EncryptionProvider { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. 
This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class EncryptionProvider : public Customizable { public: virtual ~EncryptionProvider(){}; + // Creates a new EncryptionProvider from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "CTR", a CTREncryptionProvider will be + // created. If the value is ends with "://test" (e.g CTR://test"), the + // provider will be initialized in "TEST" mode prior to being returned. + // + // @param config_options Options to control how this provider is created + // and initialized. + // @param value The value might be: + // - CTR Create a CTR provider + // - CTR://test Create a CTR provider and initialize it for tests. + // @param result The new provider object + // @return OK if the provider was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + static const char* Type() { return "EncryptionProvider"; } + + // Short-cut method to create a CTR-provider + static std::shared_ptr NewCTRProvider( + const std::shared_ptr& cipher); + // GetPrefixLength returns the length of the prefix that is added to every // file and used for storing encryption options. For optimal performance, the // prefix length should be a multiple of the page size. - virtual size_t GetPrefixLength() = 0; + virtual size_t GetPrefixLength() const = 0; // CreateNewPrefix initialized an allocated block of prefix memory // for a new file. 
virtual Status CreateNewPrefix(const std::string& fname, char* prefix, - size_t prefixLength) = 0; + size_t prefixLength) const = 0; + + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. + // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; // CreateCipherStream creates a block access cipher stream for a file given // given name and options. virtual Status CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, std::unique_ptr* result) = 0; + + // Returns a string representing an encryption marker prefix for this + // provider. If a marker is provided, this marker can be used to tell whether + // or not a file is encrypted by this provider. The maker will also be part + // of any encryption prefix for this provider. + virtual std::string GetMarker() const { return ""; } }; -// This encryption provider uses a CTR cipher stream, with a given block cipher -// and IV. -// -// Note: This is a possible implementation of EncryptionProvider, -// it is considered suitable for use, provided a safe BlockCipher is used. -class CTREncryptionProvider : public EncryptionProvider { - private: - BlockCipher& cipher_; +class EncryptedSequentialFile : public FSSequentialFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + uint64_t offset_; + size_t prefixLength_; + + public: + // Default ctor. Given underlying sequential file is supposed to be at + // offset == prefixLength. 
+ EncryptedSequentialFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + offset_(prefixLength), + prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + IOStatus Skip(uint64_t n) override; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + IOStatus InvalidateCache(size_t offset, size_t length) override; + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; +}; +// A file abstraction for randomly reading the contents of a file. 
+class EncryptedRandomAccessFile : public FSRandomAccessFile { protected: - const static size_t defaultPrefixLength = 4096; + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; public: - CTREncryptionProvider(BlockCipher& c) : cipher_(c){}; - virtual ~CTREncryptionProvider() {} + EncryptedRandomAccessFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + // Readahead the file starting from offset by n bytes for caching. + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. 
+ // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + size_t GetUniqueId(char* id, size_t max_size) const override; + + void Hint(AccessPattern pattern) override; + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + IOStatus InvalidateCache(size_t offset, size_t length) override; +}; - // GetPrefixLength returns the length of the prefix that is added to every - // file and used for storing encryption options. For optimal performance, the - // prefix length should be a multiple of the page size. - virtual size_t GetPrefixLength() override; +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class EncryptedWritableFile : public FSWritableFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. - virtual Status CreateNewPrefix(const std::string& fname, char* prefix, - size_t prefixLength) override; + public: + // Default ctor. Prefix is assumed to be written already. 
+ EncryptedWritableFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). + bool IsSyncThreadSafe() const override; + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + /* + * Get the size of valid data in the file. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + IOStatus InvalidateCache(size_t offset, size_t length) override; + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. 
+ // Default implementation does nothing. + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override; + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override; + + void SetPreallocationBlockSize(size_t size) override; + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override; + + // Pre-allocates space for a file. + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. - virtual Status CreateCipherStream( - const std::string& fname, const EnvOptions& options, Slice& prefix, - std::unique_ptr* result) override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; +}; +// A file abstraction for random reading and writing. +class EncryptedRandomRWFile : public FSRandomRWFile { protected: - // PopulateSecretPrefixPart initializes the data into a new prefix block - // that will be encrypted. This function will store the data in plain text. - // It will be encrypted later (before written to disk). - // Returns the amount of space (starting from the start of the prefix) - // that has been initialized. - virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, - size_t blockSize); - - // CreateCipherStreamFromPrefix creates a block access cipher stream for a - // file given given name and options. 
The given prefix is already decrypted. - virtual Status CreateCipherStreamFromPrefix( - const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, - std::unique_ptr* result); + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; + + public: + EncryptedRandomRWFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; +class EncryptedFileSystem : public FileSystemWrapper { + public: + explicit EncryptedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. 
+ // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; + static const char* kClassName() { return "EncryptedFileSystem"; } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return FileSystemWrapper::IsInstanceOf(name); + } + } +}; } // namespace ROCKSDB_NAMESPACE #endif // !defined(ROCKSDB_LITE) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,37 +14,90 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { -// FileChecksumFunc is the function class to generates the checksum value +// The unknown file checksum. +constexpr char kUnknownFileChecksum[] = ""; +// The unknown sst file checksum function name. +constexpr char kUnknownFileChecksumFuncName[] = "Unknown"; +// The standard DB file checksum function name. +// This is the name of the checksum function returned by +// GetFileChecksumGenCrc32cFactory(); +constexpr char kStandardDbFileChecksumFuncName[] = "FileChecksumCrc32c"; + +struct FileChecksumGenContext { + std::string file_name; + // The name of the requested checksum generator. 
+ // Checksum factories may use or ignore requested_checksum_func_name, + // and checksum factories written before this field was available are still + // compatible. + std::string requested_checksum_func_name; +}; + +// FileChecksumGenerator is the class to generates the checksum value // for each file when the file is written to the file system. -class FileChecksumFunc { +// Implementations may assume that +// * Finalize is called at most once during the life of the object +// * All calls to Update come before Finalize +// * All calls to GetChecksum come after Finalize +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileChecksumGenerator { public: - virtual ~FileChecksumFunc() {} - // Return the checksum of concat (A, data[0,n-1]) where init_checksum is the - // returned value of some string A. It is used to maintain the checksum of a - // stream of data - virtual std::string Extend(const std::string& init_checksum, const char* data, - size_t n) = 0; + virtual ~FileChecksumGenerator() {} - // Return the checksum value of data[0,n-1] - virtual std::string Value(const char* data, size_t n) = 0; - - // Return a processed value of the checksum for store in somewhere - virtual std::string ProcessChecksum(const std::string& checksum) = 0; + // Update the current result after process the data. For different checksum + // functions, the temporal results may be stored and used in Update to + // include the new data. + virtual void Update(const char* data, size_t n) = 0; + + // Generate the final results if no further new data will be updated. + virtual void Finalize() = 0; + + // Get the checksum. The result should not be the empty string and may + // include arbitrary bytes, including non-printable characters. 
+ virtual std::string GetChecksum() const = 0; // Returns a name that identifies the current file checksum function. virtual const char* Name() const = 0; }; +// Create the FileChecksumGenerator object for each SST file. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileChecksumGenFactory : public Customizable { + public: + ~FileChecksumGenFactory() override {} + static const char* Type() { return "FileChecksumGenFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + + // Create a new FileChecksumGenerator. + virtual std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) = 0; + + // Return the name of this FileChecksumGenFactory. + const char* Name() const override = 0; +}; + // FileChecksumList stores the checksum information of a list of files (e.g., -// SST files). The FileChecksumLIst can be used to store the checksum +// SST files). The FileChecksumList can be used to store the checksum // information of all SST file getting from the MANIFEST, which are // the checksum information of all valid SST file of a DB instance. It can // also be used to store the checksum information of a list of SST files to // be ingested. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. class FileChecksumList { public: virtual ~FileChecksumList() {} @@ -80,7 +133,14 @@ // Create a new file checksum list. 
extern FileChecksumList* NewFileChecksumList(); -// Create a Crc32c based file checksum function -extern FileChecksumFunc* CreateFileChecksumFuncCrc32c(); +// Return a shared_ptr of the builtin Crc32c based file checksum generator +// factory object, which can be shared to create the Crc32c based checksum +// generator object. +// Note: this implementation is compatible with many other crc32c checksum +// implementations and uses big-endian encoding of the result, unlike most +// other crc32c checksums in RocksDB, which alter the result with +// crc32c::Mask and use little-endian encoding. +extern std::shared_ptr +GetFileChecksumGenCrc32cFactory(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_system.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_system.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_system.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_system.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,7 @@ #pragma once #include + #include #include #include @@ -24,10 +25,14 @@ #include #include #include +#include #include + +#include "rocksdb/customizable.h" #include "rocksdb/env.h" #include "rocksdb/io_status.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "rocksdb/thread_status.h" namespace ROCKSDB_NAMESPACE { @@ -43,6 +48,7 @@ struct ImmutableDBOptions; struct MutableDBOptions; class RateLimiter; +struct ConfigOptions; using AccessPattern = RandomAccessFile::AccessPattern; using FileAttributes = Env::FileAttributes; @@ -77,14 +83,53 @@ // honored. More hints can be added here in the future to indicate things like // storage media (HDD/SSD) to be used, replication level etc. 
struct IOOptions { - // Timeout for the operation in milliseconds - std::chrono::milliseconds timeout; + // Timeout for the operation in microseconds + std::chrono::microseconds timeout; // Priority - high or low IOPriority prio; // Type of data being read/written IOType type; + + // EXPERIMENTAL + // An option map that's opaque to RocksDB. It can be used to implement a + // custom contract between a FileSystem user and the provider. This is only + // useful in cases where a RocksDB user directly uses the FileSystem or file + // object for their own purposes, and wants to pass extra options to APIs + // such as NewRandomAccessFile and NewWritableFile. + std::unordered_map property_bag; + + // Force directory fsync, some file systems like btrfs may skip directory + // fsync, set this to force the fsync + bool force_dir_fsync; + + IOOptions() : IOOptions(false) {} + + explicit IOOptions(bool force_dir_fsync_) + : timeout(std::chrono::microseconds::zero()), + prio(IOPriority::kIOLow), + type(IOType::kUnknown), + force_dir_fsync(force_dir_fsync_) {} +}; + +struct DirFsyncOptions { + enum FsyncReason : uint8_t { + kNewFileSynced, + kFileRenamed, + kDirRenamed, + kFileDeleted, + kDefault, + } reason; + + std::string renamed_new_name; // for kFileRenamed + // add other options for other FsyncReason + + DirFsyncOptions(); + + explicit DirFsyncOptions(std::string file_renamed_new_name); + + explicit DirFsyncOptions(FsyncReason fsync_reason); }; // File scope options that control how a file is opened/created and accessed @@ -95,13 +140,32 @@ // to be issued for the file open/creation IOOptions io_options; - FileOptions() : EnvOptions() {} + // EXPERIMENTAL + // The feature is in development and is subject to change. + // When creating a new file, set the temperature of the file so that + // underlying file systems can put it with appropriate storage media and/or + // coding. 
+ Temperature temperature = Temperature::kUnknown; + + // The checksum type that is used to calculate the checksum value for + // handoff during file writes. + ChecksumType handoff_checksum_type; + + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const EnvOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} + + FileOptions(const FileOptions& opts) + : EnvOptions(opts), + io_options(opts.io_options), + temperature(opts.temperature), + handoff_checksum_type(opts.handoff_checksum_type) {} + + FileOptions& operator=(const FileOptions&) = default; }; // A structure to pass back some debugging information from the FileSystem @@ -116,12 +180,36 @@ // To be set by the FileSystem implementation std::string msg; + // To be set by the underlying FileSystem implementation. + std::string request_id; + + // In order to log required information in IO tracing for different + // operations, Each bit in trace_data stores which corresponding info from + // IODebugContext will be added in the trace. Foreg, if trace_data = 1, it + // means bit at position 0 is set so TraceData::kRequestID (request_id) will + // be logged in the trace record. + // + enum TraceData : char { + // The value of each enum represents the bitwise position for + // that information in trace_data which will be used by IOTracer for + // tracing. Make sure to add them sequentially. + kRequestID = 0, + }; + uint64_t trace_data = 0; + IODebugContext() {} void AddCounter(std::string& name, uint64_t value) { counters.emplace(name, value); } + // Called by underlying file system to set request_id and log request_id in + // IOTracing. 
+ void SetRequestId(const std::string& _request_id) { + request_id = _request_id; + trace_data |= (1 << TraceData::kRequestID); + } + std::string ToString() { std::ostringstream ss; ss << file_path << ", "; @@ -147,7 +235,13 @@ // of the APIs is of type IOStatus, which can indicate an error code/sub-code, // as well as metadata about the error such as its scope and whether its // retryable. -class FileSystem { +// NewCompositeEnv can be used to create an Env with a custom FileSystem for +// DBOptions::env. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileSystem : public Customizable { public: FileSystem(); @@ -156,21 +250,61 @@ virtual ~FileSystem(); - virtual const char* Name() const = 0; - static const char* Type() { return "FileSystem"; } + static const char* kDefaultName() { return "DefaultFileSystem"; } // Loads the FileSystem specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status Load(const std::string& value, std::shared_ptr* result); - // Return a default fie_system suitable for the current operating - // system. Sophisticated users may wish to provide their own Env - // implementation instead of relying on this default file_system - // - // The result of Default() belongs to rocksdb and must never be deleted. + // Loads the FileSystem specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // @param config_options Controls how the FileSystem is loaded + // @param value The name and optional properties describing the file system + // to load. + // @param result On success, returns the loaded FileSystem + // @return OK if the FileSystem was successfully loaded. 
+ // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + + // Return a default FileSystem suitable for the current operating + // system. static std::shared_ptr Default(); + // Handles the event when a new DB or a new ColumnFamily starts using the + // specified data paths. + // + // The data paths might be shared by different DBs or ColumnFamilies, + // so RegisterDbPaths might be called with the same data paths. + // For example, when CreateColumnFamily is called multiple times with the same + // data path, RegisterDbPaths will also be called with the same data path. + // + // If the return status is ok, then the paths must be correspondingly + // called in UnregisterDbPaths; + // otherwise this method should have no side effect, and UnregisterDbPaths + // do not need to be called for the paths. + // + // Different implementations may take different actions. + // By default, it's a no-op and returns Status::OK. + virtual Status RegisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // Handles the event a DB or a ColumnFamily stops using the specified data + // paths. + // + // It should be called corresponding to each successful RegisterDbPaths. + // + // Different implementations may take different actions. + // By default, it's a no-op and returns Status::OK. + virtual Status UnregisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // Create a brand new sequentially-readable file with the specified name. // On success, stores a pointer to the new file in *result and returns OK. // On failure stores nullptr in *result and returns non-OK. If the file does @@ -216,17 +350,18 @@ std::unique_ptr* result, IODebugContext* dbg) = 0; - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. 
On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. + // Create an object that writes to a file with the specified name. + // `FSWritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. // // The returned file will only be accessed by one thread at a time. virtual IOStatus ReopenWritableFile( const std::string& /*fname*/, const FileOptions& /*options*/, std::unique_ptr* /*result*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("ReopenWritableFile"); } // Reuse an existing file by renaming it and opening it as writable. @@ -234,7 +369,7 @@ const std::string& old_fname, const FileOptions& file_opts, std::unique_ptr* result, - IODebugContext* dbg) = 0; + IODebugContext* dbg); // Open `fname` for random read and write, if file doesn't exist the file // will be created. On success, stores a pointer to the new file in @@ -330,6 +465,10 @@ return IOStatus::OK(); } +// This seems to clash with a macro on Windows, so #undef it here +#ifdef DeleteFile +#undef DeleteFile +#endif // Delete the named file. virtual IOStatus DeleteFile(const std::string& fname, const IOOptions& options, @@ -424,7 +563,7 @@ IODebugContext* dbg) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, std::shared_ptr* result, @@ -436,6 +575,10 @@ std::string* output_path, IODebugContext* dbg) = 0; + // Sanitize the FileOptions. 
Typically called by a FileOptions/EnvOptions + // copy constructor + virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {} + // OptimizeForLogRead will create a new FileOptions object that is a copy of // the FileOptions in the parameters, but is optimized for reading log files. virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const; @@ -473,6 +616,13 @@ const FileOptions& file_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading blob files. + virtual FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace @@ -483,9 +633,13 @@ const IOOptions& /*options*/, uint64_t* /*diskfree*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("GetFreeSpace"); } + virtual IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& options, bool* is_dir, + IODebugContext* /*dgb*/) = 0; + // If you're adding methods here, remember to add them to EnvWrapper too. private: @@ -506,6 +660,10 @@ // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. 
+ // // REQUIRES: External synchronization virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) = 0; @@ -540,7 +698,7 @@ const IOOptions& /*options*/, Slice* /*result*/, char* /*scratch*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedRead"); } // If you're adding methods here, remember to add them to @@ -552,7 +710,8 @@ // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. `result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -582,6 +741,10 @@ // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, @@ -589,19 +752,22 @@ IODebugContext* dbg) const = 0; // Readahead the file starting from offset by n bytes for caching. + // If it's not implemented (default: `NotSupported`), RocksDB will create + // internal prefetch buffer to improve read performance. virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::OK(); + return IOStatus::NotSupported("Prefetch"); } // Read a bunch of blocks as described by reqs. The blocks can // optionally be read in parallel. This is a synchronous call, i.e it // should return after all reads have completed. The reads will be - // non-overlapping. If the function return Status is not ok, status of - // individual requests will be ignored and return status will be assumed - // for all read requests. 
The function return status is only meant for any - // any errors that occur before even processing specific read requests + // non-overlapping but can be in any order. If the function return Status + // is not ok, status of individual requests will be ignored and return + // status will be assumed for all read requests. The function return status + // is only meant for errors that occur before processing individual read + // requests. virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { assert(reqs != nullptr); @@ -656,6 +822,13 @@ // RandomAccessFileWrapper too. }; +// A data structure brings the data verification information, which is +// used together with data being written to a file. +struct DataVerificationInfo { + // checksum of the data being written. + Slice checksum; +}; + // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. @@ -678,11 +851,25 @@ virtual ~FSWritableFile() {} // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual IOStatus Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) = 0; + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). 
+ virtual IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) { + return Append(data, options, dbg); + } + // PositionedAppend data to the specified offset. The new EOF after append // must be larger than the previous EOF. This is to be used when writes are // not backed by OS buffers and hence has to always start from the start of @@ -707,7 +894,23 @@ uint64_t /* offset */, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedAppend"); + } + + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual IOStatus PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const IOOptions& /*options*/, + const DataVerificationInfo& /* verification_info */, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("PositionedAppend"); } // Truncate is necessary to trim the file to the correct size @@ -825,7 +1028,8 @@ size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; Allocate(block_size * last_preallocated_block_, - block_size * num_spanned_blocks, options, dbg); + block_size * num_spanned_blocks, options, dbg) + .PermitUncheckedError(); last_preallocated_block_ = new_last_preallocated_block; } } @@ -878,6 +1082,11 @@ // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. 
+ // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -931,6 +1140,15 @@ // Fsync directory. Can be called concurrently from multiple threads. virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0; + // FsyncWithDirOptions after renaming a file. Depends on the filesystem, it + // may fsync directory or just the renaming file (e.g. btrfs). By default, it + // just calls directory fsync. + virtual IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& /*dir_fsync_options*/) { + return Fsync(options, dbg); + } + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { return 0; } @@ -972,11 +1190,15 @@ class FileSystemWrapper : public FileSystem { public: // Initialize an EnvWrapper that delegates all calls to *t - explicit FileSystemWrapper(FileSystem* t) : target_(t) {} + explicit FileSystemWrapper(const std::shared_ptr& t); ~FileSystemWrapper() override {} + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. 
+ const char* Name() const override { return target_->Name(); } + // Return the target to which this Env forwards all calls - FileSystem* target() const { return target_; } + FileSystem* target() const { return target_.get(); } // The following text is boilerplate that forwards all methods to target() IOStatus NewSequentialFile(const std::string& f, @@ -1120,6 +1342,10 @@ return target_->NewLogger(fname, options, result, dbg); } + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeFileOptions(opts); + } + FileOptions OptimizeForLogRead( const FileOptions& file_options) const override { return target_->OptimizeForLogRead(file_options); @@ -1147,19 +1373,37 @@ const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(file_options, db_options); } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, uint64_t* diskfree, IODebugContext* dbg) override { return target_->GetFreeSpace(path, options, diskfree, dbg); } + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override { + return target_->IsDirectory(path, options, is_dir, dbg); + } - private: - FileSystem* target_; + const Customizable* Inner() const override { return target_.get(); } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + protected: + std::shared_ptr target_; }; class FSSequentialFileWrapper : public FSSequentialFile { public: - explicit FSSequentialFileWrapper(FSSequentialFile* target) - : target_(target) {} + // Creates a FileWrapper around the input File object and without + // 
taking ownership of the object + explicit FSSequentialFileWrapper(FSSequentialFile* t) : target_(t) {} + + FSSequentialFile* target() const { return target_; } IOStatus Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) override { @@ -1183,10 +1427,24 @@ FSSequentialFile* target_; }; +class FSSequentialFileOwnerWrapper : public FSSequentialFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSSequentialFileOwnerWrapper(std::unique_ptr&& t) + : FSSequentialFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSRandomAccessFileWrapper : public FSRandomAccessFile { public: - explicit FSRandomAccessFileWrapper(FSRandomAccessFile* target) - : target_(target) {} + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSRandomAccessFileWrapper(FSRandomAccessFile* t) : target_(t) {} + + FSRandomAccessFile* target() const { return target_; } IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -1214,22 +1472,51 @@ } private: + std::unique_ptr guard_; FSRandomAccessFile* target_; }; +class FSRandomAccessFileOwnerWrapper : public FSRandomAccessFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSRandomAccessFileOwnerWrapper( + std::unique_ptr&& t) + : FSRandomAccessFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSWritableFileWrapper : public FSWritableFile { public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {} + FSWritableFile* target() const { return target_; } + IOStatus Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) override { return 
target_->Append(data, options, dbg); } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + return target_->Append(data, options, verification_info, dbg); + } IOStatus PositionedAppend(const Slice& data, uint64_t offset, const IOOptions& options, IODebugContext* dbg) override { return target_->PositionedAppend(data, offset, options, dbg); } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + return target_->PositionedAppend(data, offset, options, verification_info, + dbg); + } IOStatus Truncate(uint64_t size, const IOOptions& options, IODebugContext* dbg) override { return target_->Truncate(size, options, dbg); @@ -1302,9 +1589,24 @@ FSWritableFile* target_; }; +class FSWritableFileOwnerWrapper : public FSWritableFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSWritableFileOwnerWrapper(std::unique_ptr&& t) + : FSWritableFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSRandomRWFileWrapper : public FSRandomRWFile { public: - explicit FSRandomRWFileWrapper(FSRandomRWFile* target) : target_(target) {} + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSRandomRWFileWrapper(FSRandomRWFile* t) : target_(t) {} + + FSRandomRWFile* target() const { return target_; } bool use_direct_io() const override { return target_->use_direct_io(); } size_t GetRequiredBufferAlignment() const override { @@ -1336,23 +1638,56 @@ FSRandomRWFile* target_; }; +class FSRandomRWFileOwnerWrapper : public FSRandomRWFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSRandomRWFileOwnerWrapper(std::unique_ptr&& 
t) + : FSRandomRWFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSDirectoryWrapper : public FSDirectory { public: - explicit FSDirectoryWrapper(FSDirectory* target) : target_(target) {} + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSDirectoryWrapper(std::unique_ptr&& t) + : guard_(std::move(t)) { + target_ = guard_.get(); + } + + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSDirectoryWrapper(FSDirectory* t) : target_(t) {} IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { return target_->Fsync(options, dbg); } + + IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override { + return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + } + size_t GetUniqueId(char* id, size_t max_size) const override { return target_->GetUniqueId(id, max_size); } private: + std::unique_ptr guard_; FSDirectory* target_; }; +// A utility routine: write "data" to the named file. 
+extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, + bool should_sync = false); + // A utility routine: read contents of named file into *data -extern Status ReadFileToString(FileSystem* fs, const std::string& fname, - std::string* data); +extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,50 +20,70 @@ #pragma once #include + +#include #include #include #include #include #include "rocksdb/advanced_options.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class Slice; struct BlockBasedTableOptions; +struct ConfigOptions; // A class that takes a bunch of keys, then generates filter class FilterBitsBuilder { public: virtual ~FilterBitsBuilder() {} - // Add Key to filter, you could use any way to store the key. - // Such as: storing hashes or original keys - // Keys are in sorted order and duplicated keys are possible. + // Add a key (or prefix) to the filter. Typically, a builder will keep + // a set of 64-bit key hashes and only build the filter in Finish + // when the final number of keys is known. Keys are added in sorted order + // and duplicated keys are possible, so typically, the builder will + // only add this key if its hash is different from the most recently + // added. virtual void AddKey(const Slice& key) = 0; + // Called by RocksDB before Finish to populate + // TableProperties::num_filter_entries, so should represent the + // number of unique keys (and/or prefixes) added, but does not have + // to be exact. 
+ virtual size_t EstimateEntriesAdded() { + // Default implementation for backward compatibility. + // 0 conspicuously stands for "unknown". + return 0; + } + // Generate the filter using the keys that are added // The return value of this function would be the filter bits, // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; - // Calculate num of keys that can be added and generate a filter - // <= the specified number of bytes. -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4702) // unreachable code -#endif - virtual int CalculateNumEntry(const uint32_t /*bytes*/) { -#ifndef ROCKSDB_LITE - throw std::runtime_error("CalculateNumEntry not Implemented"); -#else - abort(); -#endif - return 0; + // Approximate the number of keys that can be added and generate a filter + // <= the specified number of bytes. Callers (including RocksDB) should + // only use this result for optimizing performance and not as a guarantee. + // This default implementation is for compatibility with older custom + // FilterBitsBuilders only implementing deprecated CalculateNumEntry. + virtual size_t ApproximateNumEntries(size_t bytes) { + bytes = std::min(bytes, size_t{0xffffffff}); + return static_cast(CalculateNumEntry(static_cast(bytes))); + } + + // Old, DEPRECATED version of ApproximateNumEntries. This is not + // called by RocksDB except as the default implementation of + // ApproximateNumEntries for API compatibility. 
+ virtual int CalculateNumEntry(const uint32_t bytes) { + // DEBUG: ideally should not rely on this implementation + assert(false); + // RELEASE: something reasonably conservative: 2 bytes per entry + return static_cast(bytes / 2); } -#if defined(_MSC_VER) -#pragma warning(pop) -#endif }; // A class that checks if a key can be in filter @@ -93,18 +113,32 @@ // Options for the table being built const BlockBasedTableOptions& table_options; - // Name of the column family for the table (or empty string if unknown) - std::string column_family_name; - - // The compactions style in effect for the table + // BEGIN from (DB|ColumnFamily)Options in effect at table creation time CompactionStyle compaction_style = kCompactionStyleLevel; - // The table level at time of constructing the SST file, or -1 if unknown. - // (The table file could later be used at a different level.) - int level_at_creation = -1; + // Number of LSM levels, or -1 if unknown + int num_levels = -1; // An optional logger for reporting errors, warnings, etc. Logger* info_log = nullptr; + // END from (DB|ColumnFamily)Options + + // Name of the column family for the table (or empty string if unknown) + // TODO: consider changing to Slice + std::string column_family_name; + + // The table level at time of constructing the SST file, or -1 if unknown + // or N/A as in SstFileWriter. (The table file could later be used at a + // different level.) + int level_at_creation = -1; + + // True if known to be going into bottommost sorted run for applicable + // key range (which might not even be last level with data). False + // otherwise. 
+ bool is_bottommost = false; + + // Reason for creating the file with the filter + TableFileCreationReason reason = TableFileCreationReason::kMisc; }; // We add a new format of filter block called full filter block @@ -125,12 +159,27 @@ public: virtual ~FilterPolicy(); + // Creates a new FilterPolicy based on the input value string and returns the + // result The value might be an ID, and ID with properties, or an old-style + // policy string. + // The value describes the FilterPolicy being created. + // For BloomFilters, value may be a ":"-delimited value of the form: + // "bloomfilter:[bits_per_key]:[use_block_based_builder]", + // e.g. ""bloomfilter:4:true" + // The above string is equivalent to calling NewBloomFilterPolicy(4, true). + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + // Return the name of this policy. Note that if the filter encoding // changes in an incompatible way, the name returned by this method // must be changed. Otherwise, old incompatible filters may be // passed to methods of this type. virtual const char* Name() const = 0; + // DEPRECATED: This function is part of the deprecated block-based + // filter, which will be removed in a future release. + // // keys[0,n-1] contains a list of keys (potentially with duplicates) // that are ordered according to the user supplied comparator. // Append a filter that summarizes keys[0,n-1] to *dst. @@ -140,6 +189,9 @@ virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const = 0; + // DEPRECATED: This function is part of the deprecated block-based + // filter, which will be removed in a future release. + // // "filter" contains the data appended by a preceding call to // CreateFilter() on this class. This method must return true if // the key was in the list of keys passed to CreateFilter(). 
@@ -152,6 +204,7 @@ // NOTE: This function is only called by GetBuilderWithContext() below for // custom FilterPolicy implementations. Thus, it is not necessary to // override this function if overriding GetBuilderWithContext(). + // DEPRECATED: This function will be removed in a future release. virtual FilterBitsBuilder* GetFilterBitsBuilder() const { return nullptr; } // A newer variant of GetFilterBitsBuilder that allows a FilterPolicy @@ -197,4 +250,49 @@ // trailing spaces in keys. extern const FilterPolicy* NewBloomFilterPolicy( double bits_per_key, bool use_block_based_builder = false); + +// A new Bloom alternative that saves about 30% space compared to +// Bloom filters, with similar query times but roughly 3-4x CPU time +// and 3x temporary space usage during construction. For example, if +// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same +// 0.95% FP rate as Bloom filter but only using about 7 bits per key. +// +// The space savings of Ribbon filters makes sense for lower (higher +// numbered; larger; longer-lived) levels of LSM, whereas the speed of +// Bloom filters make sense for highest levels of LSM. Setting +// bloom_before_level allows for this design with Level and Universal +// compaction styles. For example, bloom_before_level=1 means that Bloom +// filters will be used in level 0, including flushes, and Ribbon +// filters elsewhere, including FIFO compaction and external SST files. +// For this option, memtable flushes are considered level -1 (so that +// flushes can be distinguished from intra-L0 compaction). +// bloom_before_level=0 (default) -> Generate Bloom filters only for +// flushes under Level and Universal compaction styles. +// bloom_before_level=-1 -> Always generate Ribbon filters (except in +// some extreme or exceptional cases). +// +// Ribbon filters are compatible with RocksDB >= 6.15.0. 
Earlier +// versions reading the data will behave as if no filter was used +// (degraded performance until compaction rebuilds filters). All +// built-in FilterPolicies (Bloom or Ribbon) are able to read other +// kinds of built-in filters. +// +// Note: the current Ribbon filter schema uses some extra resources +// when constructing very large filters. For example, for 100 million +// keys in a single filter (one SST file without partitioned filters), +// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom. +// However, the savings in filter space from just ~60 open SST files +// makes up for the additional temporary memory use. +// +// Also consider using optimize_filters_for_memory to save filter +// memory. +extern const FilterPolicy* NewRibbonFilterPolicy( + double bloom_equivalent_bits_per_key, int bloom_before_level = 0); + +// Old name and old default behavior (DEPRECATED) +inline const FilterPolicy* NewExperimentalRibbonFilterPolicy( + double bloom_equivalent_bits_per_key) { + return NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, -1); +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,16 +6,23 @@ #pragma once #include + +#include "rocksdb/customizable.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { class Slice; class BlockBuilder; +struct ConfigOptions; struct Options; // FlushBlockPolicy provides a configurable way to determine when to flush a -// block in the block based tables, +// block in the block based tables. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. 
This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. class FlushBlockPolicy { public: // Keep track of the key/value sequences and return the boolean value to @@ -25,10 +32,16 @@ virtual ~FlushBlockPolicy() {} }; -class FlushBlockPolicyFactory { +class FlushBlockPolicyFactory : public Customizable { public: - // Return the name of the flush block policy. - virtual const char* Name() const = 0; + static const char* Type() { return "FlushBlockPolicyFactory"; } + + // Creates a FlushBlockPolicyFactory based on the input value. + // By default, this method can create EveryKey or BySize PolicyFactory, + // which take now config_options. + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result); // Return a new block flush policy that flushes data blocks by data size. // FlushBlockPolicy may need to access the metadata of the data block @@ -45,9 +58,10 @@ class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { public: - FlushBlockBySizePolicyFactory() {} + FlushBlockBySizePolicyFactory(); - const char* Name() const override { return "FlushBlockBySizePolicyFactory"; } + static const char* kClassName() { return "FlushBlockBySizePolicyFactory"; } + const char* Name() const override { return kClassName(); } FlushBlockPolicy* NewFlushBlockPolicy( const BlockBasedTableOptions& table_options, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace detail { +template +struct IndexSequence {}; + +template +struct IndexSequenceHelper + : public IndexSequenceHelper {}; + +template +struct IndexSequenceHelper<0U, Next...> { + using type = IndexSequence; +}; + +template +using make_index_sequence = typename IndexSequenceHelper::type; + +template +void call(Function f, Tuple t, IndexSequence) { + f(std::get(t)...); +} + +template +void call(Function f, Tuple t) { + static constexpr auto size = std::tuple_size::value; + call(f, t, make_index_sequence{}); +} +} // namespace detail + +template +class FunctorWrapper { + public: + explicit FunctorWrapper(std::function functor, Args &&...args) + : functor_(std::move(functor)), args_(std::forward(args)...) {} + + void invoke() { detail::call(functor_, args_); } + + private: + std::function functor_; + std::tuple args_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/io_status.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/io_status.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/io_status.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/io_status.h 2025-05-19 16:14:27.000000000 +0000 @@ -126,6 +126,11 @@ return IOStatus(kIOError, kPathNotFound, msg, msg2); } + static IOStatus IOFenced() { return IOStatus(kIOError, kIOFenced); } + static IOStatus IOFenced(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kIOFenced, msg, msg2); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
// std::string ToString() const; @@ -170,6 +175,9 @@ } inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED retryable_ = s.retryable_; data_loss_ = s.data_loss_; scope_ = s.scope_; @@ -179,6 +187,10 @@ // The following condition catches both aliasing (when this == &s), // and the common case where both s and *this are ok. if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED code_ = s.code_; subcode_ = s.subcode_; retryable_ = s.retryable_; @@ -204,16 +216,18 @@ #endif { if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED code_ = std::move(s.code_); s.code_ = kOk; subcode_ = std::move(s.subcode_); s.subcode_ = kNone; retryable_ = s.retryable_; - retryable_ = false; data_loss_ = s.data_loss_; - data_loss_ = false; scope_ = s.scope_; - scope_ = kIOErrorScopeFileSystem; + s.scope_ = kIOErrorScopeFileSystem; delete[] state_; state_ = nullptr; std::swap(state_, s.state_); @@ -222,11 +236,34 @@ } inline bool IOStatus::operator==(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED return (code_ == rhs.code_); } inline bool IOStatus::operator!=(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED return !(*this == rhs); } +inline IOStatus status_to_io_status(Status&& status) { + if (status.ok()) { + // Fast path + return IOStatus::OK(); + } else { + const char* state = status.getState(); + if (state) { + return IOStatus(status.code(), status.subcode(), + Slice(state, strlen(status.getState()) + 1), Slice()); + } else { + return IOStatus(status.code(), status.subcode()); + } + } +} + } // 
namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,6 +14,32 @@ namespace ROCKSDB_NAMESPACE { +// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each +// item in Temperature class. +struct FileIOByTemperature { + // the number of bytes read to Temperature::kHot file + uint64_t hot_file_bytes_read; + // the number of bytes read to Temperature::kWarm file + uint64_t warm_file_bytes_read; + // the number of bytes read to Temperature::kCold file + uint64_t cold_file_bytes_read; + // total number of reads to Temperature::kHot file + uint64_t hot_file_read_count; + // total number of reads to Temperature::kWarm file + uint64_t warm_file_read_count; + // total number of reads to Temperature::kCold file + uint64_t cold_file_read_count; + // reset all the statistics to 0. + void Reset() { + hot_file_bytes_read = 0; + warm_file_bytes_read = 0; + cold_file_bytes_read = 0; + hot_file_read_count = 0; + warm_file_read_count = 0; + cold_file_read_count = 0; + } +}; + struct IOStatsContext { // reset all io-stats counter to zero void Reset(); @@ -48,9 +74,19 @@ uint64_t cpu_write_nanos; // CPU time spent in read() and pread() uint64_t cpu_read_nanos; + + FileIOByTemperature file_io_stats_by_temperature; }; -// Get Thread-local IOStatsContext object pointer +// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global, +// non-thread-local IOStatsContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. 
+// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local IOStatsContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. IOStatsContext* get_iostats_context(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -53,11 +53,13 @@ // All Seek*() methods clear any error status() that the iterator had prior to // the call; after the seek, status() indicates only the error (if any) that // happened during the seek, not any past errors. + // Target does not contain timestamp. virtual void Seek(const Slice& target) = 0; // Position at the last key in the source that at or before target. // The iterator is Valid() after this call iff the source contains // an entry that comes at or before target. + // Target does not contain timestamp. virtual void SeekForPrev(const Slice& target) = 0; // Moves to the next entry in the source. After this call, Valid() is @@ -90,6 +92,10 @@ // If supported, renew the iterator to represent the latest state. The // iterator will be invalidated after the call. Not supported if // ReadOptions.snapshot is given when creating the iterator. + // + // WARNING: Do not use `Iterator::Refresh()` API on DBs where `DeleteRange()` + // has been used or will be used. This feature combination is neither + // supported nor programmatically prevented. virtual Status Refresh() { return Status::NotSupported("Refresh() is not supported"); } @@ -108,6 +114,11 @@ // Get the user-key portion of the internal key at which the iteration // stopped. 
virtual Status GetProperty(std::string prop_name, std::string* prop); + + virtual Slice timestamp() const { + assert(false); + return Slice(); + } }; // Return an empty iterator (yields nothing). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/listener.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/listener.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/listener.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/listener.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,30 +11,35 @@ #include #include #include + #include "rocksdb/compaction_job_stats.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/io_status.h" #include "rocksdb/status.h" #include "rocksdb/table_properties.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -typedef std::unordered_map> - TablePropertiesCollection; +using TablePropertiesCollection = + std::unordered_map>; class DB; class ColumnFamilyHandle; class Status; struct CompactionJobStats; -enum CompressionType : unsigned char; - -enum class TableFileCreationReason { - kFlush, - kCompaction, - kRecovery, - kMisc, -}; -struct TableFileCreationBriefInfo { - // the name of the database where the file was created +struct FileCreationBriefInfo { + FileCreationBriefInfo() = default; + FileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id) + : db_name(_db_name), + cf_name(_cf_name), + file_path(_file_path), + job_id(_job_id) {} + // the name of the database where the file was created. std::string db_name; // the name of the column family where the file was created. std::string cf_name; @@ -42,7 +47,10 @@ std::string file_path; // the id of the job (which could be flush or compaction) that // created the file. 
- int job_id; + int job_id = 0; +}; + +struct TableFileCreationBriefInfo : public FileCreationBriefInfo { // reason of creating the table. TableFileCreationReason reason; }; @@ -57,6 +65,48 @@ TableProperties table_properties; // The status indicating whether the creation was successful or not. Status status; + // The checksum of the table file being created + std::string file_checksum; + // The checksum function name of checksum generator used for this table file + std::string file_checksum_func_name; +}; + +struct BlobFileCreationBriefInfo : public FileCreationBriefInfo { + BlobFileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id, + BlobFileCreationReason _reason) + : FileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id), + reason(_reason) {} + // reason of creating the blob file. + BlobFileCreationReason reason; +}; + +struct BlobFileCreationInfo : public BlobFileCreationBriefInfo { + BlobFileCreationInfo(const std::string& _db_name, const std::string& _cf_name, + const std::string& _file_path, int _job_id, + BlobFileCreationReason _reason, + uint64_t _total_blob_count, uint64_t _total_blob_bytes, + Status _status, const std::string& _file_checksum, + const std::string& _file_checksum_func_name) + : BlobFileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id, + _reason), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes), + status(_status), + file_checksum(_file_checksum), + file_checksum_func_name(_file_checksum_func_name) {} + + // the number of blob in a file. + uint64_t total_blob_count; + // the total bytes in a file. + uint64_t total_blob_bytes; + // The status indicating whether the creation was successful or not. + Status status; + // The checksum of the blob file being created. + std::string file_checksum; + // The checksum function name of checksum generator used for this blob file. 
+ std::string file_checksum_func_name; }; enum class CompactionReason : int { @@ -93,6 +143,10 @@ kExternalSstIngestion, // Compaction due to SST file being too old kPeriodicCompaction, + // Compaction in order to move files to temperature + kChangeTemperature, + // Compaction scheduled to force garbage collection of blob files + kForcedBlobGC, // total number of compaction reasons, new reasons must be added above this. kNumOfReasons, }; @@ -110,13 +164,24 @@ kAutoCompaction = 0x09, kManualFlush = 0x0a, kErrorRecovery = 0xb, + // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable + // will not be called to avoid many small immutable memtables. + kErrorRecoveryRetryFlush = 0xc, + kWalFull = 0xd, }; +// TODO: In the future, BackgroundErrorReason will only be used to indicate +// why the BG Error is happening (e.g., flush, compaction). We may introduce +// other data structure to indicate other essential information such as +// the file type (e.g., Manifest, SST) and special context. enum class BackgroundErrorReason { kFlush, kCompaction, kWriteCallback, kMemTable, + kManifestWrite, + kFlushNoWAL, + kManifestWriteNoWAL, }; enum class WriteStallCondition { @@ -137,30 +202,113 @@ #ifndef ROCKSDB_LITE -struct TableFileDeletionInfo { +struct FileDeletionInfo { + FileDeletionInfo() = default; + + FileDeletionInfo(const std::string& _db_name, const std::string& _file_path, + int _job_id, Status _status) + : db_name(_db_name), + file_path(_file_path), + job_id(_job_id), + status(_status) {} // The name of the database where the file was deleted. std::string db_name; // The path to the deleted file. std::string file_path; // The id of the job which deleted the file. - int job_id; + int job_id = 0; // The status indicating whether the deletion was successful or not. 
Status status; }; +struct TableFileDeletionInfo : public FileDeletionInfo {}; + +struct BlobFileDeletionInfo : public FileDeletionInfo { + BlobFileDeletionInfo(const std::string& _db_name, + const std::string& _file_path, int _job_id, + Status _status) + : FileDeletionInfo(_db_name, _file_path, _job_id, _status) {} +}; + +enum class FileOperationType { + kRead, + kWrite, + kTruncate, + kClose, + kFlush, + kSync, + kFsync, + kRangeSync, + kAppend, + kPositionedAppend, + kOpen +}; + struct FileOperationInfo { - using TimePoint = std::chrono::time_point; + using Duration = std::chrono::nanoseconds; + using SteadyTimePoint = + std::chrono::time_point; + using SystemTimePoint = + std::chrono::time_point; + using StartTimePoint = std::pair; + using FinishTimePoint = SteadyTimePoint; + FileOperationType type; const std::string& path; uint64_t offset; size_t length; - const TimePoint& start_timestamp; - const TimePoint& finish_timestamp; + const Duration duration; + const SystemTimePoint& start_ts; Status status; - FileOperationInfo(const std::string& _path, const TimePoint& start, - const TimePoint& finish) - : path(_path), start_timestamp(start), finish_timestamp(finish) {} + FileOperationInfo(const FileOperationType _type, const std::string& _path, + const StartTimePoint& _start_ts, + const FinishTimePoint& _finish_ts, const Status& _status) + : type(_type), + path(_path), + duration(std::chrono::duration_cast( + _finish_ts - _start_ts.second)), + start_ts(_start_ts.first), + status(_status) {} + static StartTimePoint StartNow() { + return std::make_pair( + std::chrono::system_clock::now(), std::chrono::steady_clock::now()); + } + static FinishTimePoint FinishNow() { + return std::chrono::steady_clock::now(); + } +}; + +struct BlobFileInfo { + BlobFileInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number) + : blob_file_path(_blob_file_path), blob_file_number(_blob_file_number) {} + + std::string blob_file_path; + uint64_t blob_file_number; +}; + 
+struct BlobFileAdditionInfo : public BlobFileInfo { + BlobFileAdditionInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number, + const uint64_t _total_blob_count, + const uint64_t _total_blob_bytes) + : BlobFileInfo(_blob_file_path, _blob_file_number), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes) {} + uint64_t total_blob_count; + uint64_t total_blob_bytes; +}; + +struct BlobFileGarbageInfo : public BlobFileInfo { + BlobFileGarbageInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number, + const uint64_t _garbage_blob_count, + const uint64_t _garbage_blob_bytes) + : BlobFileInfo(_blob_file_path, _blob_file_number), + garbage_blob_count(_garbage_blob_count), + garbage_blob_bytes(_garbage_blob_bytes) {} + uint64_t garbage_blob_count; + uint64_t garbage_blob_bytes; }; struct FlushJobInfo { @@ -196,6 +344,12 @@ TableProperties table_properties; FlushReason flush_reason; + + // Compression algorithm used for blob output files + CompressionType blob_compression_type; + + // Information about blob files created during flush in Integrated BlobDB. + std::vector blob_file_addition_infos; }; struct CompactionFileInfo { @@ -210,6 +364,7 @@ }; struct CompactionJobInfo { + ~CompactionJobInfo() { status.PermitUncheckedError(); } // the id of the column family where the compaction happened. uint32_t cf_id; // the name of the column family where the compaction happened. @@ -253,9 +408,19 @@ // Compression algorithm used for output files CompressionType compression; - // If non-null, this variable stores detailed information - // about this compaction. + // Statistics and other additional details on the compaction CompactionJobStats stats; + + // Compression algorithm used for blob output files. + CompressionType blob_compression_type; + + // Information about blob files created during compaction in Integrated + // BlobDB. 
+ std::vector blob_file_addition_infos; + + // Information about blob files deleted during compaction in Integrated + // BlobDB. + std::vector blob_file_garbage_infos; }; struct MemTableInfo { @@ -288,18 +453,49 @@ TableProperties table_properties; }; +// Result of auto background error recovery +struct BackgroundErrorRecoveryInfo { + // The original error that triggered the recovery + Status old_bg_error; + + // The final bg_error after all recovery attempts. Status::OK() means + // the recovery was successful and the database is fully operational. + Status new_bg_error; +}; + +struct IOErrorInfo { + IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation, + const std::string& _file_path, size_t _length, uint64_t _offset) + : io_status(_io_status), + operation(_operation), + file_path(_file_path), + length(_length), + offset(_offset) {} + + IOStatus io_status; + FileOperationType operation; + std::string file_path; + size_t length; + uint64_t offset; +}; + // EventListener class contains a set of callback functions that will // be called when specific RocksDB event happens such as flush. It can // be used as a building block for developing custom features such as // stats-collector or external compaction algorithm. // -// Note that callback functions should not run for an extended period of -// time before the function returns, otherwise RocksDB may be blocked. -// For example, it is not suggested to do DB::CompactFiles() (as it may -// run for a long while) or issue many of DB::Put() (as Put may be blocked -// in certain cases) in the same thread in the EventListener callback. -// However, doing DB::CompactFiles() and DB::Put() in another thread is -// considered safe. +// IMPORTANT +// Because compaction is needed to resolve a "writes stopped" condition, +// calling or waiting for any blocking DB write function (no_slowdown=false) +// from a compaction-related listener callback can hang RocksDB. 
For DB +// writes from a callback we recommend a WriteBatch and no_slowdown=true, +// because the WriteBatch can accumulate writes for later in case DB::Write +// returns Status::Incomplete. Similarly, calling CompactRange or similar +// could hang by waiting for a background worker that is occupied until the +// callback returns. +// +// Otherwise, callback functions should not run for an extended period of +// time before the function returns, because this will slow RocksDB. // // [Threading] All EventListener callback will be called using the // actual thread that involves in that specific event. For example, it @@ -310,8 +506,21 @@ // the current thread holding any DB mutex. This is to prevent potential // deadlock and performance issue when using EventListener callback // in a complex way. -class EventListener { +// +// [Exceptions] Exceptions MUST NOT propagate out of overridden functions into +// RocksDB, because RocksDB is not exception-safe. This could cause undefined +// behavior including data loss, unreported corruption, deadlocks, and more. +class EventListener : public Customizable { public: + static const char* Type() { return "EventListener"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& id, + std::shared_ptr* result); + const char* Name() const override { + // Since EventListeners did not have a name previously, we will assume + // an empty name. Instances should override this method. + return ""; + } // A callback function to RocksDB which will be called whenever a // registered RocksDB flushes a file. The default implementation is // no-op. @@ -459,7 +668,27 @@ // operation finishes. virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {} - // If true, the OnFileReadFinish and OnFileWriteFinish will be called. If + // A callback function for RocksDB which will be called whenever a file flush + // operation finishes. 
+ virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file sync + // operation finishes. + virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file + // rangeSync operation finishes. + virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file + // truncate operation finishes. + virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file close + // operation finishes. + virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {} + + // If true, the OnFile*Finish functions will be called. If // false, then they won't be called. virtual bool ShouldBeNotifiedOnFileIO() { return false; } @@ -472,13 +701,56 @@ Status /* bg_error */, bool* /* auto_recovery */) {} + // DEPRECATED // A callback function for RocksDB which will be called once the database // is recovered from read-only mode after an error. When this is called, it // means normal writes to the database can be issued and the user can // initiate any further recovery actions needed - virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} + virtual void OnErrorRecoveryCompleted(Status old_bg_error) { + old_bg_error.PermitUncheckedError(); + } + + // A callback function for RocksDB which will be called once the recovery + // attempt from a background retryable error is completed. The recovery + // may have been successful or not. In either case, the callback is called + // with the old and new error. If info.new_bg_error is Status::OK(), that + // means the recovery succeeded. 
+ virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) { + } + + // A callback function for RocksDB which will be called before + // a blob file is being created. It will follow by OnBlobFileCreated after + // the creation finishes. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileCreationStarted( + const BlobFileCreationBriefInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever + // a blob file is created. + // It will be called whether the file is successfully created or not. User can + // check info.status to see if it succeeded or not. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileCreated(const BlobFileCreationInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever + // a blob file is deleted. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileDeleted(const BlobFileDeletionInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever an IO error + // happens. ShouldBeNotifiedOnFileIO should be set to true to get a callback. 
+ virtual void OnIOError(const IOErrorInfo& /*info*/) {} - virtual ~EventListener() {} + ~EventListener() override {} }; #else diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,22 +5,23 @@ #pragma once -#include "rocksdb/status.h" - #include +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" + namespace ROCKSDB_NAMESPACE { // MemoryAllocator is an interface that a client can implement to supply custom // memory allocation and deallocation methods. See rocksdb/cache.h for more // information. // All methods should be thread-safe. -class MemoryAllocator { +class MemoryAllocator : public Customizable { public: - virtual ~MemoryAllocator() = default; - - // Name of the cache allocator, printed in the log - virtual const char* Name() const = 0; + static const char* Type() { return "MemoryAllocator"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); // Allocate a block of at least size. Has to be thread-safe. virtual void* Allocate(size_t size) = 0; @@ -34,9 +35,12 @@ // default implementation just returns the allocation size return allocation_size; } + + std::string GetId() const override { return GenerateIndividualId(); } }; struct JemallocAllocatorOptions { + static const char* kName() { return "JemallocAllocatorOptions"; } // Jemalloc tcache cache allocations by size class. For each size class, // it caches between 20 (for large size classes) to 200 (for small size // classes). 
To reduce tcache memory usage in case the allocator is access @@ -45,31 +49,31 @@ bool limit_tcache_size = false; // Lower bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size/4. + // When used with block cache, it is recommended to set it to block_size/4. size_t tcache_size_lower_bound = 1024; // Upper bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size. + // When used with block cache, it is recommended to set it to block_size. size_t tcache_size_upper_bound = 16 * 1024; }; -// Generate memory allocators which allocates through Jemalloc and utilize -// MADV_DONTDUMP through madvice to exclude cache items from core dump. +// Generate memory allocator which allocates through Jemalloc and utilize +// MADV_DONTDUMP through madvise to exclude cache items from core dump. // Applications can use the allocator with block cache to exclude block cache // usage from core dump. // // Implementation details: -// The JemallocNodumpAllocator creates a delicated jemalloc arena, and all -// allocations of the JemallocNodumpAllocator is through the same arena. -// The memory allocator hooks memory allocation of the arena, and call -// madvice() with MADV_DONTDUMP flag to exclude the piece of memory from -// core dump. Side benefit of using single arena would be reduce of jemalloc -// metadata for some workload. +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are through the same arena. +// The memory allocator hooks memory allocation of the arena, and calls +// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. Side benefit of using single arena would be reduction of jemalloc +// metadata for some workloads. 
// // To mitigate mutex contention for using one single arena, jemalloc tcache // (thread-local cache) is enabled to cache unused allocations for future use. -// The tcache normally incur 0.5M extra memory usage per-thread. The usage -// can be reduce by limitting allocation sizes to cache. +// The tcache normally incurs 0.5M extra memory usage per-thread. The usage +// can be reduced by limiting allocation sizes to cache. extern Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h 2025-05-19 16:14:27.000000000 +0000 @@ -35,11 +35,15 @@ #pragma once -#include #include #include + #include #include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -48,8 +52,9 @@ class LookupKey; class SliceTransform; class Logger; +struct DBOptions; -typedef void* KeyHandle; +using KeyHandle = void*; extern Slice GetLengthPrefixedSlice(const char* data); @@ -59,10 +64,10 @@ // concatenated with values. class KeyComparator { public: - typedef ROCKSDB_NAMESPACE::Slice DecodedType; + using DecodedType = ROCKSDB_NAMESPACE::Slice; virtual DecodedType decode_key(const char* key) const { - // The format of key is frozen and can be terated as a part of the API + // The format of key is frozen and can be treated as a part of the API // contract. Refer to MemTable::Add for details. 
return GetLengthPrefixedSlice(key); } @@ -120,7 +125,7 @@ return true; } - // Same as ::InsertWithHint, but allow concurrnet write + // Same as ::InsertWithHint, but allow concurrent write // // If hint points to nullptr, a new hint will be allocated on heap, otherwise // the hint will be updated to reflect the last insert location. The hint is @@ -194,6 +199,17 @@ return 0; } + // Returns a vector of unique random memtable entries of approximate + // size 'target_sample_size' (this size is not strictly enforced). + virtual void UniqueRandomSample(const uint64_t num_entries, + const uint64_t target_sample_size, + std::unordered_set* entries) { + (void)num_entries; + (void)target_sample_size; + (void)entries; + assert(false); + } + // Report an approximation of how much memory has been used other than memory // that was allocated through the allocator. Safe to call from any thread. virtual size_t ApproximateMemoryUsage() = 0; @@ -230,6 +246,8 @@ virtual void SeekForPrev(const Slice& internal_key, const char* memtable_key) = 0; + virtual void RandomSeek() {} + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. 
virtual void SeekToFirst() = 0; @@ -274,9 +292,14 @@ // This is the base class for all factories that are used by RocksDB to create // new MemTableRep objects -class MemTableRepFactory { +class MemTableRepFactory : public Customizable { public: - virtual ~MemTableRepFactory() {} + ~MemTableRepFactory() override {} + + static const char* Type() { return "MemTableRepFactory"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::unique_ptr* factory); virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, @@ -288,7 +311,7 @@ return CreateMemTableRep(key_cmp, allocator, slice_transform, logger); } - virtual const char* Name() const = 0; + const char* Name() const override = 0; // Return true if the current MemTableRep supports concurrent inserts // Default: false @@ -310,20 +333,27 @@ // seeks with consecutive keys. class SkipListFactory : public MemTableRepFactory { public: - explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} + explicit SkipListFactory(size_t lookahead = 0); + + // Methods for Configurable/Customizable class overrides + static const char* kClassName() { return "SkipListFactory"; } + static const char* kNickName() { return "skip_list"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + std::string GetId() const override; + // Methods for MemTableRepFactory class overrides using MemTableRepFactory::CreateMemTableRep; virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, Logger* logger) override; - virtual const char* Name() const override { return "SkipListFactory"; } bool IsInsertConcurrentlySupported() const override { return true; } bool CanHandleDuplicatedKey() const override { return true; } private: - const size_t lookahead_; + size_t lookahead_; }; #ifndef ROCKSDB_LITE @@ 
-336,17 +366,22 @@ // VectorRep. On initialization, the underlying array will be at least count // bytes reserved for usage. class VectorRepFactory : public MemTableRepFactory { - const size_t count_; + size_t count_; public: - explicit VectorRepFactory(size_t count = 0) : count_(count) {} + explicit VectorRepFactory(size_t count = 0); + // Methods for Configurable/Customizable class overrides + static const char* kClassName() { return "VectorRepFactory"; } + static const char* kNickName() { return "vector"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + // Methods for MemTableRepFactory class overrides using MemTableRepFactory::CreateMemTableRep; virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, Logger* logger) override; - - virtual const char* Name() const override { return "VectorRepFactory"; } }; // This class contains a fixed array of buckets, each diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -43,10 +44,16 @@ // // Refer to rocksdb-merge wiki for more details and example implementations. // -class MergeOperator { +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
+class MergeOperator : public Customizable { public: virtual ~MergeOperator() {} static const char* Type() { return "MergeOperator"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& id, + std::shared_ptr* result); // Gives the client a way to express the read -> modify -> write semantics // key: (IN) The key that's associated with this merge operation. @@ -109,7 +116,7 @@ Slice& existing_operand; }; - // This function applies a stack of merge operands in chrionological order + // This function applies a stack of merge operands in chronological order // on top of an existing value. There are two ways in which this method is // being used: // a) During Get() operation, it used to calculate the final value of a key @@ -125,7 +132,7 @@ // In the example above, Get(K) operation will call FullMerge with a base // value of 2 and operands [+1, +2]. Compaction process might decide to // collapse the beginning of the history up to the snapshot by performing - // full Merge with base value of 0 and operands [+1, +2, +7, +3]. + // full Merge with base value of 0 and operands [+1, +2, +7, +4]. virtual bool FullMergeV2(const MergeOperationInput& merge_in, MergeOperationOutput* merge_out) const; @@ -176,7 +183,7 @@ // PartialMergeMulti should combine them into a single merge operation that is // saved into *new_value, and then it should return true. *new_value should // be constructed such that a call to DB::Merge(key, *new_value) would yield - // the same result as subquential individual calls to DB::Merge(key, operand) + // the same result as sequential individual calls to DB::Merge(key, operand) // for each operand in operand_list from front() to back(). // // The string that new_value is pointing to will be empty. @@ -198,7 +205,7 @@ // TODO: the name is currently not stored persistently and thus // no checking is enforced. Client is responsible for providing // consistent MergeOperator between DB opens. 
- virtual const char* Name() const = 0; + virtual const char* Name() const override = 0; // Determines whether the PartialMerge can be called with just a single // merge operand. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/metadata.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/metadata.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/metadata.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/metadata.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,79 +5,86 @@ #pragma once -#include - +#include #include +#include #include #include +#include "rocksdb/options.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -struct ColumnFamilyMetaData; -struct LevelMetaData; -struct SstFileMetaData; -// The metadata that describes a column family. -struct ColumnFamilyMetaData { - ColumnFamilyMetaData() : size(0), file_count(0), name("") {} - ColumnFamilyMetaData(const std::string& _name, uint64_t _size, - const std::vector&& _levels) - : size(_size), name(_name), levels(_levels) {} +// Basic identifiers and metadata for a file in a DB. This only includes +// information considered relevant for taking backups, checkpoints, or other +// services relating to DB file storage. +// This is only appropriate for immutable files, such as SST files or all +// files in a backup. See also LiveFileStorageInfo. +struct FileStorageInfo { + // The name of the file within its directory (e.g. "123456.sst") + std::string relative_filename; + // The directory containing the file, without a trailing '/'. This could be + // a DB path, wal_dir, etc. + std::string directory; + + // The id of the file within a single DB. Set to 0 if the file does not have + // a number (e.g. CURRENT) + uint64_t file_number = 0; + // The type of the file as part of a DB. + FileType file_type = kTempFile; - // The size of this column family in bytes, which is equal to the sum of - // the file size of its "levels". 
- uint64_t size; - // The number of files in this column family. - size_t file_count; - // The name of the column family. - std::string name; - // The metadata of all levels in this column family. - std::vector levels; -}; + // File size in bytes. See also `trim_to_size`. + uint64_t size = 0; -// The metadata that describes a level. -struct LevelMetaData { - LevelMetaData(int _level, uint64_t _size, - const std::vector&& _files) - : level(_level), size(_size), files(_files) {} + // This feature is experimental and subject to change. + Temperature temperature = Temperature::kUnknown; - // The level which this meta data describes. - const int level; - // The size of this level in bytes, which is equal to the sum of - // the file size of its "files". - const uint64_t size; - // The metadata of all sst files in this level. - const std::vector files; + // The checksum of a SST file, the value is decided by the file content and + // the checksum algorithm used for this SST file. The checksum function is + // identified by the file_checksum_func_name. If the checksum function is + // not specified, file_checksum is "0" by default. + std::string file_checksum; + + // The name of the checksum function used to generate the file checksum + // value. If file checksum is not enabled (e.g., sst_file_checksum_func is + // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is + // "Unknown". + std::string file_checksum_func_name; }; -// The metadata that describes a SST file. -struct SstFileMetaData { - SstFileMetaData() - : size(0), - file_number(0), - smallest_seqno(0), - largest_seqno(0), - num_reads_sampled(0), - being_compacted(false), - num_entries(0), - num_deletions(0), - oldest_blob_file_number(0) {} +// Adds to FileStorageInfo the ability to capture the state of files that +// might change in a running DB. 
+struct LiveFileStorageInfo : public FileStorageInfo { + // If non-empty, this string represents the "saved" contents of the file + // for the current context. (This field is used for checkpointing CURRENT + // file.) In that case, size == replacement_contents.size() and file on disk + // should be ignored. If empty string, the file on disk should still have + // "saved" contents. (See trim_to_size.) + std::string replacement_contents; + + // If true, the file on disk is allowed to be larger than `size` but only + // the first `size` bytes should be used for the current context. If false, + // the file is corrupt if size on disk does not equal `size`. + bool trim_to_size = false; +}; + +// The metadata that describes an SST file. (Does not need to extend +// LiveFileStorageInfo because SST files are always immutable.) +struct SstFileMetaData : public FileStorageInfo { + SstFileMetaData() {} SstFileMetaData(const std::string& _file_name, uint64_t _file_number, - const std::string& _path, size_t _size, + const std::string& _directory, size_t _size, SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted, uint64_t _oldest_blob_file_number, + bool _being_compacted, Temperature _temperature, + uint64_t _oldest_blob_file_number, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, std::string& _file_checksum, std::string& _file_checksum_func_name) - : size(_size), - name(_file_name), - file_number(_file_number), - db_path(_path), - smallest_seqno(_smallest_seqno), + : smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), smallestkey(_smallestkey), largestkey(_largestkey), @@ -87,52 +94,61 @@ num_deletions(0), oldest_blob_file_number(_oldest_blob_file_number), oldest_ancester_time(_oldest_ancester_time), - file_creation_time(_file_creation_time), - file_checksum(_file_checksum), - file_checksum_func_name(_file_checksum_func_name) 
{} - - // File size in bytes. - size_t size; - // The name of the file. - std::string name; - // The id of the file. - uint64_t file_number; - // The full path where the file locates. - std::string db_path; + file_creation_time(_file_creation_time) { + if (!_file_name.empty()) { + if (_file_name[0] == '/') { + relative_filename = _file_name.substr(1); + name = _file_name; // Deprecated field + } else { + relative_filename = _file_name; + name = std::string("/") + _file_name; // Deprecated field + } + assert(relative_filename.size() + 1 == name.size()); + assert(relative_filename[0] != '/'); + assert(name[0] == '/'); + } + directory = _directory; + db_path = _directory; // Deprecated field + file_number = _file_number; + file_type = kTableFile; + size = _size; + temperature = _temperature; + file_checksum = _file_checksum; + file_checksum_func_name = _file_checksum_func_name; + } + + SequenceNumber smallest_seqno = 0; // Smallest sequence number in file. + SequenceNumber largest_seqno = 0; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + uint64_t num_reads_sampled = 0; // How many times the file is read. + bool being_compacted = + false; // true if the file is currently being compacted. - SequenceNumber smallest_seqno; // Smallest sequence number in file. - SequenceNumber largest_seqno; // Largest sequence number in file. - std::string smallestkey; // Smallest user defined key in the file. - std::string largestkey; // Largest user defined key in the file. - uint64_t num_reads_sampled; // How many times the file is read. - bool being_compacted; // true if the file is currently being compacted. + uint64_t num_entries = 0; + uint64_t num_deletions = 0; - uint64_t num_entries; - uint64_t num_deletions; - - uint64_t oldest_blob_file_number; // The id of the oldest blob file - // referenced by the file. 
+ uint64_t oldest_blob_file_number = 0; // The id of the oldest blob file + // referenced by the file. // An SST file may be generated by compactions whose input files may // in turn be generated by earlier compactions. The creation time of the - // oldest SST file that is the compaction ancester of this file. - // The timestamp is provided Env::GetCurrentTime(). - // 0 if the information is not available. - uint64_t oldest_ancester_time; - // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). + // oldest SST file that is the compaction ancestor of this file. + // The timestamp is provided SystemClock::GetCurrentTime(). // 0 if the information is not available. - uint64_t file_creation_time; - - // The checksum of a SST file, the value is decided by the file content and - // the checksum algorithm used for this SST file. The checksum function is - // identified by the file_checksum_func_name. If the checksum function is - // not specified, file_checksum is "0" by default. - std::string file_checksum; + // + // Note: for TTL blob files, it contains the start of the expiration range. + uint64_t oldest_ancester_time = 0; + // Timestamp when the SST file is created, provided by + // SystemClock::GetCurrentTime(). 0 if the information is not available. + uint64_t file_creation_time = 0; + + // DEPRECATED: The name of the file within its directory with a + // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct + // instead. + std::string name; - // The name of the checksum function used to generate the file checksum - // value. If file checksum is not enabled (e.g., sst_file_checksum_func is - // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is - // "Unknown". - std::string file_checksum_func_name; + // DEPRECATED: replaced by `directory` in base struct + std::string db_path; }; // The full set of metadata associated with each SST file. 
@@ -142,6 +158,84 @@ LiveFileMetaData() : column_family_name(), level(0) {} }; +// The MetaData that describes a Blob file +struct BlobMetaData { + BlobMetaData() + : blob_file_number(0), + blob_file_size(0), + total_blob_count(0), + total_blob_bytes(0), + garbage_blob_count(0), + garbage_blob_bytes(0) {} + + BlobMetaData(uint64_t _file_number, const std::string& _file_name, + const std::string& _file_path, uint64_t _file_size, + uint64_t _total_blob_count, uint64_t _total_blob_bytes, + uint64_t _garbage_blob_count, uint64_t _garbage_blob_bytes, + const std::string& _file_checksum, + const std::string& _file_checksum_func_name) + : blob_file_number(_file_number), + blob_file_name(_file_name), + blob_file_path(_file_path), + blob_file_size(_file_size), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes), + garbage_blob_count(_garbage_blob_count), + garbage_blob_bytes(_garbage_blob_bytes), + checksum_method(_file_checksum), + checksum_value(_file_checksum_func_name) {} + uint64_t blob_file_number; + std::string blob_file_name; + std::string blob_file_path; + uint64_t blob_file_size; + uint64_t total_blob_count; + uint64_t total_blob_bytes; + uint64_t garbage_blob_count; + uint64_t garbage_blob_bytes; + std::string checksum_method; + std::string checksum_value; +}; + +// The metadata that describes a level. +struct LevelMetaData { + LevelMetaData(int _level, uint64_t _size, + const std::vector&& _files) + : level(_level), size(_size), files(_files) {} + + // The level which this meta data describes. + const int level; + // The size of this level in bytes, which is equal to the sum of + // the file size of its "files". + const uint64_t size; + // The metadata of all sst files in this level. + const std::vector files; +}; + +// The metadata that describes a column family. 
+struct ColumnFamilyMetaData { + ColumnFamilyMetaData() : size(0), file_count(0), name("") {} + ColumnFamilyMetaData(const std::string& _name, uint64_t _size, + const std::vector&& _levels) + : size(_size), name(_name), levels(_levels) {} + + // The size of this column family in bytes, which is equal to the sum of + // the file size of its "levels". + uint64_t size; + // The number of files in this column family. + size_t file_count; + // The name of the column family. + std::string name; + // The metadata of all levels in this column family. + std::vector levels; + + // The total size of all blob files + uint64_t blob_file_size = 0; + // The number of blob files in this column family. + size_t blob_file_count = 0; + // The metadata of the blobs in this column family + std::vector blob_files; +}; + // Metadata returned as output from ExportColumnFamily() and used as input to // CreateColumnFamiliesWithImport(). struct ExportImportFilesMetaData { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/options.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/options.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include #include + #include #include #include @@ -18,9 +19,14 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/comparator.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/data_structure.h" #include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/listener.h" +#include "rocksdb/sst_partitioner.h" +#include "rocksdb/types.h" #include "rocksdb/universal_compaction.h" #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" @@ -51,36 +57,11 @@ class WalFilter; class FileSystem; -// DB contents are stored in a set of blocks, each of which holds a -// sequence of 
key,value pairs. Each block may be compressed before -// being stored in a file. The following enum describes which -// compression method (if any) is used to compress a block. -enum CompressionType : unsigned char { - // NOTE: do not change the values of existing entries, as these are - // part of the persistent format on disk. - kNoCompression = 0x0, - kSnappyCompression = 0x1, - kZlibCompression = 0x2, - kBZip2Compression = 0x3, - kLZ4Compression = 0x4, - kLZ4HCCompression = 0x5, - kXpressCompression = 0x6, - kZSTD = 0x7, - - // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than - // 0.8.0 or consider a possibility of downgrading the service or copying - // the database files to another service running with an older version of - // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will - // eventually remove the option from the public API. - kZSTDNotFinalCompression = 0x40, - - // kDisableCompressionOption is used to disable some compression options. - kDisableCompressionOption = 0xff, -}; - struct Options; struct DbPath; +using FileTypeSet = SmallEnumSet; + struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // The function recovers options to a previous version. Only 4.6 or later // versions are supported. @@ -148,9 +129,10 @@ // Allows an application to modify/delete a key-value during background // compaction. // - // If the client requires a new compaction filter to be used for different - // compaction runs, it can specify compaction_filter_factory instead of this - // option. The client should specify only one of the two. + // If the client requires a new `CompactionFilter` to be used for different + // compaction runs and/or requires a `CompactionFilter` for table file + // creations outside of compaction, it can specify compaction_filter_factory + // instead of this option. The client should specify only one of the two. 
// compaction_filter takes precedence over compaction_filter_factory if // client specifies both. // @@ -161,12 +143,21 @@ // Default: nullptr const CompactionFilter* compaction_filter = nullptr; - // This is a factory that provides compaction filter objects which allow - // an application to modify/delete a key-value during background compaction. + // This is a factory that provides `CompactionFilter` objects which allow + // an application to modify/delete a key-value during table file creation. // - // A new filter will be created on each compaction run. If multithreaded - // compaction is being used, each created CompactionFilter will only be used - // from a single thread and so does not need to be thread-safe. + // Unlike the `compaction_filter` option, which is used when compaction + // creates a table file, this factory allows using a `CompactionFilter` when a + // table file is created for various reasons. The factory can decide what + // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by + // default the decision is to use a `CompactionFilter` for + // `TableFileCreationReason::kCompaction` only. + // + // Each thread of work involving creating table files will create a new + // `CompactionFilter` when it will be used according to the above + // `TableFileCreationReason`-based decision. This allows the application to + // know about the different ongoing threads of work and makes it unnecessary + // for `CompactionFilter` to provide thread-safety. // // Default: nullptr std::shared_ptr compaction_filter_factory = nullptr; @@ -220,14 +211,18 @@ CompressionType compression; // Compression algorithm that will be used for the bottommost level that - // contain files. + // contain files. The behavior for num_levels = 1 is not well defined. + // Right now, with num_levels = 1, all compaction outputs will use + // bottommost_compression and all flush outputs still use options.compression, + // but the behavior is subject to change. 
// // Default: kDisableCompressionOption (Disabled) CompressionType bottommost_compression = kDisableCompressionOption; // different options for compression algorithms used by bottommost_compression // if it is enabled. To enable it, please see the definition of - // CompressionOptions. + // CompressionOptions. Behavior for num_levels = 1 is the same as + // options.bottommost_compression. CompressionOptions bottommost_compression_opts; // different options for compression algorithms @@ -308,6 +303,15 @@ // Default: nullptr std::shared_ptr compaction_thread_limiter = nullptr; + // If non-nullptr, use the specified factory for a function to determine the + // partitioning of sst files. This helps compaction to split the files + // on interesting boundaries (key prefixes) to make propagation of sst + // files less write amplifying (covering the whole key space). + // THE FEATURE IS STILL EXPERIMENTAL + // + // Default: nullptr + std::shared_ptr sst_partitioner_factory = nullptr; + // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -318,8 +322,24 @@ enum class WALRecoveryMode : char { // Original levelDB recovery - // We tolerate incomplete record in trailing data on all logs - // Use case : This is legacy behavior + // + // We tolerate the last record in any log to be incomplete due to a crash + // while writing it. Zeroed bytes from preallocation are also tolerated in the + // trailing data of any log. + // + // Use case: Applications for which updates, once applied, must not be rolled + // back even after a crash-recovery. In this recovery mode, RocksDB guarantees + // this as long as `WritableFile::Append()` writes are durable. 
In case the + // user needs the guarantee in more situations (e.g., when + // `WritableFile::Append()` writes to page cache, but the user desires this + // guarantee in face of power-loss crash-recovery), RocksDB offers various + // mechanisms to additionally invoke `WritableFile::Sync()` in order to + // strengthen the guarantee. + // + // This differs from `kPointInTimeRecovery` in that, in case a corruption is + // detected during recovery, this mode will refuse to open the DB. Whereas, + // `kPointInTimeRecovery` will stop recovery just before the corruption since + // that is a valid point-in-time to which to recover. kTolerateCorruptedTailRecords = 0x00, // Recover from clean shutdown // We don't expect to find any corruption in the WAL @@ -347,6 +367,86 @@ DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} }; +extern const char* kHostnameForDbHostId; + +enum class CompactionServiceJobStatus : char { + kSuccess, + kFailure, + kUseLocal, +}; + +struct CompactionServiceJobInfo { + std::string db_name; + std::string db_id; + std::string db_session_id; + uint64_t job_id; // job_id is only unique within the current DB and session, + // restart DB will reset the job_id. `db_id` and + // `db_session_id` could help you build unique id across + // different DBs and sessions. + + Env::Priority priority; + + CompactionServiceJobInfo(std::string db_name_, std::string db_id_, + std::string db_session_id_, uint64_t job_id_, + Env::Priority priority_) + : db_name(std::move(db_name_)), + db_id(std::move(db_id_)), + db_session_id(std::move(db_session_id_)), + job_id(job_id_), + priority(priority_) {} +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
+class CompactionService : public Customizable { + public: + static const char* Type() { return "CompactionService"; } + + // Returns the name of this compaction service. + const char* Name() const override = 0; + + // Start the compaction with input information, which can be passed to + // `DB::OpenAndCompact()`. + // job_id is pre-assigned, it will be reset after DB re-open. + // Warning: deprecated, please use the new interface + // `StartV2(CompactionServiceJobInfo, ...)` instead. + virtual CompactionServiceJobStatus Start( + const std::string& /*compaction_service_input*/, uint64_t /*job_id*/) { + return CompactionServiceJobStatus::kUseLocal; + } + + // Start the remote compaction with `compaction_service_input`, which can be + // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the + // information the user might want to know, which includes `job_id`. + virtual CompactionServiceJobStatus StartV2( + const CompactionServiceJobInfo& info, + const std::string& compaction_service_input) { + // Default implementation to call legacy interface, please override and + // replace the legacy implementation + return Start(compaction_service_input, info.job_id); + } + + // Wait compaction to be finish. + // Warning: deprecated, please use the new interface + // `WaitForCompleteV2(CompactionServiceJobInfo, ...)` instead. + virtual CompactionServiceJobStatus WaitForComplete( + uint64_t /*job_id*/, std::string* /*compaction_service_result*/) { + return CompactionServiceJobStatus::kUseLocal; + } + + // Wait for remote compaction to finish. 
+ virtual CompactionServiceJobStatus WaitForCompleteV2( + const CompactionServiceJobInfo& info, + std::string* compaction_service_result) { + // Default implementation to call legacy interface, please override and + // replace the legacy implementation + return WaitForComplete(info.job_id, compaction_service_result); + } + + ~CompactionService() override = default; +}; + struct DBOptions { // The function recovers options to the option as in version 4.6. DBOptions* OldDefaults(int rocksdb_major_version = 4, @@ -389,6 +489,23 @@ // Default: true bool paranoid_checks = true; + // If true, during memtable flush, RocksDB will validate total entries + // read in flush, and compare with counter inserted into it. + // The option is here to turn the feature off in case this new validation + // feature has a bug. + // Default: true + bool flush_verify_memtable_count = true; + + // If true, the log numbers and sizes of the synced WALs are tracked + // in MANIFEST, then during DB recovery, if a synced WAL is missing + // from disk, or the WAL's size does not match the recorded size in + // MANIFEST, an error will be reported and the recovery will be aborted. + // + // Note that this option does not work with secondary instance. + // + // Default: false + bool track_and_verify_wals_in_manifest = false; + // Use the specified object to interact with the environment, // e.g. to read/write files, schedule background work, etc. In the near // future, support for doing storage operations such as read/write files @@ -396,12 +513,7 @@ // Default: Env::Default() Env* env = Env::Default(); - // Use the specified object to interact with the storage to - // read/write files. This is in addition to env. This option should be used - // if the desired storage subsystem provides a FileSystem implementation. - std::shared_ptr file_system = nullptr; - - // Use to control write rate of flush and compaction. Flush has higher + // Use to control write/read rate of flush and compaction. 
Flush has higher // priority than compaction. Rate limiting is disabled if nullptr. // If rate limiter is enabled, bytes_per_sync is set to 1MB by default. // Default: nullptr @@ -456,8 +568,18 @@ // (i.e. the ones that are causing all the space amplification). If set to 0 // (default), we will dynamically choose the WAL size limit to be // [sum of all write_buffer_size * max_write_buffer_number] * 4 - // This option takes effect only when there are more than one column family as - // otherwise the wal size is dictated by the write_buffer_size. + // + // For example, with 15 column families, each with + // write_buffer_size = 128 MB + // max_write_buffer_number = 6 + // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB + // + // The RocksDB wiki has some discussion about how the WAL interacts + // with memtables and flushing of column families. + // https://github.com/facebook/rocksdb/wiki/Column-Families + // + // This option takes effect only when there are more than one column + // family as otherwise the wal size is dictated by the write_buffer_size. // // Default: 0 // @@ -541,7 +663,7 @@ // Dynamically changeable through SetDBOptions() API. int base_background_compactions = -1; - // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the + // DEPRECATED: RocksDB automatically decides this based on the // value of max_background_jobs. For backwards compatibility we will set // `max_background_jobs = max_background_compactions + max_background_flushes` // in the case where user sets at least one of `max_background_compactions` or @@ -563,9 +685,11 @@ // concurrently perform a compaction job by breaking it into multiple, // smaller ones that are run simultaneously. // Default: 1 (i.e. no subcompactions) + // + // Dynamically changeable through SetDBOptions() API. 
uint32_t max_subcompactions = 1; - // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the + // DEPRECATED: RocksDB automatically decides this based on the // value of max_background_jobs. For backwards compatibility we will set // `max_background_jobs = max_background_compactions + max_background_flushes` // in the case where user sets at least one of `max_background_compactions` or @@ -648,7 +772,9 @@ // large amounts of data (such as xfs's allocsize option). size_t manifest_preallocation_size = 4 * 1024 * 1024; - // Allow the OS to mmap file for reading sst tables. Default: false + // Allow the OS to mmap file for reading sst tables. + // Not recommended for 32-bit OS. + // Default: false bool allow_mmap_reads = false; // Allow the OS to mmap file for writing. @@ -675,7 +801,15 @@ // Not supported in ROCKSDB_LITE mode! bool use_direct_io_for_flush_and_compaction = false; - // If false, fallocate() calls are bypassed + // If false, fallocate() calls are bypassed, which disables file + // preallocation. The file space preallocation is used to increase the file + // write/append performance. By default, RocksDB preallocates space for WAL, + // SST, Manifest files, the extra space is truncated when the file is written. + // Warning: if you're using btrfs, we would recommend setting + // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra + // allocated space cannot be freed, which could be significant if you have + // lots of files. More details about this limitation: + // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md bool allow_fallocate = true; // Disable child process inherit open files. Default: true @@ -717,6 +851,23 @@ // Default: true bool advise_random_on_open = true; + // [experimental] + // Used to activate or deactive the Mempurge feature (memtable garbage + // collection). (deactivated by default). 
At every flush, the total useful + // payload (total entries minus garbage entries) is estimated as a ratio + // [useful payload bytes]/[size of a memtable (in bytes)]. This ratio is then + // compared to this `threshold` value: + // - if ratio1.0 : aggressive mempurge. + // 0 < threshold < 1.0: mempurge triggered only for very low useful payload + // ratios. + // [experimental] + double experimental_mempurge_threshold = 0.0; + // Amount of data to build up in memtables across all column // families before writing to disk. // @@ -795,7 +946,7 @@ size_t random_access_max_buffer_size = 1024 * 1024; // This is the maximum buffer size that is used by WritableFileWriter. - // On Windows, we need to maintain an aligned buffer for writes. + // With direct IO, we need to maintain an aligned buffer for writes. // We allow the buffer to grow until it's size hits the limit in buffered // IO and fix the buffer size when using direct IO to ensure alignment of // write requests if the logical sector size is unusual @@ -822,7 +973,7 @@ // Allows OS to incrementally sync files to disk while they are being // written, asynchronously, in the background. This operation can be used // to smooth out write I/Os over time. Users shouldn't rely on it for - // persistency guarantee. + // persistence guarantee. // Issue one request for every bytes_per_sync written. 0 turns it off. // // You may consider using rate_limiter to regulate write rate to device. @@ -1060,16 +1211,9 @@ // Immutable. bool allow_ingest_behind = false; - // Needed to support differential snapshots. - // If set to true then DB will only process deletes with sequence number - // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts). - // Clients are responsible to periodically call this method to advance - // the cutoff time. If this method is never called and preserve_deletes - // is set to true NO deletes will ever be processed. 
- // At the moment this only keeps normal deletes, SingleDeletes will - // not be preserved. + // Deprecated, will be removed in a future release. + // Please try using user-defined timestamp instead. // DEFAULT: false - // Immutable (TODO: make it dynamically changeable) bool preserve_deletes = false; // If enabled it uses two queues for writes, one for the ones with @@ -1124,12 +1268,94 @@ // Default: 0 size_t log_readahead_size = 0; - // If user does NOT provide SST file checksum function, the SST file checksum - // will NOT be used. The single checksum instance are shared by options and - // file writers. Make sure the algorithm is thread safe. + // If user does NOT provide the checksum generator factory, the file checksum + // will NOT be used. A new file checksum generator object will be created + // when a SST file is created. Therefore, each created FileChecksumGenerator + // will only be used from a single thread and so does not need to be + // thread-safe. // // Default: nullptr - std::shared_ptr sst_file_checksum_func = nullptr; + std::shared_ptr file_checksum_gen_factory = nullptr; + + // By default, RocksDB recovery fails if any table file referenced in + // MANIFEST are missing after scanning the MANIFEST. + // Best-efforts recovery is another recovery mode that + // tries to restore the database to the most recent point in time without + // missing file. + // Currently not compatible with atomic flush. Furthermore, WAL files will + // not be used for recovery if best_efforts_recovery is true. + // Default: false + bool best_efforts_recovery = false; + + // It defines how many times db resume is called by a separate thread when + // background retryable IO Error happens. When background retryable IO + // Error happens, SetBGError is called to deal with the error. If the error + // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), + // then db resume is called in background to recover from the error. 
If this + // value is 0 or negative, db resume will not be called. + // + // Default: INT_MAX + int max_bgerror_resume_count = INT_MAX; + + // If max_bgerror_resume_count is >= 2, db resume is called multiple times. + // This option decides how long to wait to retry the next resume if the + // previous resume fails and satisfy redo resume conditions. + // + // Default: 1000000 (microseconds). + uint64_t bgerror_resume_retry_interval = 1000000; + + // It allows user to opt-in to get error messages containing corrupted + // keys/values. Corrupt keys, values will be logged in the + // messages/logs/status that will help users with the useful information + // regarding affected data. By default value is set false to prevent users + // data to be exposed in the logs/messages etc. + // + // Default: false + bool allow_data_in_errors = false; + + // A string identifying the machine hosting the DB. This + // will be written as a property in every SST file written by the DB (or + // by offline writers such as SstFileWriter and RepairDB). It can be useful + // for troubleshooting in memory corruption caused by a failing host when + // writing a file, by tracing back to the writing host. These corruptions + // may not be caught by the checksum since they happen before checksumming. + // If left as default, the table writer will substitute it with the actual + // hostname when writing the SST file. If set to an empty string, the + // property will not be written to the SST file. + // + // Default: hostname + std::string db_host_id = kHostnameForDbHostId; + + // Use this if your DB want to enable checksum handoff for specific file + // types writes. Make sure that the File_system you use support the + // crc32c checksum verification + // Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile. + // NOTE: currently RocksDB only generates crc32c based checksum for the + // handoff. 
If the storage layer has different checksum support, user + // should enble this set as empty. Otherwise,it may cause unexpected + // write failures. + FileTypeSet checksum_handoff_file_types; + + // EXPERIMENTAL + // CompactionService is a feature allows the user to run compactions on a + // different host or process, which offloads the background load from the + // primary host. + // It's an experimental feature, the interface will be changed without + // backward/forward compatibility support for now. Some known issues are still + // under development. + std::shared_ptr compaction_service = nullptr; + + // It indicates, which lowest cache tier we want to + // use for a certain DB. Currently we support volatile_tier and + // non_volatile_tier. They are layered. By setting it to kVolatileTier, only + // the block cache (current implemented volatile_tier) is used. So + // cache entries will not spill to secondary cache (current + // implemented non_volatile_tier), and block cache lookup misses will not + // lookup in the secondary cache. When kNonVolatileBlockTier is used, we use + // both block cache and secondary cache. + // + // Default: kNonVolatileBlockTier + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1141,7 +1367,11 @@ const ColumnFamilyOptions& column_family_options) : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} - // The function recovers options to the option as in version 4.6. + // Change to some default settings from an older version. + // NOT MAINTAINED: This function has not been and is not maintained. + // DEPRECATED: This function might be removed in a future release. + // In general, defaults are changed to suit broad interests. Opting + // out of a change on upgrade should be deliberate and considered. 
Options* OldDefaults(int rocksdb_major_version = 4, int rocksdb_minor_version = 6); @@ -1164,6 +1394,12 @@ // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. Options* OptimizeForSmallDb(); + + // Disable some checks that should not be necessary in the absence of + // software logic errors or CPU+memory hardware errors. This can improve + // write speeds but is only recommended for temporary use. Does not + // change protection against corrupt storage (e.g. verify_checksums). + Options* DisableExtraChecks(); }; // @@ -1204,19 +1440,28 @@ // Default: nullptr const Slice* iterate_lower_bound; - // "iterate_upper_bound" defines the extent upto which the forward iterator + // "iterate_upper_bound" defines the extent up to which the forward iterator // can returns entries. Once the bound is reached, Valid() will be false. // "iterate_upper_bound" is exclusive ie the bound value is - // not a valid entry. If prefix_extractor is not null, the Seek target - // and iterate_upper_bound need to have the same prefix. - // This is because ordering is not guaranteed outside of prefix domain. + // not a valid entry. If prefix_extractor is not null: + // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used + // to infer whether prefix iterating (e.g. applying prefix bloom filter) + // can be used within RocksDB. This is done by comparing + // iterate_upper_bound with the seek key. + // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes + // effect if it shares the same prefix as the seek key. If + // iterate_upper_bound is outside the prefix of the seek key, then keys + // returned outside the prefix range will be undefined, just as if + // iterate_upper_bound = null. + // If iterate_upper_bound is not null, SeekToLast() will position the iterator + // at the first key smaller than iterate_upper_bound. 
// // Default: nullptr const Slice* iterate_upper_bound; // RocksDB does auto-readahead for iterators on noticing more than two reads // for a table file. The readahead starts at 8KB and doubles on every - // additional read upto 256KB. + // additional read up to 256KB. // This option can help if most of the range scans are large, and if it is // determined that a larger readahead than that enabled by auto-readahead is // needed. @@ -1242,11 +1487,12 @@ // Default: true bool verify_checksums; - // Should the "data block"/"index block"" read for this iteration be placed in + // Should the "data block"/"index block" read for this iteration be placed in // block cache? // Callers may wish to set this field to false for bulk scans. // This would help not to the change eviction order of existing items in the - // block cache. Default: true + // block cache. + // Default: true bool fill_cache; // Specify to create a tailing iterator -- a special iterator that has a @@ -1267,13 +1513,15 @@ // If true when calling Get(), we also skip prefix bloom when reading from // block based table. It provides a way to read existing data after // changing implementation of prefix extractor. + // Default: false bool total_order_seek; // When true, by default use total_order_seek = true, and RocksDB can // selectively enable prefix seek mode if won't generate a different result // from total_order_seek, based on seek key, and iterator upper bound. - // Not suppported in ROCKSDB_LITE mode, in the way that even with value true + // Not supported in ROCKSDB_LITE mode, in the way that even with value true // prefix mode is not used. + // Default: false bool auto_prefix_mode; // Enforce that the iterator only iterates over the same prefix as the seek. @@ -1298,9 +1546,11 @@ // Default: false bool background_purge_on_iterator_cleanup; - // If true, keys deleted using the DeleteRange() API will be visible to - // readers until they are naturally deleted during compaction. 
This improves - // read performance in DBs with many range deletions. + // If true, range tombstones handling will be skipped in key lookup paths. + // For DB instances that don't use DeleteRange() calls, this setting can + // be used to optimize the read performance. + // Note that, if this assumption (of no previous DeleteRange() calls) is + // broken, stale keys could be served in read paths. // Default: false bool ignore_range_deletions; @@ -1312,10 +1562,8 @@ // Default: empty (every table will be scanned) std::function table_filter; - // Needed to support differential snapshots. Has 2 effects: - // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum - // 2) if this param > 0 iterator will return INTERNAL keys instead of - // user keys; e.g. return tombstones as well. + // Deprecated, will be removed in a future release. + // Please try using user-defined timestamp instead. // Default: 0 (don't filter by seqnum, return user keys) SequenceNumber iter_start_seqnum; @@ -1323,9 +1571,52 @@ // specified timestamp. All timestamps of the same database must be of the // same length and format. The user is responsible for providing a customized // compare function via Comparator to order tuples. + // For iterator, iter_start_ts is the lower bound (older) and timestamp + // serves as the upper bound. Versions of the same record that fall in + // the timestamp range will be returned. If iter_start_ts is nullptr, + // only the most recent version visible to timestamp is returned. // The user-specified timestamp feature is still under active development, // and the API is subject to change. + // Default: nullptr const Slice* timestamp; + const Slice* iter_start_ts; + + // Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + // in microseconds. + // It should be set to microseconds since epoch, i.e, gettimeofday or + // equivalent plus allowed duration in microseconds. The best way is to use + // env->NowMicros() + some timeout. 
+ // This is best efforts. The call may exceed the deadline if there is IO + // involved and the file system doesn't support deadlines, or due to + // checking for deadline periodically rather than for every key if + // processing a batch + std::chrono::microseconds deadline; + + // A timeout in microseconds to be passed to the underlying FileSystem for + // reads. As opposed to deadline, this determines the timeout for each + // individual file read request. If a MultiGet/Get/Seek/Next etc call + // results in multiple reads, each read can last up to io_timeout us. + std::chrono::microseconds io_timeout; + + // It limits the maximum cumulative value size of the keys in batch while + // reading through MultiGet. Once the cumulative value size exceeds this + // soft limit then all the remaining keys are returned with status Aborted. + // + // Default: std::numeric_limits::max() + uint64_t value_size_soft_limit; + + // For iterators, RocksDB does auto-readahead on noticing more than two + // sequential reads for a table file if user doesn't provide readahead_size. + // The readahead starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size only when reads are sequential. However at each + // level, if iterator moves over next file, readahead_size starts again from + // 8KB. + // + // By enabling this option, RocksDB will do some enhancements for + // prefetching the data. + // + // Default: false + bool adaptive_readahead; ReadOptions(); ReadOptions(bool cksum, bool cache); @@ -1371,7 +1662,7 @@ bool no_slowdown; // If true, this write request is of lower priority if compaction is - // behind. In this case, no_slowdown = true, the request will be cancelled + // behind. In this case, no_slowdown = true, the request will be canceled // immediately with Status::Incomplete() returned. Otherwise, it will be // slowed down. The slowdown value is determined by RocksDB to guarantee // it introduces minimum impacts to high priority writes. 
@@ -1486,6 +1777,15 @@ bool allow_write_stall = false; // If > 0, it will replace the option in the DBOptions for this compaction. uint32_t max_subcompactions = 0; + // Set user-defined timestamp low bound, the data with older timestamp than + // low bound maybe GCed by compaction. Default: nullptr + Slice* full_history_ts_low = nullptr; + + // Allows cancellation of an in-progress manual compaction. + // + // Cancellation can be delayed waiting on automatic compactions when used + // together with `exclusive_manual_compaction == true`. + std::atomic* canceled = nullptr; }; // IngestExternalFileOptions is used by IngestExternalFile() @@ -1505,7 +1805,7 @@ bool allow_blocking_flush = true; // Set to true if you would like duplicate keys in the file being ingested // to be skipped rather than overwriting existing data under that key. - // Usecase: back-fill of some historical data in the database without + // Use case: back-fill of some historical data in the database without // over-writing existing newer version of data. // This option could only be used if the DB has been running // with allow_ingest_behind=true since the dawn of time. @@ -1535,6 +1835,26 @@ // Using a large readahead size (> 2MB) can typically improve the performance // of forward iteration on spinning disks. size_t verify_checksums_readahead_size = 0; + // Set to TRUE if user wants to verify the sst file checksum of ingested + // files. The DB checksum function will generate the checksum of each + // ingested file (if file_checksum_gen_factory is set) and compare the + // checksum function name and checksum with the ingested checksum information. + // + // If this option is set to True: 1) if DB does not enable checksum + // (file_checksum_gen_factory == nullptr), the ingested checksum information + // will be ignored; 2) If DB enable the checksum function, we calculate the + // sst file checksum after the file is moved or copied and compare the + // checksum and checksum name. 
If checksum or checksum function name does + // not match, ingestion will be failed. If the verification is successful, + // checksum and checksum function name will be stored in Manifest. + // If this option is set to FALSE, 1) if DB does not enable checksum, + // the ingested checksum information will be ignored; 2) if DB enable the + // checksum, we only verify the ingested checksum function name and we + // trust the ingested checksum. If the checksum function name matches, we + // store the checksum in Manifest. DB does not calculate the checksum during + // ingestion. However, if no checksum information is provided with the + // ingested files, DB will generate the checksum and store in the Manifest. + bool verify_file_checksum = true; }; enum TraceFilterType : uint64_t { @@ -1543,7 +1863,13 @@ // Do not trace the get operations kTraceFilterGet = 0x1 << 0, // Do not trace the write operations - kTraceFilterWrite = 0x1 << 1 + kTraceFilterWrite = 0x1 << 1, + // Do not trace the `Iterator::Seek()` operations + kTraceFilterIteratorSeek = 0x1 << 2, + // Do not trace the `Iterator::SeekForPrev()` operations + kTraceFilterIteratorSeekForPrev = 0x1 << 3, + // Do not trace the `MultiGet()` operations + kTraceFilterMultiGet = 0x1 << 4, }; // TraceOptions is used for StartTrace @@ -1556,6 +1882,13 @@ uint64_t sampling_frequency = 1; // Note: The filtering happens before sampling. uint64_t filter = kTraceFilterNone; + // When true, the order of write records in the trace will match the order of + // the corresponding write records in the WAL and applied to the DB. There may + // be a performance penalty associated with preserving this ordering. + // + // Default: false. This means write records in the trace may be in an order + // different from the WAL's order. 
+ bool preserve_write_order = false; }; // ImportColumnFamilyOptions is used by ImportColumnFamily() @@ -1584,4 +1917,35 @@ double files_size_error_margin = -1.0; }; +struct CompactionServiceOptionsOverride { + // Currently pointer configurations are not passed to compaction service + // compaction so the user needs to set it. It will be removed once pointer + // configuration passing is supported. + Env* env = Env::Default(); + std::shared_ptr file_checksum_gen_factory = nullptr; + + const Comparator* comparator = BytewiseComparator(); + std::shared_ptr merge_operator = nullptr; + const CompactionFilter* compaction_filter = nullptr; + std::shared_ptr compaction_filter_factory = nullptr; + std::shared_ptr prefix_extractor = nullptr; + std::shared_ptr table_factory; + std::shared_ptr sst_partitioner_factory = nullptr; + + // statistics is used to collect DB operation metrics, the metrics won't be + // returned to CompactionService primary host, to collect that, the user needs + // to set it here. + std::shared_ptr statistics = nullptr; +}; + +#ifndef ROCKSDB_LITE +struct LiveFilesStorageInfoOptions { + // Whether to populate FileStorageInfo::file_checksum* or leave blank + bool include_checksum_info = false; + // Flushes memtables if total size in bytes of live WAL files is >= this + // number. Default: always force a flush without checking sizes. 
+ uint64_t wal_size_for_flush = 0; +}; +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,10 +30,10 @@ // total number of user key returned (only include keys that are found, does // not include keys that are deleted or merged without a final put - uint64_t user_key_return_count; + uint64_t user_key_return_count = 0; // total nanos spent on reading data from SST files - uint64_t get_from_table_nanos; + uint64_t get_from_table_nanos = 0; uint64_t block_cache_hit_count = 0; // total number of block cache hits uint64_t block_cache_miss_count = 0; // total number of block cache misses @@ -57,7 +57,7 @@ // enable per level perf context and allocate storage for PerfContextByLevel void EnablePerLevelPerfContext(); - // temporarily disable per level perf contxt by setting the flag to false + // temporarily disable per level perf context by setting the flag to false void DisablePerLevelPerfContext(); // free the space for PerfContextByLevel, also disable per level perf context @@ -74,6 +74,9 @@ uint64_t filter_block_read_count; // total number of filter block reads uint64_t compression_dict_block_read_count; // total number of compression // dictionary block reads + + uint64_t secondary_cache_hit_count; // total number of secondary cache hits + uint64_t block_checksum_time; // total nanos spent on block checksum uint64_t block_decompress_time; // total nanos spent on block decompression @@ -221,12 +224,24 @@ uint64_t iter_prev_cpu_nanos; uint64_t iter_seek_cpu_nanos; + // Time spent in encrypting data. Populated when EncryptedEnv is used. 
+ uint64_t encrypt_data_nanos; + // Time spent in decrypting data. Populated when EncryptedEnv is used. + uint64_t decrypt_data_nanos; + std::map* level_to_perf_context = nullptr; bool per_level_perf_context_enabled = false; }; -// Get Thread-local PerfContext object pointer -// if defined(NPERF_CONTEXT), then the pointer is not thread-local +// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global, +// non-thread-local PerfContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local PerfContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. PerfContext* get_perf_context(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,14 +24,14 @@ // cache interface is specifically designed for persistent read cache. class PersistentCache { public: - typedef std::vector> StatsType; + using StatsType = std::vector>; virtual ~PersistentCache() {} // Insert to page cache // // page_key Identifier to identify a page uniquely across restarts - // data Page data + // data Page data to copy (caller retains ownership) // size Size of the page virtual Status Insert(const Slice& key, const char* data, const size_t size) = 0; @@ -56,6 +56,12 @@ virtual StatsType Stats() = 0; virtual std::string GetPrintableOptions() const = 0; + + // Return a new numeric id. 
May be used by multiple clients who are + // sharding the same persistent cache to partition the key space. Typically + // the client will allocate a new id at startup and prepend the id to its + // cache keys. + virtual uint64_t NewId() = 0; }; // Factor method to create a new persistent cache diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,17 @@ #pragma once +#include "rocksdb/customizable.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" +#include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { -class RateLimiter { +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class RateLimiter : public Customizable { public: enum class OpType { // Limitation: we currently only invoke Request() with OpType::kRead for @@ -28,11 +33,20 @@ kAllIo, }; + static const char* Type() { return "RateLimiter"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + // For API compatibility, default to rate-limiting writes only. - explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {} + explicit RateLimiter(Mode mode = Mode::kWritesOnly); virtual ~RateLimiter() {} + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. + virtual const char* Name() const override { return ""; } + // This API allows user to dynamically change rate limiter's bytes per second. 
// REQUIRED: bytes_per_second > 0 virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0; @@ -45,13 +59,15 @@ // Request for token for bytes. If this request can not be satisfied, the call // is blocked. Caller is responsible to make sure // bytes <= GetSingleBurstBytes() + // and bytes >= 0. virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) { assert(false); } // Request for token for bytes and potentially update statistics. If this // request can not be satisfied, the call is blocked. Caller is responsible to - // make sure bytes <= GetSingleBurstBytes(). + // make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. virtual void Request(const int64_t bytes, const Env::IOPriority pri, Statistics* /* stats */) { // For API compatibility, default implementation calls the older API in @@ -62,7 +78,8 @@ // Requests token to read or write bytes and potentially updates statistics. // // If this request can not be satisfied, the call is blocked. Caller is - // responsible to make sure bytes <= GetSingleBurstBytes(). + // responsible to make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. virtual void Request(const int64_t bytes, const Env::IOPriority pri, Statistics* stats, OpType op_type) { if (IsRateLimited(op_type)) { @@ -89,6 +106,20 @@ virtual int64_t GetTotalRequests( const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + // Total # of requests that are pending for bytes in rate limiter + // For convenience, this function is supported by the RateLimiter returned + // by NewGenericRateLimiter but is not required by RocksDB. 
+ // + // REQUIRED: total_pending_request != nullptr + virtual Status GetTotalPendingRequests( + int64_t* total_pending_requests, + const Env::IOPriority pri = Env::IO_TOTAL) const { + assert(total_pending_requests != nullptr); + (void)total_pending_requests; + (void)pri; + return Status::NotSupported(); + } + virtual int64_t GetBytesPerSecond() const = 0; virtual bool IsRateLimited(OpType op_type) { @@ -105,7 +136,7 @@ Mode GetMode() { return mode_; } private: - const Mode mode_; + Mode mode_; }; // Create a RateLimiter object, which can be shared among RocksDB instances to diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,12 @@ #pragma once +// For testing purposes +#if ROCKSDB_NAMESPACE == 42 +#undef ROCKSDB_NAMESPACE +#endif + +// Normal logic #ifndef ROCKSDB_NAMESPACE #define ROCKSDB_NAMESPACE rocksdb #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,85 @@ +// Copyright (c) 2021, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include + +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/customizable.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A handle for lookup result. The handle may not be immediately ready or +// have a valid value. The caller must call isReady() to determine if its +// ready, and call Wait() in order to block until it becomes ready. +// The caller must call value() after it becomes ready to determine if the +// handle successfullly read the item. +class SecondaryCacheResultHandle { + public: + virtual ~SecondaryCacheResultHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; + +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SecondaryCache : public Customizable { + public: + virtual ~SecondaryCache() {} + + static const char* Type() { return "SecondaryCache"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. 
The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const override = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,26 +14,41 @@ #pragma once +#include #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { class Slice; +struct ConfigOptions; -/* - * A SliceTransform is a generic pluggable way of transforming one string - * to another. Its primary use-case is in configuring rocksdb - * to store prefix blooms by setting prefix_extractor in - * ColumnFamilyOptions. - */ -class SliceTransform { +// A SliceTransform is a generic pluggable way of transforming one string +// to another. Its primary use-case is in configuring rocksdb +// to store prefix blooms by setting prefix_extractor in +// ColumnFamilyOptions. 
+// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SliceTransform : public Customizable { public: virtual ~SliceTransform(){}; // Return the name of this transformation. - virtual const char* Name() const = 0; + virtual const char* Name() const override = 0; + static const char* Type() { return "SliceTransform"; } + + // Creates and configures a new SliceTransform from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Returns a string representation of this SliceTransform, representing the ID + // and any additional properties + std::string AsString() const; // Extract a prefix from a specified key. This method is called when // a key is inserted into the db, and the returned slice is used to @@ -54,7 +69,7 @@ // prefix size of 4. // // Wiki documentation here: - // https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes + // https://github.com/facebook/rocksdb/wiki/Prefix-Seek // virtual bool InDomain(const Slice& key) const = 0; @@ -62,7 +77,7 @@ virtual bool InRange(const Slice& /*dst*/) const { return false; } // Some SliceTransform will have a full length which can be used to - // determine if two keys are consecuitive. Can be disabled by always + // determine if two keys are consecutive. Can be disabled by always // returning 0 virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } @@ -94,10 +109,15 @@ } }; +// The prefix is the first `prefix_len` bytes of the key, and keys shorter +// then `prefix_len` are not InDomain. extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); +// The prefix is the first min(length(key),`cap_len`) bytes of the key, and +// all keys are InDomain. 
extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); +// Prefix is equal to key. All keys are InDomain. extern const SliceTransform* NewNoopTransform(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,7 +11,7 @@ class SSTDumpTool { public: - int Run(int argc, char** argv, Options options = Options()); + int Run(int argc, char const* const* argv, Options options = Options()); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,7 @@ #include #include "rocksdb/file_system.h" +#include "rocksdb/statistics.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { @@ -18,17 +19,17 @@ class Env; class Logger; -// SstFileManager is used to track SST files in the DB and control their -// deletion rate. -// All SstFileManager public functions are thread-safe. -// SstFileManager is not extensible. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. +// SstFileManager is NOT an extensible interface but a public interface for +// result of NewSstFileManager. Any derived classes must be RocksDB internal. 
class SstFileManager { public: virtual ~SstFileManager() {} // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature; maximum allowed // space will be infinite (Default value). @@ -42,14 +43,14 @@ // other background functions may continue, such as logging and flushing. virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. virtual bool IsMaxAllowedSpaceReached() = 0; - // Returns true if the total size of SST files as well as estimated size - // of ongoing compactions exceeds the maximums allowed space usage. + // Returns true if the total size of SST and blob files as well as estimated + // size of ongoing compactions exceeds the maximums allowed space usage. virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0; // Return the total size of all tracked files. @@ -80,10 +81,13 @@ // Return the total size of trash files // thread-safe virtual uint64_t GetTotalTrashSize() = 0; + + // Set the statistics ptr to dump the stat information + virtual void SetStatisticsPtr(const std::shared_ptr& stats) = 0; }; // Create a new SstFileManager that can be shared among multiple RocksDB -// instances to track SST file and control there deletion rate. +// instances to track SST and blob files and control there deletion rate. // Even though SstFileManager don't track WAL files but it still control // there deletion rate. 
// diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -34,6 +34,8 @@ largest_key(""), smallest_range_del_key(""), largest_range_del_key(""), + file_checksum(""), + file_checksum_func_name(""), sequence_number(0), file_size(0), num_entries(0), @@ -50,6 +52,8 @@ largest_key(_largest_key), smallest_range_del_key(""), largest_range_del_key(""), + file_checksum(""), + file_checksum_func_name(""), sequence_number(_sequence_number), file_size(_file_size), num_entries(_num_entries), @@ -62,6 +66,8 @@ std::string smallest_range_del_key; // smallest range deletion user key in file std::string largest_range_del_key; // largest range deletion user key in file + std::string file_checksum; // sst file checksum; + std::string file_checksum_func_name; // The name of file checksum function SequenceNumber sequence_number; // sequence number of all keys in file uint64_t file_size; // file size in bytes uint64_t num_entries; // number of entries in file @@ -80,6 +86,9 @@ // hint that this file pages is not needed every time we write 1MB to the // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be // passed. + // The `skip_filters` option is DEPRECATED and could be removed in the + // future. Use `BlockBasedTableOptions::filter_policy` to control filter + // generation. SstFileWriter(const EnvOptions& env_options, const Options& options, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, @@ -103,21 +112,40 @@ // Add a Put key with value to currently opened file (deprecated) // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. 
ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); // Add a Put key with value to currently opened file // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. Status Put(const Slice& user_key, const Slice& value); + // Add a Put (key with timestamp, value) to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value); + // Add a Merge key with value to currently opened file // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. Status Merge(const Slice& user_key, const Slice& value); // Add a deletion key to currently opened file // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. Status Delete(const Slice& user_key); + // Add a deletion key with timestamp to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Delete(const Slice& user_key, const Slice& timestamp); + // Add a range deletion tombstone to currently opened file + // REQUIRES: comparator is *not* timestamp-aware. Status DeleteRange(const Slice& begin_key, const Slice& end_key); // Finalize writing to sst file and close file. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +enum PartitionerResult : char { + // Partitioner does not require to create new file + kNotRequired = 0x0, + // Partitioner is requesting forcefully to create new file + kRequired = 0x1 + // Additional constants can be added +}; + +struct PartitionerRequest { + PartitionerRequest(const Slice& prev_user_key_, + const Slice& current_user_key_, + uint64_t current_output_file_size_) + : prev_user_key(&prev_user_key_), + current_user_key(¤t_user_key_), + current_output_file_size(current_output_file_size_) {} + const Slice* prev_user_key; + const Slice* current_user_key; + uint64_t current_output_file_size; +}; + +/* + * A SstPartitioner is a generic pluggable way of defining the partition + * of SST files. Compaction job will split the SST files on partition boundary + * to lower the write amplification during SST file promote to higher level. + */ +class SstPartitioner { + public: + virtual ~SstPartitioner() {} + + // Return the name of this partitioner. + virtual const char* Name() const = 0; + + // It is called for all keys in compaction. When partitioner want to create + // new SST file it needs to return true. 
It means compaction job will finish + // current SST file where last key is "prev_user_key" parameter and start new + // SST file where first key is "current_user_key". Returns decision if + // partition boundary was detected and compaction should create new file. + virtual PartitionerResult ShouldPartition( + const PartitionerRequest& request) = 0; + + // Called with smallest and largest keys in SST file when compaction try to do + // trivial move. Returns true is partitioner allows to do trivial move. + virtual bool CanDoTrivialMove(const Slice& smallest_user_key, + const Slice& largest_user_key) = 0; + + // Context information of a compaction run + struct Context { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; + // Output level for this compaction + int output_level; + // Smallest key for compaction + Slice smallest_user_key; + // Largest key for compaction + Slice largest_user_key; + }; +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SstPartitionerFactory : public Customizable { + public: + ~SstPartitionerFactory() override {} + static const char* Type() { return "SstPartitionerFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + + virtual std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& context) const = 0; + + // Returns a name that identifies this partitioner factory. + const char* Name() const override = 0; +}; + +/* + * Fixed key prefix partitioner. It splits the output SST files when prefix + * defined by size changes. 
+ */ +class SstPartitionerFixedPrefix : public SstPartitioner { + public: + explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {} + + virtual ~SstPartitionerFixedPrefix() override {} + + const char* Name() const override { return "SstPartitionerFixedPrefix"; } + + PartitionerResult ShouldPartition(const PartitionerRequest& request) override; + + bool CanDoTrivialMove(const Slice& smallest_user_key, + const Slice& largest_user_key) override; + + private: + size_t len_; +}; + +/* + * Factory for fixed prefix partitioner. + */ +class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory { + public: + explicit SstPartitionerFixedPrefixFactory(size_t len); + + ~SstPartitionerFixedPrefixFactory() override {} + + static const char* kClassName() { return "SstPartitionerFixedPrefixFactory"; } + const char* Name() const override { return kClassName(); } + + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override; + + private: + size_t len_; +}; + +extern std::shared_ptr +NewSstPartitionerFixedPrefixFactory(size_t prefix_len); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/statistics.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/statistics.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/statistics.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/statistics.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,17 +13,19 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { /** - * Keep adding ticker's here. - * 1. Any ticker should be added before TICKER_ENUM_MAX. + * Keep adding tickers here. + * 1. Any ticker should be added immediately before TICKER_ENUM_MAX, taking + * over its old value. * 2. Add a readable string in TickersNameMap below for the newly added ticker. * 3. 
Add a corresponding enum value to TickerType.java in the java API * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType - * and toCppTickers + * and toCppTickers */ enum Tickers : uint32_t { // total block cache misses @@ -117,7 +119,7 @@ COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. // Deletions obsoleted before bottom level due to file gap optimization. COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, - // If a compaction was cancelled in sfm to prevent ENOSPC + // If a compaction was canceled in sfm to prevent ENOSPC COMPACTION_CANCELLED, // Number of keys written to the database via the Put and Write call's @@ -183,7 +185,7 @@ // over large number of keys with same userkey. NUMBER_OF_RESEEKS_IN_ITERATION, - // Record the number of calls to GetUpadtesSince. Useful to keep track of + // Record the number of calls to GetUpdatesSince. Useful to keep track of // transaction log iterator refreshes GET_UPDATES_SINCE_CALLS, BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache @@ -205,6 +207,14 @@ COMPACT_WRITE_BYTES, // Bytes written during compaction FLUSH_WRITE_BYTES, // Bytes written during flush + // Compaction read and write statistics broken down by CompactionReason + COMPACT_READ_BYTES_MARKED, + COMPACT_READ_BYTES_PERIODIC, + COMPACT_READ_BYTES_TTL, + COMPACT_WRITE_BYTES_MARKED, + COMPACT_WRITE_BYTES_PERIODIC, + COMPACT_WRITE_BYTES_TTL, + // Number of table's properties loaded directly from file, without creating // table reader object. NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, @@ -239,35 +249,42 @@ NUMBER_ITER_SKIP, // BlobDB specific stats - // # of Put/PutTTL/PutUntil to BlobDB. + // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_PUT, - // # of Write to BlobDB. + // # of Write to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_WRITE, - // # of Get to BlobDB. + // # of Get to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_GET, - // # of MultiGet to BlobDB. 
+ // # of MultiGet to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_MULTIGET, - // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. + // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. Only + // applicable to legacy BlobDB. BLOB_DB_NUM_SEEK, - // # of Next to BlobDB iterator. + // # of Next to BlobDB iterator. Only applicable to legacy BlobDB. BLOB_DB_NUM_NEXT, - // # of Prev to BlobDB iterator. + // # of Prev to BlobDB iterator. Only applicable to legacy BlobDB. BLOB_DB_NUM_PREV, - // # of keys written to BlobDB. + // # of keys written to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_KEYS_WRITTEN, - // # of keys read from BlobDB. + // # of keys read from BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_KEYS_READ, - // # of bytes (key + value) written to BlobDB. + // # of bytes (key + value) written to BlobDB. Only applicable to legacy + // BlobDB. BLOB_DB_BYTES_WRITTEN, - // # of bytes (keys + value) read from BlobDB. + // # of bytes (keys + value) read from BlobDB. Only applicable to legacy + // BlobDB. BLOB_DB_BYTES_READ, - // # of keys written by BlobDB as non-TTL inlined value. + // # of keys written by BlobDB as non-TTL inlined value. Only applicable to + // legacy BlobDB. BLOB_DB_WRITE_INLINED, - // # of keys written by BlobDB as TTL inlined value. + // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy + // BlobDB. BLOB_DB_WRITE_INLINED_TTL, - // # of keys written by BlobDB as non-TTL blob value. + // # of keys written by BlobDB as non-TTL blob value. Only applicable to + // legacy BlobDB. BLOB_DB_WRITE_BLOB, - // # of keys written by BlobDB as TTL blob value. + // # of keys written by BlobDB as TTL blob value. Only applicable to legacy + // BlobDB. BLOB_DB_WRITE_BLOB_TTL, // # of bytes written to blob file. BLOB_DB_BLOB_FILE_BYTES_WRITTEN, @@ -276,22 +293,24 @@ // # of times a blob files being synced. 
BLOB_DB_BLOB_FILE_SYNCED, // # of blob index evicted from base DB by BlobDB compaction filter because - // of expiration. + // of expiration. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, // size of blob index evicted from base DB by BlobDB compaction filter - // because of expiration. + // because of expiration. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, // # of blob index evicted from base DB by BlobDB compaction filter because - // of corresponding file deleted. + // of corresponding file deleted. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EVICTED_COUNT, // size of blob index evicted from base DB by BlobDB compaction filter - // because of corresponding file deleted. + // because of corresponding file deleted. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EVICTED_SIZE, - // # of blob files that were obsoleted by garbage collection. + // # of blob files that were obsoleted by garbage collection. Only applicable + // to legacy BlobDB. BLOB_DB_GC_NUM_FILES, - // # of blob files generated by garbage collection. + // # of blob files generated by garbage collection. Only applicable to legacy + // BlobDB. BLOB_DB_GC_NUM_NEW_FILES, - // # of BlobDB garbage collection failures. + // # of BlobDB garbage collection failures. Only applicable to legacy BlobDB. BLOB_DB_GC_FAILURES, // # of keys dropped by BlobDB garbage collection because they had been // overwritten. DEPRECATED. @@ -309,11 +328,14 @@ BLOB_DB_GC_BYTES_EXPIRED, // # of bytes relocated to new blob file by garbage collection. BLOB_DB_GC_BYTES_RELOCATED, - // # of blob files evicted because of BlobDB is full. + // # of blob files evicted because of BlobDB is full. Only applicable to + // legacy BlobDB. BLOB_DB_FIFO_NUM_FILES_EVICTED, - // # of keys in the blob files evicted because of BlobDB is full. + // # of keys in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. 
BLOB_DB_FIFO_NUM_KEYS_EVICTED, - // # of bytes in the blob files evicted because of BlobDB is full. + // # of bytes in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. BLOB_DB_FIFO_BYTES_EVICTED, // These counters indicate a performance issue in WritePrepared transactions. @@ -342,6 +364,67 @@ BLOCK_CACHE_COMPRESSION_DICT_ADD, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, + + // # of blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD + BLOCK_CACHE_ADD_REDUNDANT, + // # of index blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD + BLOCK_CACHE_INDEX_ADD_REDUNDANT, + // # of filter blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD + BLOCK_CACHE_FILTER_ADD_REDUNDANT, + // # of data blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD + BLOCK_CACHE_DATA_ADD_REDUNDANT, + // # of dict blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT + // <= BLOCK_CACHE_COMPRESSION_DICT_ADD + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + + // # of files marked as trash by sst file manager and will be deleted + // later by background thread. + FILES_MARKED_TRASH, + // # of files deleted immediately by sst file manger through delete scheduler. 
+ FILES_DELETED_IMMEDIATELY, + + // The counters for error handler, not that, bg_io_error is the subset of + // bg_error and bg_retryable_io_error is the subset of bg_io_error + ERROR_HANDLER_BG_ERROR_COUNT, + ERROR_HANDLER_BG_IO_ERROR_COUNT, + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + ERROR_HANDLER_AUTORESUME_COUNT, + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + + // Statistics for memtable garbage collection: + // Raw bytes of data (payload) present on memtable at flush time. + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + // Outdated bytes of data present on memtable at flush time. + MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + + // Secondary cache statistics + SECONDARY_CACHE_HITS, + + // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. + VERIFY_CHECKSUM_READ_BYTES, + + // Bytes read/written while creating backups + BACKUP_READ_BYTES, + BACKUP_WRITE_BYTES, + + // Remote compaction read/write statistics + REMOTE_COMPACT_READ_BYTES, + REMOTE_COMPACT_WRITE_BYTES, + + // Tiered storage related statistics + HOT_FILE_READ_BYTES, + WARM_FILE_READ_BYTES, + COLD_FILE_READ_BYTES, + HOT_FILE_READ_COUNT, + WARM_FILE_READ_COUNT, + COLD_FILE_READ_COUNT, + TICKER_ENUM_MAX }; @@ -400,21 +483,23 @@ READ_NUM_MERGE_OPERANDS, // BlobDB specific stats - // Size of keys written to BlobDB. + // Size of keys written to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_KEY_SIZE, - // Size of values written to BlobDB. + // Size of values written to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_VALUE_SIZE, - // BlobDB Put/PutWithTTL/PutUntil/Write latency. + // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy + // BlobDB. BLOB_DB_WRITE_MICROS, - // BlobDB Get lagency. + // BlobDB Get latency. Only applicable to legacy BlobDB. BLOB_DB_GET_MICROS, - // BlobDB MultiGet latency. + // BlobDB MultiGet latency. Only applicable to legacy BlobDB. 
BLOB_DB_MULTIGET_MICROS, - // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. + // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. Only applicable to + // legacy BlobDB. BLOB_DB_SEEK_MICROS, - // BlobDB Next latency. + // BlobDB Next latency. Only applicable to legacy BlobDB. BLOB_DB_NEXT_MICROS, - // BlobDB Prev latency. + // BlobDB Prev latency. Only applicable to legacy BlobDB. BLOB_DB_PREV_MICROS, // Blob file write latency. BLOB_DB_BLOB_FILE_WRITE_MICROS, @@ -432,6 +517,17 @@ FLUSH_TIME, SST_BATCH_SIZE, + // MultiGet stats logged per level + // Num of index and filter blocks read from file system per level. + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + // Num of data blocks read from file system per level. + NUM_DATA_BLOCKS_READ_PER_LEVEL, + // Num of sst files read from file system per level. + NUM_SST_READ_PER_LEVEL, + + // Error handler statistics + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + HISTOGRAM_ENUM_MAX, }; @@ -456,6 +552,10 @@ // Usage: // options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); enum StatsLevel : uint8_t { + // Disable all metrics + kDisableAll, + // Disable tickers + kExceptTickers = kDisableAll, // Disable timer stats, and skip histogram stats kExceptHistogramOrTimers, // Skip timer stats @@ -481,10 +581,21 @@ // options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); // HistogramData hist; // options.statistics->histogramData(FLUSH_TIME, &hist); -class Statistics { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
+class Statistics : public Customizable { public: - virtual ~Statistics() {} + ~Statistics() override {} static const char* Type() { return "Statistics"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::shared_ptr* result); + // Default name of empty, for backwards compatibility. Derived classes should + // override this method. + // This default implementation will likely be removed in a future release + const char* Name() const override { return ""; } virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; @@ -516,7 +627,10 @@ // Resets all ticker and histogram stats virtual Status Reset() { return Status::NotSupported("Not implemented"); } - // String representation of the statistic object. +#ifndef ROCKSDB_LITE + using Customizable::ToString; +#endif // ROCKSDB_LITE + // String representation of the statistic object. Must be thread-safe. virtual std::string ToString() const { // Do nothing by default return std::string("ToString(): not implemented"); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h 2025-05-19 16:14:27.000000000 +0000 @@ -53,6 +53,7 @@ // REQUIRES: Valid() virtual uint64_t GetStatsTime() const = 0; + // DEPRECATED (was never used) virtual int GetFormatVersion() const { return -1; } // Return the current stats history as an std::map which specifies the diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/status.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/status.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/status.h 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/status.h 2025-05-19 16:14:27.000000000 +0000 @@ -16,7 +16,17 @@ #pragma once +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED +#include +#include +#endif + #include + +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED +#include "port/stack_trace.h" +#endif + #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -25,7 +35,16 @@ public: // Create a success status. Status() : code_(kOk), subcode_(kNone), sev_(kNoError), state_(nullptr) {} - ~Status() { delete[] state_; } + ~Status() { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!checked_) { + fprintf(stderr, "Failed to check Status %p\n", this); + port::PrintStack(); + abort(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + delete[] state_; + } // Copy the specified status. Status(const Status& s); @@ -43,6 +62,17 @@ bool operator==(const Status& rhs) const; bool operator!=(const Status& rhs) const; + // In case of intentionally swallowing an error, user must explicitly call + // this function. That way we are easily able to search the code to find where + // error swallowing occurs. 
+ inline void PermitUncheckedError() const { MarkChecked(); } + + inline void MustCheck() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } + enum Code : unsigned char { kOk = 0, kNotFound = 1, @@ -63,7 +93,10 @@ kMaxCode }; - Code code() const { return code_; } + Code code() const { + MarkChecked(); + return code_; + } enum SubCode : unsigned char { kNone = 0, @@ -78,10 +111,16 @@ kPathNotFound = 9, KMergeOperandsInsufficientCapacity = 10, kManualCompactionPaused = 11, + kOverwritten = 12, + kTxnNotPrepared = 13, + kIOFenced = 14, kMaxSubCode }; - SubCode subcode() const { return subcode_; } + SubCode subcode() const { + MarkChecked(); + return subcode_; + } enum Severity : unsigned char { kNoError = 0, @@ -93,21 +132,43 @@ }; Status(const Status& s, Severity sev); - Severity severity() const { return sev_; } + + Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) + : Status(_code, _subcode, msg, "", _sev) {} + + Severity severity() const { + MarkChecked(); + return sev_; + } // Returns a C style string indicating the message of the Status - const char* getState() const { return state_; } + const char* getState() const { + MarkChecked(); + return state_; + } // Return a success status. static Status OK() { return Status(); } + // Successful, though an existing something was overwritten + // Note: using variants of OK status for program logic is discouraged, + // but it can be useful for communicating statistical information without + // changing public APIs. + static Status OkOverwritten() { return Status(kOk, kOverwritten); } + // Return error status of an appropriate type. 
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kNotFound, msg, msg2); } + // Fast path for not found without malloc; static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); } + static Status NotFound(SubCode sc, const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kNotFound, sc, msg, msg2); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kCorruption, msg, msg2); } @@ -217,60 +278,126 @@ return Status(kIOError, kPathNotFound, msg, msg2); } + static Status TxnNotPrepared() { + return Status(kInvalidArgument, kTxnNotPrepared); + } + static Status TxnNotPrepared(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2); + } + // Returns true iff the status indicates success. - bool ok() const { return code() == kOk; } + bool ok() const { + MarkChecked(); + return code() == kOk; + } + + // Returns true iff the status indicates success *with* something + // overwritten + bool IsOkOverwritten() const { + MarkChecked(); + return code() == kOk && subcode() == kOverwritten; + } // Returns true iff the status indicates a NotFound error. - bool IsNotFound() const { return code() == kNotFound; } + bool IsNotFound() const { + MarkChecked(); + return code() == kNotFound; + } // Returns true iff the status indicates a Corruption error. - bool IsCorruption() const { return code() == kCorruption; } + bool IsCorruption() const { + MarkChecked(); + return code() == kCorruption; + } // Returns true iff the status indicates a NotSupported error. - bool IsNotSupported() const { return code() == kNotSupported; } + bool IsNotSupported() const { + MarkChecked(); + return code() == kNotSupported; + } // Returns true iff the status indicates an InvalidArgument error. 
- bool IsInvalidArgument() const { return code() == kInvalidArgument; } + bool IsInvalidArgument() const { + MarkChecked(); + return code() == kInvalidArgument; + } // Returns true iff the status indicates an IOError. - bool IsIOError() const { return code() == kIOError; } + bool IsIOError() const { + MarkChecked(); + return code() == kIOError; + } // Returns true iff the status indicates an MergeInProgress. - bool IsMergeInProgress() const { return code() == kMergeInProgress; } + bool IsMergeInProgress() const { + MarkChecked(); + return code() == kMergeInProgress; + } // Returns true iff the status indicates Incomplete - bool IsIncomplete() const { return code() == kIncomplete; } + bool IsIncomplete() const { + MarkChecked(); + return code() == kIncomplete; + } // Returns true iff the status indicates Shutdown In progress - bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } + bool IsShutdownInProgress() const { + MarkChecked(); + return code() == kShutdownInProgress; + } - bool IsTimedOut() const { return code() == kTimedOut; } + bool IsTimedOut() const { + MarkChecked(); + return code() == kTimedOut; + } - bool IsAborted() const { return code() == kAborted; } + bool IsAborted() const { + MarkChecked(); + return code() == kAborted; + } bool IsLockLimit() const { + MarkChecked(); return code() == kAborted && subcode() == kLockLimit; } // Returns true iff the status indicates that a resource is Busy and // temporarily could not be acquired. - bool IsBusy() const { return code() == kBusy; } + bool IsBusy() const { + MarkChecked(); + return code() == kBusy; + } - bool IsDeadlock() const { return code() == kBusy && subcode() == kDeadlock; } + bool IsDeadlock() const { + MarkChecked(); + return code() == kBusy && subcode() == kDeadlock; + } // Returns true iff the status indicated that the operation has Expired. 
- bool IsExpired() const { return code() == kExpired; } + bool IsExpired() const { + MarkChecked(); + return code() == kExpired; + } // Returns true iff the status indicates a TryAgain error. // This usually means that the operation failed, but may succeed if // re-attempted. - bool IsTryAgain() const { return code() == kTryAgain; } + bool IsTryAgain() const { + MarkChecked(); + return code() == kTryAgain; + } // Returns true iff the status indicates the proposed compaction is too large - bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + bool IsCompactionTooLarge() const { + MarkChecked(); + return code() == kCompactionTooLarge; + } // Returns true iff the status indicates Column Family Dropped - bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; } + bool IsColumnFamilyDropped() const { + MarkChecked(); + return code() == kColumnFamilyDropped; + } // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" @@ -278,6 +405,7 @@ // with a specific subcode, enabling users to take the appropriate action // if needed bool IsNoSpace() const { + MarkChecked(); return (code() == kIOError) && (subcode() == kNoSpace); } @@ -285,6 +413,7 @@ // cases where we limit the memory used in certain operations (eg. the size // of a write batch) in order to avoid out of memory exceptions. bool IsMemoryLimit() const { + MarkChecked(); return (code() == kAborted) && (subcode() == kMemoryLimit); } @@ -293,52 +422,76 @@ // directory" error condition. A PathNotFound error is an I/O error with // a specific subcode, enabling users to take appropriate action if necessary bool IsPathNotFound() const { - return (code() == kIOError) && (subcode() == kPathNotFound); + MarkChecked(); + return (code() == kIOError || code() == kNotFound) && + (subcode() == kPathNotFound); } // Returns true iff the status indicates manual compaction paused. 
This // is caused by a call to PauseManualCompaction bool IsManualCompactionPaused() const { + MarkChecked(); return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); } + // Returns true iff the status indicates a TxnNotPrepared error. + bool IsTxnNotPrepared() const { + MarkChecked(); + return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared); + } + + // Returns true iff the status indicates a IOFenced error. + bool IsIOFenced() const { + MarkChecked(); + return (code() == kIOError) && (subcode() == kIOFenced); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; protected: - // A nullptr state_ (which is always the case for OK) means the message - // is empty. - // of the following form: - // state_[0..3] == length of message - // state_[4..] == message Code code_; SubCode subcode_; Severity sev_; + // A nullptr state_ (which is at least the case for OK) means the extra + // message is empty. const char* state_; +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + mutable bool checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED explicit Status(Code _code, SubCode _subcode = kNone) : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} - Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2, + Severity sev = kNoError); Status(Code _code, const Slice& msg, const Slice& msg2) : Status(_code, kNone, msg, msg2) {} static const char* CopyState(const char* s); + + inline void MarkChecked() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } }; inline Status::Status(const Status& s) : code_(s.code_), subcode_(s.subcode_), sev_(s.sev_) { + s.MarkChecked(); state_ = (s.state_ == nullptr) ? 
nullptr : CopyState(s.state_); } inline Status::Status(const Status& s, Severity sev) : code_(s.code_), subcode_(s.subcode_), sev_(sev) { + s.MarkChecked(); state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline Status& Status::operator=(const Status& s) { - // The following condition catches both aliasing (when this == &s), - // and the common case where both s and *this are ok. if (this != &s) { + s.MarkChecked(); + MustCheck(); code_ = s.code_; subcode_ = s.subcode_; sev_ = s.sev_; @@ -353,6 +506,7 @@ noexcept #endif : Status() { + s.MarkChecked(); *this = std::move(s); } @@ -362,6 +516,8 @@ #endif { if (this != &s) { + s.MarkChecked(); + MustCheck(); code_ = std::move(s.code_); s.code_ = kOk; subcode_ = std::move(s.subcode_); @@ -376,10 +532,14 @@ } inline bool Status::operator==(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); return (code_ == rhs.code_); } inline bool Status::operator!=(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); return !(*this == rhs); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,116 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include + +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef GetCurrentTime +#endif + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; + +// A SystemClock is an interface used by the rocksdb implementation to access +// operating system time-related functionality. +class SystemClock : public Customizable { + public: + virtual ~SystemClock() {} + + static const char* Type() { return "SystemClock"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + // The name of this system clock + virtual const char* Name() const override = 0; + + // The name/nickname for the Default SystemClock. This name can be used + // to determine if the clock is the default one. + static const char* kDefaultName() { return "DefaultClock"; } + + // Return a default SystemClock suitable for the current operating + // system. + static const std::shared_ptr& Default(); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // Returns the number of micro-seconds of CPU time used by the current thread. + // 0 indicates not supported. + virtual uint64_t CPUMicros() { return 0; } + + // Returns the number of nano-seconds of CPU time used by the current thread. + // Default implementation simply relies on CPUMicros. 
+ // 0 indicates not supported. + virtual uint64_t CPUNanos() { return CPUMicros() * 1000; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; +}; + +// Wrapper class for a SystemClock. Redirects all methods (except Name) +// of the SystemClock interface to the target/wrapped class. +class SystemClockWrapper : public SystemClock { + public: + explicit SystemClockWrapper(const std::shared_ptr& t); + + uint64_t NowMicros() override { return target_->NowMicros(); } + + uint64_t NowNanos() override { return target_->NowNanos(); } + + uint64_t CPUMicros() override { return target_->CPUMicros(); } + + uint64_t CPUNanos() override { return target_->CPUNanos(); } + + virtual void SleepForMicroseconds(int micros) override { + return target_->SleepForMicroseconds(micros); + } + + Status GetCurrentTime(int64_t* unix_time) override { + return target_->GetCurrentTime(unix_time); + } + + std::string TimeToString(uint64_t time) override { + return target_->TimeToString(time); + } + + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + const Customizable* Inner() const override { return target_.get(); } + + protected: + std::shared_ptr target_; +}; + +} // end namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table.h 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,35 +22,91 @@ #include #include -#include "rocksdb/cache.h" +#include "rocksdb/customizable.h" #include "rocksdb/env.h" -#include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { // -- Block-based Table +class Cache; +class FilterPolicy; class FlushBlockPolicyFactory; class PersistentCache; class RandomAccessFile; struct TableReaderOptions; struct TableBuilderOptions; class TableBuilder; +class TableFactory; class TableReader; class WritableFileWriter; +struct ConfigOptions; struct EnvOptions; -struct Options; +// Types of checksums to use for checking integrity of logical blocks within +// files. All checksums currently use 32 bits of checking power (1 in 4B +// chance of failing to detect random corruption). enum ChecksumType : char { kNoChecksum = 0x0, kCRC32c = 0x1, kxxHash = 0x2, kxxHash64 = 0x3, + kXXH3 = 0x4, // Supported since RocksDB 6.27 +}; + +// `PinningTier` is used to specify which tier of block-based tables should +// be affected by a block cache pinning setting (see +// `MetadataCacheOptions` below). +enum class PinningTier { + // For compatibility, this value specifies to fallback to the behavior + // indicated by the deprecated options, + // `pin_l0_filter_and_index_blocks_in_cache` and + // `pin_top_level_index_and_filter`. + kFallback, + + // This tier contains no block-based tables. + kNone, + + // This tier contains block-based tables that may have originated from a + // memtable flush. In particular, it includes tables from L0 that are smaller + // than 1.5 times the current `write_buffer_size`. Note these criteria imply + // it can include intra-L0 compaction outputs and ingested files, as long as + // they are not abnormally large compared to flushed files in L0. + kFlushedAndSimilar, + + // This tier contains all block-based tables. 
+ kAll, +}; + +// `MetadataCacheOptions` contains members indicating the desired caching +// behavior for the different categories of metadata blocks. +struct MetadataCacheOptions { + // The tier of block-based tables whose top-level index into metadata + // partitions will be pinned. Currently indexes and filters may be + // partitioned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise any top-level index into metadata partitions would be + // held in table reader memory, outside the block cache. + PinningTier top_level_index_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose metadata partitions will be pinned. + // Currently indexes and filters may be partitioned. + PinningTier partition_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose unpartitioned metadata blocks will be + // pinned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise the unpartitioned meta-blocks would be held in table + // reader memory, outside the block cache. + PinningTier unpartitioned_pinning = PinningTier::kFallback; }; // For advanced user only struct BlockBasedTableOptions { + static const char* kName() { return "BlockTableOptions"; }; // @flush_block_policy_factory creates the instances of flush block policy. // which provides a configurable way to determine when to flush a block in // the block based tables. If not set, table builder will use the default @@ -65,9 +121,10 @@ // caching as they should now apply to range tombstone and compression // dictionary meta-blocks, in addition to index and filter meta-blocks. // - // Indicating if we'd put index/filter blocks to the block cache. - // If not specified, each "table reader" object will pre-load index/filter - // block during table initialization. + // Whether to put index/filter blocks in the block cache. 
When false, + // each "table reader" object will pre-load index/filter blocks during + // table initialization. Index and filter partition blocks always use + // block cache regardless of this option. bool cache_index_and_filter_blocks = false; // If cache_index_and_filter_blocks is enabled, cache index and filter @@ -76,12 +133,44 @@ // than data blocks. bool cache_index_and_filter_blocks_with_high_priority = true; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating each of the following variables that + // has the default value, `PinningTier::kFallback`: + // + // - `MetadataCacheOptions::partition_pinning` + // - `MetadataCacheOptions::unpartitioned_pinning` + // + // The updated value is chosen as follows: + // + // - `pin_l0_filter_and_index_blocks_in_cache == false` -> + // `PinningTier::kNone` + // - `pin_l0_filter_and_index_blocks_in_cache == true` -> + // `PinningTier::kFlushedAndSimilar` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is // held in the "table reader" object so the blocks are pinned and only // evicted from cache when the table reader is freed. bool pin_l0_filter_and_index_blocks_in_cache = false; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating + // `MetadataCacheOptions::top_level_index_pinning` when it has the + // default value, `PinningTier::kFallback`. + // + // The updated value is chosen as follows: + // + // - `pin_top_level_index_and_filter == false` -> + // `PinningTier::kNone` + // - `pin_top_level_index_and_filter == true` -> + // `PinningTier::kAll` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. 
+ // // If cache_index_and_filter_blocks is true and the below is true, then // the top-level index of partitioned filter and index blocks are stored in // the cache, but a reference is held in the "table reader" object so the @@ -89,6 +178,12 @@ // freed. This is not limited to l0 in LSM tree. bool pin_top_level_index_and_filter = true; + // The desired block cache pinning behavior for the different categories of + // metadata blocks. While pinning can reduce block cache contention, users + // must take care not to pin excessive amounts of data, which risks + // overflowing block cache. + MetadataCacheOptions metadata_cache_options; + // The index type that will be used for this table. enum IndexType : char { // A space efficient index block that is optimized for @@ -100,6 +195,8 @@ kHashSearch = 0x01, // A two-level index implementation. Both levels are binary search indexes. + // Second level index blocks ("partitions") use block cache even when + // cache_index_and_filter_blocks=false. kTwoLevelIndexSearch = 0x02, // Like kBinarySearch, but index also contains first key of each block. @@ -113,11 +210,6 @@ // e.g. when prefix changes. // Makes the index significantly bigger (2x or more), especially when keys // are long. - // - // IO errors are not handled correctly in this mode right now: if an error - // happens when lazily reading a block in value(), value() returns empty - // slice, and you need to call Valid()/status() afterwards. - // TODO(kolmike): Fix it. kBinarySearchWithFirstKey = 0x03, }; @@ -167,7 +259,7 @@ // block size specified here corresponds to uncompressed data. The // actual size of the unit read from disk may be smaller if // compression is enabled. This parameter can be changed dynamically. - size_t block_size = 4 * 1024; + uint64_t block_size = 4 * 1024; // This is used to close a block before it reaches the configured // 'block_size'. 
If the percentage of free space in the current block is less @@ -196,13 +288,68 @@ // separately uint64_t metadata_block_size = 4096; + // If true, a dynamically updating charge to block cache, loosely based + // on the actual memory usage of table building, will occur to account + // the memory, if block cache available. + // + // Charged memory usage includes: + // 1. (new) Bloom Filter and Ribbon Filter construction + // 2. More to come... + // + // Note: + // 1. (new) Bloom Filter and Ribbon Filter construction + // + // If additional temporary memory of Ribbon Filter uses up too much memory + // relative to the avaible space left in the block cache + // at some point (i.e, causing a cache full when strict_capacity_limit = + // true), construction will fall back to (new) Bloom Filter. + // + // Default: false + bool reserve_table_builder_memory = false; + // Note: currently this option requires kTwoLevelIndexSearch to be set as // well. // TODO(myabandeh): remove the note above once the limitation is lifted // Use partitioned full filters for each SST file. This option is - // incompatible with block-based filters. + // incompatible with block-based filters. Filter partition blocks use + // block cache even when cache_index_and_filter_blocks=false. bool partition_filters = false; + // Option to generate Bloom/Ribbon filters that minimize memory + // internal fragmentation. + // + // When false, malloc_usable_size is not available, or format_version < 5, + // filters are generated without regard to internal fragmentation when + // loaded into memory (historical behavior). When true (and + // malloc_usable_size is available and format_version >= 5), then + // filters are generated to "round up" and "round down" their sizes to + // minimize internal fragmentation when loaded into memory, assuming the + // reading DB has the same memory allocation characteristics as the + // generating DB. This option does not break forward or backward + // compatibility. 
+ // + // While individual filters will vary in bits/key and false positive rate + // when setting is true, the implementation attempts to maintain a weighted + // average FP rate for filters consistent with this option set to false. + // + // With Jemalloc for example, this setting is expected to save about 10% of + // the memory footprint and block cache charge of filters, while increasing + // disk usage of filters by about 1-2% due to encoding efficiency losses + // with variance in bits/key. + // + // NOTE: Because some memory counted by block cache might be unmapped pages + // within internal fragmentation, this option can increase observed RSS + // memory usage. With cache_index_and_filter_blocks=true, this option makes + // the block cache better at using space it is allowed. (These issues + // should not arise with partitioned filters.) + // + // NOTE: Do not set to true if you do not trust malloc_usable_size. With + // this option, RocksDB might access an allocated memory object beyond its + // original size if malloc_usable_size says it is safe to do so. While this + // can be considered bad practice, it should not produce undefined behavior + // unless malloc_usable_size is buggy or broken. + bool optimize_filters_for_memory = false; + // Use delta encoding to compress keys in blocks. // ReadOptions::pin_data requires this option to be disabled. // @@ -246,10 +393,9 @@ // Default: 0 (disabled) uint32_t read_amp_bytes_per_bit = 0; - // We currently have five versions: - // 0 -- This version is currently written out by all RocksDB's versions by - // default. Can be read by really old RocksDB's. Doesn't support changing - // checksum (default is CRC32). + // We currently have these versions: + // 0 -- This version can be read by really old RocksDB's. Doesn't support + // changing checksum type (default is CRC32). // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default // checksum, like xxHash. 
It is written by RocksDB when // BlockBasedTableOptions::checksum is something other than kCRC32c. (version @@ -272,7 +418,7 @@ // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned // filters use a generally faster and more accurate Bloom filter // implementation, with a different schema. - uint32_t format_version = 2; + uint32_t format_version = 5; // Store index blocks on disk in compressed format. Changing this option to // false will avoid the overhead of decompression if index blocks are evicted @@ -316,6 +462,55 @@ IndexShorteningMode index_shortening = IndexShorteningMode::kShortenSeparators; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead + // starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size and max_auto_readahead_size can be configured. + // + // Special Value: 0 - If max_auto_readahead_size is set 0 then no implicit + // auto prefetching will be done. If max_auto_readahead_size provided is less + // than 8KB (which is initial readahead size used by rocksdb in case of + // auto-readahead), readahead size will remain same as + // max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch + // the blocks. + // + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{max_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 256 KB (256 * 1024). 
+ size_t max_auto_readahead_size = 256 * 1024; + + // If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and + // filter blocks) which are already in memory into block cache at the time of + // flush. On a flush, the block that is in memory (in memtables) get flushed + // to the device. If using Direct IO, additional IO is incurred to read this + // data back into memory again, which is avoided by enabling this option. This + // further helps if the workload exhibits high temporal locality, where most + // of the reads go to recently written data. This also helps in case of + // Distributed FileSystem. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{prepopulate_block_cache=kFlushOnly;}"}})); + enum class PrepopulateBlockCache : char { + // Disable prepopulate block cache. + kDisable, + // Prepopulate blocks during flush only. + kFlushOnly, + }; + + PrepopulateBlockCache prepopulate_block_cache = + PrepopulateBlockCache::kDisable; }; // Table Properties that are specific to block-based table properties. @@ -361,6 +556,7 @@ const uint32_t kPlainTableVariableLength = 0; struct PlainTableOptions { + static const char* kName() { return "PlainTableOptions"; }; // @user_key_len: plain table has optimization for fix-sized keys, which can // be specified via user_key_len. Alternatively, you can pass // `kPlainTableVariableLength` if your keys have variable @@ -408,7 +604,7 @@ // @store_index_in_file: compute plain table index and bloom filter during // file building and store it in file. When reading - // file, index will be mmaped instead of recomputation. + // file, index will be mapped instead of recomputation. bool store_index_in_file = false; }; @@ -454,6 +650,8 @@ }; struct CuckooTableOptions { + static const char* kName() { return "CuckooTableOptions"; }; + // Determines the utilization of hash tables. Smaller values // result in larger hash tables with fewer collisions. 
double hash_table_ratio = 0.9; @@ -491,18 +689,21 @@ class RandomAccessFileReader; // A base class for table factories. -class TableFactory { +class TableFactory : public Customizable { public: - virtual ~TableFactory() {} + virtual ~TableFactory() override {} - // The type of the table. - // - // The client of this package should switch to a new name whenever - // the table format implementation changes. - // - // Names starting with "rocksdb." are reserved and should not be used - // by any clients of this package. - virtual const char* Name() const = 0; + static const char* kBlockCacheOpts() { return "BlockCache"; }; + static const char* kBlockBasedTableName() { return "BlockBasedTable"; }; + static const char* kPlainTableName() { return "PlainTable"; } + static const char* kCuckooTableName() { return "CuckooTable"; }; + + // Creates and configures a new TableFactory from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* factory); + + static const char* Type() { return "TableFactory"; } // Returns a Table object table that can fetch data from file specified // in parameter file. 
It's the caller's responsibility to make sure @@ -525,7 +726,19 @@ const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, - bool prefetch_index_and_filter_in_cache = true) const = 0; + bool prefetch_index_and_filter_in_cache = true) const { + ReadOptions ro; + return NewTableReader(ro, table_reader_options, std::move(file), file_size, + table_reader, prefetch_index_and_filter_in_cache); + } + + // Overload of the above function that allows the caller to pass in a + // ReadOptions + virtual Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache) const = 0; // Return a table builder to write to a file for this table type. // @@ -547,40 +760,7 @@ // to use in this table. virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const = 0; - - // Sanitizes the specified DB Options and ColumnFamilyOptions. - // - // If the function cannot find a way to sanitize the input DB Options, - // a non-ok Status will be returned. - virtual Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const = 0; - - // Return a string that contains printable format of table configurations. - // RocksDB prints configurations at DB Open(). - virtual std::string GetPrintableTableOptions() const = 0; - - virtual Status GetOptionString(std::string* /*opt_string*/, - const std::string& /*delimiter*/) const { - return Status::NotSupported( - "The table factory doesn't implement GetOptionString()."); - } - - // Returns the raw pointer of the table options that is used by this - // TableFactory, or nullptr if this function is not supported. 
- // Since the return value is a raw pointer, the TableFactory owns the - // pointer and the caller should not delete the pointer. - // - // In certain case, it is desirable to alter the underlying options when the - // TableFactory is not used by any open DB by casting the returned pointer - // to the right class. For instance, if BlockBasedTableFactory is used, - // then the pointer can be casted to BlockBasedTableOptions. - // - // Note that changing the underlying TableFactory options while the - // TableFactory is currently used by any open DB is undefined behavior. - // Developers should use DB::SetOption() instead to dynamically change - // options while the DB is open. - virtual void* GetOptions() { return nullptr; } + WritableFileWriter* file) const = 0; // Return is delete range supported virtual bool IsDeleteRangeSupported() const { return false; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,12 @@ #pragma once #include + #include +#include #include + +#include "rocksdb/customizable.h" #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -26,10 +30,14 @@ // ++pos) { // ... // } -typedef std::map UserCollectedProperties; +using UserCollectedProperties = std::map; // table properties' human-readable names in the property block. 
struct TablePropertiesNames { + static const std::string kDbId; + static const std::string kDbSessionId; + static const std::string kDbHostId; + static const std::string kOriginalFileNumber; static const std::string kDataSize; static const std::string kIndexSize; static const std::string kIndexPartitions; @@ -41,6 +49,7 @@ static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kNumFilterEntries; static const std::string kDeletedKeys; static const std::string kMergeOperands; static const std::string kNumRangeDeletions; @@ -58,18 +67,23 @@ static const std::string kCreationTime; static const std::string kOldestKeyTime; static const std::string kFileCreationTime; + static const std::string kSlowCompressionEstimatedDataSize; + static const std::string kFastCompressionEstimatedDataSize; }; -extern const std::string kPropertiesBlock; -extern const std::string kCompressionDictBlock; -extern const std::string kRangeDelBlock; - // `TablePropertiesCollector` provides the mechanism for users to collect // their own properties that they are interested in. This class is essentially // a collection of callback functions that will be invoked during table // building. It is constructed with TablePropertiesCollectorFactory. The methods // don't need to be thread-safe, as we will create exactly one -// TablePropertiesCollector object per table and then call it sequentially +// TablePropertiesCollector object per table and then call it sequentially. +// +// Statuses from these callbacks are currently logged when not OK, but +// otherwise ignored by RocksDB. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
class TablePropertiesCollector { public: virtual ~TablePropertiesCollector() {} @@ -96,9 +110,9 @@ } // Called after each new block is cut - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) { // Nothing to do here. Callback registers can override. return; } @@ -122,26 +136,47 @@ // Constructs TablePropertiesCollector. Internals create a new // TablePropertiesCollector for each new table -class TablePropertiesCollectorFactory { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class TablePropertiesCollectorFactory : public Customizable { public: struct Context { uint32_t column_family_id; + // The level at creating the SST file (i.e, table), of which the + // properties are being collected. + int level_at_creation = kUnknownLevelAtCreation; static const uint32_t kUnknownColumnFamily; + static const int kUnknownLevelAtCreation = -1; }; - virtual ~TablePropertiesCollectorFactory() {} + ~TablePropertiesCollectorFactory() override {} + static const char* Type() { return "TablePropertiesCollectorFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + // has to be thread-safe virtual TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) = 0; // The name of the properties collector can be used for debugging purpose. 
- virtual const char* Name() const = 0; + const char* Name() const override = 0; + + // Can be overridden by sub-classes to return the Name, followed by + // configuration info that will // be logged to the info log when the + // DB is opened + virtual std::string ToString() const { return Name(); } }; // TableProperties contains a bunch of read-only properties of its associated // table. struct TableProperties { public: + // the file number at creation time, or 0 for unknown. When known, + // combining with db_session_id must uniquely identify an SST file. + uint64_t orig_file_number = 0; // the total size of all data blocks. uint64_t data_size = 0; // the size of index block. @@ -165,6 +200,8 @@ uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of unique entries (keys or prefixes) added to filters + uint64_t num_filter_entries = 0; // the number of deletions in the table uint64_t num_deletions = 0; // the number of merge operands in the table @@ -187,6 +224,35 @@ uint64_t oldest_key_time = 0; // Actual SST file creation time. 0 means unknown. uint64_t file_creation_time = 0; + // Estimated size of data blocks if compressed using a relatively slower + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t slow_compression_estimated_data_size = 0; + // Estimated size of data blocks if compressed using a relatively faster + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t fast_compression_estimated_data_size = 0; + // Offset of the value of the property "external sst file global seqno" in the + // file if the property exists. + // 0 means not exists. + uint64_t external_sst_file_global_seqno_offset = 0; + + // DB identity + // db_id is an identifier generated the first time the DB is created + // If DB identity is unset or unassigned, `db_id` will be an empty string. 
+ std::string db_id; + + // DB session identity + // db_session_id is an identifier that gets reset every time the DB is opened + // If DB session identity is unset or unassigned, `db_session_id` will be an + // empty string. + std::string db_session_id; + + // Location of the machine hosting the DB instance + // db_host_id identifies the location of the host in some form + // (hostname by default, but can also be any string of the user's choosing). + // It can potentially change whenever the DB is opened + std::string db_host_id; // Name of the column family with which this SST file is associated. // If column family is unknown, `column_family_name` will be an empty string. @@ -222,9 +288,6 @@ UserCollectedProperties user_collected_properties; UserCollectedProperties readable_properties; - // The offset of the value of each property in the file. - std::map properties_offsets; - // convert this object to a human readable form // @prop_delim: delimiter for each property. std::string ToString(const std::string& prop_delim = "; ", @@ -233,6 +296,11 @@ // Aggregate the numerical member variables of the specified // TableProperties. void Add(const TableProperties& tp); + + // Subset of properties that make sense when added together + // between tables. Keys match field names in this class instead + // of using full property names. 
+ std::map GetAggregatablePropertiesAsMap() const; }; // Extra properties diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,13 +13,15 @@ #pragma once -#include #include +#include #include #include #include #include +#include "rocksdb/rocksdb_namespace.h" + #if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) && \ defined(ROCKSDB_SUPPORT_THREAD_LOCAL) #define ROCKSDB_USING_THREAD_STATUS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -19,8 +19,7 @@ // a time. class TraceWriter { public: - TraceWriter() {} - virtual ~TraceWriter() {} + virtual ~TraceWriter() = default; virtual Status Write(const Slice& data) = 0; virtual Status Close() = 0; @@ -28,21 +27,26 @@ }; // TraceReader allows reading RocksDB traces from any system, one operation at -// a time. A RocksDB Replayer could depend on this to replay opertions. +// a time. A RocksDB Replayer could depend on this to replay operations. class TraceReader { public: - TraceReader() {} - virtual ~TraceReader() {} + virtual ~TraceReader() = default; virtual Status Read(std::string* data) = 0; virtual Status Close() = 0; + + // Seek back to the trace header. Replayer can call this method to restart + // replaying. Note this method may fail if the reader is already closed. 
+ virtual Status Reset() = 0; }; -// Factory methods to read/write traces from/to a file. +// Factory methods to write/read traces to/from a file. +// The implementations may not be thread-safe. Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, const std::string& trace_filename, std::unique_ptr* trace_writer); Status NewFileTraceReader(Env* env, const EnvOptions& env_options, const std::string& trace_filename, std::unique_ptr* trace_reader); + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,247 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +class DB; + +// Supported trace record types. +enum TraceType : char { + kTraceNone = 0, + kTraceBegin = 1, + kTraceEnd = 2, + // Query level tracing related trace types. + kTraceWrite = 3, + kTraceGet = 4, + kTraceIteratorSeek = 5, + kTraceIteratorSeekForPrev = 6, + // Block cache tracing related trace types. + kBlockTraceIndexBlock = 7, + kBlockTraceFilterBlock = 8, + kBlockTraceDataBlock = 9, + kBlockTraceUncompressionDictBlock = 10, + kBlockTraceRangeDeletionBlock = 11, + // IO tracing related trace type. + kIOTracer = 12, + // Query level tracing related trace type. 
+ kTraceMultiGet = 13, + // All trace types should be added before kTraceMax + kTraceMax, +}; + +class GetQueryTraceRecord; +class IteratorSeekQueryTraceRecord; +class MultiGetQueryTraceRecord; +class TraceRecordResult; +class WriteQueryTraceRecord; + +// Base class for all types of trace records. +class TraceRecord { + public: + explicit TraceRecord(uint64_t timestamp); + + virtual ~TraceRecord() = default; + + // Type of the trace record. + virtual TraceType GetTraceType() const = 0; + + // Timestamp (in microseconds) of this trace. + virtual uint64_t GetTimestamp() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const GetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const IteratorSeekQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const MultiGetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + }; + + // Accept the handler and report the corresponding result in `result`. + virtual Status Accept(Handler* handler, + std::unique_ptr* result) = 0; + + // Create a handler for the exeution of TraceRecord. + static Handler* NewExecutionHandler( + DB* db, const std::vector& handles); + + private: + uint64_t timestamp_; +}; + +// Base class for all query types of trace records. +class QueryTraceRecord : public TraceRecord { + public: + explicit QueryTraceRecord(uint64_t timestamp); +}; + +// Trace record for DB::Write() operation. +class WriteQueryTraceRecord : public QueryTraceRecord { + public: + WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, uint64_t timestamp); + + WriteQueryTraceRecord(const std::string& write_batch_rep, uint64_t timestamp); + + virtual ~WriteQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceWrite; } + + // rep string for the WriteBatch. 
+ virtual Slice GetWriteBatchRep() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + PinnableSlice rep_; +}; + +// Trace record for DB::Get() operation +class GetQueryTraceRecord : public QueryTraceRecord { + public: + GetQueryTraceRecord(uint32_t column_family_id, PinnableSlice&& key, + uint64_t timestamp); + + GetQueryTraceRecord(uint32_t column_family_id, const std::string& key, + uint64_t timestamp); + + virtual ~GetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceGet; } + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to get. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Base class for all Iterator related operations. +class IteratorQueryTraceRecord : public QueryTraceRecord { + public: + explicit IteratorQueryTraceRecord(uint64_t timestamp); + + IteratorQueryTraceRecord(PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorQueryTraceRecord(const std::string& lower_bound, + const std::string& upper_bound, uint64_t timestamp); + + virtual ~IteratorQueryTraceRecord() override; + + // Get the iterator's lower/upper bound. They may be used in ReadOptions to + // create an Iterator instance. + virtual Slice GetLowerBound() const; + virtual Slice GetUpperBound() const; + + private: + PinnableSlice lower_; + PinnableSlice upper_; +}; + +// Trace record for Iterator::Seek() and Iterator::SeekForPrev() operation. +class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord { + public: + // Currently we only support Seek() and SeekForPrev(). 
+ enum SeekType { + kSeek = kTraceIteratorSeek, + kSeekForPrev = kTraceIteratorSeekForPrev + }; + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, + const std::string& lower_bound, + const std::string& upper_bound, + uint64_t timestamp); + + virtual ~IteratorSeekQueryTraceRecord() override; + + // Trace type matches the seek type. + TraceType GetTraceType() const override; + + // Type of seek, Seek or SeekForPrev. + virtual SeekType GetSeekType() const; + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to seek to. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + SeekType type_; + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Trace record for DB::MultiGet() operation. +class MultiGetQueryTraceRecord : public QueryTraceRecord { + public: + MultiGetQueryTraceRecord(std::vector column_family_ids, + std::vector&& keys, + uint64_t timestamp); + + MultiGetQueryTraceRecord(std::vector column_family_ids, + const std::vector& keys, + uint64_t timestamp); + + virtual ~MultiGetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceMultiGet; } + + // Column familiy IDs. + virtual std::vector GetColumnFamilyIDs() const; + + // Keys to get. 
+ virtual std::vector GetKeys() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + std::vector cf_ids_; + std::vector keys_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +class IteratorTraceExecutionResult; +class MultiValuesTraceExecutionResult; +class SingleValueTraceExecutionResult; +class StatusOnlyTraceExecutionResult; + +// Base class for the results of all types of trace records. +// Theses classes can be used to report the execution result of +// TraceRecord::Handler::Handle() or TraceRecord::Accept(). +class TraceRecordResult { + public: + explicit TraceRecordResult(TraceType trace_type); + + virtual ~TraceRecordResult() = default; + + // Trace type of the corresponding TraceRecord. 
+ virtual TraceType GetTraceType() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const StatusOnlyTraceExecutionResult& result) = 0; + + virtual Status Handle(const SingleValueTraceExecutionResult& result) = 0; + + virtual Status Handle(const MultiValuesTraceExecutionResult& result) = 0; + + virtual Status Handle(const IteratorTraceExecutionResult& result) = 0; + }; + + // Accept the handler. + virtual Status Accept(Handler* handler) = 0; + + private: + TraceType trace_type_; +}; + +// Base class for the results from the trace record execution handler (created +// by TraceRecord::NewExecutionHandler()). +// +// The actual execution status or returned values may be hidden from +// TraceRecord::Handler::Handle and TraceRecord::Accept. For example, a +// GetQueryTraceRecord's execution calls DB::Get() internally. DB::Get() may +// return Status::NotFound() but TraceRecord::Handler::Handle() or +// TraceRecord::Accept() will still return Status::OK(). The actual status from +// DB::Get() and the returned value string may be saved in a +// SingleValueTraceExecutionResult. +class TraceExecutionResult : public TraceRecordResult { + public: + TraceExecutionResult(uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + // Execution start/end timestamps and request latency in microseconds. + virtual uint64_t GetStartTimestamp() const; + virtual uint64_t GetEndTimestamp() const; + inline uint64_t GetLatency() const { + return GetEndTimestamp() - GetStartTimestamp(); + } + + private: + uint64_t ts_start_; + uint64_t ts_end_; +}; + +// Result for operations that only return a single Status. 
+// Example operation: DB::Write() +class StatusOnlyTraceExecutionResult : public TraceExecutionResult { + public: + StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~StatusOnlyTraceExecutionResult() override = default; + + // Return value of DB::Write(), etc. + virtual const Status& GetStatus() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; +}; + +// Result for operations that return a Status and a value. +// Example operation: DB::Get() +class SingleValueTraceExecutionResult : public TraceExecutionResult { + public: + SingleValueTraceExecutionResult(Status status, const std::string& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + SingleValueTraceExecutionResult(Status status, std::string&& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~SingleValueTraceExecutionResult() override; + + // Return status of DB::Get(). + virtual const Status& GetStatus() const; + + // Value for the searched key. + virtual const std::string& GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; + std::string value_; +}; + +// Result for operations that return multiple Status(es) and values as vectors. +// Example operation: DB::MultiGet() +class MultiValuesTraceExecutionResult : public TraceExecutionResult { + public: + MultiValuesTraceExecutionResult(std::vector multi_status, + std::vector values, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~MultiValuesTraceExecutionResult() override; + + // Returned Status(es) of DB::MultiGet(). + virtual const std::vector& GetMultiStatus() const; + + // Returned values for the searched keys. 
+ virtual const std::vector& GetValues() const; + + virtual Status Accept(Handler* handler) override; + + private: + std::vector multi_status_; + std::vector values_; +}; + +// Result for Iterator operations. +// Example operations: Iterator::Seek(), Iterator::SeekForPrev() +class IteratorTraceExecutionResult : public TraceExecutionResult { + public: + IteratorTraceExecutionResult(bool valid, Status status, PinnableSlice&& key, + PinnableSlice&& value, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + IteratorTraceExecutionResult(bool valid, Status status, + const std::string& key, const std::string& value, + uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + virtual ~IteratorTraceExecutionResult() override; + + // Return if the Iterator is valid. + virtual bool GetValid() const; + + // Return the status of the Iterator. + virtual const Status& GetStatus() const; + + // Key of the current iterating entry, empty if GetValid() is false. + virtual Slice GetKey() const; + + // Value of the current iterating entry, empty if GetValid() is false. + virtual Slice GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + bool valid_; + Status status_; + PinnableSlice key_; + PinnableSlice value_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { class LogFile; -typedef std::vector> VectorLogPtr; +using VectorLogPtr = std::vector>; enum WalFileType { /* Indicates that WAL file is in archive directory. 
WAL files are moved from diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/types.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/types.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/types.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/types.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,12 +12,44 @@ // Define all public custom types here. +using ColumnFamilyId = uint32_t; + // Represents a sequence number in a WAL file. -typedef uint64_t SequenceNumber; +using SequenceNumber = uint64_t; const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + +enum class BlobFileCreationReason { + kFlush, + kCompaction, + kRecovery, +}; + +// The types of files RocksDB uses in a DB directory. (Available for +// advanced options.) +enum FileType { + kWalFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + // User-oriented representation of internal key types. +// Ordering of this enum entries should not change. enum EntryType { kEntryPut, kEntryDelete, @@ -25,30 +57,8 @@ kEntryMerge, kEntryRangeDeletion, kEntryBlobIndex, + kEntryDeleteWithTimestamp, kEntryOther, }; -// tuple. -struct FullKey { - Slice user_key; - SequenceNumber sequence; - EntryType type; - - FullKey() : sequence(0) {} // Intentionally left uninitialized (for speed) - FullKey(const Slice& u, const SequenceNumber& seq, EntryType t) - : user_key(u), sequence(seq), type(t) {} - std::string DebugString(bool hex = false) const; - - void clear() { - user_key.clear(); - sequence = 0; - type = EntryType::kEntryPut; - } -}; - -// Parse slice representing internal key to FullKey -// Parsed FullKey is valid for as long as the memory pointed to by -// internal_key is alive. 
-bool ParseFullKey(const Slice& internal_key, FullKey* result); - } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// EXPERIMENTAL: This API is subject to change +// +// Computes a stable, universally unique 192-bit (24 binary char) identifier +// for an SST file from TableProperties. This is supported for table (SST) +// files created with RocksDB 6.24 and later. NotSupported will be returned +// for other cases. The first 16 bytes (128 bits) is of sufficient quality +// for almost all applications, and shorter prefixes are usable as a +// hash of the full unique id. +// +// Note: .c_str() is not compatible with binary char strings, so using +// .c_str() on the result will often result in information loss and very +// poor uniqueness probability. +// +// More detail: the first 128 bits are *guaranteed* unique for SST files +// generated in the same process (even different DBs, RocksDB >= 6.26), +// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26) +// so that the "all zeros" value can be used reliably for a null ID. 
+// Assuming one generates many SST files in the lifetime of each process, +// the probability of collision between processes is "better than +// random": if processes generate n SST files on average, we expect to +// generate roughly 2^64 * sqrt(n) files before first collision in the +// first 128 bits. See https://github.com/pdillinger/unique_id +// Using the full 192 bits, we expect to generate roughly 2^96 * sqrt(n) +// files before first collision. +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id); + +// EXPERIMENTAL: This API is subject to change +// +// Converts a binary string (unique id) to hexadecimal, with each 64 bits +// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B +// Also works on unique id prefix. +std::string UniqueIdToHumanString(const std::string &id); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,12 @@ #pragma once -#include #include +#include #include +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // @@ -36,12 +38,12 @@ // The size amplification is defined as the amount (in percentage) of // additional storage needed to store a single byte of data in the database. // For example, a size amplification of 2% means that a database that - // contains 100 bytes of user-data may occupy upto 102 bytes of + // contains 100 bytes of user-data may occupy up to 102 bytes of // physical storage. By this definition, a fully compacted database has // a size amplification of 0%. 
Rocksdb uses the following heuristic // to calculate size amplification: it assumes that all files excluding // the earliest file contribute to the size amplification. - // Default: 200, which means that a 100 byte database could require upto + // Default: 200, which means that a 100 byte database could require up to // 300 bytes of storage. unsigned int max_size_amplification_percent; @@ -72,6 +74,13 @@ // Default: false bool allow_trivial_move; + // EXPERIMENTAL + // If true, try to limit compaction size under max_compaction_bytes. + // This might cause higher write amplification, but can prevent some + // problem caused by large compactions. + // Default: false + bool incremental; + // Default set of parameters CompactionOptionsUniversal() : size_ratio(1), @@ -80,7 +89,8 @@ max_size_amplification_percent(200), compression_size_percent(-1), stop_style(kCompactionStopStyleTotalSize), - allow_trivial_move(false) {} + allow_trivial_move(false), + incremental(false) {} }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,616 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The default DB file checksum function name. +constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; +// The default BackupEngine file checksum function name. +constexpr char kBackupFileChecksumFuncName[] = "crc32c"; + +struct BackupEngineOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // share_table_files supports table and blob files. + // + // If share_table_files == true, the backup directory will share table and + // blob files among backups, to save space among backups of the same DB and to + // enable incremental backups by only copying new files. + // If share_table_files == false, each backup will be on its own and will not + // share any data with other backups. + // + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup and + // restore even on a machine crash/reboot. Backup and restore processes are + // slower with sync enabled. If sync == false, we can only guarantee that + // other previously synced backups and restores are not modified while + // creating a new one. 
+ // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "backup_rate_limiter" + // Default: 0 + uint64_t backup_rate_limit; + + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr backup_rate_limiter{nullptr}; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "restore_rate_limiter" + // Default: 0 + uint64_t restore_rate_limit; + + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr restore_rate_limiter{nullptr}; + + // share_files_with_checksum supports table and blob files. + // + // Only used if share_table_files is set to true. Setting to false is + // DEPRECATED and potentially dangerous because in that case BackupEngine + // can lose data if backing up databases with distinct or divergent + // history, for example if restoring from a backup other than the latest, + // writing to the DB, and creating another backup. 
Setting to true (default) + // prevents these issues by ensuring that different table files (SSTs) and + // blob files with the same number are treated as distinct. See + // share_files_with_checksum_naming and ShareFilesNaming. + // + // Default: true + bool share_files_with_checksum; + + // Up to this many background threads will copy files for CreateNewBackup() + // and RestoreDBFromBackup() + // Default: 1 + int max_background_operations; + + // During backup user can get callback every time next + // callback_trigger_interval_size bytes being copied. + // Default: 4194304 + uint64_t callback_trigger_interval_size; + + // For BackupEngineReadOnly, Open() will open at most this many of the + // latest non-corrupted backups. + // + // Note: this setting is ignored (behaves like INT_MAX) for any kind of + // writable BackupEngine because it would inhibit accounting for shared + // files for proper backup deletion, including purging any incompletely + // created backups on creation of a new backup. + // + // Default: INT_MAX + int max_valid_backups_to_open; + + // ShareFilesNaming describes possible naming schemes for backup + // table and blob file names when they are stored in the + // shared_checksum directory (i.e., both share_table_files and + // share_files_with_checksum are true). + enum ShareFilesNaming : uint32_t { + // Backup blob filenames are __.blob and + // backup SST filenames are __.sst + // where is an unsigned decimal integer. This is the + // original/legacy naming scheme for share_files_with_checksum, + // with two problems: + // * At massive scale, collisions on this triple with different file + // contents is plausible. + // * Determining the name to use requires computing the checksum, + // so generally requires reading the whole file even if the file + // is already backed up. + // + // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** + kLegacyCrc32cAndFileSize = 1U, + + // Backup SST filenames are _s.sst. 
This + // pair of values should be very strongly unique for a given SST file + // and easily determined before computing a checksum. The 's' indicates + // the value is a DB session id, not a checksum. + // + // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. + // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize + // will be used instead, matching the names assigned by RocksDB versions + // not supporting the newer naming scheme. + // * See also flags below. + kUseDbSessionId = 2U, + + kMaskNoNamingFlags = 0xffffU, + + // If not already part of the naming scheme, insert + // _ + // before .sst and .blob in the name. In case of user code actually parsing + // the last _ before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. + // + // We do not consider SST and blob file sizes to have sufficient entropy to + // contribute significantly to naming uniqueness. + kFlagIncludeFileSize = 1U << 31, + + kMaskNamingFlags = ~kMaskNoNamingFlags, + }; + + // Naming option for share_files_with_checksum table and blob files. See + // ShareFilesNaming for details. + // + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table and + // blob files naming schemes. + // + // However, modifying this option and saving more backups to the same + // directory can lead to the same file getting saved again to that + // directory, under the new shared name in addition to the old shared + // name. 
+ // + // Default: kUseDbSessionId | kFlagIncludeFileSize + // + // Note: This option comes into effect only if both share_files_with_checksum + // and share_table_files are true. + ShareFilesNaming share_files_with_checksum_naming; + + void Dump(Logger* logger) const; + + explicit BackupEngineOptions( + const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, + bool _sync = true, bool _destroy_old_data = false, + bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, + uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, + int _max_valid_backups_to_open = INT_MAX, + ShareFilesNaming _share_files_with_checksum_naming = + static_cast(kUseDbSessionId | kFlagIncludeFileSize)) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit), + share_files_with_checksum(true), + max_background_operations(_max_background_operations), + callback_trigger_interval_size(_callback_trigger_interval_size), + max_valid_backups_to_open(_max_valid_backups_to_open), + share_files_with_checksum_naming(_share_files_with_checksum_naming) { + assert(share_table_files || !share_files_with_checksum); + assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0); + } +}; + +inline BackupEngineOptions::ShareFilesNaming operator&( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert(r == BackupEngineOptions::kMaskNoNamingFlags || + (r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l & r); +} + +inline BackupEngineOptions::ShareFilesNaming operator|( + 
BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l | r); +} + +struct CreateBackupOptions { + // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. + bool flush_before_backup = false; + + // Callback for reporting progress, based on callback_trigger_interval_size. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + std::function progress_callback = []() {}; + + // If false, background_thread_cpu_priority is ignored. + // Otherwise, the cpu priority can be decreased, + // if you try to increase the priority, the priority will not change. + // The initial priority of the threads is CpuPriority::kNormal, + // so you can decrease to priorities lower than kNormal. + bool decrease_background_thread_cpu_priority = false; + CpuPriority background_thread_cpu_priority = CpuPriority::kNormal; +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupEngineOptions::backup_log_files = false for + // persisting in-memory databases. 
+ // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +using BackupID = uint32_t; + +using BackupFileInfo = FileStorageInfo; + +struct BackupInfo { + BackupID backup_id = 0U; + // Creation time, according to GetCurrentTime + int64_t timestamp = 0; + + // Total size in bytes (based on file payloads, not including filesystem + // overheads or backup meta file) + uint64_t size = 0U; + + // Number of backed up files, some of which might be shared with other + // backups. Does not include backup meta file. + uint32_t number_files = 0U; + + // Backup API user metadata + std::string app_metadata; + + // Backup file details, if requested with include_file_details=true + std::vector file_details; + + // DB "name" (a directory in the backup_env) for opening this backup as a + // read-only DB. This should also be used as the DBOptions::wal_dir, such + // as by default setting wal_dir="". See also env_for_open. + // This field is only set if include_file_details=true + std::string name_for_open; + + // An Env(+FileSystem) for opening this backup as a read-only DB, with + // DB::OpenForReadOnly or similar. This field is only set if + // include_file_details=true. (The FileSystem in this Env takes care + // of making shared backup files openable from the `name_for_open` DB + // directory.) See also name_for_open. + // + // This Env might or might not be shared with other backups. To work + // around DBOptions::env being a raw pointer, this is a shared_ptr so + // that keeping either this BackupInfo, the BackupEngine, or a copy of + // this shared_ptr alive is sufficient to keep the Env alive for use by + // a read-only DB. 
+ std::shared_ptr env_for_open; + + BackupInfo() {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files, const std::string& _app_metadata) + : backup_id(_backup_id), + timestamp(_timestamp), + size(_size), + number_files(_number_files), + app_metadata(_app_metadata) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; +}; + +// Read-only functions of a BackupEngine. (Restore writes to another directory +// not the backup directory.) See BackupEngine comments for details on +// safe concurrent operations. +class BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnlyBase() {} + + // Returns info about the latest good backup in backup_info, or NotFound + // no good backup exists. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetLatestBackupInfo( + BackupInfo* backup_info, bool include_file_details = false) const = 0; + + // Returns info about a specific backup in backup_info, or NotFound + // or Corruption status if the requested backup id does not exist or is + // known corrupt. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. 
+ virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const = 0; + + // Returns info about non-corrupt backups in backup_infos. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual void GetBackupInfo(std::vector* backup_infos, + bool include_file_details = false) const = 0; + + // Returns info about corrupt backups in corrupt_backups. + // WARNING: Any write to the BackupEngine could trigger automatic + // GarbageCollect(), which could delete files that would be needed to + // manually recover a corrupt backup or to preserve an unrecognized (e.g. + // incompatible future version) backup. + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) const = 0; + + // Restore to specified db_dir and wal_dir from backup_id. + virtual IOStatus RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual IOStatus RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); + } + + // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id + virtual IOStatus RestoreDBFromLatestBackup( + const RestoreOptions& options, const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual IOStatus RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromLatestBackup(options, db_dir, wal_dir); + } + + // If verify_with_checksum is true, this function + // inspects the current checksums and file sizes of backup files to see if + // they match our expectation. 
+ // + // If verify_with_checksum is false, this function + // checks that each file exists and that the size of the file matches our + // expectation. It does not check file checksum. + // + // If this BackupEngine created the backup, it compares the files' current + // sizes (and current checksum) against the number of bytes written to + // them (and the checksum calculated) during creation. + // Otherwise, it compares the files' current sizes (and checksums) against + // their sizes (and checksums) when the BackupEngine was opened. + // + // Returns Status::OK() if all checks are good + virtual IOStatus VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const = 0; +}; + +// Append-only functions of a BackupEngine. See BackupEngine comment for +// details on distinction between Append and Write operations and safe +// concurrent operations. +class BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngineAppendOnlyBase() {} + + // same as CreateNewBackup, but stores extra application metadata. + virtual IOStatus CreateNewBackupWithMetadata( + const CreateBackupOptions& options, DB* db, + const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0; + + // keep here for backward compatibility. + virtual IOStatus CreateNewBackupWithMetadata( + DB* db, const std::string& app_metadata, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackupWithMetadata(options, db, app_metadata); + } + + // Captures the state of the database by creating a new (latest) backup. + // On success (OK status), the BackupID of the new backup is saved to + // *new_backup_id when not nullptr. + // NOTE: db_paths and cf_paths are not supported for creating backups, + // and NotSupported will be returned when the DB (without WALs) uses more + // than one directory. 
+ virtual IOStatus CreateNewBackup(const CreateBackupOptions& options, DB* db, + BackupID* new_backup_id = nullptr) { + return CreateNewBackupWithMetadata(options, db, "", new_backup_id); + } + + // keep here for backward compatibility. + virtual IOStatus CreateNewBackup( + DB* db, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackup(options, db); + } + + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediately, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. + virtual void StopBackup() = 0; + + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). This does not delete corrupt backups but can delete files that + // would be needed to manually recover a corrupt backup or to preserve an + // unrecognized (e.g. incompatible future version) backup. + // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. + // NOTE2: For concurrency and interference purposes (see BackupEngine + // comment), GarbageCollect (GC) is like other Append operations, even + // though it seems different. Although GC can delete physical data, it does + // not delete any logical data read by Read operations. 
GC can interfere + // with Append or Write operations in another BackupEngine on the same + // backup_dir, because temporary files will be treated as obsolete and + // deleted. + virtual IOStatus GarbageCollect() = 0; +}; + +// A backup engine for organizing and managing backups. +// This class is not user-extensible. +// +// This class declaration adds "Write" operations in addition to the +// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase. +// +// # Concurrency between threads on the same BackupEngine* object +// +// As of version 6.20, BackupEngine* operations are generally thread-safe, +// using a read-write lock, though single-thread operation is still +// recommended to avoid TOCTOU bugs. Specifically, particular kinds of +// concurrent operations behave like this: +// +// op1\op2| Read | Append | Write +// -------|-------|--------|-------- +// Read | conc | block | block +// Append | block | block | block +// Write | block | block | block +// +// conc = operations safely proceed concurrently +// block = one of the operations safely blocks until the other completes. +// There is generally no guarantee as to which completes first. +// +// StopBackup is the only operation that affects an ongoing operation. 
+// +// # Interleaving operations between BackupEngine* objects open on the +// same backup_dir +// +// It is recommended only to have one BackupEngine* object open for a given +// backup_dir, but it is possible to mix / interleave some operations +// (regardless of whether they are concurrent) with these caveats: +// +// op1\op2| Open | Read | Append | Write +// -------|--------|--------|--------|-------- +// Open | conc | conc | atomic | unspec +// Read | conc | conc | old | unspec +// Append | atomic | old | unspec | unspec +// Write | unspec | unspec | unspec | unspec +// +// Special case: Open with destroy_old_data=true is really a Write +// +// conc = operations safely proceed, concurrently when applicable +// atomic = operations are effectively atomic; if a concurrent Append +// operation has not completed at some key point during Open, the +// opened BackupEngine* will never see the result of the Append op. +// old = Read operations do not include any state changes from other +// BackupEngine* objects; they return the state at their Open time. +// unspec = Behavior is unspecified, including possibly trashing the +// backup_dir, but is "memory safe" (no C++ undefined behavior) +// +class BackupEngine : public BackupEngineReadOnlyBase, + public BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngine() {} + + // BackupEngineOptions have to be the same as the ones used in previous + // BackupEngines for the same backup directory. + static IOStatus Open(const BackupEngineOptions& options, Env* db_env, + BackupEngine** backup_engine_ptr); + + // keep for backward compatibility. + static IOStatus Open(Env* db_env, const BackupEngineOptions& options, + BackupEngine** backup_engine_ptr) { + return BackupEngine::Open(options, db_env, backup_engine_ptr); + } + + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. + virtual IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + + // Deletes a specific backup. 
If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. + virtual IOStatus DeleteBackup(BackupID backup_id) = 0; +}; + +// A variant of BackupEngine that only allows "Read" operations. See +// BackupEngine comment for details. This class is not user-extensible. +class BackupEngineReadOnly : public BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnly() {} + + static IOStatus Open(const BackupEngineOptions& options, Env* db_env, + BackupEngineReadOnly** backup_engine_ptr); + // keep for backward compatibility. + static IOStatus Open(Env* db_env, const BackupEngineOptions& options, + BackupEngineReadOnly** backup_engine_ptr) { + return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr); + } +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -1,341 +1,26 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +// This is a DEPRECATED header for API backward compatibility. Please +// use backup_engine.h. #pragma once #ifndef ROCKSDB_LITE +// A legacy unnecessary include #include -#include -#include -#include -#include -#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/backup_engine.h" -#include "rocksdb/env.h" -#include "rocksdb/status.h" +// A legacy unnecessary include +#include "rocksdb/utilities/stackable_db.h" namespace ROCKSDB_NAMESPACE { -struct BackupableDBOptions { - // Where to keep the backup files. Has to be different than dbname_ - // Best to set this to dbname_ + "/backups" - // Required - std::string backup_dir; - - // Backup Env object. It will be used for backup file I/O. If it's - // nullptr, backups will be written out using DBs Env. If it's - // non-nullptr, backup's I/O will be performed using this object. - // If you want to have backups on HDFS, use HDFS Env here! - // Default: nullptr - Env* backup_env; - - // If share_table_files == true, backup will assume that table files with - // same name have the same contents. This enables incremental backups and - // avoids unnecessary data copies. - // If share_table_files == false, each backup will be on its own and will - // not share any data with other backups. - // default: true - bool share_table_files; - - // Backup info and error messages will be written to info_log - // if non-nullptr. - // Default: nullptr - Logger* info_log; - - // If sync == true, we can guarantee you'll get consistent backup even - // on a machine crash/reboot. Backup process is slower with sync enabled. - // If sync == false, we don't guarantee anything on machine reboot. However, - // chances are some of the backups are consistent. - // Default: true - bool sync; - - // If true, it will delete whatever backups there are already - // Default: false - bool destroy_old_data; - - // If false, we won't backup log files. 
This option can be useful for backing - // up in-memory databases where log file are persisted, but table files are in - // memory. - // Default: true - bool backup_log_files; - - // Max bytes that can be transferred in a second during backup. - // If 0, go as fast as you can - // Default: 0 - uint64_t backup_rate_limit; - - // Backup rate limiter. Used to control transfer speed for backup. If this is - // not null, backup_rate_limit is ignored. - // Default: nullptr - std::shared_ptr backup_rate_limiter{nullptr}; - - // Max bytes that can be transferred in a second during restore. - // If 0, go as fast as you can - // Default: 0 - uint64_t restore_rate_limit; - - // Restore rate limiter. Used to control transfer speed during restore. If - // this is not null, restore_rate_limit is ignored. - // Default: nullptr - std::shared_ptr restore_rate_limiter{nullptr}; - - // Only used if share_table_files is set to true. If true, will consider that - // backups can come from different databases, hence a sst is not uniquely - // identifed by its name, but by the triple (file name, crc32, file length) - // Default: false - // Note: this is an experimental option, and you'll need to set it manually - // *turn it on only if you know what you're doing* - bool share_files_with_checksum; - - // Up to this many background threads will copy files for CreateNewBackup() - // and RestoreDBFromBackup() - // Default: 1 - int max_background_operations; - - // During backup user can get callback every time next - // callback_trigger_interval_size bytes being copied. - // Default: 4194304 - uint64_t callback_trigger_interval_size; - - // For BackupEngineReadOnly, Open() will open at most this many of the - // latest non-corrupted backups. 
- // - // Note: this setting is ignored (behaves like INT_MAX) for any kind of - // writable BackupEngine because it would inhibit accounting for shared - // files for proper backup deletion, including purging any incompletely - // created backups on creation of a new backup. - // - // Default: INT_MAX - int max_valid_backups_to_open; - - void Dump(Logger* logger) const; - - explicit BackupableDBOptions( - const std::string& _backup_dir, Env* _backup_env = nullptr, - bool _share_table_files = true, Logger* _info_log = nullptr, - bool _sync = true, bool _destroy_old_data = false, - bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, - uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, - uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, - int _max_valid_backups_to_open = INT_MAX) - : backup_dir(_backup_dir), - backup_env(_backup_env), - share_table_files(_share_table_files), - info_log(_info_log), - sync(_sync), - destroy_old_data(_destroy_old_data), - backup_log_files(_backup_log_files), - backup_rate_limit(_backup_rate_limit), - restore_rate_limit(_restore_rate_limit), - share_files_with_checksum(false), - max_background_operations(_max_background_operations), - callback_trigger_interval_size(_callback_trigger_interval_size), - max_valid_backups_to_open(_max_valid_backups_to_open) { - assert(share_table_files || !share_files_with_checksum); - } -}; - -struct RestoreOptions { - // If true, restore won't overwrite the existing log files in wal_dir. It will - // also move all log files from archive directory to wal_dir. Use this option - // in combination with BackupableDBOptions::backup_log_files = false for - // persisting in-memory databases. 
- // Default: false - bool keep_log_files; - - explicit RestoreOptions(bool _keep_log_files = false) - : keep_log_files(_keep_log_files) {} -}; - -typedef uint32_t BackupID; - -struct BackupInfo { - BackupID backup_id; - int64_t timestamp; - uint64_t size; - - uint32_t number_files; - std::string app_metadata; - - BackupInfo() {} - - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, - uint32_t _number_files, const std::string& _app_metadata) - : backup_id(_backup_id), - timestamp(_timestamp), - size(_size), - number_files(_number_files), - app_metadata(_app_metadata) {} -}; - -class BackupStatistics { - public: - BackupStatistics() { - number_success_backup = 0; - number_fail_backup = 0; - } - - BackupStatistics(uint32_t _number_success_backup, - uint32_t _number_fail_backup) - : number_success_backup(_number_success_backup), - number_fail_backup(_number_fail_backup) {} - - ~BackupStatistics() {} - - void IncrementNumberSuccessBackup(); - void IncrementNumberFailBackup(); - - uint32_t GetNumberSuccessBackup() const; - uint32_t GetNumberFailBackup() const; - - std::string ToString() const; - - private: - uint32_t number_success_backup; - uint32_t number_fail_backup; -}; - -// A backup engine for accessing information about backups and restoring from -// them. -class BackupEngineReadOnly { - public: - virtual ~BackupEngineReadOnly() {} - - static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngineReadOnly** backup_engine_ptr); - - // Returns info about backups in backup_info - // You can GetBackupInfo safely, even with other BackupEngine performing - // backups on the same directory - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // Restoring DB from backup is NOT safe when there is another BackupEngine - // running that might call DeleteBackup() or PurgeOldBackups(). 
It is caller's - // responsibility to synchronize the operation, i.e. don't delete the backup - // when you're restoring from it - // See also the corresponding doc in BackupEngine - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // See the corresponding doc in BackupEngine - virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // checks that each file exists and that the size of the file matches our - // expectations. it does not check file checksum. - // - // If this BackupEngine created the backup, it compares the files' current - // sizes against the number of bytes written to them during creation. - // Otherwise, it compares the files' current sizes against their sizes when - // the BackupEngine was opened. - // - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id) = 0; -}; - -// A backup engine for creating new backups. -class BackupEngine { - public: - virtual ~BackupEngine() {} - - // BackupableDBOptions have to be the same as the ones used in previous - // BackupEngines for the same backup directory. - static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngine** backup_engine_ptr); - - // same as CreateNewBackup, but stores extra application metadata - // Flush will always trigger if 2PC is enabled. - // If write-ahead logs are disabled, set flush_before_backup=true to - // avoid losing unflushed key/value pairs from the memtable. - virtual Status CreateNewBackupWithMetadata( - DB* db, const std::string& app_metadata, bool flush_before_backup = false, - std::function progress_callback = []() {}) = 0; - - // Captures the state of the database in the latest backup - // NOT a thread safe call - // Flush will always trigger if 2PC is enabled. 
- // If write-ahead logs are disabled, set flush_before_backup=true to - // avoid losing unflushed key/value pairs from the memtable. - virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false, - std::function progress_callback = - []() {}) { - return CreateNewBackupWithMetadata(db, "", flush_before_backup, - progress_callback); - } - - // Deletes old backups, keeping latest num_backups_to_keep alive. - // See also DeleteBackup. - virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; - - // Deletes a specific backup. If this operation (or PurgeOldBackups) - // is not completed due to crash, power failure, etc. the state - // will be cleaned up the next time you call DeleteBackup, - // PurgeOldBackups, or GarbageCollect. - virtual Status DeleteBackup(BackupID backup_id) = 0; - - // Call this from another thread if you want to stop the backup - // that is currently happening. It will return immediatelly, will - // not wait for the backup to stop. - // The backup will stop ASAP and the call to CreateNewBackup will - // return Status::Incomplete(). It will not clean up after itself, but - // the state will remain consistent. The state will be cleaned up the - // next time you call CreateNewBackup or GarbageCollect. - virtual void StopBackup() = 0; - - // Returns info about backups in backup_info - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // restore from backup with backup_id - // IMPORTANT -- if options_.share_table_files == true, - // options_.share_files_with_checksum == false, you restore DB from some - // backup that is not the latest, and you start creating new backups from the - // new DB, they will probably fail. - // - // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. 
- // If you add new data to the DB and try creating a new backup now, the - // database will diverge from backups 4 and 5 and the new backup will fail. - // If you want to create new backup, you will first have to delete backups 4 - // and 5. - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // restore from the latest backup - virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // checks that each file exists and that the size of the file matches our - // expectations. it does not check file checksum. - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id) = 0; - - // Will delete any files left over from incomplete creation or deletion of - // a backup. This is not normally needed as those operations also clean up - // after prior incomplete calls to the same kind of operation (create or - // delete). - // NOTE: This is not designed to delete arbitrary files added to the backup - // directory outside of BackupEngine, and clean-up is always subject to - // permissions on and availability of the underlying filesystem. - virtual Status GarbageCollect() = 0; -}; +using BackupableDBOptions = BackupEngineOptions; } // namespace ROCKSDB_NAMESPACE + #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,142 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +// The classes and functions in this header file is used for dumping out the +// blocks in a block cache, storing or transfering the blocks to another +// destination host, and load these blocks to the secondary cache at destination +// host. +// NOTE that: The classes, functions, and data structures are EXPERIMENTAL! They +// my be changed in the future when the development continues. + +// The major and minor version number of the data format to be stored/trandfered +// via CacheDumpWriter and read out via CacheDumpReader +static const int kCacheDumpMajorVersion = 0; +static const int kCacheDumpMinorVersion = 1; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is an abstract class to write or transfer the data that is created by +// CacheDumper. We pack one block with its block type, dump time, block key in +// the block cache, block len, block crc32c checksum and block itself as a unit +// and it is stored via WritePacket. Before we call WritePacket, we must call +// WriteMetadata once, which stores the sequence number, block unit checksum, +// and block unit size. +// We provide file based CacheDumpWriter to store the metadata and its package +// sequentially in a file as the defualt implementation. Users can implement +// their own CacheDumpWriter to store/transfer the data. For example, user can +// create a subclass which transfer the metadata and package on the fly. 
+class CacheDumpWriter { + public: + virtual ~CacheDumpWriter() = default; + + // Called ONCE before the calls to WritePacket + virtual IOStatus WriteMetadata(const Slice& metadata) = 0; + virtual IOStatus WritePacket(const Slice& data) = 0; + virtual IOStatus Close() = 0; +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is an abstract class to read or receive the data that is stored +// or transfered by CacheDumpWriter. Note that, ReadMetadata must be called +// once before we call a ReadPacket. +class CacheDumpReader { + public: + virtual ~CacheDumpReader() = default; + // Called ONCE before the calls to ReadPacket + virtual IOStatus ReadMetadata(std::string* metadata) = 0; + // Sets data to empty string on EOF + virtual IOStatus ReadPacket(std::string* data) = 0; + // (Close not needed) +}; + +// CacheDumpOptions is the option for CacheDumper and CacheDumpedLoader. Any +// dump or load process related control variables can be added here. +struct CacheDumpOptions { + SystemClock* clock; +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This the class to dump out the block in the block cache, store/transfer them +// via CacheDumpWriter. In order to dump out the blocks belonging to a certain +// DB or a list of DB (block cache can be shared by many DB), user needs to call +// SetDumpFilter to specify a list of DB to filter out the blocks that do not +// belong to those DB. +// A typical use case is: when we migrate a DB instance from host A to host B. +// We need to reopen the DB at host B after all the files are copied to host B. +// At this moment, the block cache at host B does not have any block from this +// migrated DB. Therefore, the read performance can be low due to cache warm up. +// By using CacheDumper before we shut down the DB at host A and using +// CacheDumpedLoader at host B before we reopen the DB, we can warmup the cache +// ahead. 
This function can be used in other use cases also. +class CacheDumper { + public: + virtual ~CacheDumper() = default; + // Only dump the blocks in the block cache that belong to the DBs in this list + virtual Status SetDumpFilter(std::vector db_list) { + (void)db_list; + return Status::NotSupported("SetDumpFilter is not supported"); + } + // The main function to dump out all the blocks that satisfy the filter + // condition from block cache to a certain CacheDumpWriter in one shot. This + // process may take some time. + virtual IOStatus DumpCacheEntriesToWriter() { + return IOStatus::NotSupported("DumpCacheEntriesToWriter is not supported"); + } +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is the class to load the dumped blocks to the destination cache. For now +// we only load the blocks to the SecondaryCache. In the future, we may plan to +// support loading to the block cache. +class CacheDumpedLoader { + public: + virtual ~CacheDumpedLoader() = default; + virtual IOStatus RestoreCacheEntriesToSecondaryCache() { + return IOStatus::NotSupported( + "RestoreCacheEntriesToSecondaryCache is not supported"); + } +}; + +// Get the writer which stores all the metadata and data sequentially to a file +IOStatus NewToFileCacheDumpWriter(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* writer); + +// Get the reader which read out the metadata and data sequentially from a file +IOStatus NewFromFileCacheDumpReader(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* reader); + +// Get the default cache dumper +Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options, + const std::shared_ptr& cache, + std::unique_ptr&& writer, + std::unique_ptr* cache_dumper); + +// Get the default cache dump loader +Status NewDefaultCacheDumpedLoader( + const CacheDumpOptions& dump_options, + const BlockBasedTableOptions& 
toptions, + const std::shared_ptr& secondary_cache, + std::unique_ptr&& reader, + std::unique_ptr* cache_dump_loader); + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,21 +24,28 @@ // Creates a Checkpoint object to be used for creating openable snapshots static Status Create(DB* db, Checkpoint** checkpoint_ptr); - // Builds an openable snapshot of RocksDB on the same disk, which - // accepts an output directory on the same disk, and under the directory - // (1) hard-linked SST files pointing to existing live SST files - // SST files will be copied if output directory is on a different filesystem - // (2) a copied manifest files and other files - // The directory should not already exist and will be created by this API. - // The directory will be an absolute path + // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an + // absolute path. The specified directory should not exist, since it will be + // created by the API. + // When a checkpoint is created, + // (1) SST and blob files are hard linked if the output directory is on the + // same filesystem as the database, and copied otherwise. + // (2) other required files (like MANIFEST) are always copied. // log_size_for_flush: if the total log file size is equal or larger than // this value, then a flush is triggered for all the column families. The // default value is 0, which means flush is always triggered. If you move // away from the default, the checkpoint may not contain up-to-date data // if WAL writing is not always enabled. // Flush will always trigger if it is 2PC. 
+ // sequence_number_ptr: if it is not nullptr, the value it points to will be + // set to a sequence number guaranteed to be part of the DB, not necessarily + // the latest. The default value of this parameter is nullptr. + // NOTE: db_paths and cf_paths are not supported for creating checkpoints + // and NotSupported will be returned when the DB (without WALs) uses more + // than one directory. virtual Status CreateCheckpoint(const std::string& checkpoint_dir, - uint64_t log_size_for_flush = 0); + uint64_t log_size_for_flush = 0, + uint64_t* sequence_number_ptr = nullptr); // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,368 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include +#include + +#include "options/configurable_helper.h" +#include "rocksdb/convenience.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" + +namespace ROCKSDB_NAMESPACE { +// The FactoryFunc functions are used to create a new customizable object +// without going through the ObjectRegistry. This methodology is especially +// useful in LITE mode, where there is no ObjectRegistry. The methods take +// in an ID of the object to create and a pointer to store the created object. 
+// If the factory successfully recognized the input ID, the method should return +// success; otherwise false should be returned. On success, the object +// parameter contains the new object. +template +using SharedFactoryFunc = + std::function*)>; + +template +using UniqueFactoryFunc = + std::function*)>; + +template +using StaticFactoryFunc = std::function; + +// Creates a new shared customizable instance object based on the +// input parameters using the object registry. +// +// The id parameter specifies the instance class of the object to create. +// The opt_map parameter specifies the configuration of the new instance. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. 
+template +static Status NewSharedObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::shared_ptr* result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewSharedObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + result->reset(); + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new managed customizable instance object based on the +// input parameters using the object registry. Unlike "shared" objects, +// managed objects are limited to a single instance per ID. +// +// The id parameter specifies the instance class of the object to create. +// If an object with this id exists in the registry, the existing object +// will be returned. If the object does not exist, a new one will be created. +// +// The opt_map parameter specifies the configuration of the new instance. +// If the object already exists, the existing object is returned "as is" and +// this parameter is ignored. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the object. 
This string +// will be used by the object registry to locate the appropriate object to +// create or return. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The managed instance. +template +static Status NewManagedObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::shared_ptr* result) { + Status status; + if (!id.empty()) { +#ifndef ROCKSDB_LITE + status = config_options.registry->GetOrCreateManagedObject( + id, result, [config_options, opt_map](T* object) { + return object->ConfigureFromMap(config_options, opt_map); + }); +#else + (void)result; + (void)opt_map; + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + return Status::OK(); + } + } else { + status = Status::NotSupported("Cannot reset object "); + } + return status; +} + +// Creates a new shared Customizable object based on the input parameters. +// This method parses the input value to determine the type of instance to +// create. If there is an existing instance (in result) and it is the same ID +// as the object being created, the existing configuration is stored and used as +// the default for the new object. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. Id name-value +// pairs are specified, there should be an "id=value" pairing or an error may +// result. +// +// The config_options parameter controls the process and how errors are +// returned. 
If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadSharedObject(const ConfigOptions& config_options, + const std::string& value, + const SharedFactoryFunc& func, + std::shared_ptr* result) { + std::string id; + std::unordered_map opt_map; + + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewSharedObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } +} + +// Creates a new shared Customizable object based on the input parameters. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. Id name-value +// pairs are specified, there should be an "id=value" pairing or an error may +// result. +// +// The "id" field from the value (either the whole field or "id=XX") is used +// to determine the type/id of the object to return. 
For a given id, there +// the same instance of the object will be returned from this method (as opposed +// to LoadSharedObject which would create different objects for the same id. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadManagedObject(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, nullptr, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (value.empty()) { // No Id and no options. Clear the object + *result = nullptr; + return Status::OK(); + } else { + return NewManagedObject(config_options, id, opt_map, result); + } +} + +// Creates a new unique pointer customizable instance object based on the +// input parameters using the object registry. +// @see NewSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. 
+// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. +template +static Status NewUniqueObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::unique_ptr* result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewUniqueObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + result->reset(); + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new unique customizable instance object based on the input +// parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. 
+template +static Status LoadUniqueObject(const ConfigOptions& config_options, + const std::string& value, + const UniqueFactoryFunc& func, + std::unique_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewUniqueObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } +} + +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters using the object registry. +// @see NewSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. 
+template +static Status NewStaticObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, T** result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewStaticObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = + Customizable::ConfigureNewObject(config_options, *result, opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + *result = nullptr; + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. 
+template +static Status LoadStaticObject(const ConfigOptions& config_options, + const std::string& value, + const StaticFactoryFunc& func, T** result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, *result, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewStaticObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, *result, opt_map); + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h 2025-05-19 16:14:27.000000000 +0000 @@ -57,7 +57,7 @@ static Status Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, - DBWithTTL** dbptr, std::vector ttls, + DBWithTTL** dbptr, const std::vector& ttls, bool read_only = false); virtual void SetTtl(int32_t ttl) = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h 2025-05-19 16:14:27.000000000 +0000 @@ -76,7 +76,8 @@ // Store in *result the names of the children of the specified directory. // The names are relative to "dir". // Original contents of *results are dropped. 
- Status GetChildren(const std::string& dir, std::vector* result); + Status GetChildren(const std::string& dir, + std::vector* result) override; // Delete the named file. Status DeleteFile(const std::string& fname) override; @@ -116,18 +117,16 @@ // to go away. // // May create the named file if it does not already exist. - Status LockFile(const std::string& fname, FileLock** lock); + Status LockFile(const std::string& fname, FileLock** lock) override; // Release the lock acquired by a previous successful call to LockFile. // REQUIRES: lock was returned by a successful LockFile() call // REQUIRES: lock has not already been unlocked. - Status UnlockFile(FileLock* lock); + Status UnlockFile(FileLock* lock) override; // Get full directory name for this db. - Status GetAbsolutePath(const std::string& db_path, std::string* output_path); - - // Generate unique id - std::string GenerateUniqueId(); + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override; // Get default EnvLibrados static EnvLibrados* Default(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #include #include + #include #include #include @@ -16,6 +17,7 @@ #include #include +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/ldb_tool.h" @@ -30,6 +32,7 @@ public: // Command-line arguments static const std::string ARG_ENV_URI; + static const std::string ARG_FS_URI; static const std::string ARG_DB; static const std::string ARG_PATH; static const std::string ARG_SECONDARY_PATH; @@ -57,6 +60,7 @@ static const std::string ARG_FILE_SIZE; static const std::string 
ARG_CREATE_IF_MISSING; static const std::string ARG_NO_VALUE; + static const std::string ARG_DISABLE_CONSISTENCY_CHECKS; struct ParsedParams { std::string cmd; @@ -75,13 +79,17 @@ SelectCommand); static LDBCommand* InitFromCmdLineArgs( - int argc, char** argv, const Options& options, + int argc, char const* const* argv, const Options& options, const LDBOptions& ldb_options, const std::vector* column_families); bool ValidateCmdLineOptions(); - virtual Options PrepareOptionsForOpenDB(); + virtual void PrepareOptions(); + + virtual void OverrideBaseOptions(); + + virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts); virtual void SetDBOptions(Options options) { options_ = options; } @@ -130,6 +138,7 @@ protected: LDBCommandExecuteResult exec_state_; std::string env_uri_; + std::string fs_uri_; std::string db_path_; // If empty, open DB as primary. If non-empty, open the DB as secondary // with this secondary path. When running against a database opened by @@ -161,7 +170,8 @@ // If true, try to construct options from DB's option files. bool try_load_options_; - bool ignore_unknown_options_; + // The value passed to options.force_consistency_checks. + bool force_consistency_checks_; bool create_if_missing_; @@ -237,6 +247,7 @@ Options options_; std::vector column_families_; + ConfigOptions config_options_; LDBOptions ldb_options_; private: @@ -264,11 +275,13 @@ class LDBCommandRunner { public: - static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name); + static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name, + bool to_stderr = true); // Returns the status code to return. 0 is no error. 
static int RunCommand( - int argc, char** argv, Options options, const LDBOptions& ldb_options, + int argc, char const* const* argv, Options options, + const LDBOptions& ldb_options, const std::vector* column_families); }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,10 @@ // #pragma once +#include + +#include "rocksdb/rocksdb_namespace.h" + #ifdef FAILED #undef FAILED #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,7 @@ #include +#include "rocksdb/compression_type.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { @@ -23,8 +24,6 @@ struct Options; class Snapshot; -enum CompressionType : unsigned char; - // Options to control the behavior of a database (passed to // DB::Open). 
A LevelDBOptions object can be initialized as though // it were a LevelDB Options object, and then it can be converted into diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,83 +8,302 @@ #ifndef ROCKSDB_LITE #include +#include #include -#include +#include #include #include #include + #include "rocksdb/status.h" +#include "rocksdb/utilities/regex.h" namespace ROCKSDB_NAMESPACE { +class Customizable; class Logger; +class ObjectLibrary; + // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template using FactoryFunc = std::function*, std::string*)>; +// The signature of the function for loading factories +// into an object library. This method is expected to register +// factory functions in the supplied ObjectLibrary. +// The ObjectLibrary is the library in which the factories will be loaded. +// The std::string is the argument passed to the loader function. +// The RegistrarFunc should return the number of objects loaded into this +// library +using RegistrarFunc = std::function; + +template +using ConfigureFunc = std::function; + class ObjectLibrary { - public: + private: // Base class for an Entry in the Registry. class Entry { public: virtual ~Entry() {} - Entry(const std::string& name) : name_(std::move(name)) {} + virtual bool Matches(const std::string& target) const = 0; + virtual const char* Name() const = 0; + }; - // Checks to see if the target matches this entry - virtual bool matches(const std::string& target) const { - return name_ == target; + // A class that implements an Entry based on Regex. 
+ // + // WARNING: some regexes are problematic for std::regex; see + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582 for example + // + // This class is deprecated and will be removed in a future release + class RegexEntry : public Entry { + public: + explicit RegexEntry(const std::string& name) : name_(name) { + Regex::Parse(name, ®ex_).PermitUncheckedError(); + } + + bool Matches(const std::string& target) const override { + return regex_.Matches(target); } - const std::string& Name() const { return name_; } + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + Regex regex_; // The pattern for this entry + }; + public: + // Class for matching target strings to a pattern. + // Entries consist of a name that starts the pattern and attributes + // The following attributes can be added to the entry: + // -Suffix: Comparable to name(suffix) + // -Separator: Comparable to name(separator).+ or name(separator).* + // -Number: Comparable to name(separator).[0-9]+ + // -AltName: Comparable to (name|alt) + // -Optional: Comparable to name(separator)? + // Multiple separators can be combined and cause multiple matches. + // For example, Pattern("A").AnotherName("B").AddSeparator("@").AddNumber("#") + // is roughly equivalent to "(A|B)@.+#.+" + // + // Note that though this class does provide some regex-style matching, + // it is not a full regex parser and has some key differences: + // - Separators are matched left-most. For example, an entry + // Name("Hello").AddSeparator(" ").AddSuffix("!") would match + // "Hello world!", but not "Hello world!!" 
+ // - No backtracking is necessary, enabling reliably efficient matching + class PatternEntry : public Entry { private: - const std::string name_; // The name of the Entry - }; // End class Entry + enum Quantifier { + kMatchZeroOrMore, // [suffix].* + kMatchAtLeastOne, // [suffix].+ + kMatchExact, // [suffix] + kMatchNumeric, // [suffix][0-9]+ + }; + + public: + // Short-cut for creating an entry that matches to a + // Customizable::IndividualId + static PatternEntry AsIndividualId(const std::string& name) { + PatternEntry entry(name, true); + entry.AddSeparator("@"); + entry.AddSeparator("#"); + return entry; + } + + // Creates a new PatternEntry for "name". If optional is true, + // Matches will also return true if name==target + explicit PatternEntry(const std::string& name, bool optional = true) + : name_(name), optional_(optional), slength_(0) { + nlength_ = name_.size(); + } + + // Adds a suffix (exact match of separator with no trailing characters) to + // the separator + PatternEntry& AddSuffix(const std::string& suffix) { + separators_.emplace_back(suffix, kMatchExact); + slength_ += suffix.size(); + return *this; + } + // Adds a separator (exact match of separator with trailing characters) to + // the entry + // If at_least_one is true, the separator must be followed by at least + // one character (e.g. separator.+). + // If at_least_one is false, the separator may be followed by zero or + // more characters (e.g. separator.*). 
+ PatternEntry& AddSeparator(const std::string& separator, + bool at_least_one = true) { + slength_ += separator.size(); + if (at_least_one) { + separators_.emplace_back(separator, kMatchAtLeastOne); + ++slength_; + } else { + separators_.emplace_back(separator, kMatchZeroOrMore); + } + return *this; + } + + // Adds a separator (exact match of separator with trailing numbers) to the + // entry + PatternEntry& AddNumber(const std::string& separator) { + separators_.emplace_back(separator, kMatchNumeric); + slength_ += separator.size() + 1; + return *this; + } + + // Sets another name that this entry will match, similar to (name|alt) + PatternEntry& AnotherName(const std::string& alt) { + names_.emplace_back(alt); + return *this; + } + + // Sets whether the separators are required -- similar to name(separator)? + // If optional is true, then name(separator)? would match + // If optional is false, then the separators must also match + PatternEntry& SetOptional(bool optional) { + optional_ = optional; + return *this; + } + + // Checks to see if the target matches this entry + bool Matches(const std::string& target) const override; + const char* Name() const override { return name_.c_str(); } + + private: + size_t MatchSeparatorAt(size_t start, Quantifier mode, + const std::string& target, size_t tlen, + const std::string& pattern) const; + + bool MatchesTarget(const std::string& name, size_t nlen, + const std::string& target, size_t ylen) const; + std::string name_; // The base name for this entry + size_t nlength_; // The length of name_ + std::vector names_; // Alternative names for this entry + bool optional_; // Whether matching of separators is required + size_t slength_; // The minimum required length to match the separators + std::vector> + separators_; // What to match + }; // End class Entry + + private: // An Entry containing a FactoryFunc for creating new Objects template class FactoryEntry : public Entry { public: - FactoryEntry(const std::string& name, 
FactoryFunc f) - : Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {} - ~FactoryEntry() override {} - bool matches(const std::string& target) const override { - return std::regex_match(target, pattern_); + FactoryEntry(Entry* e, FactoryFunc f) + : entry_(e), factory_(std::move(f)) {} + bool Matches(const std::string& target) const override { + return entry_->Matches(target); } + const char* Name() const override { return entry_->Name(); } + // Creates a new T object. T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, std::string* msg) const { return factory_(target, guard, msg); } + const FactoryFunc& GetFactory() const { return factory_; } private: - std::regex pattern_; // The pattern for this entry + std::unique_ptr entry_; // What to match for this entry FactoryFunc factory_; }; // End class FactoryEntry public: - // Finds the entry matching the input name and type - const Entry* FindEntry(const std::string& type, - const std::string& name) const; + explicit ObjectLibrary(const std::string& id) { id_ = id; } + + const std::string& GetID() const { return id_; } + + // Finds the factory function for the input target. + // @see PatternEntry for the matching rules to target + // @return If matched, the FactoryFunc for this target, else nullptr + template + FactoryFunc FindFactory(const std::string& target) const { + std::unique_lock lock(mu_); + auto factories = factories_.find(T::Type()); + if (factories != factories_.end()) { + for (const auto& e : factories->second) { + if (e->Matches(target)) { + const auto* fe = + static_cast*>(e.get()); + return fe->GetFactory(); + } + } + } + return nullptr; + } + + // Returns the total number of factories registered for this library. + // This method returns the sum of all factories registered for all types. + // @param num_types returns how many unique types are registered. 
+ size_t GetFactoryCount(size_t* num_types) const; + void Dump(Logger* logger) const; - // Registers the factory with the library for the pattern. + // Registers the factory with the library for the regular expression pattern. // If the pattern matches, the factory may be used to create a new object. + // + // WARNING: some regexes are problematic for std::regex; see + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582 for example + // + // Deprecated. Will be removed in a major release. Code should use AddFactory + // instead. template const FactoryFunc& Register(const std::string& pattern, const FactoryFunc& factory) { - std::unique_ptr entry(new FactoryEntry(pattern, factory)); - AddEntry(T::Type(), entry); + std::unique_ptr entry( + new FactoryEntry(new RegexEntry(pattern), factory)); + AddFactoryEntry(T::Type(), std::move(entry)); return factory; } + + // Registers the factory with the library for the name. + // If name==target, the factory may be used to create a new object. + template + const FactoryFunc& AddFactory(const std::string& name, + const FactoryFunc& func) { + std::unique_ptr entry( + new FactoryEntry(new PatternEntry(name), func)); + AddFactoryEntry(T::Type(), std::move(entry)); + return func; + } + + // Registers the factory with the library for the entry. + // If the entry matches the target, the factory may be used to create a new + // object. + // @see PatternEntry for the matching rules. + template + const FactoryFunc& AddFactory(const PatternEntry& entry, + const FactoryFunc& func) { + std::unique_ptr factory( + new FactoryEntry(new PatternEntry(entry), func)); + AddFactoryEntry(T::Type(), std::move(factory)); + return func; + } + + // Invokes the registrar function with the supplied arg for this library. 
+ int Register(const RegistrarFunc& registrar, const std::string& arg) { + return registrar(*this, arg); + } + // Returns the default ObjectLibrary static std::shared_ptr& Default(); private: - // Adds the input entry to the list for the given type - void AddEntry(const std::string& type, std::unique_ptr& entry); + void AddFactoryEntry(const char* type, std::unique_ptr&& entry) { + std::unique_lock lock(mu_); + auto& factories = factories_[type]; + factories.emplace_back(std::move(entry)); + } + // Protects the entry map + mutable std::mutex mu_; // ** FactoryFunctions for this loader, organized by type - std::unordered_map>> entries_; + std::unordered_map>> + factories_; + + // The name for this library + std::string id_; }; // The ObjectRegistry is used to register objects that can be created by a @@ -93,30 +312,46 @@ class ObjectRegistry { public: static std::shared_ptr NewInstance(); - - ObjectRegistry(); + static std::shared_ptr NewInstance( + const std::shared_ptr& parent); + static std::shared_ptr Default(); + explicit ObjectRegistry(const std::shared_ptr& parent) + : parent_(parent) {} + + std::shared_ptr AddLibrary(const std::string& id) { + auto library = std::make_shared(id); + AddLibrary(library); + return library; + } void AddLibrary(const std::shared_ptr& library) { - libraries_.emplace_back(library); + std::unique_lock lock(library_mutex_); + libraries_.push_back(library); + } + + void AddLibrary(const std::string& id, const RegistrarFunc& registrar, + const std::string& arg) { + auto library = AddLibrary(id); + library->Register(registrar, arg); } - // Creates a new T using the factory function that was registered with a - // pattern that matches the provided "target" string according to - // std::regex_match. + // Creates a new T using the factory function that was registered for this + // target. Searches through the libraries to find the first library where + // there is an entry that matches target (see PatternEntry for the matching + // rules). 
// // If no registered functions match, returns nullptr. If multiple functions // match, the factory function used is unspecified. // - // Populates res_guard with result pointer if caller is granted ownership. + // Populates guard with result pointer if caller is granted ownership. + // Deprecated. Use NewShared/Static/UniqueObject instead. template T* NewObject(const std::string& target, std::unique_ptr* guard, std::string* errmsg) { guard->reset(); - const auto* basic = FindEntry(T::Type(), target); - if (basic != nullptr) { - const auto* factory = - static_cast*>(basic); - return factory->NewFactoryObject(target, guard, errmsg); + auto factory = FindFactory(target); + if (factory != nullptr) { + return factory(target, guard, errmsg); } else { *errmsg = std::string("Could not load ") + T::Type(); return nullptr; @@ -125,7 +360,7 @@ // Creates a new unique T using the input factory functions. // Returns OK if a new unique T was successfully created - // Returns NotFound if the type/target could not be created + // Returns NotSupported if the type/target could not be created // Returns InvalidArgument if the factory return an unguarded object // (meaning it cannot be managed by a unique ptr) template @@ -134,7 +369,7 @@ std::string errmsg; T* ptr = NewObject(target, result, &errmsg); if (ptr == nullptr) { - return Status::NotFound(errmsg, target); + return Status::NotSupported(errmsg, target); } else if (*result) { return Status::OK(); } else { @@ -146,7 +381,7 @@ // Creates a new shared T using the input factory functions. 
// Returns OK if a new shared T was successfully created - // Returns NotFound if the type/target could not be created + // Returns NotSupported if the type/target could not be created // Returns InvalidArgument if the factory return an unguarded object // (meaning it cannot be managed by a shared ptr) template @@ -156,7 +391,7 @@ std::unique_ptr guard; T* ptr = NewObject(target, &guard, &errmsg); if (ptr == nullptr) { - return Status::NotFound(errmsg, target); + return Status::NotSupported(errmsg, target); } else if (guard) { result->reset(guard.release()); return Status::OK(); @@ -169,7 +404,7 @@ // Creates a new static T using the input factory functions. // Returns OK if a new static T was successfully created - // Returns NotFound if the type/target could not be created + // Returns NotSupported if the type/target could not be created // Returns InvalidArgument if the factory return a guarded object // (meaning it is managed by a unique ptr) template @@ -178,7 +413,7 @@ std::unique_ptr guard; T* ptr = NewObject(target, &guard, &errmsg); if (ptr == nullptr) { - return Status::NotFound(errmsg, target); + return Status::NotSupported(errmsg, target); } else if (guard.get()) { return Status::InvalidArgument(std::string("Cannot make a static ") + T::Type() + " from a guarded one ", @@ -189,17 +424,167 @@ } } + // Sets the object for the given id/type to be the input object + // If the registry does not contain this id/type, the object is added and OK + // is returned. If the registry contains a different object, an error is + // returned. If the registry contains the input object, OK is returned. 
+ template + Status SetManagedObject(const std::shared_ptr& object) { + assert(object != nullptr); + return SetManagedObject(object->GetId(), object); + } + + template + Status SetManagedObject(const std::string& id, + const std::shared_ptr& object) { + const auto c = std::static_pointer_cast(object); + return SetManagedObject(T::Type(), id, c); + } + + // Returns the object for the given id, if one exists. + // If the object is not found in the registry, a nullptr is returned + template + std::shared_ptr GetManagedObject(const std::string& id) const { + auto c = GetManagedObject(T::Type(), id); + return std::static_pointer_cast(c); + } + + // Returns the set of managed objects found in the registry matching + // the input type and ID. + // If the input id is not empty, then only objects of that class + // (IsInstanceOf(id)) will be returned (for example, only return LRUCache + // objects) If the input id is empty, then all objects of that type (all Cache + // objects) + template + Status ListManagedObjects(const std::string& id, + std::vector>* results) const { + std::vector> customizables; + results->clear(); + Status s = ListManagedObjects(T::Type(), id, &customizables); + if (s.ok()) { + for (const auto& c : customizables) { + results->push_back(std::static_pointer_cast(c)); + } + } + return s; + } + + template + Status ListManagedObjects(std::vector>* results) const { + return ListManagedObjects("", results); + } + + // Creates a new ManagedObject in the registry for the id if one does not + // currently exist. If an object with that ID already exists, the current + // object is returned. + // + // The ID is the identifier of the object to be returned/created and returned + // in result + // If a new object is created (using the object factories), the cfunc + // parameter will be invoked to configure the new object. 
+ template + Status GetOrCreateManagedObject(const std::string& id, + std::shared_ptr* result, + const ConfigureFunc& cfunc = nullptr) { + if (parent_ != nullptr) { + auto object = parent_->GetManagedObject(T::Type(), id); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + { + std::unique_lock lock(objects_mutex_); + auto key = ToManagedObjectKey(T::Type(), id); + auto iter = managed_objects_.find(key); + if (iter != managed_objects_.end()) { + auto object = iter->second.lock(); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + std::shared_ptr object; + Status s = NewSharedObject(id, &object); + if (s.ok() && cfunc != nullptr) { + s = cfunc(object.get()); + } + if (s.ok()) { + auto c = std::static_pointer_cast(object); + if (id != c->Name()) { + // If the ID is not the base name of the class, add the new + // object under the input ID + managed_objects_[key] = c; + } + if (id != c->GetId() && c->GetId() != c->Name()) { + // If the input and current ID do not match, and the + // current ID is not the base bame, add the new object under + // its new ID + key = ToManagedObjectKey(T::Type(), c->GetId()); + managed_objects_[key] = c; + } + *result = object; + } + return s; + } + } + // Dump the contents of the registry to the logger void Dump(Logger* logger) const; private: - const ObjectLibrary::Entry* FindEntry(const std::string& type, - const std::string& name) const; + explicit ObjectRegistry(const std::shared_ptr& library) { + libraries_.push_back(library); + } + static std::string ToManagedObjectKey(const std::string& type, + const std::string& id) { + return type + "://" + id; + } + + // Returns the Customizable managed object associated with the key (Type/ID). + // If not found, nullptr is returned. 
+ std::shared_ptr GetManagedObject(const std::string& type, + const std::string& id) const; + Status ListManagedObjects( + const std::string& type, const std::string& pattern, + std::vector>* results) const; + // Sets the managed object associated with the key (Type/ID) to c. + // If the named managed object does not exist, the object is added and OK is + // returned If the object exists and is the same as c, OK is returned + // Otherwise, an error status is returned. + Status SetManagedObject(const std::string& type, const std::string& id, + const std::shared_ptr& c); + + // Searches (from back to front) the libraries looking for the + // factory that matches this name. + // Returns the factory if it is found, and nullptr otherwise + template + const FactoryFunc FindFactory(const std::string& name) const { + { + std::unique_lock lock(library_mutex_); + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); + ++iter) { + const auto factory = iter->get()->FindFactory(name); + if (factory != nullptr) { + return factory; + } + } + } + if (parent_ == nullptr) { + return nullptr; + } else { + return parent_->FindFactory(name); + } + } // The set of libraries to search for factories for this registry. // The libraries are searched in reverse order (back to front) when // searching for entries. 
std::vector> libraries_; + std::map> managed_objects_; + std::shared_ptr parent_; + mutable std::mutex objects_mutex_; // Mutex for managed objects + mutable std::mutex library_mutex_; // Mutex for managed libraries }; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -51,6 +51,8 @@ uint32_t occ_lock_buckets = (1 << 20); }; +// Range deletions (including those in `WriteBatch`es passed to `Write()`) are +// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status` class OptimisticTransactionDB : public StackableDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,946 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class OptionTypeInfo; + +// The underlying "class/type" of the option. +// This enum is used to determine how the option should +// be converted to/from strings and compared. +enum class OptionType { + kBoolean, + kInt, + kInt32T, + kInt64T, + kUInt, + kUInt8T, + kUInt32T, + kUInt64T, + kSizeT, + kString, + kDouble, + kCompactionStyle, + kCompactionPri, + kCompressionType, + kCompactionStopStyle, + kFilterPolicy, + kChecksumType, + kEncodingType, + kEnv, + kEnum, + kStruct, + kVector, + kConfigurable, + kCustomizable, + kEncodedString, + kUnknown, +}; + +enum class OptionVerificationType { + kNormal, + kByName, // The option is pointer typed so we can only verify + // based on it's name. + kByNameAllowNull, // Same as kByName, but it also allows the case + // where one of them is a nullptr. + kByNameAllowFromNull, // Same as kByName, but it also allows the case + // where the old option is nullptr. + kDeprecated, // The option is no longer used in rocksdb. The RocksDB + // OptionsParser will still accept this option if it + // happen to exists in some Options file. However, + // the parser will not include it in serialization + // and verification processes. + kAlias, // This option represents is a name/shortcut for + // another option and should not be written or verified + // independently +}; + +// A set of modifier flags used to alter how an option is evaluated or +// processed. These flags can be combined together (e.g. kMutable | kShared). +// The kCompare flags can be used to control if/when options are compared. 
+// If kCompareNever is set, two related options would never be compared (always +// equal) If kCompareExact is set, the options will only be compared if the +// sanity mode +// is exact +// kMutable means the option can be changed after it is prepared +// kShared means the option is contained in a std::shared_ptr +// kUnique means the option is contained in a std::uniqued_ptr +// kRawPointer means the option is a raw pointer value. +// kAllowNull means that an option is allowed to be null for verification +// purposes. +// kDontSerialize means this option should not be serialized and included in +// the string representation. +// kDontPrepare means do not call PrepareOptions for this pointer value. +enum class OptionTypeFlags : uint32_t { + kNone = 0x00, // No flags + kCompareDefault = 0x0, + kCompareNever = ConfigOptions::kSanityLevelNone, + kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible, + kCompareExact = ConfigOptions::kSanityLevelExactMatch, + + kMutable = 0x0100, // Option is mutable + kRawPointer = 0x0200, // The option is stored as a raw pointer + kShared = 0x0400, // The option is stored as a shared_ptr + kUnique = 0x0800, // The option is stored as a unique_ptr + kAllowNull = 0x1000, // The option can be null + kDontSerialize = 0x2000, // Don't serialize the option + kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kStringNameOnly = 0x8000, // The option serializes to a name only +}; + +inline OptionTypeFlags operator|(const OptionTypeFlags &a, + const OptionTypeFlags &b) { + return static_cast(static_cast(a) | + static_cast(b)); +} + +inline OptionTypeFlags operator&(const OptionTypeFlags &a, + const OptionTypeFlags &b) { + return static_cast(static_cast(a) & + static_cast(b)); +} + +// Converts an string into its enumerated value. 
+// @param type_map Mapping between strings and enum values +// @param type The string representation of the enum +// @param value Returns the enum value represented by the string +// @return true if the string was found in the enum map, false otherwise. +template +bool ParseEnum(const std::unordered_map& type_map, + const std::string& type, T* value) { + auto iter = type_map.find(type); + if (iter != type_map.end()) { + *value = iter->second; + return true; + } + return false; +} + +// Converts an enum into its string representation. +// @param type_map Mapping between strings and enum values +// @param type The enum +// @param value Returned as the string representation of the enum +// @return true if the enum was found in the enum map, false otherwise. +template +bool SerializeEnum(const std::unordered_map& type_map, + const T& type, std::string* value) { + for (const auto& pair : type_map) { + if (pair.second == type) { + *value = pair.first; + return true; + } + } + return false; +} + +template +Status ParseVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::vector* result); + +template +Status SerializeVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::vector& vec, + std::string* value); +template +bool VectorsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::vector& vec1, const std::vector& vec2, + std::string* mismatch); + +// Function for converting a option string value into its underlying +// representation in "addr" +// On success, Status::OK is returned and addr is set to the parsed form +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the value is parsed +// @param name The name of the options being parsed +// @param value The string representation of the 
option +// @param addr Pointer to the object +using ParseFunc = std::function; + +// Function for converting an option "addr" into its string representation. +// On success, Status::OK is returned and value is the serialized form. +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the values are serialized +// @param name The name of the options being serialized +// @param addr Pointer to the value being serialized +// @param value The result of the serialization. +using SerializeFunc = std::function; + +// Function for comparing two option values +// If they are not equal, updates "mismatch" with the name of the bad option +// @param opts The ConfigOptions controlling how the values are compared +// @param name The name of the options being compared +// @param addr1 The first address to compare +// @param addr2 The address to compare to +// @param mismatch If the values are not equal, the name of the option that +// first differs +using EqualsFunc = std::function; + +// A struct for storing constant option information such as option name, +// option type, and offset. 
+class OptionTypeInfo { + public: + // A simple "normal", non-mutable Type "type" at offset + OptionTypeInfo(int offset, OptionType type) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(OptionVerificationType::kNormal), + flags_(OptionTypeFlags::kNone) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(serialize_func), + equals_func_(equals_func), + type_(type), + verification_(verification), + flags_(flags) {} + + // Creates an OptionTypeInfo for an enum type. Enums use an additional + // map to convert the enums to/from their string representation. + // To create an OptionTypeInfo that is an Enum, one should: + // - Create a static map of string values to the corresponding enum value + // - Call this method passing the static map in as a parameter. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. 
+ // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + template + static OptionTypeInfo Enum( + int offset, const std::unordered_map* const map, + OptionTypeFlags flags = OptionTypeFlags::kNone) { + return OptionTypeInfo( + offset, OptionType::kEnum, OptionVerificationType::kNormal, flags, + // Uses the map argument to convert the input string into + // its corresponding enum value. If value is found in the map, + // addr is updated to the corresponding map entry. + // @return OK if the value is found in the map + // @return InvalidArgument if the value is not found in the map + [map](const ConfigOptions&, const std::string& name, + const std::string& value, void* addr) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (ParseEnum(*map, value, static_cast(addr))) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }, + // Uses the map argument to convert the input enum into + // its corresponding string value. If enum value is found in the map, + // value is updated to the corresponding string value in the map. + // @return OK if the enum is found in the map + // @return InvalidArgument if the enum is not found in the map + [map](const ConfigOptions&, const std::string& name, const void* addr, + std::string* value) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (SerializeEnum(*map, (*static_cast(addr)), + value)) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }, + // Casts addr1 and addr2 to the enum type and returns true if + // they are equal, false otherwise. 
+ [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + return (*static_cast(addr1) == + *static_cast(addr2)); + }); + } // End OptionTypeInfo::Enum + + // Creates an OptionTypeInfo for a Struct type. Structs have a + // map of string-OptionTypeInfo associated with them that describes how + // to process the object for parsing, serializing, and matching. + // Structs also have a struct_name, which is the name of the object + // as registered in the parent map. + // When processing a struct, the option name can be specified as: + // - Meaning to process the entire struct. + // - Meaning to process the single field + // - Process the single fields + // The CompactionOptionsFIFO, CompactionOptionsUniversal, and LRUCacheOptions + // are all examples of Struct options. + // + // To create an OptionTypeInfo that is a Struct, one should: + // - Create a static map of string-OptionTypeInfo corresponding to the + // properties of the object that can be set via the options. + // - Call this method passing the name and map in as parameters. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. 
+ // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + static OptionTypeInfo Struct( + const std::string& struct_name, + const std::unordered_map* struct_map, + int offset, OptionVerificationType verification, OptionTypeFlags flags) { + return OptionTypeInfo( + offset, OptionType::kStruct, verification, flags, + // Parses the struct and updates the fields at addr + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + return ParseStruct(opts, struct_name, struct_map, name, value, addr); + }, + // Serializes the struct options into value + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr, + std::string* value) { + return SerializeStruct(opts, struct_name, struct_map, name, addr, + value); + }, + // Compares the struct fields of addr1 and addr2 for equality + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + return StructsAreEqual(opts, struct_name, struct_map, name, addr1, + addr2, mismatch); + }); + } + static OptionTypeInfo Struct( + const std::string& struct_name, + const std::unordered_map* struct_map, + int offset, OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) { + return OptionTypeInfo( + offset, OptionType::kStruct, verification, flags, parse_func, + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr, + std::string* value) { + return SerializeStruct(opts, struct_name, struct_map, name, addr, + value); + }, + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + return StructsAreEqual(opts, struct_name, struct_map, name, addr1, + addr2, mismatch); + }); + } + + template + static OptionTypeInfo 
Vector(int _offset, + OptionVerificationType _verification, + OptionTypeFlags _flags, + const OptionTypeInfo& elem_info, + char separator = ':') { + return OptionTypeInfo( + _offset, OptionType::kVector, _verification, _flags, + [elem_info, separator](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + auto result = static_cast*>(addr); + return ParseVector(opts, elem_info, separator, name, value, + result); + }, + [elem_info, separator](const ConfigOptions& opts, + const std::string& name, const void* addr, + std::string* value) { + const auto& vec = *(static_cast*>(addr)); + return SerializeVector(opts, elem_info, separator, name, vec, + value); + }, + [elem_info](const ConfigOptions& opts, const std::string& name, + const void* addr1, const void* addr2, + std::string* mismatch) { + const auto& vec1 = *(static_cast*>(addr1)); + const auto& vec2 = *(static_cast*>(addr2)); + return VectorsAreEqual(opts, elem_info, name, vec1, vec2, + mismatch); + }); + } + + // Create a new std::shared_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::shared_ptr object. 
+ // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomSharedPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto* shared = static_cast*>(addr); + if (name == kIdPropName() && value.empty()) { + shared->reset(); + return Status::OK(); + } else { + return T::CreateFromString(opts, value, shared); + } + }, + serialize_func, equals_func); + } + + // Create a new std::unique_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::unique_ptr object. 
+ // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomUniquePtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kUnique, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto* unique = static_cast*>(addr); + if (name == kIdPropName() && value.empty()) { + unique->reset(); + return Status::OK(); + } else { + return T::CreateFromString(opts, value, unique); + } + }, + serialize_func, equals_func); + } + + // Create a new Customizable* OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // T object. 
+ // + // @param _offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomRawPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto** pointer = static_cast(addr); + if (name == kIdPropName() && value.empty()) { + *pointer = nullptr; + return Status::OK(); + } else { + return T::CreateFromString(opts, value, pointer); + } + }, + serialize_func, equals_func); + } + + bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; } + + bool IsEditable(const ConfigOptions& opts) const { + if (opts.mutable_options_only) { + return IsMutable(); + } else { + return true; + } + } + bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); } + + bool IsDeprecated() const { + return IsEnabled(OptionVerificationType::kDeprecated); + } + + // Returns true if the option is marked as an Alias. + // Aliases are valid options that are parsed but are not converted to strings + // or compared. + bool IsAlias() const { return IsEnabled(OptionVerificationType::kAlias); } + + bool IsEnabled(OptionVerificationType ovf) const { + return verification_ == ovf; + } + + // Returns the sanity level for comparing the option. 
+ // If the options should not be compared, returns None + // If the option has a compare flag, returns it. + // Otherwise, returns "exact" + ConfigOptions::SanityLevel GetSanityLevel() const { + if (IsDeprecated() || IsAlias()) { + return ConfigOptions::SanityLevel::kSanityLevelNone; + } else { + auto match = (flags_ & OptionTypeFlags::kCompareExact); + if (match == OptionTypeFlags::kCompareDefault) { + return ConfigOptions::SanityLevel::kSanityLevelExactMatch; + } else { + return (ConfigOptions::SanityLevel)match; + } + } + } + + // Returns true if the option should be serialized. + // Options should be serialized if the are not deprecated, aliases, + // or marked as "Don't Serialize". + bool ShouldSerialize() const { + if (IsDeprecated() || IsAlias()) { + return false; + } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { + return false; + } else { + return true; + } + } + + // Returns true if the option is allowed to be null. + // Options can be null if the verification type is allow from null + // or if the flags specify allow null. 
+ bool CanBeNull() const { + return (IsEnabled(OptionTypeFlags::kAllowNull) || + IsEnabled(OptionVerificationType::kByNameAllowNull) || + IsEnabled(OptionVerificationType::kByNameAllowFromNull)); + } + + bool IsSharedPtr() const { return IsEnabled(OptionTypeFlags::kShared); } + + bool IsUniquePtr() const { return IsEnabled(OptionTypeFlags::kUnique); } + + bool IsRawPtr() const { return IsEnabled(OptionTypeFlags::kRawPointer); } + + bool IsByName() const { + return (verification_ == OptionVerificationType::kByName || + verification_ == OptionVerificationType::kByNameAllowNull || + verification_ == OptionVerificationType::kByNameAllowFromNull); + } + + bool IsStruct() const { return (type_ == OptionType::kStruct); } + + bool IsConfigurable() const { + return (type_ == OptionType::kConfigurable || + type_ == OptionType::kCustomizable); + } + + bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); } + + // Returns the underlying pointer for the type at base_addr + // The value returned is the underlying "raw" pointer, offset from base. + template + const T* AsRawPointer(const void* const base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + const void* opt_addr = static_cast(base_addr) + offset_; + if (IsUniquePtr()) { + const std::unique_ptr* ptr = + static_cast*>(opt_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + const std::shared_ptr* ptr = + static_cast*>(opt_addr); + return ptr->get(); + } else if (IsRawPtr()) { + const T* const* ptr = static_cast(opt_addr); + return *ptr; + } else { + return static_cast(opt_addr); + } + } + + // Returns the underlying pointer for the type at base_addr + // The value returned is the underlying "raw" pointer, offset from base. 
+ template + T* AsRawPointer(void* base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + void* opt_addr = static_cast(base_addr) + offset_; + if (IsUniquePtr()) { + std::unique_ptr* ptr = static_cast*>(opt_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + std::shared_ptr* ptr = static_cast*>(opt_addr); + return ptr->get(); + } else if (IsRawPtr()) { + T** ptr = static_cast(opt_addr); + return *ptr; + } else { + return static_cast(opt_addr); + } + } + + // Parses the option in "opt_value" according to the rules of this class + // and updates the value at "opt_ptr". + // On success, Status::OK() is returned. On failure: + // NotFound means the opt_name is not valid for this option + // NotSupported means we do not know how to parse the value for this option + // InvalidArgument means the opt_value is not valid for this option. + Status Parse(const ConfigOptions& config_options, const std::string& opt_name, + const std::string& opt_value, void* const opt_ptr) const; + + // Serializes the option in "opt_addr" according to the rules of this class + // into the value at "opt_value". + Status Serialize(const ConfigOptions& config_options, + const std::string& opt_name, const void* const opt_ptr, + std::string* opt_value) const; + + // Compares the "addr1" and "addr2" values according to the rules of this + // class and returns true if they match. On a failed match, mismatch is the + // name of the option that failed to match. + bool AreEqual(const ConfigOptions& config_options, + const std::string& opt_name, const void* const addr1, + const void* const addr2, std::string* mismatch) const; + + // Used to override the match rules for "ByName" options. 
+ bool AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr) const; + bool AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, const void* const this_ptr, + const std::string& that_value) const; + + // Parses the input opts_map according to the type_map for the opt_addr + // For each name-value pair in opts_map, find the corresponding name in + // type_map If the name is found: + // - set the corresponding value in opt_addr, returning the status on + // failure; + // If the name is not found: + // - If unused is specified, add the name-value to unused and continue + // - If ingore_unknown_options is false, return NotFound + // Returns OK if all options were either: + // - Successfully set + // - options were not found and ignore_unknown_options=true + // - options were not found and unused was specified + // Note that this method is much less sophisticated than the comparable + // Configurable::Configure methods. For example, on error, there is no + // attempt to return opt_addr to the initial state. Additionally, there + // is no effort to initialize (Configurable::PrepareOptions) the object + // on success. This method should typically only be used for simpler, + // standalone structures and not those that contain shared and embedded + // objects. + static Status ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + static Status ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + + // Parses the input value according to the map for the struct at opt_addr + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. 
This may + // be the whole struct or a sub-element of it, based on struct_name and + // opt_name. + static Status ParseStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const std::string& value, void* opt_addr); + + // Serializes the values from opt_addr using the rules in type_map. + // Returns the serialized form in result. + // Returns OK on success or non-OK if some option could not be serialized. + static Status SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* value); + + // Serializes the input addr according to the map for the struct to value. + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. This may + // be the whole struct or a sub-element of it + static Status SerializeStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const void* opt_addr, std::string* value); + + // Compares the values in this_addr and that_addr using the rules in type_map. + // If the values are equal, returns true + // If the values are not equal, returns false and sets mismatch to the name + // of the first value that did not match. + static bool TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& map, + const void* this_addr, const void* that_addr, std::string* mismatch); + + // Compares the input offsets according to the map for the struct and returns + // true if they are equivalent, false otherwise. + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. 
This may + // be the whole struct or a sub-element of it + static bool StructsAreEqual( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const void* this_offset, + const void* that_offset, std::string* mismatch); + + // Finds the entry for the opt_name in the opt_map, returning + // nullptr if not found. + // If found, elem_name will be the name of option to find. + // This may be opt_name, or a substring of opt_name. + // For "simple" options, opt_name will be equal to elem_name. Given the + // opt_name "opt", elem_name will equal "opt". + // For "embedded" options (like structs), elem_name may be opt_name + // or a field within the opt_name. For example, given the struct "struct", + // and opt_name of "struct.field", elem_name will be "field" + static const OptionTypeInfo* Find( + const std::string& opt_name, + const std::unordered_map& opt_map, + std::string* elem_name); + + // Returns the next token marked by the delimiter from "opts" after start in + // token and updates end to point to where that token stops. Delimiters inside + // of braces are ignored. Returns OK if a token is found and an error if the + // input opts string is mis-formatted. + // Given "a=AA;b=BB;" start=2 and delimiter=";", token is "AA" and end points + // to "b" Given "{a=A;b=B}", the token would be "a=A;b=B" + // + // @param opts The string in which to find the next token + // @param delimiter The delimiter between tokens + // @param start The position in opts to start looking for the token + // @param ed Returns the end position in opts of the token + // @param token Returns the token + // @returns OK if a token was found + // @return InvalidArgument if the braces mismatch + // (e.g. "{a={b=c;}" ) -- missing closing brace + // @return InvalidArgument if an expected delimiter is not found + // e.g. 
"{a=b}c=d;" -- missing delimiter before "c" + static Status NextToken(const std::string& opts, char delimiter, size_t start, + size_t* end, std::string* token); + + constexpr static const char* kIdPropName() { return "id"; } + constexpr static const char* kIdPropSuffix() { return ".id"; } + + private: + int offset_; + + // The optional function to convert a string to its representation + ParseFunc parse_func_; + + // The optional function to convert a value to its string representation + SerializeFunc serialize_func_; + + // The optional function to match two option values + EqualsFunc equals_func_; + + OptionType type_; + OptionVerificationType verification_; + OptionTypeFlags flags_; +}; + +// Parses the input value into elements of the result vector. This method +// will break the input value into the individual tokens (based on the +// separator), where each of those tokens will be parsed based on the rules of +// elem_info. The result vector will be populated with elements based on the +// input tokens. For example, if the value=1:2:3:4:5 and elem_info parses +// integers, the result vector will contain the integers 1,2,3,4,5 +// @param config_options Controls how the option value is parsed. +// @param elem_info Controls how individual tokens in value are parsed +// @param separator Character separating tokens in values (':' in the above +// example) +// @param name The name associated with this vector option +// @param value The input string to parse into tokens +// @param result Returns the results of parsing value into its elements. 
+// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status ParseVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::vector* result) { + result->clear(); + Status status; + + // Turn off ignore_unknown_objects so we can tell if the returned + // object is valid or not. + ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; + for (size_t start = 0, end = 0; + status.ok() && start < value.size() && end != std::string::npos; + start = end + 1) { + std::string token; + status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); + if (status.ok()) { + T elem; + status = elem_info.Parse(copy, name, token, &elem); + if (status.ok()) { + result->emplace_back(elem); + } else if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); + } + } + } + return status; +} + +// Serializes the input vector into its output value. Elements are +// separated by the separator character. This element will convert all of the +// elements in vec into their serialized form, using elem_info to perform the +// serialization. +// For example, if the vec contains the integers 1,2,3,4,5 and elem_info +// serializes the output would be 1:2:3:4:5 for separator ":". +// @param config_options Controls how the option value is serialized. 
+// @param elem_info Controls how individual tokens in value are serialized +// @param separator Character separating tokens in value (':' in the above +// example) +// @param name The name associated with this vector option +// @param vec The input vector to serialize +// @param value The output string of serialized options +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status SerializeVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::vector& vec, + std::string* value) { + std::string result; + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& elem : vec) { + std::string elem_str; + Status s = elem_info.Serialize(embedded, name, &elem, &elem_str); + if (!s.ok()) { + return s; + } else if (!elem_str.empty()) { + if (printed++ > 0) { + result += separator; + } + // If the element contains embedded separators, put it inside of brackets + if (elem_str.find(separator) != std::string::npos) { + result += "{" + elem_str + "}"; + } else { + result += elem_str; + } + } + } + if (result.find("=") != std::string::npos) { + *value = "{" + result + "}"; + } else if (printed > 1 && result.at(0) == '{') { + *value = "{" + result + "}"; + } else { + *value = result; + } + return Status::OK(); +} + +// Compares the input vectors vec1 and vec2 for equality +// If the vectors are the same size, elements of the vectors are compared one by +// one using elem_info to perform the comparison. +// +// @param config_options Controls how the vectors are compared. +// @param elem_info Controls how individual elements in the vectors are compared +// @param name The name associated with this vector option +// @param vec1,vec2 The vectors to compare. 
+// @param mismatch If the vectors are not equivalent, mismatch will point to +// the first +// element of the comparison that did not match. +// @return true If vec1 and vec2 are "equal", false otherwise +template +bool VectorsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::vector& vec1, const std::vector& vec2, + std::string* mismatch) { + if (vec1.size() != vec2.size()) { + *mismatch = name; + return false; + } else { + for (size_t i = 0; i < vec1.size(); ++i) { + if (!elem_info.AreEqual( + config_options, name, reinterpret_cast(&vec1[i]), + reinterpret_cast(&vec2[i]), mismatch)) { + return false; + } + } + return true; + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,12 +11,14 @@ #include #include +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; // Constructs the DBOptions and ColumnFamilyDescriptors by loading the // latest RocksDB options file stored in the specified rocksdb database. // @@ -45,20 +47,26 @@ // pointer options of BlockBasedTableOptions (flush_block_policy_factory, // block_cache, and block_cache_compressed), which will be initialized with // default values. Developers can further specify these three options by -// casting the return value of TableFactoroy::GetOptions() to +// casting the return value of TableFactory::GetOptions() to // BlockBasedTableOptions and making necessary changes. 
// // ignore_unknown_options can be set to true if you want to ignore options -// that are from a newer version of the db, esentially for forward +// that are from a newer version of the db, essentially for forward // compatibility. // +// config_options contains a set of options that controls the processing +// of the options. The LoadLatestOptions(ConfigOptions...) should be preferred; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding options +// in the ConfigOptions parameter. +// // examples/options_file_example.cc demonstrates how to use this function // to open a RocksDB instance. // // @return the function returns an OK status when it went successfully. If // the specified "dbpath" does not contain any option file, then a // Status::NotFound will be returned. A return value other than -// Status::OK or Status::NotFound indicates there're some error related +// Status::OK or Status::NotFound indicates there is some error related // to the options file itself. // // @see LoadOptionsFromFile @@ -67,16 +75,30 @@ std::vector* cf_descs, bool ignore_unknown_options = false, std::shared_ptr* cache = {}); +Status LoadLatestOptions(const ConfigOptions& config_options, + const std::string& dbpath, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); // Similar to LoadLatestOptions, this function constructs the DBOptions // and ColumnFamilyDescriptors based on the specified RocksDB Options file. // +// The LoadOptionsFile(ConfigOptions...) should be preferred; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding +// options in the ConfigOptions parameter. 
+// // @see LoadLatestOptions Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, DBOptions* db_options, std::vector* cf_descs, bool ignore_unknown_options = false, std::shared_ptr* cache = {}); +Status LoadOptionsFromFile(const ConfigOptions& config_options, + const std::string& options_file_name, + DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); // Returns the latest options file name under the specified db path. Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, @@ -97,6 +119,10 @@ const std::string& dbpath, Env* env, const DBOptions& db_options, const std::vector& cf_descs, bool ignore_unknown_options = false); +Status CheckOptionsCompatibility( + const ConfigOptions& config_options, const std::string& dbpath, + const DBOptions& db_options, + const std::vector& cf_descs); } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper for parsed regular expressions. The regex syntax and matching is +// compatible with std::regex. +// +// !!!!!! 
WARNING !!!!!!: The implementation currently uses std::regex, which +// has terrible performance in some cases, including possible crash due to +// stack overflow. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582 +// for example. Avoid use in production as much as possible. +// +// Internal note: see also TestRegex +class Regex { + public: + // Note: Cannot be constructed with a pattern, so that syntax errors can + // be handled without using exceptions. + + // Parse returns OK and saves to `out` when the pattern is valid regex + // syntax (modified ECMAScript), or else returns InvalidArgument. + // See https://en.cppreference.com/w/cpp/regex/ecmascript + static Status Parse(const char *pattern, Regex *out); + static Status Parse(const std::string &pattern, Regex *out); + + // Checks that the whole of str is matched by this regex. If called on a + // default-constructed Regex, will trigger assertion failure in DEBUG build + // or return false in release build. + bool Matches(const std::string &str) const; + + private: + class Impl; + std::shared_ptr impl_; // shared_ptr for simple implementation +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,87 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class TraceRecord; +class TraceRecordResult; + +struct ReplayOptions { + // Number of threads used for replaying. If 0 or 1, replay using + // single thread. + uint32_t num_threads; + + // Enables fast forwarding a replay by increasing/reducing the delay between + // the ingested traces. + // If > 0.0 and < 1.0, slow down the replay by this amount. + // If 1.0, replay the operations at the same rate as in the trace stream. + // If > 1, speed up the replay by this amount. + double fast_forward; + + ReplayOptions() : num_threads(1), fast_forward(1.0) {} + + ReplayOptions(uint32_t num_of_threads, double fast_forward_ratio) + : num_threads(num_of_threads), fast_forward(fast_forward_ratio) {} +}; + +// Replayer helps to replay the captured RocksDB query level operations. +// The Replayer can either be created from DB::NewReplayer method, or be +// instantiated via db_bench today, on using "replay" benchmark. +class Replayer { + public: + virtual ~Replayer() = default; + + // Make some preparation before replaying the trace. This will also reset the + // replayer in order to restart replaying. + virtual Status Prepare() = 0; + + // Return the timestamp when the trace recording was started. + virtual uint64_t GetHeaderTimestamp() const = 0; + + // Atomically read one trace into a TraceRecord (excluding the header and + // footer traces). + // Return Status::OK() on success; + // Status::Incomplete() if Prepare() was not called or no more available + // trace; + // Status::NotSupported() if the read trace type is not supported. + virtual Status Next(std::unique_ptr* record) = 0; + + // Execute one TraceRecord. + // Return Status::OK() if the execution was successful. 
Get/MultiGet traces + // will still return Status::OK() even if they got Status::NotFound() + // from DB::Get() or DB::MultiGet(); + // Status::Incomplete() if Prepare() was not called or no more available + // trace; + // Status::NotSupported() if the operation is not supported; + // Otherwise, return the corresponding error status. + // + // The actual operation execution status and result(s) will be saved in + // result. For example, a GetQueryTraceRecord will have its DB::Get() status + // and the returned value saved in a SingleValueTraceExecutionResult. + virtual Status Execute(const std::unique_ptr& record, + std::unique_ptr* result) = 0; + + // Replay all the traces from the provided trace stream, taking the delay + // between the traces into consideration. + // + // result_callback reports the status of executing a trace record, and the + // actual operation execution result (See the description for Execute()). + virtual Status Replay( + const ReplayOptions& options, + const std::function&&)>& + result_callback) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,7 +25,7 @@ // can help users tune their current block cache size, and determine how // efficient they are using the memory. 
// -// Since GetSimCapacity() returns the capacity for simulutation, it differs from +// Since GetSimCapacity() returns the capacity for simulation, it differs from // actual memory usage, which can be estimated as: // sim_capacity * entry_size / (entry_size + block_size), // where 76 <= entry_size <= 104, @@ -60,7 +60,7 @@ // sets the maximum configured capacity of the simcache. When the new // capacity is less than the old capacity and the existing usage is // greater than new capacity, the implementation will purge old entries - // to fit new capapicty. + // to fit new capacity. virtual void SetSimCapacity(size_t capacity) = 0; // returns the lookup times of simcache diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -141,6 +141,11 @@ import_options, metadata, handle); } + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_opts) override { + return db_->VerifyFileChecksums(read_opts); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } virtual Status VerifyChecksum(const ReadOptions& options) override { @@ -347,6 +352,17 @@ db_->GetLiveFilesMetaData(metadata); } + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override { + return db_->GetLiveFilesChecksumInfo(checksum_list); + } + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override { + return db_->GetLiveFilesStorageInfo(opts, files); + } + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* cf_meta) override { db_->GetColumnFamilyMetaData(column_family, 
cf_meta); @@ -362,6 +378,31 @@ using DB::EndBlockCacheTrace; Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartIOTrace(options, std::move(trace_writer)); + } + + using DB::EndIOTrace; + Status EndIOTrace() override { return db_->EndIOTrace(); } + + using DB::StartTrace; + Status StartTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartTrace(options, std::move(trace_writer)); + } + + using DB::EndTrace; + Status EndTrace() override { return db_->EndTrace(); } + + using DB::NewDefaultReplayer; + Status NewDefaultReplayer(const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override { + return db_->NewDefaultReplayer(handles, std::move(reader), replayer); + } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, @@ -378,6 +419,16 @@ return db_->SetPreserveDeletesSequenceNumber(seqnum); } + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override { + return db_->IncreaseFullHistoryTsLow(column_family, ts_low); + } + + Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override { + return db_->GetFullHistoryTsLow(column_family, ts_low); + } + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { return db_->GetSortedWalFiles(files); } @@ -392,6 +443,13 @@ return db_->GetCreationTimeOfOldestFile(creation_time); } + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. 
Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. virtual Status DeleteFile(std::string name) override { return db_->DeleteFile(name); } @@ -400,6 +458,10 @@ return db_->GetDbIdentity(identity); } + virtual Status GetDbSessionId(std::string& session_id) const override { + return db_->GetDbSessionId(session_id); + } + using DB::SetOptions; virtual Status SetOptions(ColumnFamilyHandle* column_family_handle, const std::unordered_map& diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,13 +14,27 @@ // A factory of a table property collector that marks a SST // file as need-compaction when it observe at least "D" deletion -// entries in any "N" consecutive entires. +// entries in any "N" consecutive entries or the ratio of tombstone +// entries in the whole file >= the specified deletion ratio. class CompactOnDeletionCollectorFactory : public TablePropertiesCollectorFactory { public: - virtual ~CompactOnDeletionCollectorFactory() {} + // A factory of a table property collector that marks a SST + // file as need-compaction when it observe at least "D" deletion + // entries in any "N" consecutive entries, or the ratio of tombstone + // entries >= deletion_ratio. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction + // based on deletion ratio. 
+ CompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger, + double deletion_ratio); - virtual TablePropertiesCollector* CreateTablePropertiesCollector( + ~CompactOnDeletionCollectorFactory() {} + + TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) override; // Change the value of sliding_window_size "N" @@ -28,47 +42,49 @@ void SetWindowSize(size_t sliding_window_size) { sliding_window_size_.store(sliding_window_size); } + size_t GetWindowSize() const { return sliding_window_size_.load(); } // Change the value of deletion_trigger "D" void SetDeletionTrigger(size_t deletion_trigger) { deletion_trigger_.store(deletion_trigger); } - virtual const char* Name() const override { - return "CompactOnDeletionCollector"; + size_t GetDeletionTrigger() const { return deletion_trigger_.load(); } + // Change deletion ratio. + // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction + // based on deletion ratio. + void SetDeletionRatio(double deletion_ratio) { + deletion_ratio_.store(deletion_ratio); } - private: - friend std::shared_ptr - NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, - size_t deletion_trigger); - // A factory of a table property collector that marks a SST - // file as need-compaction when it observe at least "D" deletion - // entries in any "N" consecutive entires. 
- // - // @param sliding_window_size "N" - // @param deletion_trigger "D" - CompactOnDeletionCollectorFactory(size_t sliding_window_size, - size_t deletion_trigger) - : sliding_window_size_(sliding_window_size), - deletion_trigger_(deletion_trigger) {} + double GetDeletionRatio() const { return deletion_ratio_.load(); } + static const char* kClassName() { return "CompactOnDeletionCollector"; } + const char* Name() const override { return kClassName(); } + std::string ToString() const override; + + private: std::atomic sliding_window_size_; std::atomic deletion_trigger_; + std::atomic deletion_ratio_; }; // Creates a factory of a table property collector that marks a SST // file as need-compaction when it observe at least "D" deletion -// entries in any "N" consecutive entires. +// entries in any "N" consecutive entries, or the ratio of tombstone +// entries >= deletion_ratio. // // @param sliding_window_size "N". Note that this number will be // round up to the smallest multiple of 128 that is no less // than the specified size. // @param deletion_trigger "D". Note that even when "N" is changed, // the specified number for "D" will not be changed. +// @param deletion_ratio, if <= 0 or > 1, disable triggering compaction +// based on deletion ratio. Disabled by default. 
extern std::shared_ptr NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, - size_t deletion_trigger); + size_t deletion_trigger, + double deletion_ratio = 0); } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,6 +24,83 @@ using TransactionID = uint64_t; +/* + class Endpoint allows to define prefix ranges. + + Prefix ranges are introduced below. + + == Basic Ranges == + Let's start from basic ranges. Key Comparator defines ordering of rowkeys. + Then, one can specify finite closed ranges by just providing rowkeys of their + endpoints: + + lower_endpoint <= X <= upper_endpoint + + However our goal is to provide a richer set of endpoints. Read on. + + == Lexicographic ordering == + A lexicographic (or dictionary) ordering satisfies these criteria: If there + are two keys in form + key_a = {prefix_a, suffix_a} + key_b = {prefix_b, suffix_b} + and + prefix_a < prefix_b + then + key_a < key_b. + + == Prefix ranges == + With lexicographic ordering, one may want to define ranges in form + + "prefix is $PREFIX" + + which translates to a range in form + + {$PREFIX, -infinity} < X < {$PREFIX, +infinity} + + where -infinity will compare less than any possible suffix, and +infinity + will compare as greater than any possible suffix. + + class Endpoint allows to define these kind of rangtes. + + == Notes == + BytewiseComparator and ReverseBytewiseComparator produce lexicographic + ordering. + + The row comparison function is able to compare key prefixes. 
If the data + domain includes keys A and B, then the comparison function is able to compare + equal-length prefixes: + + min_len= min(byte_length(A), byte_length(B)); + cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid + + == Other options == + As far as MyRocks is concerned, the alternative to prefix ranges would be to + support both open (non-inclusive) and closed (inclusive) range endpoints. +*/ + +class Endpoint { + public: + Slice slice; + + /* + true : the key has a "+infinity" suffix. A suffix that would compare as + greater than any other suffix + false : otherwise + */ + bool inf_suffix; + + explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false) + : slice(slice_arg), inf_suffix(inf_suffix_arg) {} + + explicit Endpoint(const char* s, bool inf_suffix_arg = false) + : slice(s), inf_suffix(inf_suffix_arg) {} + + Endpoint(const char* s, size_t size, bool inf_suffix_arg = false) + : slice(s, size), inf_suffix(inf_suffix_arg) {} + + Endpoint() : inf_suffix(false) {} +}; + // Provides notification to the caller of SetSnapshotOnNextOperation when // the actual snapshot gets created class TransactionNotifier { @@ -139,7 +216,9 @@ // // If this transaction was created by a TransactionDB(), Status::Expired() // may be returned if this transaction has lived for longer than - // TransactionOptions.expiration. + // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if + // TransactionOptions.skip_prepare is false and Prepare is not called on this + // transaction before Commit. virtual Status Commit() = 0; // Discard all batched writes in this transaction. @@ -275,6 +354,12 @@ } } + // Get a range lock on [start_endpoint; end_endpoint]. 
+ virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&, + const Endpoint&) { + return Status::NotSupported(); + } + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, std::string* value, bool exclusive = true, const bool do_validate = true) = 0; @@ -491,7 +576,8 @@ AWAITING_PREPARE = 1, PREPARED = 2, AWAITING_COMMIT = 3, - COMMITED = 4, + COMMITTED = 4, + COMMITED = COMMITTED, // old misspelled name AWAITING_ROLLBACK = 5, ROLLEDBACK = 6, LOCKS_STOLEN = 7, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -31,6 +31,122 @@ const uint32_t kInitialMaxDeadlocks = 5; +class LockManager; +struct RangeLockInfo; + +// A lock manager handle +// The workflow is as follows: +// * Use a factory method (like NewRangeLockManager()) to create a lock +// manager and get its handle. +// * A Handle for a particular kind of lock manager will have extra +// methods and parameters to control the lock manager +// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It +// will be used to perform locking. +class LockManagerHandle { + public: + // PessimisticTransactionDB will call this to get the Lock Manager it's going + // to use. 
+ virtual LockManager* getLockManager() = 0; + + virtual ~LockManagerHandle() {} +}; + +// Same as class Endpoint, but use std::string to manage the buffer allocation +struct EndpointWithString { + std::string slice; + bool inf_suffix; +}; + +struct RangeDeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + + EndpointWithString m_start; + EndpointWithString m_end; +}; + +struct RangeDeadlockPath { + std::vector path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit RangeDeadlockPath(std::vector path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +// A handle to control RangeLockManager (Range-based lock manager) from outside +// RocksDB +class RangeLockManagerHandle : public LockManagerHandle { + public: + // Set total amount of lock memory to use. + // + // @return 0 Ok + // @return EDOM Failed to set because currently using more memory than + // specified + virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; + virtual size_t GetMaxLockMemory() = 0; + + using RangeLockStatus = + std::unordered_multimap; + + // Lock Escalation barrier check function. + // It is called for a couple of endpoints A and B, such that A < B. + // If escalation_barrier_check_func(A, B)==true, then there's a lock + // escalation barrier between A and B, and lock escalation is not allowed + // to bridge the gap between A and B. + // + // The function may be called from any thread that acquires or releases + // locks. It should not throw exceptions. There is currently no way to return + // an error. 
+ using EscalationBarrierFunc = + std::function; + + // Set the user-provided barrier check function + virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0; + + virtual RangeLockStatus GetRangeLockStatusData() = 0; + + class Counters { + public: + // Number of times lock escalation was triggered (for all column families) + uint64_t escalation_count; + + // Number of times lock acquisition had to wait for a conflicting lock + // to be released. This counts both successful waits (where the desired + // lock was acquired) and waits that timed out or got other error. + uint64_t lock_wait_count; + + // How much memory is currently used for locks (total for all column + // families) + uint64_t current_lock_memory; + }; + + // Get the current counter values + virtual Counters GetStatus() = 0; + + // Functions for range-based Deadlock reporting. + virtual std::vector GetRangeDeadlockInfoBuffer() = 0; + virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; + + virtual ~RangeLockManagerHandle() {} +}; + +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr mutex_factory); + struct TransactionDBOptions { // Specifies the maximum number of keys that can be locked at the same time // per column family. @@ -92,9 +208,13 @@ // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // nullptr means use default lock manager. + // Other value means the user provides a custom lock manager. + std::shared_ptr lock_mgr_handle; + // If true, the TransactionDB implementation might skip concurrency control // unless it is overridden by TransactionOptions or - // TransactionDBWriteOptimizations. 
This can be used in conjuction with + // TransactionDBWriteOptimizations. This can be used in conjunction with // DBOptions::unordered_write when the TransactionDB is used solely for write // ordering rather than concurrency control. bool skip_concurrency_control = false; @@ -172,6 +292,10 @@ // Default: false bool skip_concurrency_control = false; + // In pessimistic transaction, if this is true, then you can skip Prepare + // before Commit, otherwise, you must Prepare before Commit. + bool skip_prepare = true; + // See TransactionDBOptions::default_write_batch_flush_threshold for // description. If a negative value is specified, then the default value from // TransactionDBOptions is used. @@ -198,6 +322,13 @@ bool exclusive; }; +struct RangeLockInfo { + EndpointWithString start; + EndpointWithString end; + std::vector ids; + bool exclusive; +}; + struct DeadlockInfo { TransactionID m_txn_id; uint32_t m_cf_id; @@ -233,6 +364,17 @@ // falls back to the un-optimized version of ::Write return Write(opts, updates); } + // Transactional `DeleteRange()` is not yet supported. + // However, users who know their deleted range does not conflict with + // anything can still use it via the `Write()` API. In all cases, the + // `Write()` overload specifying `TransactionDBWriteOptimizations` must be + // used and `skip_concurrency_control` must be set. When using either + // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must + // additionally be set. + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } // Open a TransactionDB similar to DB::Open(). // Internally call PrepareWrap() and WrapDB() // If the return status is not ok, then dbptr is set to nullptr. 
@@ -292,6 +434,7 @@ // The mapping is column family id -> KeyLockInfo virtual std::unordered_multimap GetLockStatusData() = 0; + virtual std::vector GetDeadlockInfoBuffer() = 0; virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h 2025-05-19 16:14:27.000000000 +0000 @@ -61,7 +61,7 @@ // // Returns OK if notified. // Returns TimedOut if timeout is reached. - // Returns other status if TransactionDB should otherwis stop waiting and + // Returns other status if TransactionDB should otherwise stop waiting and // fail the operation. // May return OK spuriously even if not notified. virtual Status WaitFor(std::shared_ptr mutex, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h 2025-05-19 16:14:27.000000000 +0000 @@ -40,12 +40,13 @@ kDeleteRangeRecord, kLogDataRecord, kXIDRecord, + kUnknownRecord, }; // an entry for Put, Merge, Delete, or SingleDelete entry for write batches. // Used in WBWIIterator. struct WriteEntry { - WriteType type; + WriteType type = kUnknownRecord; Slice key; Slice value; }; @@ -168,7 +169,7 @@ // returned iterator will also delete the base_iterator. // // Updating write batch with the current key of the iterator is not safe. 
- // We strongly recommand users not to do it. It will invalidate the current + // We strongly recommend users not to do it. It will invalidate the current // key() and value() of the iterator. This invalidation happens even before // the write batch update finishes. The state may recover after Next() is // called. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/version.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/version.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/version.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/version.h 2025-05-19 16:14:27.000000000 +0000 @@ -4,9 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 8 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_MINOR 29 +#define ROCKSDB_PATCH 5 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these @@ -14,3 +19,23 @@ #define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR #define __ROCKSDB_MINOR__ ROCKSDB_MINOR #define __ROCKSDB_PATCH__ ROCKSDB_PATCH + +namespace ROCKSDB_NAMESPACE { +// Returns a set of properties indicating how/when/where this version of RocksDB +// was created. +const std::unordered_map& GetRocksBuildProperties(); + +// Returns the current version of RocksDB as a string (e.g. "6.16.0"). +// If with_patch is true, the patch is included (6.16.x). +// Otherwise, only major and minor version is included (6.16) +std::string GetRocksVersionAsString(bool with_patch = true); + +// Gets the set of build properties (@see GetRocksBuildProperties) into a +// string. Properties are returned one-per-line, with the first line being: +// " from RocksDB . +// If verbose is true, the full set of properties is +// printed. 
If verbose is false, only the version information (@see +// GetRocksVersionString) is printed. +std::string GetRocksBuildInfoAsString(const std::string& program, + bool verbose = false); +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,17 +8,26 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { class WriteBatch; +struct ConfigOptions; // WALFilter allows an application to inspect write-ahead-log (WAL) // records or modify their processing on recovery. // Please see the details below. -class WalFilter { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class WalFilter : public Customizable { public: + static const char* Type() { return "WalFilter"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, WalFilter** result); enum class WalProcessingOption { // Continue processing as usual kContinueProcessing = 0, @@ -96,7 +105,7 @@ // Returns a name that identifies this WAL filter. // The name will be printed to LOG file on start up for diagnosis. 
- virtual const char* Name() const = 0; + virtual const char* Name() const override = 0; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,10 +25,13 @@ #pragma once #include + #include +#include #include #include #include + #include "rocksdb/status.h" #include "rocksdb/write_batch_base.h" @@ -61,11 +64,19 @@ class WriteBatch : public WriteBatchBase { public: explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0); - explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz); + // `protection_bytes_per_key` is the number of bytes used to store + // protection information for each key entry. Currently supported values are + // zero (disabled) and eight. + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, + size_t protection_bytes_per_key); ~WriteBatch() override; using WriteBatchBase::Put; // Store the mapping "key->value" in the database. + // The following Put(..., const Slice& key, ...) API can also be used when + // user-defined timestamp is enabled as long as `key` points to a contiguous + // buffer with timestamp appended after user key. The caller is responsible + // for setting up the memory buffer pointed to by `key`. Status Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) override; Status Put(const Slice& key, const Slice& value) override { @@ -75,6 +86,10 @@ // Variant of Put() that gathers output like writev(2). The key and value // that will be written to the database are concatenations of arrays of // slices. + // The following Put(..., const SliceParts& key, ...) 
API can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, const SliceParts& value) override; Status Put(const SliceParts& key, const SliceParts& value) override { @@ -83,10 +98,18 @@ using WriteBatchBase::Delete; // If the database contains a mapping for "key", erase it. Else do nothing. + // The following Delete(..., const Slice& key) can be used when user-defined + // timestamp is enabled as long as `key` points to a contiguous buffer with + // timestamp appended after user key. The caller is responsible for setting + // up the memory buffer pointed to by `key`. Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; Status Delete(const Slice& key) override { return Delete(nullptr, key); } // variant that takes SliceParts + // These two variants of Delete(..., const SliceParts& key) can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key) override; Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } @@ -265,6 +288,12 @@ return Status::InvalidArgument("MarkCommit() handler not defined."); } + virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/, + const Slice& /*commit_ts*/) { + return Status::InvalidArgument( + "MarkCommitWithTimestamp() handler not defined."); + } + // Continue is called by WriteBatch::Iterate. If it returns false, // iteration is halted. Otherwise, it continues iterating. The default // implementation always returns true. 
@@ -307,17 +336,62 @@ // Returns true if MarkEndPrepare will be called during Iterate bool HasEndPrepare() const; - // Returns trie if MarkCommit will be called during Iterate + // Returns true if MarkCommit will be called during Iterate bool HasCommit() const; - // Returns trie if MarkRollback will be called during Iterate + // Returns true if MarkRollback will be called during Iterate bool HasRollback() const; - // Assign timestamp to write batch - Status AssignTimestamp(const Slice& ts); - - // Assign timestamps to write batch - Status AssignTimestamps(const std::vector& ts_list); + // Experimental. + // Assign timestamp to write batch. + // This requires that all keys, if enable timestamp, (possibly from multiple + // column families) in the write batch have timestamps of the same format. + // + // checker: callable object to check the timestamp sizes of column families. + // + // in: cf, the column family id. + // in/out: ts_sz. Input as the expected timestamp size of the column + // family, output as the actual timestamp size of the column family. + // ret: OK if assignment succeeds. + // Status checker(uint32_t cf, size_t& ts_sz); + // + // User can call checker(uint32_t cf, size_t& ts_sz) which does the + // following: + // 1. find out the timestamp size of the column family whose id equals `cf`. + // 2. if cf's timestamp size is 0, then set ts_sz to 0 and return OK. + // 3. otherwise, compare ts_sz with cf's timestamp size and return + // Status::InvalidArgument() if different. + Status AssignTimestamp( + const Slice& ts, + std::function checker = + [](uint32_t /*cf*/, size_t& /*ts_sz*/) { return Status::OK(); }); + + // Experimental. + // Assign timestamps to write batch. + // This API allows the write batch to include keys from multiple column + // families whose timestamps' formats can differ. For example, some column + // families can enable timestamp, while others disable the feature. 
+ // If key does not have timestamp, then put an empty Slice in ts_list as + // a placeholder. + // + // checker: callable object specified by caller to check the timestamp sizes + // of column families. + // + // in: cf, the column family id. + // in/out: ts_sz. Input as the expected timestamp size of the column + // family, output as the actual timestamp size of the column family. + // ret: OK if assignment succeeds. + // Status checker(uint32_t cf, size_t& ts_sz); + // + // User can call checker(uint32_t cf, size_t& ts_sz) which does the + // following: + // 1. find out the timestamp size of the column family whose id equals `cf`. + // 2. compare ts_sz with cf's timestamp size and return + // Status::InvalidArgument() if different. + Status AssignTimestamps( + const std::vector& ts_list, + std::function checker = + [](uint32_t /*cf*/, size_t& /*ts_sz*/) { return Status::OK(); }); using WriteBatchBase::GetWriteBatch; WriteBatch* GetWriteBatch() override { return this; } @@ -338,6 +412,9 @@ void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; } + struct ProtectionInfo; + size_t GetProtectionBytesPerKey() const; + private: friend class WriteBatchInternal; friend class LocalSavePoint; @@ -367,11 +444,10 @@ // more details. 
bool is_latest_persistent_state_ = false; + std::unique_ptr prot_info_; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ - const size_t timestamp_size_; - - // Intentionally copyable }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,46 +13,93 @@ #pragma once #include +#include #include +#include +#include + #include "rocksdb/cache.h" namespace ROCKSDB_NAMESPACE { +class CacheReservationManager; + +// Interface to block and signal DB instances, intended for RocksDB +// internal use only. Each DB instance contains ptr to StallInterface. +class StallInterface { + public: + virtual ~StallInterface() {} + + virtual void Block() = 0; -class WriteBufferManager { + virtual void Signal() = 0; +}; + +class WriteBufferManager final { public: - // _buffer_size = 0 indicates no limit. Memory won't be capped. + // Parameters: + // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. // memory_usage() won't be valid and ShouldFlush() will always return true. - // if `cache` is provided, we'll put dummy entries in the cache and cost - // the memory allocated to the cache. It can be used even if _buffer_size = 0. + // + // cache_: if `cache` is provided, we'll put dummy entries in the cache and + // cost the memory allocated to the cache. It can be used even if _buffer_size + // = 0. + // + // allow_stall: if set true, it will enable stalling of writes when + // memory_usage() exceeds buffer_size. It will wait for flush to complete and + // memory usage to drop down. 
explicit WriteBufferManager(size_t _buffer_size, - std::shared_ptr cache = {}); + std::shared_ptr cache = {}, + bool allow_stall = false); // No copying allowed WriteBufferManager(const WriteBufferManager&) = delete; WriteBufferManager& operator=(const WriteBufferManager&) = delete; ~WriteBufferManager(); - bool enabled() const { return buffer_size_ != 0; } + // Returns true if buffer_limit is passed to limit the total memory usage and + // is greater than 0. + bool enabled() const { return buffer_size() > 0; } - bool cost_to_cache() const { return cache_rep_ != nullptr; } + // Returns true if pointer to cache is passed. + bool cost_to_cache() const { return cache_res_mgr_ != nullptr; } + // Returns the total memory used by memtables. // Only valid if enabled() size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); } + + // Returns the total memory used by active memtables. size_t mutable_memtable_memory_usage() const { return memory_active_.load(std::memory_order_relaxed); } - size_t buffer_size() const { return buffer_size_; } + + size_t dummy_entries_in_cache_usage() const; + + // Returns the buffer_size. + size_t buffer_size() const { + return buffer_size_.load(std::memory_order_relaxed); + } + + void SetBufferSize(size_t new_size) { + buffer_size_.store(new_size, std::memory_order_relaxed); + mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); + // Check if stall is active and can be ended. + MaybeEndWriteStall(); + } + + // Below functions should be called by RocksDB internally. 
// Should only be called from write thread bool ShouldFlush() const { if (enabled()) { - if (mutable_memtable_memory_usage() > mutable_limit_) { + if (mutable_memtable_memory_usage() > + mutable_limit_.load(std::memory_order_relaxed)) { return true; } - if (memory_usage() >= buffer_size_ && - mutable_memtable_memory_usage() >= buffer_size_ / 2) { + size_t local_size = buffer_size(); + if (memory_usage() >= local_size && + mutable_memtable_memory_usage() >= local_size / 2) { // If the memory exceeds the buffer size, we trigger more aggressive // flush. But if already more than half memory is being flushed, // triggering more flush may not help. We will hold it instead. @@ -62,39 +109,66 @@ return false; } - void ReserveMem(size_t mem) { - if (cache_rep_ != nullptr) { - ReserveMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_add(mem, std::memory_order_relaxed); - } - if (enabled()) { - memory_active_.fetch_add(mem, std::memory_order_relaxed); + // Returns true if total memory usage exceeded buffer_size. + // We stall the writes untill memory_usage drops below buffer_size. When the + // function returns true, all writer threads (including one checking this + // condition) across all DBs will be stalled. Stall is allowed only if user + // pass allow_stall = true during WriteBufferManager instance creation. + // + // Should only be called by RocksDB internally . + bool ShouldStall() const { + if (!allow_stall_ || !enabled()) { + return false; } + + return IsStallActive() || IsStallThresholdExceeded(); } - // We are in the process of freeing `mem` bytes, so it is not considered - // when checking the soft limit. - void ScheduleFreeMem(size_t mem) { - if (enabled()) { - memory_active_.fetch_sub(mem, std::memory_order_relaxed); - } + + // Returns true if stall is active. 
+ bool IsStallActive() const { + return stall_active_.load(std::memory_order_relaxed); } - void FreeMem(size_t mem) { - if (cache_rep_ != nullptr) { - FreeMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_sub(mem, std::memory_order_relaxed); - } + + // Returns true if stalling condition is met. + bool IsStallThresholdExceeded() const { + return memory_usage() >= buffer_size_; } + void ReserveMem(size_t mem); + + // We are in the process of freeing `mem` bytes, so it is not considered + // when checking the soft limit. + void ScheduleFreeMem(size_t mem); + + void FreeMem(size_t mem); + + // Add the DB instance to the queue and block the DB. + // Should only be called by RocksDB internally. + void BeginWriteStall(StallInterface* wbm_stall); + + // If stall conditions have resolved, remove DB instances from queue and + // signal them to continue. + void MaybeEndWriteStall(); + + void RemoveDBFromQueue(StallInterface* wbm_stall); + private: - const size_t buffer_size_; - const size_t mutable_limit_; + std::atomic buffer_size_; + std::atomic mutable_limit_; std::atomic memory_used_; // Memory that hasn't been scheduled to free. std::atomic memory_active_; - struct CacheRep; - std::unique_ptr cache_rep_; + std::unique_ptr cache_res_mgr_; + // Protects cache_res_mgr_ + std::mutex cache_res_mgr_mu_; + + std::list queue_; + // Protects the queue_ and stall_active_. + std::mutex mu_; + bool allow_stall_; + // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall() + // while holding mu_, but it can be read without a lock. 
+ std::atomic stall_active_; void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/java/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -4,6 +4,8 @@ message("Please consider switching to CMake 3.11.4 or newer") endif() +set(CMAKE_JAVA_COMPILE_FLAGS -source 7) + set(JNI_NATIVE_SOURCES rocksjni/backupablejni.cc rocksjni/backupenginejni.cc @@ -11,6 +13,7 @@ rocksjni/cassandra_value_operator.cc rocksjni/checkpoint.cc rocksjni/clock_cache.cc + rocksjni/cache.cc rocksjni/columnfamilyhandle.cc rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc @@ -24,8 +27,12 @@ rocksjni/comparator.cc rocksjni/comparatorjnicallback.cc rocksjni/compression_options.cc + rocksjni/concurrent_task_limiter.cc + rocksjni/config_options.cc rocksjni/env.cc rocksjni/env_options.cc + rocksjni/event_listener.cc + rocksjni/event_listener_jnicallback.cc rocksjni/filter.cc rocksjni/ingest_external_file_options.cc rocksjni/iterator.cc @@ -53,11 +60,13 @@ rocksjni/sst_file_writerjni.cc rocksjni/sst_file_readerjni.cc rocksjni/sst_file_reader_iterator.cc + rocksjni/sst_partitioner.cc rocksjni/statistics.cc rocksjni/statisticsjni.cc rocksjni/table.cc rocksjni/table_filter.cc rocksjni/table_filter_jnicallback.cc + rocksjni/testable_event_listener.cc rocksjni/thread_status.cc rocksjni/trace_writer.cc rocksjni/trace_writer_jnicallback.cc @@ -82,6 +91,7 @@ src/main/java/org/rocksdb/AbstractCompactionFilter.java src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java + src/main/java/org/rocksdb/AbstractEventListener.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java 
src/main/java/org/rocksdb/AbstractNativeReference.java @@ -95,12 +105,14 @@ src/main/java/org/rocksdb/AccessHint.java src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/BackgroundErrorReason.java src/main/java/org/rocksdb/BackupableDBOptions.java src/main/java/org/rocksdb/BackupEngine.java src/main/java/org/rocksdb/BackupInfo.java src/main/java/org/rocksdb/BlockBasedTableConfig.java src/main/java/org/rocksdb/BloomFilter.java src/main/java/org/rocksdb/BuiltinComparator.java + src/main/java/org/rocksdb/ByteBufferGetStatus.java src/main/java/org/rocksdb/Cache.java src/main/java/org/rocksdb/CassandraCompactionFilter.java src/main/java/org/rocksdb/CassandraValueMergeOperator.java @@ -126,6 +138,7 @@ src/main/java/org/rocksdb/ComparatorType.java src/main/java/org/rocksdb/CompressionOptions.java src/main/java/org/rocksdb/CompressionType.java + src/main/java/org/rocksdb/ConfigOptions.java src/main/java/org/rocksdb/DataBlockIndexType.java src/main/java/org/rocksdb/DBOptionsInterface.java src/main/java/org/rocksdb/DBOptions.java @@ -134,8 +147,13 @@ src/main/java/org/rocksdb/EncodingType.java src/main/java/org/rocksdb/Env.java src/main/java/org/rocksdb/EnvOptions.java + src/main/java/org/rocksdb/EventListener.java src/main/java/org/rocksdb/Experimental.java + src/main/java/org/rocksdb/ExternalFileIngestionInfo.java src/main/java/org/rocksdb/Filter.java + src/main/java/org/rocksdb/FileOperationInfo.java + src/main/java/org/rocksdb/FlushJobInfo.java + src/main/java/org/rocksdb/FlushReason.java src/main/java/org/rocksdb/FlushOptions.java src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java src/main/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -143,10 +161,14 @@ src/main/java/org/rocksdb/HistogramData.java src/main/java/org/rocksdb/HistogramType.java src/main/java/org/rocksdb/Holder.java + src/main/java/org/rocksdb/IndexShorteningMode.java 
src/main/java/org/rocksdb/IndexType.java src/main/java/org/rocksdb/InfoLogLevel.java src/main/java/org/rocksdb/IngestExternalFileOptions.java src/main/java/org/rocksdb/LevelMetaData.java + src/main/java/org/rocksdb/ConcurrentTaskLimiter.java + src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java + src/main/java/org/rocksdb/KeyMayExist.java src/main/java/org/rocksdb/LiveFileMetaData.java src/main/java/org/rocksdb/LogFile.java src/main/java/org/rocksdb/Logger.java @@ -154,6 +176,7 @@ src/main/java/org/rocksdb/MemoryUsageType.java src/main/java/org/rocksdb/MemoryUtil.java src/main/java/org/rocksdb/MemTableConfig.java + src/main/java/org/rocksdb/MemTableInfo.java src/main/java/org/rocksdb/MergeOperator.java src/main/java/org/rocksdb/MutableColumnFamilyOptions.java src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -168,6 +191,7 @@ src/main/java/org/rocksdb/OptimisticTransactionDB.java src/main/java/org/rocksdb/OptimisticTransactionOptions.java src/main/java/org/rocksdb/Options.java + src/main/java/org/rocksdb/OptionString.java src/main/java/org/rocksdb/OptionsUtil.java src/main/java/org/rocksdb/PersistentCache.java src/main/java/org/rocksdb/PlainTableConfig.java @@ -189,15 +213,18 @@ src/main/java/org/rocksdb/RocksMemEnv.java src/main/java/org/rocksdb/RocksMutableObject.java src/main/java/org/rocksdb/RocksObject.java + src/main/java/org/rocksdb/SanityLevel.java src/main/java/org/rocksdb/SizeApproximationFlag.java src/main/java/org/rocksdb/SkipListMemTableConfig.java src/main/java/org/rocksdb/Slice.java src/main/java/org/rocksdb/Snapshot.java src/main/java/org/rocksdb/SstFileManager.java src/main/java/org/rocksdb/SstFileMetaData.java - src/main/java/org/rocksdb/SstFileWriter.java src/main/java/org/rocksdb/SstFileReader.java src/main/java/org/rocksdb/SstFileReaderIterator.java + src/main/java/org/rocksdb/SstFileWriter.java + src/main/java/org/rocksdb/SstPartitionerFactory.java + src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java 
src/main/java/org/rocksdb/StateType.java src/main/java/org/rocksdb/StatisticsCollectorCallback.java src/main/java/org/rocksdb/StatisticsCollector.java @@ -206,6 +233,10 @@ src/main/java/org/rocksdb/StatsLevel.java src/main/java/org/rocksdb/Status.java src/main/java/org/rocksdb/StringAppendOperator.java + src/main/java/org/rocksdb/TableFileCreationBriefInfo.java + src/main/java/org/rocksdb/TableFileCreationInfo.java + src/main/java/org/rocksdb/TableFileCreationReason.java + src/main/java/org/rocksdb/TableFileDeletionInfo.java src/main/java/org/rocksdb/TableFilter.java src/main/java/org/rocksdb/TableProperties.java src/main/java/org/rocksdb/TableFormatConfig.java @@ -235,6 +266,8 @@ src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java + src/main/java/org/rocksdb/WriteStallCondition.java + src/main/java/org/rocksdb/WriteStallInfo.java src/main/java/org/rocksdb/util/ByteUtil.java src/main/java/org/rocksdb/util/BytewiseComparator.java src/main/java/org/rocksdb/util/Environment.java @@ -255,6 +288,7 @@ src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java + src/test/java/org/rocksdb/test/TestableEventListener.java ) include(FindJava) @@ -316,19 +350,18 @@ if (DEFINED CUSTOM_DEPS_URL) set(DEPS_URL ${CUSTOM_DEPS_URL}/) else () - # This is a URL for artifacts from a "fake" release on pdillinger's fork, - # so as not to put binaries in git (ew). We should move to hosting these - # under the facebook account on github, or something else more reliable - # than maven.org, which has been failing frequently from Travis. - set(DEPS_URL "https://github.com/pdillinger/rocksdb/releases/download/v6.6.x-java-deps") + # Using a Facebook AWS account for S3 storage. (maven.org has a history + # of failing in Travis builds.) 
+ set(DEPS_URL "https://rocksdb-deps.s3-us-west-2.amazonaws.com/jars") endif() if(NOT EXISTS ${JAVA_JUNIT_JAR}) message("Downloading ${JAVA_JUNIT_JAR}") file(DOWNLOAD ${DEPS_URL}/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_JUNIT_JAR}) endif() @@ -336,8 +369,9 @@ message("Downloading ${JAVA_HAMCR_JAR}") file(DOWNLOAD ${DEPS_URL}/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_HAMCR_JAR}) endif() @@ -345,8 +379,9 @@ message("Downloading ${JAVA_MOCKITO_JAR}") file(DOWNLOAD ${DEPS_URL}/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_MOCKITO_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_MOCKITO_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_MOCKITO_JAR}) endif() @@ -354,8 +389,9 @@ message("Downloading ${JAVA_CGLIB_JAR}") file(DOWNLOAD ${DEPS_URL}/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_CGLIB_JAR}) endif() @@ -363,8 +399,9 @@ message("Downloading ${JAVA_ASSERTJ_JAR}") 
file(DOWNLOAD ${DEPS_URL}/assertj-core-1.7.1.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) endif() @@ -376,6 +413,7 @@ org.rocksdb.AbstractCompactionFilter org.rocksdb.AbstractCompactionFilterFactory org.rocksdb.AbstractComparator + org.rocksdb.AbstractEventListener org.rocksdb.AbstractImmutableNativeReference org.rocksdb.AbstractNativeReference org.rocksdb.AbstractRocksIterator @@ -392,6 +430,7 @@ org.rocksdb.CassandraValueMergeOperator org.rocksdb.Checkpoint org.rocksdb.ClockCache + org.rocksdb.Cache org.rocksdb.ColumnFamilyHandle org.rocksdb.ColumnFamilyOptions org.rocksdb.CompactionJobInfo @@ -402,6 +441,8 @@ org.rocksdb.CompactRangeOptions org.rocksdb.ComparatorOptions org.rocksdb.CompressionOptions + org.rocksdb.ConcurrentTaskLimiterImpl + org.rocksdb.ConfigOptions org.rocksdb.DBOptions org.rocksdb.DirectSlice org.rocksdb.Env @@ -443,6 +484,8 @@ org.rocksdb.SstFileWriter org.rocksdb.SstFileReader org.rocksdb.SstFileReaderIterator + org.rocksdb.SstPartitionerFactory + org.rocksdb.SstPartitionerFixedPrefixFactory org.rocksdb.Statistics org.rocksdb.StringAppendOperator org.rocksdb.TableFormatConfig @@ -468,6 +511,7 @@ org.rocksdb.WriteBatchTest org.rocksdb.WriteBatchTestInternalHelper org.rocksdb.WriteBufferManager + org.rocksdb.test.TestableEventListener ) create_javah( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/java/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/Makefile 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -2,6 +2,7 @@ org.rocksdb.AbstractCompactionFilter\ 
org.rocksdb.AbstractCompactionFilterFactory\ org.rocksdb.AbstractComparator\ + org.rocksdb.AbstractEventListener\ org.rocksdb.AbstractSlice\ org.rocksdb.AbstractTableFilter\ org.rocksdb.AbstractTraceWriter\ @@ -13,6 +14,7 @@ org.rocksdb.BloomFilter\ org.rocksdb.Checkpoint\ org.rocksdb.ClockCache\ + org.rocksdb.Cache\ org.rocksdb.CassandraCompactionFilter\ org.rocksdb.CassandraValueMergeOperator\ org.rocksdb.ColumnFamilyHandle\ @@ -25,6 +27,7 @@ org.rocksdb.CompactRangeOptions\ org.rocksdb.ComparatorOptions\ org.rocksdb.CompressionOptions\ + org.rocksdb.ConfigOptions\ org.rocksdb.DBOptions\ org.rocksdb.DirectSlice\ org.rocksdb.Env\ @@ -35,6 +38,9 @@ org.rocksdb.HashLinkedListMemTableConfig\ org.rocksdb.HashSkipListMemTableConfig\ org.rocksdb.HdfsEnv\ + org.rocksdb.ConcurrentTaskLimiter\ + org.rocksdb.ConcurrentTaskLimiterImpl\ + org.rocksdb.KeyMayExist\ org.rocksdb.Logger\ org.rocksdb.LRUCache\ org.rocksdb.MemoryUsageType\ @@ -62,6 +68,8 @@ org.rocksdb.SstFileWriter\ org.rocksdb.SstFileReader\ org.rocksdb.SstFileReaderIterator\ + org.rocksdb.SstPartitionerFactory\ + org.rocksdb.SstPartitionerFixedPrefixFactory\ org.rocksdb.Statistics\ org.rocksdb.ThreadStatus\ org.rocksdb.TimedEnv\ @@ -82,7 +90,9 @@ org.rocksdb.WriteBufferManager\ org.rocksdb.WBWIRocksIterator -NATIVE_JAVA_TEST_CLASSES = org.rocksdb.RocksDBExceptionTest\ +NATIVE_JAVA_TEST_CLASSES = \ + org.rocksdb.RocksDBExceptionTest\ + org.rocksdb.test.TestableEventListener\ org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper\ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchTestInternalHelper @@ -93,16 +103,15 @@ NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar -ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -endif +SHA256_CMD ?= sha256sum JAVA_TESTS = \ org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.BackupEngineTest\ + 
org.rocksdb.BlobOptionsTest\ org.rocksdb.BlockBasedTableConfigTest\ org.rocksdb.BuiltinComparatorTest\ + org.rocksdb.BytewiseComparatorRegressionTest\ org.rocksdb.util.BytewiseComparatorTest\ org.rocksdb.util.BytewiseComparatorIntTest\ org.rocksdb.CheckPointTest\ @@ -124,6 +133,7 @@ org.rocksdb.DirectSliceTest\ org.rocksdb.util.EnvironmentTest\ org.rocksdb.EnvOptionsTest\ + org.rocksdb.EventListenerTest\ org.rocksdb.HdfsEnvTest\ org.rocksdb.IngestExternalFileOptionsTest\ org.rocksdb.util.IntComparatorTest\ @@ -132,14 +142,18 @@ org.rocksdb.FlushTest\ org.rocksdb.InfoLogLevelTest\ org.rocksdb.KeyMayExistTest\ + org.rocksdb.ConcurrentTaskLimiterTest\ org.rocksdb.LoggerTest\ org.rocksdb.LRUCacheTest\ org.rocksdb.MemoryUtilTest\ org.rocksdb.MemTableTest\ org.rocksdb.MergeTest\ + org.rocksdb.MultiGetManyKeysTest\ + org.rocksdb.MultiGetTest\ org.rocksdb.MixedOptionsTest\ org.rocksdb.MutableColumnFamilyOptionsTest\ org.rocksdb.MutableDBOptionsTest\ + org.rocksdb.MutableOptionsGetSetTest \ org.rocksdb.NativeComparatorWrapperTest\ org.rocksdb.NativeLibraryLoaderTest\ org.rocksdb.OptimisticTransactionTest\ @@ -158,11 +172,13 @@ org.rocksdb.RocksIteratorTest\ org.rocksdb.RocksMemEnvTest\ org.rocksdb.util.SizeUnitTest\ + org.rocksdb.SecondaryDBTest\ org.rocksdb.SliceTest\ org.rocksdb.SnapshotTest\ org.rocksdb.SstFileManagerTest\ org.rocksdb.SstFileWriterTest\ org.rocksdb.SstFileReaderTest\ + org.rocksdb.SstPartitionerTest\ org.rocksdb.TableFilterTest\ org.rocksdb.TimedEnvTest\ org.rocksdb.TransactionTest\ @@ -197,31 +213,77 @@ SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes JAVA_TEST_LIBDIR = test-libs -JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar -JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar -JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)/mockito-all-1.10.19.jar -JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)/cglib-2.2.2.jar -JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)/assertj-core-1.7.1.jar -JAVA_TESTCLASSPATH = 
$(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR) +JAVA_JUNIT_VER = 4.13.1 +JAVA_JUNIT_SHA256 = c30719db974d6452793fe191b3638a5777005485bae145924044530ffa5f6122 +JAVA_JUNIT_JAR = junit-$(JAVA_JUNIT_VER).jar +JAVA_JUNIT_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_JUNIT_JAR) +JAVA_HAMCREST_VER = 2.2 +JAVA_HAMCREST_SHA256 = 5e62846a89f05cd78cd9c1a553f340d002458380c320455dd1f8fc5497a8a1c1 +JAVA_HAMCREST_JAR = hamcrest-$(JAVA_HAMCREST_VER).jar +JAVA_HAMCREST_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_HAMCREST_JAR) +JAVA_MOCKITO_VER = 1.10.19 +JAVA_MOCKITO_SHA256 = d1a7a7ef14b3db5c0fc3e0a63a81b374b510afe85add9f7984b97911f4c70605 +JAVA_MOCKITO_JAR = mockito-all-$(JAVA_MOCKITO_VER).jar +JAVA_MOCKITO_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_MOCKITO_JAR) +JAVA_CGLIB_VER = 3.3.0 +JAVA_CGLIB_SHA256 = 9fe0c26d7464140ccdfe019ac687be1fb906122b508ab54beb810db0f09a9212 +JAVA_CGLIB_JAR = cglib-$(JAVA_CGLIB_VER).jar +JAVA_CGLIB_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_CGLIB_JAR) +JAVA_ASSERTJ_VER = 2.9.0 +JAVA_ASSERTJ_SHA256 = 5e88ea3ecbe3c48aa1346fec76c84979fa9c8d22499f11479011691230e8babf +JAVA_ASSERTJ_JAR = assertj-core-$(JAVA_ASSERTJ_VER).jar +JAVA_ASSERTJ_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_ASSERTJ_JAR) +JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR_PATH):$(JAVA_HAMCREST_JAR_PATH):$(JAVA_MOCKITO_JAR_PATH):$(JAVA_CGLIB_JAR_PATH):$(JAVA_ASSERTJ_JAR_PATH) MVN_LOCAL = ~/.m2/repository +# Set the path of the java commands +ifeq ($(JAVA_CMD),) +ifneq ($(JAVA_HOME),) +JAVA_CMD := $(JAVA_HOME)/bin/java +else +JAVA_CMD := java +endif +endif + +ifeq ($(JAVAC_CMD),) +ifneq ($(JAVA_HOME),) +JAVAC_CMD := $(JAVA_HOME)/bin/javac +else +JAVAC_CMD := javac +endif +endif + +ifeq ($(JAVAH_CMD),) +ifneq ($(JAVA_HOME),) +JAVAH_CMD := $(JAVA_HOME)/bin/javah +else +JAVAH_CMD := javah +endif +endif + +ifeq ($(JAVADOC_CMD),) +ifneq ($(JAVA_HOME),) +JAVADOC_CMD := $(JAVA_HOME)/bin/javadoc +else +JAVADOC_CMD := javadoc +endif +endif + # Set the default JAVA_ARGS to "" for 
DEBUG_LEVEL=0 -JAVA_ARGS? = +JAVA_ARGS ?= -JAVAC_ARGS? = +JAVAC_ARGS ?= # When debugging add -Xcheck:jni to the java args ifneq ($(DEBUG_LEVEL),0) - JAVA_ARGS = -ea -Xcheck:jni - JAVAC_ARGS = -Xlint:deprecation -Xlint:unchecked + JAVA_ARGS += -ea -Xcheck:jni + JAVAC_ARGS += -Xlint:deprecation -Xlint:unchecked endif -# This is a URL for artifacts from a "fake" release on pdillinger's fork, -# so as not to put binaries in git (ew). We should move to hosting these -# under the facebook account on github, or something else more reliable -# than maven.org, which has been failing frequently from Travis. -DEPS_URL?=https://github.com/pdillinger/rocksdb/releases/download/v6.6.x-java-deps +# Using a Facebook AWS account for S3 storage. (maven.org has a history +# of failing in Travis builds.) +DEPS_URL?=https://rocksdb-deps.s3-us-west-2.amazonaws.com/jars clean: clean-not-downloaded clean-downloaded @@ -237,75 +299,132 @@ javadocs: java $(AM_V_GEN)mkdir -p $(JAVADOC) - $(AM_V_at)javadoc -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org + $(AM_V_at)$(JAVADOC_CMD) -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org javalib: java java_test javadocs java: $(AM_V_GEN)mkdir -p $(MAIN_CLASSES) -ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) - $(AM_V_at)javac $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ +ifeq ($(shell $(JAVAC_CMD) -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java else - $(AM_V_at)javac $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java endif $(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md $(AM_V_at)@rm -f ./HISTORY-CPP.md -ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) - $(AM_V_at)javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni 
$(NATIVE_JAVA_CLASSES) +ifeq ($(shell $(JAVAH_CMD) -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) + $(AM_V_at)$(JAVAH_CMD) -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) endif sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found - java $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found column_family_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni - java $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni transaction_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java + $(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni - java -ea -Xcheck:jni -Djava.library.path=target -cp 
$(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni + $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni optimistic_transaction_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java + $(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni - java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni + $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni -resolve_test_deps: - test -d "$(JAVA_TEST_LIBDIR)" || mkdir -p "$(JAVA_TEST_LIBDIR)" - test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_JUNIT_JAR) --location $(DEPS_URL)/junit-4.12.jar - test -s "$(JAVA_HAMCR_JAR)" || cp $(MVN_LOCAL)/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_HAMCR_JAR) --location $(DEPS_URL)/hamcrest-core-1.3.jar - test -s "$(JAVA_MOCKITO_JAR)" || cp $(MVN_LOCAL)/org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_MOCKITO_JAR)" --location $(DEPS_URL)/mockito-all-1.10.19.jar - test -s "$(JAVA_CGLIB_JAR)" || cp $(MVN_LOCAL)/cglib/cglib/2.2.2/cglib-2.2.2.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_CGLIB_JAR)" --location $(DEPS_URL)/cglib-2.2.2.jar - test -s "$(JAVA_ASSERTJ_JAR)" || cp $(MVN_LOCAL)/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure 
--output "$(JAVA_ASSERTJ_JAR)" --location $(DEPS_URL)/assertj-core-1.7.1.jar +$(JAVA_TEST_LIBDIR): + mkdir -p "$(JAVA_TEST_LIBDIR)" + +$(JAVA_JUNIT_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR))) + cp -v $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_JUNIT_JAR_PATH) --location $(DEPS_URL)/$(JAVA_JUNIT_JAR) + JAVA_JUNIT_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_JUNIT_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_JUNIT_SHA256)" != "$$JAVA_JUNIT_SHA256_ACTUAL" ]; then \ + echo $(JAVA_JUNIT_JAR_PATH) checksum mismatch, expected=\"$(JAVA_JUNIT_SHA256)\" actual=\"$$JAVA_JUNIT_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_HAMCREST_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR))) + cp -v $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_HAMCREST_JAR_PATH) --location $(DEPS_URL)/$(JAVA_HAMCREST_JAR) + JAVA_HAMCREST_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_HAMCREST_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_HAMCREST_SHA256)" != "$$JAVA_HAMCREST_SHA256_ACTUAL" ]; then \ + echo $(JAVA_HAMCREST_JAR_PATH) checksum mismatch, expected=\"$(JAVA_HAMCREST_SHA256)\" actual=\"$$JAVA_HAMCREST_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_MOCKITO_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR))) + cp -v $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_MOCKITO_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_MOCKITO_JAR) + JAVA_MOCKITO_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_MOCKITO_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_MOCKITO_SHA256)" != "$$JAVA_MOCKITO_SHA256_ACTUAL" ]; then \ + echo 
$(JAVA_MOCKITO_JAR_PATH) checksum mismatch, expected=\"$(JAVA_MOCKITO_SHA256)\" actual=\"$$JAVA_MOCKITO_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_CGLIB_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR))) + cp -v $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_CGLIB_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_CGLIB_JAR) + JAVA_CGLIB_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_CGLIB_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_CGLIB_SHA256)" != "$$JAVA_CGLIB_SHA256_ACTUAL" ]; then \ + echo $(JAVA_CGLIB_JAR_PATH) checksum mismatch, expected=\"$(JAVA_CGLIB_SHA256)\" actual=\"$$JAVA_CGLIB_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_ASSERTJ_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR))) + cp -v $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_ASSERTJ_JAR) + JAVA_ASSERTJ_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_ASSERTJ_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_ASSERTJ_SHA256)" != "$$JAVA_ASSERTJ_SHA256_ACTUAL" ]; then \ + echo $(JAVA_ASSERTJ_JAR_PATH) checksum mismatch, expected=\"$(JAVA_ASSERTJ_SHA256)\" actual=\"$$JAVA_ASSERTJ_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +resolve_test_deps: $(JAVA_JUNIT_JAR_PATH) $(JAVA_HAMCREST_JAR_PATH) $(JAVA_MOCKITO_JAR_PATH) $(JAVA_CGLIB_JAR_PATH) $(JAVA_ASSERTJ_JAR_PATH) java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) -ifeq ($(shell java -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ +ifeq ($(shell $(JAVAC_CMD) -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp 
$(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ $(TEST_SRC)/org/rocksdb/*.java - $(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) + $(AM_V_at)$(JAVAH_CMD) -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) else - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ $(TEST_SRC)/org/rocksdb/*.java @@ -314,8 +433,8 @@ test: java java_test run_test run_test: - java $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java $(AM_V_GEN)mkdir -p $(BENCHMARK_MAIN_CLASSES) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ cd /rocksdb-local-build make clean-not-downloaded -PORTABLE=1 make rocksdbjavastatic - 
-cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target +PORTABLE=1 make -j2 rocksdbjavastatic +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh 2025-05-19 16:14:27.000000000 +0000 @@ -27,8 +27,8 @@ fi else make clean-not-downloaded - PORTABLE=1 make -j2 rocksdbjavastatic + PORTABLE=1 make -j2 rocksdbjavastatic fi -cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/README.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,12 @@ **Note**: This uses a specific build of RocksDB that is set in the `` element of the `dependencies` section of the `pom.xml` file. If you are testing local changes you should build and install a SNAPSHOT version of rocksdbjni, and update the `pom.xml` of rocksdbjni-jmh file to test with this. 
+For instance, this is how to install the OSX jar you just built for 6.26.0 + +```bash +$ mvn install:install-file -Dfile=./java/target/rocksdbjni-6.26.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=6.26.0-SNAPSHOT -Dpackaging=jar +``` + ```bash $ mvn package ``` diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/pom.xml mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/pom.xml --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/pom.xml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/pom.xml 2025-05-19 16:14:27.000000000 +0000 @@ -50,7 +50,7 @@ org.rocksdb rocksdbjni - 6.6.0-SNAPSHOT + 6.27.0-SNAPSHOT diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,23 +6,26 @@ */ package org.rocksdb.jmh; -import org.openjdk.jmh.annotations.*; -import org.rocksdb.*; -import org.rocksdb.util.FileUtils; +import static org.rocksdb.util.KVUtils.ba; +import static org.rocksdb.util.KVUtils.keys; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.OptionsBuilder; +import org.rocksdb.*; +import org.rocksdb.util.FileUtils; -import static org.rocksdb.util.KVUtils.ba; -import static org.rocksdb.util.KVUtils.keys; 
- -@State(Scope.Benchmark) +@State(Scope.Thread) public class MultiGetBenchmarks { - @Param({ "no_column_family", "1_column_family", @@ -31,8 +34,7 @@ }) String columnFamilyTestType; - @Param("100000") - int keyCount; + @Param({"10000", "25000", "100000"}) int keyCount; @Param({ "10", @@ -42,6 +44,9 @@ }) int multiGetSize; + @Param({"16", "64", "250", "1000", "4000", "16000"}) int valueSize; + @Param({"16"}) int keySize; // big enough + Path dbDir; DBOptions options; int cfs = 0; // number of column families @@ -85,7 +90,8 @@ // store initial data for retrieving via get for (int i = 0; i < cfs; i++) { for (int j = 0; j < keyCount; j++) { - db.put(cfHandles[i], ba("key" + j), ba("value" + j)); + final byte[] paddedValue = Arrays.copyOf(ba("value" + j), valueSize); + db.put(cfHandles[i], ba("key" + j), paddedValue); } } @@ -149,10 +155,78 @@ } } + ByteBuffer keysBuffer; + ByteBuffer valuesBuffer; + + List valueBuffersList; + List keyBuffersList; + + @Setup + public void allocateSliceBuffers() { + keysBuffer = ByteBuffer.allocateDirect(keyCount * valueSize); + valuesBuffer = ByteBuffer.allocateDirect(keyCount * valueSize); + valueBuffersList = new ArrayList<>(); + keyBuffersList = new ArrayList<>(); + for (int i = 0; i < keyCount; i++) { + valueBuffersList.add(valuesBuffer.slice()); + valuesBuffer.position(i * valueSize); + keyBuffersList.add(keysBuffer.slice()); + keysBuffer.position(i * keySize); + } + } + + @TearDown + public void freeSliceBuffers() { + valueBuffersList.clear(); + } + @Benchmark public List multiGet10() throws RocksDBException { final int fromKeyIdx = next(multiGetSize, keyCount); - final List keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize); - return db.multiGetAsList(keys); + if (fromKeyIdx >= 0) { + final List keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize); + final List valueResults = db.multiGetAsList(keys); + for (final byte[] result : valueResults) { + if (result.length != valueSize) + throw new RuntimeException("Test valueSize 
assumption wrong"); + } + } + return new ArrayList<>(); + } + + @Benchmark + public List multiGetDirect10() throws RocksDBException { + final int fromKeyIdx = next(multiGetSize, keyCount); + if (fromKeyIdx >= 0) { + final List keys = keys(keyBuffersList, fromKeyIdx, fromKeyIdx + multiGetSize); + final List results = db.multiGetByteBuffers( + keys, valueBuffersList.subList(fromKeyIdx, fromKeyIdx + multiGetSize)); + for (final RocksDB.MultiGetInstance result : results) { + if (result.status.getCode() != Status.Code.Ok) + throw new RuntimeException("Test status assumption wrong"); + if (result.valueSize != valueSize) + throw new RuntimeException("Test valueSize assumption wrong"); + } + return results; + } + return new ArrayList<>(); + } + + public static void main(final String[] args) throws RunnerException { + final org.openjdk.jmh.runner.options.Options opt = + new OptionsBuilder() + .include(MultiGetBenchmarks.class.getSimpleName()) + .forks(1) + .jvmArgs("-ea") + .warmupIterations(1) + .measurementIterations(2) + .forks(2) + .param("columnFamilyTestType=", "1_column_family") + .param("multiGetSize=", "10", "1000") + .param("keyCount=", "1000") + .output("jmh_output") + .build(); + + new Runner(opt).run(); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,11 +6,12 @@ */ package org.rocksdb.util; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; -import static java.nio.charset.StandardCharsets.UTF_8; - public final class KVUtils { /** @@ -55,4 +56,17 @@ } 
return keys; } + + public static List keys( + final List keyBuffers, final int from, final int to) { + final List keys = new ArrayList<>(to - from); + for (int i = from; i < to; i++) { + final ByteBuffer key = keyBuffers.get(i); + key.clear(); + key.put(ba("key" + i)); + key.flip(); + keys.add(key); + } + return keys; + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/pom.xml.template mariadb-10.11.13/storage/rocksdb/rocksdb/java/pom.xml.template --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/pom.xml.template 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/pom.xml.template 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,178 @@ + + + 4.0.0 + + org.rocksdb + rocksdbjni + ${ROCKSDB_JAVA_VERSION} + + RocksDB JNI + RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files + for Mac OSX, and a .dll for Windows x64. + + https://rocksdb.org + 2012 + + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0.html + repo + + + GNU General Public License, version 2 + http://www.gnu.org/licenses/gpl-2.0.html + repo + + + + + scm:git:https://github.com/facebook/rocksdb.git + scm:git:https://github.com/facebook/rocksdb.git + scm:git:https://github.com/facebook/rocksdb.git + + + + Facebook + https://www.facebook.com + + + + + Facebook + help@facebook.com + America/New_York + + architect + + + + + + + rocksdb - Google Groups + rocksdb-subscribe@googlegroups.com + rocksdb-unsubscribe@googlegroups.com + rocksdb@googlegroups.com + https://groups.google.com/forum/#!forum/rocksdb + + + + + 1.7 + 1.7 + UTF-8 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.2 + + ${project.build.source} + ${project.build.target} + ${project.build.sourceEncoding} + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.18.1 + + ${argLine} -ea -Xcheck:jni -Djava.library.path=${project.build.directory} + false + false + + ${project.build.directory}/* + + + + + org.jacoco + 
jacoco-maven-plugin + 0.7.2.201409121644 + + + + prepare-agent + + + + report + prepare-package + + report + + + + + + org.codehaus.gmaven + groovy-maven-plugin + 2.0 + + + process-classes + + execute + + + + Xenu + + + String fileContents = new File(project.basedir.absolutePath + '/../include/rocksdb/version.h').getText('UTF-8') + matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) + String major_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) + String minor_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/) + String patch_version = matcher.getAt(0).getAt(1) + String version = String.format('%s.%s.%s', major_version, minor_version, patch_version) + // Set version to be used in pom.properties + project.version = version + // Set version to be set as jar name + project.build.finalName = project.artifactId + "-" + version + + + + + + + + + + + junit + junit + 4.13.1 + test + + + org.hamcrest + hamcrest + 2.2 + test + + + cglib + cglib + 3.3.0 + test + + + org.assertj + assertj-core + 2.9.0 + test + + + org.mockito + mockito-all + 1.10.19 + test + + + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/cache.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::Cache. 
+ +#include "rocksdb/cache.h" + +#include + +#include "include/org_rocksdb_Cache.h" + +/* + * Class: org_rocksdb_Cache + * Method: getUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetUsage()); +} + +/* + * Class: org_rocksdb_Cache + * Method: getPinnedUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getPinnedUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetPinnedUsage()); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -134,6 +134,27 @@ /* * Class: org_rocksdb_CompressionOptions + * Method: setMaxDictBufferBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_CompressionOptions_setMaxDictBufferBytes( + JNIEnv*, jobject, jlong jhandle, jlong jmax_dict_buffer_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_dict_buffer_bytes = static_cast(jmax_dict_buffer_bytes); +} + +/* + * Class: org_rocksdb_CompressionOptions + * Method: maxDictBufferBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_dict_buffer_bytes); +} +/* + * Class: org_rocksdb_CompressionOptions * Method: setEnabled * Signature: (JZ)V */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,90 @@ +#include "rocksdb/concurrent_task_limiter.h" + +#include + +#include +#include + +#include "include/org_rocksdb_ConcurrentTaskLimiterImpl.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: newConcurrentTaskLimiterImpl0 + * Signature: (Ljava/lang/String;I)J + */ +jlong Java_org_rocksdb_ConcurrentTaskLimiterImpl_newConcurrentTaskLimiterImpl0( + JNIEnv* env, jclass, jstring jname, jint limit) { + jboolean has_exception = JNI_FALSE; + std::string name = + ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jname, &has_exception); + if (JNI_TRUE == has_exception) { + return 0; + } + + auto* ptr = new std::shared_ptr( + ROCKSDB_NAMESPACE::NewConcurrentTaskLimiter(name, limit)); + + return reinterpret_cast(ptr); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: name + * Signature: (J)Ljava/lang/String; + */ +jstring Java_org_rocksdb_ConcurrentTaskLimiterImpl_name(JNIEnv* env, jclass, + jlong handle) { + const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &limiter->GetName()); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: setMaxOutstandingTask + * Signature: (JI)V + */ +void Java_org_rocksdb_ConcurrentTaskLimiterImpl_setMaxOutstandingTask( + JNIEnv*, jclass, jlong handle, jint max_outstanding_task) { + const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + limiter->SetMaxOutstandingTask(static_cast(max_outstanding_task)); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: resetMaxOutstandingTask + * Signature: (J)V + */ +void Java_org_rocksdb_ConcurrentTaskLimiterImpl_resetMaxOutstandingTask( + JNIEnv*, jclass, jlong handle) { 
+ const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + limiter->ResetMaxOutstandingTask(); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: outstandingTask + * Signature: (J)I + */ +jint Java_org_rocksdb_ConcurrentTaskLimiterImpl_outstandingTask(JNIEnv*, jclass, + jlong handle) { + const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + return static_cast(limiter->GetOutstandingTask()); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ConcurrentTaskLimiterImpl_disposeInternal(JNIEnv*, + jobject, + jlong jhandle) { + auto* ptr = reinterpret_cast< + std::shared_ptr*>(jhandle); + delete ptr; // delete std::shared_ptr +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling C++ ROCKSDB_NAMESPACE::ConfigOptions methods +// from Java side. 
+ +#include + +#include "include/org_rocksdb_ConfigOptions.h" +#include "rocksdb/convenience.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_ConfigOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ConfigOptions_disposeInternal(JNIEnv *, jobject, + jlong jhandle) { + auto *co = reinterpret_cast(jhandle); + assert(co != nullptr); + delete co; +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: newConfigOptions + * Signature: ()J + */ +jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) { + auto *cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions(); + return reinterpret_cast(cfg_opt); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setDelimiter + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass, + jlong handle, jstring s) { + auto *cfg_opt = reinterpret_cast(handle); + const char *delim = env->GetStringUTFChars(s, nullptr); + if (delim == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + cfg_opt->delimiter = delim; + env->ReleaseStringUTFChars(s, delim); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setIgnoreUnknownOptions + * Signature: (JZ)V + */ +void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass, + jlong handle, + jboolean b) { + auto *cfg_opt = reinterpret_cast(handle); + cfg_opt->ignore_unknown_options = static_cast(b); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setInputStringsEscaped + * Signature: (JZ)V + */ +void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass, + jlong handle, + jboolean b) { + auto *cfg_opt = reinterpret_cast(handle); + cfg_opt->input_strings_escaped = static_cast(b); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setSanityLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv *, jclass, + jlong handle, jbyte level) { + auto *cfg_opt = reinterpret_cast(handle); + 
cfg_opt->sanity_level = ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::EventListener. + +#include + +#include + +#include "include/org_rocksdb_AbstractEventListener.h" +#include "rocksjni/event_listener_jnicallback.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: createNewEventListener + * Signature: (J)J + */ +jlong Java_org_rocksdb_AbstractEventListener_createNewEventListener( + JNIEnv* env, jobject jobj, jlong jenabled_event_callback_values) { + auto enabled_event_callbacks = + ROCKSDB_NAMESPACE::EnabledEventCallbackJni::toCppEnabledEventCallbacks( + jenabled_event_callback_values); + auto* sptr_event_listener = + new std::shared_ptr( + new ROCKSDB_NAMESPACE::EventListenerJniCallback( + env, jobj, enabled_event_callbacks)); + return reinterpret_cast(sptr_event_listener); +} + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_AbstractEventListener_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { + delete reinterpret_cast*>( + jhandle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,502 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::EventListener. + +#include "rocksjni/event_listener_jnicallback.h" + +#include "rocksjni/portal.h" + +namespace ROCKSDB_NAMESPACE { +EventListenerJniCallback::EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks) + : JniCallback(env, jevent_listener), + m_enabled_event_callbacks(enabled_event_callbacks) { + InitCallbackMethodId( + m_on_flush_completed_proxy_mid, EnabledEventCallback::ON_FLUSH_COMPLETED, + env, AbstractEventListenerJni::getOnFlushCompletedProxyMethodId); + + InitCallbackMethodId(m_on_flush_begin_proxy_mid, + EnabledEventCallback::ON_FLUSH_BEGIN, env, + AbstractEventListenerJni::getOnFlushBeginProxyMethodId); + + InitCallbackMethodId(m_on_table_file_deleted_mid, + EnabledEventCallback::ON_TABLE_FILE_DELETED, env, + AbstractEventListenerJni::getOnTableFileDeletedMethodId); + + InitCallbackMethodId( + m_on_compaction_begin_proxy_mid, + EnabledEventCallback::ON_COMPACTION_BEGIN, env, + AbstractEventListenerJni::getOnCompactionBeginProxyMethodId); + + InitCallbackMethodId( + m_on_compaction_completed_proxy_mid, + EnabledEventCallback::ON_COMPACTION_COMPLETED, env, + AbstractEventListenerJni::getOnCompactionCompletedProxyMethodId); + + InitCallbackMethodId(m_on_table_file_created_mid, + 
EnabledEventCallback::ON_TABLE_FILE_CREATED, env, + AbstractEventListenerJni::getOnTableFileCreatedMethodId); + + InitCallbackMethodId( + m_on_table_file_creation_started_mid, + EnabledEventCallback::ON_TABLE_FILE_CREATION_STARTED, env, + AbstractEventListenerJni::getOnTableFileCreationStartedMethodId); + + InitCallbackMethodId(m_on_mem_table_sealed_mid, + EnabledEventCallback::ON_MEMTABLE_SEALED, env, + AbstractEventListenerJni::getOnMemTableSealedMethodId); + + InitCallbackMethodId( + m_on_column_family_handle_deletion_started_mid, + EnabledEventCallback::ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, env, + AbstractEventListenerJni::getOnColumnFamilyHandleDeletionStartedMethodId); + + InitCallbackMethodId( + m_on_external_file_ingested_proxy_mid, + EnabledEventCallback::ON_EXTERNAL_FILE_INGESTED, env, + AbstractEventListenerJni::getOnExternalFileIngestedProxyMethodId); + + InitCallbackMethodId( + m_on_background_error_proxy_mid, + EnabledEventCallback::ON_BACKGROUND_ERROR, env, + AbstractEventListenerJni::getOnBackgroundErrorProxyMethodId); + + InitCallbackMethodId( + m_on_stall_conditions_changed_mid, + EnabledEventCallback::ON_STALL_CONDITIONS_CHANGED, env, + AbstractEventListenerJni::getOnStallConditionsChangedMethodId); + + InitCallbackMethodId(m_on_file_read_finish_mid, + EnabledEventCallback::ON_FILE_READ_FINISH, env, + AbstractEventListenerJni::getOnFileReadFinishMethodId); + + InitCallbackMethodId(m_on_file_write_finish_mid, + EnabledEventCallback::ON_FILE_WRITE_FINISH, env, + AbstractEventListenerJni::getOnFileWriteFinishMethodId); + + InitCallbackMethodId(m_on_file_flush_finish_mid, + EnabledEventCallback::ON_FILE_FLUSH_FINISH, env, + AbstractEventListenerJni::getOnFileFlushFinishMethodId); + + InitCallbackMethodId(m_on_file_sync_finish_mid, + EnabledEventCallback::ON_FILE_SYNC_FINISH, env, + AbstractEventListenerJni::getOnFileSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_range_sync_finish_mid, + EnabledEventCallback::ON_FILE_RANGE_SYNC_FINISH, 
env, + AbstractEventListenerJni::getOnFileRangeSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_truncate_finish_mid, + EnabledEventCallback::ON_FILE_TRUNCATE_FINISH, env, + AbstractEventListenerJni::getOnFileTruncateFinishMethodId); + + InitCallbackMethodId(m_on_file_close_finish_mid, + EnabledEventCallback::ON_FILE_CLOSE_FINISH, env, + AbstractEventListenerJni::getOnFileCloseFinishMethodId); + + InitCallbackMethodId( + m_should_be_notified_on_file_io, + EnabledEventCallback::SHOULD_BE_NOTIFIED_ON_FILE_IO, env, + AbstractEventListenerJni::getShouldBeNotifiedOnFileIOMethodId); + + InitCallbackMethodId( + m_on_error_recovery_begin_proxy_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_BEGIN, env, + AbstractEventListenerJni::getOnErrorRecoveryBeginProxyMethodId); + + InitCallbackMethodId( + m_on_error_recovery_completed_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_COMPLETED, env, + AbstractEventListenerJni::getOnErrorRecoveryCompletedMethodId); +} + +EventListenerJniCallback::~EventListenerJniCallback() {} + +void EventListenerJniCallback::OnFlushCompleted( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_flush_completed_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnFlushBegin( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + 
env->CallVoidMethod(m_jcallback_obj, m_on_flush_begin_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnTableFileDeleted( + const TableFileDeletionInfo& info) { + if (m_on_table_file_deleted_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jdeletion_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileDeletionInfoJni::fromCppTableFileDeletionInfo); + + if (jdeletion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_deleted_mid, + jdeletion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jdeletion_info}); +} + +void EventListenerJniCallback::OnCompactionBegin(DB* db, + const CompactionJobInfo& ci) { + if (m_on_compaction_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_begin_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnCompactionCompleted( + DB* db, const CompactionJobInfo& ci) { + if (m_on_compaction_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_completed_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnTableFileCreated( + const 
TableFileCreationInfo& info) { + if (m_on_table_file_created_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jfile_creation_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationInfoJni::fromCppTableFileCreationInfo); + + if (jfile_creation_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_created_mid, + jfile_creation_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jfile_creation_info}); +} + +void EventListenerJniCallback::OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info) { + if (m_on_table_file_creation_started_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcreation_brief_info = + SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationBriefInfoJni::fromCppTableFileCreationBriefInfo); + + if (jcreation_brief_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_creation_started_mid, + jcreation_brief_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcreation_brief_info}); +} + +void EventListenerJniCallback::OnMemTableSealed(const MemTableInfo& info) { + if (m_on_mem_table_sealed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jmem_table_info = SetupCallbackInvocation( + env, attached_thread, info, MemTableInfoJni::fromCppMemTableInfo); + + if (jmem_table_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_mem_table_sealed_mid, + jmem_table_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jmem_table_info}); +} + +void EventListenerJniCallback::OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* handle) { + if (m_on_column_family_handle_deletion_started_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcf_handle = SetupCallbackInvocation( + env, attached_thread, *handle, + ColumnFamilyHandleJni::fromCppColumnFamilyHandle); + + if 
(jcf_handle != nullptr) { + env->CallVoidMethod(m_jcallback_obj, + m_on_column_family_handle_deletion_started_mid, + jcf_handle); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcf_handle}); +} + +void EventListenerJniCallback::OnExternalFileIngested( + DB* db, const ExternalFileIngestionInfo& info) { + if (m_on_external_file_ingested_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jingestion_info = SetupCallbackInvocation( + env, attached_thread, info, + ExternalFileIngestionInfoJni::fromCppExternalFileIngestionInfo); + + if (jingestion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_external_file_ingested_proxy_mid, + reinterpret_cast(db), jingestion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jingestion_info}); +} + +void EventListenerJniCallback::OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error) { + if (m_on_background_error_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jstatus = SetupCallbackInvocation( + env, attached_thread, *bg_error, StatusJni::construct); + + if (jstatus != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_background_error_proxy_mid, + static_cast(reason), jstatus); + } + + CleanupCallbackInvocation(env, attached_thread, {&jstatus}); +} + +void EventListenerJniCallback::OnStallConditionsChanged( + const WriteStallInfo& info) { + if (m_on_stall_conditions_changed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jwrite_stall_info = SetupCallbackInvocation( + env, attached_thread, info, WriteStallInfoJni::fromCppWriteStallInfo); + + if (jwrite_stall_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_stall_conditions_changed_mid, + jwrite_stall_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jwrite_stall_info}); +} + +void EventListenerJniCallback::OnFileReadFinish(const FileOperationInfo& info) { + 
OnFileOperation(m_on_file_read_finish_mid, info); +} + +void EventListenerJniCallback::OnFileWriteFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_write_finish_mid, info); +} + +void EventListenerJniCallback::OnFileFlushFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_flush_finish_mid, info); +} + +void EventListenerJniCallback::OnFileSyncFinish(const FileOperationInfo& info) { + OnFileOperation(m_on_file_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileRangeSyncFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_range_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileTruncateFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_truncate_finish_mid, info); +} + +void EventListenerJniCallback::OnFileCloseFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_close_finish_mid, info); +} + +bool EventListenerJniCallback::ShouldBeNotifiedOnFileIO() { + if (m_should_be_notified_on_file_io == nullptr) { + return false; + } + + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + assert(env != nullptr); + + jboolean jshould_be_notified = + env->CallBooleanMethod(m_jcallback_obj, m_should_be_notified_on_file_io); + + CleanupCallbackInvocation(env, attached_thread, {}); + + return static_cast(jshould_be_notified); +} + +void EventListenerJniCallback::OnErrorRecoveryBegin( + BackgroundErrorReason reason, Status bg_error, bool* auto_recovery) { + if (m_on_error_recovery_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jbg_error = SetupCallbackInvocation( + env, attached_thread, bg_error, StatusJni::construct); + + if (jbg_error != nullptr) { + jboolean jauto_recovery = env->CallBooleanMethod( + m_jcallback_obj, m_on_error_recovery_begin_proxy_mid, + static_cast(reason), jbg_error); + *auto_recovery = jauto_recovery == JNI_TRUE; + } + + CleanupCallbackInvocation(env, 
attached_thread, {&jbg_error}); +} + +void EventListenerJniCallback::OnErrorRecoveryCompleted(Status old_bg_error) { + if (m_on_error_recovery_completed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jold_bg_error = SetupCallbackInvocation( + env, attached_thread, old_bg_error, StatusJni::construct); + + if (jold_bg_error != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_error_recovery_completed_mid, + jold_bg_error); + } + + CleanupCallbackInvocation(env, attached_thread, {&jold_bg_error}); +} + +void EventListenerJniCallback::InitCallbackMethodId( + jmethodID& mid, EnabledEventCallback eec, JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)) { + if (m_enabled_event_callbacks.count(eec) == 1) { + mid = get_id(env); + } else { + mid = nullptr; + } +} + +template +jobject EventListenerJniCallback::SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject (*convert)(JNIEnv* env, const T* cpp_obj)) { + attached_thread = JNI_FALSE; + env = getJniEnv(&attached_thread); + assert(env != nullptr); + + return convert(env, &cpp_obj); +} + +void EventListenerJniCallback::CleanupCallbackInvocation( + JNIEnv* env, jboolean attached_thread, + std::initializer_list refs) { + for (auto* ref : refs) { + if (*ref == nullptr) continue; + env->DeleteLocalRef(*ref); + } + + if (env->ExceptionCheck()) { + // exception thrown from CallVoidMethod + env->ExceptionDescribe(); // print out exception to stderr + } + + releaseJniEnv(attached_thread); +} + +void EventListenerJniCallback::OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info) { + if (mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jop_info = SetupCallbackInvocation( + env, attached_thread, info, + FileOperationInfoJni::fromCppFileOperationInfo); + + if (jop_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, mid, jop_info); + } + + CleanupCallbackInvocation(env, attached_thread, 
{&jop_info}); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::EventListener. + +#ifndef JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ +#define JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ + +#include + +#include +#include + +#include "rocksdb/listener.h" +#include "rocksjni/jnicallback.h" + +namespace ROCKSDB_NAMESPACE { + +enum EnabledEventCallback { + ON_FLUSH_COMPLETED = 0x0, + ON_FLUSH_BEGIN = 0x1, + ON_TABLE_FILE_DELETED = 0x2, + ON_COMPACTION_BEGIN = 0x3, + ON_COMPACTION_COMPLETED = 0x4, + ON_TABLE_FILE_CREATED = 0x5, + ON_TABLE_FILE_CREATION_STARTED = 0x6, + ON_MEMTABLE_SEALED = 0x7, + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED = 0x8, + ON_EXTERNAL_FILE_INGESTED = 0x9, + ON_BACKGROUND_ERROR = 0xA, + ON_STALL_CONDITIONS_CHANGED = 0xB, + ON_FILE_READ_FINISH = 0xC, + ON_FILE_WRITE_FINISH = 0xD, + ON_FILE_FLUSH_FINISH = 0xE, + ON_FILE_SYNC_FINISH = 0xF, + ON_FILE_RANGE_SYNC_FINISH = 0x10, + ON_FILE_TRUNCATE_FINISH = 0x11, + ON_FILE_CLOSE_FINISH = 0x12, + SHOULD_BE_NOTIFIED_ON_FILE_IO = 0x13, + ON_ERROR_RECOVERY_BEGIN = 0x14, + ON_ERROR_RECOVERY_COMPLETED = 0x15, + + NUM_ENABLED_EVENT_CALLBACK = 0x16, +}; + +class EventListenerJniCallback : public JniCallback, public EventListener { + 
public: + EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks); + virtual ~EventListenerJniCallback(); + virtual void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnFlushBegin(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnTableFileDeleted(const TableFileDeletionInfo& info); + virtual void OnCompactionBegin(DB* db, const CompactionJobInfo& ci); + virtual void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci); + virtual void OnTableFileCreated(const TableFileCreationInfo& info); + virtual void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info); + virtual void OnMemTableSealed(const MemTableInfo& info); + virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle); + virtual void OnExternalFileIngested(DB* db, + const ExternalFileIngestionInfo& info); + virtual void OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error); + virtual void OnStallConditionsChanged(const WriteStallInfo& info); + virtual void OnFileReadFinish(const FileOperationInfo& info); + virtual void OnFileWriteFinish(const FileOperationInfo& info); + virtual void OnFileFlushFinish(const FileOperationInfo& info); + virtual void OnFileSyncFinish(const FileOperationInfo& info); + virtual void OnFileRangeSyncFinish(const FileOperationInfo& info); + virtual void OnFileTruncateFinish(const FileOperationInfo& info); + virtual void OnFileCloseFinish(const FileOperationInfo& info); + virtual bool ShouldBeNotifiedOnFileIO(); + virtual void OnErrorRecoveryBegin(BackgroundErrorReason reason, + Status bg_error, bool* auto_recovery); + virtual void OnErrorRecoveryCompleted(Status old_bg_error); + + private: + inline void InitCallbackMethodId(jmethodID& mid, EnabledEventCallback eec, + JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)); + template + inline jobject SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject 
(*convert)(JNIEnv* env, const T* cpp_obj)); + inline void CleanupCallbackInvocation(JNIEnv* env, jboolean attached_thread, + std::initializer_list refs); + inline void OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info); + + const std::set m_enabled_event_callbacks; + jmethodID m_on_flush_completed_proxy_mid; + jmethodID m_on_flush_begin_proxy_mid; + jmethodID m_on_table_file_deleted_mid; + jmethodID m_on_compaction_begin_proxy_mid; + jmethodID m_on_compaction_completed_proxy_mid; + jmethodID m_on_table_file_created_mid; + jmethodID m_on_table_file_creation_started_mid; + jmethodID m_on_mem_table_sealed_mid; + jmethodID m_on_column_family_handle_deletion_started_mid; + jmethodID m_on_external_file_ingested_proxy_mid; + jmethodID m_on_background_error_proxy_mid; + jmethodID m_on_stall_conditions_changed_mid; + jmethodID m_on_file_read_finish_mid; + jmethodID m_on_file_write_finish_mid; + jmethodID m_on_file_flush_finish_mid; + jmethodID m_on_file_sync_finish_mid; + jmethodID m_on_file_range_sync_finish_mid; + jmethodID m_on_file_truncate_finish_mid; + jmethodID m_on_file_close_finish_mid; + jmethodID m_should_be_notified_on_file_io; + jmethodID m_on_error_recovery_begin_proxy_mid; + jmethodID m_on_error_recovery_completed_mid; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -83,6 +83,23 @@ /* * Class: org_rocksdb_RocksIterator + * Method: refresh0 + * Signature: (J)V + */ +void Java_org_rocksdb_RocksIterator_refresh0(JNIEnv* env, jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + ROCKSDB_NAMESPACE::Status s = 
it->Refresh(); + + if (s.ok()) { + return; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksIterator * Method: seek0 * Signature: (J[BI)V */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc 2025-05-19 16:14:27.000000000 +0000 @@ -43,11 +43,10 @@ JNIEnv* env = getJniEnv(&attached_thread); assert(env != nullptr); - if(m_jcallback_obj != nullptr) { + if (m_jcallback_obj != nullptr) { env->DeleteGlobalRef(m_jcallback_obj); } releaseJniEnv(attached_thread); } -// @lint-ignore TXT4 T25377293 Grandfathered in -} // namespace ROCKSDB_NAMESPACE \ No newline at end of file +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,8 @@ JniCallback(JNIEnv* env, jobject jcallback_obj); virtual ~JniCallback(); + const jobject& GetJavaObject() const { return m_jcallback_obj; } + protected: JavaVM* m_jvm; jobject m_jcallback_obj; @@ -27,5 +29,4 @@ }; } // namespace ROCKSDB_NAMESPACE -// @lint-ignore TXT4 T25377293 Grandfathered in #endif // JAVA_ROCKSJNI_JNICALLBACK_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc 
2025-05-19 16:14:27.000000000 +0000 @@ -22,20 +22,14 @@ * Signature: ([J[J)Ljava/util/Map; */ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( - JNIEnv *env, jclass /*jclazz*/, jlongArray jdb_handles, jlongArray jcache_handles) { - std::vector dbs; - jsize db_handle_count = env->GetArrayLength(jdb_handles); - if(db_handle_count > 0) { - jlong *ptr_jdb_handles = env->GetLongArrayElements(jdb_handles, nullptr); - if (ptr_jdb_handles == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - for (jsize i = 0; i < db_handle_count; i++) { - dbs.push_back( - reinterpret_cast(ptr_jdb_handles[i])); - } - env->ReleaseLongArrayElements(jdb_handles, ptr_jdb_handles, JNI_ABORT); + JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) { + jboolean has_exception = JNI_FALSE; + std::vector dbs = + ROCKSDB_NAMESPACE::JniUtil::fromJPointers( + env, jdb_handles, &has_exception); + if (has_exception == JNI_TRUE) { + // exception thrown: OutOfMemoryError + return nullptr; } std::unordered_set cache_set; @@ -103,5 +97,4 @@ } return jusage_by_type; - } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,7 +30,7 @@ * Method: newSharedStringAppendOperator * Signature: (C)J */ -jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator( +jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator__C( JNIEnv* /*env*/, jclass /*jclazz*/, jchar jdelim) { auto* sptr_string_append_op = new std::shared_ptr( @@ -39,6 +39,20 @@ return reinterpret_cast(sptr_string_append_op); } +jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator__Ljava_lang_String_2( + JNIEnv* 
env, jclass /*jclass*/, jstring jdelim) { + jboolean has_exception = JNI_FALSE; + auto delim = + ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdelim, &has_exception); + if (has_exception == JNI_TRUE) { + return 0; + } + auto* sptr_string_append_op = + new std::shared_ptr( + ROCKSDB_NAMESPACE::MergeOperators::CreateStringAppendOperator(delim)); + return reinterpret_cast(sptr_string_append_op); +} + /* * Class: org_rocksdb_StringAppendOperator * Method: disposeInternal diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,9 +6,12 @@ // This file implements the "bridge" between Java and C++ for // ROCKSDB_NAMESPACE::Options. +#include "rocksdb/options.h" + #include #include #include + #include #include @@ -19,22 +22,20 @@ #include "include/org_rocksdb_Options.h" #include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_WriteOptions.h" - -#include "rocksjni/comparatorjnicallback.h" -#include "rocksjni/portal.h" -#include "rocksjni/statisticsjni.h" -#include "rocksjni/table_filter_jnicallback.h" - #include "rocksdb/comparator.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/options.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/sst_partitioner.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" +#include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/portal.h" +#include "rocksjni/statisticsjni.h" +#include "rocksjni/table_filter_jnicallback.h" #include "utilities/merge_operators.h" /* @@ -552,7 +553,8 @@ void Java_org_rocksdb_Options_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray 
jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -580,7 +582,8 @@ ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 0 : JNI_ABORT); } /* @@ -916,6 +919,135 @@ return env->NewStringUTF(tf->Name()); } +static std::vector +rocksdb_convert_cf_paths_from_java_helper(JNIEnv* env, jobjectArray path_array, + jlongArray size_array, + jboolean* has_exception) { + jboolean copy_str_has_exception; + std::vector paths = ROCKSDB_NAMESPACE::JniUtil::copyStrings( + env, path_array, ©_str_has_exception); + if (JNI_TRUE == copy_str_has_exception) { + // Exception thrown + *has_exception = JNI_TRUE; + return {}; + } + + if (static_cast(env->GetArrayLength(size_array)) != paths.size()) { + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, + ROCKSDB_NAMESPACE::Status::InvalidArgument( + ROCKSDB_NAMESPACE::Slice("There should be a corresponding target " + "size for every path and vice versa."))); + *has_exception = JNI_TRUE; + return {}; + } + + jlong* size_array_ptr = env->GetLongArrayElements(size_array, nullptr); + if (nullptr == size_array_ptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return {}; + } + std::vector cf_paths; + for (size_t i = 0; i < paths.size(); ++i) { + jlong target_size = size_array_ptr[i]; + if (target_size < 0) { + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, + ROCKSDB_NAMESPACE::Status::InvalidArgument(ROCKSDB_NAMESPACE::Slice( + "Path target size has to be positive."))); + *has_exception = JNI_TRUE; + env->ReleaseLongArrayElements(size_array, size_array_ptr, JNI_ABORT); + return 
{}; + } + cf_paths.push_back(ROCKSDB_NAMESPACE::DbPath( + paths[i], static_cast(target_size))); + } + + env->ReleaseLongArrayElements(size_array, size_array_ptr, JNI_ABORT); + + return cf_paths; +} + +/* + * Class: org_rocksdb_Options + * Method: setCfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_Options_setCfPaths(JNIEnv* env, jclass, jlong jhandle, + jobjectArray path_array, + jlongArray size_array) { + auto* options = reinterpret_cast(jhandle); + jboolean has_exception = JNI_FALSE; + std::vector cf_paths = + rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, + &has_exception); + if (JNI_FALSE == has_exception) { + options->cf_paths = std::move(cf_paths); + } +} + +/* + * Class: org_rocksdb_Options + * Method: cfPathsLen + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_cfPathsLen(JNIEnv*, jclass, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->cf_paths.size()); +} + +template +static void rocksdb_convert_cf_paths_to_java_helper(JNIEnv* env, jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); + if (ptr_jtarget_size == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + auto* opt = reinterpret_cast(jhandle); + const jsize len = env->GetArrayLength(jpaths); + for (jsize i = 0; i < len; i++) { + ROCKSDB_NAMESPACE::DbPath cf_path = opt->cf_paths[i]; + + jstring jpath = env->NewStringUTF(cf_path.path.c_str()); + if (jpath == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + return; + } + env->SetObjectArrayElement(jpaths, i, jpath); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jpath); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + return; + } + + ptr_jtarget_size[i] = 
static_cast(cf_path.target_size); + + env->DeleteLocalRef(jpath); + } + + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy ? 0 : JNI_ABORT); +} + +/* + * Class: org_rocksdb_Options + * Method: cfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_Options_cfPaths(JNIEnv* env, jclass, jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { + rocksdb_convert_cf_paths_to_java_helper( + env, jhandle, jpaths, jtarget_sizes); +} + /* * Class: org_rocksdb_Options * Method: setMaxManifestFileSize @@ -1092,6 +1224,29 @@ /* * Class: org_rocksdb_Options + * Method: setMaxWriteBatchGroupSizeBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxWriteBatchGroupSizeBytes( + JNIEnv*, jclass, jlong jhandle, jlong jmax_write_batch_group_size_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_write_batch_group_size_bytes = + static_cast(jmax_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_Options + * Method: maxWriteBatchGroupSizeBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_Options * Method: manifestPreallocationSize * Signature: (J)J */ @@ -1131,6 +1286,34 @@ } /* + * Method: setSstPartitionerFactory + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setSstPartitionerFactory(JNIEnv*, jobject, + jlong jhandle, + jlong factory_handle) { + auto* options = reinterpret_cast(jhandle); + auto factory = reinterpret_cast< + std::shared_ptr*>( + factory_handle); + options->sst_partitioner_factory = *factory; +} + +/* + * Class: org_rocksdb_Options + * Method: setCompactionThreadLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setCompactionThreadLimiter( + JNIEnv*, jclass, jlong jhandle, jlong jlimiter_handle) { + auto* options = reinterpret_cast(jhandle); + auto* 
limiter = reinterpret_cast< + std::shared_ptr*>( + jlimiter_handle); + options->compaction_thread_limiter = *limiter; +} + +/* * Class: org_rocksdb_Options * Method: allowMmapReads * Signature: (J)Z @@ -1587,6 +1770,76 @@ return static_cast(opt->strict_bytes_per_sync); } +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. +static void rocksdb_set_event_listeners_helper( + JNIEnv* env, jlongArray jlistener_array, + std::vector>& + listener_sptr_vec) { + jlong* ptr_jlistener_array = + env->GetLongArrayElements(jlistener_array, nullptr); + if (ptr_jlistener_array == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + const jsize array_size = env->GetArrayLength(jlistener_array); + listener_sptr_vec.clear(); + for (jsize i = 0; i < array_size; ++i) { + const auto& listener_sptr = + *reinterpret_cast*>( + ptr_jlistener_array[i]); + listener_sptr_vec.push_back(listener_sptr); + } +} + +/* + * Class: org_rocksdb_Options + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_Options_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. 
+static jobjectArray rocksdb_get_event_listeners_helper( + JNIEnv* env, + const std::vector>& + listener_sptr_vec) { + jsize sz = static_cast(listener_sptr_vec.size()); + jclass jlistener_clazz = + ROCKSDB_NAMESPACE::AbstractEventListenerJni::getJClass(env); + jobjectArray jlisteners = env->NewObjectArray(sz, jlistener_clazz, nullptr); + if (jlisteners == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + for (jsize i = 0; i < sz; ++i) { + const auto* jni_cb = + static_cast( + listener_sptr_vec[i].get()); + env->SetObjectArrayElement(jlisteners, i, jni_cb->GetJavaObject()); + } + return jlisteners; +} + +/* + * Class: org_rocksdb_Options + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_Options_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + /* * Class: org_rocksdb_Options * Method: setEnableThreadTracking @@ -1793,7 +2046,7 @@ * Signature: (JZ)V */ void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jboolean jskip_checking_sst_file_sizes_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_checking_sst_file_sizes_on_db_open = @@ -1806,7 +2059,7 @@ * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); } @@ -1957,6 +2210,162 @@ /* * Class: org_rocksdb_Options + * Method: setAvoidUnnecessaryBlockingIO + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAvoidUnnecessaryBlockingIO( + JNIEnv*, jclass, jlong jhandle, jboolean avoid_blocking_io) { + auto* opt = reinterpret_cast(jhandle); + opt->avoid_unnecessary_blocking_io = 
static_cast(avoid_blocking_io); +} + +/* + * Class: org_rocksdb_Options + * Method: avoidUnnecessaryBlockingIO + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_avoidUnnecessaryBlockingIO(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->avoid_unnecessary_blocking_io); +} + +/* + * Class: org_rocksdb_Options + * Method: setPersistStatsToDisk + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setPersistStatsToDisk( + JNIEnv*, jclass, jlong jhandle, jboolean persist_stats_to_disk) { + auto* opt = reinterpret_cast(jhandle); + opt->persist_stats_to_disk = static_cast(persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_Options + * Method: persistStatsToDisk + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_persistStatsToDisk(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_Options + * Method: setWriteDbidToManifest + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setWriteDbidToManifest( + JNIEnv*, jclass, jlong jhandle, jboolean jwrite_dbid_to_manifest) { + auto* opt = reinterpret_cast(jhandle); + opt->write_dbid_to_manifest = static_cast(jwrite_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_Options + * Method: writeDbidToManifest + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_writeDbidToManifest(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->write_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_Options + * Method: setLogReadaheadSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setLogReadaheadSize(JNIEnv*, jclass, + jlong jhandle, + jlong jlog_readahead_size) { + auto* opt = reinterpret_cast(jhandle); + opt->log_readahead_size = static_cast(jlog_readahead_size); +} + +/* + * Class: org_rocksdb_Options + * Method: logReasaheadSize + * Signature: (J)J + */ +jlong 
Java_org_rocksdb_Options_logReadaheadSize(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->log_readahead_size); +} + +/* + * Class: org_rocksdb_Options + * Method: setBestEffortsRecovery + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setBestEffortsRecovery( + JNIEnv*, jclass, jlong jhandle, jboolean jbest_efforts_recovery) { + auto* opt = reinterpret_cast(jhandle); + opt->best_efforts_recovery = static_cast(jbest_efforts_recovery); +} + +/* + * Class: org_rocksdb_Options + * Method: bestEffortsRecovery + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_bestEffortsRecovery(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->best_efforts_recovery); +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBgErrorResumeCount + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxBgErrorResumeCount( + JNIEnv*, jclass, jlong jhandle, jint jmax_bgerror_resume_count) { + auto* opt = reinterpret_cast(jhandle); + opt->max_bgerror_resume_count = static_cast(jmax_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_Options + * Method: maxBgerrorResumeCount + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxBgerrorResumeCount(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_Options + * Method: setBgerrorResumeRetryInterval + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setBgerrorResumeRetryInterval( + JNIEnv*, jclass, jlong jhandle, jlong jbgerror_resume_retry_interval) { + auto* opt = reinterpret_cast(jhandle); + opt->bgerror_resume_retry_interval = + static_cast(jbgerror_resume_retry_interval); +} + +/* + * Class: org_rocksdb_Options + * Method: bgerrorResumeRetryInterval + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_bgerrorResumeRetryInterval(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = 
reinterpret_cast(jhandle); + return static_cast(opt->bgerror_resume_retry_interval); +} + +/* + * Class: org_rocksdb_Options * Method: setAvoidFlushDuringShutdown * Signature: (JZ)V */ @@ -2833,16 +3242,45 @@ /* * Class: org_rocksdb_Options + * Method: oldDefaults + * Signature: (JII)V + */ +void Java_org_rocksdb_Options_oldDefaults(JNIEnv*, jclass, jlong jhandle, + jint major_version, + jint minor_version) { + reinterpret_cast(jhandle)->OldDefaults( + major_version, minor_version); +} + +/* + * Class: org_rocksdb_Options * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_Options_optimizeForSmallDb( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_Options_optimizeForSmallDb__J(JNIEnv*, jobject, + jlong jhandle) { reinterpret_cast(jhandle)->OptimizeForSmallDb(); } /* * Class: org_rocksdb_Options + * Method: optimizeForSmallDb + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_optimizeForSmallDb__JJ(JNIEnv*, jclass, + jlong jhandle, + jlong cache_handle) { + auto* cache_sptr_ptr = + reinterpret_cast*>( + cache_handle); + auto* options_ptr = reinterpret_cast(jhandle); + auto* cf_options_ptr = + static_cast(options_ptr); + cf_options_ptr->OptimizeForSmallDb(cache_sptr_ptr); +} + +/* + * Class: org_rocksdb_Options * Method: optimizeForPointLookup * Signature: (JJ)V */ @@ -3188,6 +3626,29 @@ /* * Class: org_rocksdb_Options + * Method: setPeriodicCompactionSeconds + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setPeriodicCompactionSeconds( + JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) { + auto* opts = reinterpret_cast(jhandle); + opts->periodic_compaction_seconds = + static_cast(jperiodicCompactionSeconds); +} + +/* + * Class: org_rocksdb_Options + * Method: periodicCompactionSeconds + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_periodicCompactionSeconds(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->periodic_compaction_seconds); +} 
+ +/* + * Class: org_rocksdb_Options * Method: setCompactionOptionsUniversal * Signature: (JJ)V */ @@ -3236,6 +3697,170 @@ return static_cast(opts->force_consistency_checks); } +/// BLOB options + +/* + * Class: org_rocksdb_Options + * Method: setEnableBlobFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setEnableBlobFiles(JNIEnv*, jobject, + jlong jhandle, + jboolean jenable_blob_files) { + auto* opts = reinterpret_cast(jhandle); + opts->enable_blob_files = static_cast(jenable_blob_files); +} + +/* + * Class: org_rocksdb_Options + * Method: enableBlobFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_enableBlobFiles(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_files); +} + +/* + * Class: org_rocksdb_Options + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMinBlobSize(JNIEnv*, jobject, jlong jhandle, + jlong jmin_blob_size) { + auto* opts = reinterpret_cast(jhandle); + opts->min_blob_size = static_cast(jmin_blob_size); +} + +/* + * Class: org_rocksdb_Options + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_minBlobSize(JNIEnv*, jobject, jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->min_blob_size); +} + +/* + * Class: org_rocksdb_Options + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setBlobFileSize(JNIEnv*, jobject, jlong jhandle, + jlong jblob_file_size) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_file_size = static_cast(jblob_file_size); +} + +/* + * Class: org_rocksdb_Options + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_blobFileSize(JNIEnv*, jobject, jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->blob_file_size); +} + +/* + * Class: org_rocksdb_Options + * Method: setBlobCompressionType + * Signature: (JB)V + */ +void 
Java_org_rocksdb_Options_setBlobCompressionType( + JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_compression_type = + ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType( + jblob_compression_type_value); +} + +/* + * Class: org_rocksdb_Options + * Method: blobCompressionType + * Signature: (J)B + */ +jbyte Java_org_rocksdb_Options_blobCompressionType(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( + opts->blob_compression_type); +} + +/* + * Class: org_rocksdb_Options + * Method: setEnableBlobGarbageCollection + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setEnableBlobGarbageCollection( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) { + auto* opts = reinterpret_cast(jhandle); + opts->enable_blob_garbage_collection = + static_cast(jenable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_Options + * Method: enableBlobGarbageCollection + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_enableBlobGarbageCollection(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_Options + * Method: setBlobGarbageCollectionAgeCutoff + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setBlobGarbageCollectionAgeCutoff( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_age_cutoff) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_garbage_collection_age_cutoff = + static_cast(jblob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_Options + * Method: blobGarbageCollectionAgeCutoff + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_blobGarbageCollectionAgeCutoff(JNIEnv*, + jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return 
static_cast(opts->blob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_Options + * Method: setBlobGarbageCollectionForceThreshold + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setBlobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_force_threshold) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_garbage_collection_force_threshold = + static_cast(jblob_garbage_collection_force_threshold); +} + +/* + * Class: org_rocksdb_Options + * Method: blobGarbageCollectionForceThreshold + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_blobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->blob_garbage_collection_force_threshold); +} + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::ColumnFamilyOptions @@ -3277,9 +3902,43 @@ /* * Class: org_rocksdb_ColumnFamilyOptions * Method: getColumnFamilyOptionsFromProps + * Signature: (JLjava/lang/String;)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__JLjava_lang_String_2( + JNIEnv* env, jclass, jlong cfg_handle, jstring jopt_string) { + const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); + if (opt_string == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + auto* config_options = + reinterpret_cast(cfg_handle); + auto* cf_options = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(); + ROCKSDB_NAMESPACE::Status status = + ROCKSDB_NAMESPACE::GetColumnFamilyOptionsFromString( + *config_options, ROCKSDB_NAMESPACE::ColumnFamilyOptions(), opt_string, + cf_options); + + env->ReleaseStringUTFChars(jopt_string, opt_string); + + // Check if ColumnFamilyOptions creation was possible. 
+ jlong ret_value = 0; + if (status.ok()) { + ret_value = reinterpret_cast(cf_options); + } else { + // if operation failed the ColumnFamilyOptions need to be deleted + // again to prevent a memory leak. + delete cf_options; + } + return ret_value; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: getColumnFamilyOptionsFromProps * Signature: (Ljava/util/String;)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps( +jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__Ljava_lang_String_2( JNIEnv* env, jclass, jstring jopt_string) { const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); if (opt_string == nullptr) { @@ -3320,17 +3979,45 @@ /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: oldDefaults + * Signature: (JII)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_oldDefaults(JNIEnv*, jclass, + jlong jhandle, + jint major_version, + jint minor_version) { + reinterpret_cast(jhandle) + ->OldDefaults(major_version, minor_version); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__J(JNIEnv*, + jobject, + jlong jhandle) { reinterpret_cast(jhandle) ->OptimizeForSmallDb(); } /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeForSmallDb + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__JJ( + JNIEnv*, jclass, jlong jhandle, jlong cache_handle) { + auto* cache_sptr_ptr = + reinterpret_cast*>( + cache_handle); + reinterpret_cast(jhandle) + ->OptimizeForSmallDb(cache_sptr_ptr); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: optimizeForPointLookup * Signature: (JJ)V */ @@ -3588,6 +4275,35 @@ } /* + * Method: setSstPartitionerFactory + * Signature: (JJ)V + */ +void 
Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory( + JNIEnv*, jobject, jlong jhandle, jlong factory_handle) { + auto* options = + reinterpret_cast(jhandle); + auto factory = reinterpret_cast< + std::shared_ptr*>( + factory_handle); + options->sst_partitioner_factory = *factory; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCompactionThreadLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCompactionThreadLimiter( + JNIEnv*, jclass, jlong jhandle, jlong jlimiter_handle) { + auto* options = + reinterpret_cast(jhandle); + auto* limiter = reinterpret_cast< + std::shared_ptr*>( + jlimiter_handle); + options->compaction_thread_limiter = *limiter; +} + +/* * Method: tableFactoryName * Signature: (J)Ljava/lang/String */ @@ -3606,6 +4322,52 @@ /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCfPaths(JNIEnv* env, jclass, + jlong jhandle, + jobjectArray path_array, + jlongArray size_array) { + auto* options = + reinterpret_cast(jhandle); + jboolean has_exception = JNI_FALSE; + std::vector cf_paths = + rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, + &has_exception); + if (JNI_FALSE == has_exception) { + options->cf_paths = std::move(cf_paths); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: cfPathsLen + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_cfPathsLen(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = + reinterpret_cast(jhandle); + return static_cast(opt->cf_paths.size()); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: cfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_cfPaths(JNIEnv* env, jclass, + jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { + rocksdb_convert_cf_paths_to_java_helper< + ROCKSDB_NAMESPACE::ColumnFamilyOptions>(env, jhandle, jpaths, + 
jtarget_sizes); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: minWriteBufferNumberToMerge * Signature: (J)I */ @@ -4458,8 +5220,8 @@ JNIEnv* env, jobject, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); - jint* additionals = - env->GetIntArrayElements(jmax_bytes_for_level_multiplier_additional, 0); + jint* additionals = env->GetIntArrayElements( + jmax_bytes_for_level_multiplier_additional, nullptr); if (additionals == nullptr) { // exception thrown: OutOfMemoryError return; @@ -4576,6 +5338,32 @@ /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: setPeriodicCompactionSeconds + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setPeriodicCompactionSeconds( + JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) { + auto* cf_opts = + reinterpret_cast(jhandle); + cf_opts->periodic_compaction_seconds = + static_cast(jperiodicCompactionSeconds); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: periodicCompactionSeconds + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL +Java_org_rocksdb_ColumnFamilyOptions_periodicCompactionSeconds(JNIEnv*, jobject, + jlong jhandle) { + auto* cf_opts = + reinterpret_cast(jhandle); + return static_cast(cf_opts->periodic_compaction_seconds); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: setCompactionOptionsUniversal * Signature: (JJ)V */ @@ -4626,7 +5414,187 @@ JNIEnv*, jobject, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); - return static_cast(cf_opts->force_consistency_checks); + return static_cast(cf_opts->force_consistency_checks); +} + +/// BLOB options + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setEnableBlobFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobFiles( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_files) { + auto* opts = + reinterpret_cast(jhandle); + 
opts->enable_blob_files = static_cast(jenable_blob_files); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: enableBlobFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobFiles(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_files); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMinBlobSize(JNIEnv*, jobject, + jlong jhandle, + jlong jmin_blob_size) { + auto* opts = + reinterpret_cast(jhandle); + opts->min_blob_size = static_cast(jmin_blob_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_minBlobSize(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->min_blob_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileSize( + JNIEnv*, jobject, jlong jhandle, jlong jblob_file_size) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_file_size = static_cast(jblob_file_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_blobFileSize(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->blob_file_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBlobCompressionType + * Signature: (JB)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompressionType( + JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_compression_type = + ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType( + jblob_compression_type_value); +} + +/* + 
* Class: org_rocksdb_ColumnFamilyOptions + * Method: blobCompressionType + * Signature: (J)B + */ +jbyte Java_org_rocksdb_ColumnFamilyOptions_blobCompressionType(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( + opts->blob_compression_type); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setEnableBlobGarbageCollection + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobGarbageCollection( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) { + auto* opts = + reinterpret_cast(jhandle); + opts->enable_blob_garbage_collection = + static_cast(jenable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: enableBlobGarbageCollection + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobGarbageCollection( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBlobGarbageCollectionAgeCutoff + * Signature: (JD)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionAgeCutoff( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_age_cutoff) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_garbage_collection_age_cutoff = + static_cast(jblob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: blobGarbageCollectionAgeCutoff + * Signature: (J)D + */ +jdouble Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionAgeCutoff( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->blob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBlobGarbageCollectionForceThreshold + * Signature: (JD)V + */ +void 
Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_force_threshold) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_garbage_collection_force_threshold = + static_cast(jblob_garbage_collection_force_threshold); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: blobGarbageCollectionAgeCutoff + * Signature: (J)D + */ +jdouble +Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->blob_garbage_collection_force_threshold); } ///////////////////////////////////////////////////////////////////// @@ -4670,9 +5638,42 @@ /* * Class: org_rocksdb_DBOptions * Method: getDBOptionsFromProps + * Signature: (JLjava/lang/String;)J + */ +jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__JLjava_lang_String_2( + JNIEnv* env, jclass, jlong config_handle, jstring jopt_string) { + const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); + if (opt_string == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + + auto* config_options = + reinterpret_cast(config_handle); + auto* db_options = new ROCKSDB_NAMESPACE::DBOptions(); + ROCKSDB_NAMESPACE::Status status = ROCKSDB_NAMESPACE::GetDBOptionsFromString( + *config_options, ROCKSDB_NAMESPACE::DBOptions(), opt_string, db_options); + + env->ReleaseStringUTFChars(jopt_string, opt_string); + + // Check if DBOptions creation was possible. + jlong ret_value = 0; + if (status.ok()) { + ret_value = reinterpret_cast(db_options); + } else { + // if operation failed the DBOptions need to be deleted + // again to prevent a memory leak. 
+ delete db_options; + } + return ret_value; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: getDBOptionsFromProps * Signature: (Ljava/util/String;)J */ -jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps( +jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__Ljava_lang_String_2( JNIEnv* env, jclass, jstring jopt_string) { const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); if (opt_string == nullptr) { @@ -5078,7 +6079,8 @@ void Java_org_rocksdb_DBOptions_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -5106,7 +6108,8 @@ ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); } /* @@ -5498,6 +6501,29 @@ /* * Class: org_rocksdb_DBOptions + * Method: setMaxWriteBatchGroupSizeBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxWriteBatchGroupSizeBytes( + JNIEnv*, jclass, jlong jhandle, jlong jmax_write_batch_group_size_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_write_batch_group_size_bytes = + static_cast(jmax_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxWriteBatchGroupSizeBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_DBOptions * Method: setManifestPreallocationSize * Signature: (JJ)V */ @@ -5994,6 +7020,29 @@ /* * Class: org_rocksdb_DBOptions + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_DBOptions_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_DBOptions_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + +/* + * Class: org_rocksdb_DBOptions * Method: setDelayedWriteRate * Signature: (JJ)V */ @@ -6198,7 +7247,7 @@ * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jboolean jskip_checking_sst_file_sizes_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_checking_sst_file_sizes_on_db_open = @@ -6211,7 +7260,7 @@ * Signature: (J)Z */ jboolean 
Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); } @@ -6491,6 +7540,162 @@ return static_cast(opt->avoid_flush_during_shutdown); } +/* + * Class: org_rocksdb_DBOptions + * Method: setAvoidUnnecessaryBlockingIO + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAvoidUnnecessaryBlockingIO( + JNIEnv*, jclass, jlong jhandle, jboolean avoid_blocking_io) { + auto* opt = reinterpret_cast(jhandle); + opt->avoid_unnecessary_blocking_io = static_cast(avoid_blocking_io); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: avoidUnnecessaryBlockingIO + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_avoidUnnecessaryBlockingIO(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->avoid_unnecessary_blocking_io); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setPersistStatsToDisk + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setPersistStatsToDisk( + JNIEnv*, jclass, jlong jhandle, jboolean persist_stats_to_disk) { + auto* opt = reinterpret_cast(jhandle); + opt->persist_stats_to_disk = static_cast(persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: persistStatsToDisk + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_persistStatsToDisk(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWriteDbidToManifest + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setWriteDbidToManifest( + JNIEnv*, jclass, jlong jhandle, jboolean jwrite_dbid_to_manifest) { + auto* opt = reinterpret_cast(jhandle); + opt->write_dbid_to_manifest = static_cast(jwrite_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: 
writeDbidToManifest + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_writeDbidToManifest(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->write_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setLogReadaheadSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setLogReadaheadSize(JNIEnv*, jclass, + jlong jhandle, + jlong jlog_readahead_size) { + auto* opt = reinterpret_cast(jhandle); + opt->log_readahead_size = static_cast(jlog_readahead_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: logReasaheadSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_logReadaheadSize(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->log_readahead_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setBestEffortsRecovery + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setBestEffortsRecovery( + JNIEnv*, jclass, jlong jhandle, jboolean jbest_efforts_recovery) { + auto* opt = reinterpret_cast(jhandle); + opt->best_efforts_recovery = static_cast(jbest_efforts_recovery); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: bestEffortsRecovery + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_bestEffortsRecovery(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->best_efforts_recovery); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxBgErrorResumeCount + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxBgErrorResumeCount( + JNIEnv*, jclass, jlong jhandle, jint jmax_bgerror_resume_count) { + auto* opt = reinterpret_cast(jhandle); + opt->max_bgerror_resume_count = static_cast(jmax_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxBgerrorResumeCount + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxBgerrorResumeCount(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = 
reinterpret_cast(jhandle); + return static_cast(opt->max_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setBgerrorResumeRetryInterval + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setBgerrorResumeRetryInterval( + JNIEnv*, jclass, jlong jhandle, jlong jbgerror_resume_retry_interval) { + auto* opt = reinterpret_cast(jhandle); + opt->bgerror_resume_retry_interval = + static_cast(jbgerror_resume_retry_interval); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: bgerrorResumeRetryInterval + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_bgerrorResumeRetryInterval(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->bgerror_resume_retry_interval); +} + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::WriteOptions @@ -7062,6 +8267,141 @@ return static_cast(opt->iter_start_seqnum); } +/* + * Class: org_rocksdb_ReadOptions + * Method: autoPrefixMode + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_autoPrefixMode(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->auto_prefix_mode); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setAutoPrefixMode + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setAutoPrefixMode( + JNIEnv*, jobject, jlong jhandle, jboolean jauto_prefix_mode) { + auto* opt = reinterpret_cast(jhandle); + opt->auto_prefix_mode = static_cast(jauto_prefix_mode); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: timestamp + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_timestamp(JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + auto& timestamp_slice_handle = opt->timestamp; + return reinterpret_cast(timestamp_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setTimestamp + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setTimestamp(JNIEnv*, 
jobject, jlong jhandle, + jlong jtimestamp_slice_handle) { + auto* opt = reinterpret_cast(jhandle); + opt->timestamp = + reinterpret_cast(jtimestamp_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: iterStartTs + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_iterStartTs(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + auto& iter_start_ts_handle = opt->iter_start_ts; + return reinterpret_cast(iter_start_ts_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setIterStartTs + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setIterStartTs(JNIEnv*, jobject, + jlong jhandle, + jlong jiter_start_ts_handle) { + auto* opt = reinterpret_cast(jhandle); + opt->iter_start_ts = + reinterpret_cast(jiter_start_ts_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: deadline + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_deadline(JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->deadline.count()); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setDeadline + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setDeadline(JNIEnv*, jobject, jlong jhandle, + jlong jdeadline) { + auto* opt = reinterpret_cast(jhandle); + opt->deadline = std::chrono::microseconds(static_cast(jdeadline)); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: ioTimeout + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_ioTimeout(JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->io_timeout.count()); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setIoTimeout + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setIoTimeout(JNIEnv*, jobject, jlong jhandle, + jlong jio_timeout) { + auto* opt = reinterpret_cast(jhandle); + opt->io_timeout = + std::chrono::microseconds(static_cast(jio_timeout)); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: 
valueSizeSofLimit + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_valueSizeSoftLimit(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->value_size_soft_limit); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setValueSizeSofLimit + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setValueSizeSoftLimit( + JNIEnv*, jobject, jlong jhandle, jlong jvalue_size_soft_limit) { + auto* opt = reinterpret_cast(jhandle); + opt->value_size_soft_limit = static_cast(jvalue_size_soft_limit); +} + ///////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::ComparatorOptions diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -55,7 +55,7 @@ * Method: loadLatestOptions * Signature: (Ljava/lang/String;JLjava/util/List;Z)V */ -void Java_org_rocksdb_OptionsUtil_loadLatestOptions( +void Java_org_rocksdb_OptionsUtil_loadLatestOptions__Ljava_lang_String_2JJLjava_util_List_2Z( JNIEnv* env, jclass /*jcls*/, jstring jdbpath, jlong jenv_handle, jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) { jboolean has_exception = JNI_FALSE; @@ -80,10 +80,40 @@ /* * Class: org_rocksdb_OptionsUtil + * Method: loadLatestOptions_1 + * Signature: (JLjava/lang/String;JLjava/util/List;)V + */ +void Java_org_rocksdb_OptionsUtil_loadLatestOptions__JLjava_lang_String_2JLjava_util_List_2( + JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jdbpath, + jlong jdb_opts_handle, jobject jcfds) { + jboolean has_exception = JNI_FALSE; + auto db_path = + ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdbpath, &has_exception); + if (has_exception == JNI_TRUE) { + // 
exception occurred + return; + } + std::vector cf_descs; + auto* config_options = + reinterpret_cast(cfg_handle); + auto* db_options = + reinterpret_cast(jdb_opts_handle); + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadLatestOptions( + *config_options, db_path, db_options, &cf_descs); + if (!s.ok()) { + // error, raise an exception + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + } else { + build_column_family_descriptor_list(env, jcfds, cf_descs); + } +} + +/* + * Class: org_rocksdb_OptionsUtil * Method: loadOptionsFromFile * Signature: (Ljava/lang/String;JJLjava/util/List;Z)V */ -void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile( +void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__Ljava_lang_String_2JJLjava_util_List_2Z( JNIEnv* env, jclass /*jcls*/, jstring jopts_file_name, jlong jenv_handle, jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) { jboolean has_exception = JNI_FALSE; @@ -101,6 +131,36 @@ if (!s.ok()) { // error, raise an exception ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + } else { + build_column_family_descriptor_list(env, jcfds, cf_descs); + } +} + +/* + * Class: org_rocksdb_OptionsUtil + * Method: loadOptionsFromFile + * Signature: (JLjava/lang/String;JLjava/util/List;)V + */ +void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__JLjava_lang_String_2JLjava_util_List_2( + JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jopts_file_name, + jlong jdb_opts_handle, jobject jcfds) { + jboolean has_exception = JNI_FALSE; + auto opts_file_name = ROCKSDB_NAMESPACE::JniUtil::copyStdString( + env, jopts_file_name, &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return; + } + std::vector cf_descs; + auto* config_options = + reinterpret_cast(cfg_handle); + auto* db_options = + reinterpret_cast(jdb_opts_handle); + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadOptionsFromFile( + *config_options, opts_file_name, db_options, &cf_descs); + if (!s.ok()) { 
+ // error, raise an exception + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } else { build_column_family_descriptor_list(env, jcfds, cf_descs); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/portal.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/portal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/portal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/portal.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,18 +10,21 @@ #ifndef JAVA_ROCKSJNI_PORTAL_H_ #define JAVA_ROCKSJNI_PORTAL_H_ +#include + #include #include #include #include #include -#include #include #include +#include #include #include #include +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/rate_limiter.h" @@ -33,6 +36,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/compaction_filter_factory_jnicallback.h" #include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/event_listener_jnicallback.h" #include "rocksjni/loggerjnicallback.h" #include "rocksjni/table_filter_jnicallback.h" #include "rocksjni/trace_writer_jnicallback.h" @@ -222,7 +226,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -260,7 +264,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -325,7 +329,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -346,7 +350,7 @@ 
* @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSubCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -367,7 +371,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getStateMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -437,6 +441,10 @@ return jstatus; } + static jobject construct(JNIEnv* env, const Status* status) { + return construct(env, *status); + } + // Returns the equivalent org.rocksdb.Status.Code for the provided // C++ ROCKSDB_NAMESPACE::Status::Code enum static jbyte toJavaStatusCode(const ROCKSDB_NAMESPACE::Status::Code& code) { @@ -933,7 +941,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getStatusMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -1024,7 +1032,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getIteratorMethod(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1045,7 +1053,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getHasNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1065,7 +1073,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1086,7 +1094,7 @@ * @param env A pointer to the Java environment * * 
@return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayListConstructorMethodId(JNIEnv* env) { jclass jarray_list_clazz = getArrayListClass(env); @@ -1106,7 +1114,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1243,10 +1251,11 @@ * Get the Java Method: ByteBuffer#allocate * * @param env A pointer to the Java environment - * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or nullptr + * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or + * nullptr * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getAllocateMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1269,7 +1278,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1291,9 +1300,9 @@ return constructWith(env, direct, nullptr, capacity, jbytebuffer_clazz); } - static jobject constructWith( - JNIEnv* env, const bool direct, const char* buf, const size_t capacity, - jclass jbytebuffer_clazz = nullptr) { + static jobject constructWith(JNIEnv* env, const bool direct, const char* buf, + const size_t capacity, + jclass jbytebuffer_clazz = nullptr) { if (direct) { bool allocated = false; if (buf == nullptr) { @@ -1478,7 +1487,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2365,7 +2374,7 @@ 
* @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMapPutMethodId(JNIEnv* env) { jclass jlist_clazz = getJClass(env); @@ -2897,7 +2906,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2917,7 +2926,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2937,7 +2946,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2957,7 +2966,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2977,7 +2986,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2997,7 +3006,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3017,7 +3026,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved 
+ * be retrieved */ static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3037,7 +3046,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSingleDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3057,7 +3066,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3077,7 +3086,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3097,7 +3106,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogDataMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3117,7 +3126,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3137,7 +3146,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3157,7 +3166,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) { jclass jclazz = 
getJClass(env); @@ -3177,7 +3186,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkNoopMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3197,7 +3206,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkRollbackMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3217,7 +3226,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkCommitMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3232,12 +3241,33 @@ } /** + * Get the Java Method: WriteBatch.Handler#markCommitWithTimestamp + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMarkCommitWithTimestampMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "markCommitWithTimestamp", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** * Get the Java Method: WriteBatch.Handler#shouldContinue * * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getContinueMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3273,7 +3303,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3364,7 
+3394,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3460,6 +3490,19 @@ : public RocksDBNativeClass { public: + static jobject fromCppColumnFamilyHandle( + JNIEnv* env, const ROCKSDB_NAMESPACE::ColumnFamilyHandle* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, reinterpret_cast(info)); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } + /** * Get the Java Class org.rocksdb.ColumnFamilyHandle * @@ -3540,7 +3583,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3561,7 +3604,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3628,7 +3671,7 @@ * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCompareInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3645,7 +3688,7 @@ * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3662,7 +3705,7 @@ * @param 
jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3698,7 +3741,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3995,7 +4038,7 @@ * @param env A pointer to the Java environment * * @return The Java Field ID or nullptr if the class or field id could not - * be retieved + * be retrieved */ static jfieldID getWriteEntryField(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4316,7 +4359,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4854,7 +4897,7 @@ case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND: return 0x5E; case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED: - // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX. + // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). 
return -0x01; case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED: return 0x60; @@ -4944,8 +4987,73 @@ return -0x0C; case ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN: return -0x0D; + case ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH: + return -0x0E; + case ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY: + return -0X0F; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED: + return -0x10; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC: + return -0x11; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL: + return -0x12; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED: + return -0x13; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC: + return -0x14; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL: + return -0x15; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT: + return -0x16; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT: + return -0x17; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT: + return -0x18; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT: + return -0x19; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT: + return -0x1A; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT: + return -0x1B; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH: + return -0x1C; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH: + return -0x1D; + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS: + return -0x1E; + case ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES: + return -0x1F; + case ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES: + return -0x20; + case ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES: + return -0x21; + case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES: + return -0x22; + case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES: + return -0x23; + case 
ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES: + return -0x24; + case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES: + return -0x25; + case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES: + return -0x26; + case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT: + return -0x27; + case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT: + return -0x28; + case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT: + return -0x29; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: - // 0x5F for backwards compatibility on current minor version. + // 0x5F was the max value in the initial copy of tickers to Java. + // Since these values are exposed directly to Java clients, we keep + // the value the same forever. + // + // TODO: This particular case seems confusing and unnecessary to pin the + // value since it's meant to be the number of tickers, not an actual + // ticker value. But we aren't yet in a position to fix it since the + // number of tickers doesn't fit in the Java representation (jbyte). return 0x5F; default: // undefined/default @@ -5148,7 +5256,7 @@ case 0x5E: return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND; case -0x01: - // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX. + // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). 
return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED; case 0x60: return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED; @@ -5239,8 +5347,74 @@ return ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; case -0x0D: return ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN; + case -0x0E: + return ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH; + case -0x0F: + return ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY; + case -0x10: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED; + case -0x11: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC; + case -0x12: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL; + case -0x13: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED; + case -0x14: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC; + case -0x15: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL; + case -0x16: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT; + case -0x17: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT; + case -0x18: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT; + case -0x19: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT; + case -0x1A: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT; + case -0x1B: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT; + case -0x1C: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH; + case -0x1D: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + case -0x1E: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS; + case -0x1F: + return ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES; + case -0x20: + return ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES; + case -0x21: + return ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES; + case -0x22: + return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES; + case -0x23: + return 
ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES; + case -0x24: + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES; + case -0x25: + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES; + case -0x26: + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES; + case -0x27: + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT; + case -0x28: + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT; + case -0x29: + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT; case 0x5F: - // 0x5F for backwards compatibility on current minor version. + // 0x5F was the max value in the initial copy of tickers to Java. + // Since these values are exposed directly to Java clients, we keep + // the value the same forever. + // + // TODO: This particular case seems confusing and unnecessary to pin the + // value since it's meant to be the number of tickers, not an actual + // ticker value. But we aren't yet in a position to fix it since the + // number of tickers doesn't fit in the Java representation (jbyte). return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX; default: @@ -5351,6 +5525,15 @@ return 0x2D; case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS: return 0x2E; + case ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL: + return 0x2F; + case ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL: + return 0x30; + case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: + return 0x31; + case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT: + return 0x31; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. 
return 0x1F; @@ -5458,6 +5641,16 @@ return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS; case 0x2E: return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS; + case 0x2F: + return ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL; + case 0x30: + return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL; + case 0x31: + return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; + case 0x32: + return ROCKSDB_NAMESPACE::Histograms:: + ERROR_HANDLER_AUTORESUME_RETRY_COUNT; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -5650,7 +5843,8 @@ return nullptr; } - jlong *body = env->GetLongArrayElements(jtransaction_ids, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(jtransaction_ids, &is_copy); if(body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jkey); @@ -5660,7 +5854,8 @@ for(size_t i = 0; i < len; ++i) { body[i] = static_cast(transaction_ids[i]); } - env->ReleaseLongArrayElements(jtransaction_ids, body, 0); + env->ReleaseLongArrayElements(jtransaction_ids, body, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); jobject jwaiting_transactions = env->CallObjectMethod(jtransaction, mid, static_cast(column_family_id), jkey, jtransaction_ids); @@ -5931,7 +6126,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFilterMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -5971,7 +6166,11 @@ return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "", "(JJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/util/Map;Ljava/util/Map;Ljava/util/Map;)V"); + jmethodID mid = env->GetMethodID( + jclazz, "", + "(JJJJJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/" + "lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;Ljava/util/Map;Ljava/util/Map;)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -6080,25 +6279,8 @@ return nullptr; } - // Map - jobject jproperties_offsets = ROCKSDB_NAMESPACE::HashMapJni::fromCppMap( - env, &table_properties.properties_offsets); - if (env->ExceptionCheck()) { - // exception occurred creating java map - env->DeleteLocalRef(jcolumn_family_name); - env->DeleteLocalRef(jfilter_policy_name); - env->DeleteLocalRef(jcomparator_name); - env->DeleteLocalRef(jmerge_operator_name); - env->DeleteLocalRef(jprefix_extractor_name); - env->DeleteLocalRef(jproperty_collectors_names); - env->DeleteLocalRef(jcompression_name); - env->DeleteLocalRef(juser_collected_properties); - env->DeleteLocalRef(jreadable_properties); - return nullptr; - } - - jobject jtable_properties = env->NewObject(jclazz, mid, - static_cast(table_properties.data_size), + jobject jtable_properties = env->NewObject( + jclazz, mid, static_cast(table_properties.data_size), static_cast(table_properties.index_size), static_cast(table_properties.index_partitions), 
static_cast(table_properties.top_level_index_size), @@ -6117,17 +6299,16 @@ static_cast(table_properties.column_family_id), static_cast(table_properties.creation_time), static_cast(table_properties.oldest_key_time), - jcolumn_family_name, - jfilter_policy_name, - jcomparator_name, - jmerge_operator_name, - jprefix_extractor_name, - jproperty_collectors_names, - jcompression_name, - juser_collected_properties, - jreadable_properties, - jproperties_offsets - ); + static_cast( + table_properties.slow_compression_estimated_data_size), + static_cast( + table_properties.fast_compression_estimated_data_size), + static_cast( + table_properties.external_sst_file_global_seqno_offset), + jcolumn_family_name, jfilter_policy_name, jcomparator_name, + jmerge_operator_name, jprefix_extractor_name, + jproperty_collectors_names, jcompression_name, + juser_collected_properties, jreadable_properties); if (env->ExceptionCheck()) { return nullptr; @@ -6201,7 +6382,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6221,7 +6402,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6365,6 +6546,51 @@ } }; +// The portal class for org.rocksdb.IndexShorteningMode +class IndexShorteningModeJni { + public: + // Returns the equivalent org.rocksdb.IndexShorteningMode for the provided + // C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum + static jbyte toJavaIndexShorteningMode( + const ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode& + index_shortening_mode) { + switch (index_shortening_mode) { + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + 
kNoShortening: + return 0x0; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators: + return 0x1; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum for + // the provided Java org.rocksdb.IndexShorteningMode + static ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode + toCppIndexShorteningMode(jbyte jindex_shortening_mode) { + switch (jindex_shortening_mode) { + case 0x0: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kNoShortening; + case 0x1: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators; + case 0x2: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor; + default: + // undefined/default + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators; + } + } +}; + // The portal class for org.rocksdb.Priority class PriorityJni { public: @@ -6670,7 +6896,8 @@ env->DeleteLocalRef(jcf_name); return nullptr; } - jlong *body = env->GetLongArrayElements(joperation_properties, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(joperation_properties, &is_copy); if (body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jdb_name); @@ -6681,7 +6908,8 @@ for (size_t i = 0; i < len; ++i) { body[i] = static_cast(thread_status->op_properties[i]); } - env->ReleaseLongArrayElements(joperation_properties, body, 0); + env->ReleaseLongArrayElements(joperation_properties, body, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); jobject jcfd = env->NewObject(jclazz, mid, static_cast(thread_status->thread_id), @@ -6829,6 +7057,10 @@ return ROCKSDB_NAMESPACE::CompactionReason::kFlush; case 0x0D: return ROCKSDB_NAMESPACE::CompactionReason::kExternalSstIngestion; + case 0x0E: + return ROCKSDB_NAMESPACE::CompactionReason::kPeriodicCompaction; + case 0x0F: + return ROCKSDB_NAMESPACE::CompactionReason::kChangeTemperature; default: // undefined/default return ROCKSDB_NAMESPACE::CompactionReason::kUnknown; @@ -7302,7 +7534,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getWriteProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7323,7 +7555,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCloseWriterProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7344,7 +7576,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getGetFileSizeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7385,7 +7617,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7407,7 +7639,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7428,7 +7660,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or 
method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7530,5 +7762,796 @@ } } }; +// The portal class for org.rocksdb.SanityLevel +class SanityLevelJni { + public: + // Returns the equivalent org.rocksdb.SanityLevel for the provided + // C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum + static jbyte toJavaSanityLevel( + const ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel &sanity_level) { + switch (sanity_level) { + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel::kSanityLevelNone: + return 0x0; + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel:: + kSanityLevelLooselyCompatible: + return 0x1; + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel:: + kSanityLevelExactMatch: + return -0x01; + default: + return -0x01; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum for + // the provided Java org.rocksdb.SanityLevel + static ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel toCppSanityLevel( + jbyte sanity_level) { + switch (sanity_level) { + case 0x0: + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelNone; + case 0x1: + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelLooselyCompatible; + default: + // undefined/default + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelExactMatch; + } + } +}; + +// The portal class for org.rocksdb.AbstractListener.EnabledEventCallback +class EnabledEventCallbackJni { + public: + // Returns the set of equivalent C++ + // ROCKSDB_NAMESPACE::EnabledEventCallbackJni::EnabledEventCallback enums for + // the provided Java jenabled_event_callback_values + static std::set toCppEnabledEventCallbacks( + jlong jenabled_event_callback_values) { + std::set enabled_event_callbacks; + for (size_t i = 0; i < EnabledEventCallback::NUM_ENABLED_EVENT_CALLBACK; + ++i) { + if (((1ULL << i) & jenabled_event_callback_values) > 0) { + enabled_event_callbacks.emplace(static_cast(i)); + } + } + 
return enabled_event_callbacks; + } +}; + +// The portal class for org.rocksdb.AbstractEventListener +class AbstractEventListenerJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::EventListenerJniCallback*, + AbstractEventListenerJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractEventListener + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractEventListener"); + } + + /** + * Get the Java Method: AbstractEventListener#onFlushCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushCompletedProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFlushBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileDeleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileDeletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileDeleted", 
"(Lorg/rocksdb/TableFileDeletionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionBeginProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionCompletedProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreated + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreatedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileCreated", "(Lorg/rocksdb/TableFileCreationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreationStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreationStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onTableFileCreationStarted", + "(Lorg/rocksdb/TableFileCreationBriefInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java 
Method: AbstractEventListener#onMemTableSealed + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnMemTableSealedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onMemTableSealed", + "(Lorg/rocksdb/MemTableInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: + * AbstractEventListener#onColumnFamilyHandleDeletionStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnColumnFamilyHandleDeletionStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onColumnFamilyHandleDeletionStarted", + "(Lorg/rocksdb/ColumnFamilyHandle;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onExternalFileIngestedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnExternalFileIngestedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onExternalFileIngestedProxy", + "(JLorg/rocksdb/ExternalFileIngestionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onBackgroundError + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnBackgroundErrorProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onBackgroundErrorProxy", + "(BLorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onStallConditionsChanged + * + * @param env A pointer to the Java environment + * + 
* @return The Java Method ID + */ + static jmethodID getOnStallConditionsChangedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onStallConditionsChanged", + "(Lorg/rocksdb/WriteStallInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileReadFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileReadFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileReadFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileWriteFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileWriteFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileWriteFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileFlushFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileFlushFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileFlushFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = 
env->GetMethodID( + jclazz, "onFileSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileRangeSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileRangeSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileRangeSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileTruncateFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileTruncateFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileTruncateFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileCloseFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileCloseFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileCloseFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#shouldBeNotifiedOnFileIO + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getShouldBeNotifiedOnFileIOMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "shouldBeNotifiedOnFileIO", "()Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: 
AbstractEventListener#onErrorRecoveryBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryBeginProxy", + "(BLorg/rocksdb/Status;)Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onErrorRecoveryCompleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryCompletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryCompleted", + "(Lorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } +}; + +class FlushJobInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.FlushJobInfo object. 
+ * + * @param env A pointer to the Java environment + * @param flush_job_info A Cpp flush job info object + * + * @return A reference to a Java org.rocksdb.FlushJobInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppFlushJobInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FlushJobInfo* flush_job_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &flush_job_info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &flush_job_info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfile_path); + return nullptr; + } + jobject jtable_properties = TablePropertiesJni::fromCppTableProperties( + env, flush_job_info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jfile_path); + return nullptr; + } + return env->NewObject( + jclazz, ctor, static_cast(flush_job_info->cf_id), jcf_name, + jfile_path, static_cast(flush_job_info->thread_id), + static_cast(flush_job_info->job_id), + static_cast(flush_job_info->triggered_writes_slowdown), + static_cast(flush_job_info->triggered_writes_stop), + static_cast(flush_job_info->smallest_seqno), + static_cast(flush_job_info->largest_seqno), jtable_properties, + static_cast(flush_job_info->flush_reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FlushJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(JLjava/lang/String;Ljava/lang/String;JIZZJJLorg/" + "rocksdb/TableProperties;B)V"); + } +}; + +class TableFileDeletionInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.TableFileDeletionInfo object. 
+ * + * @param env A pointer to the Java environment + * @param file_del_info A Cpp table file deletion info object + * + * @return A reference to a Java org.rocksdb.TableFileDeletionInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppTableFileDeletionInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::TableFileDeletionInfo* file_del_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &file_del_info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, file_del_info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, + JniUtil::toJavaString(env, &file_del_info->file_path), + static_cast(file_del_info->job_id), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileDeletionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;ILorg/rocksdb/Status;)V"); + } +}; + +class CompactionJobInfoJni : public JavaClass { + public: + static jobject fromCppCompactionJobInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::CompactionJobInfo* compaction_job_info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, + reinterpret_cast(compaction_job_info)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/CompactionJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } +}; + 
+class TableFileCreationInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jtable_properties); + return nullptr; + } + return env->NewObject(jclazz, ctor, static_cast(info->file_size), + jtable_properties, jstatus, jdb_name, jcf_name, + jfile_path, static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(JLorg/rocksdb/TableProperties;Lorg/rocksdb/Status;Ljava/lang/" + "String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class TableFileCreationBriefInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationBriefInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationBriefInfo* info) { + 
jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, jcf_name, jfile_path, + static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationBriefInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class MemTableInfoJni : public JavaClass { + public: + static jobject fromCppMemTableInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::MemTableInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->first_seqno), + static_cast(info->earliest_seqno), + static_cast(info->num_entries), + static_cast(info->num_deletes)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/MemTableInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;JJJJ)V"); + } +}; + +class ExternalFileIngestionInfoJni : public JavaClass { + public: 
+ static jobject fromCppExternalFileIngestionInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::ExternalFileIngestionInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jexternal_file_path = + JniUtil::toJavaString(env, &info->external_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jstring jinternal_file_path = + JniUtil::toJavaString(env, &info->internal_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + env->DeleteLocalRef(jinternal_file_path); + return nullptr; + } + return env->NewObject( + jclazz, ctor, jcf_name, jexternal_file_path, jinternal_file_path, + static_cast(info->global_seqno), jtable_properties); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ExternalFileIngestionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;JLorg/rocksdb/TableProperties;)V"); + } +}; + +class WriteStallInfoJni : public JavaClass { + public: + static jobject fromCppWriteStallInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::WriteStallInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; 
+ } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->condition.cur), + static_cast(info->condition.prev)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WriteStallInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;BB)V"); + } +}; + +class FileOperationInfoJni : public JavaClass { + public: + static jobject fromCppFileOperationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FileOperationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jpath = JniUtil::toJavaString(env, &info->path); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jpath); + return nullptr; + } + return env->NewObject( + jclazz, ctor, jpath, static_cast(info->offset), + static_cast(info->length), + static_cast(info->start_ts.time_since_epoch().count()), + static_cast(info->duration.count()), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FileOperationInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;JJJJLorg/rocksdb/Status;)V"); + } +}; } // namespace ROCKSDB_NAMESPACE #endif // JAVA_ROCKSJNI_PORTAL_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc 2025-05-19 16:14:27.000000000 +0000 @@ -27,5 +27,4 @@ // I think this is okay, as 
Comparator and JniCallback both have virtual // destructors... delete reinterpret_cast(handle); - // @lint-ignore TXT4 T25377293 Grandfathered in -} \ No newline at end of file +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #include #include #include + #include #include #include @@ -22,6 +23,7 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/types.h" +#include "rocksdb/version.h" #include "rocksjni/portal.h" #ifdef min @@ -70,15 +72,19 @@ /* * Class: org_rocksdb_RocksDB * Method: openROnly - * Signature: (JLjava/lang/String;)J + * Signature: (JLjava/lang/String;Z)J */ -jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( - JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { +jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Z( + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, + jboolean jerror_if_wal_file_exists) { + const bool error_if_wal_file_exists = jerror_if_wal_file_exists == JNI_TRUE; return rocksdb_open_helper( env, jopt_handle, jdb_path, - [](const ROCKSDB_NAMESPACE::Options& options, const std::string& db_path, - ROCKSDB_NAMESPACE::DB** db) { - return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db); + [error_if_wal_file_exists](const ROCKSDB_NAMESPACE::Options& options, + const std::string& db_path, + ROCKSDB_NAMESPACE::DB** db) { + return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db, + error_if_wal_file_exists); }); } @@ -170,21 +176,25 @@ /* * Class: org_rocksdb_RocksDB * Method: openROnly - * Signature: (JLjava/lang/String;[[B[J)[J + * Signature: (JLjava/lang/String;[[B[JZ)[J */ -jlongArray 
Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3J( +jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3JZ( JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, - jobjectArray jcolumn_names, jlongArray jcolumn_options) { + jobjectArray jcolumn_names, jlongArray jcolumn_options, + jboolean jerror_if_wal_file_exists) { + const bool error_if_wal_file_exists = jerror_if_wal_file_exists == JNI_TRUE; return rocksdb_open_helper( env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options, - [](const ROCKSDB_NAMESPACE::DBOptions& options, - const std::string& db_path, - const std::vector& - column_families, - std::vector* handles, - ROCKSDB_NAMESPACE::DB** db) { + [error_if_wal_file_exists]( + const ROCKSDB_NAMESPACE::DBOptions& options, + const std::string& db_path, + const std::vector& + column_families, + std::vector* handles, + ROCKSDB_NAMESPACE::DB** db) { return ROCKSDB_NAMESPACE::DB::OpenForReadOnly( - options, db_path, column_families, handles, db); + options, db_path, column_families, handles, db, + error_if_wal_file_exists); }); } @@ -208,6 +218,72 @@ /* * Class: org_rocksdb_RocksDB + * Method: openAsSecondary + * Signature: (JLjava/lang/String;Ljava/lang/String;)J + */ +jlong Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_2( + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, + jstring jsecondary_db_path) { + const char* secondary_db_path = + env->GetStringUTFChars(jsecondary_db_path, nullptr); + if (secondary_db_path == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + + jlong db_handle = rocksdb_open_helper( + env, jopt_handle, jdb_path, + [secondary_db_path](const ROCKSDB_NAMESPACE::Options& options, + const std::string& db_path, + ROCKSDB_NAMESPACE::DB** db) { + return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(options, db_path, + secondary_db_path, db); + }); + + // we have now finished with secondary_db_path + env->ReleaseStringUTFChars(jsecondary_db_path, 
secondary_db_path); + + return db_handle; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: openAsSecondary + * Signature: (JLjava/lang/String;Ljava/lang/String;[[B[J)[J + */ +jlongArray +Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_2_3_3B_3J( + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, + jstring jsecondary_db_path, jobjectArray jcolumn_names, + jlongArray jcolumn_options) { + const char* secondary_db_path = + env->GetStringUTFChars(jsecondary_db_path, nullptr); + if (secondary_db_path == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + jlongArray jhandles = rocksdb_open_helper( + env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options, + [secondary_db_path]( + const ROCKSDB_NAMESPACE::DBOptions& options, + const std::string& db_path, + const std::vector& + column_families, + std::vector* handles, + ROCKSDB_NAMESPACE::DB** db) { + return ROCKSDB_NAMESPACE::DB::OpenAsSecondary( + options, db_path, secondary_db_path, column_families, handles, db); + }); + + // we have now finished with secondary_db_path + env->ReleaseStringUTFChars(jsecondary_db_path, secondary_db_path); + + return jhandles; +} + +/* + * Class: org_rocksdb_RocksDB * Method: disposeInternal * Signature: (J)V */ @@ -345,8 +421,8 @@ std::vector cf_descriptors; cf_descriptors.reserve(jlen); - jboolean jcf_options_handles_is_copy = JNI_FALSE; - jlong *jcf_options_handles_elems = env->GetLongArrayElements(jcf_options_handles, &jcf_options_handles_is_copy); + jlong* jcf_options_handles_elems = + env->GetLongArrayElements(jcf_options_handles, nullptr); if(jcf_options_handles_elems == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -1600,34 +1676,37 @@ } } -inline void multi_get_helper_release_keys( - JNIEnv* env, std::vector>& keys_to_free) { +inline void multi_get_helper_release_keys(std::vector& keys_to_free) { auto end = keys_to_free.end(); for (auto it = keys_to_free.begin(); it != end; ++it) { - 
delete[] it->first; - env->DeleteLocalRef(it->second); + delete[] * it; } keys_to_free.clear(); } /** - * cf multi get + * @brief fill a native array of cf handles from java handles * - * @return byte[][] of values or nullptr if an exception occurs - */ -jobjectArray multi_get_helper(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::ReadOptions& rOpt, - jobjectArray jkeys, jintArray jkey_offs, - jintArray jkey_lens, - jlongArray jcolumn_family_handles) { - std::vector cf_handles; + * @param env + * @param cf_handles to fill from the java variants + * @param jcolumn_family_handles + * @return true if the copy succeeds + * @return false if a JNI exception is generated + */ +inline bool cf_handles_from_jcf_handles( + JNIEnv* env, + std::vector& cf_handles, + jlongArray jcolumn_family_handles) { if (jcolumn_family_handles != nullptr) { const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); if (jcfh == nullptr) { // exception thrown: OutOfMemoryError - return nullptr; + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, + "Insufficient Memory for CF handle array."); + return false; } for (jsize i = 0; i < len_cols; i++) { @@ -1637,36 +1716,53 @@ } env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); } + return true; +} - const jsize len_keys = env->GetArrayLength(jkeys); - if (env->EnsureLocalCapacity(len_keys) != 0) { - // exception thrown: OutOfMemoryError - return nullptr; - } - +/** + * @brief copy keys from JNI into vector of slices for Rocks API + * + * @param keys to instantiate + * @param jkeys + * @param jkey_offs + * @param jkey_lens + * @return true if the copy succeeds + * @return false if a JNI exception is raised + */ +inline bool keys_from_jkeys(JNIEnv* env, + std::vector& keys, + std::vector& keys_to_free, + jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens) { 
jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr); if (jkey_off == nullptr) { // exception thrown: OutOfMemoryError - return nullptr; + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array."); + return false; } jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr); if (jkey_len == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - return nullptr; + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array."); + return false; } - std::vector keys; - std::vector> keys_to_free; + const jsize len_keys = env->GetArrayLength(jkeys); for (jsize i = 0; i < len_keys; i++) { jobject jkey = env->GetObjectArrayElement(jkeys, i); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - multi_get_helper_release_keys(env, keys_to_free); - return nullptr; + multi_get_helper_release_keys(keys_to_free); + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, + "Insufficient Memory for key object array."); + return false; } jbyteArray jkey_ba = reinterpret_cast(jkey); @@ -1680,20 +1776,86 @@ env->DeleteLocalRef(jkey); env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - multi_get_helper_release_keys(env, keys_to_free); - return nullptr; + multi_get_helper_release_keys(keys_to_free); + jclass exception_cls = + (env)->FindClass("java/lang/ArrayIndexOutOfBoundsException"); + (env)->ThrowNew(exception_cls, "Invalid byte array region index."); + return false; } ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), len_key); keys.push_back(key_slice); - 
keys_to_free.push_back(std::pair(key, jkey)); + env->DeleteLocalRef(jkey); + keys_to_free.push_back(key); } // cleanup jkey_off and jken_len env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); + return true; +} + +inline bool keys_from_bytebuffers(JNIEnv* env, + std::vector& keys, + jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens) { + jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr); + if (jkey_off == nullptr) { + // exception thrown: OutOfMemoryError + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array."); + return false; + } + + jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr); + if (jkey_len == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array."); + return false; + } + + const jsize len_keys = env->GetArrayLength(jkeys); + for (jsize i = 0; i < len_keys; i++) { + jobject jkey = env->GetObjectArrayElement(jkeys, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return false; + } + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + ROCKSDB_NAMESPACE::Slice key_slice(key + jkey_off[i], jkey_len[i]); + keys.push_back(key_slice); + + env->DeleteLocalRef(jkey); + } + return true; +} + +/** + * cf multi get + * + * @return byte[][] of values or nullptr if an + * exception occurs + */ +jobjectArray multi_get_helper(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, + const ROCKSDB_NAMESPACE::ReadOptions& rOpt, + jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens, + jlongArray jcolumn_family_handles) { + std::vector cf_handles; + if (!cf_handles_from_jcf_handles(env, cf_handles, 
jcolumn_family_handles)) { + return nullptr; + } + + std::vector keys; + std::vector keys_to_free; + if (!keys_from_jkeys(env, keys, keys_to_free, jkeys, jkey_offs, jkey_lens)) { + return nullptr; + } + std::vector values; std::vector s; if (cf_handles.size() == 0) { @@ -1703,22 +1865,18 @@ } // free up allocated byte arrays - multi_get_helper_release_keys(env, keys_to_free); + multi_get_helper_release_keys(keys_to_free); // prepare the results jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray( env, static_cast(s.size())); if (jresults == nullptr) { // exception occurred + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for results."); return nullptr; } - // TODO(AR) it is not clear to me why EnsureLocalCapacity is needed for the - // loop as we cleanup references with env->DeleteLocalRef(jentry_value); - if (env->EnsureLocalCapacity(static_cast(s.size())) != 0) { - // exception thrown: OutOfMemoryError - return nullptr; - } // add to the jresults for (std::vector::size_type i = 0; i != s.size(); i++) { @@ -1735,14 +1893,16 @@ jentry_value, 0, static_cast(jvalue_len), const_cast(reinterpret_cast(value->c_str()))); if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException + // exception thrown: + // ArrayIndexOutOfBoundsException env->DeleteLocalRef(jentry_value); return nullptr; } env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException + // exception thrown: + // ArrayIndexOutOfBoundsException env->DeleteLocalRef(jentry_value); return nullptr; } @@ -1754,14 +1914,129 @@ return jresults; } +/** + * cf multi get + * + * fill supplied native buffers, or raise JNI + * exception on a problem + */ + +/** + * @brief multi_get_helper_direct for fast-path multiget (io_uring) on Linux + * + * @param env + * @param db + * @param rOpt read options + * @param 
jcolumn_family_handles 0, 1, or n column family handles + * @param jkeys + * @param jkey_offsets + * @param jkey_lengths + * @param jvalues byte buffers to receive values + * @param jvalue_sizes returned actual sizes of data values for keys + * @param jstatuses returned java RocksDB status values for per key + */ +void multi_get_helper_direct(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, + const ROCKSDB_NAMESPACE::ReadOptions& rOpt, + jlongArray jcolumn_family_handles, + jobjectArray jkeys, jintArray jkey_offsets, + jintArray jkey_lengths, jobjectArray jvalues, + jintArray jvalue_sizes, jobjectArray jstatuses) { + const jsize num_keys = env->GetArrayLength(jkeys); + + std::vector keys; + if (!keys_from_bytebuffers(env, keys, jkeys, jkey_offsets, jkey_lengths)) { + return; + } + + std::vector values(num_keys); + + std::vector cf_handles; + if (!cf_handles_from_jcf_handles(env, cf_handles, jcolumn_family_handles)) { + return; + } + + std::vector s(num_keys); + if (cf_handles.size() == 0) { + // we can use the more efficient call here + auto cf_handle = db->DefaultColumnFamily(); + db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(), + s.data()); + } else if (cf_handles.size() == 1) { + // we can use the more efficient call here + auto cf_handle = cf_handles[0]; + db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(), + s.data()); + } else { + // multiple CFs version + db->MultiGet(rOpt, num_keys, cf_handles.data(), keys.data(), values.data(), + s.data()); + } + + // prepare the results + jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray( + env, static_cast(s.size())); + if (jresults == nullptr) { + // exception occurred + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for results."); + return; + } + + std::vector value_size; + for (int i = 0; i < num_keys; i++) { + auto jstatus = ROCKSDB_NAMESPACE::StatusJni::construct(env, s[i]); + if 
(jstatus == nullptr) { + // exception in context + return; + } + env->SetObjectArrayElement(jstatuses, i, jstatus); + + if (s[i].ok()) { + jobject jvalue_bytebuf = env->GetObjectArrayElement(jvalues, i); + if (env->ExceptionCheck()) { + // ArrayIndexOutOfBoundsException is thrown + return; + } + jlong jvalue_capacity = env->GetDirectBufferCapacity(jvalue_bytebuf); + if (jvalue_capacity == -1) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value(s) argument (argument is not a valid direct " + "ByteBuffer)"); + return; + } + void* jvalue_address = env->GetDirectBufferAddress(jvalue_bytebuf); + if (jvalue_address == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value(s) argument (argument is not a valid direct " + "ByteBuffer)"); + return; + } + + // record num returned, push back that number, which may be bigger then + // the ByteBuffer supplied. then copy as much as fits in the ByteBuffer. + value_size.push_back(static_cast(values[i].size())); + auto copy_bytes = + std::min(static_cast(values[i].size()), jvalue_capacity); + memcpy(jvalue_address, values[i].data(), copy_bytes); + } else { + // bad status for this + value_size.push_back(0); + } + } + + env->SetIntArrayRegion(jvalue_sizes, 0, num_keys, value_size.data()); +} + /* * Class: org_rocksdb_RocksDB * Method: multiGet * Signature: (J[[B[I[I)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens) { + JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, + jintArray jkey_offs, jintArray jkey_lens) { return multi_get_helper( env, jdb, reinterpret_cast(jdb_handle), ROCKSDB_NAMESPACE::ReadOptions(), jkeys, jkey_offs, jkey_lens, nullptr); @@ -1773,8 +2048,8 @@ * Signature: (J[[B[I[I[J)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I_3J( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jobjectArray jkeys, 
jintArray jkey_offs, jintArray jkey_lens, + JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, + jintArray jkey_offs, jintArray jkey_lens, jlongArray jcolumn_family_handles) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), @@ -1811,38 +2086,60 @@ jkey_offs, jkey_lens, jcolumn_family_handles); } +/* + * Class: org_rocksdb_RocksDB + * Method: multiGet + * Signature: + * (JJ[J[Ljava/nio/ByteBuffer;[I[I[Ljava/nio/ByteBuffer;[I[Lorg/rocksdb/Status;)V + */ +void Java_org_rocksdb_RocksDB_multiGet__JJ_3J_3Ljava_nio_ByteBuffer_2_3I_3I_3Ljava_nio_ByteBuffer_2_3I_3Lorg_rocksdb_Status_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jlongArray jcolumn_family_handles, jobjectArray jkeys, + jintArray jkey_offsets, jintArray jkey_lengths, jobjectArray jvalues, + jintArray jvalues_sizes, jobjectArray jstatus_objects) { + return multi_get_helper_direct( + env, jdb, reinterpret_cast(jdb_handle), + *reinterpret_cast(jropt_handle), + jcolumn_family_handles, jkeys, jkey_offsets, jkey_lengths, jvalues, + jvalues_sizes, jstatus_objects); +} +// private native void +// multiGet(final long dbHandle, final long rOptHandle, +// final long[] columnFamilyHandles, final ByteBuffer[] keysArray, +// final ByteBuffer[] valuesArray); + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::DB::KeyMayExist bool key_may_exist_helper(JNIEnv* env, jlong jdb_handle, jlong jcf_handle, - jlong jread_opts_handle, - jbyteArray jkey, jint jkey_offset, jint jkey_len, - bool* has_exception, std::string* value, bool* value_found) { + jlong jread_opts_handle, jbyteArray jkey, + jint jkey_offset, jint jkey_len, bool* has_exception, + std::string* value, bool* value_found) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { cf_handle = db->DefaultColumnFamily(); - } else { - cf_handle = - reinterpret_cast(jcf_handle); - } - ROCKSDB_NAMESPACE::ReadOptions 
read_opts = - jread_opts_handle == 0 - ? ROCKSDB_NAMESPACE::ReadOptions() - : *(reinterpret_cast( - jread_opts_handle)); - - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_offset, jkey_len, key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] key; - *has_exception = true; - return false; + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + ROCKSDB_NAMESPACE::ReadOptions read_opts = + jread_opts_handle == 0 + ? ROCKSDB_NAMESPACE::ReadOptions() + : *(reinterpret_cast( + jread_opts_handle)); + + jbyte* key = new jbyte[jkey_len]; + env->GetByteArrayRegion(jkey, jkey_offset, jkey_len, key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] key; + *has_exception = true; + return false; } ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - const bool exists = db->KeyMayExist( - read_opts, cf_handle, key_slice, value, value_found); + const bool exists = + db->KeyMayExist(read_opts, cf_handle, key_slice, value, value_found); // cleanup delete[] key; @@ -1850,6 +2147,49 @@ return exists; } +bool key_may_exist_direct_helper(JNIEnv* env, jlong jdb_handle, + jlong jcf_handle, jlong jread_opts_handle, + jobject jkey, jint jkey_offset, jint jkey_len, + bool* has_exception, std::string* value, + bool* value_found) { + auto* db = reinterpret_cast(jdb_handle); + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + ROCKSDB_NAMESPACE::ReadOptions read_opts = + jread_opts_handle == 0 + ? 
ROCKSDB_NAMESPACE::ReadOptions() + : *(reinterpret_cast( + jread_opts_handle)); + + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + if (key == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid key argument (argument is not a valid direct ByteBuffer)"); + *has_exception = true; + return false; + } + if (env->GetDirectBufferCapacity(jkey) < (jkey_offset + jkey_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid key argument. Capacity is less than requested region (offset " + "+ length)."); + *has_exception = true; + return false; + } + + ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); + + const bool exists = + db->KeyMayExist(read_opts, cf_handle, key_slice, value, value_found); + + return exists; +} /* * Class: org_rocksdb_RocksDB @@ -1880,22 +2220,114 @@ /* * Class: org_rocksdb_RocksDB + * Method: keyMayExistDirect + * Signature: (JJJLjava/nio/ByteBuffer;II)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExistDirect( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len) { + bool has_exception = false; + std::string value; + bool value_found = false; + + const bool exists = key_may_exist_direct_helper( + env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset, + jkey_len, &has_exception, &value, &value_found); + if (has_exception) { + // java exception already raised + return false; + } + + return static_cast(exists); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExistDirectFoundValue + * Signature: + * (JJJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)[J + */ +jintArray Java_org_rocksdb_RocksDB_keyMayExistDirectFoundValue( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len, + jobject jval, jint jval_offset, jint jval_len) { + char* val_buffer = reinterpret_cast(env->GetDirectBufferAddress(jval)); + if 
(val_buffer == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value argument (argument is not a valid direct ByteBuffer)"); + return nullptr; + } + + if (env->GetDirectBufferCapacity(jval) < (jval_offset + jval_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value argument. Capacity is less than requested region " + "(offset + length)."); + return nullptr; + } + + bool has_exception = false; + std::string cvalue; + bool value_found = false; + + const bool exists = key_may_exist_direct_helper( + env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset, + jkey_len, &has_exception, &cvalue, &value_found); + + if (has_exception) { + // java exception already raised + return nullptr; + } + + const jint cvalue_len = static_cast(cvalue.size()); + const jint length = std::min(jval_len, cvalue_len); + memcpy(val_buffer + jval_offset, cvalue.c_str(), length); + + // keep consistent with java KeyMayExistEnum.values() + const int kNotExist = 0; + const int kExistsWithoutValue = 1; + const int kExistsWithValue = 2; + + // TODO fix return value/type + // exists/value_found/neither + // cvalue_len + jintArray jresult = env->NewIntArray(2); + const jint jexists = + exists ? (value_found ? 
kExistsWithValue : kExistsWithoutValue) + : kNotExist; + + env->SetIntArrayRegion(jresult, 0, 1, &jexists); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresult); + return nullptr; + } + env->SetIntArrayRegion(jresult, 1, 1, &cvalue_len); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresult); + return nullptr; + } + + return jresult; +} + +/* + * Class: org_rocksdb_RocksDB * Method: keyMayExistFoundValue * Signature: (JJJ[BII)[[B */ jobjectArray Java_org_rocksdb_RocksDB_keyMayExistFoundValue( JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, - jlong jread_opts_handle, - jbyteArray jkey, jint jkey_offset, jint jkey_len) { - + jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, jint jkey_len) { bool has_exception = false; std::string value; bool value_found = false; const bool exists = key_may_exist_helper( - env, jdb_handle, jcf_handle, jread_opts_handle, - jkey, jkey_offset, jkey_len, - &has_exception, &value, &value_found); + env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset, + jkey_len, &has_exception, &value, &value_found); if (has_exception) { // java exception already raised @@ -1930,12 +2362,12 @@ env->DeleteLocalRef(jresult_flags); return nullptr; } - + env->SetObjectArrayElement(jresults, 0, jresult_flags); if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jresult_flags); - return nullptr; + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresult_flags); + return nullptr; } env->DeleteLocalRef(jresult_flags); @@ -2267,9 +2699,7 @@ const jsize jlen = env->GetArrayLength(jrange_slice_handles); const size_t range_count = jlen / 2; - jboolean jranges_is_copy = JNI_FALSE; - jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, - &jranges_is_copy); + jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, 
nullptr); if (jranges == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -2277,10 +2707,11 @@ auto ranges = std::unique_ptr( new ROCKSDB_NAMESPACE::Range[range_count]); + size_t range_offset = 0; for (jsize i = 0; i < jlen; ++i) { auto* start = reinterpret_cast(jranges[i]); auto* limit = reinterpret_cast(jranges[++i]); - ranges.get()[i] = ROCKSDB_NAMESPACE::Range(*start, *limit); + ranges.get()[range_offset++] = ROCKSDB_NAMESPACE::Range(*start, *limit); } auto* db = reinterpret_cast(jdb_handle); @@ -2353,14 +2784,13 @@ static_cast(count), static_cast(sizes)}; - const jsize jcount = static_cast(count); - jlongArray jsizes = env->NewLongArray(jcount); + jlongArray jsizes = env->NewLongArray(2); if (jsizes == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } - env->SetLongArrayRegion(jsizes, 0, jcount, results); + env->SetLongArrayRegion(jsizes, 0, 2, results); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jsizes); @@ -2497,6 +2927,9 @@ auto* db = reinterpret_cast(jdb_handle); auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle == nullptr) { + cf_handle = db->DefaultColumnFamily(); + } auto s = db->SetOptions(cf_handle, options_map); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); @@ -2563,6 +2996,55 @@ /* * Class: org_rocksdb_RocksDB + * Method: getOptions + * Signature: (JJ)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getOptions(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + + auto options = db->GetOptions(cf_handle); + std::string options_as_string; + ROCKSDB_NAMESPACE::Status s = + GetStringFromColumnFamilyOptions(&options_as_string, options); + if (!s.ok()) { + 
ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } + return env->NewStringUTF(options_as_string.c_str()); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getDBOptions + * Signature: (J)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getDBOptions(JNIEnv* env, jobject, + jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + + auto options = db->GetDBOptions(); + std::string options_as_string; + ROCKSDB_NAMESPACE::Status s = + GetStringFromDBOptions(&options_as_string, options); + if (!s.ok()) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } + return env->NewStringUTF(options_as_string.c_str()); +} + +/* + * Class: org_rocksdb_RocksDB * Method: compactFiles * Signature: (JJJ[Ljava/lang/String;IIJ)[Ljava/lang/String; */ @@ -2612,6 +3094,17 @@ /* * Class: org_rocksdb_RocksDB + * Method: cancelAllBackgroundWork + * Signature: (JZ)V + */ +void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork( + JNIEnv*, jobject, jlong jdb_handle, jboolean jwait) { + auto* db = reinterpret_cast(jdb_handle); + ROCKSDB_NAMESPACE::CancelAllBackgroundWork(db, jwait); +} + +/* + * Class: org_rocksdb_RocksDB * Method: pauseBackgroundWork * Signature: (J)V */ @@ -2809,7 +3302,7 @@ * Method: setPreserveDeletesSequenceNumber * Signature: (JJ)Z */ -jboolean JNICALL Java_org_rocksdb_RocksDB_setPreserveDeletesSequenceNumber( +jboolean Java_org_rocksdb_RocksDB_setPreserveDeletesSequenceNumber( JNIEnv*, jobject, jlong jdb_handle, jlong jseq_number) { auto* db = reinterpret_cast(jdb_handle); if (db->SetPreserveDeletesSequenceNumber( @@ -3168,9 +3661,8 @@ reinterpret_cast(jcf_handle); } const jsize jlen = env->GetArrayLength(jrange_slice_handles); - jboolean jrange_slice_handles_is_copy = JNI_FALSE; - jlong *jrange_slice_handle = env->GetLongArrayElements( - jrange_slice_handles, &jrange_slice_handles_is_copy); + jlong* jrange_slice_handle = + env->GetLongArrayElements(jrange_slice_handles, nullptr); if 
(jrange_slice_handle == nullptr) { // exception occurred return nullptr; @@ -3298,8 +3790,7 @@ * Method: endTrace * Signature: (J)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_endTrace( - JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_endTrace(JNIEnv* env, jobject, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->EndTrace(); if (!s.ok()) { @@ -3309,6 +3800,20 @@ /* * Class: org_rocksdb_RocksDB + * Method: tryCatchUpWithPrimary + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_tryCatchUpWithPrimary(JNIEnv* env, jobject, + jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto s = db->TryCatchUpWithPrimary(); + if (!s.ok()) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB * Method: destroyDB * Signature: (Ljava/lang/String;J)V */ @@ -3367,9 +3872,11 @@ * Method: deleteFilesInRanges * Signature: (JJLjava/util/List;Z)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_deleteFilesInRanges( - JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jlong jcf_handle, - jobjectArray ranges, jboolean include_end) { +void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jobject /*jdb*/, + jlong jdb_handle, + jlong jcf_handle, + jobjectArray ranges, + jboolean include_end) { jsize length = env->GetArrayLength(ranges); std::vector rangesVector; @@ -3404,3 +3911,15 @@ ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } } + +/* + * Class: org_rocksdb_RocksDB + * Method: version + * Signature: ()I + */ +jint Java_org_rocksdb_RocksDB_version(JNIEnv*, jclass) { + uint32_t encodedVersion = (ROCKSDB_MAJOR & 0xff) << 16; + encodedVersion |= (ROCKSDB_MINOR & 0xff) << 8; + encodedVersion |= (ROCKSDB_PATCH & 0xff); + return static_cast(encodedVersion); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/slice.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/slice.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/slice.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/slice.cc 2025-05-19 16:14:27.000000000 +0000 @@ -229,6 +229,17 @@ } /* + * Class: org_rocksdb_DirectSlice + * Method: setLength0 + * Signature: (JI)V + */ +void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong handle, jint length) { + auto* slice = reinterpret_cast(handle); + slice->size_ = length; +} + +/* * Class: org_rocksdb_Slice * Method: disposeInternalBuf * Signature: (JJ)V diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -251,3 +251,20 @@ ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seekPrev, env, jtarget, jtarget_off, jtarget_len); } + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: refresh0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_refresh0(JNIEnv* env, jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + ROCKSDB_NAMESPACE::Status s = it->Refresh(); + + if (s.ok()) { + return; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling C++ ROCKSDB_NAMESPACE::SstFileManager methods +// from Java side. + +#include "rocksdb/sst_partitioner.h" + +#include + +#include + +#include "include/org_rocksdb_SstPartitionerFixedPrefixFactory.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_SstPartitionerFixedPrefixFactory + * Method: newSstPartitionerFixedPrefixFactory0 + * Signature: (J)J + */ +jlong Java_org_rocksdb_SstPartitionerFixedPrefixFactory_newSstPartitionerFixedPrefixFactory0( + JNIEnv*, jclass, jlong prefix_len) { + auto* ptr = new std::shared_ptr( + ROCKSDB_NAMESPACE::NewSstPartitionerFixedPrefixFactory(prefix_len)); + return reinterpret_cast(ptr); +} + +/* + * Class: org_rocksdb_SstPartitionerFixedPrefixFactory + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_SstPartitionerFixedPrefixFactory_disposeInternal( + JNIEnv*, jobject, jlong jhandle) { + auto* ptr = reinterpret_cast< + std::shared_ptr*>(jhandle); + delete ptr; // delete std::shared_ptr +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc 2025-05-19 16:14:27.000000000 +0000 @@ -28,5 +28,4 @@ return true; } -// @lint-ignore TXT4 T25377293 Grandfathered in -}; // namespace ROCKSDB_NAMESPACE \ No newline at end of file +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h 
--- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,5 +30,4 @@ } // namespace ROCKSDB_NAMESPACE -// @lint-ignore TXT4 T25377293 Grandfathered in -#endif // JAVA_ROCKSJNI_STATISTICSJNI_H_ \ No newline at end of file +#endif // JAVA_ROCKSJNI_STATISTICSJNI_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/table.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/table.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/table.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/table.cc 2025-05-19 16:14:27.000000000 +0000 @@ -42,25 +42,25 @@ /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZZZZBBDBZJJJJIIIJZZJZZIIZZJIJI)J + * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( - JNIEnv*, jobject, jboolean jcache_index_and_filter_blocks, + JNIEnv *, jobject, jboolean jcache_index_and_filter_blocks, jboolean jcache_index_and_filter_blocks_with_high_priority, jboolean jpin_l0_filter_and_index_blocks_in_cache, jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value, jbyte jdata_block_index_type_value, jdouble jdata_block_hash_table_util_ratio, jbyte jchecksum_type_value, jboolean jno_block_cache, jlong jblock_cache_handle, - jlong jpersistent_cache_handle, - jlong jblock_cache_compressed_handle, jlong jblock_size, - jint jblock_size_deviation, jint jblock_restart_interval, + jlong jpersistent_cache_handle, jlong jblock_cache_compressed_handle, + jlong jblock_size, jint jblock_size_deviation, jint jblock_restart_interval, jint jindex_block_restart_interval, jlong jmetadata_block_size, - jboolean jpartition_filters, jboolean juse_delta_encoding, - jlong jfilter_policy_handle, jboolean 
jwhole_key_filtering, - jboolean jverify_compression, jint jread_amp_bytes_per_bit, - jint jformat_version, jboolean jenable_index_compression, - jboolean jblock_align, jlong jblock_cache_size, + jboolean jpartition_filters, jboolean joptimize_filters_for_memory, + jboolean juse_delta_encoding, jlong jfilter_policy_handle, + jboolean jwhole_key_filtering, jboolean jverify_compression, + jint jread_amp_bytes_per_bit, jint jformat_version, + jboolean jenable_index_compression, jboolean jblock_align, + jbyte jindex_shortening, jlong jblock_cache_size, jint jblock_cache_num_shard_bits, jlong jblock_cache_compressed_size, jint jblock_cache_compressed_num_shard_bits) { ROCKSDB_NAMESPACE::BlockBasedTableOptions options; @@ -131,6 +131,8 @@ options.index_block_restart_interval = static_cast(jindex_block_restart_interval); options.metadata_block_size = static_cast(jmetadata_block_size); options.partition_filters = static_cast(jpartition_filters); + options.optimize_filters_for_memory = + static_cast(joptimize_filters_for_memory); options.use_delta_encoding = static_cast(juse_delta_encoding); if (jfilter_policy_handle > 0) { std::shared_ptr *pFilterPolicy = @@ -144,6 +146,9 @@ options.format_version = static_cast(jformat_version); options.enable_index_compression = static_cast(jenable_index_compression); options.block_align = static_cast(jblock_align); + options.index_shortening = + ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode( + jindex_shortening); return reinterpret_cast( ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 
+1,216 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include +#include +#include + +#include "include/org_rocksdb_test_TestableEventListener.h" +#include "rocksdb/listener.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" + +using ROCKSDB_NAMESPACE::BackgroundErrorReason; +using ROCKSDB_NAMESPACE::CompactionJobInfo; +using ROCKSDB_NAMESPACE::CompactionJobStats; +using ROCKSDB_NAMESPACE::CompactionReason; +using ROCKSDB_NAMESPACE::CompressionType; +using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo; +using ROCKSDB_NAMESPACE::FileOperationInfo; +using ROCKSDB_NAMESPACE::FileOperationType; +using ROCKSDB_NAMESPACE::FlushJobInfo; +using ROCKSDB_NAMESPACE::FlushReason; +using ROCKSDB_NAMESPACE::MemTableInfo; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TableFileCreationBriefInfo; +using ROCKSDB_NAMESPACE::TableFileCreationInfo; +using ROCKSDB_NAMESPACE::TableFileCreationReason; +using ROCKSDB_NAMESPACE::TableFileDeletionInfo; +using ROCKSDB_NAMESPACE::TableProperties; +using ROCKSDB_NAMESPACE::WriteStallCondition; +using ROCKSDB_NAMESPACE::WriteStallInfo; + +static TableProperties newTablePropertiesForTest() { + TableProperties table_properties; + table_properties.data_size = UINT64_MAX; + table_properties.index_size = UINT64_MAX; + table_properties.index_partitions = UINT64_MAX; + table_properties.top_level_index_size = UINT64_MAX; + table_properties.index_key_is_user_key = UINT64_MAX; + table_properties.index_value_is_delta_encoded = UINT64_MAX; + table_properties.filter_size = UINT64_MAX; + table_properties.raw_key_size = UINT64_MAX; + table_properties.raw_value_size = UINT64_MAX; + table_properties.num_data_blocks = UINT64_MAX; + table_properties.num_entries = UINT64_MAX; + table_properties.num_deletions = 
UINT64_MAX; + table_properties.num_merge_operands = UINT64_MAX; + table_properties.num_range_deletions = UINT64_MAX; + table_properties.format_version = UINT64_MAX; + table_properties.fixed_key_len = UINT64_MAX; + table_properties.column_family_id = UINT64_MAX; + table_properties.creation_time = UINT64_MAX; + table_properties.oldest_key_time = UINT64_MAX; + table_properties.file_creation_time = UINT64_MAX; + table_properties.slow_compression_estimated_data_size = UINT64_MAX; + table_properties.fast_compression_estimated_data_size = UINT64_MAX; + table_properties.external_sst_file_global_seqno_offset = UINT64_MAX; + table_properties.db_id = "dbId"; + table_properties.db_session_id = "sessionId"; + table_properties.column_family_name = "columnFamilyName"; + table_properties.filter_policy_name = "filterPolicyName"; + table_properties.comparator_name = "comparatorName"; + table_properties.merge_operator_name = "mergeOperatorName"; + table_properties.prefix_extractor_name = "prefixExtractorName"; + table_properties.property_collectors_names = "propertyCollectorsNames"; + table_properties.compression_name = "compressionName"; + table_properties.compression_options = "compressionOptions"; + table_properties.user_collected_properties = {{"key", "value"}}; + table_properties.readable_properties = {{"key", "value"}}; + return table_properties; +} + +/* + * Class: org_rocksdb_test_TestableEventListener + * Method: invokeAllCallbacks + * Signature: (J)V + */ +void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks( + JNIEnv *, jclass, jlong jhandle) { + const auto &el = + *reinterpret_cast *>( + jhandle); + + TableProperties table_properties = newTablePropertiesForTest(); + + FlushJobInfo flush_job_info; + flush_job_info.cf_id = INT_MAX; + flush_job_info.cf_name = "testColumnFamily"; + flush_job_info.file_path = "/file/path"; + flush_job_info.file_number = UINT64_MAX; + flush_job_info.oldest_blob_file_number = UINT64_MAX; + flush_job_info.thread_id = UINT64_MAX; + 
flush_job_info.job_id = INT_MAX; + flush_job_info.triggered_writes_slowdown = true; + flush_job_info.triggered_writes_stop = true; + flush_job_info.smallest_seqno = UINT64_MAX; + flush_job_info.largest_seqno = UINT64_MAX; + flush_job_info.table_properties = table_properties; + flush_job_info.flush_reason = FlushReason::kManualFlush; + + el->OnFlushCompleted(nullptr, flush_job_info); + el->OnFlushBegin(nullptr, flush_job_info); + + Status status = Status::Incomplete(Status::SubCode::kNoSpace); + + TableFileDeletionInfo file_deletion_info; + file_deletion_info.db_name = "dbName"; + file_deletion_info.file_path = "/file/path"; + file_deletion_info.job_id = INT_MAX; + file_deletion_info.status = status; + + el->OnTableFileDeleted(file_deletion_info); + + CompactionJobInfo compaction_job_info; + compaction_job_info.cf_id = UINT32_MAX; + compaction_job_info.cf_name = "compactionColumnFamily"; + compaction_job_info.status = status; + compaction_job_info.thread_id = UINT64_MAX; + compaction_job_info.job_id = INT_MAX; + compaction_job_info.base_input_level = INT_MAX; + compaction_job_info.output_level = INT_MAX; + compaction_job_info.input_files = {"inputFile.sst"}; + compaction_job_info.input_file_infos = {}; + compaction_job_info.output_files = {"outputFile.sst"}; + compaction_job_info.output_file_infos = {}; + compaction_job_info.table_properties = { + {"tableProperties", std::shared_ptr( + &table_properties, [](TableProperties *) {})}}; + compaction_job_info.compaction_reason = CompactionReason::kFlush; + compaction_job_info.compression = CompressionType::kSnappyCompression; + + compaction_job_info.stats = CompactionJobStats(); + + el->OnCompactionBegin(nullptr, compaction_job_info); + el->OnCompactionCompleted(nullptr, compaction_job_info); + + TableFileCreationInfo file_creation_info; + file_creation_info.file_size = UINT64_MAX; + file_creation_info.table_properties = table_properties; + file_creation_info.status = status; + file_creation_info.file_checksum = 
"fileChecksum"; + file_creation_info.file_checksum_func_name = "fileChecksumFuncName"; + file_creation_info.db_name = "dbName"; + file_creation_info.cf_name = "columnFamilyName"; + file_creation_info.file_path = "/file/path"; + file_creation_info.job_id = INT_MAX; + file_creation_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreated(file_creation_info); + + TableFileCreationBriefInfo file_creation_brief_info; + file_creation_brief_info.db_name = "dbName"; + file_creation_brief_info.cf_name = "columnFamilyName"; + file_creation_brief_info.file_path = "/file/path"; + file_creation_brief_info.job_id = INT_MAX; + file_creation_brief_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreationStarted(file_creation_brief_info); + + MemTableInfo mem_table_info; + mem_table_info.cf_name = "columnFamilyName"; + mem_table_info.first_seqno = UINT64_MAX; + mem_table_info.earliest_seqno = UINT64_MAX; + mem_table_info.num_entries = UINT64_MAX; + mem_table_info.num_deletes = UINT64_MAX; + + el->OnMemTableSealed(mem_table_info); + el->OnColumnFamilyHandleDeletionStarted(nullptr); + + ExternalFileIngestionInfo file_ingestion_info; + file_ingestion_info.cf_name = "columnFamilyName"; + file_ingestion_info.external_file_path = "/external/file/path"; + file_ingestion_info.internal_file_path = "/internal/file/path"; + file_ingestion_info.global_seqno = UINT64_MAX; + file_ingestion_info.table_properties = table_properties; + el->OnExternalFileIngested(nullptr, file_ingestion_info); + + el->OnBackgroundError(BackgroundErrorReason::kFlush, &status); + + WriteStallInfo write_stall_info; + write_stall_info.cf_name = "columnFamilyName"; + write_stall_info.condition.cur = WriteStallCondition::kDelayed; + write_stall_info.condition.prev = WriteStallCondition::kStopped; + el->OnStallConditionsChanged(write_stall_info); + + FileOperationInfo op_info = FileOperationInfo( + FileOperationType::kRead, "/file/path", + std::make_pair(std::chrono::time_point( + 
std::chrono::nanoseconds(1600699420000000000ll)), + std::chrono::time_point( + std::chrono::nanoseconds(1600699420000000000ll))), + std::chrono::time_point( + std::chrono::nanoseconds(1600699425000000000ll)), + status); + op_info.offset = UINT64_MAX; + op_info.length = SIZE_MAX; + op_info.status = status; + + el->OnFileReadFinish(op_info); + el->OnFileWriteFinish(op_info); + el->OnFileFlushFinish(op_info); + el->OnFileSyncFinish(op_info); + el->OnFileRangeSyncFinish(op_info); + el->OnFileTruncateFinish(op_info); + el->OnFileCloseFinish(op_info); + el->ShouldBeNotifiedOnFileIO(); + + bool auto_recovery; + el->OnErrorRecoveryBegin(BackgroundErrorReason::kFlush, status, + &auto_recovery); + el->OnErrorRecoveryCompleted(status); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,8 +14,6 @@ #include "rocksdb/utilities/transaction.h" #include "rocksjni/portal.h" -using namespace std::placeholders; - #if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable : 4503) // identifier' : decorated name length @@ -220,8 +218,8 @@ const ROCKSDB_NAMESPACE::ReadOptions&, ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, std::string*)>( - &ROCKSDB_NAMESPACE::Transaction::Get, txn, _1, column_family_handle, - _2, _3); + &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1, + column_family_handle, std::placeholders::_2, std::placeholders::_3); return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); } @@ -238,7 +236,8 @@ std::bind( - &ROCKSDB_NAMESPACE::Transaction::Get, txn, _1, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1, + std::placeholders::_2, 
std::placeholders::_3); return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); } @@ -402,8 +401,8 @@ const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, _1, column_family_handles, - _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1, + column_family_handles, std::placeholders::_2, std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, jkey_parts); } @@ -421,7 +420,8 @@ ROCKSDB_NAMESPACE::Transaction::*)( const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, _1, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, jkey_parts); } @@ -444,8 +444,9 @@ const ROCKSDB_NAMESPACE::ReadOptions&, ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, std::string*, bool, bool)>( - &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, _1, - column_family_handle, _2, _3, jexclusive, jdo_validate); + &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, + std::placeholders::_1, column_family_handle, std::placeholders::_2, + std::placeholders::_3, jexclusive, jdo_validate); return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, jkey_part_len); } @@ -464,7 +465,8 @@ std::bind( - &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, _1, _2, _3, + &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, jexclusive, jdo_validate); return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, jkey_part_len); @@ -492,8 +494,9 @@ const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, _1, 
- column_family_handles, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, + std::placeholders::_1, column_family_handles, std::placeholders::_2, + std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get_for_update, jread_options_handle, jkey_parts); } @@ -511,7 +514,8 @@ ROCKSDB_NAMESPACE::Status> (ROCKSDB_NAMESPACE::Transaction::*)( const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, _1, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get_for_update, jread_options_handle, jkey_parts); } @@ -605,7 +609,8 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&, bool)>(&ROCKSDB_NAMESPACE::Transaction::Put, txn, - column_family_handle, _1, _2, jassume_tracked); + column_family_handle, std::placeholders::_1, + std::placeholders::_2, jassume_tracked); txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); } @@ -623,7 +628,8 @@ FnWriteKV fn_put = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Put, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::Put, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); } @@ -689,6 +695,7 @@ // out of memory env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } @@ -698,6 +705,7 @@ env->ReleaseByteArrayElements(jba_value_part, jvalue_part, JNI_ABORT); env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } @@ -748,8 +756,8 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&, 
const ROCKSDB_NAMESPACE::SliceParts&, bool)>( - &ROCKSDB_NAMESPACE::Transaction::Put, txn, column_family_handle, _1, - _2, jassume_tracked); + &ROCKSDB_NAMESPACE::Transaction::Put, txn, column_family_handle, + std::placeholders::_1, std::placeholders::_2, jassume_tracked); txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -766,7 +774,8 @@ FnWriteKVParts fn_put_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Put, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::Put, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -789,7 +798,8 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&, bool)>(&ROCKSDB_NAMESPACE::Transaction::Merge, txn, - column_family_handle, _1, _2, jassume_tracked); + column_family_handle, std::placeholders::_1, + std::placeholders::_2, jassume_tracked); txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); } @@ -805,7 +815,8 @@ FnWriteKV fn_merge = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Merge, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::Merge, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); } @@ -854,7 +865,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, bool)>( &ROCKSDB_NAMESPACE::Transaction::Delete, txn, column_family_handle, - _1, jassume_tracked); + std::placeholders::_1, jassume_tracked); txn_write_k_helper(env, fn_delete, jkey, jkey_part_len); } @@ -869,7 +880,7 @@ auto* txn = reinterpret_cast(jhandle); FnWriteK fn_delete = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Delete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::Delete, txn, std::placeholders::_1); txn_write_k_helper(env, fn_delete, jkey, jkey_part_len); } @@ -949,7 +960,7 @@ 
ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&, bool)>( &ROCKSDB_NAMESPACE::Transaction::Delete, txn, column_family_handle, - _1, jassume_tracked); + std::placeholders::_1, jassume_tracked); txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len); } @@ -965,7 +976,7 @@ auto* txn = reinterpret_cast(jhandle); FnWriteKParts fn_delete_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Delete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::Delete, txn, std::placeholders::_1); txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len); } @@ -986,7 +997,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, bool)>( &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, - column_family_handle, _1, jassume_tracked); + column_family_handle, std::placeholders::_1, jassume_tracked); txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len); } @@ -1003,7 +1014,8 @@ auto* txn = reinterpret_cast(jhandle); FnWriteK fn_single_delete = std::bind( - &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, + std::placeholders::_1); txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len); } @@ -1025,7 +1037,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&, bool)>( &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, - column_family_handle, _1, jassume_tracked); + column_family_handle, std::placeholders::_1, jassume_tracked); txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts, jkey_parts_len); } @@ -1043,7 +1055,8 @@ auto* txn = reinterpret_cast(jhandle); FnWriteKParts fn_single_delete_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, + std::placeholders::_1); txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts, jkey_parts_len); } @@ -1066,7 +1079,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const 
ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>( &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, - column_family_handle, _1, _2); + column_family_handle, std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, jval_len); } @@ -1083,7 +1096,8 @@ FnWriteKV fn_put_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, + std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, jval_len); } @@ -1106,7 +1120,7 @@ const ROCKSDB_NAMESPACE::SliceParts&, const ROCKSDB_NAMESPACE::SliceParts&)>( &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, column_family_handle, - _1, _2); + std::placeholders::_1, std::placeholders::_2); txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -1123,7 +1137,8 @@ FnWriteKVParts fn_put_parts_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -1146,7 +1161,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>( &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, - column_family_handle, _1, _2); + column_family_handle, std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, jval_len); } @@ -1163,7 +1178,8 @@ FnWriteKV fn_merge_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, + std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, 
jval_len); } @@ -1184,7 +1200,7 @@ ROCKSDB_NAMESPACE::Transaction::*)(ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&)>( &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, - column_family_handle, _1); + column_family_handle, std::placeholders::_1); txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len); } @@ -1201,7 +1217,8 @@ auto* txn = reinterpret_cast(jhandle); FnWriteK fn_delete_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, + std::placeholders::_1); txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len); } @@ -1222,7 +1239,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&)>( &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, - column_family_handle, _1); + column_family_handle, std::placeholders::_1); txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts, jkey_parts_len); } @@ -1239,7 +1256,8 @@ FnWriteKParts fn_delete_untracked_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, + std::placeholders::_1); txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts, jkey_parts_len); } @@ -1605,7 +1623,7 @@ case ROCKSDB_NAMESPACE::Transaction::TransactionState::AWAITING_COMMIT: return 0x3; - case ROCKSDB_NAMESPACE::Transaction::TransactionState::COMMITED: + case ROCKSDB_NAMESPACE::Transaction::TransactionState::COMMITTED: return 0x4; case ROCKSDB_NAMESPACE::Transaction::TransactionState::AWAITING_ROLLBACK: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -197,7 +197,7 @@ *cfOptions, 
std::string(reinterpret_cast(cfname), len), &handle, jttl); - env->ReleaseByteArrayElements(jcolumn_name, cfname, 0); + env->ReleaseByteArrayElements(jcolumn_name, cfname, JNI_ABORT); if (s.ok()) { return reinterpret_cast(handle); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc 2025-05-19 16:14:27.000000000 +0000 @@ -363,10 +363,10 @@ /* * Class: org_rocksdb_WriteBatch - * Method: removeDirect + * Method: deleteDirect * Signature: (JLjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatch_removeDirect(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_WriteBatch_deleteDirect(JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey, jint jkey_offset, jint jkey_len, jlong jcf_handle) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -47,7 +47,7 @@ ROCKSDB_NAMESPACE::WriteBufferManager wb(options.db_write_buffer_size); options.memtable_factory = factory; ROCKSDB_NAMESPACE::MemTable* mem = new ROCKSDB_NAMESPACE::MemTable( - cmp, ROCKSDB_NAMESPACE::ImmutableCFOptions(options), + cmp, ROCKSDB_NAMESPACE::ImmutableOptions(options), ROCKSDB_NAMESPACE::MutableCFOptions(options), &wb, ROCKSDB_NAMESPACE::kMaxSequenceNumber, 0 /* column_family_id */); mem->Ref(); @@ -63,10 +63,10 @@ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ikey.clear(); - bool parsed = 
ROCKSDB_NAMESPACE::ParseInternalKey(iter->key(), &ikey); - if (!parsed) { - assert(parsed); - } + ROCKSDB_NAMESPACE::Status pik_status = ROCKSDB_NAMESPACE::ParseInternalKey( + iter->key(), &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); switch (ikey.type) { case ROCKSDB_NAMESPACE::kTypeValue: state.append("Put("); @@ -119,7 +119,7 @@ break; } state.append("@"); - state.append(ROCKSDB_NAMESPACE::NumberToString(ikey.sequence)); + state.append(ROCKSDB_NAMESPACE::ToString(ikey.sequence)); } if (!s.ok()) { state.append(s.ToString()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc 2025-05-19 16:14:27.000000000 +0000 @@ -301,10 +301,10 @@ /* * Class: org_rocksdb_WriteBatchWithIndex - * Method: removeDirect + * Method: deleteDirect * Signature: (JLjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_removeDirect( +void Java_org_rocksdb_WriteBatchWithIndex_deleteDirect( JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey, jint jkey_offset, jint jkey_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); @@ -533,20 +533,24 @@ /* * Class: org_rocksdb_WriteBatchWithIndex * Method: iteratorWithBase - * Signature: (JJJ)J + * Signature: (JJJJ)J */ -jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jwbwi_handle, - jlong jcf_handle, - jlong jbi_handle) { +jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( + JNIEnv*, jobject, jlong jwbwi_handle, jlong jcf_handle, + jlong jbase_iterator_handle, jlong jread_opts_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* cf_handle = 
reinterpret_cast(jcf_handle); auto* base_iterator = - reinterpret_cast(jbi_handle); - auto* iterator = wbwi->NewIteratorWithBase(cf_handle, base_iterator); + reinterpret_cast(jbase_iterator_handle); + ROCKSDB_NAMESPACE::ReadOptions* read_opts = + jread_opts_handle == 0 + ? nullptr + : reinterpret_cast( + jread_opts_handle); + auto* iterator = + wbwi->NewIteratorWithBase(cf_handle, base_iterator, read_opts); return reinterpret_cast(iterator); } @@ -860,3 +864,13 @@ return jresults; } + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: refresh0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_refresh0(JNIEnv* env) { + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::Status::NotSupported("Refresh() is not supported"); + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,14 +16,15 @@ * Signature: (JJ)J */ jlong Java_org_rocksdb_WriteBufferManager_newWriteBufferManager( - JNIEnv* /*env*/, jclass /*jclazz*/, jlong jbuffer_size, jlong jcache_handle) { + JNIEnv* /*env*/, jclass /*jclazz*/, jlong jbuffer_size, jlong jcache_handle, + jboolean allow_stall) { auto* cache_ptr = reinterpret_cast*>( jcache_handle); auto* write_buffer_manager = new std::shared_ptr( - std::make_shared(jbuffer_size, - *cache_ptr)); + std::make_shared( + jbuffer_size, *cache_ptr, allow_stall)); return reinterpret_cast(write_buffer_manager); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc 2025-05-19 16:14:27.000000000 +0000 @@ -108,7 +108,7 @@ // exception thrown return; } - + m_jMarkRollbackMethodId = WriteBatchHandlerJni::getMarkRollbackMethodId(env); if(m_jMarkRollbackMethodId == nullptr) { // exception thrown @@ -121,6 +121,13 @@ return; } + m_jMarkCommitWithTimestampMethodId = + WriteBatchHandlerJni::getMarkCommitWithTimestampMethodId(env); + if (m_jMarkCommitWithTimestampMethodId == nullptr) { + // exception thrown + return; + } + m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); if(m_jContinueMethodId == nullptr) { // exception thrown @@ -424,6 +431,23 @@ return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? + } else { + return ROCKSDB_NAMESPACE::Status(*status); + } +} + +ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkCommitWithTimestamp( + const Slice& xid, const Slice& ts) { + auto markCommitWithTimestamp = [this](jbyteArray j_xid, jbyteArray j_ts) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkCommitWithTimestampMethodId, + j_xid, j_ts); + }; + auto status = + WriteBatchHandlerJniCallback::kv_op(xid, ts, markCommitWithTimestamp); + if (status == nullptr) { + return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is + // an Exception but we don't know + // the ROCKSDB_NAMESPACE::Status? 
} else { return ROCKSDB_NAMESPACE::Status(*status); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h 2025-05-19 16:14:27.000000000 +0000 @@ -48,6 +48,7 @@ Status MarkNoop(bool empty_batch); Status MarkRollback(const Slice& xid); Status MarkCommit(const Slice& xid); + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& commit_ts); bool Continue(); private: @@ -69,6 +70,7 @@ jmethodID m_jMarkNoopMethodId; jmethodID m_jMarkRollbackMethodId; jmethodID m_jMarkCommitMethodId; + jmethodID m_jMarkCommitWithTimestampMethodId; jmethodID m_jContinueMethodId; /** * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni.pom mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni.pom --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni.pom 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni.pom 1970-01-01 00:00:00.000000000 +0000 @@ -1,150 +0,0 @@ - - - 4.0.0 - RocksDB JNI - http://rocksdb.org/ - org.rocksdb - rocksdbjni - - - - RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files - for Mac OSX, and a .dll for Windows x64. 
- - - - Apache License 2.0 - http://www.apache.org/licenses/LICENSE-2.0.html - repo - - - GNU General Public License, version 2 - http://www.gnu.org/licenses/gpl-2.0.html - repo - - - - scm:git:git://github.com/dropwizard/metrics.git - scm:git:git@github.com:dropwizard/metrics.git - http://github.com/dropwizard/metrics/ - HEAD - - - - Facebook - help@facebook.com - America/New_York - - architect - - - - - - 1.7 - 1.7 - UTF-8 - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.2 - - ${project.build.source} - ${project.build.target} - ${project.build.sourceEncoding} - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.18.1 - - ${argLine} -ea -Xcheck:jni -Djava.library.path=${project.build.directory} - false - false - - ${project.build.directory}/* - - - - - org.jacoco - jacoco-maven-plugin - 0.7.2.201409121644 - - - - prepare-agent - - - - report - prepare-package - - report - - - - - - org.codehaus.gmaven - groovy-maven-plugin - 2.0 - - - process-classes - - execute - - - - Xenu - - - String fileContents = new File(project.basedir.absolutePath + '/../include/rocksdb/version.h').getText('UTF-8') - matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) - String major_version = matcher.getAt(0).getAt(1) - matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) - String minor_version = matcher.getAt(0).getAt(1) - matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/) - String patch_version = matcher.getAt(0).getAt(1) - String version = String.format('%s.%s.%s', major_version, minor_version, patch_version) - // Set version to be used in pom.properties - project.version = version - // Set version to be set as jar name - project.build.finalName = project.artifactId + "-" + version - - - - - - - - - - - junit - junit - 4.12 - test - - - org.assertj - assertj-core - 1.7.1 - test - - - org.mockito - mockito-all - 1.10.19 - test - - - diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java 2025-05-19 16:14:27.000000000 +0000 @@ -111,7 +111,7 @@ // Read a key using the snapshot. readOptions.setSnapshot(snapshot); final byte[] value = txn.getForUpdate(readOptions, key1, true); - assert(value == value1); + assert (value == null); try { // Attempt to commit transaction diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java 2025-05-19 16:14:27.000000000 +0000 @@ -53,8 +53,8 @@ try { // put and get from non-default column family - db.put(columnFamilyHandles.get(0), new WriteOptions(), - "key".getBytes(), "value".getBytes()); + db.put( + columnFamilyHandles.get(1), new WriteOptions(), "key".getBytes(), "value".getBytes()); // atomic write try (final WriteBatch wb = new WriteBatch()) { @@ -62,7 +62,7 @@ "value2".getBytes()); wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes()); - wb.remove(columnFamilyHandles.get(0), "key".getBytes()); + wb.delete(columnFamilyHandles.get(1), "key".getBytes()); db.write(new WriteOptions(), wb); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java 
--- mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java 2025-05-19 16:14:27.000000000 +0000 @@ -45,7 +45,7 @@ .setStatistics(stats) .setWriteBufferSize(8 * SizeUnit.KB) .setMaxWriteBufferNumber(3) - .setMaxBackgroundCompactions(10) + .setMaxBackgroundJobs(10) .setCompressionType(CompressionType.SNAPPY_COMPRESSION) .setCompactionStyle(CompactionStyle.UNIVERSAL); } catch (final IllegalArgumentException e) { @@ -55,7 +55,7 @@ assert (options.createIfMissing() == true); assert (options.writeBufferSize() == 8 * SizeUnit.KB); assert (options.maxWriteBufferNumber() == 3); - assert (options.maxBackgroundCompactions() == 10); + assert (options.maxBackgroundJobs() == 10); assert (options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert (options.compactionStyle() == CompactionStyle.UNIVERSAL); @@ -87,24 +87,17 @@ options.setRateLimiter(rateLimiter); final BlockBasedTableConfig table_options = new BlockBasedTableConfig(); - table_options.setBlockCacheSize(64 * SizeUnit.KB) - .setFilter(bloomFilter) - .setCacheNumShardBits(6) + Cache cache = new LRUCache(64 * 1024, 6); + table_options.setBlockCache(cache) + .setFilterPolicy(bloomFilter) .setBlockSizeDeviation(5) .setBlockRestartInterval(10) .setCacheIndexAndFilterBlocks(true) - .setHashIndexAllowCollision(false) - .setBlockCacheCompressedSize(64 * SizeUnit.KB) - .setBlockCacheCompressedNumShardBits(10); + .setBlockCacheCompressed(new LRUCache(64 * 1000, 10)); - assert (table_options.blockCacheSize() == 64 * SizeUnit.KB); - assert (table_options.cacheNumShardBits() == 6); assert (table_options.blockSizeDeviation() == 5); assert (table_options.blockRestartInterval() == 10); assert (table_options.cacheIndexAndFilterBlocks() == true); - assert (table_options.hashIndexAllowCollision() == false); - assert (table_options.blockCacheCompressedSize() == 64 * 
SizeUnit.KB); - assert (table_options.blockCacheCompressedNumShardBits() == 10); options.setTableFormatConfig(table_options); assert (options.tableFactoryName().equals("BlockBasedTable")); @@ -203,14 +196,14 @@ len = db.get(readOptions, testKey, enoughArray); assert (len == testValue.length); - db.remove(testKey); + db.delete(testKey); len = db.get(testKey, enoughArray); assert (len == RocksDB.NOT_FOUND); // repeat the test with WriteOptions try (final WriteOptions writeOpts = new WriteOptions()) { writeOpts.setSync(true); - writeOpts.setDisableWAL(true); + writeOpts.setDisableWAL(false); db.put(writeOpts, testKey, testValue); len = db.get(testKey, enoughArray); assert (len == testValue.length); @@ -284,15 +277,15 @@ } } - Map values = db.multiGet(keys); + List values = db.multiGetAsList(keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } - values = db.multiGet(new ReadOptions(), keys); + values = db.multiGetAsList(new ReadOptions(), keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } } catch (final RocksDBException e) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,334 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.rocksdb.AbstractEventListener.EnabledEventCallback.*; + +/** + * Base class for Event Listeners. + */ +public abstract class AbstractEventListener extends RocksCallbackObject implements EventListener { + public enum EnabledEventCallback { + ON_FLUSH_COMPLETED((byte) 0x0), + ON_FLUSH_BEGIN((byte) 0x1), + ON_TABLE_FILE_DELETED((byte) 0x2), + ON_COMPACTION_BEGIN((byte) 0x3), + ON_COMPACTION_COMPLETED((byte) 0x4), + ON_TABLE_FILE_CREATED((byte) 0x5), + ON_TABLE_FILE_CREATION_STARTED((byte) 0x6), + ON_MEMTABLE_SEALED((byte) 0x7), + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED((byte) 0x8), + ON_EXTERNAL_FILE_INGESTED((byte) 0x9), + ON_BACKGROUND_ERROR((byte) 0xA), + ON_STALL_CONDITIONS_CHANGED((byte) 0xB), + ON_FILE_READ_FINISH((byte) 0xC), + ON_FILE_WRITE_FINISH((byte) 0xD), + ON_FILE_FLUSH_FINISH((byte) 0xE), + ON_FILE_SYNC_FINISH((byte) 0xF), + ON_FILE_RANGE_SYNC_FINISH((byte) 0x10), + ON_FILE_TRUNCATE_FINISH((byte) 0x11), + ON_FILE_CLOSE_FINISH((byte) 0x12), + SHOULD_BE_NOTIFIED_ON_FILE_IO((byte) 0x13), + ON_ERROR_RECOVERY_BEGIN((byte) 0x14), + ON_ERROR_RECOVERY_COMPLETED((byte) 0x15); + + private final byte value; + + EnabledEventCallback(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value + */ + byte getValue() { + return value; + } + + /** + * Get the EnabledEventCallbacks from the internal representation value. + * + * @return the enabled event callback. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static EnabledEventCallback fromValue(final byte value) { + for (final EnabledEventCallback enabledEventCallback : EnabledEventCallback.values()) { + if (enabledEventCallback.value == value) { + return enabledEventCallback; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for EnabledEventCallback: " + value); + } + } + + /** + * Creates an Event Listener that will + * received all callbacks from C++. + * + * If you don't need all callbacks, it is much more efficient to + * just register for the ones you need by calling + * {@link #AbstractEventListener(EnabledEventCallback...)} instead. + */ + protected AbstractEventListener() { + this(ON_FLUSH_COMPLETED, ON_FLUSH_BEGIN, ON_TABLE_FILE_DELETED, ON_COMPACTION_BEGIN, + ON_COMPACTION_COMPLETED, ON_TABLE_FILE_CREATED, ON_TABLE_FILE_CREATION_STARTED, + ON_MEMTABLE_SEALED, ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, ON_EXTERNAL_FILE_INGESTED, + ON_BACKGROUND_ERROR, ON_STALL_CONDITIONS_CHANGED, ON_FILE_READ_FINISH, ON_FILE_WRITE_FINISH, + ON_FILE_FLUSH_FINISH, ON_FILE_SYNC_FINISH, ON_FILE_RANGE_SYNC_FINISH, + ON_FILE_TRUNCATE_FINISH, ON_FILE_CLOSE_FINISH, SHOULD_BE_NOTIFIED_ON_FILE_IO, + ON_ERROR_RECOVERY_BEGIN, ON_ERROR_RECOVERY_COMPLETED); + } + + /** + * Creates an Event Listener that will + * receive only certain callbacks from C++. + * + * @param enabledEventCallbacks callbacks to enable in Java. + */ + protected AbstractEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(packToLong(enabledEventCallbacks)); + } + + /** + * Pack EnabledEventCallbacks to a long. + * + * @param enabledEventCallbacks the flags + * + * @return a long + */ + private static long packToLong(final EnabledEventCallback... 
enabledEventCallbacks) { + long l = 0; + for (int i = 0; i < enabledEventCallbacks.length; i++) { + l |= 1 << enabledEventCallbacks[i].getValue(); + } + return l; + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushCompletedProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushCompleted(db, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushBegin(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushBeginProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushBegin(db, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + // no-op + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionBegin(RocksDB, CompactionJobInfo)}. + * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionBeginProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! 
+ onCompactionBegin(db, compactionJobInfo); + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionCompletedProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onCompactionCompleted(db, compactionJobInfo); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + // no-op + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + // no-op + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + // no-op + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + // no-op + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onExternalFileIngested(RocksDB, ExternalFileIngestionInfo)}. + * + * @param dbHandle native handle of the database + * @param externalFileIngestionInfo the flush job info + */ + private void onExternalFileIngestedProxy( + final long dbHandle, final ExternalFileIngestionInfo externalFileIngestionInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! 
+ onExternalFileIngested(db, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onBackgroundError(BackgroundErrorReason, Status)}. + * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private void onBackgroundErrorProxy(final byte reasonByte, final Status backgroundError) { + onBackgroundError(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + // no-op + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + return true; + } + + /** + * Called from JNI, proxy for + * {@link #onErrorRecoveryBegin(BackgroundErrorReason, Status)}. 
+ * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private boolean onErrorRecoveryBeginProxy(final byte reasonByte, final Status backgroundError) { + return onErrorRecoveryBegin(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + // no-op + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewEventListener(nativeParameterHandles[0]); + } + + /** + * Deletes underlying C++ native callback object pointer + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native long createNewEventListener(final long enabledEventCallbackValues); + private native void disposeInternal(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,7 @@ protected static final String KEY_VALUE_PAIR_SEPARATOR = ";"; protected static final char KEY_VALUE_SEPARATOR = '='; - static final String INT_ARRAY_INT_SEPARATOR = ","; + static final String INT_ARRAY_INT_SEPARATOR = ":"; protected final String[] keys; private final String[] values; @@ -59,6 +59,7 @@ K extends MutableOptionKey> { private final Map> options = new LinkedHashMap<>(); + private final List unknown = new ArrayList<>(); protected abstract U self(); @@ -213,44 +214,147 @@ return ((MutableOptionValue.MutableOptionEnumValue) value).asObject(); } - public U fromString( - final String keyStr, final String 
valueStr) + /** + * Parse a string into a long value, accepting values expressed as a double (such as 9.00) which + * are meant to be a long, not a double + * + * @param value the string containing a value which represents a long + * @return the long value of the parsed string + */ + private long parseAsLong(final String value) { + try { + return Long.parseLong(value); + } catch (NumberFormatException nfe) { + final double doubleValue = Double.parseDouble(value); + if (doubleValue != Math.round(doubleValue)) + throw new IllegalArgumentException("Unable to parse or round " + value + " to int"); + return Math.round(doubleValue); + } + } + + /** + * Parse a string into an int value, accepting values expressed as a double (such as 9.00) which + * are meant to be an int, not a double + * + * @param value the string containing a value which represents an int + * @return the int value of the parsed string + */ + private int parseAsInt(final String value) { + try { + return Integer.parseInt(value); + } catch (NumberFormatException nfe) { + final double doubleValue = Double.parseDouble(value); + if (doubleValue != Math.round(doubleValue)) + throw new IllegalArgumentException("Unable to parse or round " + value + " to long"); + return (int) Math.round(doubleValue); + } + } + + /** + * Constructs a builder for mutable column family options from a hierarchical parsed options + * string representation. The {@link OptionString.Parser} class output has been used to create a + * (name,value)-list; each value may be either a simple string or a (name, value)-list in turn. 
+ * + * @param options a list of parsed option string objects + * @param ignoreUnknown what to do if the key is not one of the keys we expect + * + * @return a builder with the values from the parsed input set + * + * @throws IllegalArgumentException if an option value is of the wrong type, or a key is empty + */ + protected U fromParsed(final List options, final boolean ignoreUnknown) { + Objects.requireNonNull(options); + + for (final OptionString.Entry option : options) { + try { + if (option.key.isEmpty()) { + throw new IllegalArgumentException("options string is invalid: " + option); + } + fromOptionString(option, ignoreUnknown); + } catch (NumberFormatException nfe) { + throw new IllegalArgumentException( + "" + option.key + "=" + option.value + " - not a valid value for its type", nfe); + } + } + + return self(); + } + + /** + * Set a value in the builder from the supplied option string + * + * @param option the option key/value to add to this builder + * @param ignoreUnknown if this is not set, throw an exception when a key is not in the known + * set + * @return the same object, after adding options + * @throws IllegalArgumentException if the key is unkown, or a value has the wrong type/form + */ + private U fromOptionString(final OptionString.Entry option, final boolean ignoreUnknown) throws IllegalArgumentException { - Objects.requireNonNull(keyStr); - Objects.requireNonNull(valueStr); + Objects.requireNonNull(option.key); + Objects.requireNonNull(option.value); + + final K key = allKeys().get(option.key); + if (key == null && ignoreUnknown) { + unknown.add(option); + return self(); + } else if (key == null) { + throw new IllegalArgumentException("Key: " + key + " is not a known option key"); + } - final K key = allKeys().get(keyStr); - switch(key.getValueType()) { + if (!option.value.isList()) { + throw new IllegalArgumentException( + "Option: " + key + " is not a simple value or list, don't know how to parse it"); + } + + // Check that simple values 
are the single item in the array + if (key.getValueType() != MutableOptionKey.ValueType.INT_ARRAY) { + { + if (option.value.list.size() != 1) { + throw new IllegalArgumentException( + "Simple value does not have exactly 1 item: " + option.value.list); + } + } + } + + final List valueStrs = option.value.list; + final String valueStr = valueStrs.get(0); + + switch (key.getValueType()) { case DOUBLE: return setDouble(key, Double.parseDouble(valueStr)); case LONG: - return setLong(key, Long.parseLong(valueStr)); + return setLong(key, parseAsLong(valueStr)); case INT: - return setInt(key, Integer.parseInt(valueStr)); + return setInt(key, parseAsInt(valueStr)); case BOOLEAN: return setBoolean(key, Boolean.parseBoolean(valueStr)); case INT_ARRAY: - final String[] strInts = valueStr - .trim().split(INT_ARRAY_INT_SEPARATOR); - if(strInts == null || strInts.length == 0) { - throw new IllegalArgumentException( - "int array value is not correctly formatted"); - } - - final int value[] = new int[strInts.length]; - int i = 0; - for(final String strInt : strInts) { - value[i++] = Integer.parseInt(strInt); + final int[] value = new int[valueStrs.size()]; + for (int i = 0; i < valueStrs.size(); i++) { + value[i] = Integer.parseInt(valueStrs.get(i)); } return setIntArray(key, value); + + case ENUM: + final CompressionType compressionType = CompressionType.getFromInternal(valueStr); + return setEnum(key, compressionType); + + default: + throw new IllegalStateException(key + " has unknown value type: " + key.getValueType()); } + } - throw new IllegalStateException( - key + " has unknown value type: " + key.getValueType()); + /** + * + * @return the list of keys encountered which were not known to the type being generated + */ + public List getUnknown() { + return new ArrayList<>(unknown); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java 2025-05-19 16:14:27.000000000 +0000 @@ -67,7 +67,7 @@ @Override @Deprecated protected void finalize() throws Throwable { - if(isOwningHandle()) { + if (isOwningHandle()) { //TODO(AR) log a warning message... developer should have called close() } dispose(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -93,6 +93,12 @@ } @Override + public void refresh() throws RocksDBException { + assert (isOwningHandle()); + refresh0(nativeHandle_); + } + + @Override public void status() throws RocksDBException { assert (isOwningHandle()); status0(nativeHandle_); @@ -118,6 +124,7 @@ abstract void seekToLast0(long handle); abstract void next0(long handle); abstract void prev0(long handle); + abstract void refresh0(long handle) throws RocksDBException; abstract void seek0(long handle, byte[] target, int targetLen); abstract void seekForPrev0(long handle, byte[] target, int targetLen); abstract void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java 2025-05-19 16:14:27.000000000 +0000 @@ -56,7 +56,21 @@ delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } - public void put(ByteBuffer key, ByteBuffer value) throws RocksDBException { + @Override + @Deprecated + public void remove(final ByteBuffer key) throws RocksDBException { + this.delete(key); + } + + @Override + @Deprecated + public void remove(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) + throws RocksDBException { + this.delete(columnFamilyHandle, key); + } + + @Override + public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), 0); @@ -65,8 +79,8 @@ } @Override - public void put(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key, ByteBuffer value) - throws RocksDBException { + public void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); @@ -85,6 +99,19 @@ delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } + @Override + public void delete(final ByteBuffer key) throws RocksDBException { + deleteDirect(nativeHandle_, key, key.position(), key.remaining(), 0); + key.position(key.limit()); + } + + @Override + public void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) + throws RocksDBException { + deleteDirect( + nativeHandle_, key, key.position(), key.remaining(), columnFamilyHandle.nativeHandle_); + 
key.position(key.limit()); + } @Override public void singleDelete(byte[] key) throws RocksDBException { @@ -110,19 +137,6 @@ columnFamilyHandle.nativeHandle_); } - public void remove(ByteBuffer key) throws RocksDBException { - removeDirect(nativeHandle_, key, key.position(), key.remaining(), 0); - key.position(key.limit()); - } - - @Override - public void remove(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key) - throws RocksDBException { - removeDirect( - nativeHandle_, key, key.position(), key.remaining(), columnFamilyHandle.nativeHandle_); - key.position(key.limit()); - } - @Override public void putLogData(byte[] blob) throws RocksDBException { putLogData(nativeHandle_, blob, blob.length); @@ -184,13 +198,13 @@ abstract void delete(final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; - abstract void singleDelete(final long handle, final byte[] key, - final int keyLen) throws RocksDBException; + abstract void singleDelete(final long handle, final byte[] key, final int keyLen) + throws RocksDBException; - abstract void singleDelete(final long handle, final byte[] key, - final int keyLen, final long cfHandle) throws RocksDBException; + abstract void singleDelete(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException; - abstract void removeDirect(final long handle, final ByteBuffer key, final int keyOffset, + abstract void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -301,7 +301,7 @@ * @return the reference to the current options. */ @Experimental("Turning this feature on or off for an existing DB can cause" + - "unexpected LSM tree structure so it's not recommended") + " unexpected LSM tree structure so it's not recommended") T setLevelCompactionDynamicLevelBytes( boolean enableLevelCompactionDynamicLevelBytes); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -461,4 +461,258 @@ * @return the time-to-live. */ long ttl(); + + /** + * Files older than this value will be picked up for compaction, and + * re-written to the same level as they were before. + * One main use of the feature is to make sure a file goes through compaction + * filters periodically. Users can also use the feature to clear up SST + * files using old format. + * + * A file's age is computed by looking at file_creation_time or creation_time + * table properties in order, if they have valid non-zero values; if not, the + * age is based on the file's last modified time (given by the underlying + * Env). + * + * Supported in Level and FIFO compaction. 
+ * In FIFO compaction, this option has the same meaning as TTL and whichever + * stricter will be used. + * Pre-req: max_open_file == -1. + * unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 + * + * Values: + * 0: Turn off Periodic compactions. + * UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature + * as needed. For now, RocksDB will change this value to 30 days + * (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction + * process at least once every 30 days if not compacted sooner. + * In FIFO compaction, since the option has the same meaning as ttl, + * when this value is left default, and ttl is left to 0, 30 days will be + * used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. + * + * Default: 0xfffffffffffffffe (allow RocksDB to auto-tune) + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param periodicCompactionSeconds the periodic compaction in seconds. + * + * @return the reference to the current options. + */ + T setPeriodicCompactionSeconds(final long periodicCompactionSeconds); + + /** + * Get the periodicCompactionSeconds. + * + * See {@link #setPeriodicCompactionSeconds(long)}. + * + * @return the periodic compaction in seconds. + */ + long periodicCompactionSeconds(); + + // + // BEGIN options for blobs (integrated BlobDB) + // + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. 
+ * + * @param enableBlobFiles true iff blob files should be enabled + * + * @return the reference to the current options. + */ + T setEnableBlobFiles(final boolean enableBlobFiles); + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return true iff blob files are enabled + */ + boolean enableBlobFiles(); + + /** + * Set the size of the smallest value to be stored separately in a blob file. Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. + * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param minBlobSize the size of the smallest value to be stored separately in a blob file + * @return the reference to the current options. + */ + T setMinBlobSize(final long minBlobSize); + + /** + * Get the size of the smallest value to be stored separately in a blob file. Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. 
+ * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the current minimum size of value which is stored separately in a blob + */ + long minBlobSize(); + + /** + * Set the size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. Note that enable_blob_files has to be set in + * order for this option to have any effect. + * + * Default: 256 MB + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param blobFileSize the size limit for blob files + * + * @return the reference to the current options. + */ + T setBlobFileSize(final long blobFileSize); + + /** + * The size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. + * + * @return the current size limit for blob files + */ + long blobFileSize(); + + /** + * Set the compression algorithm to use for large values stored in blob files. Note + * that enable_blob_files has to be set in order for this option to have any + * effect. + * + * Default: no compression + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param compressionType the compression algorithm to use. + * + * @return the reference to the current options. + */ + T setBlobCompressionType(CompressionType compressionType); + + /** + * Get the compression algorithm in use for large values stored in blob files. + * Note that enable_blob_files has to be set in order for this option to have any + * effect. + * + * @return the current compression algorithm + */ + CompressionType blobCompressionType(); + + /** + * Enable/disable garbage collection of blobs. Blob GC is performed as part of + * compaction. 
Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @param enableBlobGarbageCollection the new enabled/disabled state of blob garbage collection + * + * @return the reference to the current options. + */ + T setEnableBlobGarbageCollection(final boolean enableBlobGarbageCollection); + + /** + * Query whether garbage collection of blobs is enabled.Blob GC is performed as part of + * compaction. Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @return true iff blob garbage collection is currently enabled. + */ + boolean enableBlobGarbageCollection(); + + /** + * Set cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. + * + * Default: 0.25 + * + * @param blobGarbageCollectionAgeCutoff the new age cutoff + * + * @return the reference to the current options. + */ + T setBlobGarbageCollectionAgeCutoff(double blobGarbageCollectionAgeCutoff); + /** + * Get cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. 
+ * + * Default: 0.25 + * + * @return the current age cutoff for garbage collection + */ + double blobGarbageCollectionAgeCutoff(); + + /** + * If the ratio of garbage in the oldest blob files exceeds this threshold, + * targeted compactions are scheduled in order to force garbage collecting + * the blob files in question, assuming they are all eligible based on the + * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is + * currently only supported with leveled compactions. + * + * Note that {@link #enableBlobGarbageCollection} has to be set in order for this + * option to have any effect. + * + * Default: 1.0 + * + * Dynamically changeable through the SetOptions() API + * + * @param blobGarbageCollectionForceThreshold new value for the threshold + * @return the reference to the current options + */ + T setBlobGarbageCollectionForceThreshold(double blobGarbageCollectionForceThreshold); + + /** + * Get the current value for the {@link #blobGarbageCollectionForceThreshold} + * @return the current threshold at which garbage collection of blobs is forced + */ + double blobGarbageCollectionForceThreshold(); + + // + // END options for blobs (integrated BlobDB) + // } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +public enum BackgroundErrorReason { + FLUSH((byte) 0x0), + COMPACTION((byte) 0x1), + WRITE_CALLBACK((byte) 0x2), + MEMTABLE((byte) 0x3); + + private final byte value; + + BackgroundErrorReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the BackgroundErrorReason from the internal representation value. + * + * @return the background error reason. + * + * @throws IllegalArgumentException if the value is unknown. + */ + static BackgroundErrorReason fromValue(final byte value) { + for (final BackgroundErrorReason backgroundErrorReason : BackgroundErrorReason.values()) { + if (backgroundErrorReason.value == value) { + return backgroundErrorReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for BackgroundErrorReason: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,7 @@ public BlockBasedTableConfig() { //TODO(AR) flushBlockPolicyFactory cacheIndexAndFilterBlocks = false; - cacheIndexAndFilterBlocksWithHighPriority = false; + cacheIndexAndFilterBlocksWithHighPriority = true; pinL0FilterAndIndexBlocksInCache = false; pinTopLevelIndexAndFilter = true; indexType = IndexType.kBinarySearch; @@ -32,14 +32,16 @@ indexBlockRestartInterval = 1; metadataBlockSize = 4096; partitionFilters = false; + optimizeFiltersForMemory = false; useDeltaEncoding = true; filterPolicy = null; wholeKeyFiltering = true; - 
verifyCompression = true; + verifyCompression = false; readAmpBytesPerBit = 0; - formatVersion = 2; + formatVersion = 5; enableIndexCompression = true; blockAlign = false; + indexShortening = IndexShorteningMode.kShortenSeparators; // NOTE: ONLY used if blockCache == null blockCacheSize = 8 * 1024 * 1024; @@ -77,7 +79,7 @@ /** * Indicates if index and filter blocks will be treated as high-priority in the block cache. - * See note below about applicability. If not specified, defaults to false. + * See note below about applicability. If not specified, defaults to true. * * @return if index and filter blocks will be treated as high-priority. */ @@ -453,6 +455,65 @@ return this; } + /*** + * Option to generate Bloom filters that minimize memory + * internal fragmentation. + * + * See {@link #setOptimizeFiltersForMemory(boolean)}. + * + * @return true if bloom filters are used to minimize memory internal + * fragmentation + */ + @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation") + public boolean optimizeFiltersForMemory() { + return optimizeFiltersForMemory; + } + + /** + * Option to generate Bloom filters that minimize memory + * internal fragmentation. + * + * When false, malloc_usable_size is not available, or format_version < 5, + * filters are generated without regard to internal fragmentation when + * loaded into memory (historical behavior). When true (and + * malloc_usable_size is available and {@link #formatVersion()} >= 5), + * then Bloom filters are generated to "round up" and "round down" their + * sizes to minimize internal fragmentation when loaded into memory, assuming + * the reading DB has the same memory allocation characteristics as the + * generating DB. This option does not break forward or backward + * compatibility. 
+ * + * While individual filters will vary in bits/key and false positive rate + * when setting is true, the implementation attempts to maintain a weighted + * average FP rate for filters consistent with this option set to false. + * + * With Jemalloc for example, this setting is expected to save about 10% of + * the memory footprint and block cache charge of filters, while increasing + * disk usage of filters by about 1-2% due to encoding efficiency losses + * with variance in bits/key. + * + * NOTE: Because some memory counted by block cache might be unmapped pages + * within internal fragmentation, this option can increase observed RSS + * memory usage. With {@link #cacheIndexAndFilterBlocks()} == true, + * this option makes the block cache better at using space it is allowed. + * + * NOTE: Do not set to true if you do not trust malloc_usable_size. With + * this option, RocksDB might access an allocated memory object beyond its + * original size if malloc_usable_size says it is safe to do so. While this + * can be considered bad practice, it should not produce undefined behavior + * unless malloc_usable_size is buggy or broken. + * + * @param optimizeFiltersForMemory true to enable Bloom filters that minimize + * memory internal fragmentation, or false to disable. + * + * @return the reference to the current config. + */ + @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation") + public BlockBasedTableConfig setOptimizeFiltersForMemory(final boolean optimizeFiltersForMemory) { + this.optimizeFiltersForMemory = optimizeFiltersForMemory; + return this; + } + /** * Determine if delta encoding is being used to compress block keys. * @@ -648,10 +709,13 @@ *
  • 4 - Can be read by RocksDB's versions since 5.16. Changes the way we * encode the values in index blocks. If you don't plan to run RocksDB before * version 5.16 and you are using index_block_restart_interval > 1, you should - * probably use this as it would reduce the index size.
  • + * probably use this as it would reduce the index size. + * This option only affects newly written tables. When reading existing + * tables, the information about version is read from the footer. + *
  • 5 - Can be read by RocksDB's versions since 6.6.0. + * Full and partitioned filters use a generally faster and more accurate + * Bloom filter implementation, with a different schema.
  • * - *

    This option only affects newly written tables. When reading existing - * tables, the information about version is read from the footer.

    * * @param formatVersion integer representing the version to be used. * @@ -659,7 +723,7 @@ */ public BlockBasedTableConfig setFormatVersion( final int formatVersion) { - assert(formatVersion >= 0 && formatVersion <= 4); + assert (formatVersion >= 0); this.formatVersion = formatVersion; return this; } @@ -717,6 +781,28 @@ return this; } + /** + * Get the index shortening mode. + * + * @return the index shortening mode. + */ + public IndexShorteningMode indexShortening() { + return indexShortening; + } + + /** + * Set the index shortening mode. + * + * See {@link IndexShorteningMode}. + * + * @param indexShortening the index shortening mode. + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setIndexShortening(final IndexShorteningMode indexShortening) { + this.indexShortening = indexShortening; + return this; + } /** * Get the size of the cache in bytes that will be used by RocksDB. @@ -900,54 +986,35 @@ } return newTableFactoryHandle(cacheIndexAndFilterBlocks, - cacheIndexAndFilterBlocksWithHighPriority, - pinL0FilterAndIndexBlocksInCache, pinTopLevelIndexAndFilter, - indexType.getValue(), dataBlockIndexType.getValue(), - dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, - blockCacheHandle, persistentCacheHandle, blockCacheCompressedHandle, - blockSize, blockSizeDeviation, blockRestartInterval, - indexBlockRestartInterval, metadataBlockSize, partitionFilters, - useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, - verifyCompression, readAmpBytesPerBit, formatVersion, - enableIndexCompression, blockAlign, - blockCacheSize, blockCacheNumShardBits, + cacheIndexAndFilterBlocksWithHighPriority, pinL0FilterAndIndexBlocksInCache, + pinTopLevelIndexAndFilter, indexType.getValue(), dataBlockIndexType.getValue(), + dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, blockCacheHandle, + persistentCacheHandle, blockCacheCompressedHandle, blockSize, blockSizeDeviation, + blockRestartInterval, 
indexBlockRestartInterval, metadataBlockSize, partitionFilters, + optimizeFiltersForMemory, useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, + verifyCompression, readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign, + indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits, blockCacheCompressedSize, blockCacheCompressedNumShardBits); } - private native long newTableFactoryHandle( - final boolean cacheIndexAndFilterBlocks, + private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks, final boolean cacheIndexAndFilterBlocksWithHighPriority, - final boolean pinL0FilterAndIndexBlocksInCache, - final boolean pinTopLevelIndexAndFilter, - final byte indexTypeValue, - final byte dataBlockIndexTypeValue, - final double dataBlockHashTableUtilRatio, - final byte checksumTypeValue, - final boolean noBlockCache, - final long blockCacheHandle, - final long persistentCacheHandle, - final long blockCacheCompressedHandle, - final long blockSize, - final int blockSizeDeviation, - final int blockRestartInterval, - final int indexBlockRestartInterval, - final long metadataBlockSize, - final boolean partitionFilters, - final boolean useDeltaEncoding, - final long filterPolicyHandle, - final boolean wholeKeyFiltering, - final boolean verifyCompression, - final int readAmpBytesPerBit, - final int formatVersion, - final boolean enableIndexCompression, - final boolean blockAlign, + final boolean pinL0FilterAndIndexBlocksInCache, final boolean pinTopLevelIndexAndFilter, + final byte indexTypeValue, final byte dataBlockIndexTypeValue, + final double dataBlockHashTableUtilRatio, final byte checksumTypeValue, + final boolean noBlockCache, final long blockCacheHandle, final long persistentCacheHandle, + final long blockCacheCompressedHandle, final long blockSize, final int blockSizeDeviation, + final int blockRestartInterval, final int indexBlockRestartInterval, + final long metadataBlockSize, final boolean partitionFilters, + final 
boolean optimizeFiltersForMemory, final boolean useDeltaEncoding, + final long filterPolicyHandle, final boolean wholeKeyFiltering, + final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion, + final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening, - @Deprecated final long blockCacheSize, - @Deprecated final int blockCacheNumShardBits, + @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits, @Deprecated final long blockCacheCompressedSize, - @Deprecated final int blockCacheCompressedNumShardBits - ); + @Deprecated final int blockCacheCompressedNumShardBits); //TODO(AR) flushBlockPolicyFactory private boolean cacheIndexAndFilterBlocks; @@ -968,6 +1035,7 @@ private int indexBlockRestartInterval; private long metadataBlockSize; private boolean partitionFilters; + private boolean optimizeFiltersForMemory; private boolean useDeltaEncoding; private Filter filterPolicy; private boolean wholeKeyFiltering; @@ -976,6 +1044,7 @@ private int formatVersion; private boolean enableIndexCompression; private boolean blockAlign; + private IndexShorteningMode indexShortening; // NOTE: ONLY used if blockCache == null @Deprecated private long blockCacheSize; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +package org.rocksdb; + +import java.nio.ByteBuffer; +import java.util.List; + +/** + * A ByteBuffer containing fetched data, together with a result for the fetch + * and the total size of the object fetched. 
+ * + * Used for the individual results of + * {@link RocksDB#multiGetByteBuffers(List, List)} + * {@link RocksDB#multiGetByteBuffers(List, List, List)} + * {@link RocksDB#multiGetByteBuffers(ReadOptions, List, List)} + * {@link RocksDB#multiGetByteBuffers(ReadOptions, List, List, List)} + */ +public class ByteBufferGetStatus { + public final Status status; + public final int requiredSize; + public final ByteBuffer value; + + /** + * Constructor used for success status, when the value is contained in the buffer + * + * @param status the status of the request to fetch into the buffer + * @param requiredSize the size of the data, which may be bigger than the buffer + * @param value the buffer containing as much of the value as fits + */ + ByteBufferGetStatus(final Status status, final int requiredSize, final ByteBuffer value) { + this.status = status; + this.requiredSize = requiredSize; + this.value = value; + } + + /** + * Constructor used for a failure status, when no value is filled in + * + * @param status the status of the request to fetch into the buffer + */ + ByteBufferGetStatus(final Status status) { + this.status = status; + this.requiredSize = 0; + this.value = null; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java 2025-05-19 16:14:27.000000000 +0000 @@ -10,4 +10,31 @@ protected Cache(final long nativeHandle) { super(nativeHandle); } + + /** + * Returns the memory size for the entries + * residing in cache. + * + * @return cache usage size. 
+ * + */ + public long getUsage() { + assert (isOwningHandle()); + return getUsage(this.nativeHandle_); + } + + /** + * Returns the memory size for the entries + * being pinned in cache. + * + * @return cache pinned usage size. + * + */ + public long getPinnedUsage() { + assert (isOwningHandle()); + return getPinnedUsage(this.nativeHandle_); + } + + private native static long getUsage(final long handle); + private native static long getPinnedUsage(final long handle); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java 2025-05-19 16:14:27.000000000 +0000 @@ -20,7 +20,11 @@ /** * XX Hash */ - kxxHash((byte) 2); + kxxHash((byte) 2), + /** + * XX Hash 64 + */ + kxxHash64((byte) 3); /** * Returns the byte value of the enumerations value diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,12 @@ * ColumnFamily Pointers. */ public class ColumnFamilyHandle extends RocksObject { + /** + * Constructs column family Java object, which operates on underlying native object. 
+ * + * @param rocksDB db instance associated with this column family + * @param nativeHandle native handle to underlying native ColumnFamily object + */ ColumnFamilyHandle(final RocksDB rocksDB, final long nativeHandle) { super(nativeHandle); @@ -25,6 +31,28 @@ } /** + * Constructor called only from JNI. + * + * NOTE: we are producing an additional Java Object here to represent the underlying native C++ + * ColumnFamilyHandle object. The underlying object is not owned by ourselves. The Java API user + * likely already had a ColumnFamilyHandle Java object which owns the underlying C++ object, as + * they will have been presented it when they opened the database or added a Column Family. + * + * + * TODO(AR) - Potentially a better design would be to cache the active Java Column Family Objects + * in RocksDB, and return the same Java Object instead of instantiating a new one here. This could + * also help us to improve the Java API semantics for Java users. See for example + * https://github.com/facebook/rocksdb/issues/2687. + * + * @param nativeHandle native handle to the column family. + */ + ColumnFamilyHandle(final long nativeHandle) { + super(nativeHandle); + rocksDB_ = null; + disOwnNativeHandle(); + } + + /** * Gets the name of the Column Family. * * @return The name of the Column Family. @@ -32,6 +60,7 @@ * @throws RocksDBException if an error occurs whilst retrieving the name. */ public byte[] getName() throws RocksDBException { + assert(isOwningHandle() || isDefaultColumnFamily()); return getName(nativeHandle_); } @@ -41,6 +70,7 @@ * @return the ID of the Column Family. */ public int getID() { + assert(isOwningHandle() || isDefaultColumnFamily()); return getID(nativeHandle_); } @@ -59,7 +89,7 @@ * descriptor. 
*/ public ColumnFamilyDescriptor getDescriptor() throws RocksDBException { - assert(isOwningHandle()); + assert(isOwningHandle() || isDefaultColumnFamily()); return getDescriptor(nativeHandle_); } @@ -85,12 +115,18 @@ @Override public int hashCode() { try { - return Objects.hash(getName(), getID(), rocksDB_.nativeHandle_); + int result = Objects.hash(getID(), rocksDB_.nativeHandle_); + result = 31 * result + Arrays.hashCode(getName()); + return result; } catch (RocksDBException e) { throw new RuntimeException("Cannot calculate hash code of column family handle", e); } } + protected boolean isDefaultColumnFamily() { + return nativeHandle_ == rocksDB_.getDefaultColumnFamily().nativeHandle_; + } + /** *

    Deletes underlying C++ iterator pointer.

    * diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,9 +5,8 @@ package org.rocksdb; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; +import java.nio.file.Paths; +import java.util.*; /** * ColumnFamilyOptions to control the behavior of a database. It will be used @@ -52,6 +51,8 @@ this.compactionOptionsFIFO_ = other.compactionOptionsFIFO_; this.bottommostCompressionOptions_ = other.bottommostCompressionOptions_; this.compressionOptions_ = other.compressionOptions_; + this.compactionThreadLimiter_ = other.compactionThreadLimiter_; + this.sstPartitionerFactory_ = other.sstPartitionerFactory_; } /** @@ -96,20 +97,40 @@ */ public static ColumnFamilyOptions getColumnFamilyOptionsFromProps( final Properties properties) { - if (properties == null || properties.size() == 0) { - throw new IllegalArgumentException( - "Properties value must contain at least one value."); - } ColumnFamilyOptions columnFamilyOptions = null; - StringBuilder stringBuilder = new StringBuilder(); - for (final String name : properties.stringPropertyNames()){ - stringBuilder.append(name); - stringBuilder.append("="); - stringBuilder.append(properties.getProperty(name)); - stringBuilder.append(";"); + final long handle = + getColumnFamilyOptionsFromProps(Options.getOptionStringFromProps(properties)); + if (handle != 0) { + columnFamilyOptions = new ColumnFamilyOptions(handle); } - long handle = getColumnFamilyOptionsFromProps( - stringBuilder.toString()); + return columnFamilyOptions; + } + + /** + *

    Method to get a options instance by using pre-configured + * property values. If one or many values are undefined in + * the context of RocksDB the method will return a null + * value.

    + * + *

    Note: Property keys can be derived from + * getter methods within the options class. Example: the method + * {@code writeBufferSize()} has a property key: + * {@code write_buffer_size}.

    + * + * @param cfgOpts ConfigOptions controlling how the properties are parsed. + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.ColumnFamilyOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link Properties} instance is passed to the method call. + */ + public static ColumnFamilyOptions getColumnFamilyOptionsFromProps( + final ConfigOptions cfgOpts, final Properties properties) { + ColumnFamilyOptions columnFamilyOptions = null; + final long handle = getColumnFamilyOptionsFromProps( + cfgOpts.nativeHandle_, Options.getOptionStringFromProps(properties)); if (handle != 0){ columnFamilyOptions = new ColumnFamilyOptions(handle); } @@ -117,12 +138,24 @@ } @Override + public ColumnFamilyOptions oldDefaults(final int majorVersion, final int minorVersion) { + oldDefaults(nativeHandle_, majorVersion, minorVersion); + return this; + } + + @Override public ColumnFamilyOptions optimizeForSmallDb() { optimizeForSmallDb(nativeHandle_); return this; } @Override + public ColumnFamilyOptions optimizeForSmallDb(final Cache cache) { + optimizeForSmallDb(nativeHandle_, cache.getNativeHandle()); + return this; + } + + @Override public ColumnFamilyOptions optimizeForPointLookup( final long blockCacheSizeMb) { optimizeForPointLookup(nativeHandle_, @@ -307,7 +340,7 @@ final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } @@ -576,6 +609,45 @@ } @Override + public ColumnFamilyOptions setCfPaths(final Collection cfPaths) { + assert (isOwningHandle()); + + final int len = cfPaths.size(); + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + int i = 0; + for (final DbPath dbPath : 
cfPaths) { + paths[i] = dbPath.path.toString(); + targetSizes[i] = dbPath.targetSize; + i++; + } + setCfPaths(nativeHandle_, paths, targetSizes); + return this; + } + + @Override + public List cfPaths() { + final int len = (int) cfPathsLen(nativeHandle_); + + if (len == 0) { + return Collections.emptyList(); + } + + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + cfPaths(nativeHandle_, paths, targetSizes); + + final List cfPaths = new ArrayList<>(); + for (int i = 0; i < len; i++) { + cfPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i])); + } + + return cfPaths; + } + + @Override public ColumnFamilyOptions setInplaceUpdateSupport( final boolean inplaceUpdateSupport) { setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); @@ -787,6 +859,17 @@ } @Override + public ColumnFamilyOptions setPeriodicCompactionSeconds(final long periodicCompactionSeconds) { + setPeriodicCompactionSeconds(nativeHandle_, periodicCompactionSeconds); + return this; + } + + @Override + public long periodicCompactionSeconds() { + return periodicCompactionSeconds(nativeHandle_); + } + + @Override public ColumnFamilyOptions setCompactionOptionsUniversal( final CompactionOptionsUniversal compactionOptionsUniversal) { setCompactionOptionsUniversal(nativeHandle_, @@ -824,8 +907,304 @@ return forceConsistencyChecks(nativeHandle_); } + @Override + public ColumnFamilyOptions setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) { + setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_); + this.sstPartitionerFactory_ = sstPartitionerFactory; + return this; + } + + @Override + public ColumnFamilyOptions setCompactionThreadLimiter( + final ConcurrentTaskLimiter compactionThreadLimiter) { + setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_); + this.compactionThreadLimiter_ = compactionThreadLimiter; + return this; + } + + @Override + public ConcurrentTaskLimiter 
compactionThreadLimiter() { + assert (isOwningHandle()); + return this.compactionThreadLimiter_; + } + + @Override + public SstPartitionerFactory sstPartitionerFactory() { + return sstPartitionerFactory_; + } + + // + // BEGIN options for blobs (integrated BlobDB) + // + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param enableBlobFiles true iff blob files should be enabled + * + * @return the reference to the current options. + */ + @Override + public ColumnFamilyOptions setEnableBlobFiles(final boolean enableBlobFiles) { + setEnableBlobFiles(nativeHandle_, enableBlobFiles); + return this; + } + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return true iff blob files are currently enabled + */ + public boolean enableBlobFiles() { + return enableBlobFiles(nativeHandle_); + } + + /** + * Set the size of the smallest value to be stored separately in a blob file. 
Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. + * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param minBlobSize the size of the smallest value to be stored separately in a blob file + * @return these options, updated with the supplied minimum blob size value + */ + @Override + public ColumnFamilyOptions setMinBlobSize(final long minBlobSize) { + setMinBlobSize(nativeHandle_, minBlobSize); + return this; + } + + /** + * Get the size of the smallest value to be stored separately in a blob file. Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. + * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the current minimum blob size + */ + @Override + public long minBlobSize() { + return minBlobSize(nativeHandle_); + } + + /** + * Set the size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. Note that enable_blob_files has to be set in + * order for this option to have any effect. + * + * Default: 256 MB + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param blobFileSize the new size limit for blob files + * + * @return the reference to the current options. 
+ */ + @Override + public ColumnFamilyOptions setBlobFileSize(final long blobFileSize) { + setBlobFileSize(nativeHandle_, blobFileSize); + return this; + } + + /** + * Get the size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. Note that enable_blob_files has to be set in + * order for this option to have any effect. + * + * Default: 256 MB + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the size limit for blob files + */ + @Override + public long blobFileSize() { + return blobFileSize(nativeHandle_); + } + + /** + * Set the compression algorithm to use for large values stored in blob files. Note + * that enable_blob_files has to be set in order for this option to have any + * effect. + * + * Default: no compression + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param compressionType the compression algorithm to use + * + * @return the reference to the current options. + */ + @Override + public ColumnFamilyOptions setBlobCompressionType(final CompressionType compressionType) { + setBlobCompressionType(nativeHandle_, compressionType.getValue()); + return this; + } + + /** + * Get the compression algorithm to use for large values stored in blob files. Note + * that enable_blob_files has to be set in order for this option to have any + * effect. + * + * Default: no compression + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the compression algorithm currently in use for blobs + */ + @Override + public CompressionType blobCompressionType() { + return CompressionType.values()[blobCompressionType(nativeHandle_)]; + } + + /** + * Enable/disable garbage collection of blobs. Blob GC is performed as part of + * compaction. 
Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @param enableBlobGarbageCollection true iff blob garbage collection is to be enabled + * + * @return the reference to the current options. + */ + @Override + public ColumnFamilyOptions setEnableBlobGarbageCollection( + final boolean enableBlobGarbageCollection) { + setEnableBlobGarbageCollection(nativeHandle_, enableBlobGarbageCollection); + return this; + } + + /** + * Get enabled/disables state for garbage collection of blobs. Blob GC is performed as part of + * compaction. Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @return true iff blob garbage collection is currently enabled + */ + @Override + public boolean enableBlobGarbageCollection() { + return enableBlobGarbageCollection(nativeHandle_); + } + + /** + * Set the cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. + * + * Default: 0.25 + * + * @param blobGarbageCollectionAgeCutoff the new blob garbage collection age cutoff + * + * @return the reference to the current options. 
+ */ + @Override + public ColumnFamilyOptions setBlobGarbageCollectionAgeCutoff( + final double blobGarbageCollectionAgeCutoff) { + setBlobGarbageCollectionAgeCutoff(nativeHandle_, blobGarbageCollectionAgeCutoff); + return this; + } + + /** + * Get the cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. + * + * Default: 0.25 + * + * @return the current blob garbage collection age cutoff + */ + @Override + public double blobGarbageCollectionAgeCutoff() { + return blobGarbageCollectionAgeCutoff(nativeHandle_); + } + + /** + * If the ratio of garbage in the oldest blob files exceeds this threshold, + * targeted compactions are scheduled in order to force garbage collecting + * the blob files in question, assuming they are all eligible based on the + * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is + * currently only supported with leveled compactions. + * + * Note that {@link #enableBlobGarbageCollection} has to be set in order for this + * option to have any effect. 
+ * + * Default: 1.0 + * + * Dynamically changeable through the SetOptions() API + * + * @param blobGarbageCollectionForceThreshold new value for the threshold + * @return the reference to the current options + */ + @Override + public ColumnFamilyOptions setBlobGarbageCollectionForceThreshold( + final double blobGarbageCollectionForceThreshold) { + setBlobGarbageCollectionForceThreshold(nativeHandle_, blobGarbageCollectionForceThreshold); + return this; + } + + /** + * Get the current value for the {@link #blobGarbageCollectionForceThreshold} + * @return the current threshold at which garbage collection of blobs is forced + */ + @Override + public double blobGarbageCollectionForceThreshold() { + return blobGarbageCollectionForceThreshold(nativeHandle_); + } + + // + // END options for blobs (integrated BlobDB) + // + private static native long getColumnFamilyOptionsFromProps( - String optString); + final long cfgHandle, String optString); + private static native long getColumnFamilyOptionsFromProps(final String optString); private static native long newColumnFamilyOptions(); private static native long copyColumnFamilyOptions(final long handle); @@ -833,7 +1212,10 @@ final long optionsHandle); @Override protected final native void disposeInternal(final long handle); + private static native void oldDefaults( + final long handle, final int majorVersion, final int minorVersion); private native void optimizeForSmallDb(final long handle); + private static native void optimizeForSmallDb(final long handle, final long cacheHandle); private native void optimizeForPointLookup(long handle, long blockCacheSizeMb); private native void optimizeLevelStyleCompaction(long handle, @@ -922,6 +1304,11 @@ private native String memTableFactoryName(long handle); private native void setTableFactory(long handle, long factoryHandle); private native String tableFactoryName(long handle); + private static native void setCfPaths( + final long handle, final String[] paths, final long[] 
targetSizes); + private static native long cfPathsLen(final long handle); + private static native void cfPaths( + final long handle, final String[] paths, final long[] targetSizes); private native void setInplaceUpdateSupport( long handle, boolean inplaceUpdateSupport); private native boolean inplaceUpdateSupport(long handle); @@ -977,6 +1364,9 @@ private native boolean reportBgIoStats(final long handle); private native void setTtl(final long handle, final long ttl); private native long ttl(final long handle); + private native void setPeriodicCompactionSeconds( + final long handle, final long periodicCompactionSeconds); + private native long periodicCompactionSeconds(final long handle); private native void setCompactionOptionsUniversal(final long handle, final long compactionOptionsUniversalHandle); private native void setCompactionOptionsFIFO(final long handle, @@ -984,6 +1374,27 @@ private native void setForceConsistencyChecks(final long handle, final boolean forceConsistencyChecks); private native boolean forceConsistencyChecks(final long handle); + private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); + private static native void setCompactionThreadLimiter( + final long nativeHandle_, final long compactionThreadLimiterHandle); + + private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); + private native boolean enableBlobFiles(final long nativeHandle_); + private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); + private native long minBlobSize(final long nativeHandle_); + private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); + private native long blobFileSize(final long nativeHandle_); + private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType); + private native byte blobCompressionType(final long nativeHandle_); + private native void setEnableBlobGarbageCollection( + final long 
nativeHandle_, final boolean enableBlobGarbageCollection); + private native boolean enableBlobGarbageCollection(final long nativeHandle_); + private native void setBlobGarbageCollectionAgeCutoff( + final long nativeHandle_, final double blobGarbageCollectionAgeCutoff); + private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); + private native void setBlobGarbageCollectionForceThreshold( + final long nativeHandle_, final double blobGarbageCollectionForceThreshold); + private native double blobGarbageCollectionForceThreshold(final long nativeHandle_); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! @@ -997,5 +1408,6 @@ private CompactionOptionsFIFO compactionOptionsFIFO_; private CompressionOptions bottommostCompressionOptions_; private CompressionOptions compressionOptions_; - + private SstPartitionerFactory sstPartitionerFactory_; + private ConcurrentTaskLimiter compactionThreadLimiter_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,9 +5,22 @@ package org.rocksdb; +import java.util.Collection; +import java.util.List; + public interface ColumnFamilyOptionsInterface> extends AdvancedColumnFamilyOptionsInterface { /** + * The function recovers options to a previous version. Only 4.6 or later + * versions are supported. 
+ * + * @param majorVersion The major version to recover default values of options + * @param minorVersion The minor version to recover default values of options + * @return the instance of the current object. + */ + T oldDefaults(int majorVersion, int minorVersion); + + /** * Use this if your DB is very small (like under 1GB) and you don't want to * spend lots of memory for memtables. * @@ -16,6 +29,16 @@ T optimizeForSmallDb(); /** + * Some functions that make it easier to optimize RocksDB + * Use this if your DB is very small (like under 1GB) and you don't want to + * spend lots of memory for memtables. + * + * @param cache An optional cache object is passed in to be used as the block cache + * @return the instance of the current object. + */ + T optimizeForSmallDb(Cache cache); + + /** * Use this if you don't need to keep the data sorted, i.e. you'll never use * an iterator, only Put() and Get() API calls * @@ -372,6 +395,30 @@ String tableFactoryName(); /** + * A list of paths where SST files for this column family + * can be put into, with its target size. Similar to db_paths, + * newer data is placed into paths specified earlier in the + * vector while older data gradually moves to paths specified + * later in the vector. + * Note that, if a path is supplied to multiple column + * families, it would have files and total size from all + * the column families combined. User should provision for the + * total size(from all the column families) in such cases. + * + * If left empty, db_paths will be used. + * Default: empty + * + * @param paths collection of paths for SST files. + * @return the reference of the current options. + */ + T setCfPaths(final Collection paths); + + /** + * @return collection of paths for SST files. + */ + List cfPaths(); + + /** * Compression algorithm that will be used for the bottommost level that * contain files. If level-compaction is used, this option will only affect * levels after base level. 
@@ -438,6 +485,46 @@ CompressionOptions compressionOptions(); /** + * If non-nullptr, use the specified factory for a function to determine the + * partitioning of sst files. This helps compaction to split the files + * on interesting boundaries (key prefixes) to make propagation of sst + * files less write amplifying (covering the whole key space). + * + * Default: nullptr + * + * @param factory The factory reference + * @return the reference of the current options. + */ + @Experimental("Caution: this option is experimental") + T setSstPartitionerFactory(SstPartitionerFactory factory); + + /** + * Get SST partitioner factory + * + * @return SST partitioner factory + */ + @Experimental("Caution: this option is experimental") + SstPartitionerFactory sstPartitionerFactory(); + + /** + * Compaction concurrent thread limiter for the column family. + * If non-nullptr, use given concurrent thread limiter to control + * the max outstanding compaction tasks. Limiter can be shared with + * multiple column families across db instances. + * + * @param concurrentTaskLimiter The compaction thread limiter. + * @return the reference of the current options. + */ + T setCompactionThreadLimiter(ConcurrentTaskLimiter concurrentTaskLimiter); + + /** + * Get compaction thread limiter + * + * @return Compaction thread limiter + */ + ConcurrentTaskLimiter compactionThreadLimiter(); + + /** * Default memtable memory budget used with the following methods: * *
      diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -15,13 +15,14 @@ private final static byte VALUE_kIfHaveCompactionFilter = 1; private final static byte VALUE_kForce = 2; - // For level based compaction, we can configure if we want to skip/force bottommost level compaction. - // The order of this neum MUST follow the C++ layer. See BottommostLevelCompaction in db/options.h + // For level based compaction, we can configure if we want to skip/force bottommost level + // compaction. The order of this enum MUST follow the C++ layer. See BottommostLevelCompaction in + // db/options.h public enum BottommostLevelCompaction { /** * Skip bottommost level compaction */ - kSkip((byte)VALUE_kSkip), + kSkip(VALUE_kSkip), /** * Only compact bottommost level if there is a compaction filter. This is the default option */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -20,6 +20,8 @@ */ private CompactionJobInfo(final long nativeHandle) { super(nativeHandle); + // We do not own the native object! 
+ disOwnNativeHandle(); } /** diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -78,7 +78,17 @@ /** * Compaction caused by external sst file ingestion */ - kExternalSstIngestion((byte)0x0D); + kExternalSstIngestion((byte) 0x0D), + + /** + * Compaction due to SST file being too old + */ + kPeriodicCompaction((byte) 0x0E), + + /** + * Compaction in order to move files to temperature + */ + kChangeTemperature((byte) 0x0F); private final byte value; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java 2025-05-19 16:14:27.000000000 +0000 @@ -14,16 +14,15 @@ * compression method (if any) is used to compress a block.

      */ public enum CompressionType { - - NO_COMPRESSION((byte) 0x0, null), - SNAPPY_COMPRESSION((byte) 0x1, "snappy"), - ZLIB_COMPRESSION((byte) 0x2, "z"), - BZLIB2_COMPRESSION((byte) 0x3, "bzip2"), - LZ4_COMPRESSION((byte) 0x4, "lz4"), - LZ4HC_COMPRESSION((byte) 0x5, "lz4hc"), - XPRESS_COMPRESSION((byte) 0x6, "xpress"), - ZSTD_COMPRESSION((byte)0x7, "zstd"), - DISABLE_COMPRESSION_OPTION((byte)0x7F, null); + NO_COMPRESSION((byte) 0x0, null, "kNoCompression"), + SNAPPY_COMPRESSION((byte) 0x1, "snappy", "kSnappyCompression"), + ZLIB_COMPRESSION((byte) 0x2, "z", "kZlibCompression"), + BZLIB2_COMPRESSION((byte) 0x3, "bzip2", "kBZip2Compression"), + LZ4_COMPRESSION((byte) 0x4, "lz4", "kLZ4Compression"), + LZ4HC_COMPRESSION((byte) 0x5, "lz4hc", "kLZ4HCCompression"), + XPRESS_COMPRESSION((byte) 0x6, "xpress", "kXpressCompression"), + ZSTD_COMPRESSION((byte) 0x7, "zstd", "kZSTD"), + DISABLE_COMPRESSION_OPTION((byte) 0x7F, null, "kDisableCompressionOption"); /** *

      Get the CompressionType enumeration value by @@ -71,6 +70,27 @@ } /** + *

      Get a CompressionType value based on the string key in the C++ options output. + * This gets used in support of getting options into Java from an options string, + * which is generated at the C++ level. + *

      + * + * @param internalName the internal (C++) name by which the option is known. + * + * @return CompressionType instance (optional) + */ + static CompressionType getFromInternal(final String internalName) { + for (final CompressionType compressionType : CompressionType.values()) { + if (compressionType.internalName_.equals(internalName)) { + return compressionType; + } + } + + throw new IllegalArgumentException( + "Illegal internalName '" + internalName + " ' provided for CompressionType."); + } + + /** *

      Returns the byte value of the enumerations value.

      * * @return byte representation @@ -89,11 +109,13 @@ return libraryName_; } - CompressionType(final byte value, final String libraryName) { + CompressionType(final byte value, final String libraryName, final String internalName) { value_ = value; libraryName_ = libraryName; + internalName_ = internalName; } private final byte value_; private final String libraryName_; + private final String internalName_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +package org.rocksdb; + +public abstract class ConcurrentTaskLimiter extends RocksObject { + protected ConcurrentTaskLimiter(final long nativeHandle) { + super(nativeHandle); + } + + /** + * Returns a name that identifies this concurrent task limiter. + * + * @return Concurrent task limiter name. + */ + public abstract String name(); + + /** + * Set max concurrent tasks.
      + * limit = 0 means no new task allowed.
      + * limit < 0 means no limitation. + * + * @param maxOutstandinsTask max concurrent tasks. + * @return the reference to the current instance of ConcurrentTaskLimiter. + */ + public abstract ConcurrentTaskLimiter setMaxOutstandingTask(final int maxOutstandinsTask); + + /** + * Reset to unlimited max concurrent task. + * + * @return the reference to the current instance of ConcurrentTaskLimiter. + */ + public abstract ConcurrentTaskLimiter resetMaxOutstandingTask(); + + /** + * Returns current outstanding task count. + * + * @return current outstanding task count. + */ + public abstract int outstandingTask(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,42 @@ +package org.rocksdb; + +public class ConcurrentTaskLimiterImpl extends ConcurrentTaskLimiter { + public ConcurrentTaskLimiterImpl(final String name, final int maxOutstandingTask) { + super(newConcurrentTaskLimiterImpl0(name, maxOutstandingTask)); + } + + @Override + public String name() { + assert (isOwningHandle()); + return name(nativeHandle_); + } + + @Override + public ConcurrentTaskLimiter setMaxOutstandingTask(final int maxOutstandingTask) { + assert (isOwningHandle()); + setMaxOutstandingTask(nativeHandle_, maxOutstandingTask); + return this; + } + + @Override + public ConcurrentTaskLimiter resetMaxOutstandingTask() { + assert (isOwningHandle()); + resetMaxOutstandingTask(nativeHandle_); + return this; + } + + @Override + public int outstandingTask() { + assert (isOwningHandle()); + return outstandingTask(nativeHandle_); + } + + private 
static native long newConcurrentTaskLimiterImpl0( + final String name, final int maxOutstandingTask); + private static native String name(final long handle); + private static native void setMaxOutstandingTask(final long handle, final int limit); + private static native void resetMaxOutstandingTask(final long handle); + private static native int outstandingTask(final long handle); + + @Override protected final native void disposeInternal(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,47 @@ +package org.rocksdb; + +public class ConfigOptions extends RocksObject { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct with default Options + */ + public ConfigOptions() { + super(newConfigOptions()); + } + + public ConfigOptions setDelimiter(final String delimiter) { + setDelimiter(nativeHandle_, delimiter); + return this; + } + public ConfigOptions setIgnoreUnknownOptions(final boolean ignore) { + setIgnoreUnknownOptions(nativeHandle_, ignore); + return this; + } + + public ConfigOptions setEnv(final Env env) { + setEnv(nativeHandle_, env.nativeHandle_); + return this; + } + + public ConfigOptions setInputStringsEscaped(final boolean escaped) { + setInputStringsEscaped(nativeHandle_, escaped); + return this; + } + + public ConfigOptions setSanityLevel(final SanityLevel level) { + setSanityLevel(nativeHandle_, level.getValue()); + return this; + } + + @Override protected final native void disposeInternal(final long handle); + + private native static long newConfigOptions(); + private native static void setEnv(final long handle, 
final long envHandle); + private native static void setDelimiter(final long handle, final String delimiter); + private native static void setIgnoreUnknownOptions(final long handle, final boolean ignore); + private native static void setInputStringsEscaped(final long handle, final boolean escaped); + private native static void setSanityLevel(final long handle, final byte level); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -71,6 +71,7 @@ * {@code allowMmapReads()} has a property key: * {@code allow_mmap_reads}.

      * + * @param cfgOpts The ConfigOptions to control how the string is processed. * @param properties {@link java.util.Properties} instance. * * @return {@link org.rocksdb.DBOptions instance} @@ -80,22 +81,40 @@ * {@link java.util.Properties} instance is passed to the method call. */ public static DBOptions getDBOptionsFromProps( - final Properties properties) { - if (properties == null || properties.size() == 0) { - throw new IllegalArgumentException( - "Properties value must contain at least one value."); - } + final ConfigOptions cfgOpts, final Properties properties) { DBOptions dbOptions = null; - StringBuilder stringBuilder = new StringBuilder(); - for (final String name : properties.stringPropertyNames()){ - stringBuilder.append(name); - stringBuilder.append("="); - stringBuilder.append(properties.getProperty(name)); - stringBuilder.append(";"); + final String optionsString = Options.getOptionStringFromProps(properties); + final long handle = getDBOptionsFromProps(cfgOpts.nativeHandle_, optionsString); + if (handle != 0) { + dbOptions = new DBOptions(handle); } - long handle = getDBOptionsFromProps( - stringBuilder.toString()); - if (handle != 0){ + return dbOptions; + } + + /** + *

      Method to get a options instance by using pre-configured + * property values. If one or many values are undefined in + * the context of RocksDB the method will return a null + * value.

      + * + *

      Note: Property keys can be derived from + * getter methods within the options class. Example: the method + * {@code allowMmapReads()} has a property key: + * {@code allow_mmap_reads}.

      + * + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.DBOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link java.util.Properties} instance is passed to the method call. + */ + public static DBOptions getDBOptionsFromProps(final Properties properties) { + DBOptions dbOptions = null; + final String optionsString = Options.getOptionStringFromProps(properties); + final long handle = getDBOptionsFromProps(optionsString); + if (handle != 0) { dbOptions = new DBOptions(handle); } return dbOptions; @@ -554,6 +573,18 @@ } @Override + public DBOptions setMaxWriteBatchGroupSizeBytes(final long maxWriteBatchGroupSizeBytes) { + setMaxWriteBatchGroupSizeBytes(nativeHandle_, maxWriteBatchGroupSizeBytes); + return this; + } + + @Override + public long maxWriteBatchGroupSizeBytes() { + assert (isOwningHandle()); + return maxWriteBatchGroupSizeBytes(nativeHandle_); + } + + @Override public DBOptions setManifestPreallocationSize( final long size) { assert(isOwningHandle()); @@ -853,32 +884,18 @@ return strictBytesPerSync(nativeHandle_); } - //TODO(AR) NOW -// @Override -// public DBOptions setListeners(final List listeners) { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = new long[listeners.size()]; -// for (int i = 0; i < eventListenerHandlers.length; i++) { -// eventListenerHandlers[i] = listeners.get(i).nativeHandle_; -// } -// setEventListeners(nativeHandle_, eventListenerHandlers); -// return this; -// } -// -// @Override -// public Collection listeners() { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = listeners(nativeHandle_); -// if (eventListenerHandlers == null || eventListenerHandlers.length == 0) { -// return Collections.emptyList(); -// } -// -// final List eventListeners = new ArrayList<>(); -// for (final long eventListenerHandle : eventListenerHandlers) { -// eventListeners.add(new 
EventListener(eventListenerHandle)); //TODO(AR) check ownership is set to false! -// } -// return eventListeners; -// } + @Override + public DBOptions setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } @Override public DBOptions setEnableThreadTracking(final boolean enableThreadTracking) { @@ -992,6 +1009,19 @@ } @Override + public DBOptions setSkipCheckingSstFileSizesOnDbOpen( + final boolean skipCheckingSstFileSizesOnDbOpen) { + setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen); + return this; + } + + @Override + public boolean skipCheckingSstFileSizesOnDbOpen() { + assert (isOwningHandle()); + return skipCheckingSstFileSizesOnDbOpen(nativeHandle_); + } + + @Override public DBOptions setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) { assert(isOwningHandle()); setWalRecoveryMode(nativeHandle_, walRecoveryMode.getValue()); @@ -1160,6 +1190,90 @@ return atomicFlush(nativeHandle_); } + @Override + public DBOptions setAvoidUnnecessaryBlockingIO(final boolean avoidUnnecessaryBlockingIO) { + setAvoidUnnecessaryBlockingIO(nativeHandle_, avoidUnnecessaryBlockingIO); + return this; + } + + @Override + public boolean avoidUnnecessaryBlockingIO() { + assert (isOwningHandle()); + return avoidUnnecessaryBlockingIO(nativeHandle_); + } + + @Override + public DBOptions setPersistStatsToDisk(final boolean persistStatsToDisk) { + setPersistStatsToDisk(nativeHandle_, persistStatsToDisk); + return this; + } + + @Override + public boolean persistStatsToDisk() { + assert (isOwningHandle()); + return persistStatsToDisk(nativeHandle_); + } + + @Override + public DBOptions setWriteDbidToManifest(final boolean writeDbidToManifest) { + setWriteDbidToManifest(nativeHandle_, writeDbidToManifest); + 
return this; + } + + @Override + public boolean writeDbidToManifest() { + assert (isOwningHandle()); + return writeDbidToManifest(nativeHandle_); + } + + @Override + public DBOptions setLogReadaheadSize(final long logReadaheadSize) { + setLogReadaheadSize(nativeHandle_, logReadaheadSize); + return this; + } + + @Override + public long logReadaheadSize() { + assert (isOwningHandle()); + return logReadaheadSize(nativeHandle_); + } + + @Override + public DBOptions setBestEffortsRecovery(final boolean bestEffortsRecovery) { + setBestEffortsRecovery(nativeHandle_, bestEffortsRecovery); + return this; + } + + @Override + public boolean bestEffortsRecovery() { + assert (isOwningHandle()); + return bestEffortsRecovery(nativeHandle_); + } + + @Override + public DBOptions setMaxBgErrorResumeCount(final int maxBgerrorResumeCount) { + setMaxBgErrorResumeCount(nativeHandle_, maxBgerrorResumeCount); + return this; + } + + @Override + public int maxBgerrorResumeCount() { + assert (isOwningHandle()); + return maxBgerrorResumeCount(nativeHandle_); + } + + @Override + public DBOptions setBgerrorResumeRetryInterval(final long bgerrorResumeRetryInterval) { + setBgerrorResumeRetryInterval(nativeHandle_, bgerrorResumeRetryInterval); + return this; + } + + @Override + public long bgerrorResumeRetryInterval() { + assert (isOwningHandle()); + return bgerrorResumeRetryInterval(nativeHandle_); + } + static final int DEFAULT_NUM_SHARD_BITS = -1; @@ -1175,8 +1289,8 @@ super(nativeHandle); } - private static native long getDBOptionsFromProps( - String optString); + private static native long getDBOptionsFromProps(long cfgHandle, String optString); + private static native long getDBOptionsFromProps(String optString); private static native long newDBOptions(); private static native long copyDBOptions(final long handle); @@ -1262,6 +1376,9 @@ private native long walTtlSeconds(long handle); private native void setWalSizeLimitMB(long handle, long sizeLimitMB); private native long walSizeLimitMB(long 
handle); + private static native void setMaxWriteBatchGroupSizeBytes( + final long handle, final long maxWriteBatchGroupSizeBytes); + private static native long maxWriteBatchGroupSizeBytes(final long handle); private native void setManifestPreallocationSize( long handle, long size) throws IllegalArgumentException; private native long manifestPreallocationSize(long handle); @@ -1328,6 +1445,9 @@ final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); @@ -1354,6 +1474,9 @@ private native void setSkipStatsUpdateOnDbOpen(final long handle, final boolean skipStatsUpdateOnDbOpen); private native boolean skipStatsUpdateOnDbOpen(final long handle); + private static native void setSkipCheckingSstFileSizesOnDbOpen( + final long handle, final boolean skipChecking); + private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); private native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); private native byte walRecoveryMode(final long handle); @@ -1391,6 +1514,26 @@ private native void setAtomicFlush(final long handle, final boolean atomicFlush); private native boolean atomicFlush(final long handle); + private static native void setAvoidUnnecessaryBlockingIO( + final long handle, final boolean avoidBlockingIO); + private static native boolean avoidUnnecessaryBlockingIO(final long handle); + private static native void setPersistStatsToDisk( + final long handle, final boolean persistStatsToDisk); + private static native boolean persistStatsToDisk(final long handle); + private static native void setWriteDbidToManifest( + final long handle, final 
boolean writeDbidToManifest); + private static native boolean writeDbidToManifest(final long handle); + private static native void setLogReadaheadSize(final long handle, final long logReadaheadSize); + private static native long logReadaheadSize(final long handle); + private static native void setBestEffortsRecovery( + final long handle, final boolean bestEffortsRecovery); + private static native boolean bestEffortsRecovery(final long handle); + private static native void setMaxBgErrorResumeCount( + final long handle, final int maxBgerrorRecumeCount); + private static native int maxBgerrorResumeCount(final long handle); + private static native void setBgerrorResumeRetryInterval( + final long handle, final long bgerrorResumeRetryInterval); + private static native long bgerrorResumeRetryInterval(final long handle); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -625,7 +625,7 @@ * then WAL_size_limit_MB, they will be deleted starting with the * earliest until size_limit is met. All empty files will be deleted. *
    1. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that + * WAL files will be checked every WAL_ttl_seconds / 2 and those that * are older than WAL_ttl_seconds will be deleted.
    2. *
    3. If both are not 0, WAL files will be checked every 10 min and both * checks will be performed with ttl being first.
    4. @@ -648,7 +648,7 @@ * then WAL_size_limit_MB, they will be deleted starting with the * earliest until size_limit is met. All empty files will be deleted. *
    5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that + * WAL files will be checked every WAL_ttl_seconds / 2 and those that * are older than WAL_ttl_seconds will be deleted.
    6. *
    7. If both are not 0, WAL files will be checked every 10 min and both * checks will be performed with ttl being first.
    8. @@ -704,6 +704,29 @@ long walSizeLimitMB(); /** + * The maximum limit of number of bytes that are written in a single batch + * of WAL or memtable write. It is followed when the leader write size + * is larger than 1/8 of this limit. + * + * Default: 1 MB + * + * @param maxWriteBatchGroupSizeBytes the maximum limit of number of bytes, see description. + * @return the instance of the current object. + */ + T setMaxWriteBatchGroupSizeBytes(final long maxWriteBatchGroupSizeBytes); + + /** + * The maximum limit of number of bytes that are written in a single batch + * of WAL or memtable write. It is followed when the leader write size + * is larger than 1/8 of this limit. + * + * Default: 1 MB + * + * @return the maximum limit of number of bytes, see description. + */ + long maxWriteBatchGroupSizeBytes(); + + /** * Number of bytes to preallocate (via fallocate) the manifest * files. Default is 4mb, which is reasonable to reduce random IO * as well as prevent overallocation for mounts that preallocate @@ -1032,24 +1055,31 @@ */ boolean useAdaptiveMutex(); - //TODO(AR) NOW -// /** -// * Sets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @param listeners the listeners who should be notified on various events. -// * -// * @return the instance of the current object. -// */ -// T setListeners(final List listeners); -// -// /** -// * Gets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @return a collection of Event listeners. -// */ -// Collection listeners(); + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. 
+ * + * @param listeners the listeners who should be notified on various events. + * + * @return the instance of the current object. + */ + T setListeners(final List listeners); + + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. + * + * @return the instance of the current object. + */ + List listeners(); /** * If true, then the status of the threads involved in this DB will @@ -1279,6 +1309,36 @@ boolean skipStatsUpdateOnDbOpen(); /** + * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files. + * This may significantly speed up startup if there are many sst files, + * especially when using non-default Env with expensive GetFileSize(). + * We'll still check that all required sst files exist. + * If {@code paranoid_checks} is false, this option is ignored, and sst files are + * not checked at all. + * + * Default: false + * + * @param skipCheckingSstFileSizesOnDbOpen if true, then SST file sizes will not be checked + * when calling {@link RocksDB#open(String)}. + * @return the reference to the current options. + */ + T setSkipCheckingSstFileSizesOnDbOpen(final boolean skipCheckingSstFileSizesOnDbOpen); + + /** + * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files. + * This may significantly speed up startup if there are many sst files, + * especially when using non-default Env with expensive GetFileSize(). + * We'll still check that all required sst files exist. + * If {@code paranoid_checks} is false, this option is ignored, and sst files are + * not checked at all. + * + * Default: false + * + * @return true, if file sizes will not be checked when calling {@link RocksDB#open(String)}. 
+ */ + boolean skipCheckingSstFileSizesOnDbOpen(); + + /** * Recovery mode to control the consistency while replaying WAL * * Default: {@link WALRecoveryMode#PointInTimeRecovery} @@ -1561,4 +1621,199 @@ * @return true if atomic flush is enabled. */ boolean atomicFlush(); + + /** + * If true, working thread may avoid doing unnecessary and long-latency + * operation (such as deleting obsolete files directly or deleting memtable) + * and will instead schedule a background job to do it. + * Use it if you're latency-sensitive. + * If set to true, takes precedence over + * {@link ReadOptions#setBackgroundPurgeOnIteratorCleanup(boolean)}. + * + * @param avoidUnnecessaryBlockingIO If true, working thread may avoid doing unnecessary + * operation. + * @return the reference to the current options. + */ + T setAvoidUnnecessaryBlockingIO(final boolean avoidUnnecessaryBlockingIO); + + /** + * If true, working thread may avoid doing unnecessary and long-latency + * operation (such as deleting obsolete files directly or deleting memtable) + * and will instead schedule a background job to do it. + * Use it if you're latency-sensitive. + * If set to true, takes precedence over + * {@link ReadOptions#setBackgroundPurgeOnIteratorCleanup(boolean)}. + * + * @return true, if working thread may avoid doing unnecessary operation. + */ + boolean avoidUnnecessaryBlockingIO(); + + /** + * If true, automatically persist stats to a hidden column family (column + * family name: ___rocksdb_stats_history___) every + * stats_persist_period_sec seconds; otherwise, write to an in-memory + * struct. User can query through `GetStatsHistory` API. + * If user attempts to create a column family with the same name on a DB + * which have previously set persist_stats_to_disk to true, the column family + * creation will fail, but the hidden column family will survive, as well as + * the previously persisted statistics. + * When peristing stats to disk, the stat name will be limited at 100 bytes. 
+ * Default: false + * + * @param persistStatsToDisk true if stats should be persisted to hidden column family. + * @return the instance of the current object. + */ + T setPersistStatsToDisk(final boolean persistStatsToDisk); + + /** + * If true, automatically persist stats to a hidden column family (column + * family name: ___rocksdb_stats_history___) every + * stats_persist_period_sec seconds; otherwise, write to an in-memory + * struct. User can query through `GetStatsHistory` API. + * If user attempts to create a column family with the same name on a DB + * which have previously set persist_stats_to_disk to true, the column family + * creation will fail, but the hidden column family will survive, as well as + * the previously persisted statistics. + * When peristing stats to disk, the stat name will be limited at 100 bytes. + * Default: false + * + * @return true if stats should be persisted to hidden column family. + */ + boolean persistStatsToDisk(); + + /** + * Historically DB ID has always been stored in Identity File in DB folder. + * If this flag is true, the DB ID is written to Manifest file in addition + * to the Identity file. By doing this 2 problems are solved + * 1. We don't checksum the Identity file where as Manifest file is. + * 2. Since the source of truth for DB is Manifest file DB ID will sit with + * the source of truth. Previously the Identity file could be copied + * independent of Manifest and that can result in wrong DB ID. + * We recommend setting this flag to true. + * Default: false + * + * @param writeDbidToManifest if true, then DB ID will be written to Manifest file. + * @return the instance of the current object. + */ + T setWriteDbidToManifest(final boolean writeDbidToManifest); + + /** + * Historically DB ID has always been stored in Identity File in DB folder. + * If this flag is true, the DB ID is written to Manifest file in addition + * to the Identity file. By doing this 2 problems are solved + * 1. 
We don't checksum the Identity file where as Manifest file is. + * 2. Since the source of truth for DB is Manifest file DB ID will sit with + * the source of truth. Previously the Identity file could be copied + * independent of Manifest and that can result in wrong DB ID. + * We recommend setting this flag to true. + * Default: false + * + * @return true, if DB ID will be written to Manifest file. + */ + boolean writeDbidToManifest(); + + /** + * The number of bytes to prefetch when reading the log. This is mostly useful + * for reading a remotely located log, as it can save the number of + * round-trips. If 0, then the prefetching is disabled. + * + * Default: 0 + * + * @param logReadaheadSize the number of bytes to prefetch when reading the log. + * @return the instance of the current object. + */ + T setLogReadaheadSize(final long logReadaheadSize); + + /** + * The number of bytes to prefetch when reading the log. This is mostly useful + * for reading a remotely located log, as it can save the number of + * round-trips. If 0, then the prefetching is disabled. + * + * Default: 0 + * + * @return the number of bytes to prefetch when reading the log. + */ + long logReadaheadSize(); + + /** + * By default, RocksDB recovery fails if any table file referenced in + * MANIFEST are missing after scanning the MANIFEST. + * Best-efforts recovery is another recovery mode that + * tries to restore the database to the most recent point in time without + * missing file. + * Currently not compatible with atomic flush. Furthermore, WAL files will + * not be used for recovery if best_efforts_recovery is true. + * Default: false + * + * @param bestEffortsRecovery if true, RocksDB will use best-efforts mode when recovering. + * @return the instance of the current object. + */ + T setBestEffortsRecovery(final boolean bestEffortsRecovery); + + /** + * By default, RocksDB recovery fails if any table file referenced in + * MANIFEST are missing after scanning the MANIFEST. 
+ * Best-efforts recovery is another recovery mode that + * tries to restore the database to the most recent point in time without + * missing file. + * Currently not compatible with atomic flush. Furthermore, WAL files will + * not be used for recovery if best_efforts_recovery is true. + * Default: false + * + * @return true, if RocksDB uses best-efforts mode when recovering. + */ + boolean bestEffortsRecovery(); + + /** + * It defines how many times db resume is called by a separate thread when + * background retryable IO Error happens. When background retryable IO + * Error happens, SetBGError is called to deal with the error. If the error + * can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), + * then db resume is called in background to recover from the error. If this + * value is 0 or negative, db resume will not be called. + * + * Default: INT_MAX + * + * @param maxBgerrorResumeCount maximum number of times db resume should be called when IO Error + * happens. + * @return the instance of the current object. + */ + T setMaxBgErrorResumeCount(final int maxBgerrorResumeCount); + + /** + * It defines how many times db resume is called by a separate thread when + * background retryable IO Error happens. When background retryable IO + * Error happens, SetBGError is called to deal with the error. If the error + * can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), + * then db resume is called in background to recover from the error. If this + * value is 0 or negative, db resume will not be called. + * + * Default: INT_MAX + * + * @return maximum number of times db resume should be called when IO Error happens. + */ + int maxBgerrorResumeCount(); + + /** + * If max_bgerror_resume_count is ≥ 2, db resume is called multiple times. + * This option decides how long to wait to retry the next resume if the + * previous resume fails and satisfy redo resume conditions. + * + * Default: 1000000 (microseconds). 
+ * + * @param bgerrorResumeRetryInterval how many microseconds to wait between DB resume attempts. + * @return the instance of the current object. + */ + T setBgerrorResumeRetryInterval(final long bgerrorResumeRetryInterval); + + /** + * If max_bgerror_resume_count is ≥ 2, db resume is called multiple times. + * This option decides how long to wait to retry the next resume if the + * previous resume fails and satisfy redo resume conditions. + * + * Default: 1000000 (microseconds). + * + * @return the instance of the current object. + */ + long bgerrorResumeRetryInterval(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java 2025-05-19 16:14:27.000000000 +0000 @@ -110,6 +110,10 @@ this.internalBufferOffset += n; } + public void setLength(final int n) { + setLength0(getNativeHandle(), n); + } + @Override protected void disposeInternal() { final long nativeHandle = getNativeHandle(); @@ -127,6 +131,7 @@ private native void clear0(long handle, boolean internalBuffer, long internalBufferOffset); private native void removePrefix0(long handle, int length); + private native void setLength0(long handle, int length); private native void disposeInternalBuf(final long handle, long internalBufferOffset); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java 2025-05-19 16:14:27.000000000 +0000 @@ -43,8 
+43,8 @@ } /** - *

      Sets the number of background worker threads of the flush pool - * for this environment.

      + *

      Sets the number of background worker threads of the low priority + * pool for this environment.

      *

      Default number: 1

      * * @param number the number of threads diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,335 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.List; + +/** + * EventListener class contains a set of callback functions that will + * be called when specific RocksDB event happens such as flush. It can + * be used as a building block for developing custom features such as + * stats-collector or external compaction algorithm. + * + * Note that callback functions should not run for an extended period of + * time before the function returns, otherwise RocksDB may be blocked. + * For example, it is not suggested to do + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} (as it may run for a long while) or issue many of + * {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} + * (as Put may be blocked in certain cases) in the same thread in the + * EventListener callback. + * + * However, doing + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} and {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} in + * another thread is considered safe. 
+ * + * [Threading] All EventListener callback will be called using the + * actual thread that involves in that specific event. For example, it + * is the RocksDB background flush thread that does the actual flush to + * call {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * [Locking] All EventListener callbacks are designed to be called without + * the current thread holding any DB mutex. This is to prevent potential + * deadlock and performance issue when using EventListener callback + * in a complex way. + */ +public interface EventListener { + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to flush memtables. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * callback function to RocksDB which will be called whenever a + * registered RocksDB flushes a file. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is deleted. Different from + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)} and + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}, + * this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. 
Applications that build logic basic based + * on file creations and deletions is suggested to implement + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)} and + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from the + * returned value. + * + * @param tableFileDeletionInfo the table file deletion info, + * contains data copied from respective native structure. + */ + void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo); + + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to compact. The default implementation is + * no-op. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. + */ + void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a registered RocksDB compacts a file. The default implementation + * is a no-op. + * + * Note that this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. 
+ */ + void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is created. Different from OnCompactionCompleted and + * OnFlushCompleted, this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. Applications that build logic basic based + * on file creations and deletions is suggested to implement + * OnFlushCompleted and OnCompactionCompleted. + * + * Historically it will only be called if the file is successfully created. + * Now it will also be called on failure case. User can check info.status + * to see if it succeeded or not. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationInfo the table file creation info, + * contains data copied from respective native structure. + */ + void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo); + + /** + * A callback function for RocksDB which will be called before + * a SST file is being created. It will follow by OnTableFileCreated after + * the creation finishes. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationBriefInfo the table file creation brief info, + * contains data copied from respective native structure. + */ + void onTableFileCreationStarted(final TableFileCreationBriefInfo tableFileCreationBriefInfo); + + /** + * A callback function for RocksDB which will be called before + * a memtable is made immutable. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. 
+ * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param memTableInfo the mem table info, contains data + * copied from respective native structure. + */ + void onMemTableSealed(final MemTableInfo memTableInfo); + + /** + * A callback function for RocksDB which will be called before + * a column family handle is deleted. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param columnFamilyHandle is a pointer to the column family handle to be + * deleted which will become a dangling pointer after the deletion. + */ + void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle); + + /** + * A callback function for RocksDB which will be called after an external + * file is ingested using IngestExternalFile. + * + * Note that the this function will run on the same thread as + * IngestExternalFile(), if this function is blocked, IngestExternalFile() + * will be blocked from finishing. + * + * @param db the database + * @param externalFileIngestionInfo the external file ingestion info, + * contains data copied from respective native structure. + */ + void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo); + + /** + * A callback function for RocksDB which will be called before setting the + * background error status to a non-OK value. The new background error status + * is provided in `bg_error` and can be modified by the callback. E.g., a + * callback can suppress errors by resetting it to Status::OK(), thus + * preventing the database from entering read-only mode. We do not provide any + * guarantee when failed flushes/compactions will be rescheduled if the user + * suppresses an error. 
+ * + * Note that this function can run on the same threads as flush, compaction, + * and user writes. So, it is extremely important not to perform heavy + * computations or blocking calls in this function. + * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + */ + void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called whenever a change + * of superversion triggers a change of the stall conditions. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param writeStallInfo write stall info, + * contains data copied from respective native structure. + */ + void onStallConditionsChanged(final WriteStallInfo writeStallInfo); + + /** + * A callback function for RocksDB which will be called whenever a file read + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileReadFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file write + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileWriteFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file flush + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileFlushFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file sync + * operation finishes. 
+ * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * rangeSync operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * truncate operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileTruncateFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file close + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileCloseFinish(final FileOperationInfo fileOperationInfo); + + /** + * If true, the {@link #onFileReadFinish(FileOperationInfo)} + * and {@link #onFileWriteFinish(FileOperationInfo)} will be called. If + * false, then they won't be called. + * + * Default: false + * + * @return whether to callback when file read/write is finished + */ + boolean shouldBeNotifiedOnFileIO(); + + /** + * A callback function for RocksDB which will be called just before + * starting the automatic recovery process for recoverable background + * errors, such as NoSpace(). The callback can suppress the automatic + * recovery by setting returning false. The database will then + * have to be transitioned out of read-only mode by calling + * RocksDB#resume(). 
+ * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + * @return return {@code false} if the automatic recovery should be suppressed + */ + boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called once the database + * is recovered from read-only mode after an error. When this is called, it + * means normal writes to the database can be issued and the user can + * initiate any further recovery actions needed + * + * @param oldBackgroundError old background error codes + */ + void onErrorRecoveryCompleted(final Status oldBackgroundError); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class ExternalFileIngestionInfo { + private final String columnFamilyName; + private final String externalFilePath; + private final String internalFilePath; + private final long globalSeqno; + private final TableProperties tableProperties; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. 
+ */ + ExternalFileIngestionInfo(final String columnFamilyName, final String externalFilePath, + final String internalFilePath, final long globalSeqno, + final TableProperties tableProperties) { + this.columnFamilyName = columnFamilyName; + this.externalFilePath = externalFilePath; + this.internalFilePath = internalFilePath; + this.globalSeqno = globalSeqno; + this.tableProperties = tableProperties; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path of the file outside the DB. + * + * @return the path of the file outside the DB. + */ + public String getExternalFilePath() { + return externalFilePath; + } + + /** + * Get the path of the file inside the DB. + * + * @return the path of the file inside the DB. + */ + public String getInternalFilePath() { + return internalFilePath; + } + + /** + * Get the global sequence number assigned to keys in this file. + * + * @return the global sequence number. + */ + public long getGlobalSeqno() { + return globalSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the table properties. 
+ */ + public TableProperties getTableProperties() { + return tableProperties; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExternalFileIngestionInfo that = (ExternalFileIngestionInfo) o; + return globalSeqno == that.globalSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(externalFilePath, that.externalFilePath) + && Objects.equals(internalFilePath, that.internalFilePath) + && Objects.equals(tableProperties, that.tableProperties); + } + + @Override + public int hashCode() { + return Objects.hash( + columnFamilyName, externalFilePath, internalFilePath, globalSeqno, tableProperties); + } + + @Override + public String toString() { + return "ExternalFileIngestionInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", externalFilePath='" + externalFilePath + + '\'' + ", internalFilePath='" + internalFilePath + '\'' + ", globalSeqno=" + globalSeqno + + ", tableProperties=" + tableProperties + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.util.Objects; + +/** + * Java representation of FileOperationInfo struct from include/rocksdb/listener.h + */ +public class FileOperationInfo { + private final String path; + private final long offset; + private final long length; + private final long startTimestamp; + private final long duration; + private final Status status; + + /** + * Access is private as this will only be constructed from + * C++ via JNI. + */ + FileOperationInfo(final String path, final long offset, final long length, + final long startTimestamp, final long duration, final Status status) { + this.path = path; + this.offset = offset; + this.length = length; + this.startTimestamp = startTimestamp; + this.duration = duration; + this.status = status; + } + + /** + * Get the file path. + * + * @return the file path. + */ + public String getPath() { + return path; + } + + /** + * Get the offset. + * + * @return the offset. + */ + public long getOffset() { + return offset; + } + + /** + * Get the length. + * + * @return the length. + */ + public long getLength() { + return length; + } + + /** + * Get the start timestamp (in nanoseconds). + * + * @return the start timestamp. + */ + public long getStartTimestamp() { + return startTimestamp; + } + + /** + * Get the operation duration (in nanoseconds). + * + * @return the operation duration. + */ + public long getDuration() { + return duration; + } + + /** + * Get the status. + * + * @return the status. 
+ */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FileOperationInfo that = (FileOperationInfo) o; + return offset == that.offset && length == that.length && startTimestamp == that.startTimestamp + && duration == that.duration && Objects.equals(path, that.path) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(path, offset, length, startTimestamp, duration, status); + } + + @Override + public String toString() { + return "FileOperationInfo{" + + "path='" + path + '\'' + ", offset=" + offset + ", length=" + length + ", startTimestamp=" + + startTimestamp + ", duration=" + duration + ", status=" + status + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,186 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.util.Objects; + +public class FlushJobInfo { + private final long columnFamilyId; + private final String columnFamilyName; + private final String filePath; + private final long threadId; + private final int jobId; + private final boolean triggeredWritesSlowdown; + private final boolean triggeredWritesStop; + private final long smallestSeqno; + private final long largestSeqno; + private final TableProperties tableProperties; + private final FlushReason flushReason; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + FlushJobInfo(final long columnFamilyId, final String columnFamilyName, final String filePath, + final long threadId, final int jobId, final boolean triggeredWritesSlowdown, + final boolean triggeredWritesStop, final long smallestSeqno, final long largestSeqno, + final TableProperties tableProperties, final byte flushReasonValue) { + this.columnFamilyId = columnFamilyId; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.threadId = threadId; + this.jobId = jobId; + this.triggeredWritesSlowdown = triggeredWritesSlowdown; + this.triggeredWritesStop = triggeredWritesStop; + this.smallestSeqno = smallestSeqno; + this.largestSeqno = largestSeqno; + this.tableProperties = tableProperties; + this.flushReason = FlushReason.fromValue(flushReasonValue); + } + + /** + * Get the id of the column family. + * + * @return the id of the column family + */ + public long getColumnFamilyId() { + return columnFamilyId; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the newly created file. + * + * @return the path to the newly created file + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the thread that completed this flush job. 
+ * + * @return the id of the thread that completed this flush job + */ + public long getThreadId() { + return threadId; + } + + /** + * Get the job id, which is unique in the same thread. + * + * @return the job id + */ + public int getJobId() { + return jobId; + } + + /** + * Determine if rocksdb is currently slowing-down all writes to prevent + * creating too many Level 0 files as compaction seems not able to + * catch up the write request speed. + * + * This indicates that there are too many files in Level 0. + * + * @return true if rocksdb is currently slowing-down all writes, + * false otherwise + */ + public boolean isTriggeredWritesSlowdown() { + return triggeredWritesSlowdown; + } + + /** + * Determine if rocksdb is currently blocking any writes to prevent + * creating more L0 files. + * + * This indicates that there are too many files in level 0. + * Compactions should try to compact L0 files down to lower levels as soon + * as possible. + * + * @return true if rocksdb is currently blocking any writes, false otherwise + */ + public boolean isTriggeredWritesStop() { + return triggeredWritesStop; + } + + /** + * Get the smallest sequence number in the newly created file. + * + * @return the smallest sequence number + */ + public long getSmallestSeqno() { + return smallestSeqno; + } + + /** + * Get the largest sequence number in the newly created file. + * + * @return the largest sequence number + */ + public long getLargestSeqno() { + return largestSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the Table properties of the table being flushed + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the reason for initiating the flush. + * + * @return the reason for initiating the flush. 
+ */ + public FlushReason getFlushReason() { + return flushReason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FlushJobInfo that = (FlushJobInfo) o; + return columnFamilyId == that.columnFamilyId && threadId == that.threadId && jobId == that.jobId + && triggeredWritesSlowdown == that.triggeredWritesSlowdown + && triggeredWritesStop == that.triggeredWritesStop && smallestSeqno == that.smallestSeqno + && largestSeqno == that.largestSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) + && Objects.equals(tableProperties, that.tableProperties) && flushReason == that.flushReason; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyId, columnFamilyName, filePath, threadId, jobId, + triggeredWritesSlowdown, triggeredWritesStop, smallestSeqno, largestSeqno, tableProperties, + flushReason); + } + + @Override + public String toString() { + return "FlushJobInfo{" + + "columnFamilyId=" + columnFamilyId + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", threadId=" + threadId + ", jobId=" + jobId + + ", triggeredWritesSlowdown=" + triggeredWritesSlowdown + + ", triggeredWritesStop=" + triggeredWritesStop + ", smallestSeqno=" + smallestSeqno + + ", largestSeqno=" + largestSeqno + ", tableProperties=" + tableProperties + + ", flushReason=" + flushReason + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,53 @@ +// Copyright (c) 
2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum FlushReason { + OTHERS((byte) 0x00), + GET_LIVE_FILES((byte) 0x01), + SHUTDOWN((byte) 0x02), + EXTERNAL_FILE_INGESTION((byte) 0x03), + MANUAL_COMPACTION((byte) 0x04), + WRITE_BUFFER_MANAGER((byte) 0x05), + WRITE_BUFFER_FULL((byte) 0x06), + TEST((byte) 0x07), + DELETE_FILES((byte) 0x08), + AUTO_COMPACTION((byte) 0x09), + MANUAL_FLUSH((byte) 0x0a), + ERROR_RECOVERY((byte) 0xb); + + private final byte value; + + FlushReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the FlushReason from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static FlushReason fromValue(final byte value) { + for (final FlushReason flushReason : FlushReason.values()) { + if (flushReason.value == value) { + return flushReason; + } + } + + throw new IllegalArgumentException("Illegal value provided for FlushReason: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java 2025-05-19 16:14:27.000000000 +0000 @@ -159,6 +159,27 @@ */ BLOB_DB_DECOMPRESSION_MICROS((byte) 0x2E), + /** + * Num of Index and Filter blocks read from file system per level in MultiGet + * request + */ + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL((byte) 0x2F), + + /** + * Num of Data blocks read from file system per level in MultiGet request. + */ + NUM_DATA_BLOCKS_READ_PER_LEVEL((byte) 0x30), + + /** + * Num of SST files read from file system per level in MultiGet request. + */ + NUM_SST_READ_PER_LEVEL((byte) 0x31), + + /** + * The number of retry in auto resume + */ + ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,60 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +/** + * This enum allows trading off increased index size for improved iterator + * seek performance in some situations, particularly when block cache is + * disabled ({@link ReadOptions#fillCache()} == false and direct IO is + * enabled ({@link DBOptions#useDirectReads()} == true). + * The default mode is the best tradeoff for most use cases. + * This option only affects newly written tables. + * + * The index contains a key separating each pair of consecutive blocks. + * Let A be the highest key in one block, B the lowest key in the next block, + * and I the index entry separating these two blocks: + * [ ... A] I [B ...] + * I is allowed to be anywhere in [A, B). + * If an iterator is seeked to a key in (A, I], we'll unnecessarily read the + * first block, then immediately fall through to the second block. + * However, if I=A, this can't happen, and we'll read only the second block. + * In kNoShortening mode, we use I=A. In other modes, we use the shortest + * key in [A, B), which usually significantly reduces index size. + * + * There's a similar story for the last index entry, which is an upper bound + * of the highest key in the file. If it's shortened and therefore + * overestimated, iterator is likely to unnecessarily read the last data block + * from each file on each seek. + */ +public enum IndexShorteningMode { + /** + * Use full keys. + */ + kNoShortening((byte) 0), + /** + * Shorten index keys between blocks, but use full key for the last index + * key, which is the upper bound of the whole file. + */ + kShortenSeparators((byte) 1), + /** + * Shorten both keys between blocks and key after last block. 
+ */ + kShortenSeparatorsAndSuccessor((byte) 2); + + private final byte value; + + IndexShorteningMode(final byte value) { + this.value = value; + } + + /** + * Returns the byte value of the enumerations value. + * + * @return byte representation + */ + byte getValue() { + return value; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java 2025-05-19 16:14:27.000000000 +0000 @@ -22,7 +22,21 @@ /** * A two-level index implementation. Both levels are binary search indexes. */ - kTwoLevelIndexSearch((byte) 2); + kTwoLevelIndexSearch((byte) 2), + /** + * Like {@link #kBinarySearch}, but index also contains first key of each block. + * This allows iterators to defer reading the block until it's actually + * needed. May significantly reduce read amplification of short range scans. + * Without it, iterator seek usually reads one block from each level-0 file + * and from each level, which may be expensive. + * Works best in combination with: + * - IndexShorteningMode::kNoShortening, + * - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + * e.g. when prefix changes. + * Makes the index significantly bigger (2x or more), especially when keys + * are long. 
+ */ + kBinarySearchWithFirstKey((byte) 3); /** * Returns the byte value of the enumerations value diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,36 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class KeyMayExist { + @Override + public boolean equals(final Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + final KeyMayExist that = (KeyMayExist) o; + return (valueLength == that.valueLength && exists == that.exists); + } + + @Override + public int hashCode() { + return Objects.hash(exists, valueLength); + } + + public enum KeyMayExistEnum { kNotExist, kExistsWithoutValue, kExistsWithValue } + ; + + public KeyMayExist(final KeyMayExistEnum exists, final int valueLength) { + this.exists = exists; + this.valueLength = valueLength; + } + + public final KeyMayExistEnum exists; + public final int valueLength; +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java 2025-05-19 
16:14:27.000000000 +0000 @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class MemTableInfo { + private final String columnFamilyName; + private final long firstSeqno; + private final long earliestSeqno; + private final long numEntries; + private final long numDeletes; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + MemTableInfo(final String columnFamilyName, final long firstSeqno, final long earliestSeqno, + final long numEntries, final long numDeletes) { + this.columnFamilyName = columnFamilyName; + this.firstSeqno = firstSeqno; + this.earliestSeqno = earliestSeqno; + this.numEntries = numEntries; + this.numDeletes = numDeletes; + } + + /** + * Get the name of the column family to which memtable belongs. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the Sequence number of the first element that was inserted into the + * memtable. + * + * @return the sequence number of the first inserted element. + */ + public long getFirstSeqno() { + return firstSeqno; + } + + /** + * Get the Sequence number that is guaranteed to be smaller than or equal + * to the sequence number of any key that could be inserted into this + * memtable. It can then be assumed that any write with a larger(or equal) + * sequence number will be present in this memtable or a later memtable. + * + * @return the earliest sequence number. + */ + public long getEarliestSeqno() { + return earliestSeqno; + } + + /** + * Get the total number of entries in memtable. + * + * @return the total number of entries. 
+ */ + public long getNumEntries() { + return numEntries; + } + + /** + * Get the total number of deletes in memtable. + * + * @return the total number of deletes. + */ + public long getNumDeletes() { + return numDeletes; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + MemTableInfo that = (MemTableInfo) o; + return firstSeqno == that.firstSeqno && earliestSeqno == that.earliestSeqno + && numEntries == that.numEntries && numDeletes == that.numDeletes + && Objects.equals(columnFamilyName, that.columnFamilyName); + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, firstSeqno, earliestSeqno, numEntries, numDeletes); + } + + @Override + public String toString() { + return "MemTableInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", firstSeqno=" + firstSeqno + + ", earliestSeqno=" + earliestSeqno + ", numEntries=" + numEntries + + ", numDeletes=" + numDeletes + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,42 +39,25 @@ * * The format is: key1=value1;key2=value2;key3=value3 etc * - * For int[] values, each int should be separated by a comma, e.g. + * For int[] values, each int should be separated by a colon, e.g. 
* - * key1=value1;intArrayKey1=1,2,3 + * key1=value1;intArrayKey1=1:2:3 * * @param str The string representation of the mutable column family options + * @param ignoreUnknown what to do if the key is not one of the keys we expect * * @return A builder for the mutable column family options */ - public static MutableColumnFamilyOptionsBuilder parse(final String str) { + public static MutableColumnFamilyOptionsBuilder parse( + final String str, final boolean ignoreUnknown) { Objects.requireNonNull(str); - final MutableColumnFamilyOptionsBuilder builder = - new MutableColumnFamilyOptionsBuilder(); - - final String[] options = str.trim().split(KEY_VALUE_PAIR_SEPARATOR); - for(final String option : options) { - final int equalsOffset = option.indexOf(KEY_VALUE_SEPARATOR); - if(equalsOffset <= 0) { - throw new IllegalArgumentException( - "options string has an invalid key=value pair"); - } - - final String key = option.substring(0, equalsOffset); - if(key.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - final String value = option.substring(equalsOffset + 1); - if(value.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - builder.fromString(key, value); - } + final List parsedOptions = OptionString.Parser.parse(str); + return new MutableColumnFamilyOptionsBuilder().fromParsed(parsedOptions, ignoreUnknown); + } - return builder; + public static MutableColumnFamilyOptionsBuilder parse(final String str) { + return parse(str, false); } private interface MutableColumnFamilyOptionKey extends MutableOptionKey {} @@ -117,7 +100,8 @@ max_bytes_for_level_base(ValueType.LONG), max_bytes_for_level_multiplier(ValueType.INT), max_bytes_for_level_multiplier_additional(ValueType.INT_ARRAY), - ttl(ValueType.LONG); + ttl(ValueType.LONG), + periodic_compaction_seconds(ValueType.LONG); private final ValueType valueType; CompactionOption(final ValueType valueType) { @@ -130,11 +114,31 @@ } } + public enum BlobOption 
implements MutableColumnFamilyOptionKey { + enable_blob_files(ValueType.BOOLEAN), + min_blob_size(ValueType.LONG), + blob_file_size(ValueType.LONG), + blob_compression_type(ValueType.ENUM), + enable_blob_garbage_collection(ValueType.BOOLEAN), + blob_garbage_collection_age_cutoff(ValueType.DOUBLE), + blob_garbage_collection_force_threshold(ValueType.DOUBLE); + + private final ValueType valueType; + BlobOption(final ValueType valueType) { + this.valueType = valueType; + } + + @Override + public ValueType getValueType() { + return valueType; + } + } + public enum MiscOption implements MutableColumnFamilyOptionKey { max_sequential_skip_in_iterations(ValueType.LONG), paranoid_file_checks(ValueType.BOOLEAN), report_bg_io_stats(ValueType.BOOLEAN), - compression_type(ValueType.ENUM); + compression(ValueType.ENUM); private final ValueType valueType; MiscOption(final ValueType valueType) { @@ -164,6 +168,10 @@ for(final MutableColumnFamilyOptionKey key : MiscOption.values()) { ALL_KEYS_LOOKUP.put(key.name(), key); } + + for (final MutableColumnFamilyOptionKey key : BlobOption.values()) { + ALL_KEYS_LOOKUP.put(key.name(), key); + } } private MutableColumnFamilyOptionsBuilder() { @@ -437,12 +445,12 @@ @Override public MutableColumnFamilyOptionsBuilder setCompressionType( final CompressionType compressionType) { - return setEnum(MiscOption.compression_type, compressionType); + return setEnum(MiscOption.compression, compressionType); } @Override public CompressionType compressionType() { - return (CompressionType)getEnum(MiscOption.compression_type); + return (CompressionType) getEnum(MiscOption.compression); } @Override @@ -465,5 +473,92 @@ public long ttl() { return getLong(CompactionOption.ttl); } + + @Override + public MutableColumnFamilyOptionsBuilder setPeriodicCompactionSeconds( + final long periodicCompactionSeconds) { + return setLong(CompactionOption.periodic_compaction_seconds, periodicCompactionSeconds); + } + + @Override + public long periodicCompactionSeconds() { + 
return getLong(CompactionOption.periodic_compaction_seconds); + } + + @Override + public MutableColumnFamilyOptionsBuilder setEnableBlobFiles(final boolean enableBlobFiles) { + return setBoolean(BlobOption.enable_blob_files, enableBlobFiles); + } + + @Override + public boolean enableBlobFiles() { + return getBoolean(BlobOption.enable_blob_files); + } + + @Override + public MutableColumnFamilyOptionsBuilder setMinBlobSize(final long minBlobSize) { + return setLong(BlobOption.min_blob_size, minBlobSize); + } + + @Override + public long minBlobSize() { + return getLong(BlobOption.min_blob_size); + } + + @Override + public MutableColumnFamilyOptionsBuilder setBlobFileSize(final long blobFileSize) { + return setLong(BlobOption.blob_file_size, blobFileSize); + } + + @Override + public long blobFileSize() { + return getLong(BlobOption.blob_file_size); + } + + @Override + public MutableColumnFamilyOptionsBuilder setBlobCompressionType( + final CompressionType compressionType) { + return setEnum(BlobOption.blob_compression_type, compressionType); + } + + @Override + public CompressionType blobCompressionType() { + return (CompressionType) getEnum(BlobOption.blob_compression_type); + } + + @Override + public MutableColumnFamilyOptionsBuilder setEnableBlobGarbageCollection( + final boolean enableBlobGarbageCollection) { + return setBoolean(BlobOption.enable_blob_garbage_collection, enableBlobGarbageCollection); + } + + @Override + public boolean enableBlobGarbageCollection() { + return getBoolean(BlobOption.enable_blob_garbage_collection); + } + + @Override + public MutableColumnFamilyOptionsBuilder setBlobGarbageCollectionAgeCutoff( + final double blobGarbageCollectionAgeCutoff) { + return setDouble( + BlobOption.blob_garbage_collection_age_cutoff, blobGarbageCollectionAgeCutoff); + } + + @Override + public double blobGarbageCollectionAgeCutoff() { + return getDouble(BlobOption.blob_garbage_collection_age_cutoff); + } + + @Override + public MutableColumnFamilyOptionsBuilder 
setBlobGarbageCollectionForceThreshold( + final double blobGarbageCollectionForceThreshold) { + return setDouble( + BlobOption.blob_garbage_collection_force_threshold, blobGarbageCollectionForceThreshold); + } + + @Override + public double blobGarbageCollectionForceThreshold() { + return getDouble(BlobOption.blob_garbage_collection_force_threshold); + } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -26,7 +26,7 @@ * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms * while overflowing the underlying platform specific value. */ - MutableColumnFamilyOptionsInterface setWriteBufferSize(long writeBufferSize); + T setWriteBufferSize(long writeBufferSize); /** * Return size of write buffer size. @@ -43,8 +43,7 @@ * @param disableAutoCompactions true if auto-compactions are disabled. * @return the reference to the current option. */ - MutableColumnFamilyOptionsInterface setDisableAutoCompactions( - boolean disableAutoCompactions); + T setDisableAutoCompactions(boolean disableAutoCompactions); /** * Disable automatic compactions. Manual compactions can still @@ -64,8 +63,7 @@ * level-0 compaction * @return the reference to the current option. */ - MutableColumnFamilyOptionsInterface setLevel0FileNumCompactionTrigger( - int level0FileNumCompactionTrigger); + T setLevel0FileNumCompactionTrigger(int level0FileNumCompactionTrigger); /** * Number of files to trigger level-0 compaction. 
A value < 0 means that @@ -86,7 +84,7 @@ * @return the reference to the current option. * @see #maxCompactionBytes() */ - MutableColumnFamilyOptionsInterface setMaxCompactionBytes(final long maxCompactionBytes); + T setMaxCompactionBytes(final long maxCompactionBytes); /** * We try to limit number of bytes in one compaction to be lower than this diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ package org.rocksdb; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Objects; @@ -41,40 +42,22 @@ * * For int[] values, each int should be separated by a comma, e.g. 
* - * key1=value1;intArrayKey1=1,2,3 + * key1=value1;intArrayKey1=1:2:3 * * @param str The string representation of the mutable db options + * @param ignoreUnknown what to do if the key is not one of the keys we expect * * @return A builder for the mutable db options */ - public static MutableDBOptionsBuilder parse(final String str) { + public static MutableDBOptionsBuilder parse(final String str, boolean ignoreUnknown) { Objects.requireNonNull(str); - final MutableDBOptionsBuilder builder = - new MutableDBOptionsBuilder(); - - final String[] options = str.trim().split(KEY_VALUE_PAIR_SEPARATOR); - for(final String option : options) { - final int equalsOffset = option.indexOf(KEY_VALUE_SEPARATOR); - if(equalsOffset <= 0) { - throw new IllegalArgumentException( - "options string has an invalid key=value pair"); - } - - final String key = option.substring(0, equalsOffset); - if(key.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - final String value = option.substring(equalsOffset + 1); - if(value.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - builder.fromString(key, value); - } + final List parsedOptions = OptionString.Parser.parse(str); + return new MutableDBOptions.MutableDBOptionsBuilder().fromParsed(parsedOptions, ignoreUnknown); + } - return builder; + public static MutableDBOptionsBuilder parse(final String str) { + return parse(str, false); } private interface MutableDBOptionKey extends MutableOptionKey {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java 2025-05-19 
16:14:27.000000000 +0000 @@ -202,12 +202,24 @@ long delayedWriteRate(); /** - *

      Once write-ahead logs exceed this size, we will start forcing the - * flush of column families whose memtables are backed by the oldest live - * WAL file (i.e. the ones that are causing all the space amplification). + *

      Set the max total write-ahead log size. Once write-ahead logs exceed this size, we will + * start forcing the flush of column families whose memtables are backed by the oldest live WAL + * file *

      + *

      The oldest WAL files are the ones that are causing all the space amplification. + *

      + * For example, with 15 column families, each with + * write_buffer_size = 128 MB + * max_write_buffer_number = 6 + * max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = + * 45GB + *

      + * The RocksDB wiki has some discussion about how the WAL interacts + * with memtables and flushing of column families, at + * ... + *

      *

      If set to 0 (default), we will dynamically choose the WAL size limit to - * be [sum of all write_buffer_size * max_write_buffer_number] * 2

      + * be [sum of all write_buffer_size * max_write_buffer_number] * 4

      *

      This option takes effect only when there are more than one column family as * otherwise the wal size is dictated by the write_buffer_size.

      *

      Default: 0

      @@ -218,13 +230,30 @@ T setMaxTotalWalSize(long maxTotalWalSize); /** - *

      Returns the max total wal size. Once write-ahead logs exceed this size, + *

      Returns the max total write-ahead log size. Once write-ahead logs exceed this size, * we will start forcing the flush of column families whose memtables are - * backed by the oldest live WAL file (i.e. the ones that are causing all - * the space amplification).

      + * backed by the oldest live WAL file.

      + *

      The oldest WAL files are the ones that are causing all the space amplification. + *

      + * For example, with 15 column families, each with + * write_buffer_size = 128 MB + * max_write_buffer_number = 6 + * max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = + * 45GB + *

      + * The RocksDB wiki has some discussion about how the WAL interacts + * with memtables and flushing of column families, at + * ... + *

      + *

      If set to 0 (default), we will dynamically choose the WAL size limit to + * be [sum of all write_buffer_size * max_write_buffer_number] * 4

      + *

      This option takes effect only when there are more than one column family as + * otherwise the wal size is dictated by the write_buffer_size.

      + *

      Default: 0

      + * * *

      If set to 0 (default), we will dynamically choose the WAL size limit - * to be [sum of all write_buffer_size * max_write_buffer_number] * 2 + * to be [sum of all write_buffer_size * max_write_buffer_number] * 4 *

      * * @return max total wal size diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java 2025-05-19 16:14:27.000000000 +0000 @@ -326,7 +326,7 @@ String asString() { final StringBuilder builder = new StringBuilder(); for(int i = 0; i < value.length; i++) { - builder.append(i); + builder.append(value[i]); if(i + 1 < value.length) { builder.append(INT_ARRAY_INT_SEPARATOR); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java 2025-05-19 16:14:27.000000000 +0000 @@ -18,7 +18,11 @@ private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb"); private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final /* @Nullable */ String fallbackJniLibraryName = + Environment.getFallbackJniLibraryName("rocksdb"); private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); + private static final /* @Nullable */ String fallbackJniLibraryFileName = + Environment.getFallbackJniLibraryFileName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; private static final String tempFileSuffix = Environment.getJniLibraryExtension(); @@ -49,14 +53,33 @@ */ public synchronized void loadLibrary(final 
String tmpDir) throws IOException { try { - System.loadLibrary(sharedLibraryName); - } catch(final UnsatisfiedLinkError ule1) { + // try dynamic library + System.loadLibrary(sharedLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - try from static library + } + + try { + // try static library + System.loadLibrary(jniLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - then try static library fallback or from jar + } + + if (fallbackJniLibraryName != null) { try { - System.loadLibrary(jniLibraryName); - } catch(final UnsatisfiedLinkError ule2) { - loadLibraryFromJar(tmpDir); + // try static library fallback + System.loadLibrary(fallbackJniLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - then try from jar } } + + // try jar + loadLibraryFromJar(tmpDir); } /** @@ -83,38 +106,62 @@ File loadLibraryFromJarToTemp(final String tmpDir) throws IOException { - final File temp; - if (tmpDir == null || tmpDir.isEmpty()) { - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - } else { - temp = new File(tmpDir, jniLibraryFileName); - if (temp.exists() && !temp.delete()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " already exists and cannot be removed."); + InputStream is = null; + try { + // attempt to look up the static library in the jar file + String libraryFileName = jniLibraryFileName; + is = getClass().getClassLoader().getResourceAsStream(libraryFileName); + + if (is == null) { + // is there a fallback we can try + if (fallbackJniLibraryFileName == null) { + throw new RuntimeException(libraryFileName + " was not found inside JAR."); + } + + // attempt to look up the fallback static library in the jar file + libraryFileName = fallbackJniLibraryFileName; + is = getClass().getClassLoader().getResourceAsStream(libraryFileName); + if (is == null) { + throw new RuntimeException(libraryFileName + " was not found inside JAR."); + } } - if 
(!temp.createNewFile()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " could not be created."); + + // create a temporary file to copy the library to + final File temp; + if (tmpDir == null || tmpDir.isEmpty()) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + final File parentDir = new File(tmpDir); + if (!parentDir.exists()) { + throw new RuntimeException( + "Directory: " + parentDir.getAbsolutePath() + " does not exist!"); + } + temp = new File(parentDir, libraryFileName); + if (temp.exists() && !temp.delete()) { + throw new RuntimeException( + "File: " + temp.getAbsolutePath() + " already exists and cannot be removed."); + } + if (!temp.createNewFile()) { + throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created."); + } + } + if (!temp.exists()) { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } else { + temp.deleteOnExit(); } - } - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } else { - temp.deleteOnExit(); - } + // copy the library from the Jar file to the temp destination + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); - // attempt to copy the library from the Jar file to the temp destination - try (final InputStream is = getClass().getClassLoader(). 
- getResourceAsStream(jniLibraryFileName)) { - if (is == null) { - throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); - } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + // return the temporary library file + return temp; + + } finally { + if (is != null) { + is.close(); } } - - return temp; } /** diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,256 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +public class OptionString { + private final static char kvPairSeparator = ';'; + private final static char kvSeparator = '='; + private final static char complexValueBegin = '{'; + private final static char complexValueEnd = '}'; + private final static char wrappedValueBegin = '{'; + private final static char wrappedValueEnd = '}'; + private final static char arrayValueSeparator = ':'; + + static class Value { + final List list; + final List complex; + + public Value(final List list, final List complex) { + this.list = list; + this.complex = complex; + } + + public boolean isList() { + return (this.list != null && this.complex == null); + } + + public static Value fromList(final List list) { + return new Value(list, null); + } + + public static Value fromComplex(final List complex) { + return new Value(null, complex); + } + + public String toString() { + final StringBuilder sb = new StringBuilder(); + if (isList()) { + for (final String item : list) { + sb.append(item).append(arrayValueSeparator); + } + // remove the final separator + if (sb.length() > 0) + sb.delete(sb.length() - 1, sb.length()); + } else { + sb.append('['); + for (final Entry entry : complex) { + sb.append(entry.toString()).append(';'); + } + sb.append(']'); + } + return sb.toString(); + } + } + + static class Entry { + public final String key; + public final Value value; + + private Entry(final String key, final Value value) { + this.key = key; + this.value = value; + } + + public String toString() { + return "" + key + "=" + value; + } + } + + static class Parser { + static class Exception extends RuntimeException { + public Exception(final String s) { + super(s); + } + } + + final String str; + final StringBuilder sb; + + private Parser(final String str) { + this.str = str; + this.sb = new StringBuilder(str); + } + + private void exception(final String message) { + final int pos = 
str.length() - sb.length(); + final int before = Math.min(pos, 64); + final int after = Math.min(64, str.length() - pos); + final String here = + str.substring(pos - before, pos) + "__*HERE*__" + str.substring(pos, pos + after); + + throw new Parser.Exception(message + " at [" + here + "]"); + } + + private void skipWhite() { + while (sb.length() > 0 && Character.isWhitespace(sb.charAt(0))) { + sb.delete(0, 1); + } + } + + private char first() { + if (sb.length() == 0) + exception("Unexpected end of input"); + return sb.charAt(0); + } + + private char next() { + if (sb.length() == 0) + exception("Unexpected end of input"); + final char c = sb.charAt(0); + sb.delete(0, 1); + return c; + } + + private boolean hasNext() { + return (sb.length() > 0); + } + + private boolean is(final char c) { + return (sb.length() > 0 && sb.charAt(0) == c); + } + + private boolean isKeyChar() { + if (!hasNext()) + return false; + final char c = first(); + return (Character.isAlphabetic(c) || Character.isDigit(c) || "_".indexOf(c) != -1); + } + + private boolean isValueChar() { + if (!hasNext()) + return false; + final char c = first(); + return (Character.isAlphabetic(c) || Character.isDigit(c) || "_-+.[]".indexOf(c) != -1); + } + + private String parseKey() { + final StringBuilder sbKey = new StringBuilder(); + sbKey.append(next()); + while (isKeyChar()) { + sbKey.append(next()); + } + + return sbKey.toString(); + } + + private String parseSimpleValue() { + if (is(wrappedValueBegin)) { + next(); + final String result = parseSimpleValue(); + if (!is(wrappedValueEnd)) { + exception("Expected to end a wrapped value with " + wrappedValueEnd); + } + next(); + + return result; + } else { + final StringBuilder sbValue = new StringBuilder(); + while (isValueChar()) sbValue.append(next()); + + return sbValue.toString(); + } + } + + private List parseList() { + final List list = new ArrayList<>(1); + while (true) { + list.add(parseSimpleValue()); + if (!is(arrayValueSeparator)) + break; + + 
next(); + } + + return list; + } + + private Entry parseOption() { + skipWhite(); + if (!isKeyChar()) { + exception("No valid key character(s) for key in key=value "); + } + final String key = parseKey(); + skipWhite(); + if (is(kvSeparator)) { + next(); + } else { + exception("Expected = separating key and value"); + } + skipWhite(); + final Value value = parseValue(); + return new Entry(key, value); + } + + private Value parseValue() { + skipWhite(); + if (is(complexValueBegin)) { + next(); + skipWhite(); + final Value value = Value.fromComplex(parseComplex()); + skipWhite(); + if (is(complexValueEnd)) { + next(); + skipWhite(); + } else { + exception("Expected } ending complex value"); + } + return value; + } else if (isValueChar()) { + return Value.fromList(parseList()); + } + + exception("No valid value character(s) for value in key=value"); + return null; + } + + private List parseComplex() { + final List entries = new ArrayList<>(); + + skipWhite(); + if (hasNext()) { + entries.add(parseOption()); + skipWhite(); + while (is(kvPairSeparator)) { + next(); + skipWhite(); + if (!isKeyChar()) { + // the separator was a terminator + break; + } + entries.add(parseOption()); + skipWhite(); + } + } + return entries; + } + + public static List parse(final String str) { + Objects.requireNonNull(str); + + final Parser parser = new Parser(str); + final List result = parser.parseComplex(); + if (parser.hasNext()) { + parser.exception("Unexpected end of parsing "); + } + + return result; + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,10 +6,7 @@ package org.rocksdb; import 
java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Options to control the behavior of a database. It will be used @@ -28,6 +25,25 @@ } /** + * Converts the input properties into a Options-style formatted string + * @param properties The set of properties to convert + * @return The Options-style representation of those properties. + */ + public static String getOptionStringFromProps(final Properties properties) { + if (properties == null || properties.size() == 0) { + throw new IllegalArgumentException("Properties value must contain at least one value."); + } + StringBuilder stringBuilder = new StringBuilder(); + for (final String name : properties.stringPropertyNames()) { + stringBuilder.append(name); + stringBuilder.append("="); + stringBuilder.append(properties.getProperty(name)); + stringBuilder.append(";"); + } + return stringBuilder.toString(); + } + + /** * Construct options for opening a RocksDB. 
* * This constructor will create (by allocating a block of memory) @@ -75,6 +91,10 @@ this.compressionOptions_ = other.compressionOptions_; this.rowCache_ = other.rowCache_; this.writeBufferManager_ = other.writeBufferManager_; + this.compactionThreadLimiter_ = other.compactionThreadLimiter_; + this.bottommostCompressionOptions_ = other.bottommostCompressionOptions_; + this.walFilter_ = other.walFilter_; + this.sstPartitionerFactory_ = other.sstPartitionerFactory_; } @Override @@ -141,12 +161,24 @@ } @Override + public Options oldDefaults(final int majorVersion, final int minorVersion) { + oldDefaults(nativeHandle_, majorVersion, minorVersion); + return this; + } + + @Override public Options optimizeForSmallDb() { optimizeForSmallDb(nativeHandle_); return this; } @Override + public Options optimizeForSmallDb(final Cache cache) { + optimizeForSmallDb(nativeHandle_, cache.getNativeHandle()); + return this; + } + + @Override public Options optimizeForPointLookup( long blockCacheSizeMb) { optimizeForPointLookup(nativeHandle_, @@ -633,6 +665,18 @@ } @Override + public Options setMaxWriteBatchGroupSizeBytes(long maxWriteBatchGroupSizeBytes) { + setMaxWriteBatchGroupSizeBytes(nativeHandle_, maxWriteBatchGroupSizeBytes); + return this; + } + + @Override + public long maxWriteBatchGroupSizeBytes() { + assert (isOwningHandle()); + return maxWriteBatchGroupSizeBytes(nativeHandle_); + } + + @Override public Options setWalSizeLimitMB(final long sizeLimitMB) { assert(isOwningHandle()); setWalSizeLimitMB(nativeHandle_, sizeLimitMB); @@ -930,6 +974,19 @@ } @Override + public Options setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } + + @Override public Options setEnableThreadTracking(final boolean enableThreadTracking) { 
assert(isOwningHandle()); setEnableThreadTracking(nativeHandle_, enableThreadTracking); @@ -1038,6 +1095,18 @@ } @Override + public Options setSkipCheckingSstFileSizesOnDbOpen(boolean skipCheckingSstFileSizesOnDbOpen) { + setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen); + return this; + } + + @Override + public boolean skipCheckingSstFileSizesOnDbOpen() { + assert (isOwningHandle()); + return skipCheckingSstFileSizesOnDbOpen(nativeHandle_); + } + + @Override public Options setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) { assert(isOwningHandle()); setWalRecoveryMode(nativeHandle_, walRecoveryMode.getValue()); @@ -1268,6 +1337,45 @@ } @Override + public Options setCfPaths(final Collection cfPaths) { + assert (isOwningHandle()); + + final int len = cfPaths.size(); + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + int i = 0; + for (final DbPath dbPath : cfPaths) { + paths[i] = dbPath.path.toString(); + targetSizes[i] = dbPath.targetSize; + i++; + } + setCfPaths(nativeHandle_, paths, targetSizes); + return this; + } + + @Override + public List cfPaths() { + final int len = (int) cfPathsLen(nativeHandle_); + + if (len == 0) { + return Collections.emptyList(); + } + + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + cfPaths(nativeHandle_, paths, targetSizes); + + final List cfPaths = new ArrayList<>(); + for (int i = 0; i < len; i++) { + cfPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i])); + } + + return cfPaths; + } + + @Override public Options useFixedLengthPrefixExtractor(final int n) { assert(isOwningHandle()); useFixedLengthPrefixExtractor(nativeHandle_, n); @@ -1303,7 +1411,7 @@ final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { 
compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } @@ -1744,6 +1852,17 @@ } @Override + public Options setPeriodicCompactionSeconds(final long periodicCompactionSeconds) { + setPeriodicCompactionSeconds(nativeHandle_, periodicCompactionSeconds); + return this; + } + + @Override + public long periodicCompactionSeconds() { + return periodicCompactionSeconds(nativeHandle_); + } + + @Override public Options setCompactionOptionsUniversal( final CompactionOptionsUniversal compactionOptionsUniversal) { setCompactionOptionsUniversal(nativeHandle_, @@ -1792,6 +1911,201 @@ return atomicFlush(nativeHandle_); } + @Override + public Options setAvoidUnnecessaryBlockingIO(boolean avoidUnnecessaryBlockingIO) { + setAvoidUnnecessaryBlockingIO(nativeHandle_, avoidUnnecessaryBlockingIO); + return this; + } + + @Override + public boolean avoidUnnecessaryBlockingIO() { + assert (isOwningHandle()); + return avoidUnnecessaryBlockingIO(nativeHandle_); + } + + @Override + public Options setPersistStatsToDisk(boolean persistStatsToDisk) { + setPersistStatsToDisk(nativeHandle_, persistStatsToDisk); + return this; + } + + @Override + public boolean persistStatsToDisk() { + assert (isOwningHandle()); + return persistStatsToDisk(nativeHandle_); + } + + @Override + public Options setWriteDbidToManifest(boolean writeDbidToManifest) { + setWriteDbidToManifest(nativeHandle_, writeDbidToManifest); + return this; + } + + @Override + public boolean writeDbidToManifest() { + assert (isOwningHandle()); + return writeDbidToManifest(nativeHandle_); + } + + @Override + public Options setLogReadaheadSize(long logReadaheadSize) { + setLogReadaheadSize(nativeHandle_, logReadaheadSize); + return this; + } + + @Override + public long logReadaheadSize() { + assert (isOwningHandle()); + return logReadaheadSize(nativeHandle_); + } + + @Override + public Options setBestEffortsRecovery(boolean bestEffortsRecovery) { + setBestEffortsRecovery(nativeHandle_, bestEffortsRecovery); + 
return this; + } + + @Override + public boolean bestEffortsRecovery() { + assert (isOwningHandle()); + return bestEffortsRecovery(nativeHandle_); + } + + @Override + public Options setMaxBgErrorResumeCount(int maxBgerrorResumeCount) { + setMaxBgErrorResumeCount(nativeHandle_, maxBgerrorResumeCount); + return this; + } + + @Override + public int maxBgerrorResumeCount() { + assert (isOwningHandle()); + return maxBgerrorResumeCount(nativeHandle_); + } + + @Override + public Options setBgerrorResumeRetryInterval(long bgerrorResumeRetryInterval) { + setBgerrorResumeRetryInterval(nativeHandle_, bgerrorResumeRetryInterval); + return this; + } + + @Override + public long bgerrorResumeRetryInterval() { + assert (isOwningHandle()); + return bgerrorResumeRetryInterval(nativeHandle_); + } + + @Override + public Options setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) { + setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_); + this.sstPartitionerFactory_ = sstPartitionerFactory; + return this; + } + + @Override + public SstPartitionerFactory sstPartitionerFactory() { + return sstPartitionerFactory_; + } + + @Override + public Options setCompactionThreadLimiter(final ConcurrentTaskLimiter compactionThreadLimiter) { + setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_); + this.compactionThreadLimiter_ = compactionThreadLimiter; + return this; + } + + @Override + public ConcurrentTaskLimiter compactionThreadLimiter() { + assert (isOwningHandle()); + return this.compactionThreadLimiter_; + } + + // + // BEGIN options for blobs (integrated BlobDB) + // + + @Override + public Options setEnableBlobFiles(final boolean enableBlobFiles) { + setEnableBlobFiles(nativeHandle_, enableBlobFiles); + return this; + } + + @Override + public boolean enableBlobFiles() { + return enableBlobFiles(nativeHandle_); + } + + @Override + public Options setMinBlobSize(final long minBlobSize) { + setMinBlobSize(nativeHandle_, 
minBlobSize); + return this; + } + + @Override + public long minBlobSize() { + return minBlobSize(nativeHandle_); + } + + @Override + public Options setBlobFileSize(final long blobFileSize) { + setBlobFileSize(nativeHandle_, blobFileSize); + return this; + } + + @Override + public long blobFileSize() { + return blobFileSize(nativeHandle_); + } + + @Override + public Options setBlobCompressionType(CompressionType compressionType) { + setBlobCompressionType(nativeHandle_, compressionType.getValue()); + return this; + } + + @Override + public CompressionType blobCompressionType() { + return CompressionType.values()[blobCompressionType(nativeHandle_)]; + } + + @Override + public Options setEnableBlobGarbageCollection(final boolean enableBlobGarbageCollection) { + setEnableBlobGarbageCollection(nativeHandle_, enableBlobGarbageCollection); + return this; + } + + @Override + public boolean enableBlobGarbageCollection() { + return enableBlobGarbageCollection(nativeHandle_); + } + + @Override + public Options setBlobGarbageCollectionAgeCutoff(final double blobGarbageCollectionAgeCutoff) { + setBlobGarbageCollectionAgeCutoff(nativeHandle_, blobGarbageCollectionAgeCutoff); + return this; + } + + @Override + public double blobGarbageCollectionAgeCutoff() { + return blobGarbageCollectionAgeCutoff(nativeHandle_); + } + + @Override + public Options setBlobGarbageCollectionForceThreshold( + final double blobGarbageCollectionForceThreshold) { + setBlobGarbageCollectionForceThreshold(nativeHandle_, blobGarbageCollectionForceThreshold); + return this; + } + + @Override + public double blobGarbageCollectionForceThreshold() { + return blobGarbageCollectionForceThreshold(nativeHandle_); + } + + // + // END options for blobs (integrated BlobDB) + // + private native static long newOptions(); private native static long newOptions(long dbOptHandle, long cfOptHandle); @@ -1881,6 +2195,9 @@ private native long walTtlSeconds(long handle); private native void setWalSizeLimitMB(long handle, 
long sizeLimitMB); private native long walSizeLimitMB(long handle); + private static native void setMaxWriteBatchGroupSizeBytes( + final long handle, final long maxWriteBatchGroupSizeBytes); + private static native long maxWriteBatchGroupSizeBytes(final long handle); private native void setManifestPreallocationSize( long handle, long size) throws IllegalArgumentException; private native long manifestPreallocationSize(long handle); @@ -1947,6 +2264,9 @@ final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); @@ -1973,6 +2293,9 @@ private native void setSkipStatsUpdateOnDbOpen(final long handle, final boolean skipStatsUpdateOnDbOpen); private native boolean skipStatsUpdateOnDbOpen(final long handle); + private static native void setSkipCheckingSstFileSizesOnDbOpen( + final long handle, final boolean skipChecking); + private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); private native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); private native byte walRecoveryMode(final long handle); @@ -2010,7 +2333,10 @@ // CF native handles + private static native void oldDefaults( + final long handle, final int majorVersion, final int minorVersion); private native void optimizeForSmallDb(final long handle); + private static native void optimizeForSmallDb(final long handle, final long cacheHandle); private native void optimizeForPointLookup(long handle, long blockCacheSizeMb); private native void optimizeLevelStyleCompaction(long handle, @@ -2097,6 +2423,11 @@ private native String memTableFactoryName(long handle); private native void 
setTableFactory(long handle, long factoryHandle); private native String tableFactoryName(long handle); + private static native void setCfPaths( + final long handle, final String[] paths, final long[] targetSizes); + private static native long cfPathsLen(final long handle); + private static native void cfPaths( + final long handle, final String[] paths, final long[] targetSizes); private native void setInplaceUpdateSupport( long handle, boolean inplaceUpdateSupport); private native boolean inplaceUpdateSupport(long handle); @@ -2152,6 +2483,9 @@ private native boolean reportBgIoStats(final long handle); private native void setTtl(final long handle, final long ttl); private native long ttl(final long handle); + private native void setPeriodicCompactionSeconds( + final long handle, final long periodicCompactionSeconds); + private native long periodicCompactionSeconds(final long handle); private native void setCompactionOptionsUniversal(final long handle, final long compactionOptionsUniversalHandle); private native void setCompactionOptionsFIFO(final long handle, @@ -2162,6 +2496,47 @@ private native void setAtomicFlush(final long handle, final boolean atomicFlush); private native boolean atomicFlush(final long handle); + private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); + private static native void setCompactionThreadLimiter( + final long nativeHandle_, final long newLimiterHandle); + private static native void setAvoidUnnecessaryBlockingIO( + final long handle, final boolean avoidBlockingIO); + private static native boolean avoidUnnecessaryBlockingIO(final long handle); + private static native void setPersistStatsToDisk( + final long handle, final boolean persistStatsToDisk); + private static native boolean persistStatsToDisk(final long handle); + private static native void setWriteDbidToManifest( + final long handle, final boolean writeDbidToManifest); + private static native boolean writeDbidToManifest(final long handle); + 
private static native void setLogReadaheadSize(final long handle, final long logReadaheadSize); + private static native long logReadaheadSize(final long handle); + private static native void setBestEffortsRecovery( + final long handle, final boolean bestEffortsRecovery); + private static native boolean bestEffortsRecovery(final long handle); + private static native void setMaxBgErrorResumeCount( + final long handle, final int maxBgerrorRecumeCount); + private static native int maxBgerrorResumeCount(final long handle); + private static native void setBgerrorResumeRetryInterval( + final long handle, final long bgerrorResumeRetryInterval); + private static native long bgerrorResumeRetryInterval(final long handle); + + private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); + private native boolean enableBlobFiles(final long nativeHandle_); + private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); + private native long minBlobSize(final long nativeHandle_); + private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); + private native long blobFileSize(final long nativeHandle_); + private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType); + private native byte blobCompressionType(final long nativeHandle_); + private native void setEnableBlobGarbageCollection( + final long nativeHandle_, final boolean enableBlobGarbageCollection); + private native boolean enableBlobGarbageCollection(final long nativeHandle_); + private native void setBlobGarbageCollectionAgeCutoff( + final long nativeHandle_, final double blobGarbageCollectionAgeCutoff); + private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); + private native void setBlobGarbageCollectionForceThreshold( + final long nativeHandle_, final double blobGarbageCollectionForceThreshold); + private native double blobGarbageCollectionForceThreshold(final long 
nativeHandle_); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! @@ -2180,4 +2555,6 @@ private Cache rowCache_; private WalFilter walFilter_; private WriteBufferManager writeBufferManager_; + private SstPartitionerFactory sstPartitionerFactory_; + private ConcurrentTaskLimiter compactionThreadLimiter_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,6 @@ package org.rocksdb; -import java.util.ArrayList; import java.util.List; public class OptionsUtil { @@ -59,7 +58,7 @@ * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be * returned. * @param ignoreUnknownOptions this flag can be set to true if you want to - * ignore options that are from a newer version of the db, esentially for + * ignore options that are from a newer version of the db, essentially for * forward compatibility. * * @throws RocksDBException thrown if error happens in underlying @@ -76,6 +75,25 @@ * and ColumnFamilyDescriptors based on the specified RocksDB Options file. * See LoadLatestOptions above. * + * @param dbPath the path to the RocksDB. + * @param configOptions {@link org.rocksdb.ConfigOptions} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public static void loadLatestOptions(ConfigOptions configOptions, String dbPath, + DBOptions dbOptions, List cfDescs) throws RocksDBException { + loadLatestOptions(configOptions.nativeHandle_, dbPath, dbOptions.nativeHandle_, cfDescs); + } + + /** + * Similar to LoadLatestOptions, this function constructs the DBOptions + * and ColumnFamilyDescriptors based on the specified RocksDB Options file. + * See LoadLatestOptions above. + * * @param optionsFileName the RocksDB options file path. * @param env {@link org.rocksdb.Env} instance. * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be @@ -112,6 +130,26 @@ } /** + * Similar to LoadLatestOptions, this function constructs the DBOptions + * and ColumnFamilyDescriptors based on the specified RocksDB Options file. + * See LoadLatestOptions above. + * + * @param optionsFileName the RocksDB options file path. + * @param configOptions {@link org.rocksdb.ConfigOptions} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static void loadOptionsFromFile(ConfigOptions configOptions, String optionsFileName, + DBOptions dbOptions, List cfDescs) throws RocksDBException { + loadOptionsFromFile( + configOptions.nativeHandle_, optionsFileName, dbOptions.nativeHandle_, cfDescs); + } + + /** * Returns the latest options file name under the specified RocksDB path. * * @param dbPath the path to the RocksDB. 
@@ -134,9 +172,13 @@ // native methods private native static void loadLatestOptions(String dbPath, long envHandle, long dbOptionsHandle, List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException; + private native static void loadLatestOptions(long cfgHandle, String dbPath, long dbOptionsHandle, + List cfDescs) throws RocksDBException; private native static void loadOptionsFromFile(String optionsFileName, long envHandle, long dbOptionsHandle, List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException; + private native static void loadOptionsFromFile(long cfgHandle, String optionsFileName, + long dbOptionsHandle, List cfDescs) throws RocksDBException; private native static String getLatestOptionsFileName(String dbPath, long envHandle) throws RocksDBException; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -37,6 +37,8 @@ super(copyReadOptions(other.nativeHandle_)); this.iterateLowerBoundSlice_ = other.iterateLowerBoundSlice_; this.iterateUpperBoundSlice_ = other.iterateUpperBoundSlice_; + this.timestampSlice_ = other.timestampSlice_; + this.iterStartTs_ = other.iterStartTs_; } /** @@ -437,16 +439,15 @@ * * Default: null * - * @param iterateLowerBound Slice representing the upper bound + * @param iterateLowerBound Slice representing the lower bound * @return the reference to the current ReadOptions. 
*/ - public ReadOptions setIterateLowerBound(final Slice iterateLowerBound) { + public ReadOptions setIterateLowerBound(final AbstractSlice iterateLowerBound) { assert(isOwningHandle()); - if (iterateLowerBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. - iterateLowerBoundSlice_ = iterateLowerBound; - setIterateLowerBound(nativeHandle_, iterateLowerBoundSlice_.getNativeHandle()); - } + setIterateLowerBound( + nativeHandle_, iterateLowerBound == null ? 0 : iterateLowerBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateLowerBoundSlice_ = iterateLowerBound; return this; } @@ -485,13 +486,12 @@ * @param iterateUpperBound Slice representing the upper bound * @return the reference to the current ReadOptions. */ - public ReadOptions setIterateUpperBound(final Slice iterateUpperBound) { + public ReadOptions setIterateUpperBound(final AbstractSlice iterateUpperBound) { assert(isOwningHandle()); - if (iterateUpperBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. - iterateUpperBoundSlice_ = iterateUpperBound; - setIterateUpperBound(nativeHandle_, iterateUpperBoundSlice_.getNativeHandle()); - } + setIterateUpperBound( + nativeHandle_, iterateUpperBound == null ? 0 : iterateUpperBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateUpperBoundSlice_ = iterateUpperBound; return this; } @@ -562,6 +562,233 @@ return iterStartSeqnum(nativeHandle_); } + /** + * When true, by default use total_order_seek = true, and RocksDB can + * selectively enable prefix seek mode if won't generate a different result + * from total_order_seek, based on seek key, and iterator upper bound. + * Not supported in ROCKSDB_LITE mode, in the way that even with value true + * prefix mode is not used. + * Default: false + * + * @return true if auto prefix mode is set. 
+ * + */ + public boolean autoPrefixMode() { + assert (isOwningHandle()); + return autoPrefixMode(nativeHandle_); + } + + /** + * When true, by default use total_order_seek = true, and RocksDB can + * selectively enable prefix seek mode if won't generate a different result + * from total_order_seek, based on seek key, and iterator upper bound. + * Not supported in ROCKSDB_LITE mode, in the way that even with value true + * prefix mode is not used. + * Default: false + * @param mode auto prefix mode + * @return the reference to the current ReadOptions. + */ + public ReadOptions setAutoPrefixMode(final boolean mode) { + assert (isOwningHandle()); + setAutoPrefixMode(nativeHandle_, mode); + return this; + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. The user is responsible for providing a customized + * compare function via Comparator to order >key, timestamp> tuples. + * For iterator, iter_start_ts is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * @see #iterStartTs() + * @return Reference to timestamp or null if there is no timestamp defined. + */ + public Slice timestamp() { + assert (isOwningHandle()); + final long timestampSliceHandle = timestamp(nativeHandle_); + if (timestampSliceHandle != 0) { + return new Slice(timestampSliceHandle); + } else { + return null; + } + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. 
The user is responsible for providing a customized + * compare function via Comparator to order {@code } tuples. + * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * @see #setIterStartTs(AbstractSlice) + * @param timestamp Slice representing the timestamp + * @return the reference to the current ReadOptions. + */ + public ReadOptions setTimestamp(final AbstractSlice timestamp) { + assert (isOwningHandle()); + setTimestamp(nativeHandle_, timestamp == null ? 0 : timestamp.getNativeHandle()); + timestampSlice_ = timestamp; + return this; + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. The user is responsible for providing a customized + * compare function via Comparator to order {@code } tuples. + * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * @return Reference to lower bound timestamp or null if there is no lower bound timestamp + * defined. 
+ */ + public Slice iterStartTs() { + assert (isOwningHandle()); + final long iterStartTsHandle = iterStartTs(nativeHandle_); + if (iterStartTsHandle != 0) { + return new Slice(iterStartTsHandle); + } else { + return null; + } + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. The user is responsible for providing a customized + * compare function via Comparator to order {@code } tuples. + * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * + * @param iterStartTs Reference to lower bound timestamp or null if there is no lower bound + * timestamp defined + * @return the reference to the current ReadOptions. + */ + public ReadOptions setIterStartTs(final AbstractSlice iterStartTs) { + assert (isOwningHandle()); + setIterStartTs(nativeHandle_, iterStartTs == null ? 0 : iterStartTs.getNativeHandle()); + iterStartTs_ = iterStartTs; + return this; + } + + /** + * Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + * in microseconds. + * It should be set to microseconds since epoch, i.e, {@code gettimeofday} or + * equivalent plus allowed duration in microseconds. The best way is to use + * {@code env->NowMicros() + some timeout}. + * This is best efforts. 
The call may exceed the deadline if there is IO + * involved and the file system doesn't support deadlines, or due to + * checking for deadline periodically rather than for every key if + * processing a batch + * + * @return deadline time in microseconds + */ + public long deadline() { + assert (isOwningHandle()); + return deadline(nativeHandle_); + } + + /** + * Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + * in microseconds. + * It should be set to microseconds since epoch, i.e, {@code gettimeofday} or + * equivalent plus allowed duration in microseconds. The best way is to use + * {@code env->NowMicros() + some timeout}. + * This is best efforts. The call may exceed the deadline if there is IO + * involved and the file system doesn't support deadlines, or due to + * checking for deadline periodically rather than for every key if + * processing a batch + * + * @param deadlineTime deadline time in microseconds. + * @return the reference to the current ReadOptions. + */ + public ReadOptions setDeadline(final long deadlineTime) { + assert (isOwningHandle()); + setDeadline(nativeHandle_, deadlineTime); + return this; + } + + /** + * A timeout in microseconds to be passed to the underlying FileSystem for + * reads. As opposed to deadline, this determines the timeout for each + * individual file read request. If a MultiGet/Get/Seek/Next etc call + * results in multiple reads, each read can last up to io_timeout us. + * @return ioTimeout time in microseconds + */ + public long ioTimeout() { + assert (isOwningHandle()); + return ioTimeout(nativeHandle_); + } + + /** + * A timeout in microseconds to be passed to the underlying FileSystem for + * reads. As opposed to deadline, this determines the timeout for each + * individual file read request. If a MultiGet/Get/Seek/Next etc call + * results in multiple reads, each read can last up to io_timeout us. + * + * @param ioTimeout time in microseconds. 
+ * @return the reference to the current ReadOptions. + */ + public ReadOptions setIoTimeout(final long ioTimeout) { + assert (isOwningHandle()); + setIoTimeout(nativeHandle_, ioTimeout); + return this; + } + + /** + * It limits the maximum cumulative value size of the keys in batch while + * reading through MultiGet. Once the cumulative value size exceeds this + * soft limit then all the remaining keys are returned with status Aborted. + * + * Default: {@code std::numeric_limits::max()} + * @return actual valueSizeSofLimit + */ + public long valueSizeSoftLimit() { + assert (isOwningHandle()); + return valueSizeSoftLimit(nativeHandle_); + } + + /** + * It limits the maximum cumulative value size of the keys in batch while + * reading through MultiGet. Once the cumulative value size exceeds this + * soft limit then all the remaining keys are returned with status Aborted. + * + * Default: {@code std::numeric_limits::max()} + * + * @param valueSizeSofLimit + * @return the reference to the current ReadOptions + */ + public ReadOptions setValueSizeSoftLimit(final long valueSizeSofLimit) { + assert (isOwningHandle()); + setValueSizeSoftLimit(nativeHandle_, valueSizeSofLimit); + return this; + } + // instance variables // NOTE: If you add new member variables, please update the copy constructor above! // @@ -570,8 +797,10 @@ // freely leave scope without us losing the Java Slice object, which during // close() would also reap its associated rocksdb::Slice native object since // it's possibly (likely) to be an owning handle. 
- private Slice iterateLowerBoundSlice_; - private Slice iterateUpperBoundSlice_; + private AbstractSlice iterateLowerBoundSlice_; + private AbstractSlice iterateUpperBoundSlice_; + private AbstractSlice timestampSlice_; + private AbstractSlice iterStartTs_; private native static long newReadOptions(); private native static long newReadOptions(final boolean verifyChecksums, @@ -619,4 +848,16 @@ final long tableFilterHandle); private native void setIterStartSeqnum(final long handle, final long seqNum); private native long iterStartSeqnum(final long handle); + private native boolean autoPrefixMode(final long handle); + private native void setAutoPrefixMode(final long handle, final boolean autoPrefixMode); + private native long timestamp(final long handle); + private native void setTimestamp(final long handle, final long timestampSliceHandle); + private native long iterStartTs(final long handle); + private native void setIterStartTs(final long handle, final long iterStartTsHandle); + private native long deadline(final long handle); + private native void setDeadline(final long handle, final long deadlineTime); + private native long ioTimeout(final long handle); + private native void setIoTimeout(final long handle, final long ioTimeout); + private native long valueSizeSoftLimit(final long handle); + private native void setValueSizeSoftLimit(final long handle, final long softLimit); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.List; + /** * RocksCallbackObject is similar to {@link 
RocksObject} but varies * in its construction as it is designed for Java objects which have functions @@ -27,6 +29,27 @@ } /** + * Given a list of RocksCallbackObjects, it returns a list + * of the native handles of the underlying objects. + * + * @param objectList the rocks callback objects + * + * @return the native handles + */ + static /* @Nullable */ long[] toNativeHandleList( + /* @Nullable */ final List objectList) { + if (objectList == null) { + return null; + } + final int len = objectList.size(); + final long[] handleList = new long[len]; + for (int i = 0; i < len; i++) { + handleList[i] = objectList.get(i).nativeHandle_; + } + return handleList; + } + + /** * Construct the Native C++ object which will callback * to our object methods * diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -31,13 +31,15 @@ LOADED } - private static AtomicReference libraryLoaded - = new AtomicReference<>(LibraryState.NOT_LOADED); + private static final AtomicReference libraryLoaded = + new AtomicReference<>(LibraryState.NOT_LOADED); static { RocksDB.loadLibrary(); } + private final List ownedColumnFamilyHandles = new ArrayList<>(); + /** * Loads the necessary library files. * Calling this method twice will have no effect. @@ -59,18 +61,21 @@ if (compressionType.getLibraryName() != null) { System.loadLibrary(compressionType.getLibraryName()); } - } catch (UnsatisfiedLinkError e) { + } catch (final UnsatisfiedLinkError e) { // since it may be optional, we ignore its loading failure here. 
} } try { NativeLibraryLoader.getInstance().loadLibrary(tmpDir); - } catch (IOException e) { + } catch (final IOException e) { libraryLoaded.set(LibraryState.NOT_LOADED); throw new RuntimeException("Unable to load the RocksDB shared library", e); } + final int encodedVersion = version(); + version = Version.fromEncodedVersion(encodedVersion); + libraryLoaded.set(LibraryState.LOADED); return; } @@ -107,7 +112,7 @@ System.load(path + "/" + Environment.getSharedLibraryFileName( compressionType.getLibraryName())); break; - } catch (UnsatisfiedLinkError e) { + } catch (final UnsatisfiedLinkError e) { // since they are optional, we ignore loading fails. } } @@ -120,7 +125,7 @@ Environment.getJniLibraryFileName("rocksdbjni")); success = true; break; - } catch (UnsatisfiedLinkError e) { + } catch (final UnsatisfiedLinkError e) { err = e; } } @@ -129,6 +134,9 @@ throw err; } + final int encodedVersion = version(); + version = Version.fromEncodedVersion(encodedVersion); + libraryLoaded.set(LibraryState.LOADED); return; } @@ -142,6 +150,10 @@ } } + public static Version rocksdbVersion() { + return version; + } + /** * Private constructor. * @@ -297,9 +309,12 @@ db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -319,12 +334,63 @@ throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. - Options options = new Options(); + final Options options = new Options(); return openReadOnly(options, path); } /** * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. 
+ * + * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically. + * + * @param options {@link Options} instance. + * @param path the path to the RocksDB. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(final Options options, final String path) + throws RocksDBException { + return openReadOnly(options, path, false); + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. + * + * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically. + * + * @param options {@link Options} instance. + * @param path the path to the RocksDB. + * @param errorIfWalFileExists true to raise an error when opening the db + * if a Write Ahead Log file exists, false otherwise. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(final Options options, final String path, + final boolean errorIfWalFileExists) throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. 
+ final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path, errorIfWalFileExists)); + db.storeOptionsInstance(options); + return db; + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the default * options. * @@ -345,8 +411,7 @@ // This allows to use the rocksjni default Options instead of // the c++ one. final DBOptions options = new DBOptions(); - return openReadOnly(options, path, columnFamilyDescriptors, - columnFamilyHandles); + return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); } /** @@ -354,26 +419,27 @@ * Read-Only mode given the path to the database using the specified * options and db path. * - * Options instance *should* not be disposed before all DBs using this options - * instance have been closed. If user doesn't call options dispose explicitly, - * then this options instance will be GC'd automatically. + *

      This open method allows to open RocksDB using a subset of available + * column families

      + *

      Options instance *should* not be disposed before all DBs using this + * options instance have been closed. If user doesn't call options dispose + * explicitly,then this options instance will be GC'd automatically.

      * - * @param options {@link Options} instance. + * @param options {@link DBOptions} instance. * @param path the path to the RocksDB. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public static RocksDB openReadOnly(final Options options, final String path) - throws RocksDBException { - // when non-default Options is used, keeping an Options reference - // in RocksDB can prevent Java to GC during the life-time of - // the currently-created RocksDB. - final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path)); - db.storeOptionsInstance(options); - return db; + public static RocksDB openReadOnly(final DBOptions options, final String path, + final List columnFamilyDescriptors, + final List columnFamilyHandles) throws RocksDBException { + return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); } /** @@ -392,6 +458,8 @@ * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. + * @param errorIfWalFileExists true to raise an error when opening the db + * if a Write Ahead Log file exists, false otherwise. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. 
* @@ -400,7 +468,7 @@ */ public static RocksDB openReadOnly(final DBOptions options, final String path, final List columnFamilyDescriptors, - final List columnFamilyHandles) + final List columnFamilyHandles, final boolean errorIfWalFileExists) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of @@ -415,15 +483,114 @@ cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; } - final long[] handles = openROnly(options.nativeHandle_, path, cfNames, - cfOptionHandles); + final long[] handles = + openROnly(options.nativeHandle_, path, cfNames, cfOptionHandles, errorIfWalFileExists); final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + + return db; + } + + /** + * Open DB as secondary instance with only the default column family. + * + * The secondary instance can dynamically tail the MANIFEST of + * a primary that must have already been created. User can call + * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up + * with primary (WAL tailing is NOT supported now) whenever the user feels + * necessary. Column families created by the primary after the secondary + * instance starts are currently ignored by the secondary instance. + * Column families opened by secondary and dropped by the primary will be + * dropped by secondary as well. However the user of the secondary instance + * can still access the data of such dropped column family as long as they + * do not destroy the corresponding column family handle. + * WAL tailing is not supported at present, but will arrive soon. 
+ * + * @param options the options to open the secondary instance. + * @param path the path to the primary RocksDB instance. + * @param secondaryPath points to a directory where the secondary instance + * stores its info log + * + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openAsSecondary(final Options options, final String path, + final String secondaryPath) throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + final RocksDB db = new RocksDB(openAsSecondary(options.nativeHandle_, path, secondaryPath)); + db.storeOptionsInstance(options); + return db; + } + + /** + * Open DB as secondary instance with column families. + * You can open a subset of column families in secondary mode. + * + * The secondary instance can dynamically tail the MANIFEST of + * a primary that must have already been created. User can call + * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up + * with primary (WAL tailing is NOT supported now) whenever the user feels + * necessary. Column families created by the primary after the secondary + * instance starts are currently ignored by the secondary instance. + * Column families opened by secondary and dropped by the primary will be + * dropped by secondary as well. However the user of the secondary instance + * can still access the data of such dropped column family as long as they + * do not destroy the corresponding column family handle. + * WAL tailing is not supported at present, but will arrive soon. + * + * @param options the options to open the secondary instance. + * @param path the path to the primary RocksDB instance. 
+ * @param secondaryPath points to a directory where the secondary instance + * stores its info log. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openAsSecondary(final DBOptions options, final String path, + final String secondaryPath, final List columnFamilyDescriptors, + final List columnFamilyHandles) throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + + final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; + final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; + for (int i = 0; i < columnFamilyDescriptors.size(); i++) { + final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors.get(i); + cfNames[i] = cfDescriptor.getName(); + cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; + } + + final long[] handles = + openAsSecondary(options.nativeHandle_, path, secondaryPath, cfNames, cfOptionHandles); + final RocksDB db = new RocksDB(handles[0]); + db.storeOptionsInstance(options); + + for (int i = 1; i < handles.length; i++) { + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); + } + + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -441,6 +608,11 @@ * @throws RocksDBException if an error occurs whilst closing. 
*/ public void closeE() throws RocksDBException { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -463,6 +635,11 @@ */ @Override public void close() { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -505,10 +682,12 @@ public ColumnFamilyHandle createColumnFamily( final ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException { - return new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_, - columnFamilyDescriptor.getName(), - columnFamilyDescriptor.getName().length, - columnFamilyDescriptor.getOptions().nativeHandle_)); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, + createColumnFamily(nativeHandle_, columnFamilyDescriptor.getName(), + columnFamilyDescriptor.getName().length, + columnFamilyDescriptor.getOptions().nativeHandle_)); + ownedColumnFamilyHandles.add(columnFamilyHandle); + return columnFamilyHandle; } /** @@ -532,8 +711,10 @@ final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -563,8 +744,10 @@ final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + 
columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -597,7 +780,22 @@ dropColumnFamilies(nativeHandle_, cfHandles); } - //TODO(AR) what about DestroyColumnFamilyHandle + /** + * Deletes native column family handle of given {@link ColumnFamilyHandle} Java object + * and removes reference from {@link RocksDB#ownedColumnFamilyHandles}. + * + * @param columnFamilyHandle column family handle object. + */ + public void destroyColumnFamilyHandle(final ColumnFamilyHandle columnFamilyHandle) { + for (int i = 0; i < ownedColumnFamilyHandles.size(); ++i) { + final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i); + if (ownedHandle.equals(columnFamilyHandle)) { + columnFamilyHandle.close(); + ownedColumnFamilyHandles.remove(i); + return; + } + } + } /** * Set the database entry for "key" to "value". @@ -2020,8 +2218,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2080,8 +2278,8 @@ } final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2119,8 +2317,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new 
int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2180,8 +2378,8 @@ } final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2217,8 +2415,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2263,8 +2461,8 @@ } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2290,8 +2488,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2336,8 +2534,8 @@ } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new 
int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2347,8 +2545,158 @@ } /** + * Fetches a list of values for the given list of keys, all from the default column family. + * + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @return list of number of bytes in DB for each requested key + * this can be more than the size of the corresponding buffer; then the buffer will be filled + * with the appropriate truncation of the database value. + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys and passed values + * do not match. + */ + public List multiGetByteBuffers( + final List keys, final List values) throws RocksDBException { + final ReadOptions readOptions = new ReadOptions(); + final List columnFamilyHandleList = new ArrayList<>(1); + columnFamilyHandleList.add(getDefaultColumnFamily()); + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } + + /** + * Fetches a list of values for the given list of keys, all from the default column family. + * + * @param readOptions Read options + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys and passed values + * do not match. + */ + public List multiGetByteBuffers(final ReadOptions readOptions, + final List keys, final List values) throws RocksDBException { + final List columnFamilyHandleList = new ArrayList<>(1); + columnFamilyHandleList.add(getDefaultColumnFamily()); + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } + + /** + * Fetches a list of values for the given list of keys. + *

      + * Note: Every key needs to have a related column family name in + * {@code columnFamilyHandleList}. + *

      + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys, passed values and + * passed column family handles do not match. + */ + public List multiGetByteBuffers( + final List columnFamilyHandleList, final List keys, + final List values) throws RocksDBException { + final ReadOptions readOptions = new ReadOptions(); + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } + + /** + * Fetches a list of values for the given list of keys. + *

      + * Note: Every key needs to have a related column family name in + * {@code columnFamilyHandleList}. + *

      + * + * @param readOptions Read options + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys, passed values and + * passed column family handles do not match. + */ + public List multiGetByteBuffers(final ReadOptions readOptions, + final List columnFamilyHandleList, final List keys, + final List values) throws RocksDBException { + assert (keys.size() != 0); + + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size() != columnFamilyHandleList.size() && columnFamilyHandleList.size() > 1) { + throw new IllegalArgumentException( + "Wrong number of ColumnFamilyHandle(s) supplied. Provide 0, 1, or as many as there are key/value(s)"); + } + + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. 
+ if (values.size() != keys.size()) { + throw new IllegalArgumentException("For each key there must be a corresponding value."); + } + + // TODO (AP) support indirect buffers + for (final ByteBuffer key : keys) { + if (!key.isDirect()) { + throw new IllegalArgumentException("All key buffers must be direct byte buffers"); + } + } + + // TODO (AP) support indirect buffers, though probably via a less efficient code path + for (final ByteBuffer value : values) { + if (!value.isDirect()) { + throw new IllegalArgumentException("All value buffers must be direct byte buffers"); + } + } + + final int numCFHandles = columnFamilyHandleList.size(); + final long[] cfHandles = new long[numCFHandles]; + for (int i = 0; i < numCFHandles; i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + + final int numValues = keys.size(); + + final ByteBuffer[] keysArray = keys.toArray(new ByteBuffer[0]); + final int[] keyOffsets = new int[numValues]; + final int[] keyLengths = new int[numValues]; + for (int i = 0; i < numValues; i++) { + // TODO (AP) add keysArray[i].arrayOffset() if the buffer is indirect + // TODO (AP) because in that case we have to pass the array directly, + // so that the JNI C++ code will not know to compensate for the array offset + keyOffsets[i] = keysArray[i].position(); + keyLengths[i] = keysArray[i].limit(); + } + final ByteBuffer[] valuesArray = values.toArray(new ByteBuffer[0]); + final int[] valuesSizeArray = new int[numValues]; + final Status[] statusArray = new Status[numValues]; + + multiGet(nativeHandle_, readOptions.nativeHandle_, cfHandles, keysArray, keyOffsets, keyLengths, + valuesArray, valuesSizeArray, statusArray); + + final List results = new ArrayList<>(); + for (int i = 0; i < numValues; i++) { + final Status status = statusArray[i]; + if (status.getCode() == Status.Code.Ok) { + final ByteBuffer value = valuesArray[i]; + value.position(Math.min(valuesSizeArray[i], value.capacity())); + value.flip(); // prepare for read out + 
results.add(new ByteBufferGetStatus(status, valuesSizeArray[i], value)); + } else { + results.add(new ByteBufferGetStatus(status)); + } + } + + return results; + } + + /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2372,7 +2720,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2401,7 +2751,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2428,7 +2780,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. 
* * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2460,7 +2814,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2487,7 +2843,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2519,7 +2877,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2548,7 +2908,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. 
@@ -2602,6 +2964,159 @@ } /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. + * + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. + */ + public boolean keyMayExist(final ByteBuffer key) { + return keyMayExist(null, (ReadOptions) null, key); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. + * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. + */ + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) { + return keyMayExist(columnFamilyHandle, (ReadOptions) null, key); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. + * + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. 
+ */ + public boolean keyMayExist(final ReadOptions readOptions, final ByteBuffer key) { + return keyMayExist(null, readOptions, key); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. + * + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist(final ByteBuffer key, final ByteBuffer value) { + return keyMayExist(null, null, key, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. 
+ * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist( + final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value) { + return keyMayExist(columnFamilyHandle, null, key, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. + * + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist( + final ReadOptions readOptions, final ByteBuffer key, final ByteBuffer value) { + return keyMayExist(null, readOptions, key, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. 
+ * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. + */ + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final ByteBuffer key) { + assert key != null : "key ByteBuffer parameter cannot be null"; + assert key.isDirect() : "key parameter must be a direct ByteBuffer"; + return keyMayExistDirect(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.limit()); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. 
+ * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final ByteBuffer key, final ByteBuffer value) { + assert key != null : "key ByteBuffer parameter cannot be null"; + assert key.isDirect() : "key parameter must be a direct ByteBuffer"; + assert value + != null + : "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method"; + assert value.isDirect() : "value parameter must be a direct ByteBuffer"; + + final int[] result = keyMayExistDirectFoundValue(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.remaining(), + value, value.position(), value.remaining()); + final int valueLength = result[1]; + value.limit(value.position() + Math.min(valueLength, value.remaining())); + return new KeyMayExist(KeyMayExist.KeyMayExistEnum.values()[result[0]], valueLength); + } + + /** *

      Return a heap-allocated iterator over the contents of the * database. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator @@ -2636,8 +3151,8 @@ } /** - *

      Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *

      Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).

      * @@ -2656,8 +3171,8 @@ } /** - *

      Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *

      Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).

      * @@ -3376,9 +3891,52 @@ /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, final MutableColumnFamilyOptions mutableColumnFamilyOptions) throws RocksDBException { - setOptions(nativeHandle_, columnFamilyHandle.nativeHandle_, - mutableColumnFamilyOptions.getKeys(), - mutableColumnFamilyOptions.getValues()); + setOptions(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues()); + } + + /** + * Get the options for the column family handle + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family. + * + * @return the options parsed from the options string return by RocksDB + * + * @throws RocksDBException if an error occurs while getting the options string, or parsing the + * resulting options string into options + */ + public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { + String optionsString = getOptions( + nativeHandle_, columnFamilyHandle == null ? 
0 : columnFamilyHandle.nativeHandle_); + return MutableColumnFamilyOptions.parse(optionsString, true); + } + + /** + * Default column family options + * + * @return the options parsed from the options string return by RocksDB + * + * @throws RocksDBException if an error occurs while getting the options string, or parsing the + * resulting options string into options + */ + public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions() + throws RocksDBException { + return getOptions(null); + } + + /** + * Get the database options + * + * @return the DB options parsed from the options string return by RocksDB + * + * @throws RocksDBException if an error occurs while getting the options string, or parsing the + * resulting options string into options + */ + public MutableDBOptions.MutableDBOptionsBuilder getDBOptions() throws RocksDBException { + String optionsString = getDBOptions(nativeHandle_); + return MutableDBOptions.parse(optionsString, true); } /** @@ -3480,6 +4038,17 @@ } /** + * This function will cancel all currently running background processes. + * + * @param wait if true, wait for all background work to be cancelled before + * returning. + * + */ + public void cancelAllBackgroundWork(boolean wait) { + cancelAllBackgroundWork(nativeHandle_, wait); + } + + /** * This function will wait until all currently running background processes * finish. After it returns, no background process will be run until * {@link #continueBackgroundWork()} is called @@ -3914,7 +4483,7 @@ * * @return the column family metadata */ - public ColumnFamilyMetaData GetColumnFamilyMetaData() { + public ColumnFamilyMetaData getColumnFamilyMetaData() { return getColumnFamilyMetaData(null); } @@ -4146,6 +4715,25 @@ } /** + * Make the secondary instance catch up with the primary by tailing and + * replaying the MANIFEST and WAL of the primary. 
+ * Column families created by the primary after the secondary instance starts + * will be ignored unless the secondary instance closes and restarts with the + * newly created column families. + * Column families that exist before secondary instance starts and dropped by + * the primary afterwards will be marked as dropped. However, as long as the + * secondary instance does not delete the corresponding column family + * handles, the data of the column family is still accessible to the + * secondary. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void tryCatchUpWithPrimary() throws RocksDBException { + tryCatchUpWithPrimary(nativeHandle_); + } + + /** * Delete files in multiple ranges at once. * Delete files in a lot of ranges one at a time can be slow, use this API for * better performance in that case. @@ -4212,7 +4800,7 @@ return rangeSliceHandles; } - protected void storeOptionsInstance(DBOptionsInterface options) { + protected void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } @@ -4248,8 +4836,8 @@ final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; - private native static long openROnly(final long optionsHandle, - final String path) throws RocksDBException; + private native static long openROnly(final long optionsHandle, final String path, + final boolean errorIfWalFileExists) throws RocksDBException; /** * @param optionsHandle Native handle pointing to an Options object @@ -4263,10 +4851,16 @@ * * @throws RocksDBException thrown if the database could not be opened */ - private native static long[] openROnly(final long optionsHandle, - final String path, final byte[][] columnFamilyNames, - final long[] columnFamilyOptions - ) throws RocksDBException; + private native static long[] openROnly(final long optionsHandle, final String path, + final byte[][] columnFamilyNames, final long[] columnFamilyOptions, + final 
boolean errorIfWalFileExists) throws RocksDBException; + + private native static long openAsSecondary(final long optionsHandle, final String path, + final String secondaryPath) throws RocksDBException; + + private native static long[] openAsSecondary(final long optionsHandle, final String path, + final String secondaryPath, final byte[][] columnFamilyNames, + final long[] columnFamilyOptions) throws RocksDBException; @Override protected native void disposeInternal(final long handle); @@ -4287,7 +4881,6 @@ final long handle, final long cfHandle) throws RocksDBException; private native void dropColumnFamilies(final long handle, final long[] cfHandles) throws RocksDBException; - //TODO(AR) best way to express DestroyColumnFamilyHandle? ...maybe in ColumnFamilyHandle? private native void put(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, int valueLength) throws RocksDBException; @@ -4397,6 +4990,12 @@ private native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, final long[] columnFamilyHandles); + + private native void multiGet(final long dbHandle, final long rOptHandle, + final long[] columnFamilyHandles, final ByteBuffer[] keysArray, final int[] keyOffsets, + final int[] keyLengths, final ByteBuffer[] valuesArray, final int[] valuesSizeArray, + final Status[] statusArray); + private native boolean keyMayExist( final long handle, final long cfHandle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); @@ -4426,6 +5025,11 @@ private native int getDirect(long handle, long readOptHandle, ByteBuffer key, int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; + private native boolean keyMayExistDirect(final long handle, final long cfHhandle, + final long readOptHandle, final ByteBuffer key, final int keyOffset, 
final int keyLength); + private native int[] keyMayExistDirectFoundValue(final long handle, final long cfHhandle, + final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength, + final ByteBuffer value, final int valueOffset, final int valueLength); private native void deleteDirect(long handle, long optHandle, ByteBuffer key, int keyOffset, int keyLength, long cfHandle) throws RocksDBException; private native long getLongProperty(final long nativeHandle, @@ -4438,9 +5042,9 @@ private native long[] getApproximateSizes(final long nativeHandle, final long columnFamilyHandle, final long[] rangeSliceHandles, final byte includeFlags); - private final native long[] getApproximateMemTableStats( - final long nativeHandle, final long columnFamilyHandle, - final long rangeStartSliceHandle, final long rangeLimitSliceHandle); + private native long[] getApproximateMemTableStats(final long nativeHandle, + final long columnFamilyHandle, final long rangeStartSliceHandle, + final long rangeLimitSliceHandle); private native void compactRange(final long handle, /* @Nullable */ final byte[] begin, final int beginLen, /* @Nullable */ final byte[] end, final int endLen, @@ -4448,8 +5052,10 @@ throws RocksDBException; private native void setOptions(final long handle, final long cfHandle, final String[] keys, final String[] values) throws RocksDBException; + private native String getOptions(final long handle, final long cfHandle); private native void setDBOptions(final long handle, final String[] keys, final String[] values) throws RocksDBException; + private native String getDBOptions(final long handle); private native String[] compactFiles(final long handle, final long compactionOptionsHandle, final long columnFamilyHandle, @@ -4457,6 +5063,8 @@ final int outputLevel, final int outputPathId, final long compactionJobInfoHandle) throws RocksDBException; + private native void cancelAllBackgroundWork(final long handle, + final boolean wait); private native void 
pauseBackgroundWork(final long handle) throws RocksDBException; private native void continueBackgroundWork(final long handle) @@ -4512,11 +5120,54 @@ private native void startTrace(final long handle, final long maxTraceFileSize, final long traceWriterHandle) throws RocksDBException; private native void endTrace(final long handle) throws RocksDBException; + private native void tryCatchUpWithPrimary(final long handle) throws RocksDBException; private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, boolean include_end) throws RocksDBException; private native static void destroyDB(final String path, final long optionsHandle) throws RocksDBException; - protected DBOptionsInterface options_; + private native static int version(); + + protected DBOptionsInterface options_; + private static Version version; + + public static class Version { + private final byte major; + private final byte minor; + private final byte patch; + + public Version(final byte major, final byte minor, final byte patch) { + this.major = major; + this.minor = minor; + this.patch = patch; + } + + public int getMajor() { + return major; + } + + public int getMinor() { + return minor; + } + + public int getPatch() { + return patch; + } + + @Override + public String toString() { + return getMajor() + "." + getMinor() + "." 
+ getPatch(); + } + + private static Version fromEncodedVersion(int encodedVersion) { + final byte patch = (byte) (encodedVersion & 0xff); + encodedVersion >>= 8; + final byte minor = (byte) (encodedVersion & 0xff); + encodedVersion >>= 8; + final byte major = (byte) (encodedVersion & 0xff); + + return new Version(major, minor, patch); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -102,6 +102,7 @@ @Override final native void seekToLast0(long handle); @Override final native void next0(long handle); @Override final native void prev0(long handle); + @Override final native void refresh0(long handle); @Override final native void seek0(long handle, byte[] target, int targetLen); @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); @Override diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -114,4 +114,14 @@ * native library. */ void status() throws RocksDBException; + + /** + *

      If supported, renew the iterator to represent the latest state. The iterator will be + * invalidated after the call. Not supported if {@link ReadOptions#setSnapshot(Snapshot)} was + * specified when creating the iterator.

      + * + * @throws RocksDBException thrown if the operation is not supported or an error happens in the + * underlying native library + */ + void refresh() throws RocksDBException; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java 2025-05-19 16:14:27.000000000 +0000 @@ -38,4 +38,8 @@ } protected abstract void disposeInternal(final long handle); + + public long getNativeHandle() { + return nativeHandle_; + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,41 @@ +package org.rocksdb; + +public enum SanityLevel { + NONE((byte) 0x0), + LOOSELY_COMPATIBLE((byte) 0x1), + EXACT_MATCH((byte) 0xFF); + + private final byte value; + + SanityLevel(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value. + */ + byte getValue() { + return value; + } + + /** + * Get the SanityLevel from the internal representation value. + * + * @param value the internal representation value. 
+ * + * @return the SanityLevel + * + * @throws IllegalArgumentException if the value does not match a + * SanityLevel + */ + static SanityLevel fromValue(final byte value) throws IllegalArgumentException { + for (final SanityLevel level : SanityLevel.values()) { + if (level.value == value) { + return level; + } + } + throw new IllegalArgumentException("Unknown value for SanityLevel: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -102,6 +102,7 @@ @Override final native void seekToLast0(long handle); @Override final native void next0(long handle); @Override final native void prev0(long handle); + @Override final native void refresh0(long handle) throws RocksDBException; @Override final native void seek0(long handle, byte[] target, int targetLen); @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); @Override final native void status0(long handle) throws RocksDBException; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java 2025-05-19 16:14:27.000000000 +0000 @@ -244,6 +244,7 @@ /** * Return the current file size. * + * @return the current file size. 
* @throws RocksDBException thrown if error happens in underlying * native library. */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,15 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Handle to factory for SstPartitioner. It is used in {@link ColumnFamilyOptions} + */ +public abstract class SstPartitionerFactory extends RocksObject { + protected SstPartitionerFactory(final long nativeHandle) { + super(nativeHandle); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Fixed prefix factory. 
It partitions SST files using fixed prefix of the key. + */ +public class SstPartitionerFixedPrefixFactory extends SstPartitionerFactory { + public SstPartitionerFixedPrefixFactory(long prefixLength) { + super(newSstPartitionerFixedPrefixFactory0(prefixLength)); + } + + private native static long newSstPartitionerFixedPrefixFactory0(long prefixLength); + + @Override protected final native void disposeInternal(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Objects; + /** * Represents the status returned by a function call in RocksDB. * @@ -135,4 +137,19 @@ return value; } } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Status status = (Status) o; + return code == status.code && subCode == status.subCode && Objects.equals(state, status.state); + } + + @Override + public int hashCode() { + return Objects.hash(code, subCode, state); + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,11 @@ super(newSharedStringAppendOperator(delim)); } + public StringAppendOperator(String delim) { 
+ super(newSharedStringAppendOperator(delim)); + } + private native static long newSharedStringAppendOperator(final char delim); + private native static long newSharedStringAppendOperator(final String delim); @Override protected final native void disposeInternal(final long handle); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationBriefInfo { + private final String dbName; + private final String columnFamilyName; + private final String filePath; + private final int jobId; + private final TableFileCreationReason reason; + + /** + * Access is private as this will only be constructed from + * C++ via JNI, either directly of via + * {@link TableFileCreationInfo#TableFileCreationInfo(long, TableProperties, Status, String, + * String, String, int, byte)}. 
+ * + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationBriefInfo(final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + this.dbName = dbName; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.jobId = jobId; + this.reason = TableFileCreationReason.fromValue(tableFileCreationReasonValue); + } + + /** + * Get the name of the database where the file was created. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the name of the column family where the file was created. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the created file. + * + * @return the path. + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job (which could be flush or compaction) that + * created the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the reason for creating the table. + * + * @return the reason for creating the table. 
+ */ + public TableFileCreationReason getReason() { + return reason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationBriefInfo that = (TableFileCreationBriefInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) && reason == that.reason; + } + + @Override + public int hashCode() { + return Objects.hash(dbName, columnFamilyName, filePath, jobId, reason); + } + + @Override + public String toString() { + return "TableFileCreationBriefInfo{" + + "dbName='" + dbName + '\'' + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + ", reason=" + reason + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationInfo extends TableFileCreationBriefInfo { + private final long fileSize; + private final TableProperties tableProperties; + private final Status status; + + /** + * Access is protected as this will only be constructed from + * C++ via JNI. 
+ * + * @param fileSize the size of the table file + * @param tableProperties the properties of the table file + * @param status the status of the creation operation + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationInfo(final long fileSize, final TableProperties tableProperties, + final Status status, final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + super(dbName, columnFamilyName, filePath, jobId, tableFileCreationReasonValue); + this.fileSize = fileSize; + this.tableProperties = tableProperties; + this.status = status; + } + + /** + * Get the size of the file. + * + * @return the size. + */ + public long getFileSize() { + return fileSize; + } + + /** + * Get the detailed properties of the created file. + * + * @return the properties. + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the status indicating whether the creation was successful or not. + * + * @return the status. 
+ */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationInfo that = (TableFileCreationInfo) o; + return fileSize == that.fileSize && Objects.equals(tableProperties, that.tableProperties) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(fileSize, tableProperties, status); + } + + @Override + public String toString() { + return "TableFileCreationInfo{" + + "fileSize=" + fileSize + ", tableProperties=" + tableProperties + ", status=" + status + + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum TableFileCreationReason { + FLUSH((byte) 0x00), + COMPACTION((byte) 0x01), + RECOVERY((byte) 0x02), + MISC((byte) 0x03); + + private final byte value; + + TableFileCreationReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the TableFileCreationReason from the internal representation value. + * + * @return the table file creation reason. 
+ * + * @throws IllegalArgumentException if the value is unknown. + */ + static TableFileCreationReason fromValue(final byte value) { + for (final TableFileCreationReason tableFileCreationReason : TableFileCreationReason.values()) { + if (tableFileCreationReason.value == value) { + return tableFileCreationReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for TableFileCreationReason: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileDeletionInfo { + private final String dbName; + private final String filePath; + private final int jobId; + private final Status status; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + TableFileDeletionInfo( + final String dbName, final String filePath, final int jobId, final Status status) { + this.dbName = dbName; + this.filePath = filePath; + this.jobId = jobId; + this.status = status; + } + + /** + * Get the name of the database where the file was deleted. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the path to the deleted file. + * + * @return the path. 
+ */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job which deleted the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the status indicating whether the deletion was successful or not. + * + * @return the status + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileDeletionInfo that = (TableFileDeletionInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(filePath, that.filePath) && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(dbName, filePath, jobId, status); + } + + @Override + public String toString() { + return "TableFileDeletionInfo{" + + "dbName='" + dbName + '\'' + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + + ", status=" + status + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java 2025-05-19 16:14:27.000000000 +0000 @@ -1,7 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
package org.rocksdb; +import java.util.Arrays; import java.util.Map; +import java.util.Objects; /** * TableProperties contains read-only properties of its associated @@ -27,6 +29,9 @@ private final long columnFamilyId; private final long creationTime; private final long oldestKeyTime; + private final long slowCompressionEstimatedDataSize; + private final long fastCompressionEstimatedDataSize; + private final long externalSstFileGlobalSeqnoOffset; private final byte[] columnFamilyName; private final String filterPolicyName; private final String comparatorName; @@ -36,27 +41,24 @@ private final String compressionName; private final Map userCollectedProperties; private final Map readableProperties; - private final Map propertiesOffsets; /** - * Access is private as this will only be constructed from - * C++ via JNI. + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. */ - private TableProperties(final long dataSize, final long indexSize, - final long indexPartitions, final long topLevelIndexSize, - final long indexKeyIsUserKey, final long indexValueIsDeltaEncoded, - final long filterSize, final long rawKeySize, final long rawValueSize, - final long numDataBlocks, final long numEntries, final long numDeletions, - final long numMergeOperands, final long numRangeDeletions, - final long formatVersion, final long fixedKeyLen, - final long columnFamilyId, final long creationTime, - final long oldestKeyTime, final byte[] columnFamilyName, - final String filterPolicyName, final String comparatorName, - final String mergeOperatorName, final String prefixExtractorName, - final String propertyCollectorsNames, final String compressionName, - final Map userCollectedProperties, - final Map readableProperties, - final Map propertiesOffsets) { + TableProperties(final long dataSize, final long indexSize, final long indexPartitions, + final long topLevelIndexSize, final long indexKeyIsUserKey, + final long indexValueIsDeltaEncoded, final 
long filterSize, final long rawKeySize, + final long rawValueSize, final long numDataBlocks, final long numEntries, + final long numDeletions, final long numMergeOperands, final long numRangeDeletions, + final long formatVersion, final long fixedKeyLen, final long columnFamilyId, + final long creationTime, final long oldestKeyTime, + final long slowCompressionEstimatedDataSize, final long fastCompressionEstimatedDataSize, + final long externalSstFileGlobalSeqnoOffset, final byte[] columnFamilyName, + final String filterPolicyName, final String comparatorName, final String mergeOperatorName, + final String prefixExtractorName, final String propertyCollectorsNames, + final String compressionName, final Map userCollectedProperties, + final Map readableProperties) { this.dataSize = dataSize; this.indexSize = indexSize; this.indexPartitions = indexPartitions; @@ -76,6 +78,9 @@ this.columnFamilyId = columnFamilyId; this.creationTime = creationTime; this.oldestKeyTime = oldestKeyTime; + this.slowCompressionEstimatedDataSize = slowCompressionEstimatedDataSize; + this.fastCompressionEstimatedDataSize = fastCompressionEstimatedDataSize; + this.externalSstFileGlobalSeqnoOffset = externalSstFileGlobalSeqnoOffset; this.columnFamilyName = columnFamilyName; this.filterPolicyName = filterPolicyName; this.comparatorName = comparatorName; @@ -85,7 +90,6 @@ this.compressionName = compressionName; this.userCollectedProperties = userCollectedProperties; this.readableProperties = readableProperties; - this.propertiesOffsets = propertiesOffsets; } /** @@ -269,6 +273,26 @@ } /** + * Get the estimated size of data blocks compressed with a relatively slower + * compression algorithm. + * + * @return 0 means unknown, otherwise the timestamp. + */ + public long getSlowCompressionEstimatedDataSize() { + return slowCompressionEstimatedDataSize; + } + + /** + * Get the estimated size of data blocks compressed with a relatively faster + * compression algorithm. 
+ * + * @return 0 means unknown, otherwise the timestamp. + */ + public long getFastCompressionEstimatedDataSize() { + return fastCompressionEstimatedDataSize; + } + + /** * Get the name of the column family with which this * SST file is associated. * @@ -355,12 +379,48 @@ return readableProperties; } - /** - * The offset of the value of each property in the file. - * - * @return the offset of each property. - */ - public Map getPropertiesOffsets() { - return propertiesOffsets; + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableProperties that = (TableProperties) o; + return dataSize == that.dataSize && indexSize == that.indexSize + && indexPartitions == that.indexPartitions && topLevelIndexSize == that.topLevelIndexSize + && indexKeyIsUserKey == that.indexKeyIsUserKey + && indexValueIsDeltaEncoded == that.indexValueIsDeltaEncoded + && filterSize == that.filterSize && rawKeySize == that.rawKeySize + && rawValueSize == that.rawValueSize && numDataBlocks == that.numDataBlocks + && numEntries == that.numEntries && numDeletions == that.numDeletions + && numMergeOperands == that.numMergeOperands && numRangeDeletions == that.numRangeDeletions + && formatVersion == that.formatVersion && fixedKeyLen == that.fixedKeyLen + && columnFamilyId == that.columnFamilyId && creationTime == that.creationTime + && oldestKeyTime == that.oldestKeyTime + && slowCompressionEstimatedDataSize == that.slowCompressionEstimatedDataSize + && fastCompressionEstimatedDataSize == that.fastCompressionEstimatedDataSize + && externalSstFileGlobalSeqnoOffset == that.externalSstFileGlobalSeqnoOffset + && Arrays.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filterPolicyName, that.filterPolicyName) + && Objects.equals(comparatorName, that.comparatorName) + && Objects.equals(mergeOperatorName, that.mergeOperatorName) + && Objects.equals(prefixExtractorName, that.prefixExtractorName) + 
&& Objects.equals(propertyCollectorsNames, that.propertyCollectorsNames) + && Objects.equals(compressionName, that.compressionName) + && Objects.equals(userCollectedProperties, that.userCollectedProperties) + && Objects.equals(readableProperties, that.readableProperties); + } + + @Override + public int hashCode() { + int result = Objects.hash(dataSize, indexSize, indexPartitions, topLevelIndexSize, + indexKeyIsUserKey, indexValueIsDeltaEncoded, filterSize, rawKeySize, rawValueSize, + numDataBlocks, numEntries, numDeletions, numMergeOperands, numRangeDeletions, formatVersion, + fixedKeyLen, columnFamilyId, creationTime, oldestKeyTime, slowCompressionEstimatedDataSize, + fastCompressionEstimatedDataSize, externalSstFileGlobalSeqnoOffset, filterPolicyName, + comparatorName, mergeOperatorName, prefixExtractorName, propertyCollectorsNames, + compressionName, userCollectedProperties, readableProperties); + result = 31 * result + Arrays.hashCode(columnFamilyName); + return result; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java 2025-05-19 16:14:27.000000000 +0000 @@ -722,6 +722,80 @@ */ TXN_GET_TRY_AGAIN((byte) -0x0D), + /** + * # of files marked as trash by delete scheduler + */ + FILES_MARKED_TRASH((byte) -0x0E), + + /** + * # of files deleted immediately by delete scheduler + */ + FILES_DELETED_IMMEDIATELY((byte) -0x0f), + + /** + * Compaction read and write statistics broken down by CompactionReason + */ + COMPACT_READ_BYTES_MARKED((byte) -0x10), + COMPACT_READ_BYTES_PERIODIC((byte) -0x11), + COMPACT_READ_BYTES_TTL((byte) -0x12), + COMPACT_WRITE_BYTES_MARKED((byte) -0x13), + COMPACT_WRITE_BYTES_PERIODIC((byte) 
-0x14), + COMPACT_WRITE_BYTES_TTL((byte) -0x15), + + /** + * DB error handler statistics + */ + ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16), + ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18), + ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B), + + /** + * Bytes of raw data (payload) found on memtable at flush time. + * Contains the sum of garbage payload (bytes that are discarded + * at flush time) and useful payload (bytes of data that will + * eventually be written to SSTable). + */ + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH((byte) -0x1C), + /** + * Outdated bytes of data present on memtable at flush time. + */ + MEMTABLE_GARBAGE_BYTES_AT_FLUSH((byte) -0x1D), + + /** + * Number of secondary cache hits + */ + SECONDARY_CACHE_HITS((byte) -0x1E), + + /** + * Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. + */ + VERIFY_CHECKSUM_READ_BYTES((byte) -0x1F), + + /** + * Bytes read/written while creating backups + */ + BACKUP_READ_BYTES((byte) -0x20), + BACKUP_WRITE_BYTES((byte) -0x21), + + /** + * Remote compaction read/write statistics + */ + REMOTE_COMPACT_READ_BYTES((byte) -0x22), + REMOTE_COMPACT_WRITE_BYTES((byte) -0x23), + + /** + * Tiered storage related statistics + */ + HOT_FILE_READ_BYTES((byte) -0x24), + WARM_FILE_READ_BYTES((byte) -0x25), + COLD_FILE_READ_BYTES((byte) -0x26), + HOT_FILE_READ_COUNT((byte) -0x27), + WARM_FILE_READ_COUNT((byte) -0x28), + COLD_FILE_READ_COUNT((byte) -0x29), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,7 @@ private final long maxTraceFileSize; public TraceOptions() { - this.maxTraceFileSize = 64 * 1024 * 1024 * 1024; // 64 GB + this.maxTraceFileSize = 64L * 1024L * 1024L * 1024L; // 64 GB } public TraceOptions(final long maxTraceFileSize) { @@ -21,8 +21,8 @@ } /** - * To avoid the trace file size grows large than the storage space, - * user can set the max trace file size in Bytes. Default is 64GB + * To avoid the trace file size grows larger than the storage space, + * user can set the max trace file size in Bytes. Default is 64 GB. * * @return the max trace size */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java 2025-05-19 16:14:27.000000000 +0000 @@ -45,7 +45,7 @@ /** * If a transaction has a snapshot set, the transaction will ensure that - * any keys successfully written(or fetched via {@link #getForUpdate}) have + * any keys successfully written (or fetched via {@link #getForUpdate}) have * not been modified outside of this transaction since the time the snapshot * was set. * @@ -611,9 +611,9 @@ } /** - * Returns an iterator that will iterate on all keys in the default - * column family including both keys in the DB and uncommitted keys in this - * transaction. + * Returns an iterator that will iterate on all keys in the column family + * specified by {@code columnFamilyHandle} including both keys in the DB + * and uncommitted keys in this transaction. 
* * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read * from the DB but will NOT change which keys are read from this transaction @@ -1068,7 +1068,7 @@ * @param columnFamilyHandle The column family to delete the key/value from * @param key the specified key to be deleted. * @param assumeTracked true when it is expected that the key is already - * tracked. More specifically, it means the the key was previous tracked + * tracked. More specifically, it means the key was previously tracked * in the same savepoint, with the same exclusive flag, and at a lower * sequence number. If valid then it skips ValidateSnapshot, * throws an error otherwise. @@ -1152,7 +1152,7 @@ * @param columnFamilyHandle The column family to delete the key/value from * @param keyParts the specified key to be deleted. * @param assumeTracked true when it is expected that the key is already - * tracked. More specifically, it means the the key was previous tracked + * tracked. More specifically, it means the key was previously tracked * in the same savepoint, with the same exclusive flag, and at a lower * sequence number. If valid then it skips ValidateSnapshot, * throws an error otherwise. 
@@ -1788,11 +1788,17 @@ AWAITING_PREPARE((byte)1), PREPARED((byte)2), AWAITING_COMMIT((byte)3), - COMMITED((byte)4), + COMMITTED((byte)4), AWAITING_ROLLBACK((byte)5), ROLLEDBACK((byte)6), LOCKS_STOLEN((byte)7); + /* + * Keep old misspelled variable as alias + * Tip from https://stackoverflow.com/a/37092410/454544 + */ + public static final TransactionState COMMITED = COMMITTED; + private final byte value; TransactionState(final byte value) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,7 +6,6 @@ package org.rocksdb; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,7 @@ package org.rocksdb; - -interface TransactionalDB - extends AutoCloseable { - +interface TransactionalDB> extends AutoCloseable { /** * Starts a new Transaction. 
* diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -113,7 +113,7 @@ throws RocksDBException { if (columnFamilyDescriptors.size() != ttlValues.size()) { throw new IllegalArgumentException("There must be a ttl value per column" - + "family handle."); + + " family handle."); } final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -46,6 +46,7 @@ @Override final native void seekToLast0(long handle); @Override final native void next0(long handle); @Override final native void prev0(long handle); + @Override final native void refresh0(final long handle) throws RocksDBException; @Override final native void seek0(long handle, byte[] target, int targetLen); @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); @Override final native void status0(long handle) throws RocksDBException; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java 2025-05-19 16:14:27.000000000 +0000 @@ -243,7 +243,7 @@ @Override final native void singleDelete(final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; @Override - final native void removeDirect(final long handle, final ByteBuffer key, final int keyOffset, + final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; @Override final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, @@ -321,6 +321,8 @@ throws RocksDBException; public abstract void markCommit(final byte[] xid) throws RocksDBException; + public abstract void markCommitWithTimestamp(final byte[] xid, final byte[] ts) + throws RocksDBException; /** * shouldContinue is called by the underlying iterator diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,8 +39,8 @@ * @param value the value associated with the specified key. * @throws RocksDBException thrown if error happens in underlying native library. */ - void put(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value) throws RocksDBException; + void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) + throws RocksDBException; /** *

      Store the mapping "key->value" within given column @@ -50,9 +50,9 @@ * Supports direct buffer only. * @param value the value associated with the specified key. It is using position and limit. * Supports direct buffer only. - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying native library. */ - void put(ByteBuffer key, ByteBuffer value) throws RocksDBException; + void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException; /** *

      Store the mapping "key->value" within given column @@ -64,9 +64,9 @@ * Supports direct buffer only. * @param value the value associated with the specified key. It is using position and limit. * Supports direct buffer only. - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying native library. */ - void put(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key, ByteBuffer value) + void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value) throws RocksDBException; /** @@ -90,8 +90,8 @@ * the specified key. * @throws RocksDBException thrown if error happens in underlying native library. */ - void merge(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value) throws RocksDBException; + void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) + throws RocksDBException; /** *

      If the database contains a mapping for "key", erase it. Else do nothing.

      @@ -114,7 +114,31 @@ * @throws RocksDBException thrown if error happens in underlying native library. */ @Deprecated - void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @deprecated Use {@link #delete(ByteBuffer)} + * @throws RocksDBException thrown if error happens in underlying native library. + */ + @Deprecated void remove(final ByteBuffer key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @deprecated Use {@link #delete(ColumnFamilyHandle, ByteBuffer)} + * @throws RocksDBException thrown if error happens in underlying native library. + */ + @Deprecated + void remove(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) throws RocksDBException; /** @@ -132,7 +156,28 @@ * @param key Key to delete within database * @throws RocksDBException thrown if error happens in underlying native library. */ - void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @throws RocksDBException thrown if error happens in underlying native library. + */ + void delete(final ByteBuffer key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @throws RocksDBException thrown if error happens in underlying native library. + */ + void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) throws RocksDBException; /** @@ -182,27 +227,8 @@ * native library. */ @Experimental("Performance optimization for a very specific workload") - void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key) throws RocksDBException; - - /** - *

      If column family contains a mapping for "key", erase it. Else do nothing.

      - * - * @param key Key to delete within database. It is using position and limit. - * Supports direct buffer only. - * @throws RocksDBException - */ - void remove(ByteBuffer key) throws RocksDBException; - - /** - *

      If column family contains a mapping for "key", erase it. Else do nothing.

      - * - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param key Key to delete within database. It is using position and limit. - * Supports direct buffer only. - * @throws RocksDBException - */ - void remove(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key) throws RocksDBException; + void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) + throws RocksDBException; /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., @@ -237,8 +263,8 @@ * Last key to delete within database (excluded) * @throws RocksDBException thrown if error happens in underlying native library. */ - void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, - byte[] endKey) throws RocksDBException; + void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] endKey) + throws RocksDBException; /** * Append a blob of arbitrary size to the records in this batch. The blob will diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java 2025-05-19 16:14:27.000000000 +0000 @@ -117,7 +117,7 @@ * as a delta and baseIterator as a base * * Updating write batch with the current key of the iterator is not safe. - * We strongly recommand users not to do it. It will invalidate the current + * We strongly recommend users not to do it. It will invalidate the current * key() and value() of the iterator. This invalidation happens even before * the write batch update finishes. The state may recover after Next() is * called. 
@@ -131,11 +131,36 @@ public RocksIterator newIteratorWithBase( final ColumnFamilyHandle columnFamilyHandle, final RocksIterator baseIterator) { - RocksIterator iterator = new RocksIterator(baseIterator.parent_, - iteratorWithBase( - nativeHandle_, columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_)); + return newIteratorWithBase(columnFamilyHandle, baseIterator, null); + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base + * + * Updating write batch with the current key of the iterator is not safe. + * We strongly recommend users not to do it. It will invalidate the current + * key() and value() of the iterator. This invalidation happens even before + * the write batch update finishes. The state may recover after Next() is + * called. + * + * @param columnFamilyHandle The column family to iterate over + * @param baseIterator The base iterator, + * e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @param readOptions the read options, or null + * @return An iterator which shows a view comprised of both the database + * point-in-time from baseIterator and modifications made in this write batch. + */ + public RocksIterator newIteratorWithBase(final ColumnFamilyHandle columnFamilyHandle, + final RocksIterator baseIterator, /* @Nullable */ final ReadOptions readOptions) { + final RocksIterator iterator = new RocksIterator(baseIterator.parent_, + iteratorWithBase(nativeHandle_, columnFamilyHandle.nativeHandle_, + baseIterator.nativeHandle_, readOptions == null ? 0 : readOptions.nativeHandle_)); + // when the iterator is deleted it will also delete the baseIterator baseIterator.disOwnNativeHandle(); + return iterator; } @@ -151,7 +176,25 @@ * point-in-timefrom baseIterator and modifications made in this write batch. 
*/ public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) { - return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator); + return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator, null); + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base. Operates on the default column + * family. + * + * @param baseIterator The base iterator, + * e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @param readOptions the read options, or null + * @return An iterator which shows a view comprised of both the database + * point-in-timefrom baseIterator and modifications made in this write batch. + */ + public RocksIterator newIteratorWithBase(final RocksIterator baseIterator, + /* @Nullable */ final ReadOptions readOptions) { + return newIteratorWithBase( + baseIterator.parent_.getDefaultColumnFamily(), baseIterator, readOptions); } /** @@ -200,7 +243,7 @@ * the results using the DB's merge operator (if the batch contains any * merge requests). * - * Setting {@link ReadOptions#setSnapshot(long, long)} will affect what is + * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is * read from the DB but will NOT change which keys are read from the batch * (the keys in this batch do not yet belong to any snapshot and will be * fetched regardless). @@ -230,7 +273,7 @@ * the results using the DB's merge operator (if the batch contains any * merge requests). * - * Setting {@link ReadOptions#setSnapshot(long, long)} will affect what is + * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is * read from the DB but will NOT change which keys are read from the batch * (the keys in this batch do not yet belong to any snapshot and will be * fetched regardless). 
@@ -275,7 +318,7 @@ @Override final native void singleDelete(final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; @Override - final native void removeDirect(final long handle, final ByteBuffer key, final int keyOffset, + final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; // DO NOT USE - `WriteBatchWithIndex::deleteRange` is not yet supported @Override @@ -303,8 +346,8 @@ final boolean overwriteKey); private native long iterator0(final long handle); private native long iterator1(final long handle, final long cfHandle); - private native long iteratorWithBase( - final long handle, final long baseIteratorHandle, final long cfHandle); + private native long iteratorWithBase(final long handle, final long baseIteratorHandle, + final long cfHandle, final long readOptionsHandle); private native byte[] getFromBatch(final long handle, final long optHandle, final byte[] key, final int keyLen); private native byte[] getFromBatch(final long handle, final long optHandle, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java 2025-05-19 16:14:27.000000000 +0000 @@ -22,12 +22,29 @@ * * @param bufferSizeBytes buffer size(in bytes) to use for native write_buffer_manager * @param cache cache whose memory should be bounded by this write buffer manager + * @param allowStall if set true, it will enable stalling of writes when memory_usage() exceeds + * buffer_size. + * It will wait for flush to complete and memory usage to drop down. 
*/ + public WriteBufferManager( + final long bufferSizeBytes, final Cache cache, final boolean allowStall) { + super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_, allowStall)); + this.allowStall_ = allowStall; + } + public WriteBufferManager(final long bufferSizeBytes, final Cache cache){ - super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_)); + this(bufferSizeBytes, cache, false); + } + + public boolean allowStall() { + return allowStall_; } - private native static long newWriteBufferManager(final long bufferSizeBytes, final long cacheHandle); + private native static long newWriteBufferManager( + final long bufferSizeBytes, final long cacheHandle, final boolean allowStall); + @Override protected native void disposeInternal(final long handle); + + private boolean allowStall_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -171,7 +171,7 @@ /** * If true, this write request is of lower priority if compaction is - * behind. In this case that, {@link #noSlowdown()} == true, the request + * behind. In the case that, {@link #noSlowdown()} == true, the request * will be cancelled immediately with {@link Status.Code#Incomplete} returned. * Otherwise, it will be slowed down. The slowdown value is determined by * RocksDB to guarantee it introduces minimum impacts to high priority writes. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum WriteStallCondition { + NORMAL((byte) 0x0), + DELAYED((byte) 0x1), + STOPPED((byte) 0x2); + + private final byte value; + + WriteStallCondition(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the WriteStallCondition from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static WriteStallCondition fromValue(final byte value) { + for (final WriteStallCondition writeStallCondition : WriteStallCondition.values()) { + if (writeStallCondition.value == value) { + return writeStallCondition; + } + } + + throw new IllegalArgumentException("Illegal value provided for WriteStallCondition: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class WriteStallInfo { + private final String columnFamilyName; + private final WriteStallCondition currentCondition; + private final WriteStallCondition previousCondition; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + WriteStallInfo(final String columnFamilyName, final byte currentConditionValue, + final byte previousConditionValue) { + this.columnFamilyName = columnFamilyName; + this.currentCondition = WriteStallCondition.fromValue(currentConditionValue); + this.previousCondition = WriteStallCondition.fromValue(previousConditionValue); + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the current state of the write controller. 
+ * + * @return the current state. + */ + public WriteStallCondition getCurrentCondition() { + return currentCondition; + } + + /** + * Get the previous state of the write controller. + * + * @return the previous state. + */ + public WriteStallCondition getPreviousCondition() { + return previousCondition; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + WriteStallInfo that = (WriteStallInfo) o; + return Objects.equals(columnFamilyName, that.columnFamilyName) + && currentCondition == that.currentCondition && previousCondition == that.previousCondition; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, currentCondition, previousCondition); + } + + @Override + public String toString() { + return "WriteStallInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", currentCondition=" + currentCondition + + ", previousCondition=" + previousCondition + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java 2025-05-19 16:14:27.000000000 +0000 @@ -1,7 +1,6 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
package org.rocksdb.util; -import java.io.File; import java.io.IOException; public class Environment { @@ -106,12 +105,22 @@ if (isPowerPC() || isAarch64()) { return String.format("%sjni-linux-%s%s", name, ARCH, getLibcPostfix()); } else if (isS390x()) { - return String.format("%sjni-linux%s", name, ARCH); + return String.format("%sjni-linux-%s", name, ARCH); } else { return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix()); } } else if (isMac()) { - return String.format("%sjni-osx", name); + if (is64Bit()) { + final String arch; + if (isAarch64()) { + arch = "arm64"; + } else { + arch = "x86_64"; + } + return String.format("%sjni-osx-%s", name, arch); + } else { + return String.format("%sjni-osx", name); + } } else if (isFreeBSD()) { return String.format("%sjni-freebsd%s", name, is64Bit() ? "64" : "32"); } else if (isAix() && is64Bit()) { @@ -128,10 +137,25 @@ throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name)); } + public static /*@Nullable*/ String getFallbackJniLibraryName(final String name) { + if (isMac() && is64Bit()) { + return String.format("%sjni-osx", name); + } + return null; + } + public static String getJniLibraryFileName(final String name) { return appendLibOsSuffix("lib" + getJniLibraryName(name), false); } + public static /*@Nullable*/ String getFallbackJniLibraryFileName(final String name) { + final String fallbackJniLibraryName = getFallbackJniLibraryName(name); + if (fallbackJniLibraryName == null) { + return null; + } + return appendLibOsSuffix("lib" + fallbackJniLibraryName, false); + } + private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) { return libraryFileName + ".so"; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,313 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.FilenameFilter; +import java.util.*; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class BlobOptionsTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + final int minBlobSize = 65536; + final int largeBlobSize = 65536 * 2; + + /** + * Count the files in the temporary folder which end with a particular suffix + * Used to query the state of a test database to check if it is as the test expects + * + * @param endsWith the suffix to match + * @return the number of files with a matching suffix + */ + @SuppressWarnings("CallToStringConcatCanBeReplacedByOperator") + private int countDBFiles(final String endsWith) { + return Objects + .requireNonNull(dbFolder.getRoot().list(new FilenameFilter() { + @Override + public boolean accept(File dir, String name) { + return name.endsWith(endsWith); + } + })) + .length; + } + + @SuppressWarnings("SameParameterValue") + private byte[] small_key(String suffix) { + return ("small_key_" + 
suffix).getBytes(UTF_8); + } + + @SuppressWarnings("SameParameterValue") + private byte[] small_value(String suffix) { + return ("small_value_" + suffix).getBytes(UTF_8); + } + + private byte[] large_key(String suffix) { + return ("large_key_" + suffix).getBytes(UTF_8); + } + + private byte[] large_value(String repeat) { + final byte[] large_value = ("" + repeat + "_" + largeBlobSize + "b").getBytes(UTF_8); + final byte[] large_buffer = new byte[largeBlobSize]; + for (int pos = 0; pos < largeBlobSize; pos += large_value.length) { + int numBytes = Math.min(large_value.length, large_buffer.length - pos); + System.arraycopy(large_value, 0, large_buffer, pos, numBytes); + } + return large_buffer; + } + + @Test + public void blobOptions() { + try (final Options options = new Options()) { + assertThat(options.enableBlobFiles()).isEqualTo(false); + assertThat(options.minBlobSize()).isEqualTo(0); + assertThat(options.blobCompressionType()).isEqualTo(CompressionType.NO_COMPRESSION); + assertThat(options.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(options.blobFileSize()).isEqualTo(268435456L); + assertThat(options.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(options.blobGarbageCollectionForceThreshold()).isEqualTo(1.0); + + assertThat(options.setEnableBlobFiles(true)).isEqualTo(options); + assertThat(options.setMinBlobSize(132768L)).isEqualTo(options); + assertThat(options.setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION)) + .isEqualTo(options); + assertThat(options.setEnableBlobGarbageCollection(true)).isEqualTo(options); + assertThat(options.setBlobFileSize(132768L)).isEqualTo(options); + assertThat(options.setBlobGarbageCollectionAgeCutoff(0.89)).isEqualTo(options); + assertThat(options.setBlobGarbageCollectionForceThreshold(0.80)).isEqualTo(options); + + assertThat(options.enableBlobFiles()).isEqualTo(true); + assertThat(options.minBlobSize()).isEqualTo(132768L); + 
assertThat(options.blobCompressionType()).isEqualTo(CompressionType.BZLIB2_COMPRESSION); + assertThat(options.enableBlobGarbageCollection()).isEqualTo(true); + assertThat(options.blobFileSize()).isEqualTo(132768L); + assertThat(options.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89); + assertThat(options.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + } + } + + @Test + public void blobColumnFamilyOptions() { + try (final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions()) { + assertThat(columnFamilyOptions.enableBlobFiles()).isEqualTo(false); + assertThat(columnFamilyOptions.minBlobSize()).isEqualTo(0); + assertThat(columnFamilyOptions.blobCompressionType()) + .isEqualTo(CompressionType.NO_COMPRESSION); + assertThat(columnFamilyOptions.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(columnFamilyOptions.blobFileSize()).isEqualTo(268435456L); + assertThat(columnFamilyOptions.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(columnFamilyOptions.blobGarbageCollectionForceThreshold()).isEqualTo(1.0); + + assertThat(columnFamilyOptions.setEnableBlobFiles(true)).isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setMinBlobSize(132768L)).isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION)) + .isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setEnableBlobGarbageCollection(true)) + .isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobFileSize(132768L)).isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobGarbageCollectionAgeCutoff(0.89)) + .isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobGarbageCollectionForceThreshold(0.80)) + .isEqualTo(columnFamilyOptions); + + assertThat(columnFamilyOptions.enableBlobFiles()).isEqualTo(true); + assertThat(columnFamilyOptions.minBlobSize()).isEqualTo(132768L); + assertThat(columnFamilyOptions.blobCompressionType()) + 
.isEqualTo(CompressionType.BZLIB2_COMPRESSION); + assertThat(columnFamilyOptions.enableBlobGarbageCollection()).isEqualTo(true); + assertThat(columnFamilyOptions.blobFileSize()).isEqualTo(132768L); + assertThat(columnFamilyOptions.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89); + assertThat(columnFamilyOptions.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + } + } + + @Test + public void blobMutableColumnFamilyOptionsBuilder() { + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder = + MutableColumnFamilyOptions.builder(); + builder.setEnableBlobFiles(true) + .setMinBlobSize(1024) + .setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION) + .setEnableBlobGarbageCollection(true) + .setBlobGarbageCollectionAgeCutoff(0.89) + .setBlobGarbageCollectionForceThreshold(0.80) + .setBlobFileSize(132768); + + assertThat(builder.enableBlobFiles()).isEqualTo(true); + assertThat(builder.minBlobSize()).isEqualTo(1024); + assertThat(builder.blobCompressionType()).isEqualTo(CompressionType.BZLIB2_COMPRESSION); + assertThat(builder.enableBlobGarbageCollection()).isEqualTo(true); + assertThat(builder.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89); + assertThat(builder.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder.blobFileSize()).isEqualTo(132768); + + builder.setEnableBlobFiles(false) + .setMinBlobSize(4096) + .setBlobCompressionType(CompressionType.LZ4_COMPRESSION) + .setEnableBlobGarbageCollection(false) + .setBlobGarbageCollectionAgeCutoff(0.91) + .setBlobGarbageCollectionForceThreshold(0.96) + .setBlobFileSize(2048); + + assertThat(builder.enableBlobFiles()).isEqualTo(false); + assertThat(builder.minBlobSize()).isEqualTo(4096); + assertThat(builder.blobCompressionType()).isEqualTo(CompressionType.LZ4_COMPRESSION); + assertThat(builder.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(builder.blobGarbageCollectionAgeCutoff()).isEqualTo(0.91); + 
assertThat(builder.blobGarbageCollectionForceThreshold()).isEqualTo(0.96); + assertThat(builder.blobFileSize()).isEqualTo(2048); + + final MutableColumnFamilyOptions options = builder.build(); + assertThat(options.getKeys()) + .isEqualTo(new String[] {"enable_blob_files", "min_blob_size", "blob_compression_type", + "enable_blob_garbage_collection", "blob_garbage_collection_age_cutoff", + "blob_garbage_collection_force_threshold", "blob_file_size"}); + assertThat(options.getValues()) + .isEqualTo( + new String[] {"false", "4096", "LZ4_COMPRESSION", "false", "0.91", "0.96", "2048"}); + } + + /** + * Configure the default column family with BLOBs. + * Confirm that BLOBs are generated when appropriately-sized writes are flushed. + * + * @throws RocksDBException if a db access throws an exception + */ + @Test + public void testBlobWriteAboveThreshold() throws RocksDBException { + try (final Options options = new Options() + .setCreateIfMissing(true) + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true); + + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put(small_key("default"), small_value("default")); + db.flush(new FlushOptions().setWaitForFlush(true)); + + // check there are no blobs in the database + assertThat(countDBFiles(".sst")).isEqualTo(1); + assertThat(countDBFiles(".blob")).isEqualTo(0); + + db.put(large_key("default"), large_value("default")); + db.flush(new FlushOptions().setWaitForFlush(true)); + + // wrote and flushed a value larger than the blobbing threshold + // check there is a single blob in the database + assertThat(countDBFiles(".sst")).isEqualTo(2); + assertThat(countDBFiles(".blob")).isEqualTo(1); + + assertThat(db.get(small_key("default"))).isEqualTo(small_value("default")); + assertThat(db.get(large_key("default"))).isEqualTo(large_value("default")); + + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder fetchOptions = + db.getOptions(null); + 
assertThat(fetchOptions.minBlobSize()).isEqualTo(minBlobSize); + assertThat(fetchOptions.enableBlobFiles()).isEqualTo(true); + assertThat(fetchOptions.writeBufferSize()).isEqualTo(64 << 20); + } + } + + /** + * Configure 2 column families respectively with and without BLOBs. + * Confirm that BLOB files are generated (once the DB is flushed) only for the appropriate column + * family. + * + * @throws RocksDBException if a db access throws an exception + */ + @Test + public void testBlobWriteAboveThresholdCF() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + db.put(columnFamilyHandles.get(0), small_key("default"), small_value("default")); + db.flush(new FlushOptions().setWaitForFlush(true)); + + assertThat(countDBFiles(".blob")).isEqualTo(0); + + try (final ColumnFamilyOptions columnFamilyOptions1 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(true); + + final ColumnFamilyOptions columnFamilyOptions2 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(false)) { + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + + // Create the first column family with blob options + db.createColumnFamily(columnFamilyDescriptor1); + + // Create the second column 
family with not-blob options + db.createColumnFamily(columnFamilyDescriptor2); + } + } + + // Now re-open after auto-close - at this point the CF options we use are recognized. + try (final ColumnFamilyOptions columnFamilyOptions1 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(true); + + final ColumnFamilyOptions columnFamilyOptions2 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(false)) { + assertThat(columnFamilyOptions1.enableBlobFiles()).isEqualTo(true); + assertThat(columnFamilyOptions1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(columnFamilyOptions2.enableBlobFiles()).isEqualTo(false); + assertThat(columnFamilyOptions1.minBlobSize()).isEqualTo(minBlobSize); + + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + columnFamilyDescriptors = new ArrayList<>(); + columnFamilyDescriptors.add(columnFamilyDescriptor0); + columnFamilyDescriptors.add(columnFamilyDescriptor1); + columnFamilyDescriptors.add(columnFamilyDescriptor2); + columnFamilyHandles = new ArrayList<>(); + + assertThat(columnFamilyDescriptor1.getOptions().enableBlobFiles()).isEqualTo(true); + assertThat(columnFamilyDescriptor2.getOptions().enableBlobFiles()).isEqualTo(false); + + try (final DBOptions dbOptions = new DBOptions(); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = + db.getOptions(columnFamilyHandles.get(1)); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 = + 
db.getOptions(columnFamilyHandles.get(2)); + assertThat(builder2.enableBlobFiles()).isEqualTo(false); + assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize); + + db.put(columnFamilyHandles.get(1), large_key("column_family_1_k2"), + large_value("column_family_1_k2")); + db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(1)); + assertThat(countDBFiles(".blob")).isEqualTo(1); + + db.put(columnFamilyHandles.get(2), large_key("column_family_2_k2"), + large_value("column_family_2_k2")); + db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(2)); + assertThat(countDBFiles(".blob")).isEqualTo(1); + } + } + } +} \ No newline at end of file diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,16 +5,16 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.fail; + +import java.nio.charset.StandardCharsets; import org.junit.ClassRule; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.nio.charset.StandardCharsets; - -import static org.assertj.core.api.Assertions.assertThat; - public class BlockBasedTableConfigTest { @ClassRule @@ -35,9 +35,10 @@ @Test public void cacheIndexAndFilterBlocksWithHighPriority() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(true); assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()). 
isTrue(); + blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(false); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()).isFalse(); } @Test @@ -59,7 +60,7 @@ @Test public void indexType() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assertThat(IndexType.values().length).isEqualTo(3); + assertThat(IndexType.values().length).isEqualTo(4); blockBasedTableConfig.setIndexType(IndexType.kHashSearch); assertThat(blockBasedTableConfig.indexType().equals( IndexType.kHashSearch)); @@ -83,7 +84,7 @@ @Test public void checksumType() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assertThat(ChecksumType.values().length).isEqualTo(3); + assertThat(ChecksumType.values().length).isEqualTo(4); assertThat(ChecksumType.valueOf("kxxHash")). isEqualTo(ChecksumType.kxxHash); blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); @@ -259,6 +260,13 @@ } @Test + public void optimizeFiltersForMemory() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setOptimizeFiltersForMemory(true); + assertThat(blockBasedTableConfig.optimizeFiltersForMemory()).isTrue(); + } + + @Test public void useDeltaEncoding() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setUseDeltaEncoding(false); @@ -296,6 +304,7 @@ @Test public void verifyCompression() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(blockBasedTableConfig.verifyCompression()).isFalse(); blockBasedTableConfig.setVerifyCompression(true); assertThat(blockBasedTableConfig.verifyCompression()). 
isTrue(); @@ -312,7 +321,7 @@ @Test public void formatVersion() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - for (int version = 0; version < 5; version++) { + for (int version = 0; version <= 5; version++) { blockBasedTableConfig.setFormatVersion(version); assertThat(blockBasedTableConfig.formatVersion()).isEqualTo(version); } @@ -324,10 +333,15 @@ blockBasedTableConfig.setFormatVersion(-1); } - @Test(expected = AssertionError.class) - public void formatVersionFailIllegalVersion() { - final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setFormatVersion(99); + @Test(expected = RocksDBException.class) + public void invalidFormatVersion() throws RocksDBException { + final BlockBasedTableConfig blockBasedTableConfig = + new BlockBasedTableConfig().setFormatVersion(99999); + + try (final Options options = new Options().setTableFormatConfig(blockBasedTableConfig); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + fail("Opening the database with an invalid format_version should have raised an exception"); + } } @Test @@ -346,6 +360,14 @@ isTrue(); } + @Test + public void indexShortening() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparatorsAndSuccessor); + assertThat(blockBasedTableConfig.indexShortening()) + .isEqualTo(IndexShorteningMode.kShortenSeparatorsAndSuccessor); + } + @Deprecated @Test public void hashIndexAllowCollision() { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.junit.Assert.assertArrayEquals; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.BytewiseComparator; + +/** + * This test confirms that the following issues were in fact resolved + * by a change made between 6.2.2 and 6.22.1, + * to wit {@link ...} + * which as part of its effect, changed the Java bytewise comparators. + * + * {@link ...} + * {@link ...} + */ +public class BytewiseComparatorRegressionTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule public TemporaryFolder temporarySSTFolder = new TemporaryFolder(); + + private final static byte[][] testData = {{10, -11, 13}, {10, 11, 12}, {10, 11, 14}}; + private final static byte[][] orderedData = {{10, 11, 12}, {10, 11, 14}, {10, -11, 13}}; + + /** + * {@link ...} + */ + @Test + public void testJavaComparator() throws RocksDBException { + final BytewiseComparator comparator = new BytewiseComparator(new ComparatorOptions()); + performTest(new Options().setCreateIfMissing(true).setComparator(comparator)); + } + + @Test + public void testDefaultComparator() throws RocksDBException { + performTest(new Options().setCreateIfMissing(true)); + } + + /** + * {@link ...} + */ + @Test + public void testCppComparator() throws 
RocksDBException { + performTest(new Options().setCreateIfMissing(true).setComparator( + BuiltinComparator.BYTEWISE_COMPARATOR)); + } + + private void performTest(final Options options) throws RocksDBException { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + for (final byte[] item : testData) { + db.put(item, item); + } + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + final ArrayList result = new ArrayList<>(); + while (iterator.isValid()) { + result.add(iterator.key()); + iterator.next(); + } + assertArrayEquals(orderedData, result.toArray()); + } + } + } + + private byte[] hexToByte(final String hexString) { + final byte[] bytes = new byte[hexString.length() / 2]; + if (bytes.length * 2 < hexString.length()) { + throw new RuntimeException("Hex string has odd length: " + hexString); + } + + for (int i = 0; i < bytes.length; i++) { + final int firstDigit = toDigit(hexString.charAt(i + i)); + final int secondDigit = toDigit(hexString.charAt(i + i + 1)); + bytes[i] = (byte) ((firstDigit << 4) + secondDigit); + } + + return bytes; + } + + private int toDigit(final char hexChar) { + final int digit = Character.digit(hexChar, 16); + if (digit == -1) { + throw new IllegalArgumentException("Invalid Hexadecimal Character: " + hexChar); + } + return digit; + } + + /** + * {@link ...} + * + * @throws RocksDBException if something goes wrong, or if the regression occurs + * @throws IOException if we can't make the temporary file + */ + @Test + public void testSST() throws RocksDBException, IOException { + final File tempSSTFile = temporarySSTFolder.newFile("test_file_with_weird_keys.sst"); + + final EnvOptions envOpts = new EnvOptions(); + final Options opts = new Options(); + final SstFileWriter writer = + new SstFileWriter(envOpts, opts, new BytewiseComparator(new ComparatorOptions())); + writer.open(tempSSTFile.getAbsolutePath()); + final byte[] gKey = + 
hexToByte("000000293030303030303030303030303030303030303032303736343730696E666F33"); + final byte[] wKey = + hexToByte("0000008d3030303030303030303030303030303030303030303437363433696e666f34"); + writer.add(new Slice(gKey), new Slice("dummyV1")); + writer.add(new Slice(wKey), new Slice("dummyV2")); + writer.finish(); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,17 +5,17 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.*; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; -import java.util.Random; - -import static org.assertj.core.api.Assertions.assertThat; - public class ColumnFamilyOptionsTest { @ClassRule @@ -55,6 +55,27 @@ } @Test + public void getColumnFamilyOptionsFromPropsWithIgnoreIllegalValue() { + // setup sample properties + final Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + properties.put("write_buffer_size", "112"); + properties.put("max_write_buffer_number", "13"); + + try (final ConfigOptions cfgOpts = new ConfigOptions().setIgnoreUnknownOptions(true); + final ColumnFamilyOptions opt = + ColumnFamilyOptions.getColumnFamilyOptionsFromProps(cfgOpts, properties)) { + // setup sample properties + 
assertThat(opt).isNotNull(); + assertThat(String.valueOf(opt.writeBufferSize())) + .isEqualTo(properties.get("write_buffer_size")); + assertThat(String.valueOf(opt.maxWriteBufferNumber())) + .isEqualTo(properties.get("max_write_buffer_number")); + } + } + + @Test public void failColumnFamilyOptionsFromPropsWithIllegalValue() { // setup sample properties final Properties properties = new Properties(); @@ -569,6 +590,14 @@ } @Test + public void periodicCompactionSeconds() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + options.setPeriodicCompactionSeconds(1000 * 60); + assertThat(options.periodicCompactionSeconds()).isEqualTo(1000 * 60); + } + } + + @Test public void compactionOptionsUniversal() { try (final ColumnFamilyOptions opt = new ColumnFamilyOptions(); final CompactionOptionsUniversal optUni = new CompactionOptionsUniversal() @@ -622,4 +651,46 @@ } } + @Test + public void compactionThreadLimiter() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions(); + final ConcurrentTaskLimiter compactionThreadLimiter = + new ConcurrentTaskLimiterImpl("name", 3)) { + options.setCompactionThreadLimiter(compactionThreadLimiter); + assertThat(options.compactionThreadLimiter()).isEqualTo(compactionThreadLimiter); + } + } + + @Test + public void oldDefaults() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + options.oldDefaults(4, 6); + assertEquals(4 << 20, options.writeBufferSize()); + assertThat(options.compactionPriority()).isEqualTo(CompactionPriority.ByCompensatedSize); + assertThat(options.targetFileSizeBase()).isEqualTo(2 * 1048576); + assertThat(options.maxBytesForLevelBase()).isEqualTo(10 * 1048576); + assertThat(options.softPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.hardPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.level0StopWritesTrigger()).isEqualTo(24); + } + } + + @Test + public void optimizeForSmallDbWithCache() { + try (final ColumnFamilyOptions 
options = new ColumnFamilyOptions(); + final Cache cache = new LRUCache(1024)) { + assertThat(options.optimizeForSmallDb(cache)).isEqualTo(options); + } + } + + @Test + public void cfPaths() throws IOException { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + final List paths = Arrays.asList( + new DbPath(Paths.get("test1"), 2 << 25), new DbPath(Paths.get("/test2/path"), 2 << 25)); + assertThat(options.cfPaths()).isEqualTo(Collections.emptyList()); + assertThat(options.setCfPaths(paths)).isEqualTo(options); + assertThat(options.cfPaths()).isEqualTo(paths); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,16 +5,17 @@ package org.rocksdb; -import java.util.*; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import java.util.*; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; - public class ColumnFamilyTest { @ClassRule @@ -75,6 +76,7 @@ assertThat(cfh.getName()).isEqualTo("default".getBytes(UTF_8)); assertThat(cfh.getID()).isEqualTo(0); + assertThat(cfh.getDescriptor().getName()).isEqualTo("default".getBytes(UTF_8)); final byte[] key = "key".getBytes(); final byte[] value = "value".getBytes(); @@ -140,33 +142,19 @@ final RocksDB db = RocksDB.open(options, 
dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { - - try { - assertThat(columnFamilyHandleList.size()).isEqualTo(2); - db.put("dfkey1".getBytes(), "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), - "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), - "newcfvalue".getBytes()); - - String retVal = new String(db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes())); - assertThat(retVal).isEqualTo("newcfvalue"); - assertThat((db.get(columnFamilyHandleList.get(1), - "dfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); - assertThat((db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(0), new WriteOptions(), - "dfkey2".getBytes()); - assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "dfkey2".getBytes())).isNull(); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + db.put("dfkey1".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), "newcfvalue".getBytes()); + + String retVal = new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes())); + assertThat(retVal).isEqualTo("newcfvalue"); + assertThat((db.get(columnFamilyHandleList.get(1), "dfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); + assertThat((db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(0), new WriteOptions(), "dfkey2".getBytes()); + assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), "dfkey2".getBytes())) + .isNull(); } } @@ -183,30 +171,22 @@ final RocksDB db = 
RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), new WriteOptions(), - "key1".getBytes(), "value".getBytes()); - db.put("key2".getBytes(), "12345678".getBytes()); - final byte[] outValue = new byte[5]; - // not found value - int getResult = db.get("keyNotFound".getBytes(), outValue); - assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); - // found value which fits in outValue - getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), - outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("value".getBytes()); - // found value which fits partially - getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "key2".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("12345".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put( + columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + final byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = + db.get(columnFamilyHandleList.get(0), new ReadOptions(), "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); } } @@ -222,22 +202,12 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, 
columnFamilyHandleList)) { - ColumnFamilyHandle tmpColumnFamilyHandle = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.dropColumnFamily(tmpColumnFamilyHandle); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + ColumnFamilyHandle tmpColumnFamilyHandle; + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.dropColumnFamily(tmpColumnFamilyHandle); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); } } @@ -255,29 +225,15 @@ columnFamilyHandleList)) { ColumnFamilyHandle tmpColumnFamilyHandle = null; ColumnFamilyHandle tmpColumnFamilyHandle2 = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - tmpColumnFamilyHandle2 = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF2".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); - db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - if (tmpColumnFamilyHandle2 != null) { - tmpColumnFamilyHandle2.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } 
- } + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + tmpColumnFamilyHandle2 = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF2".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); + db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); + assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); } } @@ -299,36 +255,24 @@ cfDescriptors, columnFamilyHandleList); final WriteBatch writeBatch = new WriteBatch(); final WriteOptions writeOpt = new WriteOptions()) { - try { - writeBatch.put("key".getBytes(), "value".getBytes()); - writeBatch.put(db.getDefaultColumnFamily(), - "mergeKey".getBytes(), "merge".getBytes()); - writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), - "merge".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - writeBatch.delete("xyz".getBytes()); - writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); - db.write(writeOpt, writeBatch); - - assertThat(db.get(columnFamilyHandleList.get(1), - "xyz".getBytes()) == null); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey".getBytes()))).isEqualTo("value"); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey2".getBytes()))).isEqualTo("value2"); - assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); - // check if key is merged - assertThat(new String(db.get(db.getDefaultColumnFamily(), - "mergeKey".getBytes()))).isEqualTo("merge,merge"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - 
columnFamilyHandle.close(); - } - } + writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + writeBatch.delete("xyz".getBytes()); + writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); + db.write(writeOpt, writeBatch); + + assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes()) == null); + assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey".getBytes()))) + .isEqualTo("value"); + assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey2".getBytes()))) + .isEqualTo("value2"); + assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + // check if key is merged + assertThat(new String(db.get(db.getDefaultColumnFamily(), "mergeKey".getBytes()))) + .isEqualTo("merge,merge"); } } } @@ -345,32 +289,21 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - try (final RocksIterator rocksIterator = - db.newIterator(columnFamilyHandleList.get(1))) { - rocksIterator.seekToFirst(); - Map refMap = new HashMap<>(); - refMap.put("newcfkey", "value"); - refMap.put("newcfkey2", "value2"); - int i = 0; - while (rocksIterator.isValid()) { - i++; - assertThat(refMap.get(new String(rocksIterator.key()))). 
- isEqualTo(new String(rocksIterator.value())); - rocksIterator.next(); - } - assertThat(i).isEqualTo(2); - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + try (final RocksIterator rocksIterator = db.newIterator(columnFamilyHandleList.get(1))) { + rocksIterator.seekToFirst(); + Map refMap = new HashMap<>(); + refMap.put("newcfkey", "value"); + refMap.put("newcfkey2", "value2"); + int i = 0; + while (rocksIterator.isValid()) { + i++; + assertThat(refMap.get(new String(rocksIterator.key()))) + .isEqualTo(new String(rocksIterator.value())); + rocksIterator.next(); } + assertThat(i).isEqualTo(2); } } } @@ -387,35 +320,20 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - - List retValues = db.multiGetAsList(columnFamilyHandleList, keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + 
db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + + List retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -431,35 +349,19 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - List retValues = db.multiGetAsList(columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + List 
retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -475,30 +377,18 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - assertThat(db.getProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getLongProperty(columnFamilyHandleList.get(0), - "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); - assertThat(db.getProperty("rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(0), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.estimate-num-keys")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). 
- isGreaterThanOrEqualTo(0); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(db.getProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getLongProperty(columnFamilyHandleList.get(0), "rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); + assertThat(db.getProperty("rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(0), "rocksdb.sstables")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.estimate-num-keys")) + .isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); } } @@ -546,10 +436,6 @@ rocksIterator.close(); } } - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } } } } @@ -565,15 +451,8 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.put(columnFamilyHandleList.get(1), "key".getBytes(), - "value".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes()); } } @@ -588,15 +467,8 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.delete(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final 
ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.delete(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -611,15 +483,8 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.get(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.get(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -634,19 +499,11 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - final List keys = new ArrayList<>(); - keys.add("key".getBytes()); - keys.add("newcfkey".getBytes()); - final List cfCustomList = new ArrayList<>(); - db.multiGetAsList(cfCustomList, keys); - - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List keys = new ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + final List cfCustomList = new ArrayList<>(); + db.multiGetAsList(cfCustomList, keys); } } @@ -660,25 +517,12 @@ final byte[] b0 = new byte[]{(byte) 0x00}; final byte[] b1 = new byte[]{(byte) 0x01}; final byte[] b2 = new byte[]{(byte) 0x02}; - ColumnFamilyHandle cf1 = null, cf2 = null, cf3 = null; - try { - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - cf3 = db.createColumnFamily(new ColumnFamilyDescriptor(b2)); - } finally { - 
if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - if (cf3 != null) { - cf3.close(); - } - } + db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); + db.createColumnFamily(new ColumnFamilyDescriptor(b2)); } } @@ -689,22 +533,13 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { - try { - final byte[] b0 = new byte[]{0, 0}; - final byte[] b1 = new byte[]{0, 1}; - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - } finally { - if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - } + final byte[] b0 = new byte[] {0, 0}; + final byte[] b1 = new byte[] {0, 1}; + cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); } } @@ -715,17 +550,57 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { + final String simplifiedChinese = "\u7b80\u4f53\u5b57"; + columnFamilyHandle = + db.createColumnFamily(new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); + + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), simplifiedChinese.getBytes()); + } + } + + @Test + public void testDestroyColumnFamilyHandle() throws RocksDBException { + try (final Options options = new 
Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());) { + final byte[] name1 = "cf1".getBytes(); + final byte[] name2 = "cf2".getBytes(); + final ColumnFamilyDescriptor desc1 = new ColumnFamilyDescriptor(name1); + final ColumnFamilyDescriptor desc2 = new ColumnFamilyDescriptor(name2); + final ColumnFamilyHandle cf1 = db.createColumnFamily(desc1); + final ColumnFamilyHandle cf2 = db.createColumnFamily(desc2); + assertTrue(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + assertFalse(cf1.isDefaultColumnFamily()); + db.destroyColumnFamilyHandle(cf1); + // At this point cf1 should not be used! + assertFalse(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + } + } + + @Test + @Deprecated + /** + * @deprecated Now explicitly closing instances of ColumnFamilyHandle is not required. + * RocksDB instance will take care of closing its associated ColumnFamilyHandle objects. + */ + public void testColumnFamilyCloseBeforeDb() throws RocksDBException { + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { try { - final String simplifiedChinese = "\u7b80\u4f53\u5b57"; - columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); - - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), - simplifiedChinese.getBytes()); + db.put("testKey".getBytes(), "tstValue".getBytes()); + // Do something... 
} finally { - if (columnFamilyHandle != null) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { columnFamilyHandle.close(); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,29 +39,22 @@ final List cfHandles = new ArrayList<>(); - try (final RocksDB rocksDb = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles); - ) { - try { - final byte[] key1 = "key1".getBytes(); - final byte[] key2 = "key2".getBytes(); - - final byte[] value1 = "value1".getBytes(); - final byte[] value2 = new byte[0]; - - rocksDb.put(cfHandles.get(1), key1, value1); - rocksDb.put(cfHandles.get(1), key2, value2); - - rocksDb.compactRange(cfHandles.get(1)); - - assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); - final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), key2, null); - assertThat(exists).isFalse(); - } finally { - for (final ColumnFamilyHandle cfHandle : cfHandles) { - cfHandle.close(); - } - } + try (final RocksDB rocksDb = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles)) { + final byte[] key1 = "key1".getBytes(); + final byte[] key2 = "key2".getBytes(); + + final byte[] value1 = "value1".getBytes(); + final byte[] value2 = new byte[0]; + + rocksDb.put(cfHandles.get(1), key1, value1); + rocksDb.put(cfHandles.get(1), key2, value2); + + rocksDb.compactRange(cfHandles.get(1)); + + assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); + final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), 
key2, null); + assertThat(exists).isFalse(); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,50 @@ +package org.rocksdb; + +import static org.junit.Assert.assertEquals; + +import org.junit.After; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Test; + +public class ConcurrentTaskLimiterTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + private static final String NAME = "name"; + + private ConcurrentTaskLimiter concurrentTaskLimiter; + + @Before + public void beforeTest() { + concurrentTaskLimiter = new ConcurrentTaskLimiterImpl(NAME, 3); + } + + @Test + public void name() { + assertEquals(NAME, concurrentTaskLimiter.name()); + } + + @Test + public void outstandingTask() { + assertEquals(0, concurrentTaskLimiter.outstandingTask()); + } + + @Test + public void setMaxOutstandingTask() { + assertEquals(concurrentTaskLimiter, concurrentTaskLimiter.setMaxOutstandingTask(4)); + assertEquals(0, concurrentTaskLimiter.outstandingTask()); + } + + @Test + public void resetMaxOutstandingTask() { + assertEquals(concurrentTaskLimiter, concurrentTaskLimiter.resetMaxOutstandingTask()); + assertEquals(0, concurrentTaskLimiter.outstandingTask()); + } + + @After + public void afterTest() { + concurrentTaskLimiter.close(); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,13 +5,16 @@ package org.rocksdb; -import org.junit.ClassRule; -import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import java.nio.file.Paths; import java.util.*; - -import static org.assertj.core.api.Assertions.assertThat; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Test; public class DBOptionsTest { @@ -810,4 +813,123 @@ assertThat(stats).isNotNull(); } } + + @Test + public void avoidUnnecessaryBlockingIO() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(false); + assertThat(options.setAvoidUnnecessaryBlockingIO(true)).isEqualTo(options); + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(true); + } + } + + @Test + public void persistStatsToDisk() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.persistStatsToDisk()).isEqualTo(false); + assertThat(options.setPersistStatsToDisk(true)).isEqualTo(options); + assertThat(options.persistStatsToDisk()).isEqualTo(true); + } + } + + @Test + public void writeDbidToManifest() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.writeDbidToManifest()).isEqualTo(false); + assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options); + assertThat(options.writeDbidToManifest()).isEqualTo(true); + } + } + + @Test + public void logReadaheadSize() { + try (final DBOptions options = new DBOptions()) { + 
assertThat(options.logReadaheadSize()).isEqualTo(0); + final int size = 1024 * 1024 * 100; + assertThat(options.setLogReadaheadSize(size)).isEqualTo(options); + assertThat(options.logReadaheadSize()).isEqualTo(size); + } + } + + @Test + public void bestEffortsRecovery() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.bestEffortsRecovery()).isEqualTo(false); + assertThat(options.setBestEffortsRecovery(true)).isEqualTo(options); + assertThat(options.bestEffortsRecovery()).isEqualTo(true); + } + } + + @Test + public void maxBgerrorResumeCount() { + try (final DBOptions options = new DBOptions()) { + final int INT_MAX = 2147483647; + assertThat(options.maxBgerrorResumeCount()).isEqualTo(INT_MAX); + assertThat(options.setMaxBgErrorResumeCount(-1)).isEqualTo(options); + assertThat(options.maxBgerrorResumeCount()).isEqualTo(-1); + } + } + + @Test + public void bgerrorResumeRetryInterval() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(1000000); + final long newRetryInterval = 24 * 3600 * 1000000L; + assertThat(options.setBgerrorResumeRetryInterval(newRetryInterval)).isEqualTo(options); + assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(newRetryInterval); + } + } + + @Test + public void maxWriteBatchGroupSizeBytes() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(1024 * 1024); + final long size = 1024 * 1024 * 1024 * 10L; + assertThat(options.setMaxWriteBatchGroupSizeBytes(size)).isEqualTo(options); + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(size); + } + } + + @Test + public void skipCheckingSstFileSizesOnDbOpen() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false); + assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options); + 
assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); + } + } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final DBOptions options = new DBOptions(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,763 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.*; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.AbstractEventListener.EnabledEventCallback; +import org.rocksdb.test.TestableEventListener; + +public class EventListenerTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory(); + + void flushDb(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.flush(new FlushOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onFlushCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onFlushCompletedListener = new AbstractEventListener() { + @Override + public void onFlushCompleted(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushCompletedListener, wasCbCalled); + } + + @Test + public void onFlushBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener 
onFlushBeginListener = new AbstractEventListener() { + @Override + public void onFlushBegin(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushBeginListener, wasCbCalled); + } + + void deleteTableFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + final RocksDB.LiveFiles liveFiles = db.getLiveFiles(); + assertNotNull(liveFiles); + assertNotNull(liveFiles.files); + assertFalse(liveFiles.files.isEmpty()); + db.deleteFile(liveFiles.files.get(0)); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onTableFileDeleted() throws RocksDBException, InterruptedException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileDeletedListener = new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + assertNotNull(tableFileDeletionInfo.getDbName()); + wasCbCalled.set(true); + } + }; + deleteTableFile(onTableFileDeletedListener, wasCbCalled); + } + + void compactRange(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.compactRange(); + 
assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onCompactionBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionBeginListener = new AbstractEventListener() { + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionBeginListener, wasCbCalled); + } + + @Test + public void onCompactionCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionCompletedListener = new AbstractEventListener() { + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionCompletedListener, wasCbCalled); + } + + @Test + public void onTableFileCreated() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreatedListener = new AbstractEventListener() { + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationInfo.getReason()); + wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreatedListener, wasCbCalled); + } + + @Test + public void onTableFileCreationStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreationStartedListener = new AbstractEventListener() { + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationBriefInfo.getReason()); + 
wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreationStartedListener, wasCbCalled); + } + + void deleteColumnFamilyHandle(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + ColumnFamilyHandle columnFamilyHandle = db.getDefaultColumnFamily(); + columnFamilyHandle.close(); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onColumnFamilyHandleDeletionStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onColumnFamilyHandleDeletionStartedListener = + new AbstractEventListener() { + @Override + public void onColumnFamilyHandleDeletionStarted( + final ColumnFamilyHandle columnFamilyHandle) { + assertNotNull(columnFamilyHandle); + wasCbCalled.set(true); + } + }; + deleteColumnFamilyHandle(onColumnFamilyHandleDeletionStartedListener, wasCbCalled); + } + + void ingestExternalFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final String uuid = UUID.randomUUID().toString(); + final SstFileWriter sstFileWriter = new SstFileWriter(new EnvOptions(), opt); + final Path externalFilePath = Paths.get(db.getName(), uuid); + sstFileWriter.open(externalFilePath.toString()); + sstFileWriter.put("testKey".getBytes(), uuid.getBytes()); + sstFileWriter.finish(); + db.ingestExternalFile( + Collections.singletonList(externalFilePath.toString()), new 
IngestExternalFileOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onExternalFileIngested() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onExternalFileIngestedListener = new AbstractEventListener() { + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + assertNotNull(db); + wasCbCalled.set(true); + } + }; + ingestExternalFile(onExternalFileIngestedListener, wasCbCalled); + } + + @Test + public void testAllCallbacksInvocation() { + final int TEST_INT_VAL = -1; + final long TEST_LONG_VAL = -1; + // Expected test data objects + final Map userCollectedPropertiesTestData = + Collections.singletonMap("key", "value"); + final Map readablePropertiesTestData = Collections.singletonMap("key", "value"); + final TableProperties tablePropertiesTestData = new TableProperties(TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, "columnFamilyName".getBytes(), + "filterPolicyName", "comparatorName", "mergeOperatorName", "prefixExtractorName", + "propertyCollectorsNames", "compressionName", userCollectedPropertiesTestData, + readablePropertiesTestData); + final FlushJobInfo flushJobInfoTestData = new FlushJobInfo(Integer.MAX_VALUE, + "testColumnFamily", "/file/path", TEST_LONG_VAL, Integer.MAX_VALUE, true, true, + TEST_LONG_VAL, TEST_LONG_VAL, tablePropertiesTestData, (byte) 0x0a); + final Status statusTestData = new Status(Status.Code.Incomplete, Status.SubCode.NoSpace, null); + final TableFileDeletionInfo tableFileDeletionInfoTestData = + new TableFileDeletionInfo("dbName", "/file/path", Integer.MAX_VALUE, statusTestData); + 
final TableFileCreationInfo tableFileCreationInfoTestData = + new TableFileCreationInfo(TEST_LONG_VAL, tablePropertiesTestData, statusTestData, "dbName", + "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final TableFileCreationBriefInfo tableFileCreationBriefInfoTestData = + new TableFileCreationBriefInfo( + "dbName", "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final MemTableInfo memTableInfoTestData = new MemTableInfo( + "columnFamilyName", TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL); + final FileOperationInfo fileOperationInfoTestData = new FileOperationInfo("/file/path", + TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); + final WriteStallInfo writeStallInfoTestData = + new WriteStallInfo("columnFamilyName", (byte) 0x1, (byte) 0x2); + final ExternalFileIngestionInfo externalFileIngestionInfoTestData = + new ExternalFileIngestionInfo("columnFamilyName", "/external/file/path", + "/internal/file/path", TEST_LONG_VAL, tablePropertiesTestData); + + final CapturingTestableEventListener listener = new CapturingTestableEventListener() { + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushCompleted(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushBegin(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + super.onTableFileDeleted(tableFileDeletionInfo); + assertEquals(tableFileDeletionInfoTestData, tableFileDeletionInfo); + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionBegin(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), 
compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionCompleted(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo 
tableFileCreationInfo) { + super.onTableFileCreated(tableFileCreationInfo); + assertEquals(tableFileCreationInfoTestData, tableFileCreationInfo); + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + super.onTableFileCreationStarted(tableFileCreationBriefInfo); + assertEquals(tableFileCreationBriefInfoTestData, tableFileCreationBriefInfo); + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + super.onMemTableSealed(memTableInfo); + assertEquals(memTableInfoTestData, memTableInfo); + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + super.onColumnFamilyHandleDeletionStarted(columnFamilyHandle); + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + super.onExternalFileIngested(db, externalFileIngestionInfo); + assertEquals(externalFileIngestionInfoTestData, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onBackgroundError(backgroundErrorReason, backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + super.onStallConditionsChanged(writeStallInfo); + assertEquals(writeStallInfoTestData, writeStallInfo); + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + super.onFileReadFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + super.onFileWriteFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + super.onFileFlushFinish(fileOperationInfo); 
+ assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileRangeSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + assertEquals(fileOperationInfoTestData, fileOperationInfo); + super.onFileTruncateFinish(fileOperationInfo); + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + super.onFileCloseFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + super.shouldBeNotifiedOnFileIO(); + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onErrorRecoveryBegin(backgroundErrorReason, backgroundError); + assertEquals(BackgroundErrorReason.FLUSH, backgroundErrorReason); + assertEquals(statusTestData, backgroundError); + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + super.onErrorRecoveryCompleted(oldBackgroundError); + assertEquals(statusTestData, oldBackgroundError); + } + }; + + // test action + listener.invokeAllCallbacks(); + + // assert + assertAllEventsCalled(listener); + } + + @Test + public void testEnabledCallbacks() { + final EnabledEventCallback enabledEvents[] = { + EnabledEventCallback.ON_MEMTABLE_SEALED, EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED}; + + final CapturingTestableEventListener listener = + new CapturingTestableEventListener(enabledEvents); + + // test action + 
listener.invokeAllCallbacks(); + + // assert + assertEventsCalled(listener, enabledEvents); + } + + private static void assertAllEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener) { + assertEventsCalled(capturingTestableEventListener, EnumSet.allOf(EnabledEventCallback.class)); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnabledEventCallback[] expected) { + assertEventsCalled(capturingTestableEventListener, EnumSet.copyOf(Arrays.asList(expected))); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnumSet expected) { + final ListenerEvents capturedEvents = capturingTestableEventListener.capturedListenerEvents; + + if (expected.contains(EnabledEventCallback.ON_FLUSH_COMPLETED)) { + assertTrue("onFlushCompleted was not called", capturedEvents.flushCompleted); + } else { + assertFalse("onFlushCompleted was not called", capturedEvents.flushCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_FLUSH_BEGIN)) { + assertTrue("onFlushBegin was not called", capturedEvents.flushBegin); + } else { + assertFalse("onFlushBegin was called", capturedEvents.flushBegin); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_DELETED)) { + assertTrue("onTableFileDeleted was not called", capturedEvents.tableFileDeleted); + } else { + assertFalse("onTableFileDeleted was called", capturedEvents.tableFileDeleted); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_BEGIN)) { + assertTrue("onCompactionBegin was not called", capturedEvents.compactionBegin); + } else { + assertFalse("onCompactionBegin was called", capturedEvents.compactionBegin); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_COMPLETED)) { + assertTrue("onCompactionCompleted was not called", capturedEvents.compactionCompleted); + } else { + assertFalse("onCompactionCompleted was 
called", capturedEvents.compactionCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATED)) { + assertTrue("onTableFileCreated was not called", capturedEvents.tableFileCreated); + } else { + assertFalse("onTableFileCreated was called", capturedEvents.tableFileCreated); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATION_STARTED)) { + assertTrue( + "onTableFileCreationStarted was not called", capturedEvents.tableFileCreationStarted); + } else { + assertFalse("onTableFileCreationStarted was called", capturedEvents.tableFileCreationStarted); + } + + if (expected.contains(EnabledEventCallback.ON_MEMTABLE_SEALED)) { + assertTrue("onMemTableSealed was not called", capturedEvents.memTableSealed); + } else { + assertFalse("onMemTableSealed was called", capturedEvents.memTableSealed); + } + + if (expected.contains(EnabledEventCallback.ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED)) { + assertTrue("onColumnFamilyHandleDeletionStarted was not called", + capturedEvents.columnFamilyHandleDeletionStarted); + } else { + assertFalse("onColumnFamilyHandleDeletionStarted was called", + capturedEvents.columnFamilyHandleDeletionStarted); + } + + if (expected.contains(EnabledEventCallback.ON_EXTERNAL_FILE_INGESTED)) { + assertTrue("onExternalFileIngested was not called", capturedEvents.externalFileIngested); + } else { + assertFalse("onExternalFileIngested was called", capturedEvents.externalFileIngested); + } + + if (expected.contains(EnabledEventCallback.ON_BACKGROUND_ERROR)) { + assertTrue("onBackgroundError was not called", capturedEvents.backgroundError); + } else { + assertFalse("onBackgroundError was called", capturedEvents.backgroundError); + } + + if (expected.contains(EnabledEventCallback.ON_STALL_CONDITIONS_CHANGED)) { + assertTrue("onStallConditionsChanged was not called", capturedEvents.stallConditionsChanged); + } else { + assertFalse("onStallConditionsChanged was called", capturedEvents.stallConditionsChanged); + } + + if 
(expected.contains(EnabledEventCallback.ON_FILE_READ_FINISH)) { + assertTrue("onFileReadFinish was not called", capturedEvents.fileReadFinish); + } else { + assertFalse("onFileReadFinish was called", capturedEvents.fileReadFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_WRITE_FINISH)) { + assertTrue("onFileWriteFinish was not called", capturedEvents.fileWriteFinish); + } else { + assertFalse("onFileWriteFinish was called", capturedEvents.fileWriteFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_FLUSH_FINISH)) { + assertTrue("onFileFlushFinish was not called", capturedEvents.fileFlushFinish); + } else { + assertFalse("onFileFlushFinish was called", capturedEvents.fileFlushFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_SYNC_FINISH)) { + assertTrue("onFileSyncFinish was not called", capturedEvents.fileSyncFinish); + } else { + assertFalse("onFileSyncFinish was called", capturedEvents.fileSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_RANGE_SYNC_FINISH)) { + assertTrue("onFileRangeSyncFinish was not called", capturedEvents.fileRangeSyncFinish); + } else { + assertFalse("onFileRangeSyncFinish was called", capturedEvents.fileRangeSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_TRUNCATE_FINISH)) { + assertTrue("onFileTruncateFinish was not called", capturedEvents.fileTruncateFinish); + } else { + assertFalse("onFileTruncateFinish was called", capturedEvents.fileTruncateFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_CLOSE_FINISH)) { + assertTrue("onFileCloseFinish was not called", capturedEvents.fileCloseFinish); + } else { + assertFalse("onFileCloseFinish was called", capturedEvents.fileCloseFinish); + } + + if (expected.contains(EnabledEventCallback.SHOULD_BE_NOTIFIED_ON_FILE_IO)) { + assertTrue( + "shouldBeNotifiedOnFileIO was not called", capturedEvents.shouldBeNotifiedOnFileIO); + } else { + assertFalse("shouldBeNotifiedOnFileIO was 
called", capturedEvents.shouldBeNotifiedOnFileIO); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_BEGIN)) { + assertTrue("onErrorRecoveryBegin was not called", capturedEvents.errorRecoveryBegin); + } else { + assertFalse("onErrorRecoveryBegin was called", capturedEvents.errorRecoveryBegin); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED)) { + assertTrue("onErrorRecoveryCompleted was not called", capturedEvents.errorRecoveryCompleted); + } else { + assertFalse("onErrorRecoveryCompleted was called", capturedEvents.errorRecoveryCompleted); + } + } + + /** + * Members are volatile as they may be written + * and read by different threads. + */ + private static class ListenerEvents { + volatile boolean flushCompleted; + volatile boolean flushBegin; + volatile boolean tableFileDeleted; + volatile boolean compactionBegin; + volatile boolean compactionCompleted; + volatile boolean tableFileCreated; + volatile boolean tableFileCreationStarted; + volatile boolean memTableSealed; + volatile boolean columnFamilyHandleDeletionStarted; + volatile boolean externalFileIngested; + volatile boolean backgroundError; + volatile boolean stallConditionsChanged; + volatile boolean fileReadFinish; + volatile boolean fileWriteFinish; + volatile boolean fileFlushFinish; + volatile boolean fileSyncFinish; + volatile boolean fileRangeSyncFinish; + volatile boolean fileTruncateFinish; + volatile boolean fileCloseFinish; + volatile boolean shouldBeNotifiedOnFileIO; + volatile boolean errorRecoveryBegin; + volatile boolean errorRecoveryCompleted; + } + + private static class CapturingTestableEventListener extends TestableEventListener { + final ListenerEvents capturedListenerEvents = new ListenerEvents(); + + public CapturingTestableEventListener() {} + + public CapturingTestableEventListener(final EnabledEventCallback... 
enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushCompleted = true; + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushBegin = true; + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + capturedListenerEvents.tableFileDeleted = true; + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionBegin = true; + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionCompleted = true; + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + capturedListenerEvents.tableFileCreated = true; + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + capturedListenerEvents.tableFileCreationStarted = true; + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + capturedListenerEvents.memTableSealed = true; + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + capturedListenerEvents.columnFamilyHandleDeletionStarted = true; + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + capturedListenerEvents.externalFileIngested = true; + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.backgroundError = true; + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + 
capturedListenerEvents.stallConditionsChanged = true; + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileReadFinish = true; + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileWriteFinish = true; + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileFlushFinish = true; + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileSyncFinish = true; + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileRangeSyncFinish = true; + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileTruncateFinish = true; + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileCloseFinish = true; + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + capturedListenerEvents.shouldBeNotifiedOnFileIO = true; + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.errorRecoveryBegin = true; + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + capturedListenerEvents.errorRecoveryCompleted = true; + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -4,20 +4,19 @@ // (found in the LICENSE.Apache file in the root directory). package org.rocksdb; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import java.nio.BufferUnderflowException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; +import org.junit.*; +import org.junit.rules.ExpectedException; +import org.junit.rules.TemporaryFolder; public class KeyMayExistTest { - @ClassRule public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = new RocksNativeLibraryResource(); @@ -25,168 +24,505 @@ @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); + + List cfDescriptors; + List columnFamilyHandleList = new ArrayList<>(); + RocksDB db; + + // Slice key + int offset; + int len; + + byte[] sliceKey; + byte[] sliceValue; + + @Before + public void before() throws RocksDBException { + cfDescriptors = Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + + db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + + // Build the slice key + final StringBuilder builder = new StringBuilder("prefix"); + offset = builder.toString().length(); + builder.append("slice key 0"); + len = builder.toString().length() - offset; + builder.append("suffix"); + sliceKey = 
builder.toString().getBytes(UTF_8); + sliceValue = "slice value 0".getBytes(UTF_8); + } + + @After + public void after() { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + db.close(); + } + @Test public void keyMayExist() throws RocksDBException { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), - new ColumnFamilyDescriptor("new_cf".getBytes()) - ); - - final List columnFamilyHandleList = new ArrayList<>(); - try (final DBOptions options = new DBOptions() - .setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList)) { - try { - assertThat(columnFamilyHandleList.size()). - isEqualTo(2); - db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); - // Test without column family - final Holder holder = new Holder<>(); - boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist("key".getBytes(UTF_8), null); - assertThat(exists).isTrue(); - - // Slice key - final StringBuilder builder = new StringBuilder("prefix"); - final int offset = builder.toString().length(); - builder.append("slice key 0"); - final int len = builder.toString().length() - offset; - builder.append("suffix"); - - final byte[] sliceKey = builder.toString().getBytes(UTF_8); - final byte[] sliceValue = "slice value 0".getBytes(UTF_8); - db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); - - exists = db.keyMayExist(sliceKey, offset, len, holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(sliceKey, offset, len, null); - assertThat(exists).isTrue(); - - 
// Test without column family but with readOptions - try (final ReadOptions readOptions = new ReadOptions()) { - exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), null); - assertThat(exists).isTrue(); - - exists = db.keyMayExist(readOptions, sliceKey, offset, len, holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(readOptions, sliceKey, offset, len, null); - assertThat(exists).isTrue(); - } - - // Test with column family - exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), - holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), - null); - assertThat(exists).isTrue(); - - // Test slice sky with column family - exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, - holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, - null); - assertThat(exists).isTrue(); - - // Test with column family and readOptions - try (final ReadOptions readOptions = new ReadOptions()) { - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - "key".getBytes(UTF_8), holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - "key".getBytes(UTF_8), null); - 
assertThat(exists).isTrue(); - - // Test slice key with column family and read options - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - sliceKey, offset, len, holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - sliceKey, offset, len, null); - assertThat(exists).isTrue(); - } - - // KeyMayExist in CF1 must return null value - exists = db.keyMayExist(columnFamilyHandleList.get(1), - "key".getBytes(UTF_8), holder); - assertThat(exists).isFalse(); - assertThat(holder.getValue()).isNull(); - exists = db.keyMayExist(columnFamilyHandleList.get(1), - "key".getBytes(UTF_8), null); - assertThat(exists).isFalse(); - - // slice key - exists = db.keyMayExist(columnFamilyHandleList.get(1), - sliceKey, 1, 3, holder); - assertThat(exists).isFalse(); - assertThat(holder.getValue()).isNull(); - exists = db.keyMayExist(columnFamilyHandleList.get(1), - sliceKey, 1, 3, null); - assertThat(exists).isFalse(); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } - } + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Test without column family + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = db.keyMayExist("key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); } @Test - public void keyMayExistNonUnicodeString() throws RocksDBException { - try (final Options options = new Options() - .setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - final RocksDB db = RocksDB.open(options, - 
dbFolder.getRoot().getAbsolutePath())) { - final byte key[] = "key".getBytes(UTF_8); - final byte value[] = { (byte)0x80 }; // invalid unicode code-point - db.put(key, value); - - final byte buf[] = new byte[10]; - final int read = db.get(key, buf); - assertThat(read).isEqualTo(1); - assertThat(buf).startsWith(value); + public void keyMayExistReadOptions() throws RocksDBException { + // Test without column family but with readOptions + try (final ReadOptions readOptions = new ReadOptions()) { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); final Holder holder = new Holder<>(); - boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); + boolean exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + + exists = db.keyMayExist(readOptions, sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = db.keyMayExist(readOptions, sliceKey, offset, len, null); + assertThat(exists).isTrue(); + } + } + + @Test + public void keyMayExistColumnFamily() throws RocksDBException { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // Test slice key with column family + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = 
db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, null); + assertThat(exists).isTrue(); + } + + @Test + public void keyMayExistColumnFamilyReadOptions() throws RocksDBException { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // Test slice key with column family and read options + final Holder holder = new Holder<>(); + try (final ReadOptions readOptions = new ReadOptions()) { + boolean exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, "key".getBytes(UTF_8), holder); assertThat(exists).isTrue(); assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(value); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, "key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); - exists = db.keyMayExist("key".getBytes(UTF_8), null); + // Test slice key with column family and read options + exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, sliceKey, offset, len, null); assertThat(exists).isTrue(); } } + + @Test + public void keyMayExistSliceKey() throws RocksDBException { + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = 
db.keyMayExist(sliceKey, offset, len, null); + assertThat(exists).isTrue(); + + exists = db.keyMayExist("slice key".getBytes(UTF_8), null); + assertThat(exists).isFalse(); + + exists = db.keyMayExist("slice key 0".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + + // Test with column family + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + + // KeyMayExist in CF1 must return null value + exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), null); + assertThat(exists).isFalse(); + + // slice key + exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, null); + assertThat(exists).isFalse(); + } + + @Test + public void keyMayExistCF1() throws RocksDBException { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // KeyMayExist in CF1 must return null value + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), null); + assertThat(exists).isFalse(); + } + + @Test + public void keyMayExistCF1Slice() throws RocksDBException { + // Standard key + 
db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // slice key + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, null); + assertThat(exists).isFalse(); + } + + @Test + public void keyMayExistBB() throws RocksDBException { + // Standard key + db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8)); + + final byte[] key = "keyBB".getBytes(UTF_8); + final byte[] value = "valueBB".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = db.keyMayExist(keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = db.keyMayExist(keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, 
value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + + @Test + public void keyMayExistBBReadOptions() throws RocksDBException { + // Standard key + db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8)); + + final byte[] key = "keyBB".getBytes(UTF_8); + final byte[] value = "valueBB".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + try (final ReadOptions readOptions = new ReadOptions()) { + assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + } + + @Test + public void keyMayExistBBNullValue() throws 
RocksDBException { + // Standard key + db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8)); + + final byte[] key = "keyBB".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + exceptionRule.expect(AssertionError.class); + exceptionRule.expectMessage( + "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method"); + final KeyMayExist keyMayExist = db.keyMayExist(keyBuffer, null); + } + + @Test + public void keyMayExistBBCF() throws RocksDBException { + // Standard key + db.put(columnFamilyHandleList.get(0), "keyBBCF0".getBytes(UTF_8), "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + // 0 is the default CF + byte[] key = "keyBBCF0".getBytes(UTF_8); + ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(true); + + // 1 is just a CF + key = "keyBBCF1".getBytes(UTF_8); + keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(false); + + exceptionRule.expect(AssertionError.class); + exceptionRule.expectMessage( + "value ByteBuffer parameter cannot be null. 
If you do not need the value, use a different version of the method"); + final KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer, null); + } + + @Test + public void keyMayExistBBCFReadOptions() throws RocksDBException { + // Standard key + db.put(columnFamilyHandleList.get(0), "keyBBCF0".getBytes(UTF_8), "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + // 0 is the default CF + byte[] key = "keyBBCF0".getBytes(UTF_8); + ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + try (final ReadOptions readOptions = new ReadOptions()) { + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) + .isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer)) + .isEqualTo(true); + + // 1 is just a CF + key = "keyBBCF1".getBytes(UTF_8); + keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) + .isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer)) + .isEqualTo(false); + + exceptionRule.expect(AssertionError.class); + exceptionRule.expectMessage( + "value ByteBuffer parameter cannot be null. 
If you do not need the value, use a different version of the method"); + final KeyMayExist keyMayExist = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer, null); + } + } + + @Test + public void keyMayExistBBCFOffset() throws RocksDBException { + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + final byte[] key = "keyBBCF1".getBytes(UTF_8); + final byte[] value = "valueBBCF1".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + + @Test + 
public void keyMayExistBBCFOffsetReadOptions() throws RocksDBException { + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + final byte[] key = "keyBBCF1".getBytes(UTF_8); + final byte[] value = "valueBBCF1".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + try (final ReadOptions readOptions = new ReadOptions()) { + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) + .isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = + db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = + db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + } + + @Test + public void keyMayExistNonUnicodeString() throws RocksDBException { + final byte[] 
key = "key".getBytes(UTF_8); + final byte[] value = {(byte) 0x80}; // invalid unicode code-point + db.put(key, value); + + final byte[] buf = new byte[10]; + final int read = db.get(key, buf); + assertThat(read).isEqualTo(1); + assertThat(buf).startsWith(value); + + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(value); + + exists = db.keyMayExist("key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,23 +5,27 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.ClassRule; import org.junit.Test; public class LRUCacheTest { - - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); @Test public void newLRUCache() { - final long capacity = 1000; + final long capacity = 80000000; final int numShardBits = 16; final boolean strictCapacityLimit = true; - final double highPriPoolRatio = 5; + final double highPriPoolRatio = 0.05; try(final Cache lruCache = new LRUCache(capacity, numShardBits, strictCapacityLimit, highPriPoolRatio)) { //no op + assertThat(lruCache.getUsage()).isGreaterThanOrEqualTo(0); + assertThat(lruCache.getPinnedUsage()).isGreaterThanOrEqualTo(0); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -58,7 +58,8 @@ db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( db.getAggregatedLongProperty(TABLE_READERS)); - assertThat(usage.get(MemoryUsageType.kCacheTotal)).isEqualTo(0); + // TODO(peterd): disable block cache entry stats and check for 0 + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isLessThan(1024); db.put(key, value); db.flush(flushOptions); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,18 +5,18 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.ArrayList; - import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.assertj.core.api.Assertions.assertThat; - public class MergeTest { @ClassRule @@ -46,13 +46,13 @@ } private byte[] longToByteArray(long l) { - ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE); + ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.putLong(l); return buf.array(); } private long 
longFromByteArray(byte[] a) { - ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE); + ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.put(a); buf.flip(); return buf.getLong(); @@ -144,14 +144,13 @@ // writing (long)100 under key db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(100)); - // merge (long)1 under key - db.merge(columnFamilyHandleList.get(1), - "cfkey".getBytes(), longToByteArray(1)); + // merge (long)157 under key + db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(157)); byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); long longValue = longFromByteArray(value); - assertThat(longValue).isEqualTo(101); + assertThat(longValue).isEqualTo(257); } finally { for (final ColumnFamilyHandle handle : columnFamilyHandleList) { handle.close(); @@ -413,6 +412,32 @@ } } + @Test + public void emptyStringAsStringAppendDelimiter() throws RocksDBException { + try (final StringAppendOperator stringAppendOperator = new StringAppendOperator(""); + final Options opt = + new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key".getBytes(), "aa".getBytes()); + db.merge("key".getBytes(), "bb".getBytes()); + final byte[] value = db.get("key".getBytes()); + assertThat(new String(value)).isEqualTo("aabb"); + } + } + + @Test + public void multiCharStringAsStringAppendDelimiter() throws RocksDBException { + try (final StringAppendOperator stringAppendOperator = new StringAppendOperator("<>"); + final Options opt = + new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key".getBytes(), "aa".getBytes()); + db.merge("key".getBytes(), "bb".getBytes()); + final byte[] value = db.get("key".getBytes()); + assertThat(new 
String(value)).isEqualTo("aa<>bb"); + } + } + @Test public void emptyStringInSetMergeOperatorByName() { try (final Options opt = new Options() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.charset.StandardCharsets; +import java.util.*; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class MultiGetManyKeysTest { + @Parameterized.Parameters + public static List data() { + return Arrays.asList(3, 250, 60000, 70000, 150000, 750000); + } + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + private final int keySize; + + public MultiGetManyKeysTest(final Integer keySize) { + this.keySize = keySize; + } + + /** + * Test for https://github.com/facebook/rocksdb/issues/8039 + */ + @Test + public void multiGetAsListLarge() throws RocksDBException { + final Random rand = new Random(); + final List keys = new ArrayList<>(); + for (int i = 0; i < keySize; i++) { + final byte[] key = new byte[4]; + rand.nextBytes(key); + keys.add(key); + } + + try (final Options opt = new Options().setCreateIfMissing(true); 
+ final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List values = db.multiGetAsList(keys); + assertThat(values.size()).isEqualTo(keys.size()); + } + } + + @Test + public void multiGetAsListCheckResults() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List keys = new ArrayList<>(); + for (int i = 0; i < keySize; i++) { + byte[] key = ("key" + i + ":").getBytes(); + keys.add(key); + db.put(key, ("value" + i + ":").getBytes()); + } + + final List values = db.multiGetAsList(keys); + assertThat(values.size()).isEqualTo(keys.size()); + for (int i = 0; i < keySize; i++) { + assertThat(values.get(i)).isEqualTo(("value" + i + ":").getBytes()); + } + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,525 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.TestUtil; + +public class MultiGetTest { + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void putNThenMultiGet() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + final List keys = + Arrays.asList("key1".getBytes(), "key2".getBytes(), "key3".getBytes()); + final List values = db.multiGetAsList(keys); + assertThat(values.size()).isEqualTo(keys.size()); + assertThat(values.get(0)).isEqualTo("value1ForKey1".getBytes()); + assertThat(values.get(1)).isEqualTo("value2ForKey2".getBytes()); + assertThat(values.get(2)).isEqualTo("value3ForKey3".getBytes()); + } + } + + @Test + public void putNThenMultiGetDirect() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key 
: keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + + { + final List results = + db.multiGetByteBuffers(new ReadOptions(), keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectSliced() throws RocksDBException { + try (final 
Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + keys.add( + ByteBuffer.allocateDirect(12).put("prefix1".getBytes()).slice().put("key1".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value3ForKey3".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value1ForKey1".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectBadValuesArray() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), 
"value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + + { + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + values.remove(0); + + try { + db.multiGetByteBuffers(keys, values); + fail("Expected exception when not enough value ByteBuffers supplied"); + } catch (final IllegalArgumentException e) { + assertThat(e.getMessage()).contains("For each key there must be a corresponding value"); + } + } + + { + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + values.add(ByteBuffer.allocateDirect(24)); + + try { + db.multiGetByteBuffers(keys, values); + fail("Expected exception when too many value ByteBuffers supplied"); + } catch (final IllegalArgumentException e) { + assertThat(e.getMessage()).contains("For each key there must be a corresponding value"); + } + } + } + } + + @Test + public void putNThenMultiGetDirectShortValueBuffers() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + 
keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + + { + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(4)); + } + + final List statii = db.multiGetByteBuffers(keys, values); + assertThat(statii.size()).isEqualTo(values.size()); + for (final ByteBufferGetStatus status : statii) { + assertThat(status.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(status.requiredSize).isEqualTo("value3ForKey3".getBytes().length); + final ByteBuffer expected = + ByteBuffer.allocateDirect(24).put(Arrays.copyOf("valueX".getBytes(), 4)); + expected.flip(); + assertThat(status.value).isEqualTo(expected); + } + } + } + } + + @Test + public void putNThenMultiGetDirectNondefaultCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(0); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf2".getBytes())); + + final List cf = db.createColumnFamilies(cfDescriptors); + + db.put(cf.get(0), "key1".getBytes(), "value1ForKey1".getBytes()); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2".getBytes()); + db.put(cf.get(0), "key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new 
ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + columnFamilyHandles.add(cf.get(0)); + columnFamilyHandles.add(cf.get(0)); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + 
assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectCFParams() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + try { + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + fail("Expected exception when 2 column families supplied"); + } catch (final IllegalArgumentException e) { + assertThat(e.getMessage()).contains("Wrong number of ColumnFamilyHandle(s) supplied"); + } + + columnFamilyHandles.clear(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, 
keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)).isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)).isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)).isEqualTo("value3ForKey3".getBytes()); + } + } + + @Test + public void putNThenMultiGetDirectMixedCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf2".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf3".getBytes())); + + final List cf = db.createColumnFamilies(cfDescriptors); + + db.put(cf.get(1), "key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put(cf.get(3), "key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values 
= new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound); + + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(1)); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound); + + assertThat(results.get(0).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(1)); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + columnFamilyHandles.add(cf.get(3)); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + 
assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + columnFamilyHandles.add(cf.get(1)); + columnFamilyHandles.add(cf.get(3)); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectTruncateCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes())); + + final List cf = db.createColumnFamilies(cfDescriptors); + + db.put(cf.get(0), "key1".getBytes(), "value1ForKey1".getBytes()); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2WithLotsOfTrailingGarbage".getBytes()); + db.put(cf.get(0), "key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + 
keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize) + .isEqualTo("value2ForKey2WithLotsOfTrailingGarbage".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("valu e2Fo rKey 2Wit hLot sOfT".replace(" ", "").getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -59,23 +59,23 @@ @Test 
public void mutableColumnFamilyOptions_toString() { - final String str = MutableColumnFamilyOptions - .builder() - .setWriteBufferSize(10) - .setInplaceUpdateNumLocks(5) - .setDisableAutoCompactions(true) - .setParanoidFileChecks(true) - .build() - .toString(); + final String str = MutableColumnFamilyOptions.builder() + .setWriteBufferSize(10) + .setInplaceUpdateNumLocks(5) + .setDisableAutoCompactions(true) + .setParanoidFileChecks(true) + .setMaxBytesForLevelMultiplierAdditional(new int[] {2, 3, 5, 7, 11, 13}) + .build() + .toString(); assertThat(str).isEqualTo("write_buffer_size=10;inplace_update_num_locks=5;" - + "disable_auto_compactions=true;paranoid_file_checks=true"); + + "disable_auto_compactions=true;paranoid_file_checks=true;max_bytes_for_level_multiplier_additional=2:3:5:7:11:13"); } @Test public void mutableColumnFamilyOptions_parse() { final String str = "write_buffer_size=10;inplace_update_num_locks=5;" - + "disable_auto_compactions=true;paranoid_file_checks=true"; + + "disable_auto_compactions=true;paranoid_file_checks=true;max_bytes_for_level_multiplier_additional=2:{3}:{5}:{7}:{11}:{13}"; final MutableColumnFamilyOptionsBuilder builder = MutableColumnFamilyOptions.parse(str); @@ -84,5 +84,79 @@ assertThat(builder.inplaceUpdateNumLocks()).isEqualTo(5); assertThat(builder.disableAutoCompactions()).isEqualTo(true); assertThat(builder.paranoidFileChecks()).isEqualTo(true); + assertThat(builder.maxBytesForLevelMultiplierAdditional()) + .isEqualTo(new int[] {2, 3, 5, 7, 11, 13}); + } + + /** + * Extended parsing test to deal with all the options which C++ may return. 
+ * We have canned a set of options returned by {RocksDB#getOptions} + */ + @Test + public void mutableColumnFamilyOptions_parse_getOptions_output() { + final String optionsString = + "bottommost_compression=kDisableCompressionOption; sample_for_compression=0; " + + "blob_garbage_collection_age_cutoff=0.250000; blob_garbage_collection_force_threshold=0.800000; arena_block_size=1048576; enable_blob_garbage_collection=false; " + + "level0_stop_writes_trigger=36; min_blob_size=65536; " + + "compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;" + + "compression_size_percent=-1;max_size_amplification_percent=200;max_merge_width=4294967295;size_ratio=1;}; " + + "target_file_size_base=67108864; max_bytes_for_level_base=268435456; memtable_whole_key_filtering=false; " + + "soft_pending_compaction_bytes_limit=68719476736; blob_compression_type=kNoCompression; max_write_buffer_number=2; " + + "ttl=2592000; compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}; " + + "check_flush_compaction_key_order=true; max_successive_merges=0; inplace_update_num_locks=10000; " + + "bottommost_compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;" + + "strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}; " + + "target_file_size_multiplier=1; max_bytes_for_level_multiplier_additional=5:{7}:{9}:{11}:{13}:{15}:{17}; " + + "enable_blob_files=true; level0_slowdown_writes_trigger=20; compression=kLZ4HCCompression; level0_file_num_compaction_trigger=4; " + + "blob_file_size=268435456; prefix_extractor=nullptr; max_bytes_for_level_multiplier=10.000000; write_buffer_size=67108864; " + + "disable_auto_compactions=false; max_compaction_bytes=1677721600; memtable_huge_page_size=0; " + + "compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;" + + "level=32767;window_bits=-14;}; 
" + + "hard_pending_compaction_bytes_limit=274877906944; periodic_compaction_seconds=0; paranoid_file_checks=true; " + + "memtable_prefix_bloom_size_ratio=7.500000; max_sequential_skip_in_iterations=8; report_bg_io_stats=true; " + + "compaction_pri=kMinOverlappingRatio; compaction_style=kCompactionStyleLevel; memtable_factory=SkipListFactory; " + + "comparator=leveldb.BytewiseComparator; bloom_locality=0; compaction_filter_factory=nullptr; " + + "min_write_buffer_number_to_merge=1; max_write_buffer_number_to_maintain=0; compaction_filter=nullptr; merge_operator=nullptr; " + + "num_levels=7; optimize_filters_for_hits=false; force_consistency_checks=true; table_factory=BlockBasedTable; " + + "max_write_buffer_size_to_maintain=0; memtable_insert_with_hint_prefix_extractor=nullptr; level_compaction_dynamic_level_bytes=false; " + + "inplace_update_support=false;"; + + MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder cf = + MutableColumnFamilyOptions.parse(optionsString, true); + + // Check the values from the parsed string which are column family options + assertThat(cf.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(cf.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(cf.arenaBlockSize()).isEqualTo(1048576); + assertThat(cf.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(cf.level0StopWritesTrigger()).isEqualTo(36); + assertThat(cf.minBlobSize()).isEqualTo(65536); + assertThat(cf.targetFileSizeBase()).isEqualTo(67108864); + assertThat(cf.maxBytesForLevelBase()).isEqualTo(268435456); + assertThat(cf.softPendingCompactionBytesLimit()).isEqualTo(68719476736L); + assertThat(cf.blobCompressionType()).isEqualTo(CompressionType.NO_COMPRESSION); + assertThat(cf.maxWriteBufferNumber()).isEqualTo(2); + assertThat(cf.ttl()).isEqualTo(2592000); + assertThat(cf.maxSuccessiveMerges()).isEqualTo(0); + assertThat(cf.inplaceUpdateNumLocks()).isEqualTo(10000); + assertThat(cf.targetFileSizeMultiplier()).isEqualTo(1); + 
assertThat(cf.maxBytesForLevelMultiplierAdditional()) + .isEqualTo(new int[] {5, 7, 9, 11, 13, 15, 17}); + assertThat(cf.enableBlobFiles()).isEqualTo(true); + assertThat(cf.level0SlowdownWritesTrigger()).isEqualTo(20); + assertThat(cf.compressionType()).isEqualTo(CompressionType.LZ4HC_COMPRESSION); + assertThat(cf.level0FileNumCompactionTrigger()).isEqualTo(4); + assertThat(cf.blobFileSize()).isEqualTo(268435456); + assertThat(cf.maxBytesForLevelMultiplier()).isEqualTo(10.0); + assertThat(cf.writeBufferSize()).isEqualTo(67108864); + assertThat(cf.disableAutoCompactions()).isEqualTo(false); + assertThat(cf.maxCompactionBytes()).isEqualTo(1677721600); + assertThat(cf.memtableHugePageSize()).isEqualTo(0); + assertThat(cf.hardPendingCompactionBytesLimit()).isEqualTo(274877906944L); + assertThat(cf.periodicCompactionSeconds()).isEqualTo(0); + assertThat(cf.paranoidFileChecks()).isEqualTo(true); + assertThat(cf.memtablePrefixBloomSizeRatio()).isEqualTo(7.5); + assertThat(cf.maxSequentialSkipInIterations()).isEqualTo(8); + assertThat(cf.reportBgIoStats()).isEqualTo(true); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,397 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class MutableOptionsGetSetTest { + final int minBlobSize = 65536; + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + /** + * Validate the round-trip of blob options into and out of the C++ core of RocksDB + * From CF options on CF Creation to {RocksDB#getOptions} + * Uses 2x column families with different values for their options. + * NOTE that some constraints are applied to the options in the C++ core, + * e.g. on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio} + * + * @throws RocksDBException if the database throws an exception + */ + @Test + public void testGetMutableBlobOptionsAfterCreate() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try (final ColumnFamilyOptions columnFamilyOptions1 = + new ColumnFamilyOptions() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true) + .setBlobGarbageCollectionAgeCutoff(0.25) + .setBlobGarbageCollectionForceThreshold(0.80) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.17) + .setMemtableHugePageSize(3) + .setMaxSuccessiveMerges(4) + .setMaxWriteBufferNumber(12) + .setInplaceUpdateNumLocks(16) + 
.setDisableAutoCompactions(false) + .setSoftPendingCompactionBytesLimit(112) + .setHardPendingCompactionBytesLimit(280) + .setLevel0FileNumCompactionTrigger(200) + .setLevel0SlowdownWritesTrigger(312) + .setLevel0StopWritesTrigger(584) + .setMaxCompactionBytes(12) + .setTargetFileSizeBase(99) + .setTargetFileSizeMultiplier(112) + .setMaxSequentialSkipInIterations(50) + .setReportBgIoStats(true); + + final ColumnFamilyOptions columnFamilyOptions2 = + new ColumnFamilyOptions() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(false) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.236) + .setMemtableHugePageSize(8) + .setMaxSuccessiveMerges(12) + .setMaxWriteBufferNumber(22) + .setInplaceUpdateNumLocks(160) + .setDisableAutoCompactions(true) + .setSoftPendingCompactionBytesLimit(1124) + .setHardPendingCompactionBytesLimit(2800) + .setLevel0FileNumCompactionTrigger(2000) + .setLevel0SlowdownWritesTrigger(5840) + .setLevel0StopWritesTrigger(31200) + .setMaxCompactionBytes(112) + .setTargetFileSizeBase(999) + .setTargetFileSizeMultiplier(1120) + .setMaxSequentialSkipInIterations(24) + .setReportBgIoStats(true)) { + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + + // Create the column family with blob options + final ColumnFamilyHandle columnFamilyHandle1 = + db.createColumnFamily(columnFamilyDescriptor1); + final ColumnFamilyHandle columnFamilyHandle2 = + db.createColumnFamily(columnFamilyDescriptor2); + + // Check the getOptions() brings back the creation options for CF1 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = + db.getOptions(columnFamilyHandle1); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + 
assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder1.arenaBlockSize()).isEqualTo(42); + assertThat(builder1.memtableHugePageSize()).isEqualTo(3); + assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17); + assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4); + assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12); + assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16); + assertThat(builder1.disableAutoCompactions()).isEqualTo(false); + assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112); + assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280); + assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200); + assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312); + assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584); + assertThat(builder1.maxCompactionBytes()).isEqualTo(12); + assertThat(builder1.targetFileSizeBase()).isEqualTo(99); + assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112); + assertThat(builder1.maxSequentialSkipInIterations()).isEqualTo(50); + assertThat(builder1.reportBgIoStats()).isEqualTo(true); + + // Check the getOptions() brings back the creation options for CF2 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 = + db.getOptions(columnFamilyHandle2); + assertThat(builder2.enableBlobFiles()).isEqualTo(false); + assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder2.arenaBlockSize()).isEqualTo(42); + assertThat(builder2.memtableHugePageSize()).isEqualTo(8); + assertThat(builder2.memtablePrefixBloomSizeRatio()).isEqualTo(0.236); + assertThat(builder2.maxSuccessiveMerges()).isEqualTo(12); + assertThat(builder2.maxWriteBufferNumber()).isEqualTo(22); + assertThat(builder2.inplaceUpdateNumLocks()).isEqualTo(160); + assertThat(builder2.disableAutoCompactions()).isEqualTo(true); + 
assertThat(builder2.softPendingCompactionBytesLimit()).isEqualTo(1124); + assertThat(builder2.hardPendingCompactionBytesLimit()).isEqualTo(2800); + assertThat(builder2.level0FileNumCompactionTrigger()).isEqualTo(2000); + assertThat(builder2.level0SlowdownWritesTrigger()).isEqualTo(5840); + assertThat(builder2.level0StopWritesTrigger()).isEqualTo(31200); + assertThat(builder2.maxCompactionBytes()).isEqualTo(112); + assertThat(builder2.targetFileSizeBase()).isEqualTo(999); + assertThat(builder2.targetFileSizeMultiplier()).isEqualTo(1120); + assertThat(builder2.maxSequentialSkipInIterations()).isEqualTo(24); + assertThat(builder2.reportBgIoStats()).isEqualTo(true); + } + } + } + + /** + * Validate the round-trip of blob options into and out of the C++ core of RocksDB + * From {RocksDB#setOptions} to {RocksDB#getOptions} + * Uses 2x column families with different values for their options. + * NOTE that some constraints are applied to the options in the C++ core, + * e.g. on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio} + * + * @throws RocksDBException if a database access has an error + */ + @Test + public void testGetMutableBlobOptionsAfterSetCF() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try (final ColumnFamilyOptions columnFamilyOptions1 = new ColumnFamilyOptions(); + + final ColumnFamilyOptions columnFamilyOptions2 = new ColumnFamilyOptions()) { + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new 
ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + + // Create the column family with blob options + final ColumnFamilyHandle columnFamilyHandle1 = + db.createColumnFamily(columnFamilyDescriptor1); + final ColumnFamilyHandle columnFamilyHandle2 = + db.createColumnFamily(columnFamilyDescriptor2); + db.flush(new FlushOptions().setWaitForFlush(true)); + + final MutableColumnFamilyOptions + .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions1 = + MutableColumnFamilyOptions.builder() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true) + .setBlobGarbageCollectionAgeCutoff(0.25) + .setBlobGarbageCollectionForceThreshold(0.80) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.17) + .setMemtableHugePageSize(3) + .setMaxSuccessiveMerges(4) + .setMaxWriteBufferNumber(12) + .setInplaceUpdateNumLocks(16) + .setDisableAutoCompactions(false) + .setSoftPendingCompactionBytesLimit(112) + .setHardPendingCompactionBytesLimit(280) + .setLevel0FileNumCompactionTrigger(200) + .setLevel0SlowdownWritesTrigger(312) + .setLevel0StopWritesTrigger(584) + .setMaxCompactionBytes(12) + .setTargetFileSizeBase(99) + .setTargetFileSizeMultiplier(112); + db.setOptions(columnFamilyHandle1, mutableColumnFamilyOptions1.build()); + + // Check the getOptions() brings back the creation options for CF1 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = + db.getOptions(columnFamilyHandle1); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder1.arenaBlockSize()).isEqualTo(42); + assertThat(builder1.memtableHugePageSize()).isEqualTo(3); + 
assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17); + assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4); + assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12); + assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16); + assertThat(builder1.disableAutoCompactions()).isEqualTo(false); + assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112); + assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280); + assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200); + assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312); + assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584); + assertThat(builder1.maxCompactionBytes()).isEqualTo(12); + assertThat(builder1.targetFileSizeBase()).isEqualTo(99); + assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112); + + final MutableColumnFamilyOptions + .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions2 = + MutableColumnFamilyOptions.builder() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(false) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.236) + .setMemtableHugePageSize(8) + .setMaxSuccessiveMerges(12) + .setMaxWriteBufferNumber(22) + .setInplaceUpdateNumLocks(160) + .setDisableAutoCompactions(true) + .setSoftPendingCompactionBytesLimit(1124) + .setHardPendingCompactionBytesLimit(2800) + .setLevel0FileNumCompactionTrigger(2000) + .setLevel0SlowdownWritesTrigger(5840) + .setLevel0StopWritesTrigger(31200) + .setMaxCompactionBytes(112) + .setTargetFileSizeBase(999) + .setTargetFileSizeMultiplier(1120); + db.setOptions(columnFamilyHandle2, mutableColumnFamilyOptions2.build()); + + // Check the getOptions() brings back the creation options for CF2 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 = + db.getOptions(columnFamilyHandle2); + assertThat(builder2.enableBlobFiles()).isEqualTo(false); + assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize); + 
assertThat(builder2.arenaBlockSize()).isEqualTo(42); + assertThat(builder2.memtableHugePageSize()).isEqualTo(8); + assertThat(builder2.memtablePrefixBloomSizeRatio()).isEqualTo(0.236); + assertThat(builder2.maxSuccessiveMerges()).isEqualTo(12); + assertThat(builder2.maxWriteBufferNumber()).isEqualTo(22); + assertThat(builder2.inplaceUpdateNumLocks()).isEqualTo(160); + assertThat(builder2.disableAutoCompactions()).isEqualTo(true); + assertThat(builder2.softPendingCompactionBytesLimit()).isEqualTo(1124); + assertThat(builder2.hardPendingCompactionBytesLimit()).isEqualTo(2800); + assertThat(builder2.level0FileNumCompactionTrigger()).isEqualTo(2000); + assertThat(builder2.level0SlowdownWritesTrigger()).isEqualTo(5840); + assertThat(builder2.level0StopWritesTrigger()).isEqualTo(31200); + assertThat(builder2.maxCompactionBytes()).isEqualTo(112); + assertThat(builder2.targetFileSizeBase()).isEqualTo(999); + assertThat(builder2.targetFileSizeMultiplier()).isEqualTo(1120); + } + } + } + + /** + * Validate the round-trip of blob options into and out of the C++ core of RocksDB + * From {RocksDB#setOptions} to {RocksDB#getOptions} + * Uses 2x column families with different values for their options. + * NOTE that some constraints are applied to the options in the C++ core, + * e.g. 
on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio} + * + * @throws RocksDBException if a database access has an error + */ + @Test + public void testGetMutableBlobOptionsAfterSet() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + final MutableColumnFamilyOptions + .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions = + MutableColumnFamilyOptions.builder() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true) + .setBlobGarbageCollectionAgeCutoff(0.25) + .setBlobGarbageCollectionForceThreshold(0.80) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.17) + .setMemtableHugePageSize(3) + .setMaxSuccessiveMerges(4) + .setMaxWriteBufferNumber(12) + .setInplaceUpdateNumLocks(16) + .setDisableAutoCompactions(false) + .setSoftPendingCompactionBytesLimit(112) + .setHardPendingCompactionBytesLimit(280) + .setLevel0FileNumCompactionTrigger(200) + .setLevel0SlowdownWritesTrigger(312) + .setLevel0StopWritesTrigger(584) + .setMaxCompactionBytes(12) + .setTargetFileSizeBase(99) + .setTargetFileSizeMultiplier(112); + db.setOptions(mutableColumnFamilyOptions.build()); + + // Check the getOptions() brings back the creation options for CF1 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = db.getOptions(); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + 
assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder1.arenaBlockSize()).isEqualTo(42); + assertThat(builder1.memtableHugePageSize()).isEqualTo(3); + assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17); + assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4); + assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12); + assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16); + assertThat(builder1.disableAutoCompactions()).isEqualTo(false); + assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112); + assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280); + assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200); + assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312); + assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584); + assertThat(builder1.maxCompactionBytes()).isEqualTo(12); + assertThat(builder1.targetFileSizeBase()).isEqualTo(99); + assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112); + } + } + + @Test + public void testGetMutableDBOptionsAfterSet() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + final MutableDBOptions.MutableDBOptionsBuilder mutableDBOptions = + MutableDBOptions.builder() + .setMaxBackgroundJobs(16) + .setAvoidFlushDuringShutdown(true) + .setWritableFileMaxBufferSize(2097152) + 
.setDelayedWriteRate(67108864) + .setMaxTotalWalSize(16777216) + .setDeleteObsoleteFilesPeriodMicros(86400000000L) + .setStatsDumpPeriodSec(1200) + .setStatsPersistPeriodSec(7200) + .setStatsHistoryBufferSize(6291456) + .setMaxOpenFiles(8) + .setBytesPerSync(4194304) + .setWalBytesPerSync(1048576) + .setStrictBytesPerSync(true) + .setCompactionReadaheadSize(1024); + + db.setDBOptions(mutableDBOptions.build()); + + final MutableDBOptions.MutableDBOptionsBuilder getBuilder = db.getDBOptions(); + assertThat(getBuilder.maxBackgroundJobs()).isEqualTo(16); // 4 + assertThat(getBuilder.avoidFlushDuringShutdown()).isEqualTo(true); // false + assertThat(getBuilder.writableFileMaxBufferSize()).isEqualTo(2097152); // 1048576 + assertThat(getBuilder.delayedWriteRate()).isEqualTo(67108864); // 16777216 + assertThat(getBuilder.maxTotalWalSize()).isEqualTo(16777216); + assertThat(getBuilder.deleteObsoleteFilesPeriodMicros()) + .isEqualTo(86400000000L); // 21600000000 + assertThat(getBuilder.statsDumpPeriodSec()).isEqualTo(1200); // 600 + assertThat(getBuilder.statsPersistPeriodSec()).isEqualTo(7200); // 600 + assertThat(getBuilder.statsHistoryBufferSize()).isEqualTo(6291456); // 1048576 + assertThat(getBuilder.maxOpenFiles()).isEqualTo(8); //-1 + assertThat(getBuilder.bytesPerSync()).isEqualTo(4194304); // 1048576 + assertThat(getBuilder.walBytesPerSync()).isEqualTo(1048576); // 0 + assertThat(getBuilder.strictBytesPerSync()).isEqualTo(true); // false + assertThat(getBuilder.compactionReadaheadSize()).isEqualTo(1024); // 0 + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -15,6 +15,9 @@ import static org.junit.Assert.assertEquals; public class NativeComparatorWrapperTest { + static { + RocksDB.loadLibrary(); + } @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,16 +5,18 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.*; + +import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; - +import java.util.concurrent.atomic.AtomicBoolean; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; -import static org.assertj.core.api.Assertions.assertThat; - - public class OptionsTest { @ClassRule @@ -685,6 +687,16 @@ } @Test + public void setWriteBufferManagerWithAllowStall() throws RocksDBException { + try (final Options opt = new Options(); final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache, true)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + assertThat(opt.writeBufferManager().allowStall()).isEqualTo(true); + } + } + + @Test public void accessHintOnCompactionStart() { try (final Options opt = new Options()) { final AccessHint accessHint = AccessHint.SEQUENTIAL; @@ -1255,6 +1267,14 @@ } @Test + public void 
periodicCompactionSeconds() { + try (final Options options = new Options()) { + options.setPeriodicCompactionSeconds(1000 * 60); + assertThat(options.periodicCompactionSeconds()).isEqualTo(1000 * 60); + } + } + + @Test public void compactionOptionsUniversal() { try (final Options options = new Options(); final CompactionOptionsUniversal optUni = new CompactionOptionsUniversal() @@ -1308,4 +1328,164 @@ } } + @Test + public void compactionThreadLimiter() { + try (final Options options = new Options(); + final ConcurrentTaskLimiter compactionThreadLimiter = + new ConcurrentTaskLimiterImpl("name", 3)) { + options.setCompactionThreadLimiter(compactionThreadLimiter); + assertThat(options.compactionThreadLimiter()).isEqualTo(compactionThreadLimiter); + } + } + + @Test + public void oldDefaults() { + try (final Options options = new Options()) { + options.oldDefaults(4, 6); + assertThat(options.writeBufferSize()).isEqualTo(4 << 20); + assertThat(options.compactionPriority()).isEqualTo(CompactionPriority.ByCompensatedSize); + assertThat(options.targetFileSizeBase()).isEqualTo(2 * 1048576); + assertThat(options.maxBytesForLevelBase()).isEqualTo(10 * 1048576); + assertThat(options.softPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.hardPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.level0StopWritesTrigger()).isEqualTo(24); + } + } + + @Test + public void optimizeForSmallDbWithCache() { + try (final Options options = new Options(); final Cache cache = new LRUCache(1024)) { + assertThat(options.optimizeForSmallDb(cache)).isEqualTo(options); + } + } + + @Test + public void cfPaths() { + try (final Options options = new Options()) { + final List paths = Arrays.asList( + new DbPath(Paths.get("test1"), 2 << 25), new DbPath(Paths.get("/test2/path"), 2 << 25)); + assertThat(options.cfPaths()).isEqualTo(Collections.emptyList()); + assertThat(options.setCfPaths(paths)).isEqualTo(options); + assertThat(options.cfPaths()).isEqualTo(paths); + } + } + + 
@Test + public void avoidUnnecessaryBlockingIO() { + try (final Options options = new Options()) { + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(false); + assertThat(options.setAvoidUnnecessaryBlockingIO(true)).isEqualTo(options); + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(true); + } + } + + @Test + public void persistStatsToDisk() { + try (final Options options = new Options()) { + assertThat(options.persistStatsToDisk()).isEqualTo(false); + assertThat(options.setPersistStatsToDisk(true)).isEqualTo(options); + assertThat(options.persistStatsToDisk()).isEqualTo(true); + } + } + + @Test + public void writeDbidToManifest() { + try (final Options options = new Options()) { + assertThat(options.writeDbidToManifest()).isEqualTo(false); + assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options); + assertThat(options.writeDbidToManifest()).isEqualTo(true); + } + } + + @Test + public void logReadaheadSize() { + try (final Options options = new Options()) { + assertThat(options.logReadaheadSize()).isEqualTo(0); + final int size = 1024 * 1024 * 100; + assertThat(options.setLogReadaheadSize(size)).isEqualTo(options); + assertThat(options.logReadaheadSize()).isEqualTo(size); + } + } + + @Test + public void bestEffortsRecovery() { + try (final Options options = new Options()) { + assertThat(options.bestEffortsRecovery()).isEqualTo(false); + assertThat(options.setBestEffortsRecovery(true)).isEqualTo(options); + assertThat(options.bestEffortsRecovery()).isEqualTo(true); + } + } + + @Test + public void maxBgerrorResumeCount() { + try (final Options options = new Options()) { + final int INT_MAX = 2147483647; + assertThat(options.maxBgerrorResumeCount()).isEqualTo(INT_MAX); + assertThat(options.setMaxBgErrorResumeCount(-1)).isEqualTo(options); + assertThat(options.maxBgerrorResumeCount()).isEqualTo(-1); + } + } + + @Test + public void bgerrorResumeRetryInterval() { + try (final Options options = new Options()) { + 
assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(1000000); + final long newRetryInterval = 24 * 3600 * 1000000L; + assertThat(options.setBgerrorResumeRetryInterval(newRetryInterval)).isEqualTo(options); + assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(newRetryInterval); + } + } + + @Test + public void maxWriteBatchGroupSizeBytes() { + try (final Options options = new Options()) { + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(1024 * 1024); + final long size = 1024 * 1024 * 1024 * 10L; + assertThat(options.setMaxWriteBatchGroupSizeBytes(size)).isEqualTo(options); + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(size); + } + } + + @Test + public void skipCheckingSstFileSizesOnDbOpen() { + try (final Options options = new Options()) { + assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false); + assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options); + assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); + } + } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final Options options = new Options(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + 
assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -31,115 +31,60 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath())) { - assertThat("value"). - isEqualTo(new String(db2.get("key".getBytes()))); - } + } + try (final RocksDB db = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath())) { + assertThat("value").isEqualTo(new String(db.get("key".getBytes()))); } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { final List cfDescriptors = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List columnFamilyHandleList = new ArrayList<>(); - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts = new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts = new ColumnFamilyOptions() - ) { - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), newCfOpts))); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2".getBytes(), newCf2Opts))); - db.put(columnFamilyHandleList.get(2), "key2".getBytes(), - 
"value2".getBytes()); - - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts2 = - new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts2 = - new ColumnFamilyOptions() - ) { - assertThat(db2.get("key2".getBytes())).isNull(); - assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), - "key2".getBytes())). - isNull(); - cfDescriptors.clear(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - newCfOpts2)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), - newCf2Opts2)); - - final List readOnlyColumnFamilyHandleList2 - = new ArrayList<>(); - try (final RocksDB db3 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList2)) { - try { - assertThat(new String(db3.get( - readOnlyColumnFamilyHandleList2.get(1), - "key2".getBytes()))).isEqualTo("value2"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList2) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + try (final RocksDB db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpts))); + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes()); + } + + columnFamilyHandleList.clear(); + try (final RocksDB db = 
RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(db.get("key2".getBytes())).isNull(); + assertThat(db.get(columnFamilyHandleList.get(0), "key2".getBytes())).isNull(); + } + + cfDescriptors.clear(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts)); + columnFamilyHandleList.clear(); + try (final RocksDB db = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(new String(db.get(columnFamilyHandleList.get(1), "key2".getBytes()))) + .isEqualTo("value2"); } } } @Test(expected = RocksDBException.class) public void failToWriteInReadOnly() throws RocksDBException { - try (final Options options = new Options() - .setCreateIfMissing(true)) { - - try (final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { - //no-op + try (final Options options = new Options().setCreateIfMissing(true)) { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + // no-op } } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try { - // test that put fails in readonly mode - rDb.put("key".getBytes(), "value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List readOnlyColumnFamilyHandleList = new ArrayList<>(); + try (final 
RocksDB rDb = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, readOnlyColumnFamilyHandleList)) { + // test that put fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); } } } @@ -161,15 +106,7 @@ try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), "value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); } } } @@ -193,14 +130,7 @@ try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.delete("key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.delete("key".getBytes()); } } } @@ -223,15 +153,8 @@ try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { rDb.delete(readOnlyColumnFamilyHandleList.get(0), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -256,15 +179,8 @@ readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { wb.put("key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -289,16 +205,29 @@ readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { 
wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + } + } + } + + @Test(expected = RocksDBException.class) + public void errorIfWalFileExists() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + // no-op + } + + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + + final List readOnlyColumnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions(); + final RocksDB rDb = RocksDB.openReadOnly(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, readOnlyColumnFamilyHandleList, true);) { + // no-op... 
should have raised an error as errorIfWalFileExists=true } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,11 +39,15 @@ opt.setFillCache(false); opt.setIterateUpperBound(buildRandomSlice()); opt.setIterateLowerBound(buildRandomSlice()); + opt.setTimestamp(buildRandomSlice()); + opt.setIterStartTs(buildRandomSlice()); try (final ReadOptions other = new ReadOptions(opt)) { assertThat(opt.verifyChecksums()).isEqualTo(other.verifyChecksums()); assertThat(opt.fillCache()).isEqualTo(other.fillCache()); assertThat(Arrays.equals(opt.iterateUpperBound().data(), other.iterateUpperBound().data())).isTrue(); assertThat(Arrays.equals(opt.iterateLowerBound().data(), other.iterateLowerBound().data())).isTrue(); + assertThat(Arrays.equals(opt.timestamp().data(), other.timestamp().data())).isTrue(); + assertThat(Arrays.equals(opt.iterStartTs().data(), other.iterStartTs().data())).isTrue(); } } } @@ -159,6 +163,8 @@ Slice upperBound = buildRandomSlice(); opt.setIterateUpperBound(upperBound); assertThat(Arrays.equals(upperBound.data(), opt.iterateUpperBound().data())).isTrue(); + opt.setIterateUpperBound(null); + assertThat(opt.iterateUpperBound()).isNull(); } } @@ -175,6 +181,8 @@ Slice lowerBound = buildRandomSlice(); opt.setIterateLowerBound(lowerBound); assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue(); + opt.setIterateLowerBound(null); + assertThat(opt.iterateLowerBound()).isNull(); } } @@ -203,6 +211,60 @@ } } + @Test + public void autoPrefixMode() { + try (final ReadOptions opt = new ReadOptions()) { + 
opt.setAutoPrefixMode(true); + assertThat(opt.autoPrefixMode()).isTrue(); + } + } + + @Test + public void timestamp() { + try (final ReadOptions opt = new ReadOptions()) { + Slice timestamp = buildRandomSlice(); + opt.setTimestamp(timestamp); + assertThat(Arrays.equals(timestamp.data(), opt.timestamp().data())).isTrue(); + opt.setTimestamp(null); + assertThat(opt.timestamp()).isNull(); + } + } + + @Test + public void iterStartTs() { + try (final ReadOptions opt = new ReadOptions()) { + Slice itertStartTsSlice = buildRandomSlice(); + opt.setIterStartTs(itertStartTsSlice); + assertThat(Arrays.equals(itertStartTsSlice.data(), opt.iterStartTs().data())).isTrue(); + opt.setIterStartTs(null); + assertThat(opt.iterStartTs()).isNull(); + } + } + + @Test + public void deadline() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setDeadline(1999l); + assertThat(opt.deadline()).isEqualTo(1999l); + } + } + + @Test + public void ioTimeout() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setIoTimeout(34555l); + assertThat(opt.ioTimeout()).isEqualTo(34555l); + } + } + + @Test + public void valueSizeSoftLimit() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setValueSizeSoftLimit(12134324l); + assertThat(opt.valueSizeSoftLimit()).isEqualTo(12134324l); + } + } + @Test public void failSetVerifyChecksumUninitialized() { try (final ReadOptions readOptions = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -1085,6 +1085,57 @@ } @Test + public void continueBackgroundWorkAfterCancelAllBackgroundWork() throws RocksDBException { + final int KEY_SIZE = 20; + final 
int VALUE_SIZE = 300; + try (final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions() + ) { + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts) + ); + + final List columnFamilyHandles = new ArrayList<>(); + // open the database + try (final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles)) { + try { + db.cancelAllBackgroundWork(true); + try { + db.put(new byte[KEY_SIZE], new byte[VALUE_SIZE]); + db.flush(new FlushOptions().setWaitForFlush(true)); + fail("Expected RocksDBException to be thrown if we attempt to trigger a flush after" + + " all background work is cancelled."); + } catch (RocksDBException ignored) { } + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } + } + } + } + + @Test + public void cancelAllBackgroundWorkTwice() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { + // Cancel all background work synchronously + db.cancelAllBackgroundWork(true); + // Cancel all background work asynchronously + db.cancelAllBackgroundWork(false); + } + } + + @Test public void pauseContinueBackgroundWork() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, @@ -1170,7 +1221,6 @@ } } - @Ignore("This test crashes. 
Re-enable after fixing.") @Test public void getApproximateSizes() throws RocksDBException { final byte key1[] = "key1".getBytes(UTF_8); @@ -1185,7 +1235,7 @@ final long[] sizes = db.getApproximateSizes( Arrays.asList( - new Range(new Slice(key1), new Slice(key2)), + new Range(new Slice(key1), new Slice(key1)), new Range(new Slice(key2), new Slice(key3)) ), SizeApproximationFlag.INCLUDE_FILES, @@ -1221,6 +1271,26 @@ } } + @Test + public void getApproximateMemTableStatsSingleKey() throws RocksDBException { + final byte key1[] = "key1".getBytes(UTF_8); + final byte key2[] = "key2".getBytes(UTF_8); + final byte key3[] = "key3".getBytes(UTF_8); + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put(key1, key1); + + final RocksDB.CountAndSize stats = + db.getApproximateMemTableStats(new Range(new Slice(key1), new Slice(key3))); + + assertThat(stats).isNotNull(); + assertThat(stats.count).isEqualTo(1); + assertThat(stats.size).isGreaterThan(1); + } + } + } + @Ignore("TODO(AR) re-enable when ready!") @Test public void compactFiles() throws RocksDBException { @@ -1406,11 +1476,11 @@ try (final RocksDB db = RocksDB.open(options, dbPath)) { final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); assertThat(livefiles).isNotNull(); - assertThat(livefiles.manifestFileSize).isEqualTo(13); + assertThat(livefiles.manifestFileSize).isEqualTo(59); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); - assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000001"); - assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000005"); + assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000004"); + assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007"); } } } @@ -1633,6 +1703,13 @@ } } + @Test + public void rocksdbVersion() { + final RocksDB.Version version = 
RocksDB.rocksdbVersion(); + assertThat(version).isNotNull(); + assertThat(version.getMajor()).isGreaterThan(1); + } + private static class InMemoryTraceWriter extends AbstractTraceWriter { private final List writes = new ArrayList<>(); private volatile boolean closed = false; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -147,6 +147,27 @@ assertThat(iterator.isValid()).isTrue(); assertThat(iterator.key()).isEqualTo("key2".getBytes()); } + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + + byte[] lastKey; + do { + lastKey = iterator.key(); + iterator.next(); + } while (iterator.isValid()); + + db.put("key3".getBytes(), "value3".getBytes()); + assertThat(iterator.isValid()).isFalse(); + iterator.refresh(); + iterator.seek(lastKey); + assertThat(iterator.isValid()).isTrue(); + + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key3".getBytes()); + } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class SecondaryDBTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule public TemporaryFolder secondaryDbFolder = new TemporaryFolder(); + + @Test + public void openAsSecondary() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + db.put("key3".getBytes(), "value3".getBytes()); + + // open secondary + try (final Options secondaryOptions = new Options(); + final RocksDB secondaryDb = + RocksDB.openAsSecondary(secondaryOptions, dbFolder.getRoot().getAbsolutePath(), + secondaryDbFolder.getRoot().getAbsolutePath())) { + assertThat(secondaryDb.get("key1".getBytes())).isEqualTo("value1".getBytes()); + assertThat(secondaryDb.get("key2".getBytes())).isEqualTo("value2".getBytes()); + assertThat(secondaryDb.get("key3".getBytes())).isEqualTo("value3".getBytes()); + + // write to primary + db.put("key4".getBytes(), "value4".getBytes()); + db.put("key5".getBytes(), "value5".getBytes()); + db.put("key6".getBytes(), "value6".getBytes()); + + // tell secondary to catch up + secondaryDb.tryCatchUpWithPrimary(); + + db.put("key7".getBytes(), "value7".getBytes()); + + // check secondary + 
assertThat(secondaryDb.get("key4".getBytes())).isEqualTo("value4".getBytes()); + assertThat(secondaryDb.get("key5".getBytes())).isEqualTo("value5".getBytes()); + assertThat(secondaryDb.get("key6".getBytes())).isEqualTo("value6".getBytes()); + + assertThat(secondaryDb.get("key7".getBytes())).isNull(); + } + } + } + + @Test + public void openAsSecondaryColumnFamilies() throws RocksDBException { + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes(), cfOpts)); + + final List cfHandles = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, cfHandles)) { + try { + final ColumnFamilyHandle cf1 = cfHandles.get(1); + + db.put(cf1, "key1".getBytes(), "value1".getBytes()); + db.put(cf1, "key2".getBytes(), "value2".getBytes()); + db.put(cf1, "key3".getBytes(), "value3".getBytes()); + + final List secondaryCfHandles = new ArrayList<>(); + + // open secondary + try (final DBOptions secondaryOptions = new DBOptions(); + final RocksDB secondaryDb = + RocksDB.openAsSecondary(secondaryOptions, dbFolder.getRoot().getAbsolutePath(), + secondaryDbFolder.getRoot().getAbsolutePath(), cfDescriptors, + secondaryCfHandles)) { + try { + final ColumnFamilyHandle secondaryCf1 = secondaryCfHandles.get(1); + + assertThat(secondaryDb.get(secondaryCf1, "key1".getBytes())) + .isEqualTo("value1".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key2".getBytes())) + .isEqualTo("value2".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key3".getBytes())) + .isEqualTo("value3".getBytes()); + + // write to primary + db.put(cf1, "key4".getBytes(), "value4".getBytes()); + db.put(cf1, 
"key5".getBytes(), "value5".getBytes()); + db.put(cf1, "key6".getBytes(), "value6".getBytes()); + + // tell secondary to catch up + secondaryDb.tryCatchUpWithPrimary(); + + db.put(cf1, "key7".getBytes(), "value7".getBytes()); + + // check secondary + assertThat(secondaryDb.get(secondaryCf1, "key4".getBytes())) + .isEqualTo("value4".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key5".getBytes())) + .isEqualTo("value5".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key6".getBytes())) + .isEqualTo("value6".getBytes()); + + assertThat(secondaryDb.get(secondaryCf1, "key7".getBytes())).isNull(); + + } finally { + for (final ColumnFamilyHandle secondaryCfHandle : secondaryCfHandles) { + secondaryCfHandle.close(); + } + } + } + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class SstPartitionerTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void sstFixedPrefix() throws RocksDBException { + try (SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4); + final Options opt = + new Options().setCreateIfMissing(true).setSstPartitionerFactory(factory); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + // writing (long)100 under key + db.put("aaaa1".getBytes(), "A".getBytes()); + db.put("bbbb1".getBytes(), "B".getBytes()); + db.flush(new FlushOptions()); + + db.put("aaaa0".getBytes(), "A2".getBytes()); + db.put("aaaa2".getBytes(), "A2".getBytes()); + db.flush(new FlushOptions()); + + db.compactRange(); + + List metadata = db.getLiveFilesMetaData(); + assertThat(metadata.size()).isEqualTo(2); + } + } + + @Test + public void sstFixedPrefixFamily() throws RocksDBException { + final byte[] cfName = "new_cf".getBytes(UTF_8); + final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName, + new ColumnFamilyOptions().setSstPartitionerFactory( + new SstPartitionerFixedPrefixFactory(4))); + + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor); + + // writing (long)100 under key + db.put(columnFamilyHandle, "aaaa1".getBytes(), "A".getBytes()); + db.put(columnFamilyHandle, "bbbb1".getBytes(), "B".getBytes()); + db.flush(new FlushOptions(), 
columnFamilyHandle); + + db.put(columnFamilyHandle, "aaaa0".getBytes(), "A2".getBytes()); + db.put(columnFamilyHandle, "aaaa2".getBytes(), "A2".getBytes()); + db.flush(new FlushOptions(), columnFamilyHandle); + + db.compactRange(columnFamilyHandle); + + List metadata = db.getLiveFilesMetaData(); + assertThat(metadata.size()).isEqualTo(2); + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -209,7 +209,7 @@ .isSameAs(Transaction.TransactionState.STARTED); txn.commit(); assertThat(txn.getState()) - .isSameAs(Transaction.TransactionState.COMMITED); + .isSameAs(Transaction.TransactionState.COMMITTED); } try(final Transaction txn = dbContainer.beginTransaction()) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -100,7 +100,7 @@ key.clear(); key.put("box".getBytes("US-ASCII")).flip(); - batch.remove(key); + batch.delete(key); assertThat(key.position()).isEqualTo(3); assertThat(key.limit()).isEqualTo(3); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,9 @@ import static org.assertj.core.api.Assertions.assertThat; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -102,6 +104,95 @@ } @Test + public void readYourOwnWritesCf() throws RocksDBException { + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + + final List columnFamilyHandleList = new ArrayList<>(); + + // Test open database with column family names + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + final ColumnFamilyHandle newCf = columnFamilyHandleList.get(1); + + try { + final byte[] k1 = "key1".getBytes(); + final byte[] v1 = "value1".getBytes(); + final byte[] k2 = "key2".getBytes(); + final byte[] v2 = "value2".getBytes(); + + db.put(newCf, k1, v1); + db.put(newCf, k2, v2); + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator base = db.newIterator(newCf, readOptions); + final RocksIterator it = wbwi.newIteratorWithBase(newCf, base, readOptions)) { + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1); + + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2); + + // put data to the write batch and make sure we can 
read it. + final byte[] k3 = "key3".getBytes(); + final byte[] v3 = "value3".getBytes(); + wbwi.put(newCf, k3, v3); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3); + + // update k2 in the write batch and check the value + final byte[] v2Other = "otherValue2".getBytes(); + wbwi.put(newCf, k2, v2Other); + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2Other); + + // delete k1 and make sure we can read back the write + wbwi.delete(newCf, k1); + it.seek(k1); + assertThat(it.key()).isNotEqualTo(k1); + + // reinsert k1 and make sure we see the new value + final byte[] v1Other = "otherValue1".getBytes(); + wbwi.put(newCf, k1, v1Other); + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1Other); + + // single remove k3 and make sure we can read back the write + wbwi.singleDelete(newCf, k3); + it.seek(k3); + assertThat(it.isValid()).isEqualTo(false); + + // reinsert k3 and make sure we see the new value + final byte[] v3Other = "otherValue3".getBytes(); + wbwi.put(newCf, k3, v3Other); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3Other); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + + @Test public void writeBatchWithIndex() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, @@ -563,4 +654,106 @@ assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes()); } } + + @Test + public void iteratorWithBaseOverwriteTrue() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, 
dbFolder.getRoot().getAbsolutePath())) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + } + + @Test + public void iteratorWithBaseOverwriteFalse() throws 
RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + 
assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb.test; + +import org.rocksdb.AbstractEventListener; + +public class TestableEventListener extends AbstractEventListener { + public TestableEventListener() { + super(); + } + + public TestableEventListener(final EnabledEventCallback... 
enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + public void invokeAllCallbacks() { + invokeAllCallbacks(nativeHandle_); + } + + private static native void invokeAllCallbacks(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -263,7 +263,7 @@ for (int i = 0; i < num_iter_ops; i++) { // Random walk and make sure iter and result_iter returns the // same key and value - final int type = rnd.nextInt(7); + final int type = rnd.nextInt(8); iter.status(); switch (type) { case 0: @@ -310,8 +310,15 @@ continue; } break; + case 6: + // Refresh + iter.refresh(); + result_iter.refresh(); + iter.seekToFirst(); + result_iter.seekToFirst(); + break; default: { - assert (type == 6); + assert (type == 7); final int key_idx = rnd.nextInt(source_strings.size()); final String key = source_strings.get(key_idx); final byte[] result = db.get(readOptions, bytes(key)); @@ -473,6 +480,11 @@ } @Override + public void refresh() throws RocksDBException { + offset = -1; + } + + @Override public void status() throws RocksDBException { if(offset < 0 || offset >= entries.size()) { throw new RocksDBException("Index out of bounds. 
Size is: " + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java 2025-05-19 16:14:27.000000000 +0000 @@ -119,6 +119,11 @@ events.add(new Event(Action.MARK_COMMIT, (byte[])null, (byte[])null)); } + @Override + public void markCommitWithTimestamp(final byte[] xid, final byte[] ts) throws RocksDBException { + events.add(new Event(Action.MARK_COMMIT_WITH_TIMESTAMP, (byte[]) null, (byte[]) null)); + } + public static class Event { public final Action action; public final int columnFamilyId; @@ -156,8 +161,10 @@ @Override public int hashCode() { - - return Objects.hash(action, columnFamilyId, key, value); + int result = Objects.hash(action, columnFamilyId); + result = 31 * result + Arrays.hashCode(key); + result = 31 * result + Arrays.hashCode(value); + return result; } } @@ -166,7 +173,18 @@ * event actions */ public enum Action { - PUT, MERGE, DELETE, SINGLE_DELETE, DELETE_RANGE, LOG, PUT_BLOB_INDEX, - MARK_BEGIN_PREPARE, MARK_END_PREPARE, MARK_NOOP, MARK_COMMIT, - MARK_ROLLBACK } + PUT, + MERGE, + DELETE, + SINGLE_DELETE, + DELETE_RANGE, + LOG, + PUT_BLOB_INDEX, + MARK_BEGIN_PREPARE, + MARK_END_PREPARE, + MARK_NOOP, + MARK_COMMIT, + MARK_ROLLBACK, + MARK_COMMIT_WITH_TIMESTAMP + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,6 @@ import org.junit.Test; import java.lang.reflect.Field; -import java.lang.reflect.Modifier; import static org.assertj.core.api.Assertions.assertThat; @@ -37,23 +36,38 @@ isEqualTo(".jnilib"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dylib"); } @Test - public void mac64() { - setEnvironmentClassFields("mac", "64"); + public void mac64_x86_64() { + setEnvironmentClassFields("mac", "x86_64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".jnilib"); - assertThat(Environment.getJniLibraryFileName("rocksdb")). - isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx-x86_64.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx.jnilib"); assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
isEqualTo("librocksdbjni.dylib"); } @Test + public void macAarch64() { + setEnvironmentClassFields("mac", "aarch64"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".jnilib"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx-arm64.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.dylib"); + } + + @Test public void nix32() { // Linux setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); @@ -63,6 +77,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) @@ -93,7 +108,8 @@ assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - Environment.getJniLibraryFileName("rocksdb"); + assertThat(Environment.getJniLibraryFileName("rocksdb")).isEqualTo("blah"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); } @Test @@ -105,6 +121,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) @@ -114,6 +131,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
isEqualTo("librocksdbjni.so"); // UNIX @@ -124,6 +142,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // AIX @@ -133,6 +152,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-aix64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); } @@ -151,6 +171,7 @@ isEqualTo(".dll"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-win64.dll"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dll"); } @@ -167,6 +188,7 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-ppc64le.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); @@ -179,12 +201,13 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-ppc64le-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); } @Test - public void aarch64() { + public void linuxArch64() { setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); 
setEnvironmentClassFields("Linux", "aarch64"); assertThat(Environment.isUnix()).isTrue(); @@ -195,6 +218,7 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-aarch64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); @@ -207,6 +231,7 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-aarch64-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,14 +5,14 @@ package org.rocksdb.util; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.nio.ByteBuffer; +import java.util.Random; import org.rocksdb.CompactionPriority; import org.rocksdb.Options; import org.rocksdb.WALRecoveryMode; -import java.util.Random; - -import static java.nio.charset.StandardCharsets.UTF_8; - /** * General test utilities. 
*/ @@ -58,4 +58,15 @@ random.nextBytes(str); return str; } + + /** + * Copy a {@link ByteBuffer} into an array for shorthand ease of test coding + * @param byteBuffer the buffer to copy + * @return a {@link byte[]} containing the same bytes as the input + */ + public static byte[] bufferBytes(final ByteBuffer byteBuffer) { + final byte[] result = new byte[byteBuffer.limit()]; + byteBuffer.get(result); + return result; + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java 2025-05-19 16:14:27.000000000 +0000 @@ -131,4 +131,9 @@ public void markCommit(final byte[] xid) throws RocksDBException { throw new UnsupportedOperationException(); } + + @Override + public void markCommitWithTimestamp(final byte[] xid, final byte[] ts) throws RocksDBException { + throw new UnsupportedOperationException(); + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/understanding_options.md mariadb-10.11.13/storage/rocksdb/rocksdb/java/understanding_options.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/understanding_options.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/understanding_options.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,79 @@ +# How RocksDB Options and their Java Wrappers Work + +Options in RocksDB come in many different flavours. This is an attempt at a taxonomy and explanation. + +## RocksDB Options + +Initially, I believe, RocksDB had only database options. I don't know if any of these were mutable. Column families came later. Read on to understand the terminology. 
+ +So to begin, one sets up a collection of options and starts/creates a database with these options. That's a useful way to think about it, because from a Java point-of-view (and I didn't realise this initially and got very confused), despite making native calls to C++, the `API`s are just manipulating a native C++ configuration object. This object is just a record of configuration, and it must later be passed to the database (at create or open time) in order to apply the options. + +### Database versus Column Family + +The concept of the *column family* or `CF` is widespread within RocksDB. I think of it as a data namespace, but conveniently transactions can operate across these namespaces. The concept of a default column family exists, and when operations do not refer to a particular `CF`, it refers to the default. + +We raise this w.r.t. options because many options, perhaps most that users encounter, are *column family options*. That is to say they apply individually to a particular column family, or to the default column family. Crucially also, many/most/all of these same options are exposed as *database options* and then apply as the default for column families which do not have the option set explicitly. Obviously some database options are naturally database-wide; they apply to the operation of the database and don't make any sense applied to a column family. + +### Mutability + +There are 2 kinds of options + +- Mutable options +- Immutable options. We name these in contrast to the mutable ones, but they are usually referred to unqualified. + +Mutable options are those which can be changed on a running `RocksDB` instance. Immutable options can only be configured prior to the start of a database. Of course, we can configure the immutable options at this time too; The entirety of options is a strict superset of the mutable options. 
+ +Mutable options (whether column-family specific or database-wide) are manipulated at runtime with builders, so we have `MutableDBOptions.MutableDBOptionsBuilder` and `MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder` which share tooling classes/hierarchy and maintain and manipulate the relevant options as a `(key,value)` map. + +Mutable options are then passed using `setOptions()` and `setDBOptions()` methods on the live RocksDB, and then take effect immediately (depending on the semantics of the option) on the database. + +### Advanced + +There are 2 classes of options + +- Advanced options +- Non-advanced options + +It's not clear to me what the conceptual distinction is between advanced and not. However, the Java code takes care to reflect it from the underlying C++. + +This leads to 2 separate type hierarchies within column family options, one for each `class` of options. The `kind`s are represented by where the options appear in their hierarchy. + +```java +interface ColumnFamilyOptionsInterface> + extends AdvancedColumnFamilyOptionsInterface +interface MutableColumnFamilyOptionsInterface> + extends AdvancedMutableColumnFamilyOptionsInterface +``` + +And then there is ultimately a single concrete implementation class for CF options: + +```java +class ColumnFamilyOptions extends RocksObject + implements ColumnFamilyOptionsInterface, + MutableColumnFamilyOptionsInterface +``` + +as there is a single concrete implementation class for DB options: + +```java +class DBOptions extends RocksObject + implements DBOptionsInterface, + MutableDBOptionsInterface +``` + +Interestingly `DBOptionsInterface` doesn't extend `MutableDBOptionsInterface`, if only in order to disrupt our belief in consistent basic laws of the Universe. 
+ +## Startup/Creation Options + +```java +class Options extends RocksObject + implements DBOptionsInterface, + MutableDBOptionsInterface, + ColumnFamilyOptionsInterface, + MutableColumnFamilyOptionsInterface +``` + +### Example - Blob Options + +The `enable_blob_files` and `min_blob_size` options are per-column-family, and are mutable. The options also appear in the unqualified database options. So by initial configuration, we can set up a RocksDB database where for every `(key,value)` with a value of size at least `min_blob_size`, the value is written (indirected) to a blob file. Blobs may share a blob file, subject to the configuration values set. Later, using the `MutableColumnFamilyOptionsInterface` of the `ColumnFamilyOptions`, we can choose to turn this off (`enable_blob_files=false`) , or alter the `min_blob_size` for the default column family, or any other column family. It seems to me that we cannot, though, mutate the column family options for all column families using the +`setOptions()` mechanism, either for all existing column families or for all future column families; but maybe we can do the latter on a re-`open()/create()' diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,8 +6,12 @@ #include "logging/auto_roll_logger.h" #include + #include "file/filename.h" #include "logging/logging.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -15,7 +19,9 @@ #ifndef ROCKSDB_LITE // -- AutoRollLogger -AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, +AutoRollLogger::AutoRollLogger(const std::shared_ptr& fs, + const 
std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, size_t log_file_time_to_roll, @@ -24,36 +30,38 @@ : Logger(log_level), dbname_(dbname), db_log_dir_(db_log_dir), - env_(env), + fs_(fs), + clock_(clock), status_(Status::OK()), kMaxLogFileSize(log_max_size), kLogFileTimeToRoll(log_file_time_to_roll), kKeepLogFileNum(keep_log_file_num), - cached_now(static_cast(env_->NowMicros() * 1e-6)), + cached_now(static_cast(clock_->NowMicros() * 1e-6)), ctime_(cached_now), cached_now_access_count(0), call_NowMicros_every_N_records_(100), mutex_() { - Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + Status s = fs->GetAbsolutePath(dbname, io_options_, &db_absolute_path_, + &io_context_); if (s.IsNotSupported()) { db_absolute_path_ = dbname; } else { status_ = s; } log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - if (env_->FileExists(log_fname_).ok()) { + if (fs_->FileExists(log_fname_, io_options_, &io_context_).ok()) { RollLogFile(); } GetExistingFiles(); - ResetLogger(); - if (status_.ok()) { + s = ResetLogger(); + if (s.ok() && status_.ok()) { status_ = TrimOldLogFiles(); } } Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); - status_ = env_->NewLogger(log_fname_, &logger_); + status_ = fs_->NewLogger(log_fname_, io_options_, &logger_, &io_context_); TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); if (!status_.ok()) { @@ -67,7 +75,7 @@ "The underlying logger doesn't support GetLogFileSize()"); } if (status_.ok()) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = static_cast(clock_->NowMicros() * 1e-6); ctime_ = cached_now; cached_now_access_count = 0; } @@ -79,14 +87,17 @@ // This function is called when log is rotating. Two rotations // can happen quickly (NowMicro returns same value). To not overwrite // previous log file we increment by one micro second and try again. 
- uint64_t now = env_->NowMicros(); + uint64_t now = clock_->NowMicros(); std::string old_fname; do { old_fname = OldInfoLogFileName( dbname_, now, db_absolute_path_, db_log_dir_); now++; - } while (env_->FileExists(old_fname).ok()); - env_->RenameFile(log_fname_, old_fname); + } while (fs_->FileExists(old_fname, io_options_, &io_context_).ok()); + Status s = fs_->RenameFile(log_fname_, old_fname, io_options_, &io_context_); + if (!s.ok()) { + // What should we do on error? + } old_log_files_.push(old_fname); } @@ -100,7 +111,7 @@ std::string parent_dir; std::vector info_log_files; Status s = - GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + GetInfoLogFiles(fs_, db_log_dir_, dbname_, &parent_dir, &info_log_files); if (status_.ok()) { status_ = s; } @@ -114,7 +125,7 @@ } Status AutoRollLogger::TrimOldLogFiles() { - // Here we directly list info files and delete them through Env. + // Here we directly list info files and delete them through FileSystem. // The deletion isn't going through DB, so there are shortcomes: // 1. the deletion is not rate limited by SstFileManager // 2. there is a chance that an I/O will be issued here @@ -127,7 +138,8 @@ // it's essentially the same thing, and checking empty before accessing // the queue feels safer. while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { - Status s = env_->DeleteFile(old_log_files_.front()); + Status s = + fs_->DeleteFile(old_log_files_.front(), io_options_, &io_context_); // Remove the file from the tracking anyway. It's possible that // DB cleaned up the old log file, or people cleaned it up manually. 
old_log_files_.pop(); @@ -238,7 +250,7 @@ bool AutoRollLogger::LogExpired() { if (cached_now_access_count >= call_NowMicros_every_N_records_) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = static_cast(clock_->NowMicros() * 1e-6); cached_now_access_count = 0; } @@ -257,19 +269,24 @@ Env* env = options.env; std::string db_absolute_path; - env->GetAbsolutePath(dbname, &db_absolute_path); + Status s = env->GetAbsolutePath(dbname, &db_absolute_path); + if (!s.ok()) { + return s; + } std::string fname = InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); - env->CreateDirIfMissing(dbname); // In case it does not exist + const auto& clock = env->GetSystemClock(); + env->CreateDirIfMissing(dbname) + .PermitUncheckedError(); // In case it does not exist // Currently we only support roll by time-to-roll and log size #ifndef ROCKSDB_LITE if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( - env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.keep_log_file_num, - options.info_log_level); - Status s = result->GetStatus(); + env->GetFileSystem(), clock, dbname, options.db_log_dir, + options.max_log_file_size, options.log_file_time_to_roll, + options.keep_log_file_num, options.info_log_level); + s = result->GetStatus(); if (!s.ok()) { delete result; } else { @@ -279,11 +296,19 @@ } #endif // !ROCKSDB_LITE // Open a log file in the same directory as the db - env->RenameFile(fname, - OldInfoLogFileName(dbname, env->NowMicros(), db_absolute_path, - options.db_log_dir)); - auto s = env->NewLogger(fname, logger); - if (logger->get() != nullptr) { + s = env->FileExists(fname); + if (s.ok()) { + s = env->RenameFile( + fname, OldInfoLogFileName(dbname, clock->NowMicros(), db_absolute_path, + options.db_log_dir)); + } else if (s.IsNotFound()) { + // "LOG" is not required to exist since this could be a new DB. 
+ s = Status::OK(); + } + if (s.ok()) { + s = env->NewLogger(fname, logger); + } + if (s.ok() && logger->get() != nullptr) { (*logger)->SetInfoLogLevel(options.info_log_level); } return s; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -18,14 +18,18 @@ #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +class FileSystem; +class SystemClock; #ifndef ROCKSDB_LITE // Rolls the log file by size and/or time class AutoRollLogger : public Logger { public: - AutoRollLogger(Env* env, const std::string& dbname, - const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, size_t keep_log_file_num, + AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, + size_t log_max_size, size_t log_file_time_to_roll, + size_t keep_log_file_num, const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; @@ -69,8 +73,9 @@ virtual ~AutoRollLogger() { if (logger_ && !closed_) { - logger_->Close(); + logger_->Close().PermitUncheckedError(); } + status_.PermitUncheckedError(); } using Logger::GetInfoLogLevel; @@ -133,7 +138,8 @@ std::string dbname_; std::string db_log_dir_; std::string db_absolute_path_; - Env* env_; + std::shared_ptr fs_; + std::shared_ptr clock_; std::shared_ptr logger_; // current status of the logger Status status_; @@ -147,11 +153,13 @@ // Full path is stored here. It consumes signifianctly more memory // than only storing file name. Can optimize if it causes a problem. 
std::queue old_log_files_; - // to avoid frequent env->NowMicros() calls, we cached the current time + // to avoid frequent clock->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; uint64_t cached_now_access_count; uint64_t call_NowMicros_every_N_records_; + IOOptions io_options_; + IODebugContext io_context_; mutable port::Mutex mutex_; }; #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,9 @@ #ifndef ROCKSDB_LITE #include "logging/auto_roll_logger.h" -#include + #include + #include #include #include @@ -17,30 +18,19 @@ #include #include #include + +#include "db/db_test_util.h" +#include "env/emulated_clock.h" #include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" namespace ROCKSDB_NAMESPACE { -namespace { -class NoSleepEnv : public EnvWrapper { - public: - NoSleepEnv(Env* base) : EnvWrapper(base) {} - void SleepForMicroseconds(int micros) override { - fake_time_ += static_cast(micros); - } - - uint64_t NowNanos() override { return fake_time_ * 1000; } - - uint64_t NowMicros() override { return fake_time_; } - - private: - uint64_t fake_time_ = 6666666666; -}; -} // namespace // In this test we only want to Log some simple log message with // no format. 
LogMessage() provides such a simple interface and @@ -71,12 +61,14 @@ std::string deleteCmd = "rm -rf " + kTestDir; #endif ASSERT_TRUE(system(deleteCmd.c_str()) == 0); - Env::Default()->CreateDir(kTestDir); + ASSERT_OK(Env::Default()->CreateDir(kTestDir)); } void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message); - void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, + void RollLogFileByTimeTest(const std::shared_ptr& fs, + const std::shared_ptr& sc, + AutoRollLogger* logger, size_t time, const std::string& log_message); // return list of files under kTestDir that contains "LOG" std::vector GetLogFiles() { @@ -157,21 +149,22 @@ ASSERT_TRUE(message_size == logger->GetLogFileSize()); } -void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, - size_t time, - const std::string& log_message) { +void AutoRollLoggerTest::RollLogFileByTimeTest( + const std::shared_ptr& fs, + const std::shared_ptr& sc, AutoRollLogger* logger, size_t time, + const std::string& log_message) { uint64_t expected_ctime; uint64_t actual_ctime; uint64_t total_log_size; - EXPECT_OK(env->GetFileSize(kLogFile, &total_log_size)); + EXPECT_OK(fs->GetFileSize(kLogFile, IOOptions(), &total_log_size, nullptr)); expected_ctime = logger->TEST_ctime(); logger->SetCallNowMicrosEveryNRecords(0); // -- Write to the log for several times, which is supposed // to be finished before time. for (int i = 0; i < 10; ++i) { - env->SleepForMicroseconds(50000); + sc->SleepForMicroseconds(50000); LogMessage(logger, log_message.c_str()); EXPECT_OK(logger->GetStatus()); // Make sure we always write to the same log file (by @@ -186,7 +179,7 @@ } // -- Make the log file expire - env->SleepForMicroseconds(static_cast(time * 1000000)); + sc->SleepForMicroseconds(static_cast(time * 1000000)); LogMessage(logger, log_message.c_str()); // At this time, the new log file should be created. 
@@ -200,15 +193,16 @@ size_t log_max_size = 1024 * 5; size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, - keep_log_file_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_max_size, 0, keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); } TEST_F(AutoRollLoggerTest, RollLogFileByTime) { - NoSleepEnv nse(Env::Default()); + auto nsc = + std::make_shared(SystemClock::Default(), true); size_t time = 2; size_t log_size = 1024 * 5; @@ -217,10 +211,11 @@ InitTestDb(); // -- Test the existence of file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); + AutoRollLogger logger(default_env->GetFileSystem(), nsc, kTestDir, "", + log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(default_env->GetFileSystem(), nsc, &logger, time, kSampleMessage + ":RollLogFileByTime"); } @@ -255,15 +250,17 @@ size_t log_size = 1024; size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", - log_size, 0, keep_log_file_num); + AutoRollLogger* logger = + new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. 
- logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); + logger = new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } @@ -274,16 +271,17 @@ InitTestDb(); - NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, - keep_log_file_num); + auto nsc = + std::make_shared(SystemClock::Default(), true); + AutoRollLogger logger(FileSystem::Default(), nsc, kTestDir, "", log_max_size, + time, keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); // Test the ability to roll by Time - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(FileSystem::Default(), nsc, &logger, time, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); } @@ -292,7 +290,10 @@ // port TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { DBOptions options; - NoSleepEnv nse(Env::Default()); + auto nsc = + std::make_shared(SystemClock::Default(), true); + std::unique_ptr nse(new CompositeEnvWrapper(Env::Default(), nsc)); + std::shared_ptr logger; // Normal logger @@ -311,14 +312,15 @@ kSampleMessage + ":CreateLoggerFromOptions - size"); // Only roll by Time - options.env = &nse; + options.env = nse.get(); InitTestDb(); options.max_log_file_size = 0; options.log_file_time_to_roll = 2; ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); auto_roll_logger = dynamic_cast(logger.get()); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - time"); // roll by both Time and size @@ -330,7 +332,8 @@ dynamic_cast(logger.get()); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, kSampleMessage + ":CreateLoggerFromOptions - 
both"); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); // Set keep_log_file_num @@ -403,8 +406,8 @@ const size_t kMaxFileSize = 512; { size_t log_num = 8; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); RollNTimesBySize(&logger, log_num, kMaxFileSize); ASSERT_EQ(log_num, GetLogFiles().size()); @@ -412,8 +415,8 @@ // Shrink number of files { size_t log_num = 5; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(log_num, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -423,8 +426,8 @@ // Increase number of files again. { size_t log_num = 7; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(6, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -486,7 +489,8 @@ // an extra-scope to force the AutoRollLogger to flush the log file when it // becomes out of scope. 
{ - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -524,7 +528,8 @@ size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), kTestDir, + "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -567,7 +572,7 @@ const std::string fname = path.substr(path.find_last_of("/") + 1); std::vector children; - Env::Default()->GetChildren(dirname, &children); + EXPECT_OK(Env::Default()->GetChildren(dirname, &children)); // We know that the old log files are named [path] // Return all entities that match the pattern @@ -591,8 +596,9 @@ InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, /*db_log_dir=*/"", LOG_MAX_SIZE, + /*log_file_time_to_roll=*/0, /*keep_log_file_num=*/10); if (test_num == 0) { @@ -666,6 +672,50 @@ ASSERT_NOK(CreateLoggerFromOptions("", options, &logger)); ASSERT_TRUE(!logger); } + +TEST_F(AutoRollLoggerTest, RenameOnlyWhenExists) { + InitTestDb(); + SpecialEnv env(Env::Default()); + Options options; + options.env = &env; + + // Originally no LOG exists. Should not see a rename. + { + std::shared_ptr logger; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_EQ(0, env.rename_count_); + } + + // Now a LOG exists. Create a new one should see a rename. 
+ { + std::shared_ptr logger; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_EQ(1, env.rename_count_); + } +} + +TEST_F(AutoRollLoggerTest, RenameError) { + InitTestDb(); + SpecialEnv env(Env::Default()); + env.rename_error_ = true; + Options options; + options.env = &env; + + // Originally no LOG exists. Should not be impacted by rename error. + { + std::shared_ptr logger; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_TRUE(logger != nullptr); + } + + // Now a LOG exists. Rename error should cause failure. + { + std::shared_ptr logger; + ASSERT_NOK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_TRUE(logger == nullptr); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -31,15 +31,16 @@ const std::string& fname, const EnvOptions& options, Env* env, InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), - file_(std::move(writable_file), fname, options, env), - last_flush_micros_(0), env_(env), + clock_(env_->GetSystemClock().get()), + file_(std::move(writable_file), fname, options, clock_), + last_flush_micros_(0), flush_pending_(false) {} ~EnvLogger() { if (!closed_) { closed_ = true; - CloseHelper(); + CloseHelper().PermitUncheckedError(); } } @@ -48,9 +49,9 @@ mutex_.AssertHeld(); if (flush_pending_) { flush_pending_ = false; - file_.Flush(); + file_.Flush().PermitUncheckedError(); } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void Flush() override { @@ -134,9 +135,9 @@ assert(p <= limit); mutex_.Lock(); // We will ignore any error returned by Append(). 
- file_.Append(Slice(base, p - base)); + file_.Append(Slice(base, p - base)).PermitUncheckedError(); flush_pending_ = true; - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = clock_->NowMicros(); if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { FlushLocked(); } @@ -154,11 +155,12 @@ } private: + Env* env_; + SystemClock* clock_; WritableFileWriter file_; mutable port::Mutex mutex_; // Mutex to protect the shared variables below. const static uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; std::atomic flush_pending_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,6 @@ // #include "logging/env_logger.h" -#include "env/mock_env.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/event_logger.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/event_logger.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/event_logger.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/event_logger.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include #include -#include "logging/logging.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/logging.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/logging.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/logging.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/logging.h 2025-05-19 16:14:27.000000000 +0000 @@ -19,9 +19,9 @@ inline const char* RocksLogShorterFileName(const 
char* file) { - // 15 is the length of "logging/logging.h". + // 18 is the length of "logging/logging.h". // If the name of this file changed, please change this number, too. - return file + (sizeof(__FILE__) > 15 ? sizeof(__FILE__) - 15 : 0); + return file + (sizeof(__FILE__) > 18 ? sizeof(__FILE__) - 18 : 0); } // Don't inclide file/line info in HEADER level diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/posix_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/posix_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/posix_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/posix_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ virtual ~PosixLogger() { if (!closed_) { closed_ = true; - PosixCloseHelper(); + PosixCloseHelper().PermitUncheckedError(); } } virtual void Flush() override { @@ -108,15 +108,9 @@ const time_t seconds = now_tv.tv_sec; struct tm t; localtime_r(&seconds, &t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec, - static_cast(now_tv.tv_usec), + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llu ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), static_cast(thread_id)); // Print the message diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,11 +12,13 @@ #include #endif #include + #include "logging/logging.h" #include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -160,7 +162,7 @@ #ifdef 
MAP_HUGETLB if (huge_page_size > 0 && bytes > 0) { - // Allocate from a huge page TBL table. + // Allocate from a huge page TLB table. assert(logger != nullptr); // logger need to be passed in. size_t reserved_size = ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; @@ -170,7 +172,7 @@ if (addr == nullptr) { ROCKS_LOG_WARN(logger, "AllocateAligned fail to allocate huge TLB pages: %s", - strerror(errno)); + errnoStr(errno).c_str()); // fail back to malloc } else { return addr; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.h 2025-05-19 16:14:27.000000000 +0000 @@ -86,7 +86,7 @@ // Number of bytes allocated in one block const size_t kBlockSize; // Array of new[] allocated memory blocks - typedef std::vector Blocks; + using Blocks = std::vector; Blocks blocks_; struct MmapInfo { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/concurrent_arena.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/concurrent_arena.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/concurrent_arena.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/concurrent_arena.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include #include "memory/allocator.h" #include "memory/arena.h" +#include "port/lang.h" #include "port/likely.h" #include "util/core_local.h" #include "util/mutexlock.h" @@ -49,7 +50,7 @@ char* Allocate(size_t bytes) override { return AllocateImpl(bytes, false /*force_arena*/, - [=]() { return arena_.Allocate(bytes); }); + [this, bytes]() { return arena_.Allocate(bytes); }); } char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, @@ -58,9 +59,11 @@ assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) && (rounded_up % sizeof(void*)) == 0); - return AllocateImpl(rounded_up, 
huge_page_size != 0 /*force_arena*/, [=]() { - return arena_.AllocateAligned(rounded_up, huge_page_size, logger); - }); + return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/, + [this, rounded_up, huge_page_size, logger]() { + return arena_.AllocateAligned(rounded_up, + huge_page_size, logger); + }); } size_t ApproximateMemoryUsage() const { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,22 +10,175 @@ #include "port/likely.h" #include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - std::atomic JemallocNodumpAllocator::original_alloc_{nullptr}; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +static std::unordered_map jemalloc_type_info = { +#ifndef ROCKSDB_LITE + {"limit_tcache_size", + {offsetof(struct JemallocAllocatorOptions, limit_tcache_size), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_lower_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_lower_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_upper_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +bool JemallocNodumpAllocator::IsSupported(std::string* why) { +#ifndef ROCKSDB_JEMALLOC + *why = "Not compiled with ROCKSDB_JEMALLOC"; + 
return false; +#else + static const std::string unsupported = + "JemallocNodumpAllocator only available with jemalloc version >= 5 " + "and MADV_DONTDUMP is available."; + if (!HasJemalloc()) { + *why = unsupported; + return false; + } +#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + *why = unsupported; + return false; +#else + return true; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +#endif // ROCKSDB_MALLOC +} JemallocNodumpAllocator::JemallocNodumpAllocator( - JemallocAllocatorOptions& options, - std::unique_ptr&& arena_hooks, unsigned arena_index) + JemallocAllocatorOptions& options) : options_(options), - arena_hooks_(std::move(arena_hooks)), - arena_index_(arena_index), - tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {} +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache), +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + arena_index_(0) { + RegisterOptions(&options_, &jemalloc_type_info); +} + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +JemallocNodumpAllocator::~JemallocNodumpAllocator() { + // Destroy tcache before destroying arena. + autovector tcache_list; + tcache_.Scrape(&tcache_list, nullptr); + for (void* tcache_index : tcache_list) { + DestroyThreadSpecificCache(tcache_index); + } + if (arena_index_ > 0) { + // Destroy arena. Silently ignore error. + Status s = DestroyArena(arena_index_); + assert(s.ok()); + s.PermitUncheckedError(); + } +} + +size_t JemallocNodumpAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return malloc_usable_size(static_cast(p)); +} + +void* JemallocNodumpAllocator::Allocate(size_t size) { + int tcache_flag = GetThreadSpecificCache(size); + return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); +} + +void JemallocNodumpAllocator::Deallocate(void* p) { + // Obtain tcache. 
+ size_t size = 0; + if (options_.limit_tcache_size) { + size = malloc_usable_size(p); + } + int tcache_flag = GetThreadSpecificCache(size); + // No need to pass arena index to dallocx(). Jemalloc will find arena index + // from its own metadata. + dallocx(p, tcache_flag); +} + +Status JemallocNodumpAllocator::InitializeArenas() { + // Create arena. + size_t arena_index_size = sizeof(arena_index_); + int ret = + mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to create jemalloc arena, error code: " + + ROCKSDB_NAMESPACE::ToString(ret)); + } + assert(arena_index_ != 0); + + // Read existing hooks. + std::string key = + "arena." + ROCKSDB_NAMESPACE::ToString(arena_index_) + ".extent_hooks"; + extent_hooks_t* hooks; + size_t hooks_size = sizeof(hooks); + ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to read existing hooks, error code: " + + ROCKSDB_NAMESPACE::ToString(ret)); + } + + // Store existing alloc. + extent_alloc_t* original_alloc = hooks->alloc; + extent_alloc_t* expected = nullptr; + bool success = + JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( + expected, original_alloc); + if (!success && original_alloc != expected) { + return Status::Incomplete("Original alloc conflict."); + } + // Set the custom hook. 
+ arena_hooks_.reset(new extent_hooks_t(*hooks)); + arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc; + extent_hooks_t* hooks_ptr = arena_hooks_.get(); + ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); + if (ret != 0) { + return Status::Incomplete("Failed to set custom hook, error code: " + + ROCKSDB_NAMESPACE::ToString(ret)); + } + return Status::OK(); +} + +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +Status JemallocNodumpAllocator::PrepareOptions( + const ConfigOptions& config_options) { + std::string message; + + if (!IsSupported(&message)) { + return Status::NotSupported(message); + } else if (options_.limit_tcache_size && + options_.tcache_size_lower_bound >= + options_.tcache_size_upper_bound) { + return Status::InvalidArgument( + "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); + } else if (IsMutable()) { + Status s = MemoryAllocator::PrepareOptions(config_options); +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + if (s.ok()) { + s = InitializeArenas(); + } +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + return s; + } else { + // Already prepared + return Status::OK(); + } +} + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) { // We always enable tcache. The only corner case is when there are a ton of // threads accessing with low frequency, then it could consume a lot of @@ -50,24 +203,6 @@ } return MALLOCX_TCACHE(*tcache_index); } - -void* JemallocNodumpAllocator::Allocate(size_t size) { - int tcache_flag = GetThreadSpecificCache(size); - return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); -} - -void JemallocNodumpAllocator::Deallocate(void* p) { - // Obtain tcache. - size_t size = 0; - if (options_.limit_tcache_size) { - size = malloc_usable_size(p); - } - int tcache_flag = GetThreadSpecificCache(size); - // No need to pass arena index to dallocx(). Jemalloc will find arena index - // from its own metadata. 
- dallocx(p, tcache_flag); -} - void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr, size_t size, size_t alignment, bool* zero, bool* commit, unsigned arena_ind) { @@ -91,11 +226,12 @@ Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) { assert(arena_index != 0); - std::string key = "arena." + ToString(arena_index) + ".destroy"; + std::string key = + "arena." + ROCKSDB_NAMESPACE::ToString(arena_index) + ".destroy"; int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0); if (ret != 0) { return Status::Incomplete("Failed to destroy jemalloc arena, error code: " + - ToString(ret)); + ROCKSDB_NAMESPACE::ToString(ret)); } return Status::OK(); } @@ -111,96 +247,25 @@ delete tcache_index; } -JemallocNodumpAllocator::~JemallocNodumpAllocator() { - // Destroy tcache before destroying arena. - autovector tcache_list; - tcache_.Scrape(&tcache_list, nullptr); - for (void* tcache_index : tcache_list) { - DestroyThreadSpecificCache(tcache_index); - } - // Destroy arena. Silently ignore error. 
- Status s __attribute__((__unused__)) = DestroyArena(arena_index_); - assert(s.ok()); -} - -size_t JemallocNodumpAllocator::UsableSize(void* p, - size_t /*allocation_size*/) const { - return malloc_usable_size(static_cast(p)); -} #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { - *memory_allocator = nullptr; - Status unsupported = Status::NotSupported( - "JemallocNodumpAllocator only available with jemalloc version >= 5 " - "and MADV_DONTDUMP is available."); -#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - (void)options; - return unsupported; -#else - if (!HasJemalloc()) { - return unsupported; - } if (memory_allocator == nullptr) { return Status::InvalidArgument("memory_allocator must be non-null."); } - if (options.limit_tcache_size && - options.tcache_size_lower_bound >= options.tcache_size_upper_bound) { - return Status::InvalidArgument( - "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); - } - - // Create arena. - unsigned arena_index = 0; - size_t arena_index_size = sizeof(arena_index); - int ret = - mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0); - if (ret != 0) { - return Status::Incomplete("Failed to create jemalloc arena, error code: " + - ToString(ret)); - } - assert(arena_index != 0); - - // Read existing hooks. - std::string key = "arena." + ToString(arena_index) + ".extent_hooks"; - extent_hooks_t* hooks; - size_t hooks_size = sizeof(hooks); - ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); - if (ret != 0) { - JemallocNodumpAllocator::DestroyArena(arena_index); - return Status::Incomplete("Failed to read existing hooks, error code: " + - ToString(ret)); - } - - // Store existing alloc. 
- extent_alloc_t* original_alloc = hooks->alloc; - extent_alloc_t* expected = nullptr; - bool success = - JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( - expected, original_alloc); - if (!success && original_alloc != expected) { - JemallocNodumpAllocator::DestroyArena(arena_index); - return Status::Incomplete("Original alloc conflict."); - } - - // Set the custom hook. - std::unique_ptr new_hooks(new extent_hooks_t(*hooks)); - new_hooks->alloc = &JemallocNodumpAllocator::Alloc; - extent_hooks_t* hooks_ptr = new_hooks.get(); - ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); - if (ret != 0) { - JemallocNodumpAllocator::DestroyArena(arena_index); - return Status::Incomplete("Failed to set custom hook, error code: " + - ToString(ret)); +#ifndef ROCKSDB_JEMALLOC + (void)options; + return Status::NotSupported("Not compiled with JEMALLOC"); +#else + std::unique_ptr allocator( + new JemallocNodumpAllocator(options)); + Status s = allocator->PrepareOptions(ConfigOptions()); + if (s.ok()) { + memory_allocator->reset(allocator.release()); } - - // Create cache allocator. 
- memory_allocator->reset( - new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index)); - return Status::OK(); -#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + return s; +#endif } - } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include "port/port.h" #include "rocksdb/memory_allocator.h" #include "util/thread_local.h" +#include "utilities/memory_allocators.h" #if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX) @@ -19,22 +20,38 @@ #if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP) #define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP +#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX namespace ROCKSDB_NAMESPACE { - -class JemallocNodumpAllocator : public MemoryAllocator { +class JemallocNodumpAllocator : public BaseMemoryAllocator { public: - JemallocNodumpAllocator(JemallocAllocatorOptions& options, - std::unique_ptr&& arena_hooks, - unsigned arena_index); + explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options); +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR ~JemallocNodumpAllocator(); +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + + static const char* kClassName() { return "JemallocNodumpAllocator"; } + const char* Name() const override { return kClassName(); } + static bool IsSupported() { + std::string unused; + return IsSupported(&unused); + } + static bool IsSupported(std::string* why); + bool IsMutable() const { return arena_index_ == 0; } - const char* Name() const override { return "JemallocNodumpAllocator"; } + Status PrepareOptions(const ConfigOptions& config_options) 
override; + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR void* Allocate(size_t size) override; void Deallocate(void* p) override; size_t UsableSize(void* p, size_t allocation_size) const override; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR private: +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + Status InitializeArenas(); + friend Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); @@ -53,7 +70,10 @@ // Get or create tcache. Return flag suitable to use with `mallocx`: // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc). int GetThreadSpecificCache(size_t size); +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + JemallocAllocatorOptions options_; +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR // A function pointer to jemalloc default alloc. Use atomic to make sure // NewJemallocNodumpAllocator is thread-safe. // @@ -61,18 +81,14 @@ // alloc needs to be static to pass to jemalloc as function pointer. static std::atomic original_alloc_; - const JemallocAllocatorOptions options_; - // Custom hooks has to outlive corresponding arena. - const std::unique_ptr arena_hooks_; - - // Arena index. - const unsigned arena_index_; + std::unique_ptr arena_hooks_; // Hold thread-local tcache index. ThreadLocalPtr tcache_; -}; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + // Arena index. + unsigned arena_index_; +}; } // namespace ROCKSDB_NAMESPACE -#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP -#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef MEMKIND +#include +#endif // MEMKIND + +#include "memory/memkind_kmem_allocator.h" + +namespace ROCKSDB_NAMESPACE { +Status MemkindKmemAllocator::PrepareOptions(const ConfigOptions& options) { + std::string message; + if (!IsSupported(&message)) { + return Status::NotSupported(message); + } else { + return MemoryAllocator::PrepareOptions(options); + } +} + +#ifdef MEMKIND +void* MemkindKmemAllocator::Allocate(size_t size) { + void* p = memkind_malloc(MEMKIND_DAX_KMEM, size); + if (p == NULL) { + throw std::bad_alloc(); + } + return p; +} + +void MemkindKmemAllocator::Deallocate(void* p) { + memkind_free(MEMKIND_DAX_KMEM, p); +} + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +size_t MemkindKmemAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return memkind_malloc_usable_size(MEMKIND_DAX_KMEM, p); +} +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#endif // MEMKIND + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "rocksdb/memory_allocator.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { + +class MemkindKmemAllocator : public BaseMemoryAllocator { + public: + static const char* kClassName() { return "MemkindKmemAllocator"; } + const char* Name() const override { return kClassName(); } + static bool IsSupported() { + std::string unused; + return IsSupported(&unused); + } + + static bool IsSupported(std::string* msg) { +#ifdef MEMKIND + (void)msg; + return true; +#else + *msg = "Not compiled with MemKind"; + return false; +#endif + } + Status PrepareOptions(const ConfigOptions& options) override; + +#ifdef MEMKIND + void* Allocate(size_t size) override; + void Deallocate(void* p) override; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + size_t UsableSize(void* p, size_t /*allocation_size*/) const override; +#endif +#endif // MEMKIND +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,91 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/memory_allocator.h" + +#include "memory/jemalloc_nodump_allocator.h" +#include "memory/memkind_kmem_allocator.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map ma_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinAllocators(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + DefaultMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new DefaultMemoryAllocator()); + return guard->get(); + }); + library.AddFactory( + CountedMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new CountedMemoryAllocator( + std::make_shared())); + return guard->get(); + }); + library.AddFactory( + JemallocNodumpAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + if (JemallocNodumpAllocator::IsSupported(errmsg)) { + JemallocAllocatorOptions options; + guard->reset(new JemallocNodumpAllocator(options)); + } + return guard->get(); + }); + library.AddFactory( + MemkindKmemAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + if (MemkindKmemAllocator::IsSupported(errmsg)) { + guard->reset(new MemkindKmemAllocator()); + } + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE +} // namespace + +MemoryAllocatorWrapper::MemoryAllocatorWrapper( + const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", 
&target_, &ma_wrapper_type_info); +} + +Status MemoryAllocator::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinAllocators(*(ObjectLibrary::Default().get()), ""); + }); +#else + if (value == DefaultMemoryAllocator::kClassName()) { + result->reset(new DefaultMemoryAllocator()); + return Status::OK(); + } +#endif // ROCKSDB_LITE + ConfigOptions copy = options; + copy.invoke_prepare_options = true; + return LoadManagedObject(copy, value, result); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,243 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "memory/jemalloc_nodump_allocator.h" +#include "memory/memkind_kmem_allocator.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "table/block_based/block_based_table_factory.h" +#include "test_util/testharness.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO: the tests do not work in LITE mode due to relying on +// `CreateFromString()` to create non-default memory allocators. 
+#ifndef ROCKSDB_LITE + +class MemoryAllocatorTest + : public testing::Test, + public ::testing::WithParamInterface> { + public: + MemoryAllocatorTest() { + std::tie(id_, supported_) = GetParam(); + Status s = + MemoryAllocator::CreateFromString(ConfigOptions(), id_, &allocator_); + if (supported_) { + EXPECT_OK(s); + } else if (!s.ok()) { + EXPECT_TRUE(s.IsNotSupported()); + } + } + bool IsSupported() { return supported_; } + + std::shared_ptr allocator_; + std::string id_; + + private: + bool supported_; +}; + +TEST_P(MemoryAllocatorTest, Allocate) { + if (!IsSupported()) { + return; + } + void* p = allocator_->Allocate(1024); + ASSERT_NE(p, nullptr); + size_t size = allocator_->UsableSize(p, 1024); + ASSERT_GE(size, 1024); + allocator_->Deallocate(p); +} + +TEST_P(MemoryAllocatorTest, CreateAllocator) { + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.ignore_unsupported_options = false; + std::shared_ptr orig, copy; + Status s = MemoryAllocator::CreateFromString(config_options, id_, &orig); + if (!IsSupported()) { + ASSERT_TRUE(s.IsNotSupported()); + } else { + ASSERT_OK(s); + ASSERT_NE(orig, nullptr); +#ifndef ROCKSDB_LITE + std::string str = orig->ToString(config_options); + ASSERT_OK(MemoryAllocator::CreateFromString(config_options, str, ©)); + ASSERT_EQ(orig, copy); +#endif // ROCKSDB_LITE + } +} + +TEST_P(MemoryAllocatorTest, DatabaseBlockCache) { + if (!IsSupported()) { + // Check if a memory node is available for allocation + } + + // Create database with block cache using the MemoryAllocator + Options options; + std::string dbname = test::PerThreadDBPath("allocator_test"); + ASSERT_OK(DestroyDB(dbname, options)); + + options.create_if_missing = true; + BlockBasedTableOptions table_options; + auto cache = NewLRUCache(1024 * 1024, 6, false, false, allocator_); + table_options.block_cache = cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DB* db = nullptr; + Status s = 
DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + ASSERT_LE(cache->GetUsage(), 104); // Cache will contain stats + + // Write 2kB (200 values, each 10 bytes) + int num_keys = 200; + WriteOptions wo; + std::string val = "0123456789"; + for (int i = 0; i < num_keys; i++) { + std::string key = std::to_string(i); + s = db->Put(wo, Slice(key), Slice(val)); + ASSERT_OK(s); + } + ASSERT_OK(db->Flush(FlushOptions())); // Flush all data from memtable so that + // reads are from block cache + + // Read and check block cache usage + ReadOptions ro; + std::string result; + for (int i = 0; i < num_keys; i++) { + std::string key = std::to_string(i); + s = db->Get(ro, key, &result); + ASSERT_OK(s); + ASSERT_EQ(result, val); + } + ASSERT_GT(cache->GetUsage(), 2000); + + // Close database + s = db->Close(); + ASSERT_OK(s); + delete db; + ASSERT_OK(DestroyDB(dbname, options)); +} + +class CreateMemoryAllocatorTest : public testing::Test { + public: + CreateMemoryAllocatorTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = false; + } + ConfigOptions config_options_; +}; + +TEST_F(CreateMemoryAllocatorTest, JemallocOptionsTest) { + std::shared_ptr allocator; + std::string id = std::string("id=") + JemallocNodumpAllocator::kClassName(); + Status s = MemoryAllocator::CreateFromString(config_options_, id, &allocator); + if (!JemallocNodumpAllocator::IsSupported()) { + ASSERT_TRUE(s.IsNotSupported()); + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + return; + } + ASSERT_OK(s); + ASSERT_NE(allocator, nullptr); + JemallocAllocatorOptions jopts; + auto opts = allocator->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + + ASSERT_NOK(MemoryAllocator::CreateFromString( + config_options_, + id + "; 
limit_tcache_size=true; tcache_size_lower_bound=4096; " + "tcache_size_upper_bound=1024", + &allocator)); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, + id + "; limit_tcache_size=false; tcache_size_lower_bound=4096; " + "tcache_size_upper_bound=1024", + &allocator)); + opts = allocator->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, false); + ASSERT_EQ(opts->tcache_size_lower_bound, 4096U); + ASSERT_EQ(opts->tcache_size_upper_bound, 1024U); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, + id + "; limit_tcache_size=true; tcache_size_upper_bound=4096; " + "tcache_size_lower_bound=1024", + &allocator)); + opts = allocator->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, true); + ASSERT_EQ(opts->tcache_size_lower_bound, 1024U); + ASSERT_EQ(opts->tcache_size_upper_bound, 4096U); +} + +TEST_F(CreateMemoryAllocatorTest, NewJemallocNodumpAllocator) { + JemallocAllocatorOptions jopts; + std::shared_ptr allocator; + + jopts.limit_tcache_size = true; + jopts.tcache_size_lower_bound = 2 * 1024; + jopts.tcache_size_upper_bound = 1024; + + ASSERT_NOK(NewJemallocNodumpAllocator(jopts, nullptr)); + Status s = NewJemallocNodumpAllocator(jopts, &allocator); + std::string msg; + if (!JemallocNodumpAllocator::IsSupported(&msg)) { + ASSERT_TRUE(s.IsNotSupported()); + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + return; + } + ASSERT_NOK(s); // Invalid options + ASSERT_EQ(allocator, nullptr); + + jopts.tcache_size_upper_bound = 4 * 1024; + ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator)); + ASSERT_NE(allocator, nullptr); + auto opts = allocator->GetOptions(); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); + + jopts.limit_tcache_size = false; + ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator)); + 
ASSERT_NE(allocator, nullptr); + opts = allocator->GetOptions(); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); +} + +INSTANTIATE_TEST_CASE_P(DefaultMemoryAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple( + DefaultMemoryAllocator::kClassName(), true))); +#ifdef MEMKIND +INSTANTIATE_TEST_CASE_P( + MemkindkMemAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple(MemkindKmemAllocator::kClassName(), + MemkindKmemAllocator::IsSupported()))); +#endif // MEMKIND + +#ifdef ROCKSDB_JEMALLOC +INSTANTIATE_TEST_CASE_P( + JemallocNodumpAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple(JemallocNodumpAllocator::kClassName(), + JemallocNodumpAllocator::IsSupported()))); +#endif // ROCKSDB_JEMALLOC + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_usage.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_usage.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_usage.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_usage.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,11 @@ #pragma once +#include #include +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // Helper methods to estimate memroy usage by std containers. @@ -14,7 +17,7 @@ template size_t ApproximateMemoryUsage( const std::unordered_map& umap) { - typedef std::unordered_map Map; + using Map = std::unordered_map; return sizeof(umap) + // Size of all items plus a next pointer for each item. 
(sizeof(typename Map::value_type) + sizeof(void*)) * umap.size() + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,10 @@ // #ifndef ROCKSDB_LITE -#include "memtable/hash_linklist_rep.h" #include #include + #include "db/memtable.h" #include "memory/arena.h" #include "memtable/skiplist.h" @@ -17,14 +17,15 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/utilities/options_type.h" #include "util/hash.h" namespace ROCKSDB_NAMESPACE { namespace { -typedef const char* Key; -typedef SkipList MemtableSkipList; -typedef std::atomic Pointer; +using Key = const char*; +using MemtableSkipList = SkipList; +using Pointer = std::atomic; // A data structure used as the header of a link list of a hash bucket. 
struct BucketHeader { @@ -218,7 +219,7 @@ } size_t GetHash(const Slice& slice) const { - return fastrange64(GetSliceNPHash64(slice), bucket_size_); + return GetSliceRangedNPHash(slice, bucket_size_); } Pointer* GetBucket(size_t i) const { @@ -820,15 +821,77 @@ return x; } -} // anon namespace +struct HashLinkListRepOptions { + static const char* kName() { return "HashLinkListRepFactoryOptions"; } + size_t bucket_count; + uint32_t threshold_use_skiplist; + size_t huge_page_tlb_size; + int bucket_entries_logging_threshold; + bool if_log_bucket_dist_when_flash; +}; + +static std::unordered_map hash_linklist_info = { + {"bucket_count", + {offsetof(struct HashLinkListRepOptions, bucket_count), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"threshold", + {offsetof(struct HashLinkListRepOptions, threshold_use_skiplist), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"huge_page_size", + {offsetof(struct HashLinkListRepOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"logging_threshold", + {offsetof(struct HashLinkListRepOptions, bucket_entries_logging_threshold), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_when_flash", + {offsetof(struct HashLinkListRepOptions, if_log_bucket_dist_when_flash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory(size_t bucket_count, + uint32_t threshold_use_skiplist, + size_t huge_page_tlb_size, + int bucket_entries_logging_threshold, + bool if_log_bucket_dist_when_flash) { + options_.bucket_count = bucket_count; + options_.threshold_use_skiplist = threshold_use_skiplist; + options_.huge_page_tlb_size = huge_page_tlb_size; + options_.bucket_entries_logging_threshold = + bucket_entries_logging_threshold; + 
options_.if_log_bucket_dist_when_flash = if_log_bucket_dist_when_flash; + RegisterOptions(&options_, &hash_linklist_info); + } + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* logger) override; + + static const char* kClassName() { return "HashLinkListRepFactory"; } + static const char* kNickName() { return "hash_linkedlist"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + + private: + HashLinkListRepOptions options_; +}; + +} // namespace MemTableRep* HashLinkListRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform* transform, Logger* logger) { - return new HashLinkListRep(compare, allocator, transform, bucket_count_, - threshold_use_skiplist_, huge_page_tlb_size_, - logger, bucket_entries_logging_threshold_, - if_log_bucket_dist_when_flash_); + return new HashLinkListRep( + compare, allocator, transform, options_.bucket_count, + options_.threshold_use_skiplist, options_.huge_page_tlb_size, logger, + options_.bucket_entries_logging_threshold, + options_.if_log_bucket_dist_when_flash); } MemTableRepFactory* NewHashLinkListRepFactory( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#ifndef ROCKSDB_LITE -#include "rocksdb/slice_transform.h" -#include "rocksdb/memtablerep.h" - -namespace ROCKSDB_NAMESPACE { - -class HashLinkListRepFactory : public MemTableRepFactory { - public: - explicit HashLinkListRepFactory(size_t bucket_count, - uint32_t threshold_use_skiplist, - size_t huge_page_tlb_size, - int bucket_entries_logging_threshold, - bool if_log_bucket_dist_when_flash) - : bucket_count_(bucket_count), - threshold_use_skiplist_(threshold_use_skiplist), - huge_page_tlb_size_(huge_page_tlb_size), - bucket_entries_logging_threshold_(bucket_entries_logging_threshold), - if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {} - - virtual ~HashLinkListRepFactory() {} - - using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override; - - virtual const char* Name() const override { - return "HashLinkListRepFactory"; - } - - private: - const size_t bucket_count_; - const uint32_t threshold_use_skiplist_; - const size_t huge_page_tlb_size_; - int bucket_entries_logging_threshold_; - bool if_log_bucket_dist_when_flash_; -}; - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,6 @@ // #ifndef ROCKSDB_LITE -#include "memtable/hash_skiplist_rep.h" - #include #include "db/memtable.h" @@ -16,6 +14,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/utilities/options_type.h" #include "util/murmurhash.h" namespace ROCKSDB_NAMESPACE { @@ -46,7 +45,7 @@ private: friend class DynamicIterator; - typedef SkipList Bucket; + using Bucket = SkipList; size_t bucket_size_; @@ -329,13 +328,60 @@ } } -} // anon namespace +struct HashSkipListRepOptions { + static const char* kName() { return "HashSkipListRepFactoryOptions"; } + size_t bucket_count; + int32_t skiplist_height; + int32_t skiplist_branching_factor; +}; + +static std::unordered_map hash_skiplist_info = { + {"bucket_count", + {offsetof(struct HashSkipListRepOptions, bucket_count), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"skiplist_height", + {offsetof(struct HashSkipListRepOptions, skiplist_height), + OptionType::kInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"branching_factor", + {offsetof(struct HashSkipListRepOptions, skiplist_branching_factor), + OptionType::kInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +class HashSkipListRepFactory : public MemTableRepFactory { + public: + explicit HashSkipListRepFactory(size_t bucket_count, int32_t skiplist_height, + int32_t skiplist_branching_factor) { + options_.bucket_count = bucket_count; + options_.skiplist_height = skiplist_height; + options_.skiplist_branching_factor = skiplist_branching_factor; + RegisterOptions(&options_, &hash_skiplist_info); + } + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* logger) override; + + 
static const char* kClassName() { return "HashSkipListRepFactory"; } + static const char* kNickName() { return "prefix_hash"; } + + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + + private: + HashSkipListRepOptions options_; +}; + +} // namespace MemTableRep* HashSkipListRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform* transform, Logger* /*logger*/) { - return new HashSkipListRep(compare, allocator, transform, bucket_count_, - skiplist_height_, skiplist_branching_factor_); + return new HashSkipListRep(compare, allocator, transform, + options_.bucket_count, options_.skiplist_height, + options_.skiplist_branching_factor); } MemTableRepFactory* NewHashSkipListRepFactory( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#pragma once -#ifndef ROCKSDB_LITE -#include "rocksdb/slice_transform.h" -#include "rocksdb/memtablerep.h" - -namespace ROCKSDB_NAMESPACE { - -class HashSkipListRepFactory : public MemTableRepFactory { - public: - explicit HashSkipListRepFactory( - size_t bucket_count, - int32_t skiplist_height, - int32_t skiplist_branching_factor) - : bucket_count_(bucket_count), - skiplist_height_(skiplist_height), - skiplist_branching_factor_(skiplist_branching_factor) { } - - virtual ~HashSkipListRepFactory() {} - - using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override; - - virtual const char* Name() const override { - return "HashSkipListRepFactory"; - } - - private: - const size_t bucket_count_; - const int32_t skiplist_height_; - const int32_t skiplist_branching_factor_; -}; - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist.h mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist.h 2025-05-19 16:14:27.000000000 +0000 @@ -177,6 +177,9 @@ // Retreat to the last entry with a key <= target void SeekForPrev(const char* target); + // Advance to a random entry in the list. + void RandomSeek(); + // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. void SeekToFirst(); @@ -252,6 +255,9 @@ // Return head_ if list is empty. Node* FindLast() const; + // Returns a random entry. + Node* FindRandomEntry() const; + // Traverses a single level of the list, setting *out_prev to the last // node before the key and *out_next to the first node after. Assumes // that the key is not present in the skip list. 
On entry, before should @@ -413,6 +419,11 @@ } template +inline void InlineSkipList::Iterator::RandomSeek() { + node_ = list_->FindRandomEntry(); +} + +template inline void InlineSkipList::Iterator::SeekToFirst() { node_ = list_->head_->Next(0); } @@ -559,6 +570,48 @@ } template +typename InlineSkipList::Node* +InlineSkipList::FindRandomEntry() const { + // TODO(bjlemaire): consider adding PREFETCH calls. + Node *x = head_, *scan_node = nullptr, *limit_node = nullptr; + + // We start at the max level. + // FOr each level, we look at all the nodes at the level, and + // we randomly pick one of them. Then decrement the level + // and reiterate the process. + // eg: assume GetMaxHeight()=5, and there are #100 elements (nodes). + // level 4 nodes: lvl_nodes={#1, #15, #67, #84}. Randomly pick #15. + // We will consider all the nodes between #15 (inclusive) and #67 + // (exclusive). #67 is called 'limit_node' here. + // level 3 nodes: lvl_nodes={#15, #21, #45, #51}. Randomly choose + // #51. #67 remains 'limit_node'. + // [...] + // level 0 nodes: lvl_nodes={#56,#57,#58,#59}. Randomly pick $57. + // Return Node #57. + std::vector lvl_nodes; + Random* rnd = Random::GetTLSInstance(); + int level = GetMaxHeight() - 1; + + while (level >= 0) { + lvl_nodes.clear(); + scan_node = x; + while (scan_node != limit_node) { + lvl_nodes.push_back(scan_node); + scan_node = scan_node->Next(level); + } + uint32_t rnd_idx = rnd->Next() % lvl_nodes.size(); + x = lvl_nodes[rnd_idx]; + if (rnd_idx + 1 < lvl_nodes.size()) { + limit_node = lvl_nodes[rnd_idx + 1]; + } + level--; + } + // There is a special case where x could still be the head_ + // (note that the head_ contains no key). + return x == head_ ? 
head_->Next(0) : x; +} + +template uint64_t InlineSkipList::EstimateCount(const char* key) const { uint64_t count = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,7 +19,7 @@ namespace ROCKSDB_NAMESPACE { // Our test skip list stores 8-byte unsigned integers -typedef uint64_t Key; +using Key = uint64_t; static const char* Encode(const uint64_t* key) { return reinterpret_cast(key); @@ -32,7 +32,7 @@ } struct TestComparator { - typedef Key DecodedType; + using DecodedType = Key; static DecodedType decode_key(const char* b) { return Decode(b); @@ -59,7 +59,7 @@ } }; -typedef InlineSkipList TestInlineSkipList; +using TestInlineSkipList = InlineSkipList; class InlineSkipTest : public testing::Test { public: @@ -309,7 +309,7 @@ Validate(&list); } -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) // We want to make sure that with a single writer and multiple // concurrent readers (with no synchronization other than when a // reader's iterator is created), the reader always observes all the @@ -654,7 +654,7 @@ RunConcurrentInsert(3, true); } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc 2025-05-19 16:14:27.000000000 
+0000 @@ -28,9 +28,11 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" #include "rocksdb/memtablerep.h" #include "rocksdb/options.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/write_buffer_manager.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" @@ -141,7 +143,7 @@ RandomGenerator() { Random rnd(301); auto size = (unsigned)std::max(1048576, FLAGS_item_size); - test::RandomString(&rnd, size, &data_); + data_ = rnd.RandomString(size); pos_ = 0; } @@ -170,9 +172,8 @@ for (uint64_t i = 0; i < num_; ++i) { values_[i] = i; } - std::shuffle( - values_.begin(), values_.end(), - std::default_random_engine(static_cast(FLAGS_seed))); + RandomShuffle(values_.begin(), values_.end(), + static_cast(FLAGS_seed)); } } @@ -418,7 +419,7 @@ uint64_t bytes_written = 0; uint64_t bytes_read = 0; uint64_t read_hits = 0; - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits); auto elapsed_time = static_cast(timer.ElapsedNanos() / 1000); std::cout << "Elapsed time: " << static_cast(elapsed_time) << " us" @@ -453,8 +454,8 @@ MemTableRep* table_; KeyGenerator* key_gen_; uint64_t* sequence_; - uint64_t num_write_ops_per_thread_; - uint64_t num_read_ops_per_thread_; + uint64_t num_write_ops_per_thread_ = 0; + uint64_t num_read_ops_per_thread_ = 0; const uint32_t num_threads_; }; @@ -581,13 +582,15 @@ #ifndef ROCKSDB_LITE } else if (FLAGS_memtablerep == "vector") { factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory); - } else if (FLAGS_memtablerep == "hashskiplist") { + } else if (FLAGS_memtablerep == "hashskiplist" || + FLAGS_memtablerep == "prefix_hash") { factory.reset(ROCKSDB_NAMESPACE::NewHashSkipListRepFactory( FLAGS_bucket_count, FLAGS_hashskiplist_height, FLAGS_hashskiplist_branching_factor)); options.prefix_extractor.reset( 
ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_length)); - } else if (FLAGS_memtablerep == "hashlinklist") { + } else if (FLAGS_memtablerep == "hashlinklist" || + FLAGS_memtablerep == "hash_linkedlist") { factory.reset(ROCKSDB_NAMESPACE::NewHashLinkListRepFactory( FLAGS_bucket_count, FLAGS_huge_page_tlb_size, FLAGS_bucket_entries_logging_threshold, @@ -596,8 +599,16 @@ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_length)); #endif // ROCKSDB_LITE } else { - fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str()); - exit(1); + ROCKSDB_NAMESPACE::ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + + ROCKSDB_NAMESPACE::Status s = + ROCKSDB_NAMESPACE::MemTableRepFactory::CreateFromString( + config_options, FLAGS_memtablerep, &factory); + if (!s.ok()) { + fprintf(stdout, "Unknown memtablerep: %s\n", s.ToString().c_str()); + exit(1); + } } ROCKSDB_NAMESPACE::InternalKeyComparator internal_key_comp( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplist_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplist_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplist_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplist_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE { -typedef uint64_t Key; +using Key = uint64_t; struct TestComparator { int operator()(const Key& a, const Key& b) const { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplistrep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplistrep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplistrep.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplistrep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// +#include + #include "db/memtable.h" #include "memory/arena.h" #include "memtable/inlineskiplist.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace { @@ -95,6 +99,66 @@ return (end_count >= start_count) ? (end_count - start_count) : 0; } + void UniqueRandomSample(const uint64_t num_entries, + const uint64_t target_sample_size, + std::unordered_set* entries) override { + entries->clear(); + // Avoid divide-by-0. + assert(target_sample_size > 0); + assert(num_entries > 0); + // NOTE: the size of entries is not enforced to be exactly + // target_sample_size at the end of this function, it might be slightly + // greater or smaller. + SkipListRep::Iterator iter(&skip_list_); + // There are two methods to create the subset of samples (size m) + // from the table containing N elements: + // 1-Iterate linearly through the N memtable entries. For each entry i, + // add it to the sample set with a probability + // (target_sample_size - entries.size() ) / (N-i). + // + // 2-Pick m random elements without repetition. + // We pick Option 2 when m sqrt(N). + if (target_sample_size > + static_cast(std::sqrt(1.0 * num_entries))) { + Random* rnd = Random::GetTLSInstance(); + iter.SeekToFirst(); + uint64_t counter = 0, num_samples_left = target_sample_size; + for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) { + // Add entry to sample set with probability + // num_samples_left/(num_entries - counter). + if (rnd->Next() % (num_entries - counter) < num_samples_left) { + entries->insert(iter.key()); + num_samples_left--; + } + } + } else { + // Option 2: pick m random elements with no duplicates. + // If Option 2 is picked, then target_sample_size99.9% for N>4. + // At worst, for the final pick , when m=sqrt(N) there is + // a probability of p= 1/sqrt(N) chances to find a duplicate. 
+ for (uint64_t j = 0; j < 5; j++) { + iter.RandomSeek(); + // unordered_set::insert returns pair. + // The second element is true if an insert successfully happened. + // If element is already in the set, this bool will be false, and + // true otherwise. + if ((entries->insert(iter.key())).second) { + break; + } + } + } + } + } + ~SkipListRep() override {} // Iteration over the contents of a skip list @@ -143,6 +207,8 @@ } } + void RandomSeek() override { iter_.RandomSeek(); } + // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. void SeekToFirst() override { iter_.SeekToFirst(); } @@ -271,6 +337,27 @@ }; } +static std::unordered_map skiplist_factory_info = { +#ifndef ROCKSDB_LITE + {"lookahead", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}}, +#endif +}; + +SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) { + RegisterOptions("SkipListFactoryOptions", &lookahead_, + &skiplist_factory_info); +} + +std::string SkipListFactory::GetId() const { + std::string id = Name(); + if (lookahead_ > 0) { + id.append(":").append(ROCKSDB_NAMESPACE::ToString(lookahead_)); + } + return id; +} + MemTableRep* SkipListFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform* transform, Logger* /*logger*/) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/vectorrep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/vectorrep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/vectorrep.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/vectorrep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,25 +4,23 @@ // (found in the LICENSE.Apache file in the root directory). 
// #ifndef ROCKSDB_LITE -#include "rocksdb/memtablerep.h" - -#include -#include -#include #include +#include +#include #include +#include #include "db/memtable.h" #include "memory/arena.h" #include "memtable/stl_wrappers.h" #include "port/port.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/utilities/options_type.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { namespace { -using namespace stl_wrappers; - class VectorRep : public MemTableRep { public: VectorRep(const KeyComparator& compare, Allocator* allocator, size_t count); @@ -98,7 +96,7 @@ private: friend class Iterator; - typedef std::vector Bucket; + using Bucket = std::vector; std::shared_ptr bucket_; mutable port::RWMutex rwlock_; bool immutable_; @@ -157,14 +155,16 @@ if (!sorted_ && vrep_ != nullptr) { WriteLock l(&vrep_->rwlock_); if (!vrep_->sorted_) { - std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + std::sort(bucket_->begin(), bucket_->end(), + stl_wrappers::Compare(compare_)); cit_ = bucket_->begin(); vrep_->sorted_ = true; } sorted_ = true; } if (!sorted_) { - std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + std::sort(bucket_->begin(), bucket_->end(), + stl_wrappers::Compare(compare_)); cit_ = bucket_->begin(); sorted_ = true; } @@ -292,6 +292,16 @@ } } // anon namespace +static std::unordered_map vector_rep_table_info = { + {"count", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +VectorRepFactory::VectorRepFactory(size_t count) : count_(count) { + RegisterOptions("VectorRepFactoryOptions", &count_, &vector_rep_table_info); +} + MemTableRep* VectorRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform*, Logger* /*logger*/) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,57 +8,31 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "db/db_impl/db_impl.h" +#include "rocksdb/status.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE -namespace { -const size_t kSizeDummyEntry = 256 * 1024; -// The key will be longer than keys for blocks in SST files so they won't -// conflict. -const size_t kCacheKeyPrefix = kMaxVarint64Length * 4 + 1; -} // namespace - -struct WriteBufferManager::CacheRep { - std::shared_ptr cache_; - std::mutex cache_mutex_; - std::atomic cache_allocated_size_; - // The non-prefix part will be updated according to the ID to use. 
- char cache_key_[kCacheKeyPrefix + kMaxVarint64Length]; - uint64_t next_cache_key_id_ = 0; - std::vector dummy_handles_; - - explicit CacheRep(std::shared_ptr cache) - : cache_(cache), cache_allocated_size_(0) { - memset(cache_key_, 0, kCacheKeyPrefix); - size_t pointer_size = sizeof(const void*); - assert(pointer_size <= kCacheKeyPrefix); - memcpy(cache_key_, static_cast(this), pointer_size); - } - - Slice GetNextCacheKey() { - memset(cache_key_ + kCacheKeyPrefix, 0, kMaxVarint64Length); - char* end = - EncodeVarint64(cache_key_ + kCacheKeyPrefix, next_cache_key_id_++); - return Slice(cache_key_, static_cast(end - cache_key_)); - } -}; -#else -struct WriteBufferManager::CacheRep {}; -#endif // ROCKSDB_LITE - WriteBufferManager::WriteBufferManager(size_t _buffer_size, - std::shared_ptr cache) + std::shared_ptr cache, + bool allow_stall) : buffer_size_(_buffer_size), mutable_limit_(buffer_size_ * 7 / 8), memory_used_(0), memory_active_(0), - cache_rep_(nullptr) { + cache_res_mgr_(nullptr), + allow_stall_(allow_stall), + stall_active_(false) { #ifndef ROCKSDB_LITE if (cache) { - // Construct the cache key using the pointer to this. 
- cache_rep_.reset(new CacheRep(cache)); + // Memtable's memory usage tends to fluctuate frequently + // therefore we set delayed_decrease = true to save some dummy entry + // insertion on memory increase right after memory decrease + cache_res_mgr_.reset( + new CacheReservationManager(cache, true /* delayed_decrease */)); } #else (void)cache; @@ -66,65 +40,164 @@ } WriteBufferManager::~WriteBufferManager() { -#ifndef ROCKSDB_LITE - if (cache_rep_) { - for (auto* handle : cache_rep_->dummy_handles_) { - cache_rep_->cache_->Release(handle, true); - } +#ifndef NDEBUG + std::unique_lock lock(mu_); + assert(queue_.empty()); +#endif +} + +std::size_t WriteBufferManager::dummy_entries_in_cache_usage() const { + if (cache_res_mgr_ != nullptr) { + return cache_res_mgr_->GetTotalReservedCacheSize(); + } else { + return 0; + } +} + +void WriteBufferManager::ReserveMem(size_t mem) { + if (cache_res_mgr_ != nullptr) { + ReserveMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_add(mem, std::memory_order_relaxed); + } + if (enabled()) { + memory_active_.fetch_add(mem, std::memory_order_relaxed); } -#endif // ROCKSDB_LITE } // Should only be called from write thread void WriteBufferManager::ReserveMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE - assert(cache_rep_ != nullptr); + assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. - std::lock_guard lock(cache_rep_->cache_mutex_); + std::lock_guard lock(cache_res_mgr_mu_); size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); - while (new_mem_used > cache_rep_->cache_allocated_size_) { - // Expand size by at least 256KB. 
- // Add a dummy record to the cache - Cache::Handle* handle; - cache_rep_->cache_->Insert(cache_rep_->GetNextCacheKey(), nullptr, - kSizeDummyEntry, nullptr, &handle); - cache_rep_->dummy_handles_.push_back(handle); - cache_rep_->cache_allocated_size_ += kSizeDummyEntry; - } + Status s = + cache_res_mgr_->UpdateCacheReservation( + new_mem_used); + + // We absorb the error since WriteBufferManager is not able to handle + // this failure properly. Ideallly we should prevent this allocation + // from happening if this cache reservation fails. + // [TODO] We'll need to improve it in the future and figure out what to do on + // error + s.PermitUncheckedError(); #else (void)mem; #endif // ROCKSDB_LITE } +void WriteBufferManager::ScheduleFreeMem(size_t mem) { + if (enabled()) { + memory_active_.fetch_sub(mem, std::memory_order_relaxed); + } +} + +void WriteBufferManager::FreeMem(size_t mem) { + if (cache_res_mgr_ != nullptr) { + FreeMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + } + // Check if stall is active and can be ended. + MaybeEndWriteStall(); +} + void WriteBufferManager::FreeMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE - assert(cache_rep_ != nullptr); + assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. - std::lock_guard lock(cache_rep_->cache_mutex_); + std::lock_guard lock(cache_res_mgr_mu_); size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); - // Gradually shrink memory costed in the block cache if the actual - // usage is less than 3/4 of what we reserve from the block cache. - // We do this because: - // 1. we don't pay the cost of the block cache immediately a memtable is - // freed, as block cache insert is expensive; - // 2. 
eventually, if we walk away from a temporary memtable size increase, - // we make sure shrink the memory costed in block cache over time. - // In this way, we only shrink costed memory showly even there is enough - // margin. - if (new_mem_used < cache_rep_->cache_allocated_size_ / 4 * 3 && - cache_rep_->cache_allocated_size_ - kSizeDummyEntry > new_mem_used) { - assert(!cache_rep_->dummy_handles_.empty()); - cache_rep_->cache_->Release(cache_rep_->dummy_handles_.back(), true); - cache_rep_->dummy_handles_.pop_back(); - cache_rep_->cache_allocated_size_ -= kSizeDummyEntry; - } + Status s = + cache_res_mgr_->UpdateCacheReservation( + new_mem_used); + + // We absorb the error since WriteBufferManager is not able to handle + // this failure properly. + // [TODO] We'll need to improve it in the future and figure out what to do on + // error + s.PermitUncheckedError(); #else (void)mem; #endif // ROCKSDB_LITE } + +void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + assert(allow_stall_); + + // Allocate outside of the lock. + std::list new_node = {wbm_stall}; + + { + std::unique_lock lock(mu_); + // Verify if the stall conditions are stil active. + if (ShouldStall()) { + stall_active_.store(true, std::memory_order_relaxed); + queue_.splice(queue_.end(), std::move(new_node)); + } + } + + // If the node was not consumed, the stall has ended already and we can signal + // the caller. + if (!new_node.empty()) { + new_node.front()->Signal(); + } +} + +// Called when memory is freed in FreeMem or the buffer size has changed. +void WriteBufferManager::MaybeEndWriteStall() { + // Cannot early-exit on !enabled() because SetBufferSize(0) needs to unblock + // the writers. + if (!allow_stall_) { + return; + } + + if (IsStallThresholdExceeded()) { + return; // Stall conditions have not resolved. + } + + // Perform all deallocations outside of the lock. 
+ std::list cleanup; + + std::unique_lock lock(mu_); + if (!stall_active_.load(std::memory_order_relaxed)) { + return; // Nothing to do. + } + + // Unblock new writers. + stall_active_.store(false, std::memory_order_relaxed); + + // Unblock the writers in the queue. + for (StallInterface* wbm_stall : queue_) { + wbm_stall->Signal(); + } + cleanup = std::move(queue_); +} + +void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + + // Deallocate the removed nodes outside of the lock. + std::list cleanup; + + if (enabled() && allow_stall_) { + std::unique_lock lock(mu_); + for (auto it = queue_.begin(); it != queue_.end();) { + auto next = std::next(it); + if (*it == wbm_stall) { + cleanup.splice(cleanup.end(), queue_, std::move(it)); + } + it = next; + } + } + wbm_stall->Signal(); +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,10 +11,11 @@ #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - class WriteBufferManagerTest : public testing::Test {}; #ifndef ROCKSDB_LITE +const size_t kSizeDummyEntry = 256 * 1024; + TEST_F(WriteBufferManagerTest, ShouldFlush) { // A write buffer manager of size 10MB std::unique_ptr wbf( @@ -46,11 +47,39 @@ ASSERT_TRUE(wbf->ShouldFlush()); wbf->FreeMem(7 * 1024 * 1024); - // 9MB total, 8MB mutable. + // 8MB total, 8MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + // change size: 8M limit, 7M mutable limit + wbf->SetBufferSize(8 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(2 * 1024 * 1024); + // 8MB total, 6MB mutable. 
+ ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->FreeMem(2 * 1024 * 1024); + // 6MB total, 6MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(1 * 1024 * 1024); + wbf->FreeMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. ASSERT_FALSE(wbf->ShouldFlush()); } TEST_F(WriteBufferManagerTest, CacheCost) { + constexpr std::size_t kMetaDataChargeOverhead = 10000; + LRUCacheOptions co; // 1GB cache co.capacity = 1024 * 1024 * 1024; @@ -61,91 +90,208 @@ std::unique_ptr wbf( new WriteBufferManager(50 * 1024 * 1024, cache)); - // Allocate 333KB will allocate 512KB + // Allocate 333KB will allocate 512KB, memory_used_ = 333KB wbf->ReserveMem(333 * 1024); + // 2 dummy entries are added for size 333 KB + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 2 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 2 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 2 * 256 * 1024 + 10000); + ASSERT_LT(cache->GetPinnedUsage(), 2 * 256 * 1024 + kMetaDataChargeOverhead); - // Allocate another 512KB + // Allocate another 512KB, memory_used_ = 845KB wbf->ReserveMem(512 * 1024); + // 2 more dummy entries are added for size 512 KB + // since ceil((memory_used_ - dummy_entries_in_cache_usage) % kSizeDummyEntry) + // = 2 + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + 10000); + ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead); - // Allocate another 10MB + // Allocate another 10MB, memory_used_ = 11085KB wbf->ReserveMem(10 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); - - // Free 1MB will not cause any change in cache cost - 
wbf->FreeMem(1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); - + // 40 more entries are added for size 10 * 1024 * 1024 KB + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 1MB, memory_used_ = 10061KB + // It will not cause any change in cache cost + // since memory_used_ > dummy_entries_in_cache_usage * (3/4) + wbf->FreeMem(1 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); ASSERT_FALSE(wbf->ShouldFlush()); - // Allocate another 41MB + // Allocate another 41MB, memory_used_ = 52045KB wbf->ReserveMem(41 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 204 * 256 * 1024 + kMetaDataChargeOverhead); ASSERT_TRUE(wbf->ShouldFlush()); ASSERT_TRUE(wbf->ShouldFlush()); + // Schedule free 20MB, memory_used_ = 52045KB + // It will not cause any change in memory_used and cache cost wbf->ScheduleFreeMem(20 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); - + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 204 * 256 * 1024 + kMetaDataChargeOverhead); // Still need flush as the hard limit hits ASSERT_TRUE(wbf->ShouldFlush()); - // Free 20MB will releae 256KB from cache + // Free 20MB, memory_used_ = 
31565KB + // It will releae 80 dummy entries from cache since + // since memory_used_ < dummy_entries_in_cache_usage * (3/4) + // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry) + // = 80 wbf->FreeMem(20 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 124 * 256 * 1024 + kMetaDataChargeOverhead); ASSERT_FALSE(wbf->ShouldFlush()); - // Every free will release 256KB if still not hit 3/4 - wbf->FreeMem(16 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024 + 10000); - + // Free 16KB, memory_used_ = 31549KB + // It will not release any dummy entry since memory_used_ >= + // dummy_entries_in_cache_usage * (3/4) wbf->FreeMem(16 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); - - // Reserve 512KB will not cause any change in cache cost + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 124 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 20MB, memory_used_ = 11069KB + // It will releae 80 dummy entries from cache + // since memory_used_ < dummy_entries_in_cache_usage * (3/4) + // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry) + // = 80 + wbf->FreeMem(20 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 1MB, memory_used_ = 10045KB 
+ // It will not cause any change in cache cost + // since memory_used_ > dummy_entries_in_cache_usage * (3/4) + wbf->FreeMem(1 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); + + // Reserve 512KB, memory_used_ = 10557KB + // It will not casue any change in cache cost + // since memory_used_ > dummy_entries_in_cache_usage * (3/4) + // which reflects the benefit of saving dummy entry insertion on memory + // reservation after delay decrease wbf->ReserveMem(512 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); - - wbf->FreeMem(16 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); // Destory write buffer manger should free everything wbf.reset(); - ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024); + ASSERT_EQ(cache->GetPinnedUsage(), 0); } TEST_F(WriteBufferManagerTest, NoCapCacheCost) { + constexpr std::size_t kMetaDataChargeOverhead = 10000; // 1GB cache std::shared_ptr cache = NewLRUCache(1024 * 1024 * 1024, 4); // A write buffer manager of size 256MB std::unique_ptr wbf(new WriteBufferManager(0, cache)); - // Allocate 1.5MB will allocate 2MB + + // Allocate 10MB, memory_used_ = 10240KB + // It will allocate 40 dummy entries wbf->ReserveMem(10 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 10 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 10 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); + 
ASSERT_GE(cache->GetPinnedUsage(), 40 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 40 * 256 * 1024 + kMetaDataChargeOverhead); + ASSERT_FALSE(wbf->ShouldFlush()); + // Free 9MB, memory_used_ = 1024KB + // It will free 36 dummy entries wbf->FreeMem(9 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 160KB gradually, memory_used_ = 864KB + // It will not cause any change + // since memory_used_ > dummy_entries_in_cache_usage * 3/4 for (int i = 0; i < 40; i++) { wbf->FreeMem(4 * 1024); } - ASSERT_GE(cache->GetPinnedUsage(), 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead); +} + +TEST_F(WriteBufferManagerTest, CacheFull) { + constexpr std::size_t kMetaDataChargeOverhead = 20000; + + // 12MB cache size with strict capacity + LRUCacheOptions lo; + lo.capacity = 12 * 1024 * 1024; + lo.num_shard_bits = 0; + lo.strict_capacity_limit = true; + std::shared_ptr cache = NewLRUCache(lo); + std::unique_ptr wbf(new WriteBufferManager(0, cache)); + + // Allocate 10MB, memory_used_ = 10240KB + wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 40 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 40 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Allocate 10MB, memory_used_ = 20480KB + // Some dummy entry insertion will fail due to full cache + wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_GE(cache->GetPinnedUsage(), 40 * kSizeDummyEntry); + ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024); + ASSERT_LT(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry); + + 
// Free 15MB after encoutering cache full, memory_used_ = 5120KB + wbf->FreeMem(15 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 20 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 20 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 20 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Reserve 15MB, creating cache full again, memory_used_ = 20480KB + wbf->ReserveMem(15 * 1024 * 1024); + ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024); + ASSERT_LT(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry); + + // Increase capacity so next insert will fully succeed + cache->SetCapacity(40 * 1024 * 1024); + + // Allocate 10MB, memory_used_ = 30720KB + wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 120 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 120 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 120 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Gradually release 20 MB + // It ended up sequentially releasing 32, 24, 18 dummy entries when + // memory_used_ decreases to 22528KB, 16384KB, 11776KB. 
+ // In total, it releases 74 dummy entries + for (int i = 0; i < 40; i++) { + wbf->FreeMem(512 * 1024); + } + + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 46 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 46 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 46 * kSizeDummyEntry + kMetaDataChargeOverhead); } + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,16 @@ +find_package(benchmark REQUIRED) +find_package(Threads REQUIRED) + +file(GLOB_RECURSE ALL_BENCH_CPP *.cc) +foreach(ONE_BENCH_CPP ${ALL_BENCH_CPP}) + get_filename_component(TARGET_NAME ${ONE_BENCH_CPP} NAME_WE) + add_executable(${TARGET_NAME} ${ONE_BENCH_CPP}) + target_link_libraries(${TARGET_NAME} ${ROCKSDB_LIB} benchmark::benchmark + ${CMAKE_THREAD_LIBS_INIT}) + # run benchmark like a test, if added, the benchmark tests could be run by `ctest -R Bench_` + # add_test(Bench_${TARGET_NAME} ${TARGET_NAME}) + list(APPEND ALL_BENCH_TARGETS ${TARGET_NAME}) +endforeach() +add_custom_target(microbench + COMMAND for t in ${ALL_BENCH_TARGETS}\; do \.\/$$t \|\| exit 1\; done + DEPENDS ${ALL_BENCH_TARGETS}) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// this is a simple micro-benchmark for compare ribbon filter vs. other filter +// for more comprehensive, please check the dedicate util/filter_bench. +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +static void DBOpen(benchmark::State& state) { + // create DB + DB* db; + Options options; + auto env = Env::Default(); + std::string db_path; + auto s = env->GetTestDirectory(&db_path); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + std::string db_name = db_path + "/bench_dbopen"; + + DestroyDB(db_name, options); + + options.create_if_missing = true; + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + db->Close(); + + options.create_if_missing = false; + + auto rnd = Random(12345); + + for (auto _ : state) { + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.PauseTiming(); + auto wo = WriteOptions(); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 100; j++) { + s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100)); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + s = db->Flush(FlushOptions()); + } + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + s = db->Close(); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.ResumeTiming(); + } + DestroyDB(db_name, options); +} + +BENCHMARK(DBOpen)->Iterations(200); // specify iteration number as the db size + // is impacted by iteration number + +static void DBClose(benchmark::State& state) { + // create DB + DB* db; + Options options; + auto env = Env::Default(); + std::string db_path; + auto s = 
env->GetTestDirectory(&db_path); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + std::string db_name = db_path + "/bench_dbclose"; + + DestroyDB(db_name, options); + + options.create_if_missing = true; + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + db->Close(); + + options.create_if_missing = false; + + auto rnd = Random(12345); + + for (auto _ : state) { + state.PauseTiming(); + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + auto wo = WriteOptions(); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 100; j++) { + s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100)); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + s = db->Flush(FlushOptions()); + } + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.ResumeTiming(); + s = db->Close(); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + DestroyDB(db_name, options); +} + +BENCHMARK(DBClose)->Iterations(200); // specify iteration number as the db size + // is impacted by iteration number + +} // namespace ROCKSDB_NAMESPACE + +BENCHMARK_MAIN(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// this is a simple micro-benchmark for compare ribbon filter vs. 
other filter +// for more comprehensive, please check the dedicate util/filter_bench. +#include + +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/mock_block_based_table.h" + +namespace ROCKSDB_NAMESPACE { + +struct KeyMaker { + explicit KeyMaker(size_t avg_size) + : smallest_size_(avg_size), + buf_size_(avg_size + 11), // pad to vary key size and alignment + buf_(new char[buf_size_]) { + memset(buf_.get(), 0, buf_size_); + assert(smallest_size_ > 8); + } + size_t smallest_size_; + size_t buf_size_; + std::unique_ptr buf_; + + // Returns a unique(-ish) key based on the given parameter values. Each + // call returns a Slice from the same buffer so previously returned + // Slices should be considered invalidated. + Slice Get(uint32_t filter_num, uint32_t val_num) const { + size_t start = val_num % 4; + size_t len = smallest_size_; + // To get range [avg_size - 2, avg_size + 2] + // use range [smallest_size, smallest_size + 4] + len += FastRange32((val_num >> 5) * 1234567891, 5); + char *data = buf_.get() + start; + // Populate key data such that all data makes it into a key of at + // least 8 bytes. We also don't want all the within-filter key + // variance confined to a contiguous 32 bits, because then a 32 bit + // hash function can "cheat" the false positive rate by + // approximating a perfect hash. + EncodeFixed32(data, val_num); + EncodeFixed32(data + 4, filter_num + val_num); + // ensure clearing leftovers from different alignment + EncodeFixed32(data + 8, 0); + return {data, len}; + } +}; + +// benchmark arguments: +// 0. filter mode +// 1. filter config bits_per_key +// 2. average data key length +// 3. 
data entry number +static void CustomArguments(benchmark::internal::Benchmark *b) { + for (int filterMode : + {BloomFilterPolicy::kLegacyBloom, BloomFilterPolicy::kFastLocalBloom, + BloomFilterPolicy::kStandard128Ribbon}) { + // for (int bits_per_key : {4, 10, 20, 30}) { + for (int bits_per_key : {10, 20}) { + for (int key_len_avg : {10, 100}) { + for (int64_t entry_num : {1 << 10, 1 << 20}) { + b->Args({filterMode, bits_per_key, key_len_avg, entry_num}); + } + } + } + } +} + +static void FilterBuild(benchmark::State &state) { + // setup data + auto filter = new BloomFilterPolicy( + static_cast(state.range(1)), + static_cast(state.range(0))); + auto tester = new mock::MockBlockBasedTableTester(filter); + KeyMaker km(state.range(2)); + std::unique_ptr owner; + const int64_t kEntryNum = state.range(3); + auto rnd = Random32(12345); + uint32_t filter_num = rnd.Next(); + // run the test + for (auto _ : state) { + std::unique_ptr builder(tester->GetBuilder()); + for (uint32_t i = 0; i < kEntryNum; i++) { + builder->AddKey(km.Get(filter_num, i)); + } + auto ret = builder->Finish(&owner); + state.counters["size"] = static_cast(ret.size()); + } +} +BENCHMARK(FilterBuild)->Apply(CustomArguments); + +static void FilterQueryPositive(benchmark::State &state) { + // setup data + auto filter = new BloomFilterPolicy( + static_cast(state.range(1)), + static_cast(state.range(0))); + auto tester = new mock::MockBlockBasedTableTester(filter); + KeyMaker km(state.range(2)); + std::unique_ptr owner; + const int64_t kEntryNum = state.range(3); + auto rnd = Random32(12345); + uint32_t filter_num = rnd.Next(); + std::unique_ptr builder(tester->GetBuilder()); + for (uint32_t i = 0; i < kEntryNum; i++) { + builder->AddKey(km.Get(filter_num, i)); + } + auto data = builder->Finish(&owner); + auto reader = filter->GetFilterBitsReader(data); + + // run test + uint32_t i = 0; + for (auto _ : state) { + i++; + i = i % kEntryNum; + reader->MayMatch(km.Get(filter_num, i)); + } +} 
+BENCHMARK(FilterQueryPositive)->Apply(CustomArguments); + +static void FilterQueryNegative(benchmark::State &state) { + // setup data + auto filter = new BloomFilterPolicy( + static_cast(state.range(1)), + static_cast(state.range(0))); + auto tester = new mock::MockBlockBasedTableTester(filter); + KeyMaker km(state.range(2)); + std::unique_ptr owner; + const int64_t kEntryNum = state.range(3); + auto rnd = Random32(12345); + uint32_t filter_num = rnd.Next(); + std::unique_ptr builder(tester->GetBuilder()); + for (uint32_t i = 0; i < kEntryNum; i++) { + builder->AddKey(km.Get(filter_num, i)); + } + auto data = builder->Finish(&owner); + auto reader = filter->GetFilterBitsReader(data); + + // run test + uint32_t i = 0; + double fp_cnt = 0; + for (auto _ : state) { + i++; + auto result = reader->MayMatch(km.Get(filter_num + 1, i)); + if (result) { + fp_cnt++; + } + } + state.counters["FP %"] = + benchmark::Counter(fp_cnt * 100, benchmark::Counter::kAvgIterations); +} +BENCHMARK(FilterQueryNegative)->Apply(CustomArguments); + +} // namespace ROCKSDB_NAMESPACE + +BENCHMARK_MAIN(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,8 @@ #include "monitoring/histogram.h" #include + +#include #include #include #include @@ -23,7 +25,6 @@ // If you change this, you also need to change // size of array buckets_ in HistogramImpl bucketValues_ = {1, 2}; - valueIndexMap_ = {{1, 0}, {2, 1}}; double bucket_val = static_cast(bucketValues_.back()); while ((bucket_val = 1.5 * bucket_val) <= static_cast(port::kMaxUint64)) { bucketValues_.push_back(static_cast(bucket_val)); @@ -35,26 +36,18 @@ pow_of_ten *= 10; } bucketValues_.back() *= pow_of_ten; - 
valueIndexMap_[bucketValues_.back()] = bucketValues_.size() - 1; } maxBucketValue_ = bucketValues_.back(); minBucketValue_ = bucketValues_.front(); } size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { - if (value >= maxBucketValue_) { - return bucketValues_.size() - 1; - } else if ( value >= minBucketValue_ ) { - std::map::const_iterator lowerBound = - valueIndexMap_.lower_bound(value); - if (lowerBound != valueIndexMap_.end()) { - return static_cast(lowerBound->second); - } else { - return 0; - } - } else { - return 0; - } + auto beg = bucketValues_.begin(); + auto end = bucketValues_.end(); + if (value >= maxBucketValue_) + return end - beg - 1; // bucketValues_.size() - 1 + else + return std::lower_bound(beg, end, value) - beg; } namespace { @@ -251,8 +244,7 @@ void HistogramImpl::Merge(const Histogram& other) { if (strcmp(Name(), other.Name()) == 0) { - Merge( - *static_cast_with_check(&other)); + Merge(*static_cast_with_check(&other)); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.h 2025-05-19 16:14:27.000000000 +0000 @@ -48,7 +48,6 @@ std::vector bucketValues_; uint64_t maxBucketValue_; uint64_t minBucketValue_; - std::map valueIndexMap_; }; struct HistogramStat { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,11 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// +#include "monitoring/histogram.h" + #include -#include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" +#include "rocksdb/system_clock.h" +#include "test_util/mock_time_env.h" #include "test_util/testharness.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -16,16 +20,22 @@ namespace { const double kIota = 0.1; const HistogramBucketMapper bucketMapper; - Env* env = Env::Default(); + std::shared_ptr clock = + std::make_shared(SystemClock::Default()); } void PopulateHistogram(Histogram& histogram, uint64_t low, uint64_t high, uint64_t loop = 1) { + Random rnd(test::RandomSeed()); for (; loop > 0; loop--) { for (uint64_t i = low; i <= high; i++) { histogram.Add(i); + // sleep a random microseconds [0-10) + clock->SleepForMicroseconds(rnd.Uniform(10)); } } + // make sure each data population at least take some time + clock->SleepForMicroseconds(1); } void BasicOperation(Histogram& histogram) { @@ -131,23 +141,23 @@ HistogramWindowingImpl histogramWindowing(num_windows, micros_per_window, min_num_per_window); - + histogramWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 100); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 1); ASSERT_EQ(histogramWindowing.Average(), 1); PopulateHistogram(histogramWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 200); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 2); ASSERT_EQ(histogramWindowing.Average(), 1.5); PopulateHistogram(histogramWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 1); 
ASSERT_EQ(histogramWindowing.max(), 3); @@ -155,7 +165,7 @@ // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 2); ASSERT_EQ(histogramWindowing.max(), 4); @@ -163,7 +173,7 @@ // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); @@ -179,18 +189,20 @@ histogramWindowing(num_windows, micros_per_window, min_num_per_window); HistogramWindowingImpl otherWindowing(num_windows, micros_per_window, min_num_per_window); + histogramWindowing.TEST_UpdateClock(clock); + otherWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); PopulateHistogram(otherWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 2, 2, 100); PopulateHistogram(otherWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 3, 3, 100); PopulateHistogram(otherWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); histogramWindowing.Merge(otherWindowing); ASSERT_EQ(histogramWindowing.num(), 600); @@ -200,14 +212,14 @@ // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 500); ASSERT_EQ(histogramWindowing.min(), 2); 
ASSERT_EQ(histogramWindowing.max(), 4); // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 400); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,15 +8,17 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "monitoring/histogram_windowing.h" -#include "monitoring/histogram.h" -#include "util/cast_util.h" #include +#include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" +#include "util/cast_util.h" + namespace ROCKSDB_NAMESPACE { HistogramWindowingImpl::HistogramWindowingImpl() { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -28,7 +30,7 @@ num_windows_(num_windows), micros_per_window_(micros_per_window), min_num_per_window_(min_num_per_window) { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -44,7 +46,7 @@ window_stats_[i].Clear(); } current_window_.store(0, std::memory_order_relaxed); - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); } bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); } @@ -65,9 +67,7 @@ void HistogramWindowingImpl::Merge(const Histogram& other) { if (strcmp(Name(), other.Name()) == 0) { - Merge( - 
*static_cast_with_check( - &other)); + Merge(*static_cast_with_check(&other)); } } @@ -131,7 +131,7 @@ } void HistogramWindowingImpl::TimerTick() { - uint64_t curr_time = env_->NowMicros(); + uint64_t curr_time = clock_->NowMicros(); size_t curr_window_ = static_cast(current_window()); if (curr_time - last_swap_time() > micros_per_window_ && window_stats_[curr_window_].num() >= min_num_per_window_) { @@ -146,7 +146,7 @@ // If mutex is held by Merge() or Clear(), next Add() will take care of the // swap, if needed. if (mutex_.try_lock()) { - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); uint64_t curr_window = current_window(); uint64_t next_window = (curr_window == num_windows_ - 1) ? diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,9 @@ #pragma once #include "monitoring/histogram.h" -#include "rocksdb/env.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class HistogramWindowingImpl : public Histogram { @@ -44,7 +44,13 @@ virtual double StandardDeviation() const override; virtual void Data(HistogramData* const data) const override; -private: +#ifndef NDEBUG + void TEST_UpdateClock(const std::shared_ptr& clock) { + clock_ = clock; + } +#endif // NDEBUG + + private: void TimerTick(); void SwapHistoryBucket(); inline uint64_t current_window() const { @@ -54,7 +60,7 @@ return last_swap_time_.load(std::memory_order_relaxed); } - Env* env_; + std::shared_ptr clock_; std::mutex mutex_; // Aggregated stats over windows_stats_, all the computation is done diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,26 +4,30 @@ // (found in the LICENSE.Apache file in the root directory). #include "monitoring/instrumented_mutex.h" + #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" namespace ROCKSDB_NAMESPACE { namespace { -Statistics* stats_for_report(Env* env, Statistics* stats) { - if (env != nullptr && stats != nullptr && +#ifndef NPERF_CONTEXT +Statistics* stats_for_report(SystemClock* clock, Statistics* stats) { + if (clock != nullptr && stats != nullptr && stats->get_stats_level() > kExceptTimeForMutex) { return stats; } else { return nullptr; } } +#endif // NPERF_CONTEXT } // namespace void InstrumentedMutex::Lock() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_mutex_lock_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); LockInternal(); } @@ -37,7 +41,7 @@ void InstrumentedCondVar::Wait() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); WaitInternal(); } @@ -51,7 +55,7 @@ bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); return TimedWaitInternal(abs_time_us); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,8 @@ #include "monitoring/statistics.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/thread_status.h" #include "util/stop_watch.h" @@ -20,13 +20,16 @@ class InstrumentedMutex { public: explicit InstrumentedMutex(bool adaptive = false) - : mutex_(adaptive), stats_(nullptr), env_(nullptr), - stats_code_(0) {} + : mutex_(adaptive), stats_(nullptr), clock_(nullptr), stats_code_(0) {} - InstrumentedMutex( - Statistics* stats, Env* env, - int stats_code, bool adaptive = false) - : mutex_(adaptive), stats_(stats), env_(env), + explicit InstrumentedMutex(SystemClock* clock, bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), clock_(clock), stats_code_(0) {} + + InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code, + bool adaptive = false) + : mutex_(adaptive), + stats_(stats), + clock_(clock), stats_code_(stats_code) {} void Lock(); @@ -44,12 +47,11 @@ friend class InstrumentedCondVar; port::Mutex mutex_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; -// A wrapper class for port::Mutex that provides additional layer -// for collecting stats and instrumentation. 
+// RAII wrapper for InstrumentedMutex class InstrumentedMutexLock { public: explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) { @@ -66,12 +68,28 @@ void operator=(const InstrumentedMutexLock&) = delete; }; +// RAII wrapper for temporary releasing InstrumentedMutex inside +// InstrumentedMutexLock +class InstrumentedMutexUnlock { + public: + explicit InstrumentedMutexUnlock(InstrumentedMutex* mutex) : mutex_(mutex) { + mutex_->Unlock(); + } + + ~InstrumentedMutexUnlock() { mutex_->Lock(); } + + private: + InstrumentedMutex* const mutex_; + InstrumentedMutexUnlock(const InstrumentedMutexUnlock&) = delete; + void operator=(const InstrumentedMutexUnlock&) = delete; +}; + class InstrumentedCondVar { public: explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex) : cond_(&(instrumented_mutex->mutex_)), stats_(instrumented_mutex->stats_), - env_(instrumented_mutex->env_), + clock_(instrumented_mutex->clock_), stats_code_(instrumented_mutex->stats_code_) {} void Wait(); @@ -91,7 +109,7 @@ bool TimedWaitInternal(uint64_t abs_time_us); port::CondVar cond_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,23 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#ifdef NIOSTATS_CONTEXT +// Should not be used because the counters are not thread-safe. +// Put here just to make get_iostats_context() simple without ifdef. +static IOStatsContext iostats_context; +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) __thread IOStatsContext iostats_context; +#else +#error \ + "No thread-local support. 
Disable iostats context with -DNIOSTATS_CONTEXT." #endif IOStatsContext* get_iostats_context() { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL return &iostats_context; -#else - return nullptr; -#endif } void IOStatsContext::Reset() { +#ifndef NIOSTATS_CONTEXT thread_pool_id = Env::Priority::TOTAL; bytes_read = 0; bytes_written = 0; @@ -33,6 +37,10 @@ prepare_write_nanos = 0; fsync_nanos = 0; logger_nanos = 0; + cpu_write_nanos = 0; + cpu_read_nanos = 0; + file_io_stats_by_temperature.Reset(); +#endif //! NIOSTATS_CONTEXT } #define IOSTATS_CONTEXT_OUTPUT(counter) \ @@ -41,6 +49,10 @@ } std::string IOStatsContext::ToString(bool exclude_zero_counters) const { +#ifdef NIOSTATS_CONTEXT + (void)exclude_zero_counters; + return ""; +#else std::ostringstream ss; IOSTATS_CONTEXT_OUTPUT(thread_pool_id); IOSTATS_CONTEXT_OUTPUT(bytes_read); @@ -53,10 +65,18 @@ IOSTATS_CONTEXT_OUTPUT(fsync_nanos); IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos); IOSTATS_CONTEXT_OUTPUT(logger_nanos); - + IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos); + IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count); std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); return str; +#endif //! 
NIOSTATS_CONTEXT } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,7 @@ #include "monitoring/perf_step_timer.h" #include "rocksdb/iostats_context.h" -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) && !defined(NIOSTATS_CONTEXT) namespace ROCKSDB_NAMESPACE { extern __thread IOStatsContext iostats_context; } // namespace ROCKSDB_NAMESPACE @@ -15,10 +15,6 @@ // increment a specific counter by the specified value #define IOSTATS_ADD(metric, value) (iostats_context.metric += value) -// Increase metric value only when it is positive -#define IOSTATS_ADD_IF_POSITIVE(metric, value) \ - if (value > 0) { IOSTATS_ADD(metric, value); } - // reset a specific counter to zero #define IOSTATS_RESET(metric) (iostats_context.metric = 0) @@ -38,13 +34,13 @@ iostats_step_timer_##metric.Start(); // Declare and set start time of the timer -#define IOSTATS_CPU_TIMER_GUARD(metric, env) \ +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer iostats_step_timer_##metric( \ - &(iostats_context.metric), env, true, \ + &(iostats_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ iostats_step_timer_##metric.Start(); -#else // ROCKSDB_SUPPORT_THREAD_LOCAL +#else // ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT #define IOSTATS_ADD(metric, value) #define IOSTATS_ADD_IF_POSITIVE(metric, value) @@ -55,6 +51,6 @@ #define IOSTATS(metric) 0 #define IOSTATS_TIMER_GUARD(metric) -#define IOSTATS_CPU_TIMER_GUARD(metric, env) static_cast(env) +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) static_cast(clock) -#endif // ROCKSDB_SUPPORT_THREAD_LOCAL +#endif 
// ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,26 +9,22 @@ namespace ROCKSDB_NAMESPACE { -#if defined(NPERF_CONTEXT) || !defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(NPERF_CONTEXT) +// Should not be used because the counters are not thread-safe. +// Put here just to make get_perf_context() simple without ifdef. PerfContext perf_context; -#else +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) #if defined(OS_SOLARIS) -__thread PerfContext perf_context_; -#else +__thread PerfContext perf_context; +#else // OS_SOLARIS thread_local PerfContext perf_context; -#endif +#endif // OS_SOLARIS +#else +#error "No thread-local support. Disable perf context with -DNPERF_CONTEXT." 
#endif PerfContext* get_perf_context() { -#if defined(NPERF_CONTEXT) || !defined(ROCKSDB_SUPPORT_THREAD_LOCAL) - return &perf_context; -#else -#if defined(OS_SOLARIS) - return &perf_context_; -#else return &perf_context; -#endif -#endif } PerfContext::~PerfContext() { @@ -38,7 +34,9 @@ } PerfContext::PerfContext(const PerfContext& other) { -#ifndef NPERF_CONTEXT +#ifdef NPERF_CONTEXT + (void)other; +#else user_key_comparison_count = other.user_key_comparison_count; block_cache_hit_count = other.block_cache_hit_count; block_read_count = other.block_read_count; @@ -49,6 +47,7 @@ block_cache_filter_hit_count = other.block_cache_filter_hit_count; filter_block_read_count = other.filter_block_read_count; compression_dict_block_read_count = other.compression_dict_block_read_count; + secondary_cache_hit_count = other.secondary_cache_hit_count; block_checksum_time = other.block_checksum_time; block_decompress_time = other.block_decompress_time; get_read_bytes = other.get_read_bytes; @@ -133,7 +132,9 @@ } PerfContext::PerfContext(PerfContext&& other) noexcept { -#ifndef NPERF_CONTEXT +#ifdef NPERF_CONTEXT + (void)other; +#else user_key_comparison_count = other.user_key_comparison_count; block_cache_hit_count = other.block_cache_hit_count; block_read_count = other.block_read_count; @@ -144,6 +145,7 @@ block_cache_filter_hit_count = other.block_cache_filter_hit_count; filter_block_read_count = other.filter_block_read_count; compression_dict_block_read_count = other.compression_dict_block_read_count; + secondary_cache_hit_count = other.secondary_cache_hit_count; block_checksum_time = other.block_checksum_time; block_decompress_time = other.block_decompress_time; get_read_bytes = other.get_read_bytes; @@ -230,7 +232,9 @@ // TODO(Zhongyi): reduce code duplication between copy constructor and // assignment operator PerfContext& PerfContext::operator=(const PerfContext& other) { -#ifndef NPERF_CONTEXT +#ifdef NPERF_CONTEXT + (void)other; +#else user_key_comparison_count = 
other.user_key_comparison_count; block_cache_hit_count = other.block_cache_hit_count; block_read_count = other.block_read_count; @@ -241,6 +245,7 @@ block_cache_filter_hit_count = other.block_cache_filter_hit_count; filter_block_read_count = other.filter_block_read_count; compression_dict_block_read_count = other.compression_dict_block_read_count; + secondary_cache_hit_count = other.secondary_cache_hit_count; block_checksum_time = other.block_checksum_time; block_decompress_time = other.block_decompress_time; get_read_bytes = other.get_read_bytes; @@ -337,6 +342,7 @@ block_cache_filter_hit_count = 0; filter_block_read_count = 0; compression_dict_block_read_count = 0; + secondary_cache_hit_count = 0; block_checksum_time = 0; block_decompress_time = 0; get_read_bytes = 0; @@ -443,6 +449,7 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { #ifdef NPERF_CONTEXT + (void)exclude_zero_counters; return ""; #else std::ostringstream ss; @@ -456,6 +463,7 @@ PERF_CONTEXT_OUTPUT(block_cache_filter_hit_count); PERF_CONTEXT_OUTPUT(filter_block_read_count); PERF_CONTEXT_OUTPUT(compression_dict_block_read_count); + PERF_CONTEXT_OUTPUT(secondary_cache_hit_count); PERF_CONTEXT_OUTPUT(block_checksum_time); PERF_CONTEXT_OUTPUT(block_decompress_time); PERF_CONTEXT_OUTPUT(get_read_bytes); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,8 +25,8 @@ #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) #define PERF_TIMER_GUARD(metric) -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) -#define PERF_CPU_TIMER_GUARD(metric, env) +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) +#define PERF_CPU_TIMER_GUARD(metric, clock) #define 
PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ ticker_type) #define PERF_TIMER_MEASURE(metric) @@ -46,14 +46,14 @@ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), clock); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_CPU_TIMER_GUARD(metric, env) \ +#define PERF_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer perf_step_timer_##metric( \ - &(perf_context.metric), env, true, \ + &(perf_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ perf_step_timer_##metric.Start(); @@ -77,20 +77,19 @@ } // Increase metric value -#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ - if (perf_level >= PerfLevel::kEnableCount && \ - perf_context.per_level_perf_context_enabled && \ - perf_context.level_to_perf_context) { \ - if ((*(perf_context.level_to_perf_context)).find(level) != \ - (*(perf_context.level_to_perf_context)).end()) { \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - else { \ - PerfContextByLevel empty_context; \ - (*(perf_context.level_to_perf_context))[level] = empty_context; \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - } \ +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ + if (perf_level >= PerfLevel::kEnableCount && \ + perf_context.per_level_perf_context_enabled && \ + perf_context.level_to_perf_context) { \ + if ((*(perf_context.level_to_perf_context)).find(level) != \ + (*(perf_context.level_to_perf_context)).end()) { \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } else { \ + PerfContextByLevel empty_context; \ + (*(perf_context.level_to_perf_context))[level] = empty_context; \ + 
(*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + } #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,26 +5,26 @@ // #pragma once #include "monitoring/perf_level_imp.h" -#include "rocksdb/env.h" -#include "util/stop_watch.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { class PerfStepTimer { public: explicit PerfStepTimer( - uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + uint64_t* metric, SystemClock* clock = nullptr, bool use_cpu_time = false, PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, Statistics* statistics = nullptr, uint32_t ticker_type = 0) : perf_counter_enabled_(perf_level >= enable_level), use_cpu_time_(use_cpu_time), - env_((perf_counter_enabled_ || statistics != nullptr) - ? ((env != nullptr) ? env : Env::Default()) - : nullptr), + ticker_type_(ticker_type), + clock_((perf_counter_enabled_ || statistics != nullptr) + ? (clock ? 
clock : SystemClock::Default().get()) + : nullptr), start_(0), metric_(metric), - statistics_(statistics), - ticker_type_(ticker_type) {} + statistics_(statistics) {} ~PerfStepTimer() { Stop(); @@ -36,14 +36,6 @@ } } - uint64_t time_now() { - if (!use_cpu_time_) { - return env_->NowNanos(); - } else { - return env_->NowCPUNanos(); - } - } - void Measure() { if (start_) { uint64_t now = time_now(); @@ -67,13 +59,21 @@ } private: + uint64_t time_now() { + if (!use_cpu_time_) { + return clock_->NowNanos(); + } else { + return clock_->CPUNanos(); + } + } + const bool perf_counter_enabled_; const bool use_cpu_time_; - Env* const env_; + uint32_t ticker_type_; + SystemClock* const clock_; uint64_t start_; uint64_t* metric_; Statistics* statistics_; - uint32_t ticker_type_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include #include "db/db_impl/db_impl.h" -#include "port/likely.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,8 +8,12 @@ #include #include #include -#include "port/likely.h" + +#include "rocksdb/convenience.h" #include "rocksdb/statistics.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" 
namespace ROCKSDB_NAMESPACE { @@ -105,6 +109,12 @@ {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, + {COMPACT_READ_BYTES_MARKED, "rocksdb.compact.read.marked.bytes"}, + {COMPACT_READ_BYTES_PERIODIC, "rocksdb.compact.read.periodic.bytes"}, + {COMPACT_READ_BYTES_TTL, "rocksdb.compact.read.ttl.bytes"}, + {COMPACT_WRITE_BYTES_MARKED, "rocksdb.compact.write.marked.bytes"}, + {COMPACT_WRITE_BYTES_PERIODIC, "rocksdb.compact.write.periodic.bytes"}, + {COMPACT_WRITE_BYTES_TTL, "rocksdb.compact.write.ttl.bytes"}, {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, "rocksdb.number.direct.load.table.properties"}, {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, @@ -176,6 +186,42 @@ "rocksdb.block.cache.compression.dict.bytes.insert"}, {BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, "rocksdb.block.cache.compression.dict.bytes.evict"}, + {BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"}, + {BLOCK_CACHE_INDEX_ADD_REDUNDANT, + "rocksdb.block.cache.index.add.redundant"}, + {BLOCK_CACHE_FILTER_ADD_REDUNDANT, + "rocksdb.block.cache.filter.add.redundant"}, + {BLOCK_CACHE_DATA_ADD_REDUNDANT, "rocksdb.block.cache.data.add.redundant"}, + {BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + "rocksdb.block.cache.compression.dict.add.redundant"}, + {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"}, + {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"}, + {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"}, + {ERROR_HANDLER_BG_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.io.errro.count"}, + {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.retryable.io.errro.count"}, + {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"}, + {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + "rocksdb.error.handler.autoresume.retry.total.count"}, + {ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + 
"rocksdb.error.handler.autoresume.success.count"}, + {MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + "rocksdb.memtable.payload.bytes.at.flush"}, + {MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + "rocksdb.memtable.garbage.bytes.at.flush"}, + {SECONDARY_CACHE_HITS, "rocksdb.secondary.cache.hits"}, + {VERIFY_CHECKSUM_READ_BYTES, "rocksdb.verify_checksum.read.bytes"}, + {BACKUP_READ_BYTES, "rocksdb.backup.read.bytes"}, + {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"}, + {REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"}, + {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"}, + {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"}, + {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"}, + {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"}, + {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"}, + {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"}, + {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"}, }; const std::vector> HistogramsNameMap = { @@ -227,14 +273,64 @@ {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, {FLUSH_TIME, "rocksdb.db.flush.micros"}, {SST_BATCH_SIZE, "rocksdb.sst.batch.size"}, + {NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + "rocksdb.num.index.and.filter.blocks.read.per.level"}, + {NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"}, + {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, + {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + "rocksdb.error.handler.autoresume.retry.count"}, }; std::shared_ptr CreateDBStatistics() { return std::make_shared(nullptr); } +#ifndef ROCKSDB_LITE +static int RegisterBuiltinStatistics(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + StatisticsImpl::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new StatisticsImpl(nullptr)); + return guard->get(); + }); + return 1; +} +#endif // ROCKSDB_LITE + +Status Statistics::CreateFromString(const ConfigOptions& 
config_options, + const std::string& id, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinStatistics(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + Status s; + if (id == "" || id == StatisticsImpl::kClassName()) { + result->reset(new StatisticsImpl(nullptr)); + } else if (id == kNullptrString) { + result->reset(); + } else { + s = LoadSharedObject(config_options, id, nullptr, result); + } + return s; +} + +static std::unordered_map stats_type_info = { +#ifndef ROCKSDB_LITE + {"inner", OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kCompareNever)}, +#endif // !ROCKSDB_LITE +}; + StatisticsImpl::StatisticsImpl(std::shared_ptr stats) - : stats_(std::move(stats)) {} + : stats_(std::move(stats)) { + RegisterOptions("StatisticsOptions", &stats_, &stats_type_info); +} StatisticsImpl::~StatisticsImpl() {} @@ -313,11 +409,17 @@ } void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - per_core_stats_.Access()->tickers_[tickerType].fetch_add( - count, std::memory_order_relaxed); - if (stats_ && tickerType < TICKER_ENUM_MAX) { - stats_->recordTick(tickerType, count); + if (get_stats_level() <= StatsLevel::kExceptTickers) { + return; + } + if (tickerType < TICKER_ENUM_MAX) { + per_core_stats_.Access()->tickers_[tickerType].fetch_add( + count, std::memory_order_relaxed); + if (stats_) { + stats_->recordTick(tickerType, count); + } + } else { + assert(false); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.h 2025-05-19 16:14:27.000000000 +0000 @@ -44,6 +44,8 @@ public: StatisticsImpl(std::shared_ptr 
stats); virtual ~StatisticsImpl(); + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "BasicStatistics"; } virtual uint64_t getTickerCount(uint32_t ticker_type) const override; virtual void histogramData(uint32_t histogram_type, @@ -68,6 +70,8 @@ virtual bool getTickerMap(std::map*) const override; virtual bool HistEnabledForType(uint32_t type) const override; + const Customizable* Inner() const override { return stats_.get(); } + private: // If non-nullptr, forwards updates to the object pointed to by `stats_`. std::shared_ptr stats_; @@ -96,7 +100,9 @@ void operator delete[](void *p) { port::cacheline_aligned_free(p); } }; +#ifndef TEST_CACHE_LINE_SIZE static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0, "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned"); +#endif CoreLocalArray per_core_stats_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,12 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). 
// +#include "rocksdb/statistics.h" + #include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/options_type.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "rocksdb/statistics.h" - namespace ROCKSDB_NAMESPACE { class StatisticsTest : public testing::Test {}; @@ -38,6 +40,49 @@ } } +TEST_F(StatisticsTest, NoNameStats) { + static std::unordered_map no_name_opt_info = { +#ifndef ROCKSDB_LITE + {"inner", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, + OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever)}, +#endif // ROCKSDB_LITE + }; + + class DefaultNameStatistics : public Statistics { + public: + DefaultNameStatistics(const std::shared_ptr& stats = nullptr) + : inner(stats) { + RegisterOptions("", &inner, &no_name_opt_info); + } + + uint64_t getTickerCount(uint32_t /*tickerType*/) const override { + return 0; + } + void histogramData(uint32_t /*type*/, + HistogramData* const /*data*/) const override {} + void recordTick(uint32_t /*tickerType*/, uint64_t /*count*/) override {} + void setTickerCount(uint32_t /*tickerType*/, uint64_t /*count*/) override {} + uint64_t getAndResetTickerCount(uint32_t /*tickerType*/) override { + return 0; + } + std::shared_ptr inner; + }; + ConfigOptions options; + options.ignore_unsupported_options = false; + auto stats = std::make_shared(); + ASSERT_STREQ(stats->Name(), ""); +#ifndef ROCKSDB_LITE + ASSERT_EQ("", stats->ToString( + options)); // A stats with no name with have no options... + ASSERT_OK(stats->ConfigureFromString(options, "inner=")); + ASSERT_EQ("", stats->ToString( + options)); // A stats with no name with have no options... + ASSERT_NE(stats->inner, nullptr); + ASSERT_NE("", stats->inner->ToString(options)); // ... even if it does... 
+#endif // ROCKSDB_LITE +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,8 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "rocksdb/stats_history.h" + #include #include #include @@ -13,58 +15,70 @@ #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "db/periodic_work_scheduler.h" #include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" -#include "rocksdb/stats_history.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE class StatsHistoryTest : public DBTestBase { public: - StatsHistoryTest() : DBTestBase("/stats_history_test") {} + StatsHistoryTest() : DBTestBase("stats_history_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } + + protected: + std::shared_ptr mock_clock_; + std::unique_ptr mock_env_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { + auto* periodic_work_scheduler_ptr = + reinterpret_cast(arg); + 
*periodic_work_scheduler_ptr = + PeriodicWorkTestScheduler::Default(mock_clock_); + }); + } }; -#ifndef ROCKSDB_LITE TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_dump_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.stats_dump_period_sec = kPeriodSec; + options.env = mock_env_.get(); int counter = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DumpStats:1", [&](void* /*arg*/) { counter++; }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:1", + [&](void* /*arg*/) { counter++; }); Reopen(options); ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); + + // Wait for the first stats persist to finish, as the initial delay could be + // different. 
+ dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); - // Test cacel job through SetOptions + // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); int old_val = counter; - for (int i = 6; i < 20; ++i) { - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); + for (int i = 1; i < 20; ++i) { + mock_clock_->MockSleepForSeconds(kPeriodSec); } ASSERT_EQ(counter, old_val); Close(); @@ -72,120 +86,96 @@ // Test persistent stats background thread scheduling and cancelling TEST_F(StatsHistoryTest, StatsPersistScheduling) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG + options.stats_persist_period_sec = kPeriodSec; + options.env = mock_env_.get(); int counter = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:Entry", + [&](void* /*arg*/) { counter++; }); Reopen(options); ASSERT_EQ(5u, 
dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); - // Test cacel job through SetOptions - ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); + // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); + int old_val = counter; + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec * 2); }); + ASSERT_EQ(counter, old_val); + Close(); } // Test enabling persistent stats for the first time TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { + constexpr unsigned int kPeriodSec = 5; Options options; options.create_if_missing = true; options.stats_persist_period_sec = 0; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG + options.env = mock_env_.get(); int counter = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:Entry", + [&](void* /*arg*/) { counter++; }); Reopen(options); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); - ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_persist_period_sec", std::to_string(kPeriodSec)}})); + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); Close(); } // TODO(Zhongyi): Move persistent stats related tests to a separate file TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); 
ReopenWithColumnFamilies({"default", "pikachu"}, options); - int mock_time = 1; + // make sure the first stats persist to finish + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 6 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); // disabled stats snapshots ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); size_t stats_count = 0; for (; stats_iter->Valid(); stats_iter->Next()) { auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5); + ASSERT_EQ(stats_iter->GetStatsTime(), mock_clock_->NowSeconds()); stats_count += stats_map.size(); } ASSERT_GT(stats_count, 0); // Wait a bit and verify no more stats are found - for (mock_time = 6; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); + for (int i = 0; i < 10; ++i) { + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(1); }); } - db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_new = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -196,26 +186,12 @@ } TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { + constexpr int kPeriodSec = 1; Options options; options.create_if_missing = true; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.stats_persist_period_sec = 1; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - 
mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG + options.statistics = CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); @@ -235,13 +211,7 @@ delete iterator; ASSERT_OK(Flush()); ASSERT_OK(Delete("sol")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - int mock_time = 1; - // Wait for stats persist to finish - for (; mock_time < 5; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // second round of ops ASSERT_OK(Put("saigon", "saigon")); @@ -253,13 +223,17 @@ } delete iterator; ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - for (; mock_time < 10; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + const int kIterations = 10; + for (int i = 0; i < kIterations; ++i) { + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } + std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 10 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); 
ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; @@ -269,17 +243,20 @@ stats_count += stats_map.size(); } size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); - ASSERT_GE(slice_count, 9); - ASSERT_GE(stats_history_size, 12000); - // capping memory cost at 12000 bytes since one slice is around 10000~12000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); - ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); + ASSERT_GE(slice_count, kIterations - 1); + ASSERT_GE(stats_history_size, 15000); + // capping memory cost at 15000 bytes since one slice is around 10000~15000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "15000"}})); + ASSERT_EQ(15000, dbfull()->GetDBOptions().stats_history_buffer_size); + // Wait for stats persist to finish - for (; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); + for (int i = 0; i < kIterations; ++i) { + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } - db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; slice_count = 0; @@ -292,7 +269,7 @@ dbfull()->TEST_EstimateInMemoryStatsHistorySize(); // only one slice can fit under the new stats_history_buffer_size ASSERT_LT(slice_count, 2); - ASSERT_TRUE(stats_history_size_reopen < 12000 && + ASSERT_TRUE(stats_history_size_reopen < 15000 && stats_history_size_reopen > 0); ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); Close(); @@ -309,34 +286,41 @@ } TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = 
ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(Get("foo"), "bar"); + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count1 = countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(10); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count2 = countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(15); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count3 = countkeys(iter); @@ -345,15 +329,16 @@ ASSERT_GE(key_count3, key_count2); ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); 
ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; int non_zero_count = 0; - for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) { slice_count++; auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1); for (auto& stat : stats_map) { if (stat.second != 0) { non_zero_count++; @@ -366,7 +351,8 @@ ASSERT_EQ(stats_count, key_count3 - 2); // verify reopen will not cause data loss ReopenWithColumnFamilies({"default", "pikachu"}, options); - db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; int slice_count_reopen = 0; @@ -381,6 +367,7 @@ } stats_count_reopen += stats_map.size(); } + ASSERT_EQ(non_zero_count, non_zero_count_recover); ASSERT_EQ(slice_count, slice_count_reopen); ASSERT_EQ(stats_count, stats_count_reopen); @@ -390,53 +377,61 @@ // Test persisted stats matches the value found in options.statistics and // the stats value retains after DB reopen TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); std::map stats_map_before; ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_before)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); 
ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(Get("foo"), "bar"); + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(10); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(15); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(20); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::map stats_map_after; ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); std::string sample = "rocksdb.num.iterator.deleted"; uint64_t recovered_value = 0; - for (int i = 1; stats_iter->Valid(); stats_iter->Next(), ++i) { + for (int i = 2; stats_iter->Valid(); stats_iter->Next(), ++i) { auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5 
* i); + ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1); for (const auto& stat : stats_map) { if (sample.compare(stat.first) == 0) { recovered_value += stat.second; @@ -447,12 +442,13 @@ // test stats value retains after recovery ReopenWithColumnFamilies({"default", "pikachu"}, options); - db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); uint64_t new_recovered_value = 0; - for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) { auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1); for (const auto& stat : stats_map) { if (sample.compare(stat.first) == 0) { new_recovered_value += stat.second; @@ -469,15 +465,13 @@ // TODO(Zhongyi): add test for different format versions TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); ASSERT_OK(TryReopen(options)); CreateColumnFamilies({"one", "two", "three"}, options); ASSERT_OK(Put(1, "foo", "bar")); @@ -486,7 +480,13 @@ CreateColumnFamilies({"four"}, options); ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); ASSERT_EQ(Get(2, "foo"), "bar"); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + + // make sure the first stats 
persist to finish + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count = countkeys(iter); @@ -495,7 +495,7 @@ uint64_t num_write_wal = 0; std::string sample = "rocksdb.write.wal"; std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); for (; stats_iter->Valid(); stats_iter->Next()) { auto stats_map = stats_iter->GetStatsMap(); @@ -506,7 +506,7 @@ } } stats_iter.reset(); - ASSERT_EQ(num_write_wal, 2); + ASSERT_EQ(num_write_wal, 1); options.persist_stats_to_disk = false; ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); @@ -531,7 +531,7 @@ ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, &handle)); // verify stats is not affected by prior failed CF creation - db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); num_write_wal = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -542,7 +542,7 @@ } } } - ASSERT_EQ(num_write_wal, 2); + ASSERT_EQ(num_write_wal, 1); Close(); Destroy(options); @@ -562,25 +562,29 @@ // Reopen and flush memtable. ASSERT_OK(TryReopen(options)); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. 
ASSERT_OK(ReadOnlyReopen(options)); } TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + ColumnFamilyData* cfd_default = static_cast(dbfull()->DefaultColumnFamily()) ->cfd(); @@ -596,7 +600,9 @@ ASSERT_EQ("v0", Get("foo")); ASSERT_OK(Put(1, "Eevee", "v0")); ASSERT_EQ("v0", Get(1, "Eevee")); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flush default cf // LogNumbers: default: 14, stats: 4, pikachu: 4 ASSERT_OK(Flush()); @@ -619,8 +625,9 @@ ASSERT_OK(Put("bar2", "v2")); ASSERT_EQ("v2", Get("bar2")); ASSERT_EQ("v2", Get("foo2")); - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(10); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to default and stats cf, flushing default cf // LogNumbers: default: 19, stats: 19, pikachu: 19 ASSERT_OK(Flush()); @@ -633,8 +640,9 @@ ASSERT_EQ("v3", Get("foo3")); ASSERT_OK(Put(1, "Jolteon", "v3")); ASSERT_EQ("v3", Get(1, 
"Jolteon")); - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(15); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flushing test cf // LogNumbers: default: 19, stats: 19, pikachu: 22 ASSERT_OK(Flush(1)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,9 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). #include "monitoring/thread_status_updater.h" + #include + #include "port/likely.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -159,7 +162,7 @@ std::vector* thread_list) { thread_list->clear(); std::vector> valid_list; - uint64_t now_micros = Env::Default()->NowMicros(); + uint64_t now_micros = SystemClock::Default()->NowMicros(); std::lock_guard lck(thread_list_mutex_); for (auto* thread_data : thread_data_set_) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ #include "db/column_family.h" #include "monitoring/thread_status_updater.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -19,7 +20,7 @@ assert(cf_info_map_.size() == handles.size()); } for (auto* handle : handles) { - auto* cfd = 
reinterpret_cast(handle)->cfd(); + auto* cfd = static_cast_with_check(handle)->cfd(); auto iter __attribute__((__unused__)) = cf_info_map_.find(cfd); if (check_exist) { assert(iter != cf_info_map_.end()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -57,7 +58,7 @@ } if (op != ThreadStatus::OP_UNKNOWN) { - uint64_t current_time = Env::Default()->NowMicros(); + uint64_t current_time = SystemClock::Default()->NowMicros(); thread_updater_local_cache_->SetOperationStartTime(current_time); } else { // TDOO(yhchiang): we could report the time when we set operation to diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +23,7 @@ void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { auto delay = states_delay[state].load(std::memory_order_relaxed); if (delay > 0) { - Env::Default()->SleepForMicroseconds(delay); + SystemClock::Default()->SleepForMicroseconds(delay); } } diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,27 +9,833 @@ #include #include #include + +#include "logging/logging.h" +#include "options/configurable_helper.h" #include "options/db_options.h" +#include "options/options_helper.h" +#include "options/options_parser.h" #include "port/port.h" +#include "rocksdb/compaction_filter.h" #include "rocksdb/concurrent_task_limiter.h" +#include "rocksdb/configurable.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyOptions::num_levels) +// This call will return the offset of num_levels in ColumnFamilyOptions class +// +// This is the same as offsetof() but allow us to work with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +#ifndef ROCKSDB_LITE +static ImmutableCFOptions dummy_cf_options; +template +int offset_of(T1 ImmutableCFOptions::*member) { + return int(size_t(&(dummy_cf_options.*member)) - size_t(&dummy_cf_options)); +} + +static Status ParseCompressionOptions(const std::string& value, + const std::string& name, + CompressionOptions& compression_opts) { + const char kDelimiter = ':'; + std::istringstream field_stream(value); + std::string field; + + if (!std::getline(field_stream, field, kDelimiter)) { + return 
Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.window_bits = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.level = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.strategy = ParseInt(field); + + // max_dict_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_bytes = ParseInt(field); + } + + // zstd_max_train_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.zstd_max_train_bytes = ParseInt(field); + } + + // parallel_threads is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + // Since parallel_threads comes before enabled but was added optionally + // later, we need to check if this is the final token (meaning it is the + // enabled bit), or if there are more tokens (meaning this one is + // parallel_threads). 
+ if (!field_stream.eof()) { + compression_opts.parallel_threads = ParseInt(field); + } else { + // parallel_threads is not serialized with this format, but enabled is + compression_opts.enabled = ParseBoolean("", field); + } + } + + // enabled is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.enabled = ParseBoolean("", field); + } + + // max_dict_buffer_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_buffer_bytes = ParseUint64(field); + } + + if (!field_stream.eof()) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + return Status::OK(); +} + +const std::string kOptNameBMCompOpts = "bottommost_compression_opts"; +const std::string kOptNameCompOpts = "compression_opts"; + +// OptionTypeInfo map for CompressionOptions +static std::unordered_map + compression_options_type_info = { + {"window_bits", + {offsetof(struct CompressionOptions, window_bits), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"level", + {offsetof(struct CompressionOptions, level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strategy", + {offsetof(struct CompressionOptions, strategy), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_dict_bytes", + {offsetof(struct CompressionOptions, max_dict_bytes), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"zstd_max_train_bytes", + {offsetof(struct CompressionOptions, zstd_max_train_bytes), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, 
+ {"parallel_threads", + {offsetof(struct CompressionOptions, parallel_threads), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enabled", + {offsetof(struct CompressionOptions, enabled), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_dict_buffer_bytes", + {offsetof(struct CompressionOptions, max_dict_buffer_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + fifo_compaction_options_type_info = { + {"max_table_files_size", + {offsetof(struct CompactionOptionsFIFO, max_table_files_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"age_for_warm", + {offsetof(struct CompactionOptionsFIFO, age_for_warm), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"ttl", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"allow_compaction", + {offsetof(struct CompactionOptionsFIFO, allow_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + universal_compaction_options_type_info = { + {"size_ratio", + {offsetof(class CompactionOptionsUniversal, size_ratio), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_merge_width", + {offsetof(class CompactionOptionsUniversal, min_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_merge_width", + {offsetof(class CompactionOptionsUniversal, max_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_size_amplification_percent", + {offsetof(class CompactionOptionsUniversal, + max_size_amplification_percent), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + 
{"compression_size_percent", + {offsetof(class CompactionOptionsUniversal, compression_size_percent), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stop_style", + {offsetof(class CompactionOptionsUniversal, stop_style), + OptionType::kCompactionStopStyle, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"incremental", + {offsetof(class CompactionOptionsUniversal, incremental), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"allow_trivial_move", + {offsetof(class CompactionOptionsUniversal, allow_trivial_move), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}}; + +static std::unordered_map + cf_mutable_options_type_info = { + {"report_bg_io_stats", + {offsetof(struct MutableCFOptions, report_bg_io_stats), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"disable_auto_compactions", + {offsetof(struct MutableCFOptions, disable_auto_compactions), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"filter_deletes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"check_flush_compaction_key_order", + {offsetof(struct MutableCFOptions, check_flush_compaction_key_order), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"paranoid_file_checks", + {offsetof(struct MutableCFOptions, paranoid_file_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"verify_checksums_in_compaction", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"soft_pending_compaction_bytes_limit", + {offsetof(struct MutableCFOptions, + soft_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"hard_pending_compaction_bytes_limit", 
+ {offsetof(struct MutableCFOptions, + hard_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"hard_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"soft_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_compaction_bytes", + {offsetof(struct MutableCFOptions, max_compaction_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"expanded_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"level0_file_num_compaction_trigger", + {offsetof(struct MutableCFOptions, level0_file_num_compaction_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"level0_slowdown_writes_trigger", + {offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"level0_stop_writes_trigger", + {offsetof(struct MutableCFOptions, level0_stop_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_grandparent_overlap_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_write_buffer_number", + {offsetof(struct MutableCFOptions, max_write_buffer_number), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"source_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"target_file_size_multiplier", + {offsetof(struct MutableCFOptions, target_file_size_multiplier), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"arena_block_size", + {offsetof(struct MutableCFOptions, arena_block_size), + OptionType::kSizeT, 
OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"inplace_update_num_locks", + {offsetof(struct MutableCFOptions, inplace_update_num_locks), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_successive_merges", + {offsetof(struct MutableCFOptions, max_successive_merges), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_huge_page_size", + {offsetof(struct MutableCFOptions, memtable_huge_page_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_huge_page_tlb_size", + {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"write_buffer_size", + {offsetof(struct MutableCFOptions, write_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_bits", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_size_ratio", + {offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_probes", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"memtable_whole_key_filtering", + {offsetof(struct MutableCFOptions, memtable_whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_partial_merge_operands", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_bytes_for_level_base", + {offsetof(struct MutableCFOptions, max_bytes_for_level_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"snap_refresh_nanos", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + 
{"max_bytes_for_level_multiplier", + {offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_bytes_for_level_multiplier_additional", + OptionTypeInfo::Vector( + offsetof(struct MutableCFOptions, + max_bytes_for_level_multiplier_additional), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable, + {0, OptionType::kInt})}, + {"max_sequential_skip_in_iterations", + {offsetof(struct MutableCFOptions, max_sequential_skip_in_iterations), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"target_file_size_base", + {offsetof(struct MutableCFOptions, target_file_size_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compression", + {offsetof(struct MutableCFOptions, compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prefix_extractor", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct MutableCFOptions, prefix_extractor), + OptionVerificationType::kByNameAllowNull, + (OptionTypeFlags::kMutable | OptionTypeFlags::kAllowNull))}, + {"compaction_options_fifo", + OptionTypeInfo::Struct( + "compaction_options_fifo", &fifo_compaction_options_type_info, + offsetof(struct MutableCFOptions, compaction_options_fifo), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compaction_options_fifo could be assigned a single scalar + // value, say, like "23", which would be assigned to + // max_table_files_size. + if (name == "compaction_options_fifo" && + value.find("=") == std::string::npos) { + // Old format. Parse just a single uint64_t value. 
+ auto options = static_cast(addr); + options->max_table_files_size = ParseUint64(value); + return Status::OK(); + } else { + return OptionTypeInfo::ParseStruct( + opts, "compaction_options_fifo", + &fifo_compaction_options_type_info, name, value, addr); + } + })}, + {"compaction_options_universal", + OptionTypeInfo::Struct( + "compaction_options_universal", + &universal_compaction_options_type_info, + offsetof(struct MutableCFOptions, compaction_options_universal), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}, + {"ttl", + {offsetof(struct MutableCFOptions, ttl), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"periodic_compaction_seconds", + {offsetof(struct MutableCFOptions, periodic_compaction_seconds), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enable_blob_files", + {offsetof(struct MutableCFOptions, enable_blob_files), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_blob_size", + {offsetof(struct MutableCFOptions, min_blob_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_file_size", + {offsetof(struct MutableCFOptions, blob_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_compression_type", + {offsetof(struct MutableCFOptions, blob_compression_type), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enable_blob_garbage_collection", + {offsetof(struct MutableCFOptions, enable_blob_garbage_collection), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_age_cutoff", + {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_force_threshold", + 
{offsetof(struct MutableCFOptions, + blob_garbage_collection_force_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_compaction_readahead_size", + {offsetof(struct MutableCFOptions, blob_compaction_readahead_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"sample_for_compression", + {offsetof(struct MutableCFOptions, sample_for_compression), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"bottommost_compression", + {offsetof(struct MutableCFOptions, bottommost_compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {kOptNameCompOpts, + OptionTypeInfo::Struct( + kOptNameCompOpts, &compression_options_type_info, + offsetof(struct MutableCFOptions, compression_opts), + OptionVerificationType::kNormal, + (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compression_options was a ":" separated list. + if (name == kOptNameCompOpts && + value.find("=") == std::string::npos) { + auto* compression = static_cast(addr); + return ParseCompressionOptions(value, name, *compression); + } else { + return OptionTypeInfo::ParseStruct( + opts, kOptNameCompOpts, &compression_options_type_info, + name, value, addr); + } + })}, + {kOptNameBMCompOpts, + OptionTypeInfo::Struct( + kOptNameBMCompOpts, &compression_options_type_info, + offsetof(struct MutableCFOptions, bottommost_compression_opts), + OptionVerificationType::kNormal, + (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compression_options was a ":" separated list. 
+ if (name == kOptNameBMCompOpts && + value.find("=") == std::string::npos) { + auto* compression = static_cast(addr); + return ParseCompressionOptions(value, name, *compression); + } else { + return OptionTypeInfo::ParseStruct( + opts, kOptNameBMCompOpts, &compression_options_type_info, + name, value, addr); + } + })}, + // End special case properties +}; + +static std::unordered_map + cf_immutable_options_type_info = { + /* not yet supported + CompressionOptions compression_opts; + TablePropertiesCollectorFactories table_properties_collector_factories; + using TablePropertiesCollectorFactories = + std::vector>; + UpdateStatus (*inplace_callback)(char* existing_value, + uint34_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + std::vector cf_paths; + */ + {"compaction_measure_io_stats", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"inplace_update_support", + {offset_of(&ImmutableCFOptions::inplace_update_support), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"level_compaction_dynamic_level_bytes", + {offset_of(&ImmutableCFOptions::level_compaction_dynamic_level_bytes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"optimize_filters_for_hits", + {offset_of(&ImmutableCFOptions::optimize_filters_for_hits), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"force_consistency_checks", + {offset_of(&ImmutableCFOptions::force_consistency_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"purge_redundant_kvs_while_flush", + {offset_of(&ImmutableCFOptions::purge_redundant_kvs_while_flush), + OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"max_mem_compaction_level", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + 
{"max_write_buffer_number_to_maintain", + {offset_of(&ImmutableCFOptions::max_write_buffer_number_to_maintain), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, 0}}, + {"max_write_buffer_size_to_maintain", + {offset_of(&ImmutableCFOptions::max_write_buffer_size_to_maintain), + OptionType::kInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"min_write_buffer_number_to_merge", + {offset_of(&ImmutableCFOptions::min_write_buffer_number_to_merge), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, 0}}, + {"num_levels", + {offset_of(&ImmutableCFOptions::num_levels), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bloom_locality", + {offset_of(&ImmutableCFOptions::bloom_locality), OptionType::kUInt32T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"rate_limit_delay_max_milliseconds", + {0, OptionType::kUInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"compression_per_level", + OptionTypeInfo::Vector( + offset_of(&ImmutableCFOptions::compression_per_level), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kCompressionType})}, + {"comparator", + OptionTypeInfo::AsCustomRawPtr( + offset_of(&ImmutableCFOptions::user_comparator), + OptionVerificationType::kByName, OptionTypeFlags::kCompareLoose, + // Serializes a Comparator + [](const ConfigOptions& opts, const std::string&, const void* addr, + std::string* value) { + // it's a const pointer of const Comparator* + const auto* ptr = static_cast(addr); + + // Since the user-specified comparator will be wrapped by + // InternalKeyComparator, we should persist the user-specified + // one instead of InternalKeyComparator. 
+ if (*ptr == nullptr) { + *value = kNullptrString; + } else if (opts.mutable_options_only) { + *value = ""; + } else { + const Comparator* root_comp = (*ptr)->GetRootComparator(); + if (root_comp == nullptr) { + root_comp = (*ptr); + } + *value = root_comp->ToString(opts); + } + return Status::OK(); + }, + /* Use the default match function*/ nullptr)}, + {"memtable_insert_with_hint_prefix_extractor", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions:: + memtable_insert_with_hint_prefix_extractor), + OptionVerificationType::kByNameAllowNull, OptionTypeFlags::kNone)}, + {"memtable_factory", + {offset_of(&ImmutableCFOptions::memtable_factory), + OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + std::unique_ptr factory; + auto* shared = + static_cast*>(addr); + Status s = + MemTableRepFactory::CreateFromString(opts, value, &factory); + if (factory && s.ok()) { + shared->reset(factory.release()); + } + return s; + }}}, + {"memtable", + {offset_of(&ImmutableCFOptions::memtable_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + std::unique_ptr factory; + auto* shared = + static_cast*>(addr); + Status s = + MemTableRepFactory::CreateFromString(opts, value, &factory); + if (factory && s.ok()) { + shared->reset(factory.release()); + } + return s; + }}}, + {"table_factory", OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::table_factory), + OptionVerificationType::kByName, + (OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kStringNameOnly | + OptionTypeFlags::kDontPrepare))}, + {"block_based_table_factory", + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared | 
OptionTypeFlags::kCompareLoose, + // Parses the input value and creates a BlockBasedTableFactory + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + BlockBasedTableOptions* old_opts = nullptr; + auto table_factory = + static_cast*>(addr); + if (table_factory->get() != nullptr) { + old_opts = + table_factory->get()->GetOptions(); + } + if (name == "block_based_table_factory") { + std::unique_ptr new_factory; + if (old_opts != nullptr) { + new_factory.reset(NewBlockBasedTableFactory(*old_opts)); + } else { + new_factory.reset(NewBlockBasedTableFactory()); + } + Status s = new_factory->ConfigureFromString(opts, value); + if (s.ok()) { + table_factory->reset(new_factory.release()); + } + return s; + } else if (old_opts != nullptr) { + return table_factory->get()->ConfigureOption(opts, name, value); + } else { + return Status::NotFound("Mismatched table option: ", name); + } + }}}, + {"plain_table_factory", + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, + // Parses the input value and creates a PlainTableFactory + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + PlainTableOptions* old_opts = nullptr; + auto table_factory = + static_cast*>(addr); + if (table_factory->get() != nullptr) { + old_opts = table_factory->get()->GetOptions(); + } + if (name == "plain_table_factory") { + std::unique_ptr new_factory; + if (old_opts != nullptr) { + new_factory.reset(NewPlainTableFactory(*old_opts)); + } else { + new_factory.reset(NewPlainTableFactory()); + } + Status s = new_factory->ConfigureFromString(opts, value); + if (s.ok()) { + table_factory->reset(new_factory.release()); + } + return s; + } else if (old_opts != nullptr) { + return table_factory->get()->ConfigureOption(opts, name, value); + } else { + return Status::NotFound("Mismatched table option: ", 
name); + } + }}}, + {"table_properties_collectors", + OptionTypeInfo::Vector< + std::shared_ptr>( + offset_of( + &ImmutableCFOptions::table_properties_collector_factories), + OptionVerificationType::kByName, OptionTypeFlags::kNone, + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone))}, + {"compaction_filter", + OptionTypeInfo::AsCustomRawPtr( + offset_of(&ImmutableCFOptions::compaction_filter), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"compaction_filter_factory", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::compaction_filter_factory), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"merge_operator", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::merge_operator), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kCompareLoose | OptionTypeFlags::kAllowNull)}, + {"compaction_style", + {offset_of(&ImmutableCFOptions::compaction_style), + OptionType::kCompactionStyle, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"compaction_pri", + {offset_of(&ImmutableCFOptions::compaction_pri), + OptionType::kCompactionPri, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"sst_partitioner_factory", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::sst_partitioner_factory), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, +}; -ImmutableCFOptions::ImmutableCFOptions(const Options& options) - : ImmutableCFOptions(ImmutableDBOptions(options), options) {} +const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; -ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options) +class ConfigurableMutableCFOptions : public Configurable { + public: + explicit ConfigurableMutableCFOptions(const MutableCFOptions& mcf) { + mutable_ = mcf; + RegisterOptions(&mutable_, &cf_mutable_options_type_info); + 
} + + protected: + MutableCFOptions mutable_; +}; + +class ConfigurableCFOptions : public ConfigurableMutableCFOptions { + public: + ConfigurableCFOptions(const ColumnFamilyOptions& opts, + const std::unordered_map* map) + : ConfigurableMutableCFOptions(MutableCFOptions(opts)), + immutable_(opts), + cf_options_(opts), + opt_map_(map) { + RegisterOptions(&immutable_, &cf_immutable_options_type_info); + } + + protected: + Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) override { + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); + if (s.ok()) { + UpdateColumnFamilyOptions(mutable_, &cf_options_); + UpdateColumnFamilyOptions(immutable_, &cf_options_); + s = PrepareOptions(config_options); + } + return s; + } + + virtual const void* GetOptionsPtr(const std::string& name) const override { + if (name == OptionsHelper::kCFOptionsName) { + return &cf_options_; + } else { + return ConfigurableMutableCFOptions::GetOptionsPtr(name); + } + } + + bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const override { + bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr, + that_ptr, mismatch); + if (!equals && opt_info.IsByName()) { + if (opt_map_ == nullptr) { + equals = true; + } else { + const auto& iter = opt_map_->find(opt_name); + if (iter == opt_map_->end()) { + equals = true; + } else { + equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr, + iter->second); + } + } + if (equals) { // False alarm, clear mismatch + *mismatch = ""; + } + } + if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) { + const auto* this_config = opt_info.AsRawPointer(this_ptr); + if (this_config == nullptr) { + const auto& iter = opt_map_->find(opt_name); + // If the name exists in the map and 
is not empty/null, + // then the this_config should be set. + if (iter != opt_map_->end() && !iter->second.empty() && + iter->second != kNullptrString) { + *mismatch = opt_name; + equals = false; + } + } + } + return equals; + } + + private: + ImmutableCFOptions immutable_; + ColumnFamilyOptions cf_options_; + const std::unordered_map* opt_map_; +}; + +std::unique_ptr CFOptionsAsConfigurable( + const MutableCFOptions& opts) { + std::unique_ptr ptr(new ConfigurableMutableCFOptions(opts)); + return ptr; +} +std::unique_ptr CFOptionsAsConfigurable( + const ColumnFamilyOptions& opts, + const std::unordered_map* opt_map) { + std::unique_ptr ptr(new ConfigurableCFOptions(opts, opt_map)); + return ptr; +} +#endif // ROCKSDB_LITE + +ImmutableCFOptions::ImmutableCFOptions() : ImmutableCFOptions(Options()) {} + +ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) : compaction_style(cf_options.compaction_style), compaction_pri(cf_options.compaction_pri), user_comparator(cf_options.comparator), internal_comparator(InternalKeyComparator(cf_options.comparator)), - merge_operator(cf_options.merge_operator.get()), + merge_operator(cf_options.merge_operator), compaction_filter(cf_options.compaction_filter), - compaction_filter_factory(cf_options.compaction_filter_factory.get()), + compaction_filter_factory(cf_options.compaction_filter_factory), min_write_buffer_number_to_merge( cf_options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( @@ -38,47 +844,45 @@ cf_options.max_write_buffer_size_to_maintain), inplace_update_support(cf_options.inplace_update_support), inplace_callback(cf_options.inplace_callback), - info_log(db_options.info_log.get()), - statistics(db_options.statistics.get()), - rate_limiter(db_options.rate_limiter.get()), - info_log_level(db_options.info_log_level), - env(db_options.env), - fs(db_options.fs.get()), - allow_mmap_reads(db_options.allow_mmap_reads), - allow_mmap_writes(db_options.allow_mmap_writes), - 
db_paths(db_options.db_paths), - memtable_factory(cf_options.memtable_factory.get()), - table_factory(cf_options.table_factory.get()), + memtable_factory(cf_options.memtable_factory), + table_factory(cf_options.table_factory), table_properties_collector_factories( cf_options.table_properties_collector_factories), - advise_random_on_open(db_options.advise_random_on_open), bloom_locality(cf_options.bloom_locality), purge_redundant_kvs_while_flush( cf_options.purge_redundant_kvs_while_flush), - use_fsync(db_options.use_fsync), compression_per_level(cf_options.compression_per_level), - bottommost_compression(cf_options.bottommost_compression), - bottommost_compression_opts(cf_options.bottommost_compression_opts), - compression_opts(cf_options.compression_opts), level_compaction_dynamic_level_bytes( cf_options.level_compaction_dynamic_level_bytes), - access_hint_on_compaction_start( - db_options.access_hint_on_compaction_start), - new_table_reader_for_compaction_inputs( - db_options.new_table_reader_for_compaction_inputs), num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), - allow_ingest_behind(db_options.allow_ingest_behind), - preserve_deletes(db_options.preserve_deletes), - listeners(db_options.listeners), - row_cache(db_options.row_cache), - max_subcompactions(db_options.max_subcompactions), memtable_insert_with_hint_prefix_extractor( - cf_options.memtable_insert_with_hint_prefix_extractor.get()), + cf_options.memtable_insert_with_hint_prefix_extractor), cf_paths(cf_options.cf_paths), compaction_thread_limiter(cf_options.compaction_thread_limiter), - sst_file_checksum_func(db_options.sst_file_checksum_func.get()) {} + sst_partitioner_factory(cf_options.sst_partitioner_factory) {} + +ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} + +ImmutableOptions::ImmutableOptions(const Options& options) + : ImmutableOptions(options, options) {} + 
+ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} // Multiple two operands. If they overflow, return op1. uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { @@ -109,6 +913,17 @@ } } +size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options) { + // We do not want to pin meta-blocks that almost certainly came from intra-L0 + // or a former larger `write_buffer_size` value to avoid surprising users with + // pinned memory usage. We use a factor of 1.5 to account for overhead + // introduced during flush in most cases. + if (port::kMaxSizet / 3 < cf_options.write_buffer_size / 2) { + return port::kMaxSizet; + } + return cf_options.write_buffer_size / 2 * 3; +} + void MutableCFOptions::RefreshDerivedOptions(int num_levels, CompactionStyle compaction_style) { max_file_size.resize(num_levels); @@ -147,9 +962,10 @@ ROCKS_LOG_INFO(log, " inplace_update_num_locks: %" ROCKSDB_PRIszt, inplace_update_num_locks); - ROCKS_LOG_INFO( - log, " prefix_extractor: %s", - prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); + ROCKS_LOG_INFO(log, " prefix_extractor: %s", + prefix_extractor == nullptr + ? 
"nullptr" + : prefix_extractor->GetId().c_str()); ROCKS_LOG_INFO(log, " disable_auto_compactions: %d", disable_auto_compactions); ROCKS_LOG_INFO(log, " soft_pending_compaction_bytes_limit: %" PRIu64, @@ -192,6 +1008,8 @@ result.c_str()); ROCKS_LOG_INFO(log, " max_sequential_skip_in_iterations: %" PRIu64, max_sequential_skip_in_iterations); + ROCKS_LOG_INFO(log, " check_flush_compaction_key_order: %d", + check_flush_compaction_key_order); ROCKS_LOG_INFO(log, " paranoid_file_checks: %d", paranoid_file_checks); ROCKS_LOG_INFO(log, " report_bg_io_stats: %d", @@ -217,15 +1035,60 @@ ROCKS_LOG_INFO( log, "compaction_options_universal.allow_trivial_move : %d", static_cast(compaction_options_universal.allow_trivial_move)); + ROCKS_LOG_INFO(log, "compaction_options_universal.incremental : %d", + static_cast(compaction_options_universal.incremental)); // FIFO Compaction Options ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64, compaction_options_fifo.max_table_files_size); ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction : %d", compaction_options_fifo.allow_compaction); + + // Blob file related options + ROCKS_LOG_INFO(log, " enable_blob_files: %s", + enable_blob_files ? "true" : "false"); + ROCKS_LOG_INFO(log, " min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_INFO(log, " blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_INFO(log, " blob_compression_type: %s", + CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_INFO(log, " enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? 
"true" : "false"); + ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); + ROCKS_LOG_INFO(log, " blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); + ROCKS_LOG_INFO(log, " blob_compaction_readahead_size: %" PRIu64, + blob_compaction_readahead_size); } MutableCFOptions::MutableCFOptions(const Options& options) : MutableCFOptions(ColumnFamilyOptions(options)) {} +#ifndef ROCKSDB_LITE +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* /*info_log*/, MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, cf_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; +} + +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string) { + assert(opt_string); + opt_string->clear(); + return OptionTypeInfo::SerializeType( + config_options, cf_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,23 +20,23 @@ // of DB. Raw pointers defined in this struct do not have ownership to the data // they point to. Options contains std::shared_ptr to these data. 
struct ImmutableCFOptions { - explicit ImmutableCFOptions(const Options& options); - - ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options); + public: + static const char* kName() { return "ImmutableCFOptions"; } + explicit ImmutableCFOptions(); + explicit ImmutableCFOptions(const ColumnFamilyOptions& cf_options); CompactionStyle compaction_style; CompactionPri compaction_pri; const Comparator* user_comparator; - InternalKeyComparator internal_comparator; + InternalKeyComparator internal_comparator; // Only in Immutable - MergeOperator* merge_operator; + std::shared_ptr merge_operator; const CompactionFilter* compaction_filter; - CompactionFilterFactory* compaction_filter_factory; + std::shared_ptr compaction_filter_factory; int min_write_buffer_number_to_merge; @@ -51,85 +51,58 @@ Slice delta_value, std::string* merged_value); - Logger* info_log; - - Statistics* statistics; - - RateLimiter* rate_limiter; - - InfoLogLevel info_log_level; - - Env* env; - - FileSystem* fs; - - // Allow the OS to mmap file for reading sst tables. Default: false - bool allow_mmap_reads; - - // Allow the OS to mmap file for writing. Default: false - bool allow_mmap_writes; + std::shared_ptr memtable_factory; - std::vector db_paths; - - MemTableRepFactory* memtable_factory; - - TableFactory* table_factory; + std::shared_ptr table_factory; Options::TablePropertiesCollectorFactories table_properties_collector_factories; - bool advise_random_on_open; - // This options is required by PlainTableReader. 
May need to move it // to PlainTableOptions just like bloom_bits_per_key uint32_t bloom_locality; bool purge_redundant_kvs_while_flush; - bool use_fsync; - std::vector compression_per_level; - CompressionType bottommost_compression; - - CompressionOptions bottommost_compression_opts; - - CompressionOptions compression_opts; - bool level_compaction_dynamic_level_bytes; - Options::AccessHint access_hint_on_compaction_start; - - bool new_table_reader_for_compaction_inputs; - int num_levels; bool optimize_filters_for_hits; bool force_consistency_checks; - bool allow_ingest_behind; + std::shared_ptr + memtable_insert_with_hint_prefix_extractor; - bool preserve_deletes; + std::vector cf_paths; - // A vector of EventListeners which callback functions will be called - // when specific RocksDB event happens. - std::vector> listeners; + std::shared_ptr compaction_thread_limiter; - std::shared_ptr row_cache; + std::shared_ptr sst_partitioner_factory; +}; - uint32_t max_subcompactions; +struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions { + explicit ImmutableOptions(); + explicit ImmutableOptions(const Options& options); - const SliceTransform* memtable_insert_with_hint_prefix_extractor; + ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); - std::vector cf_paths; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options); - std::shared_ptr compaction_thread_limiter; + ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options); - FileChecksumFunc* sst_file_checksum_func; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options); }; struct MutableCFOptions { + static const char* kName() { return "MutableCFOptions"; } explicit MutableCFOptions(const ColumnFamilyOptions& options) : write_buffer_size(options.write_buffer_size), max_write_buffer_number(options.max_write_buffer_number), @@ -161,12 +134,29 @@ 
options.max_bytes_for_level_multiplier_additional), compaction_options_fifo(options.compaction_options_fifo), compaction_options_universal(options.compaction_options_universal), + enable_blob_files(options.enable_blob_files), + min_blob_size(options.min_blob_size), + blob_file_size(options.blob_file_size), + blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold), + blob_compaction_readahead_size(options.blob_compaction_readahead_size), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), + check_flush_compaction_key_order( + options.check_flush_compaction_key_order), paranoid_file_checks(options.paranoid_file_checks), report_bg_io_stats(options.report_bg_io_stats), compression(options.compression), - sample_for_compression(options.sample_for_compression) { + bottommost_compression(options.bottommost_compression), + compression_opts(options.compression_opts), + bottommost_compression_opts(options.bottommost_compression_opts), + bottommost_temperature(options.bottommost_temperature), + sample_for_compression( + options.sample_for_compression) { // TODO: is 0 fine here? RefreshDerivedOptions(options.num_levels, options.compaction_style); } @@ -194,10 +184,21 @@ ttl(0), periodic_compaction_seconds(0), compaction_options_fifo(), + enable_blob_files(false), + min_blob_size(0), + blob_file_size(0), + blob_compression_type(kNoCompression), + enable_blob_garbage_collection(false), + blob_garbage_collection_age_cutoff(0.0), + blob_garbage_collection_force_threshold(0.0), + blob_compaction_readahead_size(0), max_sequential_skip_in_iterations(0), + check_flush_compaction_key_order(true), paranoid_file_checks(false), report_bg_io_stats(false), compression(Snappy_Supported() ? 
kSnappyCompression : kNoCompression), + bottommost_compression(kDisableCompressionOption), + bottommost_temperature(Temperature::kUnknown), sample_for_compression(0) {} explicit MutableCFOptions(const Options& options); @@ -248,11 +249,29 @@ CompactionOptionsFIFO compaction_options_fifo; CompactionOptionsUniversal compaction_options_universal; + // Blob file related options + bool enable_blob_files; + uint64_t min_blob_size; + uint64_t blob_file_size; + CompressionType blob_compression_type; + bool enable_blob_garbage_collection; + double blob_garbage_collection_age_cutoff; + double blob_garbage_collection_force_threshold; + uint64_t blob_compaction_readahead_size; + // Misc options uint64_t max_sequential_skip_in_iterations; + bool check_flush_compaction_key_order; bool paranoid_file_checks; bool report_bg_io_stats; CompressionType compression; + CompressionType bottommost_compression; + CompressionOptions compression_opts; + CompressionOptions bottommost_compression_opts; + // TODO this experimental option isn't made configurable + // through strings yet. + Temperature bottommost_temperature; + uint64_t sample_for_compression; // Derived options @@ -266,4 +285,20 @@ uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options, int level, CompactionStyle compaction_style, int base_level = 1, bool level_compaction_dynamic_level_bytes = false); + +// Get the max size of an L0 file for which we will pin its meta-blocks when +// `pin_l0_filter_and_index_blocks_in_cache` is set. 
+size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options); + +#ifndef ROCKSDB_LITE +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* info_log, MutableCFOptions* new_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,785 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/configurable.h" + +#include "logging/logging.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +void Configurable::RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* type_map) { + RegisteredOptions opts; + opts.name = name; +#ifndef ROCKSDB_LITE + opts.type_map = type_map; +#else + (void)type_map; +#endif // ROCKSDB_LITE + opts.opt_ptr = opt_ptr; + options_.emplace_back(opts); +} + +//************************************************************************* +// +// Methods for Initializing and Validating Configurable Objects +// +//************************************************************************* + +Status Configurable::PrepareOptions(const ConfigOptions& opts) { + // We ignore the invoke_prepare_options here intentionally, + // as if you are here, you must have called PrepareOptions explicitly. 
+ Status status = Status::OK(); +#ifndef ROCKSDB_LITE + for (auto opt_iter : options_) { + if (opt_iter.type_map != nullptr) { + for (auto map_iter : *(opt_iter.type_map)) { + auto& opt_info = map_iter.second; + if (!opt_info.IsDeprecated() && !opt_info.IsAlias() && + opt_info.IsConfigurable()) { + if (!opt_info.IsEnabled(OptionTypeFlags::kDontPrepare)) { + Configurable* config = + opt_info.AsRawPointer(opt_iter.opt_ptr); + if (config != nullptr) { + status = config->PrepareOptions(opts); + } else if (!opt_info.CanBeNull()) { + status = Status::NotFound("Missing configurable object", + map_iter.first); + } + if (!status.ok()) { + return status; + } + } + } + } + } + } +#else + (void)opts; +#endif // ROCKSDB_LITE + return status; +} + +Status Configurable::ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const { + Status status; +#ifndef ROCKSDB_LITE + for (auto opt_iter : options_) { + if (opt_iter.type_map != nullptr) { + for (auto map_iter : *(opt_iter.type_map)) { + auto& opt_info = map_iter.second; + if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) { + if (opt_info.IsConfigurable()) { + const Configurable* config = + opt_info.AsRawPointer(opt_iter.opt_ptr); + if (config != nullptr) { + status = config->ValidateOptions(db_opts, cf_opts); + } else if (!opt_info.CanBeNull()) { + status = Status::NotFound("Missing configurable object", + map_iter.first); + } + if (!status.ok()) { + return status; + } + } + } + } + } + } +#else + (void)db_opts; + (void)cf_opts; +#endif // ROCKSDB_LITE + return status; +} + +/*********************************************************************************/ +/* */ +/* Methods for Retrieving Options from Configurables */ +/* */ +/*********************************************************************************/ + +const void* Configurable::GetOptionsPtr(const std::string& name) const { + for (auto o : options_) { + if (o.name == name) { + return o.opt_ptr; + } + } + return nullptr; +} + +std::string 
Configurable::GetOptionName(const std::string& opt_name) const { + return opt_name; +} + +#ifndef ROCKSDB_LITE +const OptionTypeInfo* ConfigurableHelper::FindOption( + const std::vector& options, + const std::string& short_name, std::string* opt_name, void** opt_ptr) { + for (auto iter : options) { + if (iter.type_map != nullptr) { + const auto opt_info = + OptionTypeInfo::Find(short_name, *(iter.type_map), opt_name); + if (opt_info != nullptr) { + *opt_ptr = iter.opt_ptr; + return opt_info; + } + } + } + return nullptr; +} +#endif // ROCKSDB_LITE + +//************************************************************************* +// +// Methods for Configuring Options from Strings/Name-Value Pairs/Maps +// +//************************************************************************* + +Status Configurable::ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opts_map) { + Status s = ConfigureFromMap(config_options, opts_map, nullptr); + return s; +} + +Status Configurable::ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + return ConfigureOptions(config_options, opts_map, unused); +} + +Status Configurable::ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + std::string curr_opts; + Status s; + if (!opts_map.empty()) { + // There are options in the map. + // Save the current configuration in curr_opts and then configure the + // options, but do not prepare them now. We will do all the prepare when + // the configuration is complete. 
+ ConfigOptions copy = config_options; + copy.invoke_prepare_options = false; +#ifndef ROCKSDB_LITE + if (!config_options.ignore_unknown_options) { + // If we are not ignoring unused, get the defaults in case we need to + // reset + copy.depth = ConfigOptions::kDepthDetailed; + copy.delimiter = "; "; + GetOptionString(copy, &curr_opts).PermitUncheckedError(); + } +#endif // ROCKSDB_LITE + + s = ConfigurableHelper::ConfigureOptions(copy, *this, opts_map, unused); + } + if (config_options.invoke_prepare_options && s.ok()) { + s = PrepareOptions(config_options); + } +#ifndef ROCKSDB_LITE + if (!s.ok() && !curr_opts.empty()) { + ConfigOptions reset = config_options; + reset.ignore_unknown_options = true; + reset.invoke_prepare_options = true; + reset.ignore_unsupported_options = true; + // There are some options to reset from this current error + ConfigureFromString(reset, curr_opts).PermitUncheckedError(); + } +#endif // ROCKSDB_LITE + return s; +} + +Status Configurable::ParseStringOptions(const ConfigOptions& /*config_options*/, + const std::string& /*opts_str*/) { + return Status::OK(); +} + +Status Configurable::ConfigureFromString(const ConfigOptions& config_options, + const std::string& opts_str) { + Status s; + if (!opts_str.empty()) { +#ifndef ROCKSDB_LITE + if (opts_str.find(';') != std::string::npos || + opts_str.find('=') != std::string::npos) { + std::unordered_map opt_map; + s = StringToMap(opts_str, &opt_map); + if (s.ok()) { + s = ConfigureFromMap(config_options, opt_map, nullptr); + } + } else { +#endif // ROCKSDB_LITE + s = ParseStringOptions(config_options, opts_str); + if (s.ok() && config_options.invoke_prepare_options) { + s = PrepareOptions(config_options); + } +#ifndef ROCKSDB_LITE + } +#endif // ROCKSDB_LITE + } else if (config_options.invoke_prepare_options) { + s = PrepareOptions(config_options); + } else { + s = Status::OK(); + } + return s; +} + +#ifndef ROCKSDB_LITE +/** + * Sets the value of the named property to the input value, 
returning OK on + * succcess. + */ +Status Configurable::ConfigureOption(const ConfigOptions& config_options, + const std::string& name, + const std::string& value) { + return ConfigurableHelper::ConfigureSingleOption(config_options, *this, name, + value); +} + +/** + * Looks for the named option amongst the options for this type and sets + * the value for it to be the input value. + * If the name was found, found_option will be set to true and the resulting + * status should be returned. + */ + +Status Configurable::ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, void* opt_ptr) { + if (opt_info.IsMutable()) { + if (config_options.mutable_options_only) { + // This option is mutable. Treat all of its children as mutable as well + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + return opt_info.Parse(copy, opt_name, opt_value, opt_ptr); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } + } else if (config_options.mutable_options_only) { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } +} + +#endif // ROCKSDB_LITE + +Status ConfigurableHelper::ConfigureOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + std::unordered_map remaining = opts_map; + Status s = Status::OK(); + if (!opts_map.empty()) { +#ifndef ROCKSDB_LITE + for (const auto& iter : configurable.options_) { + if (iter.type_map != nullptr) { + s = ConfigureSomeOptions(config_options, configurable, *(iter.type_map), + &remaining, iter.opt_ptr); + if (remaining.empty()) { // Are there more options left? 
+ break; + } else if (!s.ok()) { + break; + } + } + } +#else + (void)configurable; + if (!config_options.ignore_unknown_options) { + s = Status::NotSupported("ConfigureFromMap not supported in LITE mode"); + } +#endif // ROCKSDB_LITE + } + if (unused != nullptr && !remaining.empty()) { + unused->insert(remaining.begin(), remaining.end()); + } + if (config_options.ignore_unknown_options) { + s = Status::OK(); + } else if (s.ok() && unused == nullptr && !remaining.empty()) { + s = Status::NotFound("Could not find option: ", remaining.begin()->first); + } + return s; +} + +#ifndef ROCKSDB_LITE +/** + * Updates the object with the named-value property values, returning OK on + * succcess. Any properties that were found are removed from the options list; + * upon return only options that were not found in this opt_map remain. + + * Returns: + * - OK if ignore_unknown_options is set + * - InvalidArgument, if any option was invalid + * - NotSupported, if any option is unsupported and ignore_unsupported_options + is OFF + * - OK, if no option was invalid or not supported (or ignored) + */ +Status ConfigurableHelper::ConfigureSomeOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& type_map, + std::unordered_map* options, void* opt_ptr) { + Status result = Status::OK(); // The last non-OK result (if any) + Status notsup = Status::OK(); // The last NotSupported result (if any) + std::string elem_name; + int found = 1; + std::unordered_set unsupported; + // While there are unused properties and we processed at least one, + // go through the remaining unused properties and attempt to configure them. 
+ while (found > 0 && !options->empty()) { + found = 0; + notsup = Status::OK(); + for (auto it = options->begin(); it != options->end();) { + const std::string& opt_name = configurable.GetOptionName(it->first); + const std::string& opt_value = it->second; + const auto opt_info = + OptionTypeInfo::Find(opt_name, type_map, &elem_name); + if (opt_info == nullptr) { // Did not find the option. Skip it + ++it; + } else { + Status s = ConfigureOption(config_options, configurable, *opt_info, + opt_name, elem_name, opt_value, opt_ptr); + if (s.IsNotFound()) { + ++it; + } else if (s.IsNotSupported()) { + notsup = s; + unsupported.insert(it->first); + ++it; // Skip it for now + } else { + found++; + it = options->erase(it); + if (!s.ok()) { + result = s; + } + } + } + } // End for all remaining options + } // End while found one or options remain + + // Now that we have been through the list, remove any unsupported + for (auto u : unsupported) { + auto it = options->find(u); + if (it != options->end()) { + options->erase(it); + } + } + if (config_options.ignore_unknown_options) { + if (!result.ok()) result.PermitUncheckedError(); + if (!notsup.ok()) notsup.PermitUncheckedError(); + return Status::OK(); + } else if (!result.ok()) { + if (!notsup.ok()) notsup.PermitUncheckedError(); + return result; + } else if (config_options.ignore_unsupported_options) { + if (!notsup.ok()) notsup.PermitUncheckedError(); + return Status::OK(); + } else { + return notsup; + } +} + +Status ConfigurableHelper::ConfigureSingleOption( + const ConfigOptions& config_options, Configurable& configurable, + const std::string& name, const std::string& value) { + const std::string& opt_name = configurable.GetOptionName(name); + std::string elem_name; + void* opt_ptr = nullptr; + const auto opt_info = + FindOption(configurable.options_, opt_name, &elem_name, &opt_ptr); + if (opt_info == nullptr) { + return Status::NotFound("Could not find option: ", name); + } else { + return 
ConfigureOption(config_options, configurable, *opt_info, opt_name, + elem_name, value, opt_ptr); + } +} +Status ConfigurableHelper::ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr) { + Customizable* custom = opt_info.AsRawPointer(opt_ptr); + ConfigOptions copy = config_options; + if (opt_info.IsMutable()) { + // This option is mutable. Pass that property on to any subsequent calls + copy.mutable_options_only = false; + } + + if (opt_info.IsMutable() || !config_options.mutable_options_only) { + // Either the option is mutable, or we are processing all of the options + if (opt_name == name || name == OptionTypeInfo::kIdPropName() || + EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix())) { + return configurable.ParseOption(copy, opt_info, name, value, opt_ptr); + } else if (value.empty()) { + return Status::OK(); + } else if (custom == nullptr || !StartsWith(name, custom->GetId() + ".")) { + return configurable.ParseOption(copy, opt_info, name, value, opt_ptr); + } else if (value.find("=") != std::string::npos) { + return custom->ConfigureFromString(copy, value); + } else { + return custom->ConfigureOption(copy, name, value); + } + } else { + // We are processing immutable options, which means that we cannot change + // the Customizable object itself, but could change its mutable properties. + // Check to make sure that nothing is trying to change the Customizable + if (custom == nullptr) { + // We do not have a Customizable to configure. 
This is OK if the + // value is empty (nothing being configured) but an error otherwise + if (value.empty()) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix()) || + name == OptionTypeInfo::kIdPropName()) { + // We have a property of the form "id=value" or "table.id=value" + // This is OK if we ID/value matches the current customizable object + if (custom->GetId() == value) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (opt_name == name) { + // The properties are of one of forms: + // name = { id = id; prop1 = value1; ... } + // name = { prop1=value1; prop2=value2; ... } + // name = ID + // Convert the value to a map and extract the ID + // If the ID does not match that of the current customizable, return an + // error. Otherwise, update the current customizable via the properties + // map + std::unordered_map props; + std::string id; + Status s = + Configurable::GetOptionsMap(value, custom->GetId(), &id, &props); + if (!s.ok()) { + return s; + } else if (custom->GetId() != id) { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } else if (props.empty()) { + return Status::OK(); + } else { + return custom->ConfigureFromMap(copy, props); + } + } else { + // Attempting to configure one of the properties of the customizable + // Let it through + return custom->ConfigureOption(copy, name, value); + } + } +} + +Status ConfigurableHelper::ConfigureOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr) { + if (opt_info.IsCustomizable()) { + return ConfigureCustomizableOption(config_options, configurable, opt_info, + opt_name, name, value, opt_ptr); + } else if (opt_name == name) { + return 
configurable.ParseOption(config_options, opt_info, opt_name, value, + opt_ptr); + } else if (opt_info.IsStruct() || opt_info.IsConfigurable()) { + return configurable.ParseOption(config_options, opt_info, name, value, + opt_ptr); + } else { + return Status::NotFound("Could not find option: ", name); + } +} +#endif // ROCKSDB_LITE + +//******************************************************************************* +// +// Methods for Converting Options into strings +// +//******************************************************************************* + +Status Configurable::GetOptionString(const ConfigOptions& config_options, + std::string* result) const { + assert(result); + result->clear(); +#ifndef ROCKSDB_LITE + return ConfigurableHelper::SerializeOptions(config_options, *this, "", + result); +#else + (void)config_options; + return Status::NotSupported("GetOptionString not supported in LITE mode"); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +std::string Configurable::ToString(const ConfigOptions& config_options, + const std::string& prefix) const { + std::string result = SerializeOptions(config_options, prefix); + if (result.empty() || result.find('=') == std::string::npos) { + return result; + } else { + return "{" + result + "}"; + } +} + +std::string Configurable::SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const { + std::string result; + Status s = ConfigurableHelper::SerializeOptions(config_options, *this, header, + &result); + assert(s.ok()); + return result; +} + +Status Configurable::GetOption(const ConfigOptions& config_options, + const std::string& name, + std::string* value) const { + return ConfigurableHelper::GetOption(config_options, *this, + GetOptionName(name), value); +} + +Status ConfigurableHelper::GetOption(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& short_name, + std::string* value) { + // Look for option directly + assert(value); + 
value->clear(); + + std::string opt_name; + void* opt_ptr = nullptr; + const auto opt_info = + FindOption(configurable.options_, short_name, &opt_name, &opt_ptr); + if (opt_info != nullptr) { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + if (short_name == opt_name) { + return opt_info->Serialize(embedded, opt_name, opt_ptr, value); + } else if (opt_info->IsStruct()) { + return opt_info->Serialize(embedded, opt_name, opt_ptr, value); + } else if (opt_info->IsConfigurable()) { + auto const* config = opt_info->AsRawPointer(opt_ptr); + if (config != nullptr) { + return config->GetOption(embedded, opt_name, value); + } + } + } + return Status::NotFound("Cannot find option: ", short_name); +} + +Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& prefix, + std::string* result) { + assert(result); + for (auto const& opt_iter : configurable.options_) { + if (opt_iter.type_map != nullptr) { + for (const auto& map_iter : *(opt_iter.type_map)) { + const auto& opt_name = map_iter.first; + const auto& opt_info = map_iter.second; + if (opt_info.ShouldSerialize()) { + std::string value; + Status s; + if (!config_options.mutable_options_only) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + s = opt_info.Serialize(copy, prefix + opt_name, opt_iter.opt_ptr, + &value); + } else if (opt_info.IsConfigurable()) { + // If it is a Configurable and we are either printing all of the + // details or not printing only the name, this option should be + // included in the list + if (config_options.IsDetailed() || + !opt_info.IsEnabled(OptionTypeFlags::kStringNameOnly)) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } + } + if (!s.ok()) { + return s; + } else if (!value.empty()) { + // 
= + result->append(prefix + opt_name + "=" + value + + config_options.delimiter); + } + } + } + } + } + return Status::OK(); +} +#endif // ROCKSDB_LITE + +//******************************************************************************** +// +// Methods for listing the options from Configurables +// +//******************************************************************************** +#ifndef ROCKSDB_LITE +Status Configurable::GetOptionNames( + const ConfigOptions& config_options, + std::unordered_set* result) const { + assert(result); + return ConfigurableHelper::ListOptions(config_options, *this, "", result); +} + +Status ConfigurableHelper::ListOptions( + const ConfigOptions& config_options, const Configurable& configurable, + const std::string& prefix, std::unordered_set* result) { + Status status; + for (auto const& opt_iter : configurable.options_) { + if (opt_iter.type_map != nullptr) { + for (const auto& map_iter : *(opt_iter.type_map)) { + const auto& opt_name = map_iter.first; + const auto& opt_info = map_iter.second; + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. 
+ if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) { + if (!config_options.mutable_options_only) { + result->emplace(prefix + opt_name); + } else if (opt_info.IsMutable()) { + result->emplace(prefix + opt_name); + } + } + } + } + } + return status; +} +#endif // ROCKSDB_LITE + +//******************************************************************************* +// +// Methods for Comparing Configurables +// +//******************************************************************************* + +bool Configurable::AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* name) const { + assert(name); + name->clear(); + if (this == other || config_options.IsCheckDisabled()) { + return true; + } else if (other != nullptr) { +#ifndef ROCKSDB_LITE + return ConfigurableHelper::AreEquivalent(config_options, *this, *other, + name); +#else + return true; +#endif // ROCKSDB_LITE + } else { + return false; + } +} + +#ifndef ROCKSDB_LITE +bool Configurable::OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const { + if (opt_info.AreEqual(config_options, opt_name, this_ptr, that_ptr, + mismatch)) { + return true; + } else if (opt_info.AreEqualByName(config_options, opt_name, this_ptr, + that_ptr)) { + *mismatch = ""; + return true; + } else { + return false; + } +} + +bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options, + const Configurable& this_one, + const Configurable& that_one, + std::string* mismatch) { + assert(mismatch != nullptr); + for (auto const& o : this_one.options_) { + const auto this_offset = this_one.GetOptionsPtr(o.name); + const auto that_offset = that_one.GetOptionsPtr(o.name); + if (this_offset != that_offset) { + if (this_offset == nullptr || that_offset == nullptr) { + return false; + } else if (o.type_map != nullptr) { + for (const auto& 
map_iter : *(o.type_map)) { + const auto& opt_info = map_iter.second; + if (config_options.IsCheckEnabled(opt_info.GetSanityLevel())) { + if (!config_options.mutable_options_only) { + if (!this_one.OptionsAreEqual(config_options, opt_info, + map_iter.first, this_offset, + that_offset, mismatch)) { + return false; + } + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + if (!this_one.OptionsAreEqual(copy, opt_info, map_iter.first, + this_offset, that_offset, + mismatch)) { + return false; + } + } + } + } + } + } + } + return true; +} +#endif // ROCKSDB_LITE + +Status Configurable::GetOptionsMap( + const std::string& value, const std::string& default_id, std::string* id, + std::unordered_map* props) { + assert(id); + assert(props); + Status status; + if (value.empty() || value == kNullptrString) { + *id = default_id; + } else if (value.find('=') == std::string::npos) { + *id = value; +#ifndef ROCKSDB_LITE + } else { + status = StringToMap(value, props); + if (!status.ok()) { // There was an error creating the map. 
+ *id = value; // Treat the value as id + props->clear(); // Clear the properties + status = Status::OK(); // and ignore the error + } else { + auto iter = props->find(OptionTypeInfo::kIdPropName()); + if (iter != props->end()) { + *id = iter->second; + props->erase(iter); + if (*id == kNullptrString) { + id->clear(); + } + } else if (!default_id.empty()) { + *id = default_id; + } else { // No id property and no default + *id = value; // Treat the value as id + props->clear(); // Clear the properties + } + } +#else + } else { + *id = value; + props->clear(); +#endif + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_helper.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/configurable.h" +#include "rocksdb/convenience.h" + +namespace ROCKSDB_NAMESPACE { +// Helper class defining static methods for supporting the Configurable +// class. The purpose of this class is to keep the Configurable class +// as tight as possible and provide methods for doing the actual work +// of configuring the objects. +class ConfigurableHelper { + public: + // Configures the input Configurable object based on the parameters. + // On successful completion, the Configurable is updated with the settings + // from the opt_map. 
+ // + // The acceptable values of the name/value pairs are documented with the + // specific class/instance. + // + // @param config_options Controls how the arguments are processed. + // @param opt_map Name/value pairs of the options to update + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all values in the map were successfully updated + // @return NotFound If any of the names in the opt_map were not valid + // for this object. If unused is specified, it will contain the + // collection of NotFound entries + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + static Status ConfigureOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& options, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Internal method to configure a set of options for this object. + // Classes may override this value to change its behavior. + // @param config_options Controls how the options are being configured + // @param type_name The name that was registered for this set of options + // @param type_map The map of options for this name + // @param opt_ptr Pointer to the object being configured for this option set. + // @param options The option name/values being updated. On return, any + // option that was found is removed from the list. + // @return OK If all of the options were successfully updated. + // @return InvalidArgument If an option was found but the value could not + // be updated. 
+ // @return NotFound If an option name was not found in type_mape + // @return NotSupported If the option was found but no rule for converting + // the value could be found. + static Status ConfigureSomeOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& type_map, + std::unordered_map* options, void* opt_ptr); + + // Configures a single option in the input Configurable. + // This method will look through the set of option names for this + // Configurable searching for one with the input name. If such an option + // is found, it will be configured via the input value. + // + // @param config_options Controls how the option is being configured + // @param configurable The object to configure + // @param name For options with sub-options (like Structs or + // Configurables), + // this value may be the name of the sub-field of the option being + // updated. For example, if the option is + // "compaction_options_fifo.allow_compaction", then field name would be + // "allow_compaction". For most options, field_name and opt_name will be + // equivalent. + // @param value The new value for this option. + // @param See ConfigureOptions for the possible return values + static Status ConfigureSingleOption(const ConfigOptions& config_options, + Configurable& configurable, + const std::string& name, + const std::string& value); + + // Configures the option referenced by opt_info for this configurable + // This method configures the option based on opt_info for the input + // configurable. + // @param config_options Controls how the option is being configured + // @param configurable The object to configure + // @param opt_name The full option name + // @param name For options with sub-options (like Structs or + // Configurables), + // this value may be the name of the sub-field of the option being + // updated. 
For example, if the option is + // "compaction_options_fifo.allow_compaction", then field name would be + // "allow_compaction". For most options, field_name and opt_name will be + // equivalent. + // @param value The new value for this option. + // @param See ConfigureOptions for the possible return values + static Status ConfigureOption(const ConfigOptions& config_options, + Configurable& configurable, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& name, + const std::string& value, void* opt_ptr); + + // Returns the value of the option associated with the input name + // This method is the functional inverse of ConfigureOption + // @param config_options Controls how the value is returned + // @param configurable The object from which to get the option. + // @param name The name of the option to return a value for. + // @param value The returned value associated with the named option. + // Note that value will be only the serialized version + // of the option and not "name=value" + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @param InvalidArgument If the name is valid for this object but + // its value cannot be serialized. + static Status GetOption(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& name, std::string* value); + + // Serializes the input Configurable into the output result. + // This is the inverse of ConfigureOptions + // @param config_options Controls how serialization happens. + // @param configurable The object to serialize + // @param prefix A prefix to add to the each option as it is serialized. + // @param result The string representation of the configurable. + // @return OK If the options for this object wer successfully serialized. + // @return InvalidArgument If one or more of the options could not be + // serialized. 
+ static Status SerializeOptions(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& prefix, + std::string* result); + + // Internal method to list the option names for this object. + // Classes may override this value to change its behavior. + // @see ListOptions for more details + static Status ListOptions(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& prefix, + std::unordered_set* result); + + // Checks to see if the two configurables are equivalent to one other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param this_one The object to compare to. + // @param that_one The other object being compared. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. + static bool AreEquivalent(const ConfigOptions& config_options, + const Configurable& this_one, + const Configurable& that_one, + std::string* mismatch); + + private: + // Looks for the option specified by name in the RegisteredOptions. + // This method traverses the types in the input options vector. If an entry + // matching name is found, that entry, opt_name, and pointer are returned. + // @param options The vector of options to search through + // @param name The name of the option to search for in the OptionType map + // @param opt_name If the name was found, this value is set to the option name + // associated with the input name/type. 
+ // @param opt_ptr If the name was found, this value is set to the option + // pointer + // in the RegisteredOptions vector associated with this entry + // @return A pointer to the OptionTypeInfo from the options if found, + // nullptr if the name was not found in the input options + static const OptionTypeInfo* FindOption( + const std::vector& options, + const std::string& name, std::string* opt_name, void** opt_ptr); + + static Status ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr); +#endif // ROCKSDB_LITE +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,880 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "options/configurable_test.h" + +#include +#include +#include +#include + +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "rocksdb/configurable.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { +namespace test { +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + char buffer[1000]; + vsnprintf(buffer, sizeof(buffer), format, ap); + string_.append(buffer); + } + const std::string& str() const { return string_; } + void clear() { string_.clear(); } + + private: + std::string string_; +}; +static std::unordered_map struct_option_info = { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kMutable)}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map imm_struct_option_info = + { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +class SimpleConfigurable : public TestConfigurable { + public: + static SimpleConfigurable* Create( + const std::string& name = "simple", + int mode = TestConfigMode::kDefaultMode, + const std::unordered_map* map = + &simple_option_info) { + return new SimpleConfigurable(name, mode, map); + } + + SimpleConfigurable(const std::string& name, int mode, + const std::unordered_map* + map = &simple_option_info) + : TestConfigurable(name, mode, map) { + if ((mode & TestConfigMode::kUniqueMode) != 0) { + unique_.reset(SimpleConfigurable::Create("Unique" + name_)); + 
RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); + } + if ((mode & TestConfigMode::kSharedMode) != 0) { + shared_.reset(SimpleConfigurable::Create("Shared" + name_)); + RegisterOptions(name_ + "Shared", &shared_, &shared_option_info); + } + if ((mode & TestConfigMode::kRawPtrMode) != 0) { + pointer_ = SimpleConfigurable::Create("Pointer" + name_); + RegisterOptions(name_ + "Pointer", &pointer_, &pointer_option_info); + } + } + +}; // End class SimpleConfigurable + +using ConfigTestFactoryFunc = std::function; + +class ConfigurableTest : public testing::Test { + public: + ConfigurableTest() { config_options_.invoke_prepare_options = false; } + + ConfigOptions config_options_; +}; + +TEST_F(ConfigurableTest, GetOptionsPtrTest) { + std::string opt_str; + std::unique_ptr configurable(SimpleConfigurable::Create()); + ASSERT_NE(configurable->GetOptions("simple"), nullptr); + ASSERT_EQ(configurable->GetOptions("bad-opt"), nullptr); +} + +TEST_F(ConfigurableTest, ConfigureFromMapTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + auto* opts = configurable->GetOptions("simple"); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, {})); + ASSERT_NE(opts, nullptr); +#ifndef ROCKSDB_LITE + std::unordered_map options_map = { + {"int", "1"}, {"bool", "true"}, {"string", "string"}}; + ASSERT_OK(configurable->ConfigureFromMap(config_options_, options_map)); + ASSERT_EQ(opts->i, 1); + ASSERT_EQ(opts->b, true); + ASSERT_EQ(opts->s, "string"); +#endif +} + +TEST_F(ConfigurableTest, ConfigureFromStringTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + auto* opts = configurable->GetOptions("simple"); + ASSERT_OK(configurable->ConfigureFromString(config_options_, "")); + ASSERT_NE(opts, nullptr); +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE + ASSERT_OK(configurable->ConfigureFromString(config_options_, + "int=1;bool=true;string=s")); + ASSERT_EQ(opts->i, 1); + ASSERT_EQ(opts->b, true); + 
ASSERT_EQ(opts->s, "s"); +#endif +} + +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE +TEST_F(ConfigurableTest, ConfigureIgnoreTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + std::unordered_map options_map = {{"unused", "u"}}; + ConfigOptions ignore = config_options_; + ignore.ignore_unknown_options = true; + ASSERT_NOK(configurable->ConfigureFromMap(config_options_, options_map)); + ASSERT_OK(configurable->ConfigureFromMap(ignore, options_map)); + ASSERT_NOK(configurable->ConfigureFromString(config_options_, "unused=u")); + ASSERT_OK(configurable->ConfigureFromString(ignore, "unused=u")); +} + +TEST_F(ConfigurableTest, ConfigureNestedOptionsTest) { + std::unique_ptr base, copy; + std::string opt_str; + std::string mismatch; + + base.reset(SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode)); + copy.reset(SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode)); + ASSERT_OK(base->ConfigureFromString(config_options_, + "shared={int=10; string=10};" + "unique={int=20; string=20};" + "pointer={int=30; string=30};")); + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(ConfigurableTest, GetOptionsTest) { + std::unique_ptr simple; + + simple.reset( + SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode)); + int i = 11; + for (auto opt : {"", "shared.", "unique.", "pointer."}) { + std::string value; + std::string expected = ToString(i); + std::string opt_name = opt; + ASSERT_OK( + simple->ConfigureOption(config_options_, opt_name + "int", expected)); + ASSERT_OK(simple->GetOption(config_options_, opt_name + "int", &value)); + ASSERT_EQ(expected, value); + ASSERT_OK(simple->ConfigureOption(config_options_, opt_name + "string", + expected)); + ASSERT_OK(simple->GetOption(config_options_, opt_name + "string", &value)); + 
ASSERT_EQ(expected, value); + + ASSERT_NOK( + simple->ConfigureOption(config_options_, opt_name + "bad", expected)); + ASSERT_NOK(simple->GetOption(config_options_, "bad option", &value)); + ASSERT_TRUE(value.empty()); + i += 11; + } +} + +TEST_F(ConfigurableTest, ConfigureBadOptionsTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + auto* opts = configurable->GetOptions("simple"); + ASSERT_NE(opts, nullptr); + ASSERT_OK(configurable->ConfigureOption(config_options_, "int", "42")); + ASSERT_EQ(opts->i, 42); + ASSERT_NOK(configurable->ConfigureOption(config_options_, "int", "fred")); + ASSERT_NOK(configurable->ConfigureOption(config_options_, "bool", "fred")); + ASSERT_NOK( + configurable->ConfigureFromString(config_options_, "int=33;unused=u")); + ASSERT_EQ(opts->i, 42); +} + +TEST_F(ConfigurableTest, InvalidOptionTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + std::unordered_map options_map = { + {"bad-option", "bad"}}; + ASSERT_NOK(configurable->ConfigureFromMap(config_options_, options_map)); + ASSERT_NOK( + configurable->ConfigureFromString(config_options_, "bad-option=bad")); + ASSERT_NOK( + configurable->ConfigureOption(config_options_, "bad-option", "bad")); +} + +static std::unordered_map validated_option_info = { +#ifndef ROCKSDB_LITE + {"validated", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map prepared_option_info = { +#ifndef ROCKSDB_LITE + {"prepared", + {0, OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map + dont_prepare_option_info = { +#ifndef ROCKSDB_LITE + {"unique", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + (OptionTypeFlags::kUnique | OptionTypeFlags::kDontPrepare)}}, + +#endif // ROCKSDB_LITE +}; + +class ValidatedConfigurable : public SimpleConfigurable { + public: + ValidatedConfigurable(const 
std::string& name, unsigned char mode, + bool dont_prepare = false) + : SimpleConfigurable(name, TestConfigMode::kDefaultMode), + validated(false), + prepared(0) { + RegisterOptions("Validated", &validated, &validated_option_info); + RegisterOptions("Prepared", &prepared, &prepared_option_info); + if ((mode & TestConfigMode::kUniqueMode) != 0) { + unique_.reset(new ValidatedConfigurable( + "Unique" + name_, TestConfigMode::kDefaultMode, false)); + if (dont_prepare) { + RegisterOptions(name_ + "Unique", &unique_, &dont_prepare_option_info); + } else { + RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); + } + } + } + + Status PrepareOptions(const ConfigOptions& config_options) override { + if (++prepared <= 0) { + return Status::InvalidArgument("Cannot prepare option"); + } else { + return SimpleConfigurable::PrepareOptions(config_options); + } + } + + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (!validated) { + return Status::InvalidArgument("Not Validated"); + } else { + return SimpleConfigurable::ValidateOptions(db_opts, cf_opts); + } + } + + private: + bool validated; + int prepared; +}; + +TEST_F(ConfigurableTest, ValidateOptionsTest) { + std::unique_ptr configurable( + new ValidatedConfigurable("validated", TestConfigMode::kDefaultMode)); + ColumnFamilyOptions cf_opts; + DBOptions db_opts; + ASSERT_OK( + configurable->ConfigureOption(config_options_, "validated", "false")); + ASSERT_NOK(configurable->ValidateOptions(db_opts, cf_opts)); + ASSERT_OK( + configurable->ConfigureOption(config_options_, "validated", "true")); + ASSERT_OK(configurable->ValidateOptions(db_opts, cf_opts)); +} + +TEST_F(ConfigurableTest, PrepareOptionsTest) { + std::unique_ptr c( + new ValidatedConfigurable("Simple", TestConfigMode::kUniqueMode, false)); + auto cp = c->GetOptions("Prepared"); + auto u = c->GetOptions>("SimpleUnique"); + auto up = u->get()->GetOptions("Prepared"); + 
config_options_.invoke_prepare_options = false; + + ASSERT_NE(cp, nullptr); + ASSERT_NE(up, nullptr); + ASSERT_EQ(*cp, 0); + ASSERT_EQ(*up, 0); + ASSERT_OK(c->ConfigureFromMap(config_options_, {})); + ASSERT_EQ(*cp, 0); + ASSERT_EQ(*up, 0); + config_options_.invoke_prepare_options = true; + ASSERT_OK(c->ConfigureFromMap(config_options_, {})); + ASSERT_EQ(*cp, 1); + ASSERT_EQ(*up, 1); + ASSERT_OK(c->ConfigureFromString(config_options_, "prepared=0")); + ASSERT_EQ(*up, 2); + ASSERT_EQ(*cp, 1); + + ASSERT_NOK(c->ConfigureFromString(config_options_, "prepared=-2")); + + c.reset( + new ValidatedConfigurable("Simple", TestConfigMode::kUniqueMode, true)); + cp = c->GetOptions("Prepared"); + u = c->GetOptions>("SimpleUnique"); + up = u->get()->GetOptions("Prepared"); + + ASSERT_OK(c->ConfigureFromString(config_options_, "prepared=0")); + ASSERT_EQ(*cp, 1); + ASSERT_EQ(*up, 0); +} + +TEST_F(ConfigurableTest, CopyObjectTest) { + class CopyConfigurable : public Configurable { + public: + CopyConfigurable() : prepared_(0), validated_(0) {} + Status PrepareOptions(const ConfigOptions& options) override { + prepared_++; + return Configurable::PrepareOptions(options); + } + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + validated_++; + return Configurable::ValidateOptions(db_opts, cf_opts); + } + int prepared_; + mutable int validated_; + }; + + CopyConfigurable c1; + ConfigOptions config_options; + Options options; + + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c1.prepared_, 1); + ASSERT_EQ(c1.validated_, 1); + CopyConfigurable c2 = c1; + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c2.prepared_, 1); + ASSERT_EQ(c2.validated_, 1); + ASSERT_EQ(c1.prepared_, 2); + ASSERT_EQ(c1.validated_, 2); +} + +TEST_F(ConfigurableTest, MutableOptionsTest) { + static std::unordered_map imm_option_info = { 
+#ifndef ROCKSDB_LITE + {"imm", OptionTypeInfo::Struct("imm", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE + }; + + class MutableConfigurable : public SimpleConfigurable { + public: + MutableConfigurable() + : SimpleConfigurable("mutable", TestConfigMode::kDefaultMode | + TestConfigMode::kUniqueMode | + TestConfigMode::kSharedMode) { + RegisterOptions("struct", &options_, &struct_option_info); + RegisterOptions("imm", &options_, &imm_option_info); + } + }; + MutableConfigurable mc; + ConfigOptions options = config_options_; + + ASSERT_OK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "42")); + auto* opts = mc.GetOptions("mutable"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 42); + ASSERT_EQ(opts->b, true); + ASSERT_OK(mc.ConfigureOption(options, "struct", "{bool=false;}")); + ASSERT_OK(mc.ConfigureOption(options, "imm", "{int=55;}")); + + options.mutable_options_only = true; + + // Now only mutable options should be settable. 
+ ASSERT_NOK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "24")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + ASSERT_NOK(mc.ConfigureFromString(options, "bool=false;int=33;")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an immutable struct fails + ASSERT_NOK(mc.ConfigureOption(options, "imm", "{int=55;}")); + ASSERT_NOK(mc.ConfigureOption(options, "imm.int", "55")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an mutable struct succeeds + ASSERT_OK(mc.ConfigureOption(options, "struct", "{int=44;}")); + ASSERT_EQ(opts->i, 44); + ASSERT_OK(mc.ConfigureOption(options, "struct.int", "55")); + ASSERT_EQ(opts->i, 55); + + // Setting nested immutable configurable options fail + ASSERT_NOK(mc.ConfigureOption(options, "shared", "{bool=true;}")); + ASSERT_NOK(mc.ConfigureOption(options, "shared.bool", "true")); + + // Setting nested mutable configurable options succeeds + ASSERT_OK(mc.ConfigureOption(options, "unique", "{bool=true}")); + ASSERT_OK(mc.ConfigureOption(options, "unique.bool", "true")); +} + +TEST_F(ConfigurableTest, DeprecatedOptionsTest) { + static std::unordered_map + deprecated_option_info = { + {"deprecated", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}}; + std::unique_ptr orig; + orig.reset(SimpleConfigurable::Create("simple", TestConfigMode::kDefaultMode, + &deprecated_option_info)); + auto* opts = orig->GetOptions("simple"); + ASSERT_NE(opts, nullptr); + opts->d = true; + ASSERT_OK(orig->ConfigureOption(config_options_, "deprecated", "false")); + ASSERT_TRUE(opts->d); + ASSERT_OK(orig->ConfigureFromString(config_options_, "deprecated=false")); + ASSERT_TRUE(opts->d); +} + +TEST_F(ConfigurableTest, AliasOptionsTest) { + static std::unordered_map alias_option_info = { + {"bool", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + 
OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"alias", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + OptionVerificationType::kAlias, OptionTypeFlags::kNone, 0}}}; + std::unique_ptr orig; + orig.reset(SimpleConfigurable::Create("simple", TestConfigMode::kDefaultMode, + &alias_option_info)); + auto* opts = orig->GetOptions("simple"); + ASSERT_NE(opts, nullptr); + ASSERT_OK(orig->ConfigureOption(config_options_, "bool", "false")); + ASSERT_FALSE(opts->b); + ASSERT_OK(orig->ConfigureOption(config_options_, "alias", "true")); + ASSERT_TRUE(opts->b); + std::string opts_str; + ASSERT_OK(orig->GetOptionString(config_options_, &opts_str)); + ASSERT_EQ(opts_str.find("alias"), std::string::npos); + + ASSERT_OK(orig->ConfigureOption(config_options_, "bool", "false")); + ASSERT_FALSE(opts->b); + ASSERT_OK(orig->GetOption(config_options_, "alias", &opts_str)); + ASSERT_EQ(opts_str, "false"); +} + +TEST_F(ConfigurableTest, NestedUniqueConfigTest) { + std::unique_ptr simple; + simple.reset( + SimpleConfigurable::Create("Outer", TestConfigMode::kAllOptMode)); + const auto outer = simple->GetOptions("Outer"); + const auto unique = + simple->GetOptions>("OuterUnique"); + ASSERT_NE(outer, nullptr); + ASSERT_NE(unique, nullptr); + ASSERT_OK( + simple->ConfigureFromString(config_options_, "int=24;string=outer")); + ASSERT_OK(simple->ConfigureFromString(config_options_, + "unique={int=42;string=nested}")); + const auto inner = unique->get()->GetOptions("UniqueOuter"); + ASSERT_NE(inner, nullptr); + ASSERT_EQ(outer->i, 24); + ASSERT_EQ(outer->s, "outer"); + ASSERT_EQ(inner->i, 42); + ASSERT_EQ(inner->s, "nested"); +} + +TEST_F(ConfigurableTest, NestedSharedConfigTest) { + std::unique_ptr simple; + simple.reset(SimpleConfigurable::Create( + "Outer", TestConfigMode::kDefaultMode | TestConfigMode::kSharedMode)); + ASSERT_OK( + simple->ConfigureFromString(config_options_, "int=24;string=outer")); + ASSERT_OK(simple->ConfigureFromString(config_options_, + 
"shared={int=42;string=nested}")); + const auto outer = simple->GetOptions("Outer"); + const auto shared = + simple->GetOptions>("OuterShared"); + ASSERT_NE(outer, nullptr); + ASSERT_NE(shared, nullptr); + const auto inner = shared->get()->GetOptions("SharedOuter"); + ASSERT_NE(inner, nullptr); + ASSERT_EQ(outer->i, 24); + ASSERT_EQ(outer->s, "outer"); + ASSERT_EQ(inner->i, 42); + ASSERT_EQ(inner->s, "nested"); +} + +TEST_F(ConfigurableTest, NestedRawConfigTest) { + std::unique_ptr simple; + simple.reset(SimpleConfigurable::Create( + "Outer", TestConfigMode::kDefaultMode | TestConfigMode::kRawPtrMode)); + ASSERT_OK( + simple->ConfigureFromString(config_options_, "int=24;string=outer")); + ASSERT_OK(simple->ConfigureFromString(config_options_, + "pointer={int=42;string=nested}")); + const auto outer = simple->GetOptions("Outer"); + const auto pointer = simple->GetOptions("OuterPointer"); + ASSERT_NE(outer, nullptr); + ASSERT_NE(pointer, nullptr); + const auto inner = (*pointer)->GetOptions("PointerOuter"); + ASSERT_NE(inner, nullptr); + ASSERT_EQ(outer->i, 24); + ASSERT_EQ(outer->s, "outer"); + ASSERT_EQ(inner->i, 42); + ASSERT_EQ(inner->s, "nested"); +} + +TEST_F(ConfigurableTest, MatchesTest) { + std::string mismatch; + std::unique_ptr base, copy; + base.reset(SimpleConfigurable::Create( + "simple", TestConfigMode::kDefaultMode | TestConfigMode::kNestedMode)); + copy.reset(SimpleConfigurable::Create( + "simple", TestConfigMode::kDefaultMode | TestConfigMode::kNestedMode)); + ASSERT_OK(base->ConfigureFromString( + config_options_, + "int=11;string=outer;unique={int=22;string=u};shared={int=33;string=s}")); + ASSERT_OK(copy->ConfigureFromString( + config_options_, + "int=11;string=outer;unique={int=22;string=u};shared={int=33;string=s}")); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->ConfigureOption(config_options_, "shared", "int=44")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + 
ASSERT_EQ(mismatch, "shared.int"); + std::string c1value, c2value; + ASSERT_OK(base->GetOption(config_options_, mismatch, &c1value)); + ASSERT_OK(copy->GetOption(config_options_, mismatch, &c2value)); + ASSERT_NE(c1value, c2value); +} + +static Configurable* SimpleStructFactory() { + return SimpleConfigurable::Create( + "simple-struct", TestConfigMode::kDefaultMode, &struct_option_info); +} + +TEST_F(ConfigurableTest, ConfigureStructTest) { + std::unique_ptr base(SimpleStructFactory()); + std::unique_ptr copy(SimpleStructFactory()); + std::string opt_str, value; + std::string mismatch; + std::unordered_set names; + + ASSERT_OK( + base->ConfigureFromString(config_options_, "struct={int=10; string=10}")); + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->GetOptionNames(config_options_, &names)); + ASSERT_EQ(names.size(), 1); + ASSERT_EQ(*(names.begin()), "struct"); + ASSERT_OK( + base->ConfigureFromString(config_options_, "struct={int=20; string=20}")); + ASSERT_OK(base->GetOption(config_options_, "struct", &value)); + ASSERT_OK(copy->ConfigureOption(config_options_, "struct", value)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_NOK(base->ConfigureFromString(config_options_, + "struct={int=10; string=10; bad=11}")); + ASSERT_OK(base->ConfigureOption(config_options_, "struct.int", "42")); + ASSERT_NOK(base->ConfigureOption(config_options_, "struct.bad", "42")); + ASSERT_NOK(base->GetOption(config_options_, "struct.bad", &value)); + ASSERT_OK(base->GetOption(config_options_, "struct.int", &value)); + ASSERT_EQ(value, "42"); +} + +TEST_F(ConfigurableTest, ConfigurableEnumTest) { + std::unique_ptr base, copy; + base.reset(SimpleConfigurable::Create("e", TestConfigMode::kEnumMode)); + copy.reset(SimpleConfigurable::Create("e", TestConfigMode::kEnumMode)); + + 
std::string opts_str; + std::string mismatch; + + ASSERT_OK(base->ConfigureFromString(config_options_, "enum=B")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->GetOptionString(config_options_, &opts_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opts_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_NOK(base->ConfigureOption(config_options_, "enum", "bad")); + ASSERT_NOK(base->ConfigureOption(config_options_, "unknown", "bad")); +} + +#ifndef ROCKSDB_LITE +static std::unordered_map noserialize_option_info = + { + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kDontSerialize}}, +}; + +TEST_F(ConfigurableTest, TestNoSerialize) { + std::unique_ptr base; + base.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, + &noserialize_option_info)); + std::string opts_str, value; + ASSERT_OK(base->ConfigureFromString(config_options_, "int=10")); + ASSERT_OK(base->GetOptionString(config_options_, &opts_str)); + ASSERT_EQ(opts_str, ""); + ASSERT_NOK(base->GetOption(config_options_, "int", &value)); +} + +TEST_F(ConfigurableTest, TestNoCompare) { + std::unordered_map nocomp_option_info = { + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}}, + }; + std::unordered_map normal_option_info = { + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + }; + + std::unique_ptr base, copy; + base.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, + &nocomp_option_info)); + copy.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, + &normal_option_info)); + ASSERT_OK(base->ConfigureFromString(config_options_, "int=10")); + ASSERT_OK(copy->ConfigureFromString(config_options_, "int=20")); + std::string bvalue, cvalue, mismatch; 
+ ASSERT_OK(base->GetOption(config_options_, "int", &bvalue)); + ASSERT_OK(copy->GetOption(config_options_, "int", &cvalue)); + ASSERT_EQ(bvalue, "10"); + ASSERT_EQ(cvalue, "20"); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_FALSE(copy->AreEquivalent(config_options_, base.get(), &mismatch)); +} + +TEST_F(ConfigurableTest, NullOptionMapTest) { + std::unique_ptr base; + std::unordered_set names; + std::string str; + + base.reset( + SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, nullptr)); + ASSERT_NOK(base->ConfigureFromString(config_options_, "int=10")); + ASSERT_NOK(base->ConfigureFromString(config_options_, "int=20")); + ASSERT_NOK(base->ConfigureOption(config_options_, "int", "20")); + ASSERT_NOK(base->GetOption(config_options_, "int", &str)); + ASSERT_NE(base->GetOptions("c"), nullptr); + ASSERT_OK(base->GetOptionNames(config_options_, &names)); + ASSERT_EQ(names.size(), 0UL); + ASSERT_OK(base->PrepareOptions(config_options_)); + ASSERT_OK(base->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + std::unique_ptr copy; + copy.reset( + SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, nullptr)); + ASSERT_OK(base->GetOptionString(config_options_, &str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &str)); +} +#endif + +static std::unordered_map TestFactories = { + {"Simple", []() { return SimpleConfigurable::Create("simple"); }}, + {"Struct", []() { return SimpleStructFactory(); }}, + {"Unique", + []() { + return SimpleConfigurable::Create( + "simple", TestConfigMode::kSimpleMode | TestConfigMode::kUniqueMode); + }}, + {"Shared", + []() { + return SimpleConfigurable::Create( + "simple", TestConfigMode::kSimpleMode | TestConfigMode::kSharedMode); + }}, + {"Nested", + []() { + return SimpleConfigurable::Create( + "simple", TestConfigMode::kSimpleMode | TestConfigMode::kNestedMode); + }}, + {"Mutable", + []() { + 
return SimpleConfigurable::Create("simple", + TestConfigMode::kMutableMode | + TestConfigMode::kSimpleMode | + TestConfigMode::kNestedMode); + }}, + {"ThreeDeep", + []() { + Configurable* simple = SimpleConfigurable::Create( + "Simple", + TestConfigMode::kUniqueMode | TestConfigMode::kDefaultMode); + auto* unique = + simple->GetOptions>("SimpleUnique"); + unique->reset(SimpleConfigurable::Create( + "Child", + TestConfigMode::kUniqueMode | TestConfigMode::kDefaultMode)); + unique = unique->get()->GetOptions>( + "ChildUnique"); + unique->reset( + SimpleConfigurable::Create("Child", TestConfigMode::kDefaultMode)); + return simple; + }}, + {"DBOptions", + []() { + auto config = DBOptionsAsConfigurable(DBOptions()); + return config.release(); + }}, + {"CFOptions", + []() { + auto config = CFOptionsAsConfigurable(ColumnFamilyOptions()); + return config.release(); + }}, + {"BlockBased", []() { return NewBlockBasedTableFactory(); }}, +}; + +class ConfigurableParamTest : public ConfigurableTest, + virtual public ::testing::WithParamInterface< + std::pair> { + public: + ConfigurableParamTest() { + type_ = GetParam().first; + configuration_ = GetParam().second; + assert(TestFactories.find(type_) != TestFactories.end()); + object_.reset(CreateConfigurable()); + } + + Configurable* CreateConfigurable() { + const auto& iter = TestFactories.find(type_); + return (iter->second)(); + } + + void TestConfigureOptions(const ConfigOptions& opts); + std::string type_; + std::string configuration_; + std::unique_ptr object_; +}; + +void ConfigurableParamTest::TestConfigureOptions( + const ConfigOptions& config_options) { + std::unique_ptr base, copy; + std::unordered_set names; + std::string opt_str, mismatch; + + base.reset(CreateConfigurable()); + copy.reset(CreateConfigurable()); + + ASSERT_OK(base->ConfigureFromString(config_options, configuration_)); + ASSERT_OK(base->GetOptionString(config_options, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options, opt_str)); + 
ASSERT_OK(copy->GetOptionString(config_options, &opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options, copy.get(), &mismatch)); + + copy.reset(CreateConfigurable()); + ASSERT_OK(base->GetOptionNames(config_options, &names)); + std::unordered_map unused; + bool found_one = false; + for (auto name : names) { + std::string value; + Status s = base->GetOption(config_options, name, &value); + if (s.ok()) { + s = copy->ConfigureOption(config_options, name, value); + if (s.ok() || s.IsNotSupported()) { + found_one = true; + } else { + unused[name] = value; + } + } else { + ASSERT_TRUE(s.IsNotSupported()); + } + } + ASSERT_TRUE(found_one || names.empty()); + while (found_one && !unused.empty()) { + found_one = false; + for (auto iter = unused.begin(); iter != unused.end();) { + if (copy->ConfigureOption(config_options, iter->first, iter->second) + .ok()) { + found_one = true; + iter = unused.erase(iter); + } else { + ++iter; + } + } + } + ASSERT_EQ(0, unused.size()); + ASSERT_TRUE(base->AreEquivalent(config_options, copy.get(), &mismatch)); +} + +TEST_P(ConfigurableParamTest, GetDefaultOptionsTest) { + TestConfigureOptions(config_options_); +} + +TEST_P(ConfigurableParamTest, ConfigureFromPropsTest) { + std::string opt_str, mismatch; + std::unordered_set names; + std::unique_ptr copy(CreateConfigurable()); + + ASSERT_OK(object_->ConfigureFromString(config_options_, configuration_)); + config_options_.delimiter = "\n"; + ASSERT_OK(object_->GetOptionString(config_options_, &opt_str)); + std::istringstream iss(opt_str); + std::unordered_map copy_map; + std::string line; + for (int line_num = 0; std::getline(iss, line); line_num++) { + std::string name; + std::string value; + ASSERT_OK( + RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num)); + copy_map[name] = value; + } + ASSERT_OK(copy->ConfigureFromMap(config_options_, copy_map)); + ASSERT_TRUE(object_->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +INSTANTIATE_TEST_CASE_P( + ParamTest, 
ConfigurableParamTest, + testing::Values( + std::pair("Simple", + "int=42;bool=true;string=s"), + std::pair( + "Mutable", "int=42;unique={int=33;string=unique}"), + std::pair( + "Struct", "struct={int=33;bool=true;string=s;}"), + std::pair("Shared", + "int=33;bool=true;string=outer;" + "shared={int=42;string=shared}"), + std::pair("Unique", + "int=33;bool=true;string=outer;" + "unique={int=42;string=unique}"), + std::pair("Nested", + "int=11;bool=true;string=outer;" + "pointer={int=22;string=pointer};" + "unique={int=33;string=unique};" + "shared={int=44;string=shared}"), + std::pair("ThreeDeep", + "int=11;bool=true;string=outer;" + "unique={int=22;string=inner;" + "unique={int=33;string=unique}};"), + std::pair("DBOptions", + "max_background_jobs=100;" + "max_open_files=200;"), + std::pair("CFOptions", + "table_factory=BlockBasedTable;" + "disable_auto_compactions=true;"), + std::pair("BlockBased", + "block_size=1024;" + "no_block_cache=true;"))); +#endif // ROCKSDB_LITE + +} // namespace test +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include + +#include "options/configurable_helper.h" +#include "rocksdb/configurable.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +struct ColumnFamilyOptions; +struct DBOptions; + +namespace test { +enum TestEnum { kTestA, kTestB }; + +static const std::unordered_map test_enum_map = { + {"A", TestEnum::kTestA}, + {"B", TestEnum::kTestB}, +}; + +struct TestOptions { + int i = 0; + bool b = false; + bool d = true; + TestEnum e = TestEnum::kTestA; + std::string s = ""; + std::string u = ""; +}; + +static std::unordered_map simple_option_info = { +#ifndef ROCKSDB_LITE + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"string", + {offsetof(struct TestOptions, s), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map enum_option_info = { +#ifndef ROCKSDB_LITE + {"enum", + OptionTypeInfo::Enum(offsetof(struct TestOptions, e), &test_enum_map)} +#endif +}; + +static std::unordered_map unique_option_info = { +#ifndef ROCKSDB_LITE + {"unique", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + (OptionTypeFlags::kUnique | OptionTypeFlags::kMutable)}}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map shared_option_info = { +#ifndef ROCKSDB_LITE + {"shared", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + (OptionTypeFlags::kShared)}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map pointer_option_info = { +#ifndef ROCKSDB_LITE + {"pointer", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + 
OptionTypeFlags::kRawPointer}}, +#endif // ROCKSDB_LITE +}; + +enum TestConfigMode { + kEmptyMode = 0x0, // Don't register anything + kMutableMode = 0x01, // Configuration is mutable + kSimpleMode = 0x02, // Use the simple options + kEnumMode = 0x04, // Use the enum options + kDefaultMode = kSimpleMode, // Use no inner nested configurations + kSharedMode = 0x10, // Use shared configuration + kUniqueMode = 0x20, // Use unique configuration + kRawPtrMode = 0x40, // Use pointer configuration + kNestedMode = (kSharedMode | kUniqueMode | kRawPtrMode), + kAllOptMode = (kNestedMode | kEnumMode | kSimpleMode), +}; + +template +class TestConfigurable : public Configurable { + protected: + std::string name_; + std::string prefix_; + TestOptions options_; + + public: + std::unique_ptr unique_; + std::shared_ptr shared_; + T* pointer_; + + TestConfigurable(const std::string& name, int mode, + const std::unordered_map* map = + &simple_option_info) + : name_(name), pointer_(nullptr) { + prefix_ = "test." + name + "."; + if ((mode & TestConfigMode::kSimpleMode) != 0) { + RegisterOptions(name_, &options_, map); + } + if ((mode & TestConfigMode::kEnumMode) != 0) { + RegisterOptions(name_ + "Enum", &options_, &enum_option_info); + } + } + + ~TestConfigurable() override { delete pointer_; } +}; + +} // namespace test +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/customizable.h" + +#include + +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string Customizable::GetOptionName(const std::string& long_name) const { + const std::string& name = Name(); + size_t name_len = name.size(); + if (long_name.size() > name_len + 1 && + long_name.compare(0, name_len, name) == 0 && + long_name.at(name_len) == '.') { + return long_name.substr(name_len + 1); + } else { + return Configurable::GetOptionName(long_name); + } +} + +std::string Customizable::GenerateIndividualId() const { + std::ostringstream ostr; + ostr << Name() << "@" << static_cast(this) << "#" + << port::GetProcessID(); + return ostr.str(); +} + +#ifndef ROCKSDB_LITE +Status Customizable::GetOption(const ConfigOptions& config_options, + const std::string& opt_name, + std::string* value) const { + if (opt_name == OptionTypeInfo::kIdPropName()) { + *value = GetId(); + return Status::OK(); + } else { + return Configurable::GetOption(config_options, opt_name, value); + } +} + +std::string Customizable::SerializeOptions(const ConfigOptions& config_options, + const std::string& prefix) const { + std::string result; + std::string parent; + std::string id = GetId(); + if (!config_options.IsShallow() && !id.empty()) { + parent = Configurable::SerializeOptions(config_options, ""); + } + if (parent.empty()) { + result = id; + } else { + result.append(prefix); + result.append(OptionTypeInfo::kIdPropName()); + result.append("="); + result.append(id); + result.append(config_options.delimiter); + result.append(parent); + } + return result; +} + +#endif // ROCKSDB_LITE + +bool Customizable::AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const { + if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && + this != 
other) { + const Customizable* custom = reinterpret_cast(other); + if (GetId() != custom->GetId()) { + *mismatch = OptionTypeInfo::kIdPropName(); + return false; + } else if (config_options.sanity_level > + ConfigOptions::kSanityLevelLooselyCompatible) { + bool matches = + Configurable::AreEquivalent(config_options, other, mismatch); + return matches; + } + } + return true; +} + +Status Customizable::GetOptionsMap( + const ConfigOptions& config_options, const Customizable* customizable, + const std::string& value, std::string* id, + std::unordered_map* props) { + Status status; + if (value.empty() || value == kNullptrString) { + *id = ""; + props->clear(); + } else if (customizable != nullptr) { + status = + Configurable::GetOptionsMap(value, customizable->GetId(), id, props); +#ifdef ROCKSDB_LITE + (void)config_options; +#else + if (status.ok() && customizable->IsInstanceOf(*id)) { + // The new ID and the old ID match, so the objects are the same type. + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + std::string curr_opts; + if (customizable->GetOptionString(embedded, &curr_opts).ok()) { + std::unordered_map curr_props; + if (StringToMap(curr_opts, &curr_props).ok()) { + props->insert(curr_props.begin(), curr_props.end()); + } + } + } +#endif // ROCKSDB_LITE + } else { + status = Configurable::GetOptionsMap(value, "", id, props); + } + return status; +} + +Status Customizable::ConfigureNewObject( + const ConfigOptions& config_options, Customizable* object, + const std::unordered_map& opt_map) { + Status status; + if (object != nullptr) { + status = object->ConfigureFromMap(config_options, opt_map); + } else if (!opt_map.empty()) { + status = Status::InvalidArgument("Cannot configure null object "); + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable_test.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,2132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/customizable.h" + +#include +#include +#include +#include + +#include "db/db_test_util.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env_encryption.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/memory_allocator.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/sst_partitioner.h" +#include "rocksdb/statistics.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "table/block_based/flush_block_policy.h" +#include "table/mock_table.h" +#include "test_util/mock_time_env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/file_checksum_helper.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" +#include "utilities/memory_allocators.h" +#include "utilities/merge_operators/bytesxor.h" +#include 
"utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { +namespace { +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + char buffer[1000]; + vsnprintf(buffer, sizeof(buffer), format, ap); + string_.append(buffer); + } + const std::string& str() const { return string_; } + void clear() { string_.clear(); } + + private: + std::string string_; +}; + +class TestCustomizable : public Customizable { + public: + TestCustomizable(const std::string& name) : name_(name) {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { + return "TestCustomizable"; + } + + const char* Name() const override { return name_.c_str(); } + static const char* Type() { return "test.custom"; } +#ifndef ROCKSDB_LITE + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::unique_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::shared_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + TestCustomizable** result); +#endif // ROCKSDB_LITE + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const std::string name_; +}; + +struct AOptions { + static const char* kName() { return "A"; } + int i = 0; + bool b = false; +}; + +static std::unordered_map a_option_info = { +#ifndef ROCKSDB_LITE + {"int", + {offsetof(struct AOptions, i), 
OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct AOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +class ACustomizable : public TestCustomizable { + public: + explicit ACustomizable(const std::string& id) + : TestCustomizable("A"), id_(id) { + RegisterOptions(&opts_, &a_option_info); + } + std::string GetId() const override { return id_; } + static const char* kClassName() { return "A"; } + + private: + AOptions opts_; + const std::string id_; +}; + +struct BOptions { + std::string s; + bool b = false; +}; + +static std::unordered_map b_option_info = { +#ifndef ROCKSDB_LITE + {"string", + {offsetof(struct BOptions, s), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bool", + {offsetof(struct BOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +class BCustomizable : public TestCustomizable { + private: + public: + explicit BCustomizable(const std::string& name) : TestCustomizable(name) { + RegisterOptions(name, &opts_, &b_option_info); + } + static const char* kClassName() { return "B"; } + + private: + BOptions opts_; +}; + +#ifndef ROCKSDB_LITE +static bool LoadSharedB(const std::string& id, + std::shared_ptr* result) { + if (id == "B") { + result->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + result->reset(); + return true; + } else { + return false; + } +} + +static int A_count = 0; +static int RegisterCustomTestObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + ObjectLibrary::PatternEntry("A", true).AddSeparator("_"), + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new ACustomizable(name)); + A_count++; + return guard->get(); + }); + + library.AddFactory( + "S", [](const std::string& name, + 
std::unique_ptr* /* guard */, + std::string* /* msg */) { return new BCustomizable(name); }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +struct SimpleOptions { + static const char* kName() { return "simple"; } + bool b = true; + std::unique_ptr cu; + std::shared_ptr cs; + TestCustomizable* cp = nullptr; +}; + +static SimpleOptions dummy_simple_options; +template +int offset_of(T1 SimpleOptions::*member) { + return static_cast( + reinterpret_cast( + std::addressof(dummy_simple_options.*member)) - + reinterpret_cast(std::addressof(dummy_simple_options))); +} + +static std::unordered_map simple_option_info = { +#ifndef ROCKSDB_LITE + {"bool", + {offset_of(&SimpleOptions::b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"unique", + OptionTypeInfo::AsCustomUniquePtr( + offset_of(&SimpleOptions::cu), OptionVerificationType::kNormal, + OptionTypeFlags::kAllowNull)}, + {"shared", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&SimpleOptions::cs), OptionVerificationType::kNormal, + OptionTypeFlags::kAllowNull)}, + {"pointer", + OptionTypeInfo::AsCustomRawPtr( + offset_of(&SimpleOptions::cp), OptionVerificationType::kNormal, + OptionTypeFlags::kAllowNull)}, +#endif // ROCKSDB_LITE +}; + +class SimpleConfigurable : public Configurable { + private: + SimpleOptions simple_; + + public: + SimpleConfigurable() { RegisterOptions(&simple_, &simple_option_info); } + + explicit SimpleConfigurable( + const std::unordered_map* map) { + RegisterOptions(&simple_, map); + } +}; + +#ifndef ROCKSDB_LITE +static void GetMapFromProperties( + const std::string& props, + std::unordered_map* map) { + std::istringstream iss(props); + std::unordered_map copy_map; + std::string line; + map->clear(); + for (int line_num = 0; std::getline(iss, line); line_num++) { + std::string name; + std::string value; + ASSERT_OK( + RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num)); + 
(*map)[name] = value; + } +} +#endif // ROCKSDB_LITE +} // namespace + +#ifndef ROCKSDB_LITE +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + return LoadSharedObject(config_options, value, LoadSharedB, + result); +} + +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr* result) { + return LoadUniqueObject( + config_options, value, + [](const std::string& id, std::unique_ptr* u) { + if (id == "B") { + u->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + u->reset(); + return true; + } else { + return false; + } + }, + result); +} + +Status TestCustomizable::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + TestCustomizable** result) { + return LoadStaticObject( + config_options, value, + [](const std::string& id, TestCustomizable** ptr) { + if (id == "B") { + *ptr = new BCustomizable(id); + return true; + } else if (id.empty()) { + *ptr = nullptr; + return true; + } else { + return false; + } + }, + result); +} +#endif // ROCKSDB_LITE + +class CustomizableTest : public testing::Test { + public: + CustomizableTest() { + config_options_.invoke_prepare_options = false; +#ifndef ROCKSDB_LITE + // GetOptionsFromMap is not supported in ROCKSDB_LITE + config_options_.registry->AddLibrary("CustomizableTest", + RegisterCustomTestObjects, ""); +#endif // ROCKSDB_LITE + } + + ConfigOptions config_options_; +}; + +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE +// Tests that a Customizable can be created by: +// - a simple name +// - a XXX.id option +// - a property with a name +TEST_F(CustomizableTest, CreateByNameTest) { + ObjectLibrary::Default()->AddFactory( + ObjectLibrary::PatternEntry("TEST", false).AddSeparator("_"), + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new 
TestCustomizable(name)); + return guard->get(); + }); + std::unique_ptr configurable(new SimpleConfigurable()); + SimpleOptions* simple = configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique={id=TEST_1}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_1"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique.id=TEST_2")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_2"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique=TEST_3")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_3"); +} + +TEST_F(CustomizableTest, ToStringTest) { + std::unique_ptr custom(new TestCustomizable("test")); + ASSERT_EQ(custom->ToString(config_options_), "test"); +} + +TEST_F(CustomizableTest, SimpleConfigureTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=B;string=s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CustomizableTest, ConfigureFromPropsTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = 
configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + config_options_.delimiter = "\n"; + std::unordered_map props; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + GetMapFromProperties(opt_str, &props); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromMap(config_options_, props)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CustomizableTest, ConfigureFromShortTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); +} + +TEST_F(CustomizableTest, AreEquivalentOptionsTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=A;int=1;bool=true"}, + }; + std::string mismatch; + ConfigOptions config_options = config_options_; + std::unique_ptr c1(new SimpleConfigurable()); + std::unique_ptr c2(new SimpleConfigurable()); + ASSERT_OK(c1->ConfigureFromMap(config_options, opt_map)); + ASSERT_OK(c2->ConfigureFromMap(config_options, opt_map)); + ASSERT_TRUE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + SimpleOptions* simple = c1->GetOptions(); + ASSERT_TRUE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_OK(simple->cu->ConfigureOption(config_options, "int", "2")); + ASSERT_FALSE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ConfigOptions loosely = config_options; + 
loosely.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_TRUE(simple->cu->AreEquivalent(loosely, simple->cs.get(), &mismatch)); + + ASSERT_OK(c1->ConfigureOption(config_options, "shared", "id=B;string=3")); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ASSERT_FALSE(simple->cs->AreEquivalent(loosely, simple->cu.get(), &mismatch)); + simple->cs.reset(); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); +} + +// Tests that we can initialize a customizable from its options +TEST_F(CustomizableTest, ConfigureStandaloneCustomTest) { + std::unique_ptr base, copy; + const auto& registry = config_options_.registry; + ASSERT_OK(registry->NewUniqueObject("A", &base)); + ASSERT_OK(registry->NewUniqueObject("A", ©)); + ASSERT_OK(base->ConfigureFromString(config_options_, "int=33;bool=true")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +// Tests that we fail appropriately if the pattern is not registered +TEST_F(CustomizableTest, BadNameTest) { + config_options_.ignore_unsupported_options = false; + std::unique_ptr c1(new SimpleConfigurable()); + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); + config_options_.ignore_unsupported_options = true; + ASSERT_OK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); +} + +// Tests that we fail appropriately if a bad option is passed to the underlying +// configurable +TEST_F(CustomizableTest, BadOptionTest) { + std::unique_ptr c1(new SimpleConfigurable()); + ConfigOptions ignore = config_options_; + 
ignore.ignore_unknown_options = true; + + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "shared={id=B;int=1}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared={id=A;string=s}")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "B.int=11")); + ASSERT_OK(c1->ConfigureFromString(ignore, "B.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.string=s")); + ASSERT_OK(c1->ConfigureFromString(ignore, "A.string=s")); + // Test as detached + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "shared.id=A;A.string=b}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=A;A.string=s}")); +} + +// Tests that different IDs lead to different objects +TEST_F(CustomizableTest, UniqueIdTest) { + std::unique_ptr base(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), std::string("A_1")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=true}")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_EQ(simple->cu->GetId(), std::string("A_2")); +} + +TEST_F(CustomizableTest, IsInstanceOfTest) { + std::shared_ptr tc = std::make_shared("A_1"); + + ASSERT_EQ(tc->GetId(), std::string("A_1")); + ASSERT_TRUE(tc->IsInstanceOf("A")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("B")); + ASSERT_FALSE(tc->IsInstanceOf("A_1")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + 
ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); + + tc.reset(new BCustomizable("B")); + ASSERT_TRUE(tc->IsInstanceOf("B")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("A")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); +} + +TEST_F(CustomizableTest, PrepareOptionsTest) { + static std::unordered_map p_option_info = { +#ifndef ROCKSDB_LITE + {"can_prepare", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE + }; + + class PrepareCustomizable : public TestCustomizable { + public: + bool can_prepare_ = true; + + PrepareCustomizable() : TestCustomizable("P") { + RegisterOptions("Prepare", &can_prepare_, &p_option_info); + } + + Status PrepareOptions(const ConfigOptions& opts) override { + if (!can_prepare_) { + return Status::InvalidArgument("Cannot Prepare"); + } else { + return TestCustomizable::PrepareOptions(opts); + } + } + }; + + ObjectLibrary::Default()->AddFactory( + "P", + [](const std::string& /*name*/, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new PrepareCustomizable()); + return guard->get(); + }); + + std::unique_ptr base(new SimpleConfigurable()); + ConfigOptions prepared(config_options_); + prepared.invoke_prepare_options = true; + + ASSERT_OK(base->ConfigureFromString( + prepared, "unique=A_1; shared={id=B;string=s}; pointer.id=S")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_NE(simple->cs, nullptr); + ASSERT_NE(simple->cp, nullptr); + delete simple->cp; + base.reset(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString( + config_options_, "unique=A_1; shared={id=B;string=s}; pointer.id=S")); + + simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_NE(simple->cs, nullptr); + 
ASSERT_NE(simple->cp, nullptr); + + ASSERT_OK(base->PrepareOptions(config_options_)); + delete simple->cp; + base.reset(new SimpleConfigurable()); + simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + + ASSERT_NOK( + base->ConfigureFromString(prepared, "unique={id=P; can_prepare=false}")); + ASSERT_EQ(simple->cu, nullptr); + + ASSERT_OK( + base->ConfigureFromString(prepared, "unique={id=P; can_prepare=true}")); + ASSERT_NE(simple->cu, nullptr); + + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=P; can_prepare=true}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_OK(simple->cu->PrepareOptions(prepared)); + + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=P; can_prepare=false}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_NOK(simple->cu->PrepareOptions(prepared)); +} + +namespace { +static std::unordered_map inner_option_info = { +#ifndef ROCKSDB_LITE + {"inner", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kStringNameOnly)} +#endif // ROCKSDB_LITE +}; + +struct InnerOptions { + static const char* kName() { return "InnerOptions"; } + std::shared_ptr inner; +}; + +class InnerCustomizable : public Customizable { + public: + explicit InnerCustomizable(const std::shared_ptr& w) { + iopts_.inner = w; + RegisterOptions(&iopts_, &inner_option_info); + } + static const char* kClassName() { return "Inner"; } + const char* Name() const override { return kClassName(); } + + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const Customizable* Inner() const override { return iopts_.inner.get(); } + + private: + InnerOptions iopts_; +}; + +struct WrappedOptions1 { + static const char* kName() { return "WrappedOptions1"; } + int i = 42; +}; + +class WrappedCustomizable1 : public InnerCustomizable { + public: + explicit WrappedCustomizable1(const 
std::shared_ptr& w) + : InnerCustomizable(w) { + RegisterOptions(&wopts_, nullptr); + } + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped1"; } + + private: + WrappedOptions1 wopts_; +}; + +struct WrappedOptions2 { + static const char* kName() { return "WrappedOptions2"; } + std::string s = "42"; +}; +class WrappedCustomizable2 : public InnerCustomizable { + public: + explicit WrappedCustomizable2(const std::shared_ptr& w) + : InnerCustomizable(w) {} + const void* GetOptionsPtr(const std::string& name) const override { + if (name == WrappedOptions2::kName()) { + return &wopts_; + } else { + return InnerCustomizable::GetOptionsPtr(name); + } + } + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped2"; } + + private: + WrappedOptions2 wopts_; +}; +} // namespace + +TEST_F(CustomizableTest, WrappedInnerTest) { + std::shared_ptr ac = + std::make_shared("A"); + + ASSERT_TRUE(ac->IsInstanceOf("A")); + ASSERT_TRUE(ac->IsInstanceOf("TestCustomizable")); + ASSERT_EQ(ac->CheckedCast(), ac.get()); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + std::shared_ptr wc1 = + std::make_shared(ac); + + ASSERT_TRUE(wc1->IsInstanceOf(WrappedCustomizable1::kClassName())); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), nullptr); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), ac.get()); + + std::shared_ptr wc2 = + std::make_shared(wc1); + ASSERT_TRUE(wc2->IsInstanceOf(WrappedCustomizable2::kClassName())); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), wc1.get()); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), ac.get()); +} + +TEST_F(CustomizableTest, CustomizableInnerTest) { + std::shared_ptr c = + std::make_shared(std::make_shared("a")); + std::shared_ptr wc1 = 
std::make_shared(c); + std::shared_ptr wc2 = std::make_shared(c); + auto inner = c->GetOptions(); + ASSERT_NE(inner, nullptr); + + auto aopts = c->GetOptions(); + ASSERT_NE(aopts, nullptr); + ASSERT_EQ(aopts, wc1->GetOptions()); + ASSERT_EQ(aopts, wc2->GetOptions()); + auto w1opts = wc1->GetOptions(); + ASSERT_NE(w1opts, nullptr); + ASSERT_EQ(c->GetOptions(), nullptr); + ASSERT_EQ(wc2->GetOptions(), nullptr); + + auto w2opts = wc2->GetOptions(); + ASSERT_NE(w2opts, nullptr); + ASSERT_EQ(c->GetOptions(), nullptr); + ASSERT_EQ(wc1->GetOptions(), nullptr); +} + +TEST_F(CustomizableTest, CopyObjectTest) { + class CopyCustomizable : public Customizable { + public: + CopyCustomizable() : prepared_(0), validated_(0) {} + const char* Name() const override { return "CopyCustomizable"; } + + Status PrepareOptions(const ConfigOptions& options) override { + prepared_++; + return Customizable::PrepareOptions(options); + } + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + validated_++; + return Customizable::ValidateOptions(db_opts, cf_opts); + } + int prepared_; + mutable int validated_; + }; + + CopyCustomizable c1; + ConfigOptions config_options; + Options options; + + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c1.prepared_, 1); + ASSERT_EQ(c1.validated_, 1); + CopyCustomizable c2 = c1; + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c2.prepared_, 1); + ASSERT_EQ(c2.validated_, 1); + ASSERT_EQ(c1.prepared_, 2); + ASSERT_EQ(c1.validated_, 2); +} + +TEST_F(CustomizableTest, TestStringDepth) { + ConfigOptions shallow = config_options_; + std::unique_ptr c( + new InnerCustomizable(std::make_shared("a"))); + std::string opt_str; + shallow.depth = ConfigOptions::Depth::kDepthShallow; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_EQ(opt_str, "inner=a;"); + shallow.depth = 
ConfigOptions::Depth::kDepthDetailed; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_NE(opt_str, "inner=a;"); +} + +// Tests that we only get a new customizable when it changes +TEST_F(CustomizableTest, NewUniqueCustomizableTest) { + std::unique_ptr base(new SimpleConfigurable()); + A_count = 0; + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(A_count, 1); // Created one A + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=false}")); + ASSERT_EQ(A_count, 2); // Create another A_1 + ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=}")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_EQ(A_count, 2); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=false}")); + ASSERT_EQ(A_count, 3); // Created another A + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "unique=nullptr")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=nullptr")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_EQ(A_count, 3); +} + +TEST_F(CustomizableTest, NewEmptyUniqueTest) { + std::unique_ptr base(new SimpleConfigurable()); + SimpleOptions* simple = base->GetOptions(); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=}")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=nullptr}")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new 
BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique=nullptr")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=nullptr")); + ASSERT_EQ(simple->cu, nullptr); +} + +TEST_F(CustomizableTest, NewEmptySharedTest) { + std::unique_ptr base(new SimpleConfigurable()); + + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared={id=}")); + ASSERT_NE(simple, nullptr); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared={id=nullptr}")); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared.id=")); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared.id=nullptr")); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared=nullptr")); + ASSERT_EQ(simple->cs, nullptr); +} + +TEST_F(CustomizableTest, NewEmptyStaticTest) { + std::unique_ptr base(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer={id=}")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_EQ(simple->cp, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer={id=nullptr}")); + ASSERT_EQ(simple->cp, nullptr); + + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer=")); + ASSERT_EQ(simple->cp, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer=nullptr")); + ASSERT_EQ(simple->cp, nullptr); + + ASSERT_OK(base->ConfigureFromString(config_options_, 
"pointer.id=")); + ASSERT_EQ(simple->cp, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer.id=nullptr")); + ASSERT_EQ(simple->cp, nullptr); +} + +namespace { +#ifndef ROCKSDB_LITE +static std::unordered_map vector_option_info = { + {"vector", + OptionTypeInfo::Vector>( + 0, OptionVerificationType::kNormal, + + OptionTypeFlags::kNone, + + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone))}, +}; +class VectorConfigurable : public SimpleConfigurable { + public: + VectorConfigurable() { RegisterOptions("vector", &cv, &vector_option_info); } + std::vector> cv; +}; +} // namespace + +TEST_F(CustomizableTest, VectorConfigTest) { + VectorConfigurable orig, copy; + std::shared_ptr c1, c2; + ASSERT_OK(TestCustomizable::CreateFromString(config_options_, "A", &c1)); + ASSERT_OK(TestCustomizable::CreateFromString(config_options_, "B", &c2)); + orig.cv.push_back(c1); + orig.cv.push_back(c2); + ASSERT_OK(orig.ConfigureFromString(config_options_, "unique=A2")); + std::string opt_str, mismatch; + ASSERT_OK(orig.GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy.ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(orig.AreEquivalent(config_options_, ©, &mismatch)); +} + +TEST_F(CustomizableTest, NoNameTest) { + // If Customizables are created without names, they are not + // part of the serialization (since they cannot be recreated) + VectorConfigurable orig, copy; + auto sopts = orig.GetOptions(); + auto copts = copy.GetOptions(); + sopts->cu.reset(new ACustomizable("")); + orig.cv.push_back(std::make_shared("")); + orig.cv.push_back(std::make_shared("A_1")); + std::string opt_str, mismatch; + ASSERT_OK(orig.GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy.ConfigureFromString(config_options_, opt_str)); + ASSERT_EQ(copy.cv.size(), 1U); + ASSERT_EQ(copy.cv[0]->GetId(), "A_1"); + ASSERT_EQ(copts->cu, nullptr); +} + +#endif // ROCKSDB_LITE + +TEST_F(CustomizableTest, 
IgnoreUnknownObjects) { + ConfigOptions ignore = config_options_; + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ignore.ignore_unsupported_options = false; + ASSERT_NOK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + ASSERT_NOK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_NOK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ignore.ignore_unsupported_options = true; + ASSERT_OK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + ASSERT_OK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_OK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown", nullptr, + &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown", nullptr, + &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown", nullptr, + &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown;option=bad", + nullptr, &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown;option=bad", + nullptr, &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown;option=bad", + nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, FactoryFunctionTest) { + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ConfigOptions ignore = config_options_; + ignore.ignore_unsupported_options = false; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", 
&pointer)); + ASSERT_NE(shared.get(), nullptr); + ASSERT_NE(unique.get(), nullptr); + ASSERT_NE(pointer, nullptr); + delete pointer; + pointer = nullptr; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &shared)); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &unique)); + ASSERT_NOK( + TestCustomizable::CreateFromString(ignore, "option=bad", &pointer)); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, URLFactoryTest) { + std::unique_ptr unique; + config_options_.registry->AddLibrary("URL")->AddFactory( + ObjectLibrary::PatternEntry("Z", false).AddSeparator(""), + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new TestCustomizable(name)); + return guard->get(); + }); + + ConfigOptions ignore = config_options_; + ignore.ignore_unsupported_options = false; + ignore.ignore_unsupported_options = false; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1;x=y", &unique)); + ASSERT_NE(unique, nullptr); + ASSERT_EQ(unique->GetId(), "Z=1;x=y"); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z;x=y", &unique)); + ASSERT_NE(unique, nullptr); + ASSERT_EQ(unique->GetId(), "Z;x=y"); + unique.reset(); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1?x=y", &unique)); + ASSERT_NE(unique, nullptr); + ASSERT_EQ(unique->GetId(), "Z=1?x=y"); +} + +TEST_F(CustomizableTest, MutableOptionsTest) { + static std::unordered_map mutable_option_info = { + {"mutable", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}}; + static std::unordered_map immutable_option_info = + {{"immutable", + 
OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kAllowNull)}}; + + class MutableCustomizable : public Customizable { + private: + std::shared_ptr mutable_; + std::shared_ptr immutable_; + + public: + MutableCustomizable() { + RegisterOptions("mutable", &mutable_, &mutable_option_info); + RegisterOptions("immutable", &immutable_, &immutable_option_info); + } + const char* Name() const override { return "MutableCustomizable"; } + }; + MutableCustomizable mc, mc2; + std::string mismatch; + std::string opt_str; + + ConfigOptions options = config_options_; + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=B;}")); + options.mutable_options_only = true; + ASSERT_OK(mc.GetOptionString(options, &opt_str)); + ASSERT_OK(mc2.ConfigureFromString(options, opt_str)); + ASSERT_TRUE(mc.AreEquivalent(options, &mc2, &mismatch)); + + options.mutable_options_only = false; + ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=A; int=10}")); + auto* mm = mc.GetOptions>("mutable"); + auto* im = mc.GetOptions>("immutable"); + ASSERT_NE(mm, nullptr); + ASSERT_NE(mm->get(), nullptr); + ASSERT_NE(im, nullptr); + ASSERT_NE(im->get(), nullptr); + + // Now only deal with mutable options + options.mutable_options_only = true; + + // Setting nested immutable customizable options fails + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{id=B;}")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.id", "B")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.bool", "true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "bool=true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{int=11;bool=true}")); + auto* im_a = im->get()->GetOptions("A"); + ASSERT_NE(im_a, nullptr); + ASSERT_EQ(im_a->i, 10); + ASSERT_EQ(im_a->b, false); + + // Setting nested mutable customizable options succeeds but the object did not + // change + ASSERT_OK(mc.ConfigureOption(options, "immutable.int", "11")); + ASSERT_EQ(im_a->i, 11); + 
ASSERT_EQ(im_a, im->get()->GetOptions("A")); + + // The mutable configurable itself can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable.id", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=A}")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + + // The Nested options in the mutable object can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + auto* mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->b, true); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{int=22;bool=false}")); + mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->i, 22); + ASSERT_EQ(mm_a->b, false); + + // Only the mutable options should get serialized + options.mutable_options_only = false; + ASSERT_OK(mc.GetOptionString(options, &opt_str)); + ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=B;}")); + options.mutable_options_only = true; + + ASSERT_OK(mc.GetOptionString(options, &opt_str)); + ASSERT_OK(mc2.ConfigureFromString(options, opt_str)); + ASSERT_TRUE(mc.AreEquivalent(options, &mc2, &mismatch)); + options.mutable_options_only = false; + ASSERT_FALSE(mc.AreEquivalent(options, &mc2, &mismatch)); + ASSERT_EQ(mismatch, "immutable"); +} + +TEST_F(CustomizableTest, CustomManagedObjects) { + std::shared_ptr object1, object2; + ASSERT_OK(LoadManagedObject( + config_options_, "id=A_1;int=1;bool=true", &object1)); + ASSERT_NE(object1, nullptr); + ASSERT_OK( + LoadManagedObject(config_options_, "A_1", &object2)); + ASSERT_EQ(object1, object2); + auto* opts = object2->GetOptions("A"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 1); + ASSERT_EQ(opts->b, true); + ASSERT_OK( + LoadManagedObject(config_options_, "A_2", &object2)); + ASSERT_NE(object1, object2); + object1.reset(); + ASSERT_OK(LoadManagedObject( + config_options_, "id=A_1;int=2;bool=false", &object1)); + opts = object1->GetOptions("A"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 
2); + ASSERT_EQ(opts->b, false); +} + +TEST_F(CustomizableTest, CreateManagedObjects) { + class ManagedCustomizable : public Customizable { + public: + static const char* Type() { return "ManagedCustomizable"; } + static const char* kClassName() { return "Managed"; } + const char* Name() const override { return kClassName(); } + std::string GetId() const override { return id_; } + ManagedCustomizable() { id_ = GenerateIndividualId(); } + static Status CreateFromString( + const ConfigOptions& opts, const std::string& value, + std::shared_ptr* result) { + return LoadManagedObject(opts, value, result); + } + + private: + std::string id_; + }; + + config_options_.registry->AddLibrary("Managed") + ->AddFactory( + ObjectLibrary::PatternEntry::AsIndividualId( + ManagedCustomizable::kClassName()), + [](const std::string& /*name*/, + std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new ManagedCustomizable()); + return guard->get(); + }); + + std::shared_ptr mc1, mc2, mc3, obj; + // Create a "deadbeef" customizable + std::string deadbeef = + std::string(ManagedCustomizable::kClassName()) + "@0xdeadbeef#0001"; + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &mc1)); + // Create an object with the base/class name + ASSERT_OK(ManagedCustomizable::CreateFromString( + config_options_, ManagedCustomizable::kClassName(), &mc2)); + // Creating another with the base name returns a different object + ASSERT_OK(ManagedCustomizable::CreateFromString( + config_options_, ManagedCustomizable::kClassName(), &mc3)); + // At this point, there should be 4 managed objects (deadbeef, mc1, 2, and 3) + std::vector> objects; + ASSERT_OK(config_options_.registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 4U); + objects.clear(); + // Three separate object, none of them equal + ASSERT_NE(mc1, mc2); + ASSERT_NE(mc1, mc3); + ASSERT_NE(mc2, mc3); + + // Creating another object with "deadbeef" object + ASSERT_OK( + 
ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj)); + ASSERT_EQ(mc1, obj); + // Create another with the IDs of the instances + ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc1->GetId(), + &obj)); + ASSERT_EQ(mc1, obj); + ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc2->GetId(), + &obj)); + ASSERT_EQ(mc2, obj); + ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc3->GetId(), + &obj)); + ASSERT_EQ(mc3, obj); + + // Now get rid of deadbeef. 2 Objects left (m2+m3) + mc1.reset(); + ASSERT_EQ( + config_options_.registry->GetManagedObject(deadbeef), + nullptr); + ASSERT_OK(config_options_.registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 2U); + objects.clear(); + + // Associate deadbeef with #2 + ASSERT_OK(config_options_.registry->SetManagedObject(deadbeef, mc2)); + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj)); + ASSERT_EQ(mc2, obj); + obj.reset(); + + // Get the ID of mc2 and then reset it. 1 Object left + std::string mc2id = mc2->GetId(); + mc2.reset(); + ASSERT_EQ( + config_options_.registry->GetManagedObject(mc2id), + nullptr); + ASSERT_OK(config_options_.registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 1U); + objects.clear(); + + // Create another object with the old mc2id. 
+ ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, mc2id, &mc2)); + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, mc2id, &obj)); + ASSERT_EQ(mc2, obj); + + // For good measure, create another deadbeef object + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &mc1)); + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj)); + ASSERT_EQ(mc1, obj); +} + +#endif // !ROCKSDB_LITE + +namespace { +class TestSecondaryCache : public SecondaryCache { + public: + static const char* kClassName() { return "Test"; } + const char* Name() const override { return kClassName(); } + Status Insert(const Slice& /*key*/, void* /*value*/, + const Cache::CacheItemHelper* /*helper*/) override { + return Status::NotSupported(); + } + std::unique_ptr Lookup( + const Slice& /*key*/, const Cache::CreateCallback& /*create_cb*/, + bool /*wait*/) override { + return nullptr; + } + void Erase(const Slice& /*key*/) override {} + + // Wait for a collection of handles to become ready + void WaitAll(std::vector /*handles*/) override {} + + std::string GetPrintableOptions() const override { return ""; } +}; + +class TestStatistics : public StatisticsImpl { + public: + TestStatistics() : StatisticsImpl(nullptr) {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Test"; } +}; + +class TestFlushBlockPolicyFactory : public FlushBlockPolicyFactory { + public: + TestFlushBlockPolicyFactory() {} + + static const char* kClassName() { return "TestFlushBlockPolicyFactory"; } + const char* Name() const override { return kClassName(); } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { + return nullptr; + } +}; + +class MockSliceTransform : public SliceTransform { + public: + const char* Name() const override { return kClassName(); } + static const char* 
kClassName() { return "Mock"; } + + Slice Transform(const Slice& /*key*/) const override { return Slice(); } + + bool InDomain(const Slice& /*key*/) const override { return false; } + + bool InRange(const Slice& /*key*/) const override { return false; } +}; + +class MockMemoryAllocator : public BaseMemoryAllocator { + public: + static const char* kClassName() { return "MockMemoryAllocator"; } + const char* Name() const override { return kClassName(); } +}; + +#ifndef ROCKSDB_LITE +class MockEncryptionProvider : public EncryptionProvider { + public: + explicit MockEncryptionProvider(const std::string& id) : id_(id) {} + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + size_t GetPrefixLength() const override { return 0; } + Status CreateNewPrefix(const std::string& /*fname*/, char* /*prefix*/, + size_t /*prefixLength*/) const override { + return Status::NotSupported(); + } + + Status AddCipher(const std::string& /*descriptor*/, const char* /*cipher*/, + size_t /*len*/, bool /*for_write*/) override { + return Status::NotSupported(); + } + + Status CreateCipherStream( + const std::string& /*fname*/, const EnvOptions& /*options*/, + Slice& /*prefix*/, + std::unique_ptr* /*result*/) override { + return Status::NotSupported(); + } + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (EndsWith(id_, "://test")) { + return EncryptionProvider::ValidateOptions(db_opts, cf_opts); + } else { + return Status::InvalidArgument("MockProvider not initialized"); + } + } + + private: + std::string id_; +}; + +class MockCipher : public BlockCipher { + public: + const char* Name() const override { return "Mock"; } + size_t BlockSize() override { return 0; } + Status Encrypt(char* /*data*/) override { return Status::NotSupported(); } + Status Decrypt(char* data) override { return Encrypt(data); } +}; +#endif // ROCKSDB_LITE + +class DummyFileSystem : public 
FileSystemWrapper { + public: + explicit DummyFileSystem(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + static const char* kClassName() { return "DummyFileSystem"; } + const char* Name() const override { return kClassName(); } +}; + +#ifndef ROCKSDB_LITE + +#endif // ROCKSDB_LITE + +class MockTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + private: + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return nullptr; + } + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } +}; + +class MockSstPartitionerFactory : public SstPartitionerFactory { + public: + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override { + return nullptr; + } +}; + +class MockFileChecksumGenFactory : public FileChecksumGenFactory { + public: + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& /*context*/) override { + return nullptr; + } +}; + +class MockRateLimiter : public RateLimiter { + public: + static const char* kClassName() { return "MockRateLimiter"; } + const char* Name() const override { return kClassName(); } + void SetBytesPerSecond(int64_t /*bytes_per_second*/) override {} + int64_t GetBytesPerSecond() const override { return 0; } + int64_t GetSingleBurstBytes() const override { return 0; } + int64_t GetTotalBytesThrough(const Env::IOPriority /*pri*/) const override { + return 0; + } + int64_t GetTotalRequests(const Env::IOPriority /*pri*/) const override { + return 0; + } +}; + +#ifndef ROCKSDB_LITE +static int RegisterLocalObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t 
num_types; + library.AddFactory( + mock::MockTableFactory::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new mock::MockTableFactory()); + return guard->get(); + }); + library.AddFactory( + OnFileDeletionListener::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new OnFileDeletionListener()); + return guard->get(); + }); + library.AddFactory( + FlushCounterListener::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FlushCounterListener()); + return guard->get(); + }); + // Load any locally defined objects here + library.AddFactory( + MockSliceTransform::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSliceTransform()); + return guard->get(); + }); + library.AddFactory( + TestStatistics::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TestStatistics()); + return guard->get(); + }); + + library.AddFactory( + ObjectLibrary::PatternEntry(MockEncryptionProvider::kClassName(), true) + .AddSuffix("://test"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockEncryptionProvider(uri)); + return guard->get(); + }); + library.AddFactory( + "Mock", + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockCipher()); + return guard->get(); + }); + library.AddFactory( + MockMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockMemoryAllocator()); + return guard->get(); + }); + library.AddFactory( + TestFlushBlockPolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + 
guard->reset(new TestFlushBlockPolicyFactory()); + return guard->get(); + }); + + library.AddFactory( + TestSecondaryCache::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TestSecondaryCache()); + return guard->get(); + }); + + library.AddFactory( + DummyFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new DummyFileSystem(nullptr)); + return guard->get(); + }); + + library.AddFactory( + MockSstPartitionerFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSstPartitionerFactory()); + return guard->get(); + }); + + library.AddFactory( + MockFileChecksumGenFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockFileChecksumGenFactory()); + return guard->get(); + }); + + library.AddFactory( + MockTablePropertiesCollectorFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockTablePropertiesCollectorFactory()); + return guard->get(); + }); + + library.AddFactory( + MockRateLimiter::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockRateLimiter()); + return guard->get(); + }); + + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // !ROCKSDB_LITE +} // namespace + +class LoadCustomizableTest : public testing::Test { + public: + LoadCustomizableTest() { + config_options_.ignore_unsupported_options = false; + config_options_.invoke_prepare_options = false; + } + bool RegisterTests(const std::string& arg) { +#ifndef ROCKSDB_LITE + config_options_.registry->AddLibrary("custom-tests", + test::RegisterTestObjects, arg); + config_options_.registry->AddLibrary("local-tests", RegisterLocalObjects, + 
arg); + return true; +#else + (void)arg; + return false; +#endif // !ROCKSDB_LITE + } + + protected: + DBOptions db_opts_; + ColumnFamilyOptions cf_opts_; + ConfigOptions config_options_; +}; + +TEST_F(LoadCustomizableTest, LoadTableFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(TableFactory::CreateFromString( + config_options_, mock::MockTableFactory::kClassName(), &factory)); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, TableFactory::kBlockBasedTableName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), TableFactory::kBlockBasedTableName()); +#ifndef ROCKSDB_LITE + std::string opts_str = "table_factory="; + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options_, cf_opts_, + opts_str + TableFactory::kBlockBasedTableName(), &cf_opts_)); + ASSERT_NE(cf_opts_.table_factory.get(), nullptr); + ASSERT_STREQ(cf_opts_.table_factory->Name(), + TableFactory::kBlockBasedTableName()); +#endif // ROCKSDB_LITE + if (RegisterTests("Test")) { + ASSERT_OK(TableFactory::CreateFromString( + config_options_, mock::MockTableFactory::kClassName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), mock::MockTableFactory::kClassName()); +#ifndef ROCKSDB_LITE + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options_, cf_opts_, + opts_str + mock::MockTableFactory::kClassName(), &cf_opts_)); + ASSERT_NE(cf_opts_.table_factory.get(), nullptr); + ASSERT_STREQ(cf_opts_.table_factory->Name(), + mock::MockTableFactory::kClassName()); +#endif // ROCKSDB_LITE + } +} + +TEST_F(LoadCustomizableTest, LoadFileSystemTest) { + ColumnFamilyOptions cf_opts; + std::shared_ptr result; + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, DummyFileSystem::kClassName(), &result)); + ASSERT_OK(FileSystem::CreateFromString(config_options_, + FileSystem::kDefaultName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_TRUE(result->IsInstanceOf(FileSystem::kDefaultName())); + if (RegisterTests("Test")) { + 
ASSERT_OK(FileSystem::CreateFromString( + config_options_, DummyFileSystem::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), DummyFileSystem::kClassName()); + ASSERT_FALSE(result->IsInstanceOf(FileSystem::kDefaultName())); + } +} + +TEST_F(LoadCustomizableTest, LoadSecondaryCacheTest) { + std::shared_ptr result; + ASSERT_NOK(SecondaryCache::CreateFromString( + config_options_, TestSecondaryCache::kClassName(), &result)); + if (RegisterTests("Test")) { + ASSERT_OK(SecondaryCache::CreateFromString( + config_options_, TestSecondaryCache::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), TestSecondaryCache::kClassName()); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(LoadCustomizableTest, LoadSstPartitionerFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(SstPartitionerFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_OK(SstPartitionerFactory::CreateFromString( + config_options_, SstPartitionerFixedPrefixFactory::kClassName(), + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), SstPartitionerFixedPrefixFactory::kClassName()); + + if (RegisterTests("Test")) { + ASSERT_OK(SstPartitionerFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), "Mock"); + } +} +#endif // ROCKSDB_LITE + +TEST_F(LoadCustomizableTest, LoadChecksumGenFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(FileChecksumGenFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_OK(FileChecksumGenFactory::CreateFromString( + config_options_, FileChecksumGenCrc32cFactory::kClassName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), FileChecksumGenCrc32cFactory::kClassName()); + + if (RegisterTests("Test")) { + ASSERT_OK(FileChecksumGenFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), 
"Mock"); + } +} + +TEST_F(LoadCustomizableTest, LoadTablePropertiesCollectorFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(TablePropertiesCollectorFactory::CreateFromString( + config_options_, MockTablePropertiesCollectorFactory::kClassName(), + &factory)); + if (RegisterTests("Test")) { + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + config_options_, MockTablePropertiesCollectorFactory::kClassName(), + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), + MockTablePropertiesCollectorFactory::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadComparatorTest) { + const Comparator* bytewise = BytewiseComparator(); + const Comparator* reverse = ReverseBytewiseComparator(); + + const Comparator* result = nullptr; + ASSERT_NOK(Comparator::CreateFromString( + config_options_, test::SimpleSuffixReverseComparator::kClassName(), + &result)); + ASSERT_OK( + Comparator::CreateFromString(config_options_, bytewise->Name(), &result)); + ASSERT_EQ(result, bytewise); + ASSERT_OK( + Comparator::CreateFromString(config_options_, reverse->Name(), &result)); + ASSERT_EQ(result, reverse); + + if (RegisterTests("Test")) { + ASSERT_OK(Comparator::CreateFromString( + config_options_, test::SimpleSuffixReverseComparator::kClassName(), + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), + test::SimpleSuffixReverseComparator::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadSliceTransformFactoryTest) { + std::shared_ptr result; + ASSERT_NOK( + SliceTransform::CreateFromString(config_options_, "Mock", &result)); + ASSERT_OK( + SliceTransform::CreateFromString(config_options_, "fixed:16", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("fixed")); + ASSERT_OK(SliceTransform::CreateFromString( + config_options_, "rocksdb.FixedPrefix.22", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("fixed")); + + ASSERT_OK( + 
SliceTransform::CreateFromString(config_options_, "capped:16", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("capped")); + + ASSERT_OK(SliceTransform::CreateFromString( + config_options_, "rocksdb.CappedPrefix.11", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("capped")); + + if (RegisterTests("Test")) { + ASSERT_OK( + SliceTransform::CreateFromString(config_options_, "Mock", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + } +} + +TEST_F(LoadCustomizableTest, LoadStatisticsTest) { + std::shared_ptr stats; + ASSERT_NOK(Statistics::CreateFromString( + config_options_, TestStatistics::kClassName(), &stats)); + ASSERT_OK( + Statistics::CreateFromString(config_options_, "BasicStatistics", &stats)); + ASSERT_NE(stats, nullptr); + ASSERT_EQ(stats->Name(), std::string("BasicStatistics")); +#ifndef ROCKSDB_LITE + ASSERT_NOK(GetDBOptionsFromString(config_options_, db_opts_, + "statistics=Test", &db_opts_)); + ASSERT_OK(GetDBOptionsFromString(config_options_, db_opts_, + "statistics=BasicStatistics", &db_opts_)); + ASSERT_NE(db_opts_.statistics, nullptr); + ASSERT_STREQ(db_opts_.statistics->Name(), "BasicStatistics"); + + if (RegisterTests("test")) { + ASSERT_OK(Statistics::CreateFromString( + config_options_, TestStatistics::kClassName(), &stats)); + ASSERT_NE(stats, nullptr); + ASSERT_STREQ(stats->Name(), TestStatistics::kClassName()); + + ASSERT_OK(GetDBOptionsFromString(config_options_, db_opts_, + "statistics=Test", &db_opts_)); + ASSERT_NE(db_opts_.statistics, nullptr); + ASSERT_STREQ(db_opts_.statistics->Name(), TestStatistics::kClassName()); + + ASSERT_OK(GetDBOptionsFromString( + config_options_, db_opts_, "statistics={id=Test;inner=BasicStatistics}", + &db_opts_)); + ASSERT_NE(db_opts_.statistics, nullptr); + ASSERT_STREQ(db_opts_.statistics->Name(), TestStatistics::kClassName()); + auto* inner = db_opts_.statistics->GetOptions>( + "StatisticsOptions"); + 
ASSERT_NE(inner, nullptr); + ASSERT_NE(inner->get(), nullptr); + ASSERT_STREQ(inner->get()->Name(), "BasicStatistics"); + + ASSERT_OK(Statistics::CreateFromString( + config_options_, "id=BasicStatistics;inner=Test", &stats)); + ASSERT_NE(stats, nullptr); + ASSERT_STREQ(stats->Name(), "BasicStatistics"); + inner = stats->GetOptions>("StatisticsOptions"); + ASSERT_NE(inner, nullptr); + ASSERT_NE(inner->get(), nullptr); + ASSERT_STREQ(inner->get()->Name(), TestStatistics::kClassName()); + } +#endif +} + +TEST_F(LoadCustomizableTest, LoadMemTableRepFactoryTest) { + std::unique_ptr result; + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options_, "SpecialSkipListFactory", &result)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options_, SkipListFactory::kClassName(), &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf(SkipListFactory::kClassName())); + + if (RegisterTests("Test")) { + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options_, "SpecialSkipListFactory", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "SpecialSkipListFactory"); + } +} + +TEST_F(LoadCustomizableTest, LoadMergeOperatorTest) { + std::shared_ptr result; + + ASSERT_NOK( + MergeOperator::CreateFromString(config_options_, "Changling", &result)); + //**TODO: MJR: Use the constants when these names are in public classes + ASSERT_OK(MergeOperator::CreateFromString(config_options_, "put", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "PutOperator"); + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "PutOperator", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "PutOperator"); + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "put_v1", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "PutOperator"); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "uint64add", &result)); + ASSERT_NE(result, 
nullptr); + ASSERT_STREQ(result->Name(), "UInt64AddOperator"); + ASSERT_OK(MergeOperator::CreateFromString(config_options_, + "UInt64AddOperator", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "UInt64AddOperator"); + + ASSERT_OK(MergeOperator::CreateFromString(config_options_, "max", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "MaxOperator"); + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "MaxOperator", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "MaxOperator"); +#ifndef ROCKSDB_LITE + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendOperator::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendOperator::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendOperator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendOperator::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendTESTOperator::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendTESTOperator::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendTESTOperator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendTESTOperator::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString(config_options_, + SortList::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), SortList::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString(config_options_, + SortList::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), SortList::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, BytesXOROperator::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), 
BytesXOROperator::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, BytesXOROperator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), BytesXOROperator::kClassName()); +#endif // ROCKSDB_LITE + ASSERT_NOK( + MergeOperator::CreateFromString(config_options_, "Changling", &result)); + if (RegisterTests("Test")) { + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "Changling", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ChanglingMergeOperator"); + } +} + +TEST_F(LoadCustomizableTest, LoadCompactionFilterFactoryTest) { + std::shared_ptr result; + + ASSERT_NOK(CompactionFilterFactory::CreateFromString(config_options_, + "Changling", &result)); + if (RegisterTests("Test")) { + ASSERT_OK(CompactionFilterFactory::CreateFromString(config_options_, + "Changling", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ChanglingCompactionFilterFactory"); + } +} + +TEST_F(LoadCustomizableTest, LoadCompactionFilterTest) { + const CompactionFilter* result = nullptr; + + ASSERT_NOK(CompactionFilter::CreateFromString(config_options_, "Changling", + &result)); +#ifndef ROCKSDB_LITE + ASSERT_OK(CompactionFilter::CreateFromString( + config_options_, RemoveEmptyValueCompactionFilter::kClassName(), + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), RemoveEmptyValueCompactionFilter::kClassName()); + delete result; + result = nullptr; + if (RegisterTests("Test")) { + ASSERT_OK(CompactionFilter::CreateFromString(config_options_, "Changling", + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ChanglingCompactionFilter"); + delete result; + } +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +TEST_F(LoadCustomizableTest, LoadEventListenerTest) { + std::shared_ptr result; + + ASSERT_NOK(EventListener::CreateFromString( + config_options_, OnFileDeletionListener::kClassName(), &result)); + 
ASSERT_NOK(EventListener::CreateFromString( + config_options_, FlushCounterListener::kClassName(), &result)); + if (RegisterTests("Test")) { + ASSERT_OK(EventListener::CreateFromString( + config_options_, OnFileDeletionListener::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), OnFileDeletionListener::kClassName()); + ASSERT_OK(EventListener::CreateFromString( + config_options_, FlushCounterListener::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushCounterListener::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadEncryptionProviderTest) { + std::shared_ptr result; + ASSERT_NOK( + EncryptionProvider::CreateFromString(config_options_, "Mock", &result)); + ASSERT_OK( + EncryptionProvider::CreateFromString(config_options_, "CTR", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "CTR"); + ASSERT_NOK(result->ValidateOptions(db_opts_, cf_opts_)); + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "CTR://test", + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "CTR"); + ASSERT_OK(result->ValidateOptions(db_opts_, cf_opts_)); + + if (RegisterTests("Test")) { + ASSERT_OK( + EncryptionProvider::CreateFromString(config_options_, "Mock", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, + "Mock://test", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + ASSERT_OK(result->ValidateOptions(db_opts_, cf_opts_)); + } +} + +TEST_F(LoadCustomizableTest, LoadEncryptionCipherTest) { + std::shared_ptr result; + ASSERT_NOK(BlockCipher::CreateFromString(config_options_, "Mock", &result)); + ASSERT_OK(BlockCipher::CreateFromString(config_options_, "ROT13", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ROT13"); + if (RegisterTests("Test")) { + 
ASSERT_OK(BlockCipher::CreateFromString(config_options_, "Mock", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(LoadCustomizableTest, LoadSystemClockTest) { + std::shared_ptr result; + ASSERT_NOK(SystemClock::CreateFromString( + config_options_, MockSystemClock::kClassName(), &result)); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, SystemClock::kDefaultName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_TRUE(result->IsInstanceOf(SystemClock::kDefaultName())); + if (RegisterTests("Test")) { + ASSERT_OK(SystemClock::CreateFromString( + config_options_, MockSystemClock::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), MockSystemClock::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadMemoryAllocatorTest) { + std::shared_ptr result; + ASSERT_NOK(MemoryAllocator::CreateFromString( + config_options_, MockMemoryAllocator::kClassName(), &result)); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, DefaultMemoryAllocator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), DefaultMemoryAllocator::kClassName()); + if (RegisterTests("Test")) { + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, MockMemoryAllocator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), MockMemoryAllocator::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadRateLimiterTest) { + std::shared_ptr result; + ASSERT_NOK(RateLimiter::CreateFromString( + config_options_, MockRateLimiter::kClassName(), &result)); + ASSERT_OK(RateLimiter::CreateFromString( + config_options_, std::string(GenericRateLimiter::kClassName()) + ":1234", + &result)); + ASSERT_NE(result, nullptr); +#ifndef ROCKSDB_LITE + ASSERT_OK(RateLimiter::CreateFromString( + config_options_, GenericRateLimiter::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_OK(GetDBOptionsFromString( 
+ config_options_, db_opts_, + std::string("rate_limiter=") + GenericRateLimiter::kClassName(), + &db_opts_)); + ASSERT_NE(db_opts_.rate_limiter, nullptr); + if (RegisterTests("Test")) { + ASSERT_OK(RateLimiter::CreateFromString( + config_options_, MockRateLimiter::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_OK(GetDBOptionsFromString( + config_options_, db_opts_, + std::string("rate_limiter=") + MockRateLimiter::kClassName(), + &db_opts_)); + ASSERT_NE(db_opts_.rate_limiter, nullptr); + } +#endif // ROCKSDB_LITE +} + +TEST_F(LoadCustomizableTest, LoadFlushBlockPolicyFactoryTest) { + std::shared_ptr table; + std::shared_ptr result; + ASSERT_NOK(FlushBlockPolicyFactory::CreateFromString( + config_options_, TestFlushBlockPolicyFactory::kClassName(), &result)); + + ASSERT_OK( + FlushBlockPolicyFactory::CreateFromString(config_options_, "", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushBlockBySizePolicyFactory::kClassName()); + + ASSERT_OK(FlushBlockPolicyFactory::CreateFromString( + config_options_, FlushBlockEveryKeyPolicyFactory::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushBlockEveryKeyPolicyFactory::kClassName()); + + ASSERT_OK(FlushBlockPolicyFactory::CreateFromString( + config_options_, FlushBlockBySizePolicyFactory::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushBlockBySizePolicyFactory::kClassName()); +#ifndef ROCKSDB_LITE + std::string table_opts = "id=BlockBasedTable; flush_block_policy_factory="; + ASSERT_OK(TableFactory::CreateFromString( + config_options_, + table_opts + FlushBlockEveryKeyPolicyFactory::kClassName(), &table)); + auto bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->flush_block_policy_factory.get(), nullptr); + ASSERT_STREQ(bbto->flush_block_policy_factory->Name(), + FlushBlockEveryKeyPolicyFactory::kClassName()); + if (RegisterTests("Test")) { + 
ASSERT_OK(FlushBlockPolicyFactory::CreateFromString( + config_options_, TestFlushBlockPolicyFactory::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), TestFlushBlockPolicyFactory::kClassName()); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + TestFlushBlockPolicyFactory::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->flush_block_policy_factory.get(), nullptr); + ASSERT_STREQ(bbto->flush_block_policy_factory->Name(), + TestFlushBlockPolicyFactory::kClassName()); + } +#endif // ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,16 +7,650 @@ #include -#include "db/version_edit.h" #include "logging/logging.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "options/options_parser.h" #include "port/port.h" -#include "rocksdb/cache.h" +#include "rocksdb/configurable.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/options_type.h" #include "rocksdb/wal_filter.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map + wal_recovery_mode_string_map = { + 
{"kTolerateCorruptedTailRecords", + WALRecoveryMode::kTolerateCorruptedTailRecords}, + {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency}, + {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery}, + {"kSkipAnyCorruptedRecords", + WALRecoveryMode::kSkipAnyCorruptedRecords}}; + +static std::unordered_map + access_hint_string_map = {{"NONE", DBOptions::AccessHint::NONE}, + {"NORMAL", DBOptions::AccessHint::NORMAL}, + {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, + {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; + +static std::unordered_map cache_tier_string_map = { + {"kVolatileTier", CacheTier::kVolatileTier}, + {"kNonVolatileBlockTier", CacheTier::kNonVolatileBlockTier}}; + +static std::unordered_map info_log_level_string_map = + {{"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL}, + {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL}, + {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL}, + {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL}, + {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, + {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; + +static std::unordered_map + db_mutable_options_type_info = { + {"allow_os_buffer", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_background_jobs", + {offsetof(struct MutableDBOptions, max_background_jobs), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_background_compactions", + {offsetof(struct MutableDBOptions, max_background_compactions), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"base_background_compactions", + {offsetof(struct MutableDBOptions, base_background_compactions), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_subcompactions", + {offsetof(struct MutableDBOptions, max_subcompactions), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"avoid_flush_during_shutdown", + {offsetof(struct 
MutableDBOptions, avoid_flush_during_shutdown), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"writable_file_max_buffer_size", + {offsetof(struct MutableDBOptions, writable_file_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"delayed_write_rate", + {offsetof(struct MutableDBOptions, delayed_write_rate), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_total_wal_size", + {offsetof(struct MutableDBOptions, max_total_wal_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"delete_obsolete_files_period_micros", + {offsetof(struct MutableDBOptions, + delete_obsolete_files_period_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_dump_period_sec", + {offsetof(struct MutableDBOptions, stats_dump_period_sec), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_persist_period_sec", + {offsetof(struct MutableDBOptions, stats_persist_period_sec), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_history_buffer_size", + {offsetof(struct MutableDBOptions, stats_history_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_open_files", + {offsetof(struct MutableDBOptions, max_open_files), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bytes_per_sync", + {offsetof(struct MutableDBOptions, bytes_per_sync), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"wal_bytes_per_sync", + {offsetof(struct MutableDBOptions, wal_bytes_per_sync), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"strict_bytes_per_sync", + {offsetof(struct MutableDBOptions, strict_bytes_per_sync), + 
OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compaction_readahead_size", + {offsetof(struct MutableDBOptions, compaction_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_background_flushes", + {offsetof(struct MutableDBOptions, max_background_flushes), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + db_immutable_options_type_info = { + /* + // not yet supported + std::shared_ptr row_cache; + std::shared_ptr delete_scheduler; + std::shared_ptr info_log; + std::shared_ptr rate_limiter; + std::shared_ptr statistics; + std::vector db_paths; + FileTypeSet checksum_handoff_file_types; + */ + {"advise_random_on_open", + {offsetof(struct ImmutableDBOptions, advise_random_on_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_mmap_reads", + {offsetof(struct ImmutableDBOptions, allow_mmap_reads), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_fallocate", + {offsetof(struct ImmutableDBOptions, allow_fallocate), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_mmap_writes", + {offsetof(struct ImmutableDBOptions, allow_mmap_writes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_direct_reads", + {offsetof(struct ImmutableDBOptions, use_direct_reads), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_direct_writes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"use_direct_io_for_flush_and_compaction", + {offsetof(struct ImmutableDBOptions, + use_direct_io_for_flush_and_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_2pc", + {offsetof(struct ImmutableDBOptions, 
allow_2pc), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"wal_filter", + OptionTypeInfo::AsCustomRawPtr( + offsetof(struct ImmutableDBOptions, wal_filter), + OptionVerificationType::kByName, + (OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever))}, + {"create_if_missing", + {offsetof(struct ImmutableDBOptions, create_if_missing), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"create_missing_column_families", + {offsetof(struct ImmutableDBOptions, create_missing_column_families), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"disableDataSync", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"disable_data_sync", // for compatibility + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"enable_thread_tracking", + {offsetof(struct ImmutableDBOptions, enable_thread_tracking), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"error_if_exists", + {offsetof(struct ImmutableDBOptions, error_if_exists), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"experimental_allow_mempurge", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"experimental_mempurge_policy", + {0, OptionType::kString, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"experimental_mempurge_threshold", + {offsetof(struct ImmutableDBOptions, experimental_mempurge_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_fd_close_on_exec", + {offsetof(struct ImmutableDBOptions, is_fd_close_on_exec), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_checks", + {offsetof(struct ImmutableDBOptions, paranoid_checks), + OptionType::kBoolean, 
OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"flush_verify_memtable_count", + {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"track_and_verify_wals_in_manifest", + {offsetof(struct ImmutableDBOptions, + track_and_verify_wals_in_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"skip_log_error_on_recovery", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"skip_stats_update_on_db_open", + {offsetof(struct ImmutableDBOptions, skip_stats_update_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"skip_checking_sst_file_sizes_on_db_open", + {offsetof(struct ImmutableDBOptions, + skip_checking_sst_file_sizes_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"new_table_reader_for_compaction_inputs", + {offsetof(struct ImmutableDBOptions, + new_table_reader_for_compaction_inputs), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"random_access_max_buffer_size", + {offsetof(struct ImmutableDBOptions, random_access_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_adaptive_mutex", + {offsetof(struct ImmutableDBOptions, use_adaptive_mutex), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_fsync", + {offsetof(struct ImmutableDBOptions, use_fsync), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"max_file_opening_threads", + {offsetof(struct ImmutableDBOptions, max_file_opening_threads), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"table_cache_numshardbits", + {offsetof(struct ImmutableDBOptions, table_cache_numshardbits), + OptionType::kInt, 
OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_write_buffer_size", + {offsetof(struct ImmutableDBOptions, db_write_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"keep_log_file_num", + {offsetof(struct ImmutableDBOptions, keep_log_file_num), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"recycle_log_file_num", + {offsetof(struct ImmutableDBOptions, recycle_log_file_num), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_file_time_to_roll", + {offsetof(struct ImmutableDBOptions, log_file_time_to_roll), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"manifest_preallocation_size", + {offsetof(struct ImmutableDBOptions, manifest_preallocation_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_log_file_size", + {offsetof(struct ImmutableDBOptions, max_log_file_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_log_dir", + {offsetof(struct ImmutableDBOptions, db_log_dir), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"wal_dir", + {offsetof(struct ImmutableDBOptions, wal_dir), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"WAL_size_limit_MB", + {offsetof(struct ImmutableDBOptions, WAL_size_limit_MB), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"WAL_ttl_seconds", + {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_manifest_file_size", + {offsetof(struct ImmutableDBOptions, max_manifest_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"persist_stats_to_disk", + {offsetof(struct ImmutableDBOptions, persist_stats_to_disk), + 
OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"fail_if_options_file_error", + {offsetof(struct ImmutableDBOptions, fail_if_options_file_error), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"enable_pipelined_write", + {offsetof(struct ImmutableDBOptions, enable_pipelined_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"unordered_write", + {offsetof(struct ImmutableDBOptions, unordered_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_concurrent_memtable_write", + {offsetof(struct ImmutableDBOptions, allow_concurrent_memtable_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"wal_recovery_mode", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, wal_recovery_mode), + &wal_recovery_mode_string_map)}, + {"enable_write_thread_adaptive_yield", + {offsetof(struct ImmutableDBOptions, + enable_write_thread_adaptive_yield), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_thread_slow_yield_usec", + {offsetof(struct ImmutableDBOptions, write_thread_slow_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_write_batch_group_size_bytes", + {offsetof(struct ImmutableDBOptions, max_write_batch_group_size_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_thread_max_yield_usec", + {offsetof(struct ImmutableDBOptions, write_thread_max_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"access_hint_on_compaction_start", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, + access_hint_on_compaction_start), + &access_hint_string_map)}, + {"info_log_level", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, info_log_level), + 
&info_log_level_string_map)}, + {"dump_malloc_stats", + {offsetof(struct ImmutableDBOptions, dump_malloc_stats), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"avoid_flush_during_recovery", + {offsetof(struct ImmutableDBOptions, avoid_flush_during_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_ingest_behind", + {offsetof(struct ImmutableDBOptions, allow_ingest_behind), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"preserve_deletes", + {offsetof(struct ImmutableDBOptions, preserve_deletes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"concurrent_prepare", // Deprecated by two_write_queues + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"two_write_queues", + {offsetof(struct ImmutableDBOptions, two_write_queues), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"manual_wal_flush", + {offsetof(struct ImmutableDBOptions, manual_wal_flush), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"seq_per_batch", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"atomic_flush", + {offsetof(struct ImmutableDBOptions, atomic_flush), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"avoid_unnecessary_blocking_io", + {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_dbid_to_manifest", + {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_readahead_size", + {offsetof(struct ImmutableDBOptions, log_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + 
OptionTypeFlags::kNone}}, + {"best_efforts_recovery", + {offsetof(struct ImmutableDBOptions, best_efforts_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_bgerror_resume_count", + {offsetof(struct ImmutableDBOptions, max_bgerror_resume_count), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bgerror_resume_retry_interval", + {offsetof(struct ImmutableDBOptions, bgerror_resume_retry_interval), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_host_id", + {offsetof(struct ImmutableDBOptions, db_host_id), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}}, + {"rate_limiter", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableDBOptions, rate_limiter), + OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kAllowNull)}, + + // The following properties were handled as special cases in ParseOption + // This means that the properties could be read from the options file + // but never written to the file or compared to each other. 
+ {"rate_limiter_bytes_per_sec", + {offsetof(struct ImmutableDBOptions, rate_limiter), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), + // Parse the input value as a RateLimiter + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto limiter = static_cast*>(addr); + limiter->reset(NewGenericRateLimiter( + static_cast(ParseUint64(value)))); + return Status::OK(); + }}}, + {"env", + {offsetof(struct ImmutableDBOptions, env), OptionType::kUnknown, + OptionVerificationType::kNormal, + (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), + // Parse the input value as an Env + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto old_env = static_cast(addr); // Get the old value + Env* new_env = *old_env; // Set new to old + Status s = Env::CreateFromString(opts, value, + &new_env); // Update new value + if (s.ok()) { // It worked + *old_env = new_env; // Update the old one + } + return s; + }}}, + {"allow_data_in_errors", + {offsetof(struct ImmutableDBOptions, allow_data_in_errors), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_checksum_gen_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableDBOptions, file_checksum_gen_factory), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kAllowNull)}, + {"statistics", + OptionTypeInfo::AsCustomSharedPtr( + // Statistics should not be compared and can be null + // Statistics are maked "don't serialize" until they can be shared + // between DBs + offsetof(struct ImmutableDBOptions, statistics), + OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize | + OptionTypeFlags::kAllowNull)}, + // Allow EventListeners that have a non-empty Name() to be read/written + // as options Each listener will either 
be + // - A simple name (e.g. "MyEventListener") + // - A name with properties (e.g. "{id=MyListener1; timeout=60}" + // Multiple listeners will be separated by a ":": + // - "MyListener0;{id=MyListener1; timeout=60} + {"listeners", + {offsetof(struct ImmutableDBOptions, listeners), OptionType::kVector, + OptionVerificationType::kByNameAllowNull, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + ConfigOptions embedded = opts; + embedded.ignore_unsupported_options = true; + std::vector> listeners; + Status s; + for (size_t start = 0, end = 0; + s.ok() && start < value.size() && end != std::string::npos; + start = end + 1) { + std::string token; + s = OptionTypeInfo::NextToken(value, ':', start, &end, &token); + if (s.ok() && !token.empty()) { + std::shared_ptr listener; + s = EventListener::CreateFromString(embedded, token, &listener); + if (s.ok() && listener != nullptr) { + listeners.push_back(listener); + } + } + } + if (s.ok()) { // It worked + *(static_cast>*>( + addr)) = listeners; + } + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto listeners = + static_cast>*>( + addr); + ConfigOptions embedded = opts; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& listener : *listeners) { + auto id = listener->GetId(); + if (!id.empty()) { + std::string elem_str = listener->ToString(embedded, ""); + if (printed++ == 0) { + value->append("{"); + } else { + value->append(":"); + } + value->append(elem_str); + } + } + if (printed > 0) { + value->append("}"); + } + return Status::OK(); + }, + nullptr}}, + {"lowest_used_cache_tier", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, lowest_used_cache_tier), + &cache_tier_string_map, OptionTypeFlags::kNone)}, +}; + +const std::string OptionsHelper::kDBOptionsName = "DBOptions"; + +class MutableDBConfigurable : public Configurable 
{ + public: + explicit MutableDBConfigurable( + const MutableDBOptions& mdb, + const std::unordered_map* map = nullptr) + : mutable_(mdb), opt_map_(map) { + RegisterOptions(&mutable_, &db_mutable_options_type_info); + } + + bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const override { + bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr, + that_ptr, mismatch); + if (!equals && opt_info.IsByName()) { + if (opt_map_ == nullptr) { + equals = true; + } else { + const auto& iter = opt_map_->find(opt_name); + if (iter == opt_map_->end()) { + equals = true; + } else { + equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr, + iter->second); + } + } + if (equals) { // False alarm, clear mismatch + *mismatch = ""; + } + } + if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) { + const auto* this_config = opt_info.AsRawPointer(this_ptr); + if (this_config == nullptr) { + const auto& iter = opt_map_->find(opt_name); + // If the name exists in the map and is not empty/null, + // then the this_config should be set. + if (iter != opt_map_->end() && !iter->second.empty() && + iter->second != kNullptrString) { + *mismatch = opt_name; + equals = false; + } + } + } + return equals; + } + + protected: + MutableDBOptions mutable_; + const std::unordered_map* opt_map_; +}; + +class DBOptionsConfigurable : public MutableDBConfigurable { + public: + explicit DBOptionsConfigurable( + const DBOptions& opts, + const std::unordered_map* map = nullptr) + : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) { + // The ImmutableDBOptions currently requires the env to be non-null. 
Make + // sure it is + if (opts.env != nullptr) { + immutable_ = ImmutableDBOptions(opts); + } else { + DBOptions copy = opts; + copy.env = Env::Default(); + immutable_ = ImmutableDBOptions(copy); + } + RegisterOptions(&immutable_, &db_immutable_options_type_info); + } + + protected: + Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) override { + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); + if (s.ok()) { + db_options_ = BuildDBOptions(immutable_, mutable_); + s = PrepareOptions(config_options); + } + return s; + } + + const void* GetOptionsPtr(const std::string& name) const override { + if (name == OptionsHelper::kDBOptionsName) { + return &db_options_; + } else { + return MutableDBConfigurable::GetOptionsPtr(name); + } + } + + private: + ImmutableDBOptions immutable_; + DBOptions db_options_; +}; + +std::unique_ptr DBOptionsAsConfigurable( + const MutableDBOptions& opts) { + std::unique_ptr ptr(new MutableDBConfigurable(opts)); + return ptr; +} +std::unique_ptr DBOptionsAsConfigurable( + const DBOptions& opts, + const std::unordered_map* opt_map) { + std::unique_ptr ptr(new DBOptionsConfigurable(opts, opt_map)); + return ptr; +} +#endif // ROCKSDB_LITE ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {} @@ -25,8 +659,10 @@ create_missing_column_families(options.create_missing_column_families), error_if_exists(options.error_if_exists), paranoid_checks(options.paranoid_checks), + flush_verify_memtable_count(options.flush_verify_memtable_count), + track_and_verify_wals_in_manifest( + options.track_and_verify_wals_in_manifest), env(options.env), - fs(options.file_system), rate_limiter(options.rate_limiter), sst_file_manager(options.sst_file_manager), info_log(options.info_log), @@ -37,16 +673,14 @@ db_paths(options.db_paths), db_log_dir(options.db_log_dir), wal_dir(options.wal_dir), - 
max_subcompactions(options.max_subcompactions), - max_background_flushes(options.max_background_flushes), max_log_file_size(options.max_log_file_size), log_file_time_to_roll(options.log_file_time_to_roll), keep_log_file_num(options.keep_log_file_num), recycle_log_file_num(options.recycle_log_file_num), max_manifest_file_size(options.max_manifest_file_size), table_cache_numshardbits(options.table_cache_numshardbits), - wal_ttl_seconds(options.WAL_ttl_seconds), - wal_size_limit_mb(options.WAL_size_limit_MB), + WAL_ttl_seconds(options.WAL_ttl_seconds), + WAL_size_limit_MB(options.WAL_size_limit_MB), max_write_batch_group_size_bytes( options.max_write_batch_group_size_bytes), manifest_preallocation_size(options.manifest_preallocation_size), @@ -58,6 +692,7 @@ allow_fallocate(options.allow_fallocate), is_fd_close_on_exec(options.is_fd_close_on_exec), advise_random_on_open(options.advise_random_on_open), + experimental_mempurge_threshold(options.experimental_mempurge_threshold), db_write_buffer_size(options.db_write_buffer_size), write_buffer_manager(options.write_buffer_manager), access_hint_on_compaction_start(options.access_hint_on_compaction_start), @@ -95,7 +730,24 @@ persist_stats_to_disk(options.persist_stats_to_disk), write_dbid_to_manifest(options.write_dbid_to_manifest), log_readahead_size(options.log_readahead_size), - sst_file_checksum_func(options.sst_file_checksum_func) { + file_checksum_gen_factory(options.file_checksum_gen_factory), + best_efforts_recovery(options.best_efforts_recovery), + max_bgerror_resume_count(options.max_bgerror_resume_count), + bgerror_resume_retry_interval(options.bgerror_resume_retry_interval), + allow_data_in_errors(options.allow_data_in_errors), + db_host_id(options.db_host_id), + checksum_handoff_file_types(options.checksum_handoff_file_types), + lowest_used_cache_tier(options.lowest_used_cache_tier), + compaction_service(options.compaction_service) { + stats = statistics.get(); + fs = env->GetFileSystem(); + if (env != 
nullptr) { + clock = env->GetSystemClock().get(); + } else { + clock = SystemClock::Default().get(); + } + logger = info_log.get(); + stats = statistics.get(); } void ImmutableDBOptions::Dump(Logger* log) const { @@ -105,6 +757,12 @@ create_if_missing); ROCKS_LOG_HEADER(log, " Options.paranoid_checks: %d", paranoid_checks); + ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d", + flush_verify_memtable_count); + ROCKS_LOG_HEADER(log, + " " + "Options.track_and_verify_wals_in_manifest: %d", + track_and_verify_wals_in_manifest); ROCKS_LOG_HEADER(log, " Options.env: %p", env); ROCKS_LOG_HEADER(log, " Options.fs: %s", @@ -114,7 +772,7 @@ ROCKS_LOG_HEADER(log, " Options.max_file_opening_threads: %d", max_file_opening_threads); ROCKS_LOG_HEADER(log, " Options.statistics: %p", - statistics.get()); + stats); ROCKS_LOG_HEADER(log, " Options.use_fsync: %d", use_fsync); ROCKS_LOG_HEADER( @@ -153,16 +811,11 @@ ROCKS_LOG_HEADER(log, " Options.table_cache_numshardbits: %d", table_cache_numshardbits); ROCKS_LOG_HEADER(log, - " Options.max_subcompactions: %" PRIu32, - max_subcompactions); - ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d", - max_background_flushes); - ROCKS_LOG_HEADER(log, " Options.WAL_ttl_seconds: %" PRIu64, - wal_ttl_seconds); + WAL_ttl_seconds); ROCKS_LOG_HEADER(log, " Options.WAL_size_limit_MB: %" PRIu64, - wal_size_limit_mb); + WAL_size_limit_MB); ROCKS_LOG_HEADER(log, " " "Options.max_write_batch_group_size_bytes: %" PRIu64, @@ -175,6 +828,9 @@ ROCKS_LOG_HEADER(log, " Options.advise_random_on_open: %d", advise_random_on_open); ROCKS_LOG_HEADER( + log, " Options.experimental_mempurge_threshold: %f", + experimental_mempurge_threshold); + ROCKS_LOG_HEADER( log, " Options.db_write_buffer_size: %" ROCKSDB_PRIszt, db_write_buffer_size); ROCKS_LOG_HEADER(log, " Options.write_buffer_manager: %p", @@ -246,16 +902,62 @@ ROCKS_LOG_HEADER( log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, log_readahead_size); - ROCKS_LOG_HEADER(log, " 
Options.sst_file_checksum_func: %s", - sst_file_checksum_func - ? sst_file_checksum_func->Name() - : kUnknownFileChecksumFuncName.c_str()); + ROCKS_LOG_HEADER(log, " Options.file_checksum_gen_factory: %s", + file_checksum_gen_factory ? file_checksum_gen_factory->Name() + : kUnknownFileChecksumFuncName); + ROCKS_LOG_HEADER(log, " Options.best_efforts_recovery: %d", + static_cast(best_efforts_recovery)); + ROCKS_LOG_HEADER(log, " Options.max_bgerror_resume_count: %d", + max_bgerror_resume_count); + ROCKS_LOG_HEADER(log, + " Options.bgerror_resume_retry_interval: %" PRIu64, + bgerror_resume_retry_interval); + ROCKS_LOG_HEADER(log, " Options.allow_data_in_errors: %d", + allow_data_in_errors); + ROCKS_LOG_HEADER(log, " Options.db_host_id: %s", + db_host_id.c_str()); +} + +bool ImmutableDBOptions::IsWalDirSameAsDBPath() const { + assert(!db_paths.empty()); + return IsWalDirSameAsDBPath(db_paths[0].path); +} + +bool ImmutableDBOptions::IsWalDirSameAsDBPath( + const std::string& db_path) const { + bool same = wal_dir.empty(); + if (!same) { + Status s = env->AreFilesSame(wal_dir, db_path, &same); + if (s.IsNotSupported()) { + same = wal_dir == db_path; + } + } + return same; +} + +const std::string& ImmutableDBOptions::GetWalDir() const { + if (wal_dir.empty()) { + assert(!db_paths.empty()); + return db_paths[0].path; + } else { + return wal_dir; + } +} + +const std::string& ImmutableDBOptions::GetWalDir( + const std::string& path) const { + if (wal_dir.empty()) { + return path; + } else { + return wal_dir; + } } MutableDBOptions::MutableDBOptions() : max_background_jobs(2), base_background_compactions(-1), max_background_compactions(-1), + max_subcompactions(0), avoid_flush_during_shutdown(false), writable_file_max_buffer_size(1024 * 1024), delayed_write_rate(2 * 1024U * 1024U), @@ -268,12 +970,14 @@ bytes_per_sync(0), wal_bytes_per_sync(0), strict_bytes_per_sync(false), - compaction_readahead_size(0) {} + compaction_readahead_size(0), + max_background_flushes(-1) {} 
MutableDBOptions::MutableDBOptions(const DBOptions& options) : max_background_jobs(options.max_background_jobs), base_background_compactions(options.base_background_compactions), max_background_compactions(options.max_background_compactions), + max_subcompactions(options.max_subcompactions), avoid_flush_during_shutdown(options.avoid_flush_during_shutdown), writable_file_max_buffer_size(options.writable_file_max_buffer_size), delayed_write_rate(options.delayed_write_rate), @@ -287,13 +991,16 @@ bytes_per_sync(options.bytes_per_sync), wal_bytes_per_sync(options.wal_bytes_per_sync), strict_bytes_per_sync(options.strict_bytes_per_sync), - compaction_readahead_size(options.compaction_readahead_size) {} + compaction_readahead_size(options.compaction_readahead_size), + max_background_flushes(options.max_background_flushes) {} void MutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.max_background_jobs: %d", max_background_jobs); ROCKS_LOG_HEADER(log, " Options.max_background_compactions: %d", max_background_compactions); + ROCKS_LOG_HEADER(log, " Options.max_subcompactions: %" PRIu32, + max_subcompactions); ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d", avoid_flush_during_shutdown); ROCKS_LOG_HEADER( @@ -328,6 +1035,40 @@ ROCKS_LOG_HEADER(log, " Options.compaction_readahead_size: %" ROCKSDB_PRIszt, compaction_readahead_size); + ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d", + max_background_flushes); +} + +#ifndef ROCKSDB_LITE +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, db_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; } +bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options, + 
const MutableDBOptions& that_options) { + ConfigOptions config_options; + std::string mismatch; + return OptionTypeInfo::StructsAreEqual( + config_options, "MutableDBOptions", &db_mutable_options_type_info, + "MutableDBOptions", &this_options, &that_options, &mismatch); +} + +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string) { + return OptionTypeInfo::SerializeType( + config_options, db_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,8 +11,10 @@ #include "rocksdb/options.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; struct ImmutableDBOptions { + static const char* kName() { return "ImmutableDBOptions"; } ImmutableDBOptions(); explicit ImmutableDBOptions(const DBOptions& options); @@ -22,8 +24,9 @@ bool create_missing_column_families; bool error_if_exists; bool paranoid_checks; + bool flush_verify_memtable_count; + bool track_and_verify_wals_in_manifest; Env* env; - std::shared_ptr fs; std::shared_ptr rate_limiter; std::shared_ptr sst_file_manager; std::shared_ptr info_log; @@ -33,17 +36,18 @@ bool use_fsync; std::vector db_paths; std::string db_log_dir; + // The wal_dir option from the file. To determine the + // directory in use, the GetWalDir or IsWalDirSameAsDBPath + // methods should be used instead of accessing this variable directly. 
std::string wal_dir; - uint32_t max_subcompactions; - int max_background_flushes; size_t max_log_file_size; size_t log_file_time_to_roll; size_t keep_log_file_num; size_t recycle_log_file_num; uint64_t max_manifest_file_size; int table_cache_numshardbits; - uint64_t wal_ttl_seconds; - uint64_t wal_size_limit_mb; + uint64_t WAL_ttl_seconds; + uint64_t WAL_size_limit_MB; uint64_t max_write_batch_group_size_bytes; size_t manifest_preallocation_size; bool allow_mmap_reads; @@ -53,6 +57,7 @@ bool allow_fallocate; bool is_fd_close_on_exec; bool advise_random_on_open; + double experimental_mempurge_threshold; size_t db_write_buffer_size; std::shared_ptr write_buffer_manager; DBOptions::AccessHint access_hint_on_compaction_start; @@ -87,12 +92,30 @@ bool persist_stats_to_disk; bool write_dbid_to_manifest; size_t log_readahead_size; - std::shared_ptr sst_file_checksum_func; + std::shared_ptr file_checksum_gen_factory; + bool best_efforts_recovery; + int max_bgerror_resume_count; + uint64_t bgerror_resume_retry_interval; + bool allow_data_in_errors; + std::string db_host_id; + FileTypeSet checksum_handoff_file_types; + CacheTier lowest_used_cache_tier; + // Convenience/Helper objects that are not part of the base DBOptions + std::shared_ptr fs; + SystemClock* clock; + Statistics* stats; + Logger* logger; + std::shared_ptr compaction_service; + + bool IsWalDirSameAsDBPath() const; + bool IsWalDirSameAsDBPath(const std::string& path) const; + const std::string& GetWalDir() const; + const std::string& GetWalDir(const std::string& path) const; }; struct MutableDBOptions { + static const char* kName() { return "MutableDBOptions"; } MutableDBOptions(); - explicit MutableDBOptions(const MutableDBOptions& options) = default; explicit MutableDBOptions(const DBOptions& options); void Dump(Logger* log) const; @@ -100,6 +123,7 @@ int max_background_jobs; int base_background_compactions; int max_background_compactions; + uint32_t max_subcompactions; bool avoid_flush_during_shutdown; 
size_t writable_file_max_buffer_size; uint64_t delayed_write_rate; @@ -113,6 +137,21 @@ uint64_t wal_bytes_per_sync; bool strict_bytes_per_sync; size_t compaction_readahead_size; + int max_background_flushes; }; +#ifndef ROCKSDB_LITE +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options); + +bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options, + const MutableDBOptions& that_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include #include +#include "logging/logging.h" #include "monitoring/statistics.h" #include "options/db_options.h" #include "options/options_helper.h" @@ -19,11 +20,13 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/sst_partitioner.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/wal_filter.h" @@ -87,7 +90,17 @@ report_bg_io_stats(options.report_bg_io_stats), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), - sample_for_compression(options.sample_for_compression) { + sample_for_compression(options.sample_for_compression), + enable_blob_files(options.enable_blob_files), + 
min_blob_size(options.min_blob_size), + blob_file_size(options.blob_file_size), + blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold), + blob_compaction_readahead_size(options.blob_compaction_readahead_size) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -122,12 +135,15 @@ ROCKS_LOG_HEADER( log, " Options.compaction_filter_factory: %s", compaction_filter_factory ? compaction_filter_factory->Name() : "None"); + ROCKS_LOG_HEADER( + log, " Options.sst_partitioner_factory: %s", + sst_partitioner_factory ? sst_partitioner_factory->Name() : "None"); ROCKS_LOG_HEADER(log, " Options.memtable_factory: %s", memtable_factory->Name()); ROCKS_LOG_HEADER(log, " Options.table_factory: %s", table_factory->Name()); ROCKS_LOG_HEADER(log, " table_factory options: %s", - table_factory->GetPrintableTableOptions().c_str()); + table_factory->GetPrintableOptions().c_str()); ROCKS_LOG_HEADER(log, " Options.write_buffer_size: %" ROCKSDB_PRIszt, write_buffer_size); ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number: %d", @@ -183,8 +199,18 @@ "%" PRIu32, bottommost_compression_opts.zstd_max_train_bytes); ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.parallel_threads: " + "%" PRIu32, + bottommost_compression_opts.parallel_threads); + ROCKS_LOG_HEADER( log, " Options.bottommost_compression_opts.enabled: %s", bottommost_compression_opts.enabled ? 
"true" : "false"); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + bottommost_compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.compression_opts.window_bits: %d", compression_opts.window_bits); ROCKS_LOG_HEADER(log, " Options.compression_opts.level: %d", @@ -200,8 +226,16 @@ "%" PRIu32, compression_opts.zstd_max_train_bytes); ROCKS_LOG_HEADER(log, + " Options.compression_opts.parallel_threads: " + "%" PRIu32, + compression_opts.parallel_threads); + ROCKS_LOG_HEADER(log, " Options.compression_opts.enabled: %s", compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.level0_file_num_compaction_trigger: %d", level0_file_num_compaction_trigger); ROCKS_LOG_HEADER(log, " Options.level0_slowdown_writes_trigger: %d", @@ -310,14 +344,13 @@ ROCKS_LOG_HEADER(log, "Options.compaction_options_fifo.allow_compaction: %d", compaction_options_fifo.allow_compaction); - std::string collector_names; + std::ostringstream collector_info; for (const auto& collector_factory : table_properties_collector_factories) { - collector_names.append(collector_factory->Name()); - collector_names.append("; "); + collector_info << collector_factory->ToString() << ';'; } ROCKS_LOG_HEADER( log, " Options.table_properties_collectors: %s", - collector_names.c_str()); + collector_info.str().c_str()); ROCKS_LOG_HEADER(log, " Options.inplace_update_support: %d", inplace_update_support); @@ -357,6 +390,25 @@ ROCKS_LOG_HEADER(log, " Options.periodic_compaction_seconds: %" PRIu64, periodic_compaction_seconds); + ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s", + enable_blob_files ? 
"true" : "false"); + ROCKS_LOG_HEADER( + log, " Options.min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_HEADER( + log, " Options.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", + CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? "true" : "false"); + ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); + ROCKS_LOG_HEADER(log, "Options.blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); + ROCKS_LOG_HEADER( + log, " Options.blob_compaction_readahead_size: %" PRIu64, + blob_compaction_readahead_size); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { @@ -422,6 +474,19 @@ return this; } +Options* Options::DisableExtraChecks() { + // See https://github.com/facebook/rocksdb/issues/9354 + force_consistency_checks = false; + // Considered but no clear performance impact seen: + // * check_flush_compaction_key_order + // * paranoid_checks + // * flush_verify_memtable_count + // By current API contract, not including + // * verify_checksums + // because checking storage data integrity is a more standard practice. 
+ return this; +} + Options* Options::OldDefaults(int rocksdb_major_version, int rocksdb_minor_version) { ColumnFamilyOptions::OldDefaults(rocksdb_major_version, @@ -598,7 +663,12 @@ background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), iter_start_seqnum(0), - timestamp(nullptr) {} + timestamp(nullptr), + iter_start_ts(nullptr), + deadline(std::chrono::microseconds::zero()), + io_timeout(std::chrono::microseconds::zero()), + value_size_soft_limit(std::numeric_limits::max()), + adaptive_readahead(false) {} ReadOptions::ReadOptions(bool cksum, bool cache) : snapshot(nullptr), @@ -618,6 +688,11 @@ background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), iter_start_seqnum(0), - timestamp(nullptr) {} + timestamp(nullptr), + iter_start_ts(nullptr), + deadline(std::chrono::microseconds::zero()), + io_timeout(std::chrono::microseconds::zero()), + value_size_soft_limit(std::numeric_limits::max()), + adaptive_readahead(false) {} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,13 +7,17 @@ #include #include #include +#include #include #include +#include "options/cf_options.h" +#include "options/db_options.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" @@ -21,12 +25,37 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/utilities/object_registry.h" -#include "table/block_based/block_based_table_factory.h" -#include 
"table/plain/plain_table_factory.h" -#include "util/cast_util.h" +#include "rocksdb/utilities/options_type.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +ConfigOptions::ConfigOptions() +#ifndef ROCKSDB_LITE + : registry(ObjectRegistry::NewInstance()) +#endif +{ + env = Env::Default(); +} + +ConfigOptions::ConfigOptions(const DBOptions& db_opts) : env(db_opts.env) { +#ifndef ROCKSDB_LITE + registry = ObjectRegistry::NewInstance(); +#endif +} + +Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) { + Status s; +#ifndef ROCKSDB_LITE + auto db_cfg = DBOptionsAsConfigurable(db_opts); + auto cf_cfg = CFOptionsAsConfigurable(cf_opts); + s = db_cfg->ValidateOptions(db_opts, cf_opts); + if (s.ok()) s = cf_cfg->ValidateOptions(db_opts, cf_opts); +#else + s = cf_opts.table_factory->ValidateOptions(db_opts, cf_opts); +#endif + return s; +} DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, const MutableDBOptions& mutable_db_options) { @@ -37,8 +66,11 @@ immutable_db_options.create_missing_column_families; options.error_if_exists = immutable_db_options.error_if_exists; options.paranoid_checks = immutable_db_options.paranoid_checks; + options.flush_verify_memtable_count = + immutable_db_options.flush_verify_memtable_count; + options.track_and_verify_wals_in_manifest = + immutable_db_options.track_and_verify_wals_in_manifest; options.env = immutable_db_options.env; - options.file_system = immutable_db_options.fs; options.rate_limiter = immutable_db_options.rate_limiter; options.sst_file_manager = immutable_db_options.sst_file_manager; options.info_log = immutable_db_options.info_log; @@ -62,8 +94,8 @@ options.bytes_per_sync = mutable_db_options.bytes_per_sync; options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync; options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync; - options.max_subcompactions = immutable_db_options.max_subcompactions; - options.max_background_flushes = 
immutable_db_options.max_background_flushes; + options.max_subcompactions = mutable_db_options.max_subcompactions; + options.max_background_flushes = mutable_db_options.max_background_flushes; options.max_log_file_size = immutable_db_options.max_log_file_size; options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll; options.keep_log_file_num = immutable_db_options.keep_log_file_num; @@ -71,8 +103,8 @@ options.max_manifest_file_size = immutable_db_options.max_manifest_file_size; options.table_cache_numshardbits = immutable_db_options.table_cache_numshardbits; - options.WAL_ttl_seconds = immutable_db_options.wal_ttl_seconds; - options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb; + options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds; + options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB; options.manifest_preallocation_size = immutable_db_options.manifest_preallocation_size; options.allow_mmap_reads = immutable_db_options.allow_mmap_reads; @@ -144,7 +176,18 @@ options.avoid_unnecessary_blocking_io = immutable_db_options.avoid_unnecessary_blocking_io; options.log_readahead_size = immutable_db_options.log_readahead_size; - options.sst_file_checksum_func = immutable_db_options.sst_file_checksum_func; + options.file_checksum_gen_factory = + immutable_db_options.file_checksum_gen_factory; + options.best_efforts_recovery = immutable_db_options.best_efforts_recovery; + options.max_bgerror_resume_count = + immutable_db_options.max_bgerror_resume_count; + options.bgerror_resume_retry_interval = + immutable_db_options.bgerror_resume_retry_interval; + options.db_host_id = immutable_db_options.db_host_id; + options.allow_data_in_errors = immutable_db_options.allow_data_in_errors; + options.checksum_handoff_file_types = + immutable_db_options.checksum_handoff_file_types; + options.lowest_used_cache_tier = immutable_db_options.lowest_used_cache_tier; return options; } @@ -152,69 +195,119 @@ const ColumnFamilyOptions& 
options, const MutableCFOptions& mutable_cf_options) { ColumnFamilyOptions cf_opts(options); + UpdateColumnFamilyOptions(mutable_cf_options, &cf_opts); + // TODO(yhchiang): find some way to handle the following derived options + // * max_file_size + return cf_opts; +} +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts) { // Memtable related options - cf_opts.write_buffer_size = mutable_cf_options.write_buffer_size; - cf_opts.max_write_buffer_number = mutable_cf_options.max_write_buffer_number; - cf_opts.arena_block_size = mutable_cf_options.arena_block_size; - cf_opts.memtable_prefix_bloom_size_ratio = - mutable_cf_options.memtable_prefix_bloom_size_ratio; - cf_opts.memtable_whole_key_filtering = - mutable_cf_options.memtable_whole_key_filtering; - cf_opts.memtable_huge_page_size = mutable_cf_options.memtable_huge_page_size; - cf_opts.max_successive_merges = mutable_cf_options.max_successive_merges; - cf_opts.inplace_update_num_locks = - mutable_cf_options.inplace_update_num_locks; - cf_opts.prefix_extractor = mutable_cf_options.prefix_extractor; + cf_opts->write_buffer_size = moptions.write_buffer_size; + cf_opts->max_write_buffer_number = moptions.max_write_buffer_number; + cf_opts->arena_block_size = moptions.arena_block_size; + cf_opts->memtable_prefix_bloom_size_ratio = + moptions.memtable_prefix_bloom_size_ratio; + cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; + cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; + cf_opts->max_successive_merges = moptions.max_successive_merges; + cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks; + cf_opts->prefix_extractor = moptions.prefix_extractor; // Compaction related options - cf_opts.disable_auto_compactions = - mutable_cf_options.disable_auto_compactions; - cf_opts.soft_pending_compaction_bytes_limit = - mutable_cf_options.soft_pending_compaction_bytes_limit; - 
cf_opts.hard_pending_compaction_bytes_limit = - mutable_cf_options.hard_pending_compaction_bytes_limit; - cf_opts.level0_file_num_compaction_trigger = - mutable_cf_options.level0_file_num_compaction_trigger; - cf_opts.level0_slowdown_writes_trigger = - mutable_cf_options.level0_slowdown_writes_trigger; - cf_opts.level0_stop_writes_trigger = - mutable_cf_options.level0_stop_writes_trigger; - cf_opts.max_compaction_bytes = mutable_cf_options.max_compaction_bytes; - cf_opts.target_file_size_base = mutable_cf_options.target_file_size_base; - cf_opts.target_file_size_multiplier = - mutable_cf_options.target_file_size_multiplier; - cf_opts.max_bytes_for_level_base = - mutable_cf_options.max_bytes_for_level_base; - cf_opts.max_bytes_for_level_multiplier = - mutable_cf_options.max_bytes_for_level_multiplier; - cf_opts.ttl = mutable_cf_options.ttl; - cf_opts.periodic_compaction_seconds = - mutable_cf_options.periodic_compaction_seconds; - - cf_opts.max_bytes_for_level_multiplier_additional.clear(); - for (auto value : - mutable_cf_options.max_bytes_for_level_multiplier_additional) { - cf_opts.max_bytes_for_level_multiplier_additional.emplace_back(value); - } - - cf_opts.compaction_options_fifo = mutable_cf_options.compaction_options_fifo; - cf_opts.compaction_options_universal = - mutable_cf_options.compaction_options_universal; + cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; + cf_opts->soft_pending_compaction_bytes_limit = + moptions.soft_pending_compaction_bytes_limit; + cf_opts->hard_pending_compaction_bytes_limit = + moptions.hard_pending_compaction_bytes_limit; + cf_opts->level0_file_num_compaction_trigger = + moptions.level0_file_num_compaction_trigger; + cf_opts->level0_slowdown_writes_trigger = + moptions.level0_slowdown_writes_trigger; + cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; + cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; + cf_opts->target_file_size_base = moptions.target_file_size_base; 
+ cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; + cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; + cf_opts->max_bytes_for_level_multiplier = + moptions.max_bytes_for_level_multiplier; + cf_opts->ttl = moptions.ttl; + cf_opts->periodic_compaction_seconds = moptions.periodic_compaction_seconds; + + cf_opts->max_bytes_for_level_multiplier_additional.clear(); + for (auto value : moptions.max_bytes_for_level_multiplier_additional) { + cf_opts->max_bytes_for_level_multiplier_additional.emplace_back(value); + } + + cf_opts->compaction_options_fifo = moptions.compaction_options_fifo; + cf_opts->compaction_options_universal = moptions.compaction_options_universal; + + // Blob file related options + cf_opts->enable_blob_files = moptions.enable_blob_files; + cf_opts->min_blob_size = moptions.min_blob_size; + cf_opts->blob_file_size = moptions.blob_file_size; + cf_opts->blob_compression_type = moptions.blob_compression_type; + cf_opts->enable_blob_garbage_collection = + moptions.enable_blob_garbage_collection; + cf_opts->blob_garbage_collection_age_cutoff = + moptions.blob_garbage_collection_age_cutoff; + cf_opts->blob_garbage_collection_force_threshold = + moptions.blob_garbage_collection_force_threshold; + cf_opts->blob_compaction_readahead_size = + moptions.blob_compaction_readahead_size; // Misc options - cf_opts.max_sequential_skip_in_iterations = - mutable_cf_options.max_sequential_skip_in_iterations; - cf_opts.paranoid_file_checks = mutable_cf_options.paranoid_file_checks; - cf_opts.report_bg_io_stats = mutable_cf_options.report_bg_io_stats; - cf_opts.compression = mutable_cf_options.compression; - cf_opts.sample_for_compression = mutable_cf_options.sample_for_compression; + cf_opts->max_sequential_skip_in_iterations = + moptions.max_sequential_skip_in_iterations; + cf_opts->check_flush_compaction_key_order = + moptions.check_flush_compaction_key_order; + cf_opts->paranoid_file_checks = 
moptions.paranoid_file_checks; + cf_opts->report_bg_io_stats = moptions.report_bg_io_stats; + cf_opts->compression = moptions.compression; + cf_opts->compression_opts = moptions.compression_opts; + cf_opts->bottommost_compression = moptions.bottommost_compression; + cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts; + cf_opts->sample_for_compression = moptions.sample_for_compression; +} + +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts) { + cf_opts->compaction_style = ioptions.compaction_style; + cf_opts->compaction_pri = ioptions.compaction_pri; + cf_opts->comparator = ioptions.user_comparator; + cf_opts->merge_operator = ioptions.merge_operator; + cf_opts->compaction_filter = ioptions.compaction_filter; + cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory; + cf_opts->min_write_buffer_number_to_merge = + ioptions.min_write_buffer_number_to_merge; + cf_opts->max_write_buffer_number_to_maintain = + ioptions.max_write_buffer_number_to_maintain; + cf_opts->max_write_buffer_size_to_maintain = + ioptions.max_write_buffer_size_to_maintain; + cf_opts->inplace_update_support = ioptions.inplace_update_support; + cf_opts->inplace_callback = ioptions.inplace_callback; + cf_opts->memtable_factory = ioptions.memtable_factory; + cf_opts->table_factory = ioptions.table_factory; + cf_opts->table_properties_collector_factories = + ioptions.table_properties_collector_factories; + cf_opts->bloom_locality = ioptions.bloom_locality; + cf_opts->purge_redundant_kvs_while_flush = + ioptions.purge_redundant_kvs_while_flush; + cf_opts->compression_per_level = ioptions.compression_per_level; + cf_opts->level_compaction_dynamic_level_bytes = + ioptions.level_compaction_dynamic_level_bytes; + cf_opts->num_levels = ioptions.num_levels; + cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits; + cf_opts->force_consistency_checks = ioptions.force_consistency_checks; + 
cf_opts->memtable_insert_with_hint_prefix_extractor = + ioptions.memtable_insert_with_hint_prefix_extractor; + cf_opts->cf_paths = ioptions.cf_paths; + cf_opts->compaction_thread_limiter = ioptions.compaction_thread_limiter; + cf_opts->sst_partitioner_factory = ioptions.sst_partitioner_factory; - cf_opts.table_factory = options.table_factory; // TODO(yhchiang): find some way to handle the following derived options // * max_file_size - - return cf_opts; } std::map @@ -239,7 +332,8 @@ OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, {"kxxHash", kxxHash}, - {"kxxHash64", kxxHash64}}; + {"kxxHash64", kxxHash64}, + {"kXXH3", kXXH3}}; std::unordered_map OptionsHelper::compression_type_string_map = { @@ -253,604 +347,215 @@ {"kZSTD", kZSTD}, {"kZSTDNotFinalCompression", kZSTDNotFinalCompression}, {"kDisableCompressionOption", kDisableCompressionOption}}; -#ifndef ROCKSDB_LITE - -const std::string kNameComparator = "comparator"; -const std::string kNameEnv = "env"; -const std::string kNameMergeOperator = "merge_operator"; - -template -Status GetStringFromStruct( - std::string* opt_string, const T& options, - const std::unordered_map& type_info, - const std::string& delimiter); - -namespace { -template -bool ParseEnum(const std::unordered_map& type_map, - const std::string& type, T* value) { - auto iter = type_map.find(type); - if (iter != type_map.end()) { - *value = iter->second; - return true; - } - return false; -} -template -bool SerializeEnum(const std::unordered_map& type_map, - const T& type, std::string* value) { - for (const auto& pair : type_map) { - if (pair.second == type) { - *value = pair.first; - return true; - } - } - return false; -} - -bool SerializeVectorCompressionType(const std::vector& types, - std::string* value) { - std::stringstream ss; - bool result; - for (size_t i = 0; i < types.size(); ++i) { - if (i > 0) { - ss << ':'; - } - std::string string_type; - result = 
SerializeEnum(compression_type_string_map, - types[i], &string_type); - if (result == false) { - return result; - } - ss << string_type; - } - *value = ss.str(); - return true; -} - -bool ParseVectorCompressionType( - const std::string& value, - std::vector* compression_per_level) { - compression_per_level->clear(); - size_t start = 0; - while (start < value.size()) { - size_t end = value.find(':', start); - bool is_ok; - CompressionType type; - if (end == std::string::npos) { - is_ok = ParseEnum(compression_type_string_map, - value.substr(start), &type); - if (!is_ok) { - return false; - } - compression_per_level->emplace_back(type); - break; - } else { - is_ok = ParseEnum( - compression_type_string_map, value.substr(start, end - start), &type); - if (!is_ok) { - return false; - } - compression_per_level->emplace_back(type); - start = end + 1; +std::vector GetSupportedCompressions() { + // std::set internally to deduplicate potential name aliases + std::set supported_compressions; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && CompressionTypeSupported(t)) { + supported_compressions.insert(t); } } - return true; + return std::vector(supported_compressions.begin(), + supported_compressions.end()); } -// This is to handle backward compatibility, where compaction_options_fifo -// could be assigned a single scalar value, say, like "23", which would be -// assigned to max_table_files_size. -bool FIFOCompactionOptionsSpecialCase(const std::string& opt_str, - CompactionOptionsFIFO* options) { - if (opt_str.find("=") != std::string::npos) { - // New format. Go do your new parsing using ParseStructOptions. - return false; - } - - // Old format. Parse just a single uint64_t value. 
- options->max_table_files_size = ParseUint64(opt_str); - return true; -} - -template -bool SerializeStruct( - const T& options, std::string* value, - const std::unordered_map& type_info_map) { - std::string opt_str; - Status s = GetStringFromStruct(&opt_str, options, type_info_map, ";"); - if (!s.ok()) { - return false; - } - *value = "{" + opt_str + "}"; - return true; -} - -template -bool ParseSingleStructOption( - const std::string& opt_val_str, T* options, - const std::unordered_map& type_info_map) { - size_t end = opt_val_str.find('='); - std::string key = opt_val_str.substr(0, end); - std::string value = opt_val_str.substr(end + 1); - auto iter = type_info_map.find(key); - if (iter == type_info_map.end()) { - return false; - } - const auto& opt_info = iter->second; - if (opt_info.verification == OptionVerificationType::kDeprecated) { - // Should also skip deprecated sub-options such as - // fifo_compaction_options_type_info.ttl - return true; - } - return ParseOptionHelper( - reinterpret_cast(options) + opt_info.mutable_offset, opt_info.type, - value); -} - -template -bool ParseStructOptions( - const std::string& opt_str, T* options, - const std::unordered_map& type_info_map) { - assert(!opt_str.empty()); - - size_t start = 0; - if (opt_str[0] == '{') { - start++; - } - while ((start != std::string::npos) && (start < opt_str.size())) { - if (opt_str[start] == '}') { - break; - } - size_t end = opt_str.find(';', start); - size_t len = (end == std::string::npos) ? end : end - start; - if (!ParseSingleStructOption(opt_str.substr(start, len), options, - type_info_map)) { - return false; +std::vector GetSupportedDictCompressions() { + std::set dict_compression_types; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && DictCompressionTypeSupported(t)) { + dict_compression_types.insert(t); } - start = (end == std::string::npos) ? 
end : end + 1; } - return true; + return std::vector(dict_compression_types.begin(), + dict_compression_types.end()); } -} // anonymouse namespace -bool ParseSliceTransformHelper( - const std::string& kFixedPrefixName, const std::string& kCappedPrefixName, - const std::string& value, - std::shared_ptr* slice_transform) { - const char* no_op_name = "rocksdb.Noop"; - size_t no_op_length = strlen(no_op_name); - auto& pe_value = value; - if (pe_value.size() > kFixedPrefixName.size() && - pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == 0) { - int prefix_length = ParseInt(trim(value.substr(kFixedPrefixName.size()))); - slice_transform->reset(NewFixedPrefixTransform(prefix_length)); - } else if (pe_value.size() > kCappedPrefixName.size() && - pe_value.compare(0, kCappedPrefixName.size(), kCappedPrefixName) == - 0) { - int prefix_length = - ParseInt(trim(pe_value.substr(kCappedPrefixName.size()))); - slice_transform->reset(NewCappedPrefixTransform(prefix_length)); - } else if (pe_value.size() == no_op_length && - pe_value.compare(0, no_op_length, no_op_name) == 0) { - const SliceTransform* no_op_transform = NewNoopTransform(); - slice_transform->reset(no_op_transform); - } else if (value == kNullptrString) { - slice_transform->reset(); - } else { - return false; +std::vector GetSupportedChecksums() { + std::set checksum_types; + for (const auto& e : OptionsHelper::checksum_type_string_map) { + checksum_types.insert(e.second); } - - return true; + return std::vector(checksum_types.begin(), + checksum_types.end()); } -bool ParseSliceTransform( - const std::string& value, - std::shared_ptr* slice_transform) { - // While we normally don't convert the string representation of a - // pointer-typed option into its instance, here we do so for backward - // compatibility as we allow this action in SetOption(). 
- - // TODO(yhchiang): A possible better place for these serialization / - // deserialization is inside the class definition of pointer-typed - // option itself, but this requires a bigger change of public API. - bool result = - ParseSliceTransformHelper("fixed:", "capped:", value, slice_transform); - if (result) { - return result; - } - result = ParseSliceTransformHelper( - "rocksdb.FixedPrefix.", "rocksdb.CappedPrefix.", value, slice_transform); - if (result) { - return result; - } - // TODO(yhchiang): we can further support other default - // SliceTransforms here. - return false; -} - -bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, - const std::string& value) { +#ifndef ROCKSDB_LITE +static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type, + const std::string& value) { switch (opt_type) { case OptionType::kBoolean: - *reinterpret_cast(opt_address) = ParseBoolean("", value); + *static_cast(opt_address) = ParseBoolean("", value); break; case OptionType::kInt: - *reinterpret_cast(opt_address) = ParseInt(value); + *static_cast(opt_address) = ParseInt(value); break; case OptionType::kInt32T: - *reinterpret_cast(opt_address) = ParseInt32(value); + *static_cast(opt_address) = ParseInt32(value); break; case OptionType::kInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseInt64(value)); - break; - case OptionType::kVectorInt: - *reinterpret_cast*>(opt_address) = ParseVectorInt(value); + PutUnaligned(static_cast(opt_address), ParseInt64(value)); break; case OptionType::kUInt: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); + break; + case OptionType::kUInt8T: + *static_cast(opt_address) = ParseUint8(value); break; case OptionType::kUInt32T: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); break; case OptionType::kUInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseUint64(value)); + 
PutUnaligned(static_cast(opt_address), ParseUint64(value)); break; case OptionType::kSizeT: - PutUnaligned(reinterpret_cast(opt_address), ParseSizeT(value)); + PutUnaligned(static_cast(opt_address), ParseSizeT(value)); break; case OptionType::kString: - *reinterpret_cast(opt_address) = value; + *static_cast(opt_address) = value; break; case OptionType::kDouble: - *reinterpret_cast(opt_address) = ParseDouble(value); + *static_cast(opt_address) = ParseDouble(value); break; case OptionType::kCompactionStyle: return ParseEnum( compaction_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); case OptionType::kCompactionPri: - return ParseEnum( - compaction_pri_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(compaction_pri_string_map, value, + static_cast(opt_address)); case OptionType::kCompressionType: return ParseEnum( compression_type_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kVectorCompressionType: - return ParseVectorCompressionType( - value, reinterpret_cast*>(opt_address)); - case OptionType::kSliceTransform: - return ParseSliceTransform( - value, reinterpret_cast*>( - opt_address)); + static_cast(opt_address)); case OptionType::kChecksumType: - return ParseEnum( - checksum_type_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kBlockBasedTableIndexType: - return ParseEnum( - block_base_table_index_type_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kBlockBasedTableDataBlockIndexType: - return ParseEnum( - block_base_table_data_block_index_type_string_map, value, - reinterpret_cast( - opt_address)); - case OptionType::kBlockBasedTableIndexShorteningMode: - return ParseEnum( - block_base_table_index_shortening_mode_string_map, value, - reinterpret_cast( - opt_address)); + return ParseEnum(checksum_type_string_map, value, + static_cast(opt_address)); case OptionType::kEncodingType: - return ParseEnum( - encoding_type_string_map, 
value, - reinterpret_cast(opt_address)); - case OptionType::kWALRecoveryMode: - return ParseEnum( - wal_recovery_mode_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kAccessHint: - return ParseEnum( - access_hint_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kInfoLogLevel: - return ParseEnum( - info_log_level_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kCompactionOptionsFIFO: { - if (!FIFOCompactionOptionsSpecialCase( - value, reinterpret_cast(opt_address))) { - return ParseStructOptions( - value, reinterpret_cast(opt_address), - fifo_compaction_options_type_info); - } - return true; - } - case OptionType::kLRUCacheOptions: { - return ParseStructOptions(value, - reinterpret_cast(opt_address), - lru_cache_options_type_info); - } - case OptionType::kCompactionOptionsUniversal: - return ParseStructOptions( - value, reinterpret_cast(opt_address), - universal_compaction_options_type_info); + return ParseEnum(encoding_type_string_map, value, + static_cast(opt_address)); case OptionType::kCompactionStopStyle: return ParseEnum( compaction_stop_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); + case OptionType::kEncodedString: { + std::string* output_addr = static_cast(opt_address); + (Slice(value)).DecodeHex(output_addr); + break; + } default: return false; } return true; } -bool SerializeSingleOptionHelper(const char* opt_address, +bool SerializeSingleOptionHelper(const void* opt_address, const OptionType opt_type, std::string* value) { - assert(value); switch (opt_type) { case OptionType::kBoolean: - *value = *(reinterpret_cast(opt_address)) ? "true" : "false"; + *value = *(static_cast(opt_address)) ? 
"true" : "false"; break; case OptionType::kInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt64T: { int64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; - case OptionType::kVectorInt: - return SerializeIntVector( - *reinterpret_cast*>(opt_address), value); case OptionType::kUInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); + break; + case OptionType::kUInt8T: + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt64T: { uint64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kSizeT: { size_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kDouble: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kString: - *value = EscapeOptionString( - *(reinterpret_cast(opt_address))); + *value = + EscapeOptionString(*(static_cast(opt_address))); break; case OptionType::kCompactionStyle: return SerializeEnum( compaction_style_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompactionPri: return SerializeEnum( compaction_pri_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompressionType: return SerializeEnum( compression_type_string_map, - 
*(reinterpret_cast(opt_address)), value); - case OptionType::kVectorCompressionType: - return SerializeVectorCompressionType( - *(reinterpret_cast*>(opt_address)), - value); - break; - case OptionType::kSliceTransform: { - const auto* slice_transform_ptr = - reinterpret_cast*>( - opt_address); - *value = slice_transform_ptr->get() ? slice_transform_ptr->get()->Name() - : kNullptrString; - break; - } - case OptionType::kTableFactory: { - const auto* table_factory_ptr = - reinterpret_cast*>( - opt_address); - *value = table_factory_ptr->get() ? table_factory_ptr->get()->Name() - : kNullptrString; - break; - } - case OptionType::kComparator: { - // it's a const pointer of const Comparator* - const auto* ptr = reinterpret_cast(opt_address); - // Since the user-specified comparator will be wrapped by - // InternalKeyComparator, we should persist the user-specified one - // instead of InternalKeyComparator. - if (*ptr == nullptr) { - *value = kNullptrString; - } else { - const Comparator* root_comp = (*ptr)->GetRootComparator(); - if (root_comp == nullptr) { - root_comp = (*ptr); - } - *value = root_comp->Name(); - } + *(static_cast(opt_address)), value); break; - } - case OptionType::kCompactionFilter: { - // it's a const pointer of const CompactionFilter* - const auto* ptr = - reinterpret_cast(opt_address); - *value = *ptr ? (*ptr)->Name() : kNullptrString; - break; - } - case OptionType::kCompactionFilterFactory: { - const auto* ptr = - reinterpret_cast*>( - opt_address); - *value = ptr->get() ? ptr->get()->Name() : kNullptrString; - break; - } - case OptionType::kMemTableRepFactory: { - const auto* ptr = - reinterpret_cast*>( - opt_address); - *value = ptr->get() ? ptr->get()->Name() : kNullptrString; - break; - } - case OptionType::kMergeOperator: { - const auto* ptr = - reinterpret_cast*>(opt_address); - *value = ptr->get() ? 
ptr->get()->Name() : kNullptrString; - break; - } case OptionType::kFilterPolicy: { const auto* ptr = - reinterpret_cast*>(opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kChecksumType: return SerializeEnum( checksum_type_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kBlockBasedTableIndexType: - return SerializeEnum( - block_base_table_index_type_string_map, - *reinterpret_cast( - opt_address), - value); - case OptionType::kBlockBasedTableDataBlockIndexType: - return SerializeEnum( - block_base_table_data_block_index_type_string_map, - *reinterpret_cast( - opt_address), - value); - case OptionType::kBlockBasedTableIndexShorteningMode: - return SerializeEnum( - block_base_table_index_shortening_mode_string_map, - *reinterpret_cast( - opt_address), - value); - case OptionType::kFlushBlockPolicyFactory: { - const auto* ptr = - reinterpret_cast*>( - opt_address); - *value = ptr->get() ? 
ptr->get()->Name() : kNullptrString; - break; - } + *static_cast(opt_address), value); case OptionType::kEncodingType: return SerializeEnum( encoding_type_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kWALRecoveryMode: - return SerializeEnum( - wal_recovery_mode_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kAccessHint: - return SerializeEnum( - access_hint_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kInfoLogLevel: - return SerializeEnum( - info_log_level_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kCompactionOptionsFIFO: - return SerializeStruct( - *reinterpret_cast(opt_address), value, - fifo_compaction_options_type_info); - case OptionType::kCompactionOptionsUniversal: - return SerializeStruct( - *reinterpret_cast(opt_address), - value, universal_compaction_options_type_info); + *static_cast(opt_address), value); case OptionType::kCompactionStopStyle: return SerializeEnum( compaction_stop_style_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); + case OptionType::kEncodedString: { + const auto* ptr = static_cast(opt_address); + *value = (Slice(*ptr)).ToString(true); + break; + } default: return false; } return true; } -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* info_log, MutableCFOptions* new_options) { - assert(new_options); - *new_options = base_options; - for (const auto& o : options_map) { - try { - auto iter = cf_options_type_info.find(o.first); - if (iter == cf_options_type_info.end()) { - return Status::InvalidArgument("Unrecognized option: " + o.first); - } - const auto& opt_info = iter->second; - if (!opt_info.is_mutable) { - return Status::InvalidArgument("Option not changeable: " + o.first); - } - if (opt_info.verification == OptionVerificationType::kDeprecated) { - // log warning when user tries to set a deprecated 
option but don't fail - // the call for compatibility. - ROCKS_LOG_WARN(info_log, "%s is a deprecated option and cannot be set", - o.first.c_str()); - continue; - } - bool is_ok = ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.mutable_offset, - opt_info.type, o.second); - if (!is_ok) { - return Status::InvalidArgument("Error parsing " + o.first); - } - } catch (std::exception& e) { - return Status::InvalidArgument("Error parsing " + o.first + ":" + - std::string(e.what())); - } +template +Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map, + const std::string& option_name, Configurable* config, T* new_opts) { + Status s = config->ConfigureFromMap(config_options, opt_map); + if (s.ok()) { + *new_opts = *(config->GetOptions(option_name)); } - return Status::OK(); + return s; } -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options) { - assert(new_options); - *new_options = base_options; - for (const auto& o : options_map) { - try { - auto iter = db_options_type_info.find(o.first); - if (iter == db_options_type_info.end()) { - return Status::InvalidArgument("Unrecognized option: " + o.first); - } - const auto& opt_info = iter->second; - if (!opt_info.is_mutable) { - return Status::InvalidArgument("Option not changeable: " + o.first); - } - bool is_ok = ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.mutable_offset, - opt_info.type, o.second); - if (!is_ok) { - return Status::InvalidArgument("Error parsing " + o.first); - } - } catch (std::exception& e) { - return Status::InvalidArgument("Error parsing " + o.first + ":" + - std::string(e.what())); - } - } - return Status::OK(); -} Status StringToMap(const std::string& opts_str, std::unordered_map* opts_map) { @@ -860,306 +565,74 @@ // "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100" size_t pos = 0; std::string opts = trim(opts_str); + // 
If the input string starts and ends with "{...}", strip off the brackets + while (opts.size() > 2 && opts[0] == '{' && opts[opts.size() - 1] == '}') { + opts = trim(opts.substr(1, opts.size() - 2)); + } + while (pos < opts.size()) { - size_t eq_pos = opts.find('=', pos); + size_t eq_pos = opts.find_first_of("={};", pos); if (eq_pos == std::string::npos) { return Status::InvalidArgument("Mismatched key value pair, '=' expected"); + } else if (opts[eq_pos] != '=') { + return Status::InvalidArgument("Unexpected char in key"); } + std::string key = trim(opts.substr(pos, eq_pos - pos)); if (key.empty()) { return Status::InvalidArgument("Empty key found"); } - // skip space after '=' and look for '{' for possible nested options - pos = eq_pos + 1; - while (pos < opts.size() && isspace(opts[pos])) { - ++pos; - } - // Empty value at the end - if (pos >= opts.size()) { - (*opts_map)[key] = ""; - break; - } - if (opts[pos] == '{') { - int count = 1; - size_t brace_pos = pos + 1; - while (brace_pos < opts.size()) { - if (opts[brace_pos] == '{') { - ++count; - } else if (opts[brace_pos] == '}') { - --count; - if (count == 0) { - break; - } - } - ++brace_pos; - } - // found the matching closing brace - if (count == 0) { - (*opts_map)[key] = trim(opts.substr(pos + 1, brace_pos - pos - 1)); - // skip all whitespace and move to the next ';' - // brace_pos points to the next position after the matching '}' - pos = brace_pos + 1; - while (pos < opts.size() && isspace(opts[pos])) { - ++pos; - } - if (pos < opts.size() && opts[pos] != ';') { - return Status::InvalidArgument( - "Unexpected chars after nested options"); - } - ++pos; - } else { - return Status::InvalidArgument( - "Mismatched curly braces for nested options"); - } + std::string value; + Status s = OptionTypeInfo::NextToken(opts, ';', eq_pos + 1, &pos, &value); + if (!s.ok()) { + return s; } else { - size_t sc_pos = opts.find(';', pos); - if (sc_pos == std::string::npos) { - (*opts_map)[key] = trim(opts.substr(pos)); - // 
It either ends with a trailing semi-colon or the last key-value pair + (*opts_map)[key] = value; + if (pos == std::string::npos) { break; } else { - (*opts_map)[key] = trim(opts.substr(pos, sc_pos - pos)); + pos++; } - pos = sc_pos + 1; } } return Status::OK(); } -Status ParseCompressionOptions(const std::string& value, const std::string& name, - CompressionOptions& compression_opts) { - size_t start = 0; - size_t end = value.find(':'); - if (end == std::string::npos) { - return Status::InvalidArgument("unable to parse the specified CF option " + - name); - } - compression_opts.window_bits = ParseInt(value.substr(start, end - start)); - start = end + 1; - end = value.find(':', start); - if (end == std::string::npos) { - return Status::InvalidArgument("unable to parse the specified CF option " + - name); - } - compression_opts.level = ParseInt(value.substr(start, end - start)); - start = end + 1; - if (start >= value.size()) { - return Status::InvalidArgument("unable to parse the specified CF option " + - name); - } - end = value.find(':', start); - compression_opts.strategy = - ParseInt(value.substr(start, value.size() - start)); - // max_dict_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - compression_opts.max_dict_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); - } - // zstd_max_train_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - compression_opts.zstd_max_train_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); - } - // enabled is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if 
(start >= value.size()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - compression_opts.enabled = - ParseBoolean("", value.substr(start, value.size() - start)); - } - return Status::OK(); -} - -Status ParseColumnFamilyOption(const std::string& name, - const std::string& org_value, - ColumnFamilyOptions* new_options, - bool input_strings_escaped = false) { - const std::string& value = - input_strings_escaped ? UnescapeOptionString(org_value) : org_value; - try { - if (name == "block_based_table_factory") { - // Nested options - BlockBasedTableOptions table_opt, base_table_options; - BlockBasedTableFactory* block_based_table_factory = - static_cast_with_check( - new_options->table_factory.get()); - if (block_based_table_factory != nullptr) { - base_table_options = block_based_table_factory->table_options(); - } - Status table_opt_s = GetBlockBasedTableOptionsFromString( - base_table_options, value, &table_opt); - if (!table_opt_s.ok()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - new_options->table_factory.reset(NewBlockBasedTableFactory(table_opt)); - } else if (name == "plain_table_factory") { - // Nested options - PlainTableOptions table_opt, base_table_options; - PlainTableFactory* plain_table_factory = - static_cast_with_check( - new_options->table_factory.get()); - if (plain_table_factory != nullptr) { - base_table_options = plain_table_factory->table_options(); - } - Status table_opt_s = GetPlainTableOptionsFromString( - base_table_options, value, &table_opt); - if (!table_opt_s.ok()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - new_options->table_factory.reset(NewPlainTableFactory(table_opt)); - } else if (name == "memtable") { - std::unique_ptr new_mem_factory; - Status mem_factory_s = - GetMemTableRepFactoryFromString(value, &new_mem_factory); - if (!mem_factory_s.ok()) { - return Status::InvalidArgument( - 
"unable to parse the specified CF option " + name); - } - new_options->memtable_factory.reset(new_mem_factory.release()); - } else if (name == "bottommost_compression_opts") { - Status s = ParseCompressionOptions( - value, name, new_options->bottommost_compression_opts); - if (!s.ok()) { - return s; - } - } else if (name == "compression_opts") { - Status s = - ParseCompressionOptions(value, name, new_options->compression_opts); - if (!s.ok()) { - return s; - } - } else { - if (name == kNameComparator) { - // Try to get comparator from object registry first. - // Only support static comparator for now. - Status status = ObjectRegistry::NewInstance()->NewStaticObject( - value, &new_options->comparator); - if (status.ok()) { - return status; - } - } else if (name == kNameMergeOperator) { - // Try to get merge operator from object registry first. - std::shared_ptr mo; - Status status = - ObjectRegistry::NewInstance()->NewSharedObject( - value, &new_options->merge_operator); - // Only support static comparator for now. 
- if (status.ok()) { - return status; - } - } - - auto iter = cf_options_type_info.find(name); - if (iter == cf_options_type_info.end()) { - return Status::InvalidArgument( - "Unable to parse the specified CF option " + name); - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return Status::OK(); - } - switch (opt_info.verification) { - case OptionVerificationType::kByName: - case OptionVerificationType::kByNameAllowNull: - case OptionVerificationType::kByNameAllowFromNull: - return Status::NotSupported( - "Deserializing the specified CF option " + name + - " is not supported"); - case OptionVerificationType::kDeprecated: - return Status::OK(); - default: - return Status::InvalidArgument( - "Unable to parse the specified CF option " + name); - } - } - } catch (const std::exception&) { - return Status::InvalidArgument( - "unable to parse the specified option " + name); - } - return Status::OK(); -} -template -bool SerializeSingleStructOption( - std::string* opt_string, const T& options, - const std::unordered_map& type_info, - const std::string& name, const std::string& delimiter) { - auto iter = type_info.find(name); - if (iter == type_info.end()) { - return false; - } - auto& opt_info = iter->second; - const char* opt_address = - reinterpret_cast(&options) + opt_info.offset; - std::string value; - bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); - if (result) { - *opt_string = name + "=" + value + delimiter; - } - return result; +Status GetStringFromDBOptions(std::string* opt_string, + const DBOptions& db_options, + const std::string& delimiter) { + ConfigOptions config_options(db_options); + config_options.delimiter = delimiter; + return GetStringFromDBOptions(config_options, db_options, opt_string); } -template -Status GetStringFromStruct( - std::string* opt_string, const 
T& options, - const std::unordered_map& type_info, - const std::string& delimiter) { +Status GetStringFromDBOptions(const ConfigOptions& config_options, + const DBOptions& db_options, + std::string* opt_string) { assert(opt_string); opt_string->clear(); - for (auto iter = type_info.begin(); iter != type_info.end(); ++iter) { - if (iter->second.verification == OptionVerificationType::kDeprecated) { - // If the option is no longer used in rocksdb and marked as deprecated, - // we skip it in the serialization. - continue; - } - std::string single_output; - bool result = SerializeSingleStructOption( - &single_output, options, type_info, iter->first, delimiter); - if (result) { - opt_string->append(single_output); - } else { - return Status::InvalidArgument("failed to serialize %s\n", - iter->first.c_str()); - } - assert(result); - } - return Status::OK(); + auto config = DBOptionsAsConfigurable(db_options); + return config->GetOptionString(config_options, opt_string); } -Status GetStringFromDBOptions(std::string* opt_string, - const DBOptions& db_options, - const std::string& delimiter) { - return GetStringFromStruct(opt_string, db_options, - db_options_type_info, delimiter); -} Status GetStringFromColumnFamilyOptions(std::string* opt_string, const ColumnFamilyOptions& cf_options, const std::string& delimiter) { - return GetStringFromStruct( - opt_string, cf_options, cf_options_type_info, delimiter); + ConfigOptions config_options; + config_options.delimiter = delimiter; + return GetStringFromColumnFamilyOptions(config_options, cf_options, + opt_string); +} + +Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options, + const ColumnFamilyOptions& cf_options, + std::string* opt_string) { + const auto config = CFOptionsAsConfigurable(cf_options); + return config->GetOptionString(config_options, opt_string); } Status GetStringFromCompressionType(std::string* compression_str, @@ -1173,124 +646,62 @@ } } -std::vector GetSupportedCompressions() { - 
std::vector supported_compressions; - for (const auto& comp_to_name : compression_type_string_map) { - CompressionType t = comp_to_name.second; - if (t != kDisableCompressionOption && CompressionTypeSupported(t)) { - supported_compressions.push_back(t); - } - } - return supported_compressions; -} - -Status ParseDBOption(const std::string& name, - const std::string& org_value, - DBOptions* new_options, - bool input_strings_escaped = false) { - const std::string& value = - input_strings_escaped ? UnescapeOptionString(org_value) : org_value; - try { - if (name == "rate_limiter_bytes_per_sec") { - new_options->rate_limiter.reset( - NewGenericRateLimiter(static_cast(ParseUint64(value)))); - } else if (name == kNameEnv) { - // Currently `Env` can be deserialized from object registry only. - Env* env = new_options->env; - Status status = Env::LoadEnv(value, &env); - // Only support static env for now. - if (status.ok()) { - new_options->env = env; - } - } else { - auto iter = db_options_type_info.find(name); - if (iter == db_options_type_info.end()) { - return Status::InvalidArgument("Unrecognized option DBOptions:", name); - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return Status::OK(); - } - switch (opt_info.verification) { - case OptionVerificationType::kByName: - case OptionVerificationType::kByNameAllowNull: - return Status::NotSupported( - "Deserializing the specified DB option " + name + - " is not supported"); - case OptionVerificationType::kDeprecated: - return Status::OK(); - default: - return Status::InvalidArgument( - "Unable to parse the specified DB option " + name); - } - } - } catch (const std::exception&) { - return Status::InvalidArgument("Unable to parse DBOptions:", name); - } - return Status::OK(); -} - Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, 
const std::unordered_map& opts_map, ColumnFamilyOptions* new_options, bool input_strings_escaped, bool ignore_unknown_options) { - return GetColumnFamilyOptionsFromMapInternal( - base_options, opts_map, new_options, input_strings_escaped, nullptr, - ignore_unknown_options); + ConfigOptions config_options; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.input_strings_escaped = input_strings_escaped; + return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map, + new_options); } -Status GetColumnFamilyOptionsFromMapInternal( +Status GetColumnFamilyOptionsFromMap( + const ConfigOptions& config_options, const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, - ColumnFamilyOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names, - bool ignore_unknown_options) { + ColumnFamilyOptions* new_options) { assert(new_options); + *new_options = base_options; - if (unsupported_options_names) { - unsupported_options_names->clear(); - } - for (const auto& o : opts_map) { - auto s = ParseColumnFamilyOption(o.first, o.second, new_options, - input_strings_escaped); - if (!s.ok()) { - if (s.IsNotSupported()) { - // If the deserialization of the specified option is not supported - // and an output vector of unsupported_options is provided, then - // we log the name of the unsupported option and proceed. - if (unsupported_options_names != nullptr) { - unsupported_options_names->push_back(o.first); - } - // Note that we still return Status::OK in such case to maintain - // the backward compatibility in the old public API defined in - // rocksdb/convenience.h - } else if (s.IsInvalidArgument() && ignore_unknown_options) { - continue; - } else { - // Restore "new_options" to the default "base_options". 
- *new_options = base_options; - return s; - } - } + + const auto config = CFOptionsAsConfigurable(base_options); + Status s = ConfigureFromMap( + config_options, opts_map, OptionsHelper::kCFOptionsName, config.get(), + new_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); } - return Status::OK(); } Status GetColumnFamilyOptionsFromString( const ColumnFamilyOptions& base_options, const std::string& opts_str, ColumnFamilyOptions* new_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + return GetColumnFamilyOptionsFromString(config_options, base_options, + opts_str, new_options); +} + +Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options) { std::unordered_map opts_map; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { *new_options = base_options; return s; } - return GetColumnFamilyOptionsFromMap(base_options, opts_map, new_options); + return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map, + new_options); } Status GetDBOptionsFromMap( @@ -1298,417 +709,103 @@ const std::unordered_map& opts_map, DBOptions* new_options, bool input_strings_escaped, bool ignore_unknown_options) { - return GetDBOptionsFromMapInternal(base_options, opts_map, new_options, - input_strings_escaped, nullptr, - ignore_unknown_options); + ConfigOptions config_options(base_options); + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + return GetDBOptionsFromMap(config_options, base_options, opts_map, + new_options); } -Status GetDBOptionsFromMapInternal( - const DBOptions& base_options, +Status GetDBOptionsFromMap( + const 
ConfigOptions& config_options, const DBOptions& base_options, const std::unordered_map& opts_map, - DBOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names, - bool ignore_unknown_options) { + DBOptions* new_options) { assert(new_options); *new_options = base_options; - if (unsupported_options_names) { - unsupported_options_names->clear(); - } - for (const auto& o : opts_map) { - auto s = ParseDBOption(o.first, o.second, - new_options, input_strings_escaped); - if (!s.ok()) { - if (s.IsNotSupported()) { - // If the deserialization of the specified option is not supported - // and an output vector of unsupported_options is provided, then - // we log the name of the unsupported option and proceed. - if (unsupported_options_names != nullptr) { - unsupported_options_names->push_back(o.first); - } - // Note that we still return Status::OK in such case to maintain - // the backward compatibility in the old public API defined in - // rocksdb/convenience.h - } else if (s.IsInvalidArgument() && ignore_unknown_options) { - continue; - } else { - // Restore "new_options" to the default "base_options". 
- *new_options = base_options; - return s; - } - } + auto config = DBOptionsAsConfigurable(base_options); + Status s = ConfigureFromMap(config_options, opts_map, + OptionsHelper::kDBOptionsName, + config.get(), new_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); } - return Status::OK(); } -Status GetDBOptionsFromString( - const DBOptions& base_options, - const std::string& opts_str, - DBOptions* new_options) { +Status GetDBOptionsFromString(const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options) { + ConfigOptions config_options(base_options); + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + + return GetDBOptionsFromString(config_options, base_options, opts_str, + new_options); +} + +Status GetDBOptionsFromString(const ConfigOptions& config_options, + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options) { std::unordered_map opts_map; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { *new_options = base_options; return s; } - return GetDBOptionsFromMap(base_options, opts_map, new_options); + return GetDBOptionsFromMap(config_options, base_options, opts_map, + new_options); } Status GetOptionsFromString(const Options& base_options, const std::string& opts_str, Options* new_options) { + ConfigOptions config_options(base_options); + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + + return GetOptionsFromString(config_options, base_options, opts_str, + new_options); +} + +Status GetOptionsFromString(const ConfigOptions& config_options, + const Options& base_options, + const std::string& opts_str, Options* new_options) { + ColumnFamilyOptions new_cf_options; + std::unordered_map unused_opts; std::unordered_map opts_map; + + assert(new_options); + *new_options 
= base_options; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { return s; } - DBOptions new_db_options(base_options); - ColumnFamilyOptions new_cf_options(base_options); - for (const auto& o : opts_map) { - if (ParseDBOption(o.first, o.second, &new_db_options).ok()) { - } else if (ParseColumnFamilyOption( - o.first, o.second, &new_cf_options).ok()) { + auto config = DBOptionsAsConfigurable(base_options); + s = config->ConfigureFromMap(config_options, opts_map, &unused_opts); + + if (s.ok()) { + DBOptions* new_db_options = + config->GetOptions(OptionsHelper::kDBOptionsName); + if (!unused_opts.empty()) { + s = GetColumnFamilyOptionsFromMap(config_options, base_options, + unused_opts, &new_cf_options); + if (s.ok()) { + *new_options = Options(*new_db_options, new_cf_options); + } } else { - return Status::InvalidArgument("Can't parse option " + o.first); + *new_options = Options(*new_db_options, base_options); } } - *new_options = Options(new_db_options, new_cf_options); - return Status::OK(); -} - -Status GetTableFactoryFromMap( - const std::string& factory_name, - const std::unordered_map& opt_map, - std::shared_ptr* table_factory, bool ignore_unknown_options) { - Status s; - if (factory_name == BlockBasedTableFactory().Name()) { - BlockBasedTableOptions bbt_opt; - s = GetBlockBasedTableOptionsFromMap(BlockBasedTableOptions(), opt_map, - &bbt_opt, - true, /* input_strings_escaped */ - ignore_unknown_options); - if (!s.ok()) { - return s; - } - table_factory->reset(new BlockBasedTableFactory(bbt_opt)); - return Status::OK(); - } else if (factory_name == PlainTableFactory().Name()) { - PlainTableOptions pt_opt; - s = GetPlainTableOptionsFromMap(PlainTableOptions(), opt_map, &pt_opt, - true, /* input_strings_escaped */ - ignore_unknown_options); - if (!s.ok()) { - return s; - } - table_factory->reset(new PlainTableFactory(pt_opt)); - return Status::OK(); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || 
s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); } - // Return OK for not supported table factories as TableFactory - // Deserialization is optional. - table_factory->reset(); - return Status::OK(); } -std::unordered_map - OptionsHelper::db_options_type_info = { - /* - // not yet supported - std::shared_ptr row_cache; - std::shared_ptr delete_scheduler; - std::shared_ptr info_log; - std::shared_ptr rate_limiter; - std::shared_ptr statistics; - std::vector db_paths; - std::vector> listeners; - */ - {"advise_random_on_open", - {offsetof(struct DBOptions, advise_random_on_open), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"allow_mmap_reads", - {offsetof(struct DBOptions, allow_mmap_reads), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_fallocate", - {offsetof(struct DBOptions, allow_fallocate), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_mmap_writes", - {offsetof(struct DBOptions, allow_mmap_writes), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_direct_reads", - {offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_direct_writes", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"use_direct_io_for_flush_and_compaction", - {offsetof(struct DBOptions, use_direct_io_for_flush_and_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"allow_2pc", - {offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_os_buffer", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, - 0}}, - {"create_if_missing", - {offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"create_missing_column_families", - {offsetof(struct DBOptions, 
create_missing_column_families), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"disableDataSync", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"disable_data_sync", // for compatibility - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"enable_thread_tracking", - {offsetof(struct DBOptions, enable_thread_tracking), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"error_if_exists", - {offsetof(struct DBOptions, error_if_exists), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"is_fd_close_on_exec", - {offsetof(struct DBOptions, is_fd_close_on_exec), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"paranoid_checks", - {offsetof(struct DBOptions, paranoid_checks), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"skip_log_error_on_recovery", - {offsetof(struct DBOptions, skip_log_error_on_recovery), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_stats_update_on_db_open", - {offsetof(struct DBOptions, skip_stats_update_on_db_open), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_checking_sst_file_sizes_on_db_open", - {offsetof(struct DBOptions, skip_checking_sst_file_sizes_on_db_open), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"new_table_reader_for_compaction_inputs", - {offsetof(struct DBOptions, new_table_reader_for_compaction_inputs), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"compaction_readahead_size", - {offsetof(struct DBOptions, compaction_readahead_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, compaction_readahead_size)}}, - {"random_access_max_buffer_size", - {offsetof(struct DBOptions, random_access_max_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - 
{"use_adaptive_mutex", - {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_fsync", - {offsetof(struct DBOptions, use_fsync), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"max_background_jobs", - {offsetof(struct DBOptions, max_background_jobs), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_background_jobs)}}, - {"max_background_compactions", - {offsetof(struct DBOptions, max_background_compactions), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_background_compactions)}}, - {"base_background_compactions", - {offsetof(struct DBOptions, base_background_compactions), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, base_background_compactions)}}, - {"max_background_flushes", - {offsetof(struct DBOptions, max_background_flushes), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"max_file_opening_threads", - {offsetof(struct DBOptions, max_file_opening_threads), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"max_open_files", - {offsetof(struct DBOptions, max_open_files), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_open_files)}}, - {"table_cache_numshardbits", - {offsetof(struct DBOptions, table_cache_numshardbits), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"db_write_buffer_size", - {offsetof(struct DBOptions, db_write_buffer_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"keep_log_file_num", - {offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"recycle_log_file_num", - {offsetof(struct DBOptions, recycle_log_file_num), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - 
{"log_file_time_to_roll", - {offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"manifest_preallocation_size", - {offsetof(struct DBOptions, manifest_preallocation_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"max_log_file_size", - {offsetof(struct DBOptions, max_log_file_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"db_log_dir", - {offsetof(struct DBOptions, db_log_dir), OptionType::kString, - OptionVerificationType::kNormal, false, 0}}, - {"wal_dir", - {offsetof(struct DBOptions, wal_dir), OptionType::kString, - OptionVerificationType::kNormal, false, 0}}, - {"max_subcompactions", - {offsetof(struct DBOptions, max_subcompactions), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"WAL_size_limit_MB", - {offsetof(struct DBOptions, WAL_size_limit_MB), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"WAL_ttl_seconds", - {offsetof(struct DBOptions, WAL_ttl_seconds), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"bytes_per_sync", - {offsetof(struct DBOptions, bytes_per_sync), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, bytes_per_sync)}}, - {"delayed_write_rate", - {offsetof(struct DBOptions, delayed_write_rate), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, delayed_write_rate)}}, - {"delete_obsolete_files_period_micros", - {offsetof(struct DBOptions, delete_obsolete_files_period_micros), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, - delete_obsolete_files_period_micros)}}, - {"max_manifest_file_size", - {offsetof(struct DBOptions, max_manifest_file_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"max_total_wal_size", - {offsetof(struct DBOptions, max_total_wal_size), 
OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_total_wal_size)}}, - {"wal_bytes_per_sync", - {offsetof(struct DBOptions, wal_bytes_per_sync), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, wal_bytes_per_sync)}}, - {"strict_bytes_per_sync", - {offsetof(struct DBOptions, strict_bytes_per_sync), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, strict_bytes_per_sync)}}, - {"stats_dump_period_sec", - {offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, stats_dump_period_sec)}}, - {"stats_persist_period_sec", - {offsetof(struct DBOptions, stats_persist_period_sec), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, stats_persist_period_sec)}}, - {"persist_stats_to_disk", - {offsetof(struct DBOptions, persist_stats_to_disk), - OptionType::kBoolean, OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, persist_stats_to_disk)}}, - {"stats_history_buffer_size", - {offsetof(struct DBOptions, stats_history_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, stats_history_buffer_size)}}, - {"fail_if_options_file_error", - {offsetof(struct DBOptions, fail_if_options_file_error), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"enable_pipelined_write", - {offsetof(struct DBOptions, enable_pipelined_write), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"unordered_write", - {offsetof(struct DBOptions, unordered_write), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_concurrent_memtable_write", - {offsetof(struct DBOptions, allow_concurrent_memtable_write), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - 
{"wal_recovery_mode", - {offsetof(struct DBOptions, wal_recovery_mode), - OptionType::kWALRecoveryMode, OptionVerificationType::kNormal, false, - 0}}, - {"enable_write_thread_adaptive_yield", - {offsetof(struct DBOptions, enable_write_thread_adaptive_yield), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"write_thread_slow_yield_usec", - {offsetof(struct DBOptions, write_thread_slow_yield_usec), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"max_write_batch_group_size_bytes", - {offsetof(struct DBOptions, max_write_batch_group_size_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"write_thread_max_yield_usec", - {offsetof(struct DBOptions, write_thread_max_yield_usec), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"access_hint_on_compaction_start", - {offsetof(struct DBOptions, access_hint_on_compaction_start), - OptionType::kAccessHint, OptionVerificationType::kNormal, false, 0}}, - {"info_log_level", - {offsetof(struct DBOptions, info_log_level), OptionType::kInfoLogLevel, - OptionVerificationType::kNormal, false, 0}}, - {"dump_malloc_stats", - {offsetof(struct DBOptions, dump_malloc_stats), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"avoid_flush_during_recovery", - {offsetof(struct DBOptions, avoid_flush_during_recovery), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"avoid_flush_during_shutdown", - {offsetof(struct DBOptions, avoid_flush_during_shutdown), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}}, - {"writable_file_max_buffer_size", - {offsetof(struct DBOptions, writable_file_max_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, writable_file_max_buffer_size)}}, - {"allow_ingest_behind", - {offsetof(struct DBOptions, allow_ingest_behind), 
OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, allow_ingest_behind)}}, - {"preserve_deletes", - {offsetof(struct DBOptions, preserve_deletes), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, preserve_deletes)}}, - {"concurrent_prepare", // Deprecated by two_write_queues - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"two_write_queues", - {offsetof(struct DBOptions, two_write_queues), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, two_write_queues)}}, - {"manual_wal_flush", - {offsetof(struct DBOptions, manual_wal_flush), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, manual_wal_flush)}}, - {"seq_per_batch", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"atomic_flush", - {offsetof(struct DBOptions, atomic_flush), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, atomic_flush)}}, - {"avoid_unnecessary_blocking_io", - {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), - OptionType::kBoolean, OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}}, - {"write_dbid_to_manifest", - {offsetof(struct DBOptions, write_dbid_to_manifest), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"log_readahead_size", - {offsetof(struct DBOptions, log_readahead_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, -}; - -std::unordered_map - OptionsHelper::block_base_table_index_type_string_map = { - {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, - {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, - {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, - 
{"kBinarySearchWithFirstKey", - BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; - -std::unordered_map - OptionsHelper::block_base_table_data_block_index_type_string_map = { - {"kDataBlockBinarySearch", - BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, - {"kDataBlockBinaryAndHash", - BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; - -std::unordered_map - OptionsHelper::block_base_table_index_shortening_mode_string_map = { - {"kNoShortening", - BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, - {"kShortenSeparators", - BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, - {"kShortenSeparatorsAndSuccessor", - BlockBasedTableOptions::IndexShorteningMode:: - kShortenSeparatorsAndSuccessor}}; - std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, {"kPrefix", kPrefix}}; @@ -1727,398 +824,574 @@ {"kOldestSmallestSeqFirst", kOldestSmallestSeqFirst}, {"kMinOverlappingRatio", kMinOverlappingRatio}}; -std::unordered_map - OptionsHelper::wal_recovery_mode_string_map = { - {"kTolerateCorruptedTailRecords", - WALRecoveryMode::kTolerateCorruptedTailRecords}, - {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency}, - {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery}, - {"kSkipAnyCorruptedRecords", - WALRecoveryMode::kSkipAnyCorruptedRecords}}; - -std::unordered_map - OptionsHelper::access_hint_string_map = { - {"NONE", DBOptions::AccessHint::NONE}, - {"NORMAL", DBOptions::AccessHint::NORMAL}, - {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, - {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; - -std::unordered_map - OptionsHelper::info_log_level_string_map = { - {"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL}, - {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL}, - {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL}, - {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL}, - {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, - {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; - 
-ColumnFamilyOptions OptionsHelper::dummy_cf_options; -CompactionOptionsFIFO OptionsHelper::dummy_comp_options; -LRUCacheOptions OptionsHelper::dummy_lru_cache_options; -CompactionOptionsUniversal OptionsHelper::dummy_comp_options_universal; - -// offset_of is used to get the offset of a class data member -// ex: offset_of(&ColumnFamilyOptions::num_levels) -// This call will return the offset of num_levels in ColumnFamilyOptions class -// -// This is the same as offsetof() but allow us to work with non standard-layout -// classes and structures -// refs: -// http://en.cppreference.com/w/cpp/concept/StandardLayoutType -// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 -template -int offset_of(T1 ColumnFamilyOptions::*member) { - return int(size_t(&(OptionsHelper::dummy_cf_options.*member)) - - size_t(&OptionsHelper::dummy_cf_options)); -} -template -int offset_of(T1 AdvancedColumnFamilyOptions::*member) { - return int(size_t(&(OptionsHelper::dummy_cf_options.*member)) - - size_t(&OptionsHelper::dummy_cf_options)); -} -template -int offset_of(T1 CompactionOptionsFIFO::*member) { - return int(size_t(&(OptionsHelper::dummy_comp_options.*member)) - - size_t(&OptionsHelper::dummy_comp_options)); -} -template -int offset_of(T1 LRUCacheOptions::*member) { - return int(size_t(&(OptionsHelper::dummy_lru_cache_options.*member)) - - size_t(&OptionsHelper::dummy_lru_cache_options)); -} -template -int offset_of(T1 CompactionOptionsUniversal::*member) { - return int(size_t(&(OptionsHelper::dummy_comp_options_universal.*member)) - - size_t(&OptionsHelper::dummy_comp_options_universal)); -} - -std::unordered_map - OptionsHelper::cf_options_type_info = { - /* not yet supported - CompressionOptions compression_opts; - TablePropertiesCollectorFactories table_properties_collector_factories; - typedef std::vector> - TablePropertiesCollectorFactories; - UpdateStatus (*inplace_callback)(char* existing_value, - uint34_t* existing_value_size, - Slice delta_value, - std::string* 
merged_value); - std::vector cf_paths; - */ - {"report_bg_io_stats", - {offset_of(&ColumnFamilyOptions::report_bg_io_stats), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, report_bg_io_stats)}}, - {"compaction_measure_io_stats", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"disable_auto_compactions", - {offset_of(&ColumnFamilyOptions::disable_auto_compactions), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, disable_auto_compactions)}}, - {"filter_deletes", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, - 0}}, - {"inplace_update_support", - {offset_of(&ColumnFamilyOptions::inplace_update_support), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"level_compaction_dynamic_level_bytes", - {offset_of(&ColumnFamilyOptions::level_compaction_dynamic_level_bytes), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"optimize_filters_for_hits", - {offset_of(&ColumnFamilyOptions::optimize_filters_for_hits), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"paranoid_file_checks", - {offset_of(&ColumnFamilyOptions::paranoid_file_checks), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, paranoid_file_checks)}}, - {"force_consistency_checks", - {offset_of(&ColumnFamilyOptions::force_consistency_checks), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"purge_redundant_kvs_while_flush", - {offset_of(&ColumnFamilyOptions::purge_redundant_kvs_while_flush), - OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, - {"verify_checksums_in_compaction", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, - 0}}, - {"soft_pending_compaction_bytes_limit", - {offset_of(&ColumnFamilyOptions::soft_pending_compaction_bytes_limit), - OptionType::kUInt64T, 
OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - soft_pending_compaction_bytes_limit)}}, - {"hard_pending_compaction_bytes_limit", - {offset_of(&ColumnFamilyOptions::hard_pending_compaction_bytes_limit), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - hard_pending_compaction_bytes_limit)}}, - {"hard_rate_limit", - {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, - 0}}, - {"soft_rate_limit", - {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, - 0}}, - {"max_compaction_bytes", - {offset_of(&ColumnFamilyOptions::max_compaction_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_compaction_bytes)}}, - {"expanded_compaction_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"level0_file_num_compaction_trigger", - {offset_of(&ColumnFamilyOptions::level0_file_num_compaction_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - level0_file_num_compaction_trigger)}}, - {"level0_slowdown_writes_trigger", - {offset_of(&ColumnFamilyOptions::level0_slowdown_writes_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger)}}, - {"level0_stop_writes_trigger", - {offset_of(&ColumnFamilyOptions::level0_stop_writes_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, level0_stop_writes_trigger)}}, - {"max_grandparent_overlap_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"max_mem_compaction_level", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, false, 0}}, - {"max_write_buffer_number", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, 
max_write_buffer_number)}}, - {"max_write_buffer_number_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"max_write_buffer_size_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_size_to_maintain), - OptionType::kInt64T, OptionVerificationType::kNormal, false, 0}}, - {"min_write_buffer_number_to_merge", - {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"num_levels", - {offset_of(&ColumnFamilyOptions::num_levels), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"source_compaction_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"target_file_size_multiplier", - {offset_of(&ColumnFamilyOptions::target_file_size_multiplier), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, target_file_size_multiplier)}}, - {"arena_block_size", - {offset_of(&ColumnFamilyOptions::arena_block_size), OptionType::kSizeT, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, arena_block_size)}}, - {"inplace_update_num_locks", - {offset_of(&ColumnFamilyOptions::inplace_update_num_locks), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, inplace_update_num_locks)}}, - {"max_successive_merges", - {offset_of(&ColumnFamilyOptions::max_successive_merges), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_successive_merges)}}, - {"memtable_huge_page_size", - {offset_of(&ColumnFamilyOptions::memtable_huge_page_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_huge_page_size)}}, - {"memtable_prefix_bloom_huge_page_tlb_size", - {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, true, 0}}, - 
{"write_buffer_size", - {offset_of(&ColumnFamilyOptions::write_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, write_buffer_size)}}, - {"bloom_locality", - {offset_of(&ColumnFamilyOptions::bloom_locality), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"memtable_prefix_bloom_bits", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, - 0}}, - {"memtable_prefix_bloom_size_ratio", - {offset_of(&ColumnFamilyOptions::memtable_prefix_bloom_size_ratio), - OptionType::kDouble, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio)}}, - {"memtable_prefix_bloom_probes", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, - 0}}, - {"memtable_whole_key_filtering", - {offset_of(&ColumnFamilyOptions::memtable_whole_key_filtering), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_whole_key_filtering)}}, - {"min_partial_merge_operands", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, - 0}}, - {"max_bytes_for_level_base", - {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, - {"snap_refresh_nanos", - {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, true, - 0}}, - {"max_bytes_for_level_multiplier", - {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), - OptionType::kDouble, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier)}}, - {"max_bytes_for_level_multiplier_additional", - {offset_of( - &ColumnFamilyOptions::max_bytes_for_level_multiplier_additional), - OptionType::kVectorInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - max_bytes_for_level_multiplier_additional)}}, - 
{"max_sequential_skip_in_iterations", - {offset_of(&ColumnFamilyOptions::max_sequential_skip_in_iterations), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - max_sequential_skip_in_iterations)}}, - {"target_file_size_base", - {offset_of(&ColumnFamilyOptions::target_file_size_base), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, target_file_size_base)}}, - {"rate_limit_delay_max_milliseconds", - {0, OptionType::kUInt, OptionVerificationType::kDeprecated, false, 0}}, - {"compression", - {offset_of(&ColumnFamilyOptions::compression), - OptionType::kCompressionType, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, compression)}}, - {"compression_per_level", - {offset_of(&ColumnFamilyOptions::compression_per_level), - OptionType::kVectorCompressionType, OptionVerificationType::kNormal, - false, 0}}, - {"bottommost_compression", - {offset_of(&ColumnFamilyOptions::bottommost_compression), - OptionType::kCompressionType, OptionVerificationType::kNormal, false, - 0}}, - {kNameComparator, - {offset_of(&ColumnFamilyOptions::comparator), OptionType::kComparator, - OptionVerificationType::kByName, false, 0}}, - {"prefix_extractor", - {offset_of(&ColumnFamilyOptions::prefix_extractor), - OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, - true, offsetof(struct MutableCFOptions, prefix_extractor)}}, - {"memtable_insert_with_hint_prefix_extractor", - {offset_of( - &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), - OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, - false, 0}}, - {"memtable_factory", - {offset_of(&ColumnFamilyOptions::memtable_factory), - OptionType::kMemTableRepFactory, OptionVerificationType::kByName, - false, 0}}, - {"table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kTableFactory, OptionVerificationType::kByName, false, - 0}}, - 
{"compaction_filter", - {offset_of(&ColumnFamilyOptions::compaction_filter), - OptionType::kCompactionFilter, OptionVerificationType::kByName, false, - 0}}, - {"compaction_filter_factory", - {offset_of(&ColumnFamilyOptions::compaction_filter_factory), - OptionType::kCompactionFilterFactory, OptionVerificationType::kByName, - false, 0}}, - {kNameMergeOperator, - {offset_of(&ColumnFamilyOptions::merge_operator), - OptionType::kMergeOperator, - OptionVerificationType::kByNameAllowFromNull, false, 0}}, - {"compaction_style", - {offset_of(&ColumnFamilyOptions::compaction_style), - OptionType::kCompactionStyle, OptionVerificationType::kNormal, false, - 0}}, - {"compaction_pri", - {offset_of(&ColumnFamilyOptions::compaction_pri), - OptionType::kCompactionPri, OptionVerificationType::kNormal, false, - 0}}, - {"compaction_options_fifo", - {offset_of(&ColumnFamilyOptions::compaction_options_fifo), - OptionType::kCompactionOptionsFIFO, OptionVerificationType::kNormal, - true, offsetof(struct MutableCFOptions, compaction_options_fifo)}}, - {"compaction_options_universal", - {offset_of(&ColumnFamilyOptions::compaction_options_universal), - OptionType::kCompactionOptionsUniversal, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, compaction_options_universal)}}, - {"ttl", - {offset_of(&ColumnFamilyOptions::ttl), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, ttl)}}, - {"periodic_compaction_seconds", - {offset_of(&ColumnFamilyOptions::periodic_compaction_seconds), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, periodic_compaction_seconds)}}, - {"sample_for_compression", - {offset_of(&ColumnFamilyOptions::sample_for_compression), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, sample_for_compression)}}}; - -std::unordered_map - OptionsHelper::fifo_compaction_options_type_info = { - 
{"max_table_files_size", - {offset_of(&CompactionOptionsFIFO::max_table_files_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct CompactionOptionsFIFO, max_table_files_size)}}, - {"ttl", - {0, OptionType::kUInt64T, - OptionVerificationType::kDeprecated, false, - 0}}, - {"allow_compaction", - {offset_of(&CompactionOptionsFIFO::allow_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct CompactionOptionsFIFO, allow_compaction)}}}; - -std::unordered_map - OptionsHelper::universal_compaction_options_type_info = { - {"size_ratio", - {offset_of(&CompactionOptionsUniversal::size_ratio), OptionType::kUInt, - OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, size_ratio)}}, - {"min_merge_width", - {offset_of(&CompactionOptionsUniversal::min_merge_width), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, min_merge_width)}}, - {"max_merge_width", - {offset_of(&CompactionOptionsUniversal::max_merge_width), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, max_merge_width)}}, - {"max_size_amplification_percent", - {offset_of( - &CompactionOptionsUniversal::max_size_amplification_percent), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, - max_size_amplification_percent)}}, - {"compression_size_percent", - {offset_of(&CompactionOptionsUniversal::compression_size_percent), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, - compression_size_percent)}}, - {"stop_style", - {offset_of(&CompactionOptionsUniversal::stop_style), - OptionType::kCompactionStopStyle, OptionVerificationType::kNormal, - true, offsetof(class CompactionOptionsUniversal, stop_style)}}, - {"allow_trivial_move", - {offset_of(&CompactionOptionsUniversal::allow_trivial_move), - 
OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, allow_trivial_move)}}}; - std::unordered_map OptionsHelper::compaction_stop_style_string_map = { {"kCompactionStopStyleSimilarSize", kCompactionStopStyleSimilarSize}, {"kCompactionStopStyleTotalSize", kCompactionStopStyleTotalSize}}; -std::unordered_map - OptionsHelper::lru_cache_options_type_info = { - {"capacity", - {offset_of(&LRUCacheOptions::capacity), OptionType::kSizeT, - OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, capacity)}}, - {"num_shard_bits", - {offset_of(&LRUCacheOptions::num_shard_bits), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, num_shard_bits)}}, - {"strict_capacity_limit", - {offset_of(&LRUCacheOptions::strict_capacity_limit), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, strict_capacity_limit)}}, - {"high_pri_pool_ratio", - {offset_of(&LRUCacheOptions::high_pri_pool_ratio), OptionType::kDouble, - OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, high_pri_pool_ratio)}}}; +Status OptionTypeInfo::NextToken(const std::string& opts, char delimiter, + size_t pos, size_t* end, std::string* token) { + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + // Empty value at the end + if (pos >= opts.size()) { + *token = ""; + *end = std::string::npos; + return Status::OK(); + } else if (opts[pos] == '{') { + int count = 1; + size_t brace_pos = pos + 1; + while (brace_pos < opts.size()) { + if (opts[brace_pos] == '{') { + ++count; + } else if (opts[brace_pos] == '}') { + --count; + if (count == 0) { + break; + } + } + ++brace_pos; + } + // found the matching closing brace + if (count == 0) { + *token = trim(opts.substr(pos + 1, brace_pos - pos - 1)); + // skip all whitespace and move to the next delimiter + // brace_pos points to the next position after the matching '}' + pos = brace_pos 
+ 1; + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + if (pos < opts.size() && opts[pos] != delimiter) { + return Status::InvalidArgument("Unexpected chars after nested options"); + } + *end = pos; + } else { + return Status::InvalidArgument( + "Mismatched curly braces for nested options"); + } + } else { + *end = opts.find(delimiter, pos); + if (*end == std::string::npos) { + // It either ends with a trailing semi-colon or the last key-value pair + *token = trim(opts.substr(pos)); + } else { + *token = trim(opts.substr(pos, *end - pos)); + } + } + return Status::OK(); +} + +Status OptionTypeInfo::Parse(const ConfigOptions& config_options, + const std::string& opt_name, + const std::string& value, void* opt_ptr) const { + if (IsDeprecated()) { + return Status::OK(); + } + try { + void* opt_addr = static_cast(opt_ptr) + offset_; + const std::string& opt_value = config_options.input_strings_escaped + ? UnescapeOptionString(value) + : value; + + if (opt_addr == nullptr) { + return Status::NotFound("Could not find option", opt_name); + } else if (parse_func_ != nullptr) { + ConfigOptions copy = config_options; + copy.invoke_prepare_options = false; + return parse_func_(copy, opt_name, opt_value, opt_addr); + } else if (ParseOptionHelper(opt_addr, type_, opt_value)) { + return Status::OK(); + } else if (IsConfigurable()) { + // The option is . 
+ Configurable* config = AsRawPointer(opt_ptr); + if (opt_value.empty()) { + return Status::OK(); + } else if (config == nullptr) { + return Status::NotFound("Could not find configurable: ", opt_name); + } else { + ConfigOptions copy = config_options; + copy.ignore_unknown_options = false; + copy.invoke_prepare_options = false; + if (opt_value.find("=") != std::string::npos) { + return config->ConfigureFromString(copy, opt_value); + } else { + return config->ConfigureOption(copy, opt_name, opt_value); + } + } + } else if (IsByName()) { + return Status::NotSupported("Deserializing the option " + opt_name + + " is not supported"); + } else { + return Status::InvalidArgument("Error parsing:", opt_name); + } + } catch (std::exception& e) { + return Status::InvalidArgument("Error parsing " + opt_name + ":" + + std::string(e.what())); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + std::unordered_map opts_map; + Status status = StringToMap(opts_str, &opts_map); + if (!status.ok()) { + return status; + } else { + return ParseType(config_options, opts_map, type_map, opt_addr, unused); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + for (const auto& opts_iter : opts_map) { + std::string opt_name; + const auto* opt_info = Find(opts_iter.first, type_map, &opt_name); + if (opt_info != nullptr) { + Status status = + opt_info->Parse(config_options, opt_name, opts_iter.second, opt_addr); + if (!status.ok()) { + return status; + } + } else if (unused != nullptr) { + (*unused)[opts_iter.first] = opts_iter.second; + } else if (!config_options.ignore_unknown_options) { + return Status::NotFound("Unrecognized option", opts_iter.first); + } + } + return Status::OK(); +} + 
+Status OptionTypeInfo::ParseStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const std::string& opt_value, void* opt_addr) { + assert(struct_map); + Status status; + if (opt_name == struct_name || EndsWith(opt_name, "." + struct_name)) { + // This option represents the entire struct + std::unordered_map unused; + status = + ParseType(config_options, opt_value, *struct_map, opt_addr, &unused); + if (status.ok() && !unused.empty()) { + status = Status::InvalidArgument( + "Unrecognized option", struct_name + "." + unused.begin()->first); + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr); + } else { + status = Status::InvalidArgument("Unrecognized option", opt_name); + } + } else { + // This option represents a field in the struct (e.g. field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr); + } else { + status = Status::InvalidArgument("Unrecognized option", + struct_name + "." + opt_name); + } + } + return status; +} +Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const opt_ptr, + std::string* opt_value) const { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. 
+ const void* opt_addr = static_cast(opt_ptr) + offset_; + if (opt_addr == nullptr || IsDeprecated()) { + return Status::OK(); + } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { + return Status::NotSupported("Cannot serialize option: ", opt_name); + } else if (serialize_func_ != nullptr) { + return serialize_func_(config_options, opt_name, opt_addr, opt_value); + } else if (IsCustomizable()) { + const Customizable* custom = AsRawPointer(opt_ptr); + opt_value->clear(); + if (custom == nullptr) { + // We do not have a custom object to serialize. + // If the option is not mutable and we are doing only mutable options, + // we return an empty string (which will cause the option not to be + // printed). Otherwise, we return the "nullptr" string, which will result + // in "option=nullptr" being printed. + if (IsMutable() || !config_options.mutable_options_only) { + *opt_value = kNullptrString; + } else { + *opt_value = ""; + } + } else if (IsEnabled(OptionTypeFlags::kStringNameOnly) && + !config_options.IsDetailed()) { + if (!config_options.mutable_options_only || IsMutable()) { + *opt_value = custom->GetId(); + } + } else { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + // If this option is mutable, everything inside it should be considered + // mutable + if (IsMutable()) { + embedded.mutable_options_only = false; + } + std::string value = custom->ToString(embedded); + if (!embedded.mutable_options_only || + value.find("=") != std::string::npos) { + *opt_value = value; + } else { + *opt_value = ""; + } + } + return Status::OK(); + } else if (IsConfigurable()) { + const Configurable* config = AsRawPointer(opt_ptr); + if (config != nullptr) { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + *opt_value = config->ToString(embedded); + } + return Status::OK(); + } else if (config_options.mutable_options_only && !IsMutable()) { + return Status::OK(); + } else if (SerializeSingleOptionHelper(opt_addr, type_, opt_value)) 
{ + return Status::OK(); + } else { + return Status::InvalidArgument("Cannot serialize option: ", opt_name); + } +} + +Status OptionTypeInfo::SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* result) { + Status status; + for (const auto& iter : type_map) { + std::string single; + const auto& opt_info = iter.second; + if (opt_info.ShouldSerialize()) { + status = + opt_info.Serialize(config_options, iter.first, opt_addr, &single); + if (!status.ok()) { + return status; + } else { + result->append(iter.first + "=" + single + config_options.delimiter); + } + } + } + return status; +} + +Status OptionTypeInfo::SerializeStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const void* opt_addr, std::string* value) { + assert(struct_map); + Status status; + if (EndsWith(opt_name, struct_name)) { + // We are going to write the struct as "{ prop1=value1; prop2=value2;}. + // Set the delimiter to ";" so that the everything will be on one line. + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + + // This option represents the entire struct + std::string result; + status = SerializeType(embedded, *struct_map, opt_addr, &result); + if (!status.ok()) { + return status; + } else { + *value = "{" + result + "}"; + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Serialize(config_options, elem_name, opt_addr, value); + } else { + status = Status::InvalidArgument("Unrecognized option", opt_name); + } + } else { + // This option represents a field in the struct (e.g. 
field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + if (opt_info == nullptr) { + status = Status::InvalidArgument("Unrecognized option", opt_name); + } else if (opt_info->ShouldSerialize()) { + status = opt_info->Serialize(config_options, opt_name + "." + elem_name, + opt_addr, value); + } + } + return status; +} + +template +bool IsOptionEqual(const void* offset1, const void* offset2) { + return (*static_cast(offset1) == *static_cast(offset2)); +} + +static bool AreEqualDoubles(const double a, const double b) { + return (fabs(a - b) < 0.00001); +} + +static bool AreOptionsEqual(OptionType type, const void* this_offset, + const void* that_offset) { + switch (type) { + case OptionType::kBoolean: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kInt: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kUInt: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kInt32T: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kInt64T: { + int64_t v1, v2; + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); + return (v1 == v2); + } + case OptionType::kUInt8T: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kUInt32T: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kUInt64T: { + uint64_t v1, v2; + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); + return (v1 == v2); + } + case OptionType::kSizeT: { + size_t v1, v2; + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); + return (v1 == v2); + } + case OptionType::kString: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kDouble: + return AreEqualDoubles(*static_cast(this_offset), + *static_cast(that_offset)); + case OptionType::kCompactionStyle: + return IsOptionEqual(this_offset, that_offset); + case 
OptionType::kCompactionStopStyle: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kCompactionPri: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kCompressionType: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kChecksumType: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kEncodingType: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kEncodedString: + return IsOptionEqual(this_offset, that_offset); + default: + return false; + } // End switch +} + +bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const { + auto level = GetSanityLevel(); + if (!config_options.IsCheckEnabled(level)) { + return true; // If the sanity level is not being checked, skip it + } + const void* this_addr = static_cast(this_ptr) + offset_; + const void* that_addr = static_cast(that_ptr) + offset_; + if (this_addr == nullptr || that_addr == nullptr) { + if (this_addr == that_addr) { + return true; + } + } else if (equals_func_ != nullptr) { + if (equals_func_(config_options, opt_name, this_addr, that_addr, + mismatch)) { + return true; + } + } else if (AreOptionsEqual(type_, this_addr, that_addr)) { + return true; + } else if (IsConfigurable()) { + const auto* this_config = AsRawPointer(this_ptr); + const auto* that_config = AsRawPointer(that_ptr); + if (this_config == that_config) { + return true; + } else if (this_config != nullptr && that_config != nullptr) { + std::string bad_name; + bool matches; + if (level < config_options.sanity_level) { + ConfigOptions copy = config_options; + copy.sanity_level = level; + matches = this_config->AreEquivalent(copy, that_config, &bad_name); + } else { + matches = + this_config->AreEquivalent(config_options, that_config, &bad_name); + } + if (!matches) { + *mismatch = opt_name + "." 
+ bad_name; + } + return matches; + } + } + if (mismatch->empty()) { + *mismatch = opt_name; + } + return false; +} + +bool OptionTypeInfo::TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* this_addr, const void* that_addr, std::string* mismatch) { + for (const auto& iter : type_map) { + const auto& opt_info = iter.second; + if (!opt_info.AreEqual(config_options, iter.first, this_addr, that_addr, + mismatch)) { + return false; + } + } + return true; +} + +bool OptionTypeInfo::StructsAreEqual( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const void* this_addr, const void* that_addr, + std::string* mismatch) { + assert(struct_map); + bool matches = true; + std::string result; + if (EndsWith(opt_name, struct_name)) { + // This option represents the entire struct + matches = TypesAreEqual(config_options, *struct_map, this_addr, that_addr, + &result); + if (!matches) { + *mismatch = struct_name + "." + result; + return false; + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + assert(opt_info); + if (opt_info == nullptr) { + *mismatch = opt_name; + matches = false; + } else if (!opt_info->AreEqual(config_options, elem_name, this_addr, + that_addr, &result)) { + matches = false; + *mismatch = struct_name + "." + result; + } + } else { + // This option represents a field in the struct (e.g. field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + assert(opt_info); + if (opt_info == nullptr) { + *mismatch = struct_name + "." 
+ opt_name; + matches = false; + } else if (!opt_info->AreEqual(config_options, elem_name, this_addr, + that_addr, &result)) { + matches = false; + *mismatch = struct_name + "." + result; + } + } + return matches; +} + +bool MatchesOptionsTypeFromMap( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* const this_ptr, const void* const that_ptr, + std::string* mismatch) { + for (auto& pair : type_map) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + if (config_options.IsCheckEnabled(pair.second.GetSanityLevel())) { + if (!pair.second.AreEqual(config_options, pair.first, this_ptr, that_ptr, + mismatch) && + !pair.second.AreEqualByName(config_options, pair.first, this_ptr, + that_ptr)) { + return false; + } + } + } + return true; +} + +bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const this_ptr, + const void* const that_ptr) const { + if (IsByName()) { + std::string that_value; + if (Serialize(config_options, opt_name, that_ptr, &that_value).ok()) { + return AreEqualByName(config_options, opt_name, this_ptr, that_value); + } + } + return false; +} + +bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const opt_ptr, + const std::string& that_value) const { + std::string this_value; + if (!IsByName()) { + return false; + } else if (!Serialize(config_options, opt_name, opt_ptr, &this_value).ok()) { + return false; + } else if (IsEnabled(OptionVerificationType::kByNameAllowFromNull)) { + if (that_value == kNullptrString) { + return true; + } + } else if (IsEnabled(OptionVerificationType::kByNameAllowNull)) { + if (that_value == kNullptrString) { + return true; + } + } + return (this_value == that_value); +} + +const OptionTypeInfo* OptionTypeInfo::Find( + const std::string& opt_name, + const std::unordered_map& opt_map, + 
std::string* elem_name) { + const auto iter = opt_map.find(opt_name); // Look up the value in the map + if (iter != opt_map.end()) { // Found the option in the map + *elem_name = opt_name; // Return the name + return &(iter->second); // Return the contents of the iterator + } else { + auto idx = opt_name.find("."); // Look for a separator + if (idx > 0 && idx != std::string::npos) { // We found a separator + auto siter = + opt_map.find(opt_name.substr(0, idx)); // Look for the short name + if (siter != opt_map.end()) { // We found the short name + if (siter->second.IsStruct() || // If the object is a struct + siter->second.IsConfigurable()) { // or a Configurable + *elem_name = opt_name.substr(idx + 1); // Return the rest + return &(siter->second); // Return the contents of the iterator + } + } + } + } + return nullptr; +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,14 +10,36 @@ #include #include -#include "options/cf_options.h" -#include "options/db_options.h" #include "rocksdb/options.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "rocksdb/universal_compaction.h" namespace ROCKSDB_NAMESPACE { +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; +struct ImmutableCFOptions; +struct ImmutableDBOptions; +struct MutableDBOptions; +struct MutableCFOptions; +struct Options; + +std::vector GetSupportedCompressions(); + +std::vector GetSupportedDictCompressions(); + +std::vector GetSupportedChecksums(); + +inline bool IsSupportedChecksumType(ChecksumType type) { + // Avoid annoying compiler warning-as-error (-Werror=type-limits) + auto min = kNoChecksum; + auto max = 
kXXH3; + return type >= min && type <= max; +} + +// Checks that the combination of DBOptions and ColumnFamilyOptions are valid +Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts); DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, const MutableDBOptions& mutable_db_options); @@ -26,128 +48,31 @@ const ColumnFamilyOptions& ioptions, const MutableCFOptions& mutable_cf_options); -#ifndef ROCKSDB_LITE - -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* info_log, MutableCFOptions* new_options); - -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options); - -Status GetTableFactoryFromMap( - const std::string& factory_name, - const std::unordered_map& opt_map, - std::shared_ptr* table_factory, - bool ignore_unknown_options = false); - -enum class OptionType { - kBoolean, - kInt, - kInt32T, - kInt64T, - kVectorInt, - kUInt, - kUInt32T, - kUInt64T, - kSizeT, - kString, - kDouble, - kCompactionStyle, - kCompactionPri, - kSliceTransform, - kCompressionType, - kVectorCompressionType, - kTableFactory, - kComparator, - kCompactionFilter, - kCompactionFilterFactory, - kCompactionOptionsFIFO, - kCompactionOptionsUniversal, - kCompactionStopStyle, - kMergeOperator, - kMemTableRepFactory, - kBlockBasedTableIndexType, - kBlockBasedTableDataBlockIndexType, - kBlockBasedTableIndexShorteningMode, - kFilterPolicy, - kFlushBlockPolicyFactory, - kChecksumType, - kEncodingType, - kWALRecoveryMode, - kAccessHint, - kInfoLogLevel, - kLRUCacheOptions, - kEnv, - kUnknown, -}; - -enum class OptionVerificationType { - kNormal, - kByName, // The option is pointer typed so we can only verify - // based on it's name. - kByNameAllowNull, // Same as kByName, but it also allows the case - // where one of them is a nullptr. 
- kByNameAllowFromNull, // Same as kByName, but it also allows the case - // where the old option is nullptr. - kDeprecated // The option is no longer used in rocksdb. The RocksDB - // OptionsParser will still accept this option if it - // happen to exists in some Options file. However, - // the parser will not include it in serialization - // and verification processes. -}; +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts); +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts); -// A struct for storing constant option information such as option name, -// option type, and offset. -struct OptionTypeInfo { - int offset; - OptionType type; - OptionVerificationType verification; - bool is_mutable; - int mutable_offset; -}; - -// A helper function that converts "opt_address" to a std::string -// based on the specified OptionType. -bool SerializeSingleOptionHelper(const char* opt_address, - const OptionType opt_type, std::string* value); - -// In addition to its public version defined in rocksdb/convenience.h, -// this further takes an optional output vector "unsupported_options_names", -// which stores the name of all the unsupported options specified in "opts_map". -Status GetDBOptionsFromMapInternal( - const DBOptions& base_options, - const std::unordered_map& opts_map, - DBOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names = nullptr, - bool ignore_unknown_options = false); - -// In addition to its public version defined in rocksdb/convenience.h, -// this further takes an optional output vector "unsupported_options_names", -// which stores the name of all the unsupported options specified in "opts_map". 
-Status GetColumnFamilyOptionsFromMapInternal( - const ColumnFamilyOptions& base_options, - const std::unordered_map& opts_map, - ColumnFamilyOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names = nullptr, - bool ignore_unknown_options = false); - -bool ParseSliceTransform( - const std::string& value, - std::shared_ptr* slice_transform); +#ifndef ROCKSDB_LITE +std::unique_ptr DBOptionsAsConfigurable( + const MutableDBOptions& opts); +std::unique_ptr DBOptionsAsConfigurable( + const DBOptions& opts, + const std::unordered_map* opt_map = nullptr); +std::unique_ptr CFOptionsAsConfigurable( + const MutableCFOptions& opts); +std::unique_ptr CFOptionsAsConfigurable( + const ColumnFamilyOptions& opts, + const std::unordered_map* opt_map = nullptr); extern Status StringToMap( const std::string& opts_str, std::unordered_map* opts_map); - -extern bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, - const std::string& value); #endif // !ROCKSDB_LITE struct OptionsHelper { + static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/; + static const std::string kDBOptionsName /*= "DBOptions" */; static std::map compaction_style_to_string; static std::map compaction_pri_to_string; static std::map @@ -156,39 +81,13 @@ static std::unordered_map compression_type_string_map; #ifndef ROCKSDB_LITE - static std::unordered_map cf_options_type_info; - static std::unordered_map - fifo_compaction_options_type_info; - static std::unordered_map - universal_compaction_options_type_info; static std::unordered_map compaction_stop_style_string_map; - static std::unordered_map db_options_type_info; - static std::unordered_map - lru_cache_options_type_info; - static std::unordered_map - block_base_table_index_type_string_map; - static std::unordered_map - block_base_table_data_block_index_type_string_map; - static std::unordered_map - block_base_table_index_shortening_mode_string_map; static std::unordered_map 
encoding_type_string_map; static std::unordered_map compaction_style_string_map; static std::unordered_map compaction_pri_string_map; - static std::unordered_map - wal_recovery_mode_string_map; - static std::unordered_map - access_hint_string_map; - static std::unordered_map - info_log_level_string_map; - static ColumnFamilyOptions dummy_cf_options; - static CompactionOptionsFIFO dummy_comp_options; - static LRUCacheOptions dummy_lru_cache_options; - static CompactionOptionsUniversal dummy_comp_options_universal; #endif // !ROCKSDB_LITE }; @@ -200,34 +99,15 @@ OptionsHelper::compaction_stop_style_to_string; static auto& checksum_type_string_map = OptionsHelper::checksum_type_string_map; #ifndef ROCKSDB_LITE -static auto& cf_options_type_info = OptionsHelper::cf_options_type_info; -static auto& fifo_compaction_options_type_info = - OptionsHelper::fifo_compaction_options_type_info; -static auto& universal_compaction_options_type_info = - OptionsHelper::universal_compaction_options_type_info; static auto& compaction_stop_style_string_map = OptionsHelper::compaction_stop_style_string_map; -static auto& db_options_type_info = OptionsHelper::db_options_type_info; -static auto& lru_cache_options_type_info = - OptionsHelper::lru_cache_options_type_info; static auto& compression_type_string_map = OptionsHelper::compression_type_string_map; -static auto& block_base_table_index_type_string_map = - OptionsHelper::block_base_table_index_type_string_map; -static auto& block_base_table_data_block_index_type_string_map = - OptionsHelper::block_base_table_data_block_index_type_string_map; -static auto& block_base_table_index_shortening_mode_string_map = - OptionsHelper::block_base_table_index_shortening_mode_string_map; static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map; static auto& compaction_style_string_map = OptionsHelper::compaction_style_string_map; static auto& compaction_pri_string_map = OptionsHelper::compaction_pri_string_map; -static auto& 
wal_recovery_mode_string_map = - OptionsHelper::wal_recovery_mode_string_map; -static auto& access_hint_string_map = OptionsHelper::access_hint_string_map; -static auto& info_log_level_string_map = - OptionsHelper::info_log_level_string_map; #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,17 +13,19 @@ #include #include -#include "file/read_write_util.h" +#include "file/line_file_reader.h" #include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "options/db_options.h" #include "options/options_helper.h" +#include "port/port.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "rocksdb/utilities/options_type.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/string_util.h" -#include "port/port.h" - namespace ROCKSDB_NAMESPACE { static const std::string option_file_header = @@ -38,6 +40,27 @@ const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs) { + ConfigOptions + config_options; // Use default for escaped(true) and check (exact) + config_options.delimiter = "\n "; + // Do not invoke PrepareOptions when we are doing validation. 
+ config_options.invoke_prepare_options = false; + // If a readahead size was set in the input options, use it + if (db_opt.log_readahead_size > 0) { + config_options.file_readahead_size = db_opt.log_readahead_size; + } + return PersistRocksDBOptions(config_options, db_opt, cf_names, cf_opts, + file_name, fs); +} + +Status PersistRocksDBOptions(const ConfigOptions& config_options_in, + const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs) { + ConfigOptions config_options = config_options_in; + config_options.delimiter = "\n "; // Override the default to nl + TEST_SYNC_POINT("PersistRocksDBOptions:start"); if (cf_names.size() != cf_opts.size()) { return Status::InvalidArgument( @@ -56,55 +79,68 @@ std::string options_file_content; - writable->Append(option_file_header + "[" + - opt_section_titles[kOptionSectionVersion] + - "]\n" - " rocksdb_version=" + - ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR) + - "." + ToString(ROCKSDB_PATCH) + "\n"); - writable->Append(" options_file_version=" + - ToString(ROCKSDB_OPTION_FILE_MAJOR) + "." + - ToString(ROCKSDB_OPTION_FILE_MINOR) + "\n"); - writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] + - "]\n "); + s = writable->Append(option_file_header + "[" + + opt_section_titles[kOptionSectionVersion] + + "]\n" + " rocksdb_version=" + + ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR) + + "." + ToString(ROCKSDB_PATCH) + "\n"); + if (s.ok()) { + s = writable->Append( + " options_file_version=" + ToString(ROCKSDB_OPTION_FILE_MAJOR) + "." 
+ + ToString(ROCKSDB_OPTION_FILE_MINOR) + "\n"); + } + if (s.ok()) { + s = writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] + + "]\n "); + } - s = GetStringFromDBOptions(&options_file_content, db_opt, "\n "); - if (!s.ok()) { - writable->Close(); - return s; + if (s.ok()) { + s = GetStringFromDBOptions(config_options, db_opt, &options_file_content); + } + if (s.ok()) { + s = writable->Append(options_file_content + "\n"); } - writable->Append(options_file_content + "\n"); - for (size_t i = 0; i < cf_opts.size(); ++i) { + for (size_t i = 0; s.ok() && i < cf_opts.size(); ++i) { // CFOptions section - writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] + - " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); - s = GetStringFromColumnFamilyOptions(&options_file_content, cf_opts[i], - "\n "); - if (!s.ok()) { - writable->Close(); - return s; + s = writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] + + " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + if (s.ok()) { + s = GetStringFromColumnFamilyOptions(config_options, cf_opts[i], + &options_file_content); + } + if (s.ok()) { + s = writable->Append(options_file_content + "\n"); } - writable->Append(options_file_content + "\n"); // TableOptions section auto* tf = cf_opts[i].table_factory.get(); if (tf != nullptr) { - writable->Append("[" + opt_section_titles[kOptionSectionTableOptions] + - tf->Name() + " \"" + EscapeOptionString(cf_names[i]) + - "\"]\n "); - options_file_content.clear(); - s = tf->GetOptionString(&options_file_content, "\n "); - if (!s.ok()) { - return s; + if (s.ok()) { + s = writable->Append( + "[" + opt_section_titles[kOptionSectionTableOptions] + tf->Name() + + " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + } + if (s.ok()) { + options_file_content.clear(); + s = tf->GetOptionString(config_options, &options_file_content); + } + if (s.ok()) { + s = writable->Append(options_file_content + "\n"); } - writable->Append(options_file_content + 
"\n"); } } - writable->Sync(true /* use_fsync */); - writable->Close(); - - return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - db_opt, cf_names, cf_opts, file_name, fs); + if (s.ok()) { + s = writable->Sync(true /* use_fsync */); + } + if (s.ok()) { + s = writable->Close(); + } + if (s.ok()) { + return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( + config_options, db_opt, cf_names, cf_opts, file_name, fs); + } + return s; } RocksDBOptionsParser::RocksDBOptionsParser() { Reset(); } @@ -205,7 +241,20 @@ Status RocksDBOptionsParser::Parse(const std::string& file_name, FileSystem* fs, bool ignore_unknown_options, size_t file_readahead_size) { + ConfigOptions + config_options; // Use default for escaped(true) and check (exact) + config_options.ignore_unknown_options = ignore_unknown_options; + if (file_readahead_size > 0) { + config_options.file_readahead_size = file_readahead_size; + } + return Parse(config_options, file_name, fs); +} + +Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, + const std::string& file_name, + FileSystem* fs) { Reset(); + ConfigOptions config_options = config_options_in; std::unique_ptr seq_file; Status s = fs->NewSequentialFile(file_name, FileOptions(), &seq_file, @@ -213,29 +262,23 @@ if (!s.ok()) { return s; } - - SequentialFileReader sf_reader(std::move(seq_file), file_name, - file_readahead_size); + LineFileReader lf_reader(std::move(seq_file), file_name, + config_options.file_readahead_size); OptionSection section = kOptionSectionUnknown; std::string title; std::string argument; std::unordered_map opt_map; - std::istringstream iss; std::string line; - bool has_data = true; // we only support single-lined statement. 
- for (int line_num = 1; ReadOneLine(&iss, &sf_reader, &line, &has_data, &s); - ++line_num) { - if (!s.ok()) { - return s; - } + while (lf_reader.ReadLine(&line)) { + int line_num = static_cast(lf_reader.GetLineNumber()); line = TrimAndRemoveComment(line); if (line.empty()) { continue; } if (IsSection(line)) { - s = EndSection(section, title, argument, opt_map, ignore_unknown_options); + s = EndSection(config_options, section, title, argument, opt_map); opt_map.clear(); if (!s.ok()) { return s; @@ -243,10 +286,11 @@ // If the option file is not generated by a higher minor version, // there shouldn't be any unknown option. - if (ignore_unknown_options && section == kOptionSectionVersion) { + if (config_options.ignore_unknown_options && + section == kOptionSectionVersion) { if (db_version[0] < ROCKSDB_MAJOR || (db_version[0] == ROCKSDB_MAJOR && db_version[1] <= ROCKSDB_MINOR)) { - ignore_unknown_options = false; + config_options.ignore_unknown_options = false; } } @@ -264,8 +308,12 @@ opt_map.insert({name, value}); } } + s = lf_reader.GetStatus(); + if (!s.ok()) { + return s; + } - s = EndSection(section, title, argument, opt_map, ignore_unknown_options); + s = EndSection(config_options, section, title, argument, opt_map); opt_map.clear(); if (!s.ok()) { return s; @@ -372,14 +420,12 @@ } Status RocksDBOptionsParser::EndSection( - const OptionSection section, const std::string& section_title, - const std::string& section_arg, - const std::unordered_map& opt_map, - bool ignore_unknown_options) { + const ConfigOptions& config_options, const OptionSection section, + const std::string& section_title, const std::string& section_arg, + const std::unordered_map& opt_map) { Status s; if (section == kOptionSectionDBOptions) { - s = GetDBOptionsFromMap(DBOptions(), opt_map, &db_opt_, true, - ignore_unknown_options); + s = GetDBOptionsFromMap(config_options, DBOptions(), opt_map, &db_opt_); if (!s.ok()) { return s; } @@ -390,9 +436,8 @@ assert(GetCFOptions(section_arg) == 
nullptr); cf_names_.emplace_back(section_arg); cf_opts_.emplace_back(); - s = GetColumnFamilyOptionsFromMap(ColumnFamilyOptions(), opt_map, - &cf_opts_.back(), true, - ignore_unknown_options); + s = GetColumnFamilyOptionsFromMap(config_options, ColumnFamilyOptions(), + opt_map, &cf_opts_.back()); if (!s.ok()) { return s; } @@ -408,15 +453,27 @@ section_arg); } // Ignore error as table factory deserialization is optional - s = GetTableFactoryFromMap( + s = TableFactory::CreateFromString( + config_options, section_title.substr( opt_section_titles[kOptionSectionTableOptions].size()), - opt_map, &(cf_opt->table_factory), ignore_unknown_options); - if (!s.ok()) { - return s; + &(cf_opt->table_factory)); + if (s.ok()) { + s = cf_opt->table_factory->ConfigureFromMap(config_options, opt_map); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } + } else { + // Return OK for not supported table factories as TableFactory + // Deserialization is optional. 
+ cf_opt->table_factory.reset(); + return Status::OK(); } } else if (section == kOptionSectionVersion) { - for (const auto pair : opt_map) { + for (const auto& pair : opt_map) { if (pair.first == "rocksdb_version") { s = ParseVersionNumber(pair.first, pair.second, 3, db_version); if (!s.ok()) { @@ -434,7 +491,7 @@ } } } - return Status::OK(); + return s; } Status RocksDBOptionsParser::ValidityCheck() { @@ -487,204 +544,37 @@ return ""; } -namespace { -bool AreEqualDoubles(const double a, const double b) { - return (fabs(a - b) < 0.00001); -} -} // namespace - -bool AreEqualOptions( - const char* opt1, const char* opt2, const OptionTypeInfo& type_info, - const std::string& opt_name, - const std::unordered_map* opt_map) { - const char* offset1 = opt1 + type_info.offset; - const char* offset2 = opt2 + type_info.offset; - - switch (type_info.type) { - case OptionType::kBoolean: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInt: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInt32T: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInt64T: - { - int64_t v1, v2; - GetUnaligned(reinterpret_cast(offset1), &v1); - GetUnaligned(reinterpret_cast(offset2), &v2); - return (v1 == v2); - } - case OptionType::kVectorInt: - return (*reinterpret_cast*>(offset1) == - *reinterpret_cast*>(offset2)); - case OptionType::kUInt: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kUInt32T: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kUInt64T: - { - uint64_t v1, v2; - GetUnaligned(reinterpret_cast(offset1), &v1); - GetUnaligned(reinterpret_cast(offset2), &v2); - return (v1 == v2); - } - case OptionType::kSizeT: - { - size_t v1, v2; - GetUnaligned(reinterpret_cast(offset1), &v1); - GetUnaligned(reinterpret_cast(offset2), &v2); - return (v1 == v2); - } - case OptionType::kString: - 
return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kDouble: - return AreEqualDoubles(*reinterpret_cast(offset1), - *reinterpret_cast(offset2)); - case OptionType::kCompactionStyle: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kCompactionPri: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kCompressionType: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kVectorCompressionType: { - const auto* vec1 = - reinterpret_cast*>(offset1); - const auto* vec2 = - reinterpret_cast*>(offset2); - return (*vec1 == *vec2); - } - case OptionType::kChecksumType: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kBlockBasedTableIndexType: - return ( - *reinterpret_cast( - offset1) == - *reinterpret_cast(offset2)); - case OptionType::kBlockBasedTableDataBlockIndexType: - return ( - *reinterpret_cast( - offset1) == - *reinterpret_cast( - offset2)); - case OptionType::kBlockBasedTableIndexShorteningMode: - return ( - *reinterpret_cast( - offset1) == - *reinterpret_cast( - offset2)); - case OptionType::kWALRecoveryMode: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kAccessHint: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInfoLogLevel: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kCompactionOptionsFIFO: { - CompactionOptionsFIFO lhs = - *reinterpret_cast(offset1); - CompactionOptionsFIFO rhs = - *reinterpret_cast(offset2); - if (lhs.max_table_files_size == rhs.max_table_files_size && - lhs.allow_compaction == rhs.allow_compaction) { - return true; - } - return false; - } - case OptionType::kCompactionOptionsUniversal: { - CompactionOptionsUniversal lhs = - *reinterpret_cast(offset1); - CompactionOptionsUniversal rhs = - *reinterpret_cast(offset2); - if 
(lhs.size_ratio == rhs.size_ratio && - lhs.min_merge_width == rhs.min_merge_width && - lhs.max_merge_width == rhs.max_merge_width && - lhs.max_size_amplification_percent == - rhs.max_size_amplification_percent && - lhs.compression_size_percent == rhs.compression_size_percent && - lhs.stop_style == rhs.stop_style && - lhs.allow_trivial_move == rhs.allow_trivial_move) { - return true; - } - return false; - } - default: - if (type_info.verification == OptionVerificationType::kByName || - type_info.verification == - OptionVerificationType::kByNameAllowFromNull || - type_info.verification == OptionVerificationType::kByNameAllowNull) { - std::string value1; - bool result = - SerializeSingleOptionHelper(offset1, type_info.type, &value1); - if (result == false) { - return false; - } - if (opt_map == nullptr) { - return true; - } - auto iter = opt_map->find(opt_name); - if (iter == opt_map->end()) { - return true; - } else { - if (type_info.verification == - OptionVerificationType::kByNameAllowNull) { - if (iter->second == kNullptrString || value1 == kNullptrString) { - return true; - } - } else if (type_info.verification == - OptionVerificationType::kByNameAllowFromNull) { - if (iter->second == kNullptrString) { - return true; - } - } - return (value1 == iter->second); - } - } - return false; - } -} - Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - const DBOptions& db_opt, const std::vector& cf_names, + const ConfigOptions& config_options_in, const DBOptions& db_opt, + const std::vector& cf_names, const std::vector& cf_opts, - const std::string& file_name, FileSystem* fs, - OptionsSanityCheckLevel sanity_check_level, bool ignore_unknown_options) { - // We infer option file readhead size from log readahead size. - // If it is not given, use 512KB. 
- size_t file_readahead_size = db_opt.log_readahead_size; - if (file_readahead_size == 0) { - const size_t kDefaultOptionFileReadAheadSize = 512 * 1024; - file_readahead_size = kDefaultOptionFileReadAheadSize; - } - + const std::string& file_name, FileSystem* fs) { RocksDBOptionsParser parser; - Status s = - parser.Parse(file_name, fs, ignore_unknown_options, file_readahead_size); + ConfigOptions config_options = config_options_in; + config_options.invoke_prepare_options = + false; // No need to do a prepare for verify + if (config_options.sanity_level < ConfigOptions::kSanityLevelExactMatch) { + // If we are not doing an exact comparison, we should ignore + // unsupported options, as they may cause the Parse to fail + // (if the ObjectRegistry is not initialized) + config_options.ignore_unsupported_options = true; + } + Status s = parser.Parse(config_options, file_name, fs); if (!s.ok()) { return s; } // Verify DBOptions - s = VerifyDBOptions(db_opt, *parser.db_opt(), parser.db_opt_map(), - sanity_check_level); + s = VerifyDBOptions(config_options, db_opt, *parser.db_opt(), + parser.db_opt_map()); if (!s.ok()) { return s; } // Verify ColumnFamily Name if (cf_names.size() != parser.cf_names()->size()) { - if (sanity_check_level >= kSanityLevelLooselyCompatible) { + if (config_options.sanity_level >= + ConfigOptions::kSanityLevelLooselyCompatible) { return Status::InvalidArgument( "[RocksDBOptionParser Error] The persisted options does not have " "the same number of column family names as the db instance."); @@ -706,7 +596,8 @@ // Verify Column Family Options if (cf_opts.size() != parser.cf_opts()->size()) { - if (sanity_check_level >= kSanityLevelLooselyCompatible) { + if (config_options.sanity_level >= + ConfigOptions::kSanityLevelLooselyCompatible) { return Status::InvalidArgument( "[RocksDBOptionsParser Error]", "The persisted options does not have the same number of " @@ -719,14 +610,13 @@ } } for (size_t i = 0; i < cf_opts.size(); ++i) { - s = 
VerifyCFOptions(cf_opts[i], parser.cf_opts()->at(i), - &(parser.cf_opt_maps()->at(i)), sanity_check_level); + s = VerifyCFOptions(config_options, cf_opts[i], parser.cf_opts()->at(i), + &(parser.cf_opt_maps()->at(i))); if (!s.ok()) { return s; } - s = VerifyTableFactory(cf_opts[i].table_factory.get(), - parser.cf_opts()->at(i).table_factory.get(), - sanity_check_level); + s = VerifyTableFactory(config_options, cf_opts[i].table_factory.get(), + parser.cf_opts()->at(i).table_factory.get()); if (!s.ok()) { return s; } @@ -736,99 +626,96 @@ } Status RocksDBOptionsParser::VerifyDBOptions( - const DBOptions& base_opt, const DBOptions& persisted_opt, - const std::unordered_map* /*opt_map*/, - OptionsSanityCheckLevel sanity_check_level) { - for (auto pair : db_options_type_info) { - if (pair.second.verification == OptionVerificationType::kDeprecated) { - // We skip checking deprecated variables as they might - // contain random values since they might not be initialized - continue; - } - if (DBOptionSanityCheckLevel(pair.first) <= sanity_check_level) { - if (!AreEqualOptions(reinterpret_cast(&base_opt), - reinterpret_cast(&persisted_opt), - pair.second, pair.first, nullptr)) { - constexpr size_t kBufferSize = 2048; - char buffer[kBufferSize]; - std::string base_value; - std::string persisted_value; - SerializeSingleOptionHelper( - reinterpret_cast(&base_opt) + pair.second.offset, - pair.second.type, &base_value); - SerializeSingleOptionHelper( - reinterpret_cast(&persisted_opt) + pair.second.offset, - pair.second.type, &persisted_value); - snprintf(buffer, sizeof(buffer), - "[RocksDBOptionsParser]: " - "failed the verification on DBOptions::%s --- " - "The specified one is %s while the persisted one is %s.\n", - pair.first.c_str(), base_value.c_str(), - persisted_value.c_str()); - return Status::InvalidArgument(Slice(buffer, strlen(buffer))); - } + const ConfigOptions& config_options, const DBOptions& base_opt, + const DBOptions& file_opt, + const std::unordered_map* 
opt_map) { + auto base_config = DBOptionsAsConfigurable(base_opt, opt_map); + auto file_config = DBOptionsAsConfigurable(file_opt, opt_map); + std::string mismatch; + if (!base_config->AreEquivalent(config_options, file_config.get(), + &mismatch)) { + const size_t kBufferSize = 2048; + char buffer[kBufferSize]; + std::string base_value; + std::string file_value; + int offset = snprintf(buffer, sizeof(buffer), + "[RocksDBOptionsParser]: " + "failed the verification on DBOptions::%s -- ", + mismatch.c_str()); + Status s = base_config->GetOption(config_options, mismatch, &base_value); + if (s.ok()) { + s = file_config->GetOption(config_options, mismatch, &file_value); + } + assert(offset >= 0); + assert(static_cast(offset) < sizeof(buffer)); + if (s.ok()) { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "-- The specified one is %s while the persisted one is %s.\n", + base_value.c_str(), file_value.c_str()); + } else { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "-- Unable to re-serialize an option: %s.\n", + s.ToString().c_str()); } + return Status::InvalidArgument(Slice(buffer, strlen(buffer))); } return Status::OK(); } Status RocksDBOptionsParser::VerifyCFOptions( - const ColumnFamilyOptions& base_opt, - const ColumnFamilyOptions& persisted_opt, - const std::unordered_map* persisted_opt_map, - OptionsSanityCheckLevel sanity_check_level) { - for (auto& pair : cf_options_type_info) { - if (pair.second.verification == OptionVerificationType::kDeprecated) { - // We skip checking deprecated variables as they might - // contain random values since they might not be initialized - continue; - } - if (CFOptionSanityCheckLevel(pair.first) <= sanity_check_level) { - if (!AreEqualOptions(reinterpret_cast(&base_opt), - reinterpret_cast(&persisted_opt), - pair.second, pair.first, persisted_opt_map)) { - constexpr size_t kBufferSize = 2048; - char buffer[kBufferSize]; - std::string base_value; - std::string persisted_value; - 
SerializeSingleOptionHelper( - reinterpret_cast(&base_opt) + pair.second.offset, - pair.second.type, &base_value); - SerializeSingleOptionHelper( - reinterpret_cast(&persisted_opt) + pair.second.offset, - pair.second.type, &persisted_value); - snprintf(buffer, sizeof(buffer), - "[RocksDBOptionsParser]: " - "failed the verification on ColumnFamilyOptions::%s --- " - "The specified one is %s while the persisted one is %s.\n", - pair.first.c_str(), base_value.c_str(), - persisted_value.c_str()); - return Status::InvalidArgument(Slice(buffer, sizeof(buffer))); - } + const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt, + const ColumnFamilyOptions& file_opt, + const std::unordered_map* opt_map) { + auto base_config = CFOptionsAsConfigurable(base_opt, opt_map); + auto file_config = CFOptionsAsConfigurable(file_opt, opt_map); + std::string mismatch; + if (!base_config->AreEquivalent(config_options, file_config.get(), + &mismatch)) { + std::string base_value; + std::string file_value; + // The options do not match + const size_t kBufferSize = 2048; + char buffer[kBufferSize]; + Status s = base_config->GetOption(config_options, mismatch, &base_value); + if (s.ok()) { + s = file_config->GetOption(config_options, mismatch, &file_value); + } + int offset = snprintf(buffer, sizeof(buffer), + "[RocksDBOptionsParser]: " + "failed the verification on ColumnFamilyOptions::%s", + mismatch.c_str()); + assert(offset >= 0); + assert(static_cast(offset) < sizeof(buffer)); + if (s.ok()) { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "--- The specified one is %s while the persisted one is %s.\n", + base_value.c_str(), file_value.c_str()); + } else { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "--- Unable to re-serialize an option: %s.\n", + s.ToString().c_str()); } - } + return Status::InvalidArgument(Slice(buffer, sizeof(buffer))); + } // For each option return Status::OK(); } Status 
RocksDBOptionsParser::VerifyTableFactory( - const TableFactory* base_tf, const TableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level) { + const ConfigOptions& config_options, const TableFactory* base_tf, + const TableFactory* file_tf) { + std::string mismatch; if (base_tf && file_tf) { - if (sanity_check_level > kSanityLevelNone && + if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && std::string(base_tf->Name()) != std::string(file_tf->Name())) { return Status::Corruption( "[RocksDBOptionsParser]: " "failed the verification on TableFactory->Name()"); + } else if (!base_tf->AreEquivalent(config_options, file_tf, &mismatch)) { + return Status::Corruption(std::string("[RocksDBOptionsParser]:" + "failed the verification on ") + + base_tf->Name() + "::", + mismatch); } - if (base_tf->Name() == BlockBasedTableFactory::kName) { - return VerifyBlockBasedTableFactory( - static_cast_with_check(base_tf), - static_cast_with_check(file_tf), - sanity_check_level); - } - // TODO(yhchiang): add checks for other table factory types } else { // TODO(yhchiang): further support sanity check here } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,14 +9,15 @@ #include #include -#include "options/options_sanity_check.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "table/block_based/block_based_table_factory.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE +struct ConfigOptions; +class OptionTypeInfo; +class TableFactory; #define ROCKSDB_OPTION_FILE_MAJOR 1 #define ROCKSDB_OPTION_FILE_MINOR 1 @@ -36,11 +37,11 @@ const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs); - -extern 
bool AreEqualOptions( - const char* opt1, const char* opt2, const OptionTypeInfo& type_info, - const std::string& opt_name, - const std::unordered_map* opt_map); +Status PersistRocksDBOptions(const ConfigOptions& config_options, + const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs); class RocksDBOptionsParser { public: @@ -52,6 +53,10 @@ // If 0 is given, a default value will be used. Status Parse(const std::string& file_name, FileSystem* fs, bool ignore_unknown_options, size_t file_readahead_size); + + Status Parse(const ConfigOptions& config_options, + const std::string& file_name, FileSystem* fs); + static std::string TrimAndRemoveComment(const std::string& line, const bool trim_only = false); @@ -70,30 +75,32 @@ return GetCFOptionsImpl(name); } size_t NumColumnFamilies() { return cf_opts_.size(); } - static Status VerifyRocksDBOptionsFromFile( - const DBOptions& db_opt, const std::vector& cf_names, + const ConfigOptions& config_options, const DBOptions& db_opt, + const std::vector& cf_names, const std::vector& cf_opts, - const std::string& file_name, FileSystem* fs, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch, - bool ignore_unknown_options = false); - + const std::string& file_name, FileSystem* fs); static Status VerifyDBOptions( - const DBOptions& base_opt, const DBOptions& new_opt, - const std::unordered_map* new_opt_map = nullptr, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch); + const ConfigOptions& config_options, const DBOptions& base_opt, + const DBOptions& new_opt, + const std::unordered_map* new_opt_map = + nullptr); static Status VerifyCFOptions( - const ColumnFamilyOptions& base_opt, const ColumnFamilyOptions& new_opt, - const std::unordered_map* new_opt_map = nullptr, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch); - - static Status VerifyTableFactory( - const TableFactory* base_tf, const 
TableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch); + const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt, + const ColumnFamilyOptions& new_opt, + const std::unordered_map* new_opt_map = + nullptr); + + static Status VerifyTableFactory(const ConfigOptions& config_options, + const TableFactory* base_tf, + const TableFactory* file_tf); static Status ExtraParserCheck(const RocksDBOptionsParser& input_parser); + static Status ParseStatement(std::string* name, std::string* value, + const std::string& line, const int line_num); + protected: bool IsSection(const std::string& line); Status ParseSection(OptionSection* section, std::string* title, @@ -103,17 +110,14 @@ Status CheckSection(const OptionSection section, const std::string& section_arg, const int line_num); - Status ParseStatement(std::string* name, std::string* value, - const std::string& line, const int line_num); - - Status EndSection(const OptionSection section, const std::string& title, - const std::string& section_arg, - const std::unordered_map& opt_map, - bool ignore_unknown_options); + Status EndSection( + const ConfigOptions& config_options, const OptionSection section, + const std::string& title, const std::string& section_arg, + const std::unordered_map& opt_map); Status ValidityCheck(); - Status InvalidArgument(const int line_num, const std::string& message); + static Status InvalidArgument(const int line_num, const std::string& message); Status ParseVersionNumber(const std::string& ver_name, const std::string& ver_string, const int max_count, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ 
-// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#ifndef ROCKSDB_LITE - -#include "options/options_sanity_check.h" - -namespace ROCKSDB_NAMESPACE { - -namespace { -OptionsSanityCheckLevel SanityCheckLevelHelper( - const std::unordered_map& smap, - const std::string& name) { - auto iter = smap.find(name); - return iter != smap.end() ? iter->second : kSanityLevelExactMatch; -} -} - -OptionsSanityCheckLevel DBOptionSanityCheckLevel( - const std::string& option_name) { - return SanityCheckLevelHelper(sanity_level_db_options, option_name); -} - -OptionsSanityCheckLevel CFOptionSanityCheckLevel( - const std::string& option_name) { - return SanityCheckLevelHelper(sanity_level_cf_options, option_name); -} - -OptionsSanityCheckLevel BBTOptionSanityCheckLevel( - const std::string& option_name) { - return SanityCheckLevelHelper(sanity_level_bbt_options, option_name); -} - -} // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,50 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include -#include - -#include "rocksdb/rocksdb_namespace.h" - -#ifndef ROCKSDB_LITE -namespace ROCKSDB_NAMESPACE { -// This enum defines the RocksDB options sanity level. 
-enum OptionsSanityCheckLevel : unsigned char { - // Performs no sanity check at all. - kSanityLevelNone = 0x00, - // Performs minimum check to ensure the RocksDB instance can be - // opened without corrupting / mis-interpreting the data. - kSanityLevelLooselyCompatible = 0x01, - // Perform exact match sanity check. - kSanityLevelExactMatch = 0xFF, -}; - -// The sanity check level for DB options -static const std::unordered_map - sanity_level_db_options {}; - -// The sanity check level for column-family options -static const std::unordered_map - sanity_level_cf_options = { - {"comparator", kSanityLevelLooselyCompatible}, - {"table_factory", kSanityLevelLooselyCompatible}, - {"merge_operator", kSanityLevelLooselyCompatible}}; - -// The sanity check level for block-based table options -static const std::unordered_map - sanity_level_bbt_options {}; - -OptionsSanityCheckLevel DBOptionSanityCheckLevel( - const std::string& options_name); -OptionsSanityCheckLevel CFOptionSanityCheckLevel( - const std::string& options_name); -OptionsSanityCheckLevel BBTOptionSanityCheckLevel( - const std::string& options_name); - -} // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_settable_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_settable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_settable_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_settable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,8 @@ #include +#include "options/cf_options.h" +#include "options/db_options.h" #include "options/options_helper.h" #include "rocksdb/convenience.h" #include "test_util/testharness.h" @@ -39,23 +41,24 @@ }; const char kSpecialChar = 'z'; -typedef std::vector> OffsetGap; +using OffsetGap = std::vector>; void FillWithSpecialChar(char* start_ptr, size_t total_size, - const OffsetGap& blacklist) { + const OffsetGap& 
excluded, + char special_char = kSpecialChar) { size_t offset = 0; - for (auto& pair : blacklist) { - std::memset(start_ptr + offset, kSpecialChar, pair.first - offset); + for (auto& pair : excluded) { + std::memset(start_ptr + offset, special_char, pair.first - offset); offset = pair.first + pair.second; } - std::memset(start_ptr + offset, kSpecialChar, total_size - offset); + std::memset(start_ptr + offset, special_char, total_size - offset); } int NumUnsetBytes(char* start_ptr, size_t total_size, - const OffsetGap& blacklist) { + const OffsetGap& excluded) { int total_unset_bytes_base = 0; size_t offset = 0; - for (auto& pair : blacklist) { + for (auto& pair : excluded) { for (char* ptr = start_ptr + offset; ptr < start_ptr + pair.first; ptr++) { if (*ptr == kSpecialChar) { total_unset_bytes_base++; @@ -71,6 +74,26 @@ return total_unset_bytes_base; } +// Return true iff two structs are the same except excluded fields. +bool CompareBytes(char* start_ptr1, char* start_ptr2, size_t total_size, + const OffsetGap& excluded) { + size_t offset = 0; + for (auto& pair : excluded) { + for (; offset < pair.first; offset++) { + if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) { + return false; + } + } + offset = pair.first + pair.second; + } + for (; offset < total_size; offset++) { + if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) { + return false; + } + } + return true; +} + // If the test fails, likely a new option is added to BlockBasedTableOptions // but it cannot be set through GetBlockBasedTableOptionsFromString(), or the // test is not updated accordingly. @@ -78,11 +101,11 @@ // GetBlockBasedTableOptionsFromString() and add the option to the input string // passed to the GetBlockBasedTableOptionsFromString() in this test. // If it is a complicated type, you also need to add the field to -// kBbtoBlacklist, and maybe add customized verification for it. +// kBbtoExcluded, and maybe add customized verification for it. 
TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { // Items in the form of . Need to be in ascending order // and not overlapping. Need to updated if new pointer-option is added. - const OffsetGap kBbtoBlacklist = { + const OffsetGap kBbtoExcluded = { {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), sizeof(std::shared_ptr)}, {offsetof(struct BlockBasedTableOptions, block_cache), @@ -107,20 +130,20 @@ // copy a well constructed struct to this memory and see how many special // bytes left. BlockBasedTableOptions* bbto = new (bbto_ptr) BlockBasedTableOptions(); - FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded); // It based on the behavior of compiler that padding bytes are not changed // when copying the struct. It's prone to failure when compiler behavior // changes. We verify there is unset bytes to detect the case. *bbto = BlockBasedTableOptions(); int unset_bytes_base = - NumUnsetBytes(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + NumUnsetBytes(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded); ASSERT_GT(unset_bytes_base, 0); bbto->~BlockBasedTableOptions(); // Construct the base option passed into // GetBlockBasedTableOptionsFromString(). bbto = new (bbto_ptr) BlockBasedTableOptions(); - FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded); // This option is not setable: bbto->use_delta_encoding = true; @@ -128,13 +151,16 @@ BlockBasedTableOptions* new_bbto = new (new_bbto_ptr) BlockBasedTableOptions(); FillWithSpecialChar(new_bbto_ptr, sizeof(BlockBasedTableOptions), - kBbtoBlacklist); + kBbtoExcluded); // Need to update the option string if a new option is added. 
ASSERT_OK(GetBlockBasedTableOptionsFromString( *bbto, "cache_index_and_filter_blocks=1;" "cache_index_and_filter_blocks_with_high_priority=true;" + "metadata_cache_options={top_level_index_pinning=kFallback;" + "partition_pinning=kAll;" + "unpartitioned_pinning=kFlushedAndSimilar;};" "pin_l0_filter_and_index_blocks_in_cache=1;" "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" @@ -146,18 +172,22 @@ "block_size_deviation=8;block_restart_interval=4; " "metadata_block_size=1024;" "partition_filters=false;" + "optimize_filters_for_memory=true;" "index_block_restart_interval=4;" "filter_policy=bloomfilter:4:true;whole_key_filtering=1;" + "reserve_table_builder_memory=false;" "format_version=1;" "hash_index_allow_collision=false;" "verify_compression=true;read_amp_bytes_per_bit=0;" "enable_index_compression=false;" - "block_align=true", + "block_align=true;" + "max_auto_readahead_size=0;" + "prepopulate_block_cache=kDisable", new_bbto)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_bbto_ptr, sizeof(BlockBasedTableOptions), - kBbtoBlacklist)); + kBbtoExcluded)); ASSERT_TRUE(new_bbto->block_cache.get() != nullptr); ASSERT_TRUE(new_bbto->block_cache_compressed.get() != nullptr); @@ -177,12 +207,10 @@ // GetDBOptionsFromString() and add the option to the input string passed to // DBOptionsFromString()in this test. // If it is a complicated type, you also need to add the field to -// kDBOptionsBlacklist, and maybe add customized verification for it. +// kDBOptionsExcluded, and maybe add customized verification for it. 
TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { - const OffsetGap kDBOptionsBlacklist = { + const OffsetGap kDBOptionsExcluded = { {offsetof(struct DBOptions, env), sizeof(Env*)}, - {offsetof(struct DBOptions, file_system), - sizeof(std::shared_ptr)}, {offsetof(struct DBOptions, rate_limiter), sizeof(std::shared_ptr)}, {offsetof(struct DBOptions, sst_file_manager), @@ -199,8 +227,13 @@ sizeof(std::vector>)}, {offsetof(struct DBOptions, row_cache), sizeof(std::shared_ptr)}, {offsetof(struct DBOptions, wal_filter), sizeof(const WalFilter*)}, - {offsetof(struct DBOptions, sst_file_checksum_func), - sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, file_checksum_gen_factory), + sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, db_host_id), sizeof(std::string)}, + {offsetof(struct DBOptions, checksum_handoff_file_types), + sizeof(FileTypeSet)}, + {offsetof(struct DBOptions, compaction_service), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(DBOptions)]; @@ -209,22 +242,22 @@ // copy a well constructed struct to this memory and see how many special // bytes left. DBOptions* options = new (options_ptr) DBOptions(); - FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); // It based on the behavior of compiler that padding bytes are not changed // when copying the struct. It's prone to failure when compiler behavior // changes. We verify there is unset bytes to detect the case. 
*options = DBOptions(); int unset_bytes_base = - NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); ASSERT_GT(unset_bytes_base, 0); options->~DBOptions(); options = new (options_ptr) DBOptions(); - FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); char* new_options_ptr = new char[sizeof(DBOptions)]; DBOptions* new_options = new (new_options_ptr) DBOptions(); - FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsExcluded); // Need to update the option string if a new option is added. ASSERT_OK( @@ -256,6 +289,8 @@ "skip_log_error_on_recovery=true;" "writable_file_max_buffer_size=1048576;" "paranoid_checks=true;" + "flush_verify_memtable_count=true;" + "track_and_verify_wals_in_manifest=true;" "is_fd_close_on_exec=false;" "bytes_per_sync=4295013613;" "strict_bytes_per_sync=true;" @@ -303,11 +338,17 @@ "atomic_flush=false;" "avoid_unnecessary_blocking_io=false;" "log_readahead_size=0;" - "write_dbid_to_manifest=false", + "write_dbid_to_manifest=false;" + "best_efforts_recovery=false;" + "max_bgerror_resume_count=2;" + "bgerror_resume_retry_interval=1000000" + "db_host_id=hostname;" + "lowest_used_cache_tier=kNonVolatileBlockTier;" + "allow_data_in_errors=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), - kDBOptionsBlacklist)); + kDBOptionsExcluded)); options->~DBOptions(); new_options->~DBOptions(); @@ -329,12 +370,12 @@ // GetColumnFamilyOptionsFromString() and add the option to the input // string passed to GetColumnFamilyOptionsFromString()in this test. 
// If it is a complicated type, you also need to add the field to -// kColumnFamilyOptionsBlacklist, and maybe add customized verification +// kColumnFamilyOptionsExcluded, and maybe add customized verification // for it. TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { - // options in the blacklist need to appear in the same order as in + // options in the excluded set need to appear in the same order as in // ColumnFamilyOptions. - const OffsetGap kColumnFamilyOptionsBlacklist = { + const OffsetGap kColumnFamilyOptionsExcluded = { {offset_of(&ColumnFamilyOptions::inplace_callback), sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))}, {offset_of( @@ -364,6 +405,8 @@ {offset_of(&ColumnFamilyOptions::cf_paths), sizeof(std::vector)}, {offset_of(&ColumnFamilyOptions::compaction_thread_limiter), sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::sst_partitioner_factory), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(ColumnFamilyOptions)]; @@ -371,44 +414,46 @@ // Count padding bytes by setting all bytes in the memory to a special char, // copy a well constructed struct to this memory and see how many special // bytes left. - ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); - // It based on the behavior of compiler that padding bytes are not changed - // when copying the struct. It's prone to failure when compiler behavior - // changes. We verify there is unset bytes to detect the case. - *options = ColumnFamilyOptions(); + kColumnFamilyOptionsExcluded); + + // Invoke a user-defined constructor in the hope that it does not overwrite + // padding bytes. Note that previously we relied on the implicitly-defined + // copy-assignment operator (i.e., `*options = ColumnFamilyOptions();`) here, + // which did in fact modify padding bytes. 
+ ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); // Deprecatd option which is not initialized. Need to set it to avoid // Valgrind error options->max_mem_compaction_level = 0; int unset_bytes_base = NumUnsetBytes(options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); + kColumnFamilyOptionsExcluded); ASSERT_GT(unset_bytes_base, 0); options->~ColumnFamilyOptions(); options = new (options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); + kColumnFamilyOptionsExcluded); // Following options are not settable through // GetColumnFamilyOptionsFromString(): options->rate_limit_delay_max_milliseconds = 33; options->compaction_options_universal = CompactionOptionsUniversal(); - options->compression_opts = CompressionOptions(); - options->bottommost_compression_opts = CompressionOptions(); options->hard_rate_limit = 0; options->soft_rate_limit = 0; + options->num_levels = 42; // Initialize options for MutableCF options->purge_redundant_kvs_while_flush = false; options->max_mem_compaction_level = 0; options->compaction_filter = nullptr; + options->sst_partitioner_factory = nullptr; + options->bottommost_temperature = Temperature::kUnknown; char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)]; ColumnFamilyOptions* new_options = new (new_options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(new_options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); + kColumnFamilyOptionsExcluded); // Need to update the option string if a new option is added. 
ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -435,6 +480,8 @@ "max_bytes_for_level_multiplier=60;" "memtable_factory=SkipListFactory;" "compression=kNoCompression;" + "compression_opts=5:6:7:8:9:10:true:11;" + "bottommost_compression_opts=4:5:6:7:8:9:true:10;" "bottommost_compression=kDisableCompressionOption;" "level0_stop_writes_trigger=33;" "num_levels=99;" @@ -449,6 +496,7 @@ "memtable_prefix_bloom_size_ratio=0.4642;" "memtable_whole_key_filtering=true;" "memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;" + "check_flush_compaction_key_order=false;" "paranoid_file_checks=true;" "force_consistency_checks=true;" "inplace_update_num_locks=7429;" @@ -463,19 +511,74 @@ "ttl=60;" "periodic_compaction_seconds=3600;" "sample_for_compression=0;" + "enable_blob_files=true;" + "min_blob_size=256;" + "blob_file_size=1000000;" + "blob_compression_type=kBZip2Compression;" + "enable_blob_garbage_collection=true;" + "blob_garbage_collection_age_cutoff=0.5;" + "blob_garbage_collection_force_threshold=0.75;" + "blob_compaction_readahead_size=262144;" "compaction_options_fifo={max_table_files_size=3;allow_" - "compaction=false;};", + "compaction=false;age_for_warm=1;};", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist)); + kColumnFamilyOptionsExcluded)); + + ColumnFamilyOptions rnd_filled_options = *new_options; options->~ColumnFamilyOptions(); new_options->~ColumnFamilyOptions(); delete[] options_ptr; delete[] new_options_ptr; + + // Test copying to mutabable and immutable options and copy back the mutable + // part. + const OffsetGap kMutableCFOptionsExcluded = { + {offset_of(&MutableCFOptions::prefix_extractor), + sizeof(std::shared_ptr)}, + {offset_of(&MutableCFOptions::max_bytes_for_level_multiplier_additional), + sizeof(std::vector)}, + {offset_of(&MutableCFOptions::max_file_size), + sizeof(std::vector)}, + }; + + // For all memory used for options, pre-fill every char. 
Otherwise, the + // padding bytes might be different so that byte-wise comparison doesn't + // general equal results even if objects are equal. + const char kMySpecialChar = 'x'; + char* mcfo1_ptr = new char[sizeof(MutableCFOptions)]; + FillWithSpecialChar(mcfo1_ptr, sizeof(MutableCFOptions), + kMutableCFOptionsExcluded, kMySpecialChar); + char* mcfo2_ptr = new char[sizeof(MutableCFOptions)]; + FillWithSpecialChar(mcfo2_ptr, sizeof(MutableCFOptions), + kMutableCFOptionsExcluded, kMySpecialChar); + + // A clean column family options is constructed after filling the same special + // char as the initial one. So that the padding bytes are the same. + char* cfo_clean_ptr = new char[sizeof(ColumnFamilyOptions)]; + FillWithSpecialChar(cfo_clean_ptr, sizeof(ColumnFamilyOptions), + kColumnFamilyOptionsExcluded); + rnd_filled_options.num_levels = 66; + ColumnFamilyOptions* cfo_clean = new (cfo_clean_ptr) ColumnFamilyOptions(); + + MutableCFOptions* mcfo1 = + new (mcfo1_ptr) MutableCFOptions(rnd_filled_options); + ColumnFamilyOptions cfo_back = BuildColumnFamilyOptions(*cfo_clean, *mcfo1); + MutableCFOptions* mcfo2 = new (mcfo2_ptr) MutableCFOptions(cfo_back); + + ASSERT_TRUE(CompareBytes(mcfo1_ptr, mcfo2_ptr, sizeof(MutableCFOptions), + kMutableCFOptionsExcluded)); + + cfo_clean->~ColumnFamilyOptions(); + mcfo1->~MutableCFOptions(); + mcfo2->~MutableCFOptions(); + delete[] mcfo1_ptr; + delete[] mcfo2_ptr; + delete[] cfo_clean_ptr; } #endif // !__clang__ #endif // OS_LINUX || OS_WIN diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,13 +16,14 @@ #include "cache/sharded_cache.h" #include "options/options_helper.h" #include "options/options_parser.h" -#include 
"options/options_sanity_check.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" +#include "rocksdb/file_checksum.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/leveldb_options.h" #include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/filter_policy_internal.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -30,6 +31,9 @@ #include "util/stderr_logger.h" #include "util/string_util.h" #include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" #ifndef GFLAGS bool FLAGS_enable_print = false; @@ -63,7 +67,7 @@ "kZSTD:" "kZSTDNotFinalCompression"}, {"bottommost_compression", "kLZ4Compression"}, - {"bottommost_compression_opts", "5:6:7:8:9:true"}, + {"bottommost_compression_opts", "5:6:7:8:10:true"}, {"compression_opts", "4:5:6:7:8:true"}, {"num_levels", "8"}, {"level0_file_num_compaction_trigger", "8"}, @@ -98,6 +102,14 @@ {"min_partial_merge_operands", "31"}, {"prefix_extractor", "fixed:31"}, {"optimize_filters_for_hits", "true"}, + {"enable_blob_files", "true"}, + {"min_blob_size", "1K"}, + {"blob_file_size", "1G"}, + {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, + {"blob_compaction_readahead_size", "256K"}, }; std::unordered_map db_options_map = { @@ -105,6 +117,7 @@ {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, {"max_total_wal_size", "33"}, {"use_fsync", "true"}, @@ -133,6 +146,7 @@ {"persist_stats_to_disk", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, + 
{"experimental_mempurge_threshold", "0.0"}, {"use_adaptive_mutex", "false"}, {"new_table_reader_for_compaction_inputs", "true"}, {"compaction_readahead_size", "100"}, @@ -145,8 +159,16 @@ ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; - ASSERT_OK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ConfigOptions exact, loose; + exact.input_strings_escaped = false; + exact.ignore_unknown_options = false; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + loose.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + + loose.input_strings_escaped = false; + loose.ignore_unknown_options = true; + ASSERT_OK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); @@ -168,13 +190,17 @@ ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, + CompressionOptions().parallel_threads); ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); - ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 10u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + CompressionOptions().parallel_threads); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.num_levels, 8); 
ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); @@ -208,41 +234,49 @@ ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); - ASSERT_EQ(std::string(new_cf_opt.prefix_extractor->Name()), - "rocksdb.FixedPrefix.31"); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); + ASSERT_EQ(new_cf_opt.enable_blob_files, true); + ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); + ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); + ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); + ASSERT_EQ(new_cf_opt.blob_compaction_readahead_size, 262144); cf_options_map["write_buffer_size"] = "hello"; - ASSERT_NOK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); cf_options_map["write_buffer_size"] = "1"; - ASSERT_OK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); cf_options_map["unknown_option"] = "1"; - ASSERT_NOK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, - &new_cf_opt, - false, /* 
input_strings_escaped */ - true /* ignore_unknown_options */)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( - base_cf_opt, new_cf_opt, nullptr, /* new_opt_map */ - kSanityLevelLooselyCompatible /* from CheckOptionsCompatibility*/)); - ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - base_cf_opt, new_cf_opt, nullptr, /* new_opt_map */ - kSanityLevelExactMatch /* default for VerifyCFOptions */)); + // ignore_unknown_options=true;input_strings_escaped=false + ASSERT_OK(GetColumnFamilyOptionsFromMap(loose, base_cf_opt, cf_options_map, + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(loose, base_cf_opt, new_cf_opt)); + ASSERT_NOK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); DBOptions base_db_opt; DBOptions new_db_opt; - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK( + GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt)); ASSERT_EQ(new_db_opt.create_if_missing, false); ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); ASSERT_EQ(new_db_opt.use_fsync, true); @@ -272,6 +306,7 @@ ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); + ASSERT_EQ(new_db_opt.experimental_mempurge_threshold, 0.0); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true); ASSERT_EQ(new_db_opt.compaction_readahead_size, 100); @@ -282,26 +317,30 @@ ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true); db_options_map["max_open_files"] = "hello"; - ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); - 
ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_db_opt, new_db_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( - base_db_opt, new_db_opt, nullptr, /* new_opt_map */ - kSanityLevelLooselyCompatible /* from CheckOptionsCompatibility */)); + Status s = + GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); // unknow options should fail parsing without ignore_unknown_options = true db_options_map["unknown_db_option"] = "1"; - ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_db_opt, new_db_opt)); + s = GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, - false, /* input_strings_escaped */ - true /* ignore_unknown_options */)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( - base_db_opt, new_db_opt, nullptr, /* new_opt_map */ - kSanityLevelLooselyCompatible /* from CheckOptionsCompatibility */)); - ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions( - base_db_opt, new_db_opt, nullptr, /* new_opt_mat */ - kSanityLevelExactMatch /* default for VerifyDBOptions */)); + ASSERT_OK( + GetDBOptionsFromMap(loose, base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); + ASSERT_NOK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); } #endif // !ROCKSDB_LITE @@ -310,77 +349,91 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; + 
ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + base_cf_opt.table_factory.reset(); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=5", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, "", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=5", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); ASSERT_TRUE(new_cf_opt.table_factory == nullptr); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=6;", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=6;", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, " write_buffer_size = 7 ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, " write_buffer_size = 8 ; ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=11; max_write_buffer_number = 12 ;", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + 
config_options, base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); // Wrong name "max_write_buffer_number_" - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number_=14;", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Comparator from object registry std::string kCompName = "reverse_comp"; - ObjectLibrary::Default()->Register( + ObjectLibrary::Default()->AddFactory( kCompName, [](const std::string& /*name*/, std::unique_ptr* /*guard*/, std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); - ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "comparator=" + kCompName + ";", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator()); // MergeOperator from object registry std::unique_ptr bxo(new BytesXOROperator()); std::string kMoName = bxo->Name(); - ObjectLibrary::Default()->Register( - kMoName, - [](const std::string& /*name*/, std::unique_ptr* guard, - std::string* /* errmsg */) { - guard->reset(new BytesXOROperator()); - return guard->get(); - }); - ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "merge_operator=" + kMoName + ";", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "merge_operator=" + kMoName + ";", + &new_cf_opt)); ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name())); // Wrong key/value pair - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - 
"write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); - - // Error Paring value - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + Status s = GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); + + // Error Parsing value + s = GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Missing option name - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13; =100;", &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + s = GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=13; =100;", &new_cf_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); const uint64_t kilo = 1024UL; const uint64_t mega = 1024 * kilo; @@ -389,17 +442,17 @@ // Units (k) ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); + config_options, base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo); // Units (m) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "max_write_buffer_number=16m;inplace_update_num_locks=17M", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + 
config_options, base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega); ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega); // Units (g) ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, + config_options, base_cf_opt, "write_buffer_size=18g;prefix_extractor=capped:8;" "arena_block_size=19G", &new_cf_opt)); @@ -407,129 +460,412 @@ ASSERT_EQ(new_cf_opt.write_buffer_size, 18 * giga); ASSERT_EQ(new_cf_opt.arena_block_size, 19 * giga); ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); - std::string prefix_name(new_cf_opt.prefix_extractor->Name()); - ASSERT_EQ(prefix_name, "rocksdb.CappedPrefix.8"); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.CappedPrefix.8"); // Units (t) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=20t;arena_block_size=21T", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera); ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera); // Nested block based table options // Empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={};arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={};arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Non-empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_cache=1M;block_size=4;};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + 
"write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Last one - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_cache=1M;block_size=4;}", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;}", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Mismatch curly braces - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={{{block_size=4;};" - "arena_block_size=1024", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={{{block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Unexpected chars after closing curly brace - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}};" - "arena_block_size=1024", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - 
"write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}xdfa;" - "arena_block_size=1024", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa;" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}xdfa", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Invalid block based table option - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={xx_block_size=4;}", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); - - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=true", - &new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=false", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=junk", 
- &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=true", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=false", + &new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=junk", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Nested plain table options // Empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "plain_table_factory={};arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={};arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); // Non-empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); // memtable factory - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "memtable=skip_list:10;arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + 
"write_buffer_size=10;max_write_buffer_number=16;" + "memtable=skip_list:10;arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.memtable_factory->Name()), "SkipListFactory"); + ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory")); +} + +TEST_F(OptionsTest, CompressionOptionsFromString) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ConfigOptions config_options; + std::string opts_str; + config_options.ignore_unknown_options = false; + CompressionOptions dflt; + // Test with some optional values removed.... + ASSERT_OK( + GetColumnFamilyOptionsFromString(config_options, ColumnFamilyOptions(), + "compression_opts=3:4:5; " + "bottommost_compression_opts=4:5:6:7", + &base_cf_opt)); + ASSERT_EQ(base_cf_opt.compression_opts.window_bits, 3); + ASSERT_EQ(base_cf_opt.compression_opts.level, 4); + ASSERT_EQ(base_cf_opt.compression_opts.strategy, 5); + ASSERT_EQ(base_cf_opt.compression_opts.max_dict_bytes, dflt.max_dict_bytes); + ASSERT_EQ(base_cf_opt.compression_opts.zstd_max_train_bytes, + dflt.zstd_max_train_bytes); + ASSERT_EQ(base_cf_opt.compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(base_cf_opt.compression_opts.enabled, dflt.enabled); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.window_bits, 4); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.level, 5); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.strategy, 6); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, + dflt.zstd_max_train_bytes); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.enabled, dflt.enabled); + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=4:5:6:7:8:9:true; " + 
"bottommost_compression_opts=5:6:7:8:9:false", + &base_cf_opt)); + ASSERT_EQ(base_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(base_cf_opt.compression_opts.level, 5); + ASSERT_EQ(base_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(base_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(base_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(base_cf_opt.compression_opts.parallel_threads, 9u); + ASSERT_EQ(base_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.enabled, false); + + ASSERT_OK( + GetStringFromColumnFamilyOptions(config_options, base_cf_opt, &opts_str)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), opts_str, &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 9u); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + 
ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, false); + + // Test as struct values + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts={window_bits=5; level=6; strategy=7; max_dict_bytes=8;" + "zstd_max_train_bytes=9;parallel_threads=10;enabled=true}; " + "bottommost_compression_opts={window_bits=4; level=5; strategy=6;" + " max_dict_bytes=7;zstd_max_train_bytes=8;parallel_threads=9;" + "enabled=false}; ", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 5); + ASSERT_EQ(new_cf_opt.compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 7); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 10u); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, 9u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, false); + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "compression_opts={window_bits=4; strategy=5;};" + "bottommost_compression_opts={level=6; strategy=7;}", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); + + ASSERT_EQ(new_cf_opt.compression_opts.level, + 
base_cf_opt.compression_opts.level); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, + base_cf_opt.compression_opts.max_dict_bytes); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, + base_cf_opt.compression_opts.zstd_max_train_bytes); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, + base_cf_opt.compression_opts.parallel_threads); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, + base_cf_opt.compression_opts.enabled); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, + base_cf_opt.bottommost_compression_opts.window_bits); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, + base_cf_opt.bottommost_compression_opts.max_dict_bytes); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, + base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + base_cf_opt.bottommost_compression_opts.parallel_threads); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, + base_cf_opt.bottommost_compression_opts.enabled); + + // Test a few individual struct values + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "compression_opts.enabled=false; " + "bottommost_compression_opts.enabled=true; ", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, false); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); + + // Now test some illegal values + ConfigOptions ignore; + ignore.ignore_unknown_options = true; + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=5:6:7:8:9:x:false", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + ignore, ColumnFamilyOptions(), "compression_opts=5:6:7:8:9:x:false", + &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=1:2:3:4:5:6:true:8", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + 
ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8", + &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=1:2:3:4:5:6:true:8:9", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8:9", + &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), "compression_opts={unknown=bad;}", + &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(ignore, ColumnFamilyOptions(), + "compression_opts={unknown=bad;}", + &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), "compression_opts.unknown=bad", + &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(ignore, ColumnFamilyOptions(), + "compression_opts.unknown=bad", + &base_cf_opt)); +} + +TEST_F(OptionsTest, OldInterfaceTest) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ConfigOptions exact; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=18;prefix_extractor=capped:8;" + "arena_block_size=19", + &new_cf_opt)); + + ASSERT_EQ(new_cf_opt.write_buffer_size, 18); + ASSERT_EQ(new_cf_opt.arena_block_size, 19); + ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); + + // And with a bad option + ASSERT_NOK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + std::unordered_map cf_options_map = { + {"write_buffer_size", "1"}, + {"max_write_buffer_number", "2"}, + {"min_write_buffer_number_to_merge", "3"}, + }; + ASSERT_OK( + GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt)); + cf_options_map["unknown_option"] = "1"; + ASSERT_NOK( + 
GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, + &new_cf_opt, true, true)); + + DBOptions base_db_opt; + DBOptions new_db_opt; + std::unordered_map db_options_map = { + {"create_if_missing", "false"}, + {"create_missing_column_families", "true"}, + {"error_if_exists", "false"}, + {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, + {"max_open_files", "32"}, + }; + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + ASSERT_EQ(new_db_opt.create_missing_column_families, true); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); + ASSERT_EQ(new_db_opt.max_open_files, 32); + db_options_map["unknown_option"] = "1"; + Status s = GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, true, + true)); + ASSERT_OK(GetDBOptionsFromString( + base_db_opt, + "create_if_missing=false;error_if_exists=false;max_open_files=42;", + &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.max_open_files, 42); + s = GetDBOptionsFromString( + base_db_opt, + "create_if_missing=false;error_if_exists=false;max_open_files=42;" + "unknown_option=1;", + &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); } + #endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE // GetBlockBasedTableOptionsFromString is not 
supported TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { BlockBasedTableOptions table_opt; BlockBasedTableOptions new_opt; + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + // make sure default values are overwritten by something else ASSERT_OK(GetBlockBasedTableOptionsFromString( - table_opt, + config_options, table_opt, "cache_index_and_filter_blocks=1;index_type=kHashSearch;" - "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "checksum=kxxHash;hash_index_allow_collision=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" "block_size_deviation=8;block_restart_interval=4;" "format_version=5;whole_key_filtering=1;" - "filter_policy=bloomfilter:4.567:false;", + "reserve_table_builder_memory=true;" + "filter_policy=bloomfilter:4.567:false;" + // A bug caused read_amp_bytes_per_bit to be a large integer in OPTIONS + // file generated by 6.10 to 6.14. Though bug is fixed in these releases, + // we need to handle the case of loading OPTIONS file generated before the + // fix. 
+ "read_amp_bytes_per_bit=17179869185;", &new_opt)); ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); ASSERT_TRUE(new_opt.hash_index_allow_collision); - ASSERT_TRUE(new_opt.no_block_cache); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); @@ -539,63 +875,130 @@ ASSERT_EQ(new_opt.block_restart_interval, 4); ASSERT_EQ(new_opt.format_version, 5U); ASSERT_EQ(new_opt.whole_key_filtering, true); + ASSERT_EQ(new_opt.reserve_table_builder_memory, true); ASSERT_TRUE(new_opt.filter_policy != nullptr); - const BloomFilterPolicy& bfp = - dynamic_cast(*new_opt.filter_policy); - EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); - EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); + const BloomFilterPolicy* bfp = + dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp->GetWholeBitsPerKey(), 5); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kAutoBloom); + // Verify that only the lower 32bits are stored in + // new_opt.read_amp_bytes_per_bit. 
+ EXPECT_EQ(1U, new_opt.read_amp_bytes_per_bit); // unknown option - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" - "bad_option=1", - &new_opt)); + Status s = GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" + "bad_option=1", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_EQ(static_cast(table_opt.cache_index_and_filter_blocks), new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized index type - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", - &new_opt)); + s = GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized checksum type - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;checksum=kxxHashXX", - &new_opt)); + ASSERT_NOK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;checksum=kxxHashXX", &new_opt)); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized filter policy name - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;" - "filter_policy=bloomfilterxx:4:true", - &new_opt)); + s = GetBlockBasedTableOptionsFromString(config_options, table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilterxx:4:true", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); 
ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); // unrecognized filter policy config - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;" - "filter_policy=bloomfilter:4", - &new_opt)); + s = GetBlockBasedTableOptionsFromString(config_options, table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilter:4", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + // Ribbon filter policy (no Bloom hybrid) + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:5.678:-1;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 5678); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + + // Ribbon filter policy (default Bloom hybrid) + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:6.789;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + auto ltfp = dynamic_cast( + new_opt.filter_policy.get()); + EXPECT_EQ(ltfp->TEST_GetStartingLevelForB(), 0); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyA()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kFastLocalBloom); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyB()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + + // Ribbon filter policy (custom Bloom hybrid) + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:6.789:5;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + ltfp = dynamic_cast( + 
new_opt.filter_policy.get()); + EXPECT_EQ(ltfp->TEST_GetStartingLevelForB(), 5); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyA()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kFastLocalBloom); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyB()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + + // Old name + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=experimental_ribbon:6.789;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + // Check block cache options are overwritten when specified // in new format as a struct. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" - "block_cache_compressed={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( @@ -614,10 +1017,11 @@ // Set only block cache capacity. Check other values are // reset to default values. 
- ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=2M};" - "block_cache_compressed={capacity=2M}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=2M};" + "block_cache_compressed={capacity=2M}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); // Default values @@ -642,7 +1046,7 @@ // Set couple of block cache options. ASSERT_OK(GetBlockBasedTableOptionsFromString( - table_opt, + config_options, table_opt, "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" "block_cache_compressed={num_shard_bits=5;" "high_pri_pool_ratio=0.0;}", @@ -663,12 +1067,13 @@ 0.0); // Set couple of block cache options. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;};" - "block_cache_compressed={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( @@ -693,12 +1098,16 @@ TEST_F(OptionsTest, GetPlainTableOptionsFromString) { PlainTableOptions table_opt; PlainTableOptions new_opt; + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; // make sure default values are overwritten by something else - ASSERT_OK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" - "full_scan_mode=true;store_index_in_file=true", 
- &new_opt)); + ASSERT_OK(GetPlainTableOptionsFromString( + config_options, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" + "full_scan_mode=true;store_index_in_file=true", + &new_opt)); ASSERT_EQ(new_opt.user_key_len, 66u); ASSERT_EQ(new_opt.bloom_bits_per_key, 20); ASSERT_EQ(new_opt.hash_table_ratio, 0.5); @@ -709,16 +1118,22 @@ ASSERT_TRUE(new_opt.store_index_in_file); // unknown option - ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "bad_option=1", - &new_opt)); + Status s = GetPlainTableOptionsFromString( + config_options, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "bad_option=1", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); // unrecognized EncodingType - ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "encoding_type=kPrefixXX", - &new_opt)); + s = GetPlainTableOptionsFromString( + config_options, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "encoding_type=kPrefixXX", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); } #endif // !ROCKSDB_LITE @@ -728,14 +1143,14 @@ ASSERT_OK(GetMemTableRepFactoryFromString("skip_list", &new_mem_factory)); ASSERT_OK(GetMemTableRepFactoryFromString("skip_list:16", &new_mem_factory)); - ASSERT_EQ(std::string(new_mem_factory->Name()), "SkipListFactory"); + ASSERT_STREQ(new_mem_factory->Name(), "SkipListFactory"); ASSERT_NOK(GetMemTableRepFactoryFromString("skip_list:16:invalid_opt", &new_mem_factory)); ASSERT_OK(GetMemTableRepFactoryFromString("prefix_hash", &new_mem_factory)); ASSERT_OK(GetMemTableRepFactoryFromString("prefix_hash:1000", &new_mem_factory)); - ASSERT_EQ(std::string(new_mem_factory->Name()), "HashSkipListRepFactory"); + ASSERT_STREQ(new_mem_factory->Name(), 
"HashSkipListRepFactory"); ASSERT_NOK(GetMemTableRepFactoryFromString("prefix_hash:1000:invalid_opt", &new_mem_factory)); @@ -761,9 +1176,113 @@ } #endif // !ROCKSDB_LITE +TEST_F(OptionsTest, MemTableRepFactoryCreateFromString) { + std::unique_ptr new_mem_factory = nullptr; + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "skip_list", + &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "skip_list:16", + &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "SkipListFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("skip_list")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("SkipListFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "skip_list:16:invalid_opt", &new_mem_factory)); + + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "invalid_opt=10", &new_mem_factory)); + + // Test a reset + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "", + &new_mem_factory)); + ASSERT_EQ(new_mem_factory, nullptr); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "invalid_opt=10", &new_mem_factory)); + +#ifndef ROCKSDB_LITE + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "id=skip_list; lookahead=32", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "prefix_hash", + &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "prefix_hash:1000", &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "HashSkipListRepFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("prefix_hash")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("HashSkipListRepFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "prefix_hash:1000:invalid_opt", &new_mem_factory)); + 
ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, + "id=prefix_hash; bucket_count=32; skiplist_height=64; " + "branching_factor=16", + &new_mem_factory)); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, + "id=prefix_hash; bucket_count=32; skiplist_height=64; " + "branching_factor=16; invalid=unknown", + &new_mem_factory)); + + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "hash_linkedlist", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "hash_linkedlist:1000", &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "HashLinkListRepFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("hash_linkedlist")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("HashLinkListRepFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "hash_linkedlist:1000:invalid_opt", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, + "id=hash_linkedlist; bucket_count=32; threshold=64; huge_page_size=16; " + "logging_threshold=12; log_when_flash=true", + &new_mem_factory)); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, + "id=hash_linkedlist; bucket_count=32; threshold=64; huge_page_size=16; " + "logging_threshold=12; log_when_flash=true; invalid=unknown", + &new_mem_factory)); + + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "vector", + &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "vector:1024", + &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "VectorRepFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("vector")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("VectorRepFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "vector:1024:invalid_opt", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "id=vector; count=42", &new_mem_factory)); + 
ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "id=vector; invalid=unknown", &new_mem_factory)); +#endif // ROCKSDB_LITE + ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "cuckoo", + &new_mem_factory)); + // CuckooHash memtable is already removed. + ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "cuckoo:1024", + &new_mem_factory)); + + ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "bad_factory", + &new_mem_factory)); +} + #ifndef ROCKSDB_LITE // GetOptionsFromString is not supported in RocksDB Lite +class CustomEnv : public EnvWrapper { + public: + explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} + static const char* kClassName() { return "CustomEnv"; } + const char* Name() const override { return kClassName(); } +}; + TEST_F(OptionsTest, GetOptionsFromStringTest) { Options base_options, new_options; + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + base_options.write_buffer_size = 20; base_options.min_write_buffer_number_to_merge = 15; BlockBasedTableOptions block_based_table_options; @@ -772,14 +1291,8 @@ NewBlockBasedTableFactory(block_based_table_options)); // Register an Env with object registry. 
- const static char* kCustomEnvName = "CustomEnv"; - class CustomEnv : public EnvWrapper { - public: - explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} - }; - - ObjectLibrary::Default()->Register( - kCustomEnvName, + ObjectLibrary::Default()->AddFactory( + CustomEnv::kClassName(), [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, std::string* /* errmsg */) { static CustomEnv env(Env::Default()); @@ -787,7 +1300,7 @@ }); ASSERT_OK(GetOptionsFromString( - base_options, + config_options, base_options, "write_buffer_size=10;max_write_buffer_number=16;" "block_based_table_factory={block_cache=1M;block_size=4;};" "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;" @@ -801,6 +1314,7 @@ ASSERT_EQ(new_options.compression_opts.strategy, 6); ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u); ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.parallel_threads, 1u); ASSERT_EQ(new_options.compression_opts.enabled, false); ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption); ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5); @@ -808,41 +1322,97 @@ ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7); ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u); ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.parallel_threads, 1u); ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false); ASSERT_EQ(new_options.write_buffer_size, 10U); ASSERT_EQ(new_options.max_write_buffer_number, 16); - BlockBasedTableOptions new_block_based_table_options = - dynamic_cast(new_options.table_factory.get()) - ->table_options(); - ASSERT_EQ(new_block_based_table_options.block_cache->GetCapacity(), 1U << 20); - ASSERT_EQ(new_block_based_table_options.block_size, 4U); + const auto new_bbto = + new_options.table_factory->GetOptions(); + ASSERT_NE(new_bbto, 
nullptr); + ASSERT_EQ(new_bbto->block_cache->GetCapacity(), 1U << 20); + ASSERT_EQ(new_bbto->block_size, 4U); // don't overwrite block based table options - ASSERT_TRUE(new_block_based_table_options.cache_index_and_filter_blocks); + ASSERT_TRUE(new_bbto->cache_index_and_filter_blocks); ASSERT_EQ(new_options.create_if_missing, true); ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); Env* newEnv = new_options.env; - ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_OK(Env::LoadEnv(CustomEnv::kClassName(), &newEnv)); ASSERT_EQ(newEnv, new_options.env); + + config_options.ignore_unknown_options = false; + // Test a bad value for a DBOption returns a failure + base_options.dump_malloc_stats = false; + base_options.write_buffer_size = 1024; + Options bad_options = new_options; + Status s = GetOptionsFromString(config_options, base_options, + "create_if_missing=XX;dump_malloc_stats=true", + &bad_options); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(bad_options.dump_malloc_stats, false); + + bad_options = new_options; + s = GetOptionsFromString(config_options, base_options, + "write_buffer_size=XX;dump_malloc_stats=true", + &bad_options); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_EQ(bad_options.dump_malloc_stats, false); + + // Test a bad value for a TableFactory Option returns a failure + bad_options = new_options; + s = GetOptionsFromString(config_options, base_options, + "write_buffer_size=16;dump_malloc_stats=true" + "block_based_table_factory={block_size=XX;};", + &bad_options); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(bad_options.dump_malloc_stats, false); + ASSERT_EQ(bad_options.write_buffer_size, 1024); + + config_options.ignore_unknown_options = true; + ASSERT_OK(GetOptionsFromString(config_options, base_options, + "create_if_missing=XX;dump_malloc_stats=true;" + "write_buffer_size=XX;" + "block_based_table_factory={block_size=XX;};", + &bad_options)); 
+ ASSERT_EQ(bad_options.create_if_missing, base_options.create_if_missing); + ASSERT_EQ(bad_options.dump_malloc_stats, true); + ASSERT_EQ(bad_options.write_buffer_size, base_options.write_buffer_size); + + // Test the old interface + ASSERT_OK(GetOptionsFromString( + base_options, + "write_buffer_size=22;max_write_buffer_number=33;max_open_files=44;", + &new_options)); + ASSERT_EQ(new_options.write_buffer_size, 22U); + ASSERT_EQ(new_options.max_write_buffer_number, 33); + ASSERT_EQ(new_options.max_open_files, 44); } TEST_F(OptionsTest, DBOptionsSerialization) { Options base_options, new_options; Random rnd(301); + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; // Phase 1: Make big change in base_options test::RandomInitDBOptions(&base_options, &rnd); // Phase 2: obtain a string from base_option std::string base_options_file_content; - ASSERT_OK(GetStringFromDBOptions(&base_options_file_content, base_options)); + ASSERT_OK(GetStringFromDBOptions(config_options, base_options, + &base_options_file_content)); // Phase 3: Set new_options from the derived string and expect // new_options == base_options - ASSERT_OK(GetDBOptionsFromString(DBOptions(), base_options_file_content, - &new_options)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_options, new_options)); + ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(), + base_options_file_content, &new_options)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_options, + new_options)); } TEST_F(OptionsTest, OptionsComposeDecompose) { @@ -850,6 +1420,9 @@ // we get same constituent options. 
DBOptions base_db_opts; ColumnFamilyOptions base_cf_opts; + ConfigOptions + config_options; // Use default for ignore(false) and check (exact) + config_options.input_strings_escaped = false; Random rnd(301); test::RandomInitDBOptions(&base_db_opts, &rnd); @@ -859,34 +1432,254 @@ DBOptions new_db_opts(base_opts); ColumnFamilyOptions new_cf_opts(base_opts); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_db_opts, new_db_opts)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opts, new_cf_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_db_opts, + new_db_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opts, + new_cf_opts)); delete new_cf_opts.compaction_filter; } +TEST_F(OptionsTest, DBOptionsComposeImmutable) { + // Build a DBOptions from an Immutable/Mutable one and verify that + // we get same constituent options. + ConfigOptions config_options; + Random rnd(301); + DBOptions base_opts, new_opts; + test::RandomInitDBOptions(&base_opts, &rnd); + MutableDBOptions m_opts(base_opts); + ImmutableDBOptions i_opts(base_opts); + new_opts = BuildDBOptions(i_opts, m_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_opts, + new_opts)); +} + +TEST_F(OptionsTest, GetMutableDBOptions) { + Random rnd(228); + DBOptions base_opts; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + + test::RandomInitDBOptions(&base_opts, &rnd); + ImmutableDBOptions i_opts(base_opts); + MutableDBOptions m_opts(base_opts); + MutableDBOptions new_opts; + ASSERT_OK(GetStringFromMutableDBOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableDBOptionsFromStrings(m_opts, opts_map, &new_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( + config_options, base_opts, BuildDBOptions(i_opts, new_opts))); +} + +TEST_F(OptionsTest, CFOptionsComposeImmutable) { + // Build a DBOptions from an 
Immutable/Mutable one and verify that + // we get same constituent options. + ConfigOptions config_options; + Random rnd(301); + ColumnFamilyOptions base_opts, new_opts; + DBOptions dummy; // Needed to create ImmutableCFOptions + test::RandomInitCFOptions(&base_opts, dummy, &rnd); + MutableCFOptions m_opts(base_opts); + ImmutableCFOptions i_opts(base_opts); + UpdateColumnFamilyOptions(i_opts, &new_opts); + UpdateColumnFamilyOptions(m_opts, &new_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opts, + new_opts)); + delete new_opts.compaction_filter; +} + +TEST_F(OptionsTest, GetMutableCFOptions) { + Random rnd(228); + ColumnFamilyOptions base, copy; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + DBOptions dummy; // Needed to create ImmutableCFOptions + + test::RandomInitCFOptions(&base, dummy, &rnd); + ColumnFamilyOptions result; + MutableCFOptions m_opts(base), new_opts; + + ASSERT_OK(GetStringFromMutableCFOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableOptionsFromStrings(m_opts, opts_map, nullptr, &new_opts)); + UpdateColumnFamilyOptions(ImmutableCFOptions(base), ©); + UpdateColumnFamilyOptions(new_opts, ©); + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base, copy)); + delete copy.compaction_filter; +} + TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { Options options; ColumnFamilyOptions base_opt, new_opt; Random rnd(302); + ConfigOptions config_options; + config_options.input_strings_escaped = false; + // Phase 1: randomly assign base_opt // custom type options test::RandomInitCFOptions(&base_opt, options, &rnd); // Phase 2: obtain a string from base_opt std::string base_options_file_content; - ASSERT_OK( - GetStringFromColumnFamilyOptions(&base_options_file_content, base_opt)); + ASSERT_OK(GetStringFromColumnFamilyOptions(config_options, base_opt, + &base_options_file_content)); // Phase 3: Set 
new_opt from the derived string and expect // new_opt == base_opt - ASSERT_OK(GetColumnFamilyOptionsFromString( - ColumnFamilyOptions(), base_options_file_content, &new_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_opt, new_opt)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(config_options, ColumnFamilyOptions(), + base_options_file_content, &new_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt)); if (base_opt.compaction_filter) { delete base_opt.compaction_filter; } } +TEST_F(OptionsTest, CheckBlockBasedTableOptions) { + ColumnFamilyOptions cf_opts; + DBOptions db_opts; + ConfigOptions config_opts; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_opts, cf_opts, "prefix_extractor=capped:8", &cf_opts)); + ASSERT_OK(TableFactory::CreateFromString(config_opts, "BlockBasedTable", + &cf_opts.table_factory)); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + ASSERT_TRUE(cf_opts.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())); + auto bbto = cf_opts.table_factory->GetOptions(); + ASSERT_OK(cf_opts.table_factory->ConfigureFromString( + config_opts, + "block_cache={capacity=1M;num_shard_bits=4;};" + "block_size_deviation=101;" + "block_restart_interval=0;" + "index_block_restart_interval=5;" + "partition_filters=true;" + "index_type=kHashSearch;" + "no_block_cache=1;")); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->block_cache.get(), nullptr); + ASSERT_EQ(bbto->block_size_deviation, 0); + ASSERT_EQ(bbto->block_restart_interval, 1); + ASSERT_EQ(bbto->index_block_restart_interval, 1); + ASSERT_FALSE(bbto->partition_filters); + ASSERT_OK(TableFactory::CreateFromString(config_opts, "BlockBasedTable", + &cf_opts.table_factory)); + bbto = cf_opts.table_factory->GetOptions(); + + ASSERT_OK(cf_opts.table_factory->ConfigureFromString(config_opts, + "no_block_cache=0;")); + ASSERT_NE(bbto->block_cache.get(), nullptr); + ASSERT_OK(cf_opts.table_factory->ValidateOptions(db_opts, cf_opts)); 
+} + +TEST_F(OptionsTest, MutableTableOptions) { + ConfigOptions config_options; + std::shared_ptr bbtf; + bbtf.reset(NewBlockBasedTableFactory()); + auto bbto = bbtf->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_align", "true")); + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); + ASSERT_EQ(bbto->block_align, true); + ASSERT_EQ(bbto->block_size, 1024); + ASSERT_OK(bbtf->PrepareOptions(config_options)); + config_options.mutable_options_only = true; + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); + ASSERT_EQ(bbto->block_align, true); + ASSERT_NOK(bbtf->ConfigureOption(config_options, "block_align", "false")); + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "2048")); + ASSERT_EQ(bbto->block_align, true); + ASSERT_EQ(bbto->block_size, 2048); + + ColumnFamilyOptions cf_opts; + cf_opts.table_factory = bbtf; + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, "block_based_table_factory.block_align=false", + &cf_opts)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, "block_based_table_factory.block_size=8192", + &cf_opts)); + ASSERT_EQ(bbto->block_align, true); + ASSERT_EQ(bbto->block_size, 8192); +} + +TEST_F(OptionsTest, MutableCFOptions) { + ConfigOptions config_options; + ColumnFamilyOptions cf_opts; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, + "paranoid_file_checks=true; block_based_table_factory.block_align=false; " + "block_based_table_factory.block_size=8192;", + &cf_opts)); + ASSERT_TRUE(cf_opts.paranoid_file_checks); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + const auto bbto = cf_opts.table_factory->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->block_size, 8192); + ASSERT_EQ(bbto->block_align, false); + std::unordered_map unused_opts; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"paranoid_file_checks", "false"}}, 
&cf_opts)); + ASSERT_EQ(cf_opts.paranoid_file_checks, false); + + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "16384"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 16384); + + config_options.mutable_options_only = true; + // Force consistency checks is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"force_consistency_checks", "true"}}, + &cf_opts)); + + // Attempt to change the table. It is not mutable, so this should fail and + // leave the original intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory", "PlainTable"}}, &cf_opts)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory.id", "PlainTable"}}, &cf_opts)); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Change the block size. Should update the value in the current table + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "8192"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 8192); + + // Attempt to turn off block cache fails, as this option is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.no_block_cache", "true"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Attempt to change the block size via a config string/map. Should update + // the current value + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", "{block_size=32768}"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); + + // Attempt to change the block size and no cache through the map. 
Should + // fail, leaving the old values intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", + "{block_size=16384; no_block_cache=true}"}}, + &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); +} + #endif // !ROCKSDB_LITE Status StringToMap( @@ -1079,6 +1872,230 @@ ASSERT_NOK( GetStringFromCompressionType(&res, static_cast(-10))); } + +TEST_F(OptionsTest, OnlyMutableDBOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + DBOptions mdb_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitDBOptions(&db_opts, &rnd); + auto db_config = DBOptionsAsConfigurable(db_opts); + + // Get all of the DB Option names (mutable or not) + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from db_opts and set those in mdb_opts + cfg_opts.mutable_options_only = true; + + // Get only the Mutable DB Option names + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opt_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, mdb_opts, opt_str, &mdb_opts)); + std::string mismatch; + // Comparing only the mutable options, the two are equivalent + auto mdb_config = DBOptionsAsConfigurable(mdb_opts); + ASSERT_TRUE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + ASSERT_TRUE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mdb_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(db_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing all of the options, the two are not equivalent + ASSERT_FALSE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + 
ASSERT_FALSE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); + + // Make sure there are only mutable options being configured + ASSERT_OK(GetDBOptionsFromString(cfg_opts, DBOptions(), opt_str, &db_opts)); +} + +TEST_F(OptionsTest, OnlyMutableCFOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + ColumnFamilyOptions mcf_opts; + ColumnFamilyOptions cf_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitCFOptions(&cf_opts, db_opts, &rnd); + cf_opts.comparator = ReverseBytewiseComparator(); + auto cf_config = CFOptionsAsConfigurable(cf_opts); + + // Get all of the CF Option names (mutable or not) + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from cf_opts and set those in mcf_opts + cfg_opts.mutable_options_only = true; + // Get only the Mutable CF Option names + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opt_str)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(cfg_opts, mcf_opts, opt_str, &mcf_opts)); + std::string mismatch; + + auto mcf_config = CFOptionsAsConfigurable(mcf_opts); + // Comparing only the mutable options, the two are equivalent + ASSERT_TRUE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_TRUE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mcf_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(cf_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing all of the options, the two are not equivalent + ASSERT_FALSE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_FALSE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + delete cf_opts.compaction_filter; + + // Make sure 
the options string contains only mutable options + ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, ColumnFamilyOptions(), + opt_str, &cf_opts)); + delete cf_opts.compaction_filter; +} + +TEST_F(OptionsTest, SstPartitionerTest) { + ConfigOptions cfg_opts; + ColumnFamilyOptions cf_opts, new_opt; + std::string opts_str, mismatch; + + ASSERT_OK(SstPartitionerFactory::CreateFromString( + cfg_opts, SstPartitionerFixedPrefixFactory::kClassName(), + &cf_opts.sst_partitioner_factory)); + ASSERT_NE(cf_opts.sst_partitioner_factory, nullptr); + ASSERT_STREQ(cf_opts.sst_partitioner_factory->Name(), + SstPartitionerFixedPrefixFactory::kClassName()); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + cfg_opts, ColumnFamilyOptions(), + std::string("sst_partitioner_factory={id=") + + SstPartitionerFixedPrefixFactory::kClassName() + "; unknown=10;}", + &cf_opts)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + cfg_opts, ColumnFamilyOptions(), + std::string("sst_partitioner_factory={id=") + + SstPartitionerFixedPrefixFactory::kClassName() + "; length=10;}", + &cf_opts)); + ASSERT_NE(cf_opts.sst_partitioner_factory, nullptr); + ASSERT_STREQ(cf_opts.sst_partitioner_factory->Name(), + SstPartitionerFixedPrefixFactory::kClassName()); + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opts_str)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(cfg_opts, cf_opts, opts_str, &new_opt)); + ASSERT_NE(new_opt.sst_partitioner_factory, nullptr); + ASSERT_STREQ(new_opt.sst_partitioner_factory->Name(), + SstPartitionerFixedPrefixFactory::kClassName()); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, cf_opts, new_opt)); + ASSERT_TRUE(cf_opts.sst_partitioner_factory->AreEquivalent( + cfg_opts, new_opt.sst_partitioner_factory.get(), &mismatch)); +} + +TEST_F(OptionsTest, FileChecksumGenFactoryTest) { + ConfigOptions cfg_opts; + DBOptions db_opts, new_opt; + std::string opts_str, mismatch; + auto factory = GetFileChecksumGenCrc32cFactory(); + + 
cfg_opts.ignore_unsupported_options = false; + + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opts_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, db_opts, opts_str, &new_opt)); + + ASSERT_NE(factory, nullptr); + ASSERT_OK(FileChecksumGenFactory::CreateFromString( + cfg_opts, factory->Name(), &db_opts.file_checksum_gen_factory)); + ASSERT_NE(db_opts.file_checksum_gen_factory, nullptr); + ASSERT_STREQ(db_opts.file_checksum_gen_factory->Name(), factory->Name()); + ASSERT_NOK(GetDBOptionsFromString( + cfg_opts, DBOptions(), "file_checksum_gen_factory=unknown", &db_opts)); + ASSERT_OK(GetDBOptionsFromString( + cfg_opts, DBOptions(), + std::string("file_checksum_gen_factory=") + factory->Name(), &db_opts)); + ASSERT_NE(db_opts.file_checksum_gen_factory, nullptr); + ASSERT_STREQ(db_opts.file_checksum_gen_factory->Name(), factory->Name()); + + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opts_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, db_opts, opts_str, &new_opt)); + ASSERT_NE(new_opt.file_checksum_gen_factory, nullptr); + ASSERT_STREQ(new_opt.file_checksum_gen_factory->Name(), factory->Name()); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(cfg_opts, db_opts, new_opt)); + ASSERT_TRUE(factory->AreEquivalent( + cfg_opts, new_opt.file_checksum_gen_factory.get(), &mismatch)); + ASSERT_TRUE(db_opts.file_checksum_gen_factory->AreEquivalent( + cfg_opts, new_opt.file_checksum_gen_factory.get(), &mismatch)); +} + +class TestTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + private: + std::string id_; + + public: + explicit TestTablePropertiesCollectorFactory(const std::string& id) + : id_(id) {} + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return nullptr; + } + static const char* kClassName() { return "TestCollector"; } + const char* Name() const override { return kClassName(); } + std::string GetId() const override { + return 
std::string(kClassName()) + ":" + id_; + } +}; + +TEST_F(OptionsTest, OptionTablePropertiesTest) { + ConfigOptions cfg_opts; + ColumnFamilyOptions orig, copy; + orig.table_properties_collector_factories.push_back( + std::make_shared("1")); + orig.table_properties_collector_factories.push_back( + std::make_shared("2")); + + // Push two TablePropertiesCollectorFactories then create a new + // ColumnFamilyOptions based on those settings. The copy should + // have no properties but still match the original + std::string opts_str; + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, orig, &opts_str)); + ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, orig, opts_str, ©)); + ASSERT_EQ(copy.table_properties_collector_factories.size(), 0); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, orig, copy)); + + // Now register a TablePropertiesCollectorFactory + // Repeat the experiment. The copy should have the same + // properties as the original + cfg_opts.registry->AddLibrary("collector") + ->AddFactory( + ObjectLibrary::PatternEntry( + TestTablePropertiesCollectorFactory::kClassName(), false) + .AddSeparator(":"), + [](const std::string& name, + std::unique_ptr* guard, + std::string* /* errmsg */) { + std::string id = name.substr( + strlen(TestTablePropertiesCollectorFactory::kClassName()) + 1); + guard->reset(new TestTablePropertiesCollectorFactory(id)); + return guard->get(); + }); + + ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, orig, opts_str, ©)); + ASSERT_EQ(copy.table_properties_collector_factories.size(), 2); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, orig, copy)); +} #endif // !ROCKSDB_LITE TEST_F(OptionsTest, ConvertOptionsTest) { @@ -1094,32 +2111,967 @@ ASSERT_EQ(converted_opt.max_open_files, leveldb_opt.max_open_files); ASSERT_EQ(converted_opt.compression, leveldb_opt.compression); - std::shared_ptr tb_guard = converted_opt.table_factory; - BlockBasedTableFactory* table_factory = - 
dynamic_cast(converted_opt.table_factory.get()); + std::shared_ptr table_factory = converted_opt.table_factory; + const auto table_opt = table_factory->GetOptions(); + ASSERT_NE(table_opt, nullptr); + + ASSERT_EQ(table_opt->block_cache->GetCapacity(), 8UL << 20); + ASSERT_EQ(table_opt->block_size, leveldb_opt.block_size); + ASSERT_EQ(table_opt->block_restart_interval, + leveldb_opt.block_restart_interval); + ASSERT_EQ(table_opt->filter_policy.get(), leveldb_opt.filter_policy); +} +#ifndef ROCKSDB_LITE +class TestEventListener : public EventListener { + private: + std::string id_; - ASSERT_TRUE(table_factory != nullptr); + public: + explicit TestEventListener(const std::string& id) : id_("Test" + id) {} + const char* Name() const override { return id_.c_str(); } +}; - const BlockBasedTableOptions table_opt = table_factory->table_options(); +static std::unordered_map + test_listener_option_info = { + {"s", + {0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, - ASSERT_EQ(table_opt.block_cache->GetCapacity(), 8UL << 20); - ASSERT_EQ(table_opt.block_size, leveldb_opt.block_size); - ASSERT_EQ(table_opt.block_restart_interval, - leveldb_opt.block_restart_interval); - ASSERT_EQ(table_opt.filter_policy.get(), leveldb_opt.filter_policy); +}; + +class TestConfigEventListener : public TestEventListener { + private: + std::string s_; + + public: + explicit TestConfigEventListener(const std::string& id) + : TestEventListener("Config" + id) { + s_ = id; + RegisterOptions("Test", &s_, &test_listener_option_info); + } +}; + +static int RegisterTestEventListener(ObjectLibrary& library, + const std::string& arg) { + library.AddFactory( + "Test" + arg, + [](const std::string& name, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TestEventListener(name.substr(4))); + return guard->get(); + }); + library.AddFactory( + "TestConfig" + arg, + [](const std::string& name, std::unique_ptr* guard, + std::string* /* errmsg */) { + 
guard->reset(new TestConfigEventListener(name.substr(10))); + return guard->get(); + }); + return 1; } +TEST_F(OptionsTest, OptionsListenerTest) { + DBOptions orig, copy; + orig.listeners.push_back(std::make_shared("1")); + orig.listeners.push_back(std::make_shared("2")); + orig.listeners.push_back(std::make_shared("")); + orig.listeners.push_back(std::make_shared("1")); + orig.listeners.push_back(std::make_shared("2")); + orig.listeners.push_back(std::make_shared("")); + ConfigOptions config_opts(orig); + config_opts.registry->AddLibrary("listener", RegisterTestEventListener, "1"); + std::string opts_str; + ASSERT_OK(GetStringFromDBOptions(config_opts, orig, &opts_str)); + ASSERT_OK(GetDBOptionsFromString(config_opts, orig, opts_str, ©)); + ASSERT_OK(GetStringFromDBOptions(config_opts, copy, &opts_str)); + ASSERT_EQ( + copy.listeners.size(), + 2); // The Test{Config}1 Listeners could be loaded but not the others + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts, orig, copy)); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +const static std::string kCustomEnvName = "Custom"; +const static std::string kCustomEnvProp = "env=" + kCustomEnvName; + +static int RegisterCustomEnv(ObjectLibrary& library, const std::string& arg) { + library.AddFactory( + arg, [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + static CustomEnv env(Env::Default()); + return &env; + }); + return 1; +} + +// This test suite tests the old APIs into the Configure options methods. +// Once those APIs are officially deprecated, this test suite can be deleted. 
+class OptionsOldApiTest : public testing::Test {}; + +TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { + std::unordered_map cf_options_map = { + {"write_buffer_size", "1"}, + {"max_write_buffer_number", "2"}, + {"min_write_buffer_number_to_merge", "3"}, + {"max_write_buffer_number_to_maintain", "99"}, + {"max_write_buffer_size_to_maintain", "-99999"}, + {"compression", "kSnappyCompression"}, + {"compression_per_level", + "kNoCompression:" + "kSnappyCompression:" + "kZlibCompression:" + "kBZip2Compression:" + "kLZ4Compression:" + "kLZ4HCCompression:" + "kXpressCompression:" + "kZSTD:" + "kZSTDNotFinalCompression"}, + {"bottommost_compression", "kLZ4Compression"}, + {"bottommost_compression_opts", "5:6:7:8:9:true"}, + {"compression_opts", "4:5:6:7:8:true"}, + {"num_levels", "8"}, + {"level0_file_num_compaction_trigger", "8"}, + {"level0_slowdown_writes_trigger", "9"}, + {"level0_stop_writes_trigger", "10"}, + {"target_file_size_base", "12"}, + {"target_file_size_multiplier", "13"}, + {"max_bytes_for_level_base", "14"}, + {"level_compaction_dynamic_level_bytes", "true"}, + {"max_bytes_for_level_multiplier", "15.0"}, + {"max_bytes_for_level_multiplier_additional", "16:17:18"}, + {"max_compaction_bytes", "21"}, + {"soft_rate_limit", "1.1"}, + {"hard_rate_limit", "2.1"}, + {"hard_pending_compaction_bytes_limit", "211"}, + {"arena_block_size", "22"}, + {"disable_auto_compactions", "true"}, + {"compaction_style", "kCompactionStyleLevel"}, + {"compaction_pri", "kOldestSmallestSeqFirst"}, + {"verify_checksums_in_compaction", "false"}, + {"compaction_options_fifo", "23"}, + {"max_sequential_skip_in_iterations", "24"}, + {"inplace_update_support", "true"}, + {"report_bg_io_stats", "true"}, + {"compaction_measure_io_stats", "false"}, + {"inplace_update_num_locks", "25"}, + {"memtable_prefix_bloom_size_ratio", "0.26"}, + {"memtable_whole_key_filtering", "true"}, + {"memtable_huge_page_size", "28"}, + {"bloom_locality", "29"}, + {"max_successive_merges", "30"}, + 
{"min_partial_merge_operands", "31"}, + {"prefix_extractor", "fixed:31"}, + {"optimize_filters_for_hits", "true"}, + {"enable_blob_files", "true"}, + {"min_blob_size", "1K"}, + {"blob_file_size", "1G"}, + {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, + {"blob_compaction_readahead_size", "256K"}, + }; + + std::unordered_map db_options_map = { + {"create_if_missing", "false"}, + {"create_missing_column_families", "true"}, + {"error_if_exists", "false"}, + {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, + {"max_open_files", "32"}, + {"max_total_wal_size", "33"}, + {"use_fsync", "true"}, + {"db_log_dir", "/db_log_dir"}, + {"wal_dir", "/wal_dir"}, + {"delete_obsolete_files_period_micros", "34"}, + {"max_background_compactions", "35"}, + {"max_background_flushes", "36"}, + {"max_log_file_size", "37"}, + {"log_file_time_to_roll", "38"}, + {"keep_log_file_num", "39"}, + {"recycle_log_file_num", "5"}, + {"max_manifest_file_size", "40"}, + {"table_cache_numshardbits", "41"}, + {"WAL_ttl_seconds", "43"}, + {"WAL_size_limit_MB", "44"}, + {"manifest_preallocation_size", "45"}, + {"allow_mmap_reads", "true"}, + {"allow_mmap_writes", "false"}, + {"use_direct_reads", "false"}, + {"use_direct_io_for_flush_and_compaction", "false"}, + {"is_fd_close_on_exec", "true"}, + {"skip_log_error_on_recovery", "false"}, + {"stats_dump_period_sec", "46"}, + {"stats_persist_period_sec", "57"}, + {"persist_stats_to_disk", "false"}, + {"stats_history_buffer_size", "69"}, + {"advise_random_on_open", "true"}, + {"experimental_mempurge_threshold", "0.0"}, + {"use_adaptive_mutex", "false"}, + {"new_table_reader_for_compaction_inputs", "true"}, + {"compaction_readahead_size", "100"}, + {"random_access_max_buffer_size", "3145728"}, + {"writable_file_max_buffer_size", "314159"}, + {"bytes_per_sync", "47"}, + {"wal_bytes_per_sync", "48"}, 
+ {"strict_bytes_per_sync", "true"}, + }; + + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); + ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); + ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99); + ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999); + ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level.size(), 9U); + ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[6], kXpressCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[7], kZSTD); + ASSERT_EQ(new_cf_opt.compression_per_level[8], kZSTDNotFinalCompression); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, + CompressionOptions().parallel_threads); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); 
+ ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + CompressionOptions().parallel_threads); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.num_levels, 8); + ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); + ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9); + ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10); + ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); + ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); + ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[1], 17); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[2], 18); + ASSERT_EQ(new_cf_opt.max_compaction_bytes, 21); + ASSERT_EQ(new_cf_opt.hard_pending_compaction_bytes_limit, 211); + ASSERT_EQ(new_cf_opt.arena_block_size, 22U); + ASSERT_EQ(new_cf_opt.disable_auto_compactions, true); + ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel); + ASSERT_EQ(new_cf_opt.compaction_pri, kOldestSmallestSeqFirst); + ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size, + static_cast(23)); + ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations, + static_cast(24)); + ASSERT_EQ(new_cf_opt.inplace_update_support, true); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_size_ratio, 0.26); + ASSERT_EQ(new_cf_opt.memtable_whole_key_filtering, true); + ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U); + ASSERT_EQ(new_cf_opt.bloom_locality, 29U); + ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + 
ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); + ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); + ASSERT_EQ(new_cf_opt.enable_blob_files, true); + ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); + ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); + ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); + ASSERT_EQ(new_cf_opt.blob_compaction_readahead_size, 262144); + + cf_options_map["write_buffer_size"] = "hello"; + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ConfigOptions exact, loose; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + loose.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + cf_options_map["write_buffer_size"] = "1"; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + + cf_options_map["unknown_option"] = "1"; + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, + &new_cf_opt, + false, /* input_strings_escaped */ + true /* ignore_unknown_options */)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + loose, base_cf_opt, new_cf_opt, nullptr /* new_opt_map */)); + ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( + exact /* default for VerifyCFOptions */, base_cf_opt, new_cf_opt, nullptr)); + + DBOptions base_db_opt; + DBOptions new_db_opt; + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + 
ASSERT_EQ(new_db_opt.create_missing_column_families, true); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); + ASSERT_EQ(new_db_opt.max_open_files, 32); + ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); + ASSERT_EQ(new_db_opt.use_fsync, true); + ASSERT_EQ(new_db_opt.db_log_dir, "/db_log_dir"); + ASSERT_EQ(new_db_opt.wal_dir, "/wal_dir"); + ASSERT_EQ(new_db_opt.delete_obsolete_files_period_micros, + static_cast(34)); + ASSERT_EQ(new_db_opt.max_background_compactions, 35); + ASSERT_EQ(new_db_opt.max_background_flushes, 36); + ASSERT_EQ(new_db_opt.max_log_file_size, 37U); + ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U); + ASSERT_EQ(new_db_opt.keep_log_file_num, 39U); + ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U); + ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast(40)); + ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41); + ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast(43)); + ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast(44)); + ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U); + ASSERT_EQ(new_db_opt.allow_mmap_reads, true); + ASSERT_EQ(new_db_opt.allow_mmap_writes, false); + ASSERT_EQ(new_db_opt.use_direct_reads, false); + ASSERT_EQ(new_db_opt.use_direct_io_for_flush_and_compaction, false); + ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true); + ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); + ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); + ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); + ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); + ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); + ASSERT_EQ(new_db_opt.advise_random_on_open, true); + ASSERT_EQ(new_db_opt.experimental_mempurge_threshold, 0.0); + ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); + ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true); + ASSERT_EQ(new_db_opt.compaction_readahead_size, 100); + 
ASSERT_EQ(new_db_opt.random_access_max_buffer_size, 3145728); + ASSERT_EQ(new_db_opt.writable_file_max_buffer_size, 314159); + ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast(47)); + ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast(48)); + ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true); + + db_options_map["max_open_files"] = "hello"; + ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); + + // unknow options should fail parsing without ignore_unknown_options = true + db_options_map["unknown_db_option"] = "1"; + ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, + false, /* input_strings_escaped */ + true /* ignore_unknown_options */)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); + ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); +} + +TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + base_cf_opt.table_factory.reset(); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=5", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); + ASSERT_TRUE(new_cf_opt.table_factory == nullptr); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=6;", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); + 
ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); + // Wrong name "max_write_buffer_number_" + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", + &new_cf_opt)); + ConfigOptions exact; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Comparator from object registry + std::string kCompName = "reverse_comp"; + ObjectLibrary::Default()->AddFactory( + kCompName, + [](const std::string& /*name*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); + + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator()); + + // MergeOperator from object registry + std::unique_ptr bxo(new BytesXOROperator()); + std::string kMoName = bxo->Name(); + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "merge_operator=" + kMoName + ";", &new_cf_opt)); + ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name())); + + // Wrong key/value pair + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Error Paring value + 
ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Missing option name + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13; =100;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + const uint64_t kilo = 1024UL; + const uint64_t mega = 1024 * kilo; + const uint64_t giga = 1024 * mega; + const uint64_t tera = 1024 * giga; + + // Units (k) + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo); + // Units (m) + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega); + // Units (g) + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=18g;prefix_extractor=capped:8;" + "arena_block_size=19G", + &new_cf_opt)); + + ASSERT_EQ(new_cf_opt.write_buffer_size, 18 * giga); + ASSERT_EQ(new_cf_opt.arena_block_size, 19 * giga); + ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.CappedPrefix.8"); + + // Units (t) + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera); + ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera); + + // Nested block based table options + // Empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={};arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Non-empty + 
ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Last one + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;}", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Mismatch curly braces + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={{{block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Unexpected chars after closing curly brace + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa;" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Invalid block based table option + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); + 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "optimize_filters_for_hits=true", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "optimize_filters_for_hits=false", + &new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "optimize_filters_for_hits=junk", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Nested plain table options + // Empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={};arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); + // Non-empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); + + // memtable factory + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "memtable=skip_list:10;arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr); + ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory")); +} + +TEST_F(OptionsTest, SliceTransformCreateFromString) { + std::shared_ptr transform = nullptr; + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + + ASSERT_OK( + SliceTransform::CreateFromString(config_options, "fixed:31", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_FALSE(transform->IsInstanceOf("capped")); + 
ASSERT_TRUE(transform->IsInstanceOf("fixed")); + ASSERT_TRUE(transform->IsInstanceOf("rocksdb.FixedPrefix")); + ASSERT_EQ(transform->GetId(), "rocksdb.FixedPrefix.31"); + ASSERT_OK(SliceTransform::CreateFromString( + config_options, "rocksdb.FixedPrefix.42", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_EQ(transform->GetId(), "rocksdb.FixedPrefix.42"); + + ASSERT_OK(SliceTransform::CreateFromString(config_options, "capped:16", + &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_FALSE(transform->IsInstanceOf("fixed")); + ASSERT_TRUE(transform->IsInstanceOf("capped")); + ASSERT_TRUE(transform->IsInstanceOf("rocksdb.CappedPrefix")); + ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.16"); + ASSERT_OK(SliceTransform::CreateFromString( + config_options, "rocksdb.CappedPrefix.42", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.42"); + + ASSERT_OK(SliceTransform::CreateFromString(config_options, "rocksdb.Noop", + &transform)); + ASSERT_NE(transform, nullptr); + + ASSERT_NOK(SliceTransform::CreateFromString(config_options, + "fixed:21:invalid", &transform)); + ASSERT_NOK(SliceTransform::CreateFromString(config_options, + "capped:21:invalid", &transform)); + ASSERT_NOK( + SliceTransform::CreateFromString(config_options, "fixed", &transform)); + ASSERT_NOK( + SliceTransform::CreateFromString(config_options, "capped", &transform)); + ASSERT_NOK(SliceTransform::CreateFromString( + config_options, "rocksdb.FixedPrefix:42", &transform)); + ASSERT_NOK(SliceTransform::CreateFromString( + config_options, "rocksdb.CappedPrefix:42", &transform)); + ASSERT_NOK( + SliceTransform::CreateFromString(config_options, "invalid", &transform)); + +#ifndef ROCKSDB_LITE + ASSERT_OK(SliceTransform::CreateFromString( + config_options, "id=rocksdb.CappedPrefix; length=11", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.11"); + + ASSERT_NOK(SliceTransform::CreateFromString( 
+ config_options, "id=rocksdb.CappedPrefix; length=11; invalid=true", + &transform)); +#endif // ROCKSDB_LITE +} + +TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { + BlockBasedTableOptions table_opt; + BlockBasedTableOptions new_opt; + // make sure default values are overwritten by something else + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "cache_index_and_filter_blocks=1;index_type=kHashSearch;" + "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "block_cache=1M;block_cache_compressed=1k;block_size=1024;" + "block_size_deviation=8;block_restart_interval=4;" + "format_version=5;whole_key_filtering=1;" + "filter_policy=bloomfilter:4.567:false;", + &new_opt)); + ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); + ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); + ASSERT_TRUE(new_opt.hash_index_allow_collision); + ASSERT_TRUE(new_opt.no_block_cache); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL); + ASSERT_EQ(new_opt.block_size, 1024UL); + ASSERT_EQ(new_opt.block_size_deviation, 8); + ASSERT_EQ(new_opt.block_restart_interval, 4); + ASSERT_EQ(new_opt.format_version, 5U); + ASSERT_EQ(new_opt.whole_key_filtering, true); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + const BloomFilterPolicy& bfp = + dynamic_cast(*new_opt.filter_policy); + EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); + + // unknown option + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" + "bad_option=1", + &new_opt)); + ASSERT_EQ(static_cast(table_opt.cache_index_and_filter_blocks), + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.index_type, new_opt.index_type); 
+ + // unrecognized index type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.index_type, new_opt.index_type); + + // unrecognized checksum type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;checksum=kxxHashXX", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.index_type, new_opt.index_type); + + // unrecognized filter policy name + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilterxx:4:true", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + + // unrecognized filter policy config + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilter:4", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + + // Check block cache options are overwritten when specified + // in new format as a struct. 
+ ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetHighPriPoolRatio(), + 0.5); + + // Set only block cache capacity. Check other values are + // reset to default values. 
+ ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=2M};" + "block_cache_compressed={capacity=2M}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); + // Default values + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), + GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); + // Default values + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), + GetDefaultCacheShardBits( + new_opt.block_cache_compressed->GetCapacity())); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); + + // Set couple of block cache options. 
+ ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={num_shard_bits=5;" + "high_pri_pool_ratio=0.0;}", + &new_opt)); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 5); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 5); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.0); + + // Set couple of block cache options. + ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); +} + 
+TEST_F(OptionsOldApiTest, GetPlainTableOptionsFromString) { + PlainTableOptions table_opt; + PlainTableOptions new_opt; + // make sure default values are overwritten by something else + ASSERT_OK(GetPlainTableOptionsFromString(table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" + "full_scan_mode=true;store_index_in_file=true", + &new_opt)); + ASSERT_EQ(new_opt.user_key_len, 66u); + ASSERT_EQ(new_opt.bloom_bits_per_key, 20); + ASSERT_EQ(new_opt.hash_table_ratio, 0.5); + ASSERT_EQ(new_opt.index_sparseness, 8); + ASSERT_EQ(new_opt.huge_page_tlb_size, 4); + ASSERT_EQ(new_opt.encoding_type, EncodingType::kPrefix); + ASSERT_TRUE(new_opt.full_scan_mode); + ASSERT_TRUE(new_opt.store_index_in_file); + + std::unordered_map opt_map; + ASSERT_OK(StringToMap( + "user_key_len=55;bloom_bits_per_key=10;huge_page_tlb_size=8;", &opt_map)); + ASSERT_OK(GetPlainTableOptionsFromMap(table_opt, opt_map, &new_opt)); + ASSERT_EQ(new_opt.user_key_len, 55u); + ASSERT_EQ(new_opt.bloom_bits_per_key, 10); + ASSERT_EQ(new_opt.huge_page_tlb_size, 8); + + // unknown option + ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "bad_option=1", + &new_opt)); + + // unrecognized EncodingType + ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "encoding_type=kPrefixXX", + &new_opt)); +} + +TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { + Options base_options, new_options; + base_options.write_buffer_size = 20; + base_options.min_write_buffer_number_to_merge = 15; + BlockBasedTableOptions block_based_table_options; + block_based_table_options.cache_index_and_filter_blocks = true; + base_options.table_factory.reset( + NewBlockBasedTableFactory(block_based_table_options)); + + // Register an Env with object registry. 
+ ObjectLibrary::Default()->AddFactory( + "CustomEnvDefault", + [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + static CustomEnv env(Env::Default()); + return &env; + }); + + ASSERT_OK(GetOptionsFromString( + base_options, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;" + "bottommost_compression_opts=5:6:7;create_if_missing=true;max_open_files=" + "1;" + "rate_limiter_bytes_per_sec=1024;env=CustomEnvDefault", + &new_options)); + + ASSERT_EQ(new_options.compression_opts.window_bits, 4); + ASSERT_EQ(new_options.compression_opts.level, 5); + ASSERT_EQ(new_options.compression_opts.strategy, 6); + ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.parallel_threads, 1u); + ASSERT_EQ(new_options.compression_opts.enabled, false); + ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption); + ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_options.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.parallel_threads, 1u); + ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false); + ASSERT_EQ(new_options.write_buffer_size, 10U); + ASSERT_EQ(new_options.max_write_buffer_number, 16); + + auto new_block_based_table_options = + new_options.table_factory->GetOptions(); + ASSERT_NE(new_block_based_table_options, nullptr); + ASSERT_EQ(new_block_based_table_options->block_cache->GetCapacity(), + 1U << 20); + ASSERT_EQ(new_block_based_table_options->block_size, 4U); + // don't 
overwrite block based table options + ASSERT_TRUE(new_block_based_table_options->cache_index_and_filter_blocks); + + ASSERT_EQ(new_options.create_if_missing, true); + ASSERT_EQ(new_options.max_open_files, 1); + ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); + Env* newEnv = new_options.env; + ASSERT_OK(Env::LoadEnv("CustomEnvDefault", &newEnv)); + ASSERT_EQ(newEnv, new_options.env); +} + +TEST_F(OptionsOldApiTest, DBOptionsSerialization) { + Options base_options, new_options; + Random rnd(301); + + // Phase 1: Make big change in base_options + test::RandomInitDBOptions(&base_options, &rnd); + + // Phase 2: obtain a string from base_option + std::string base_options_file_content; + ASSERT_OK(GetStringFromDBOptions(&base_options_file_content, base_options)); + + // Phase 3: Set new_options from the derived string and expect + // new_options == base_options + ASSERT_OK(GetDBOptionsFromString(DBOptions(), base_options_file_content, + &new_options)); + ConfigOptions config_options; + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_options, new_options)); +} + +TEST_F(OptionsOldApiTest, ColumnFamilyOptionsSerialization) { + Options options; + ColumnFamilyOptions base_opt, new_opt; + Random rnd(302); + // Phase 1: randomly assign base_opt + // custom type options + test::RandomInitCFOptions(&base_opt, options, &rnd); + + // Phase 2: obtain a string from base_opt + std::string base_options_file_content; + ASSERT_OK( + GetStringFromColumnFamilyOptions(&base_options_file_content, base_opt)); + + // Phase 3: Set new_opt from the derived string and expect + // new_opt == base_opt + ASSERT_OK(GetColumnFamilyOptionsFromString( + ColumnFamilyOptions(), base_options_file_content, &new_opt)); + ConfigOptions config_options; + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt)); + if (base_opt.compaction_filter) { + delete base_opt.compaction_filter; + } +} +#endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE class 
OptionsParserTest : public testing::Test { public: - OptionsParserTest() { - env_.reset(new test::StringEnv(Env::Default())); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - } + OptionsParserTest() { fs_.reset(new test::StringFS(FileSystem::Default())); } protected: - std::unique_ptr env_; - std::unique_ptr fs_; + std::shared_ptr fs_; }; TEST_F(OptionsParserTest, Comment) { @@ -1148,15 +3100,19 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(*parser.db_opt(), db_opt)); + ConfigOptions exact; + exact.input_strings_escaped = false; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, *parser.db_opt(), db_opt)); ASSERT_EQ(parser.NumColumnFamilies(), 1U); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( - *parser.GetCFOptions("default"), cf_opt)); + exact, *parser.GetCFOptions("default"), cf_opt)); } TEST_F(OptionsParserTest, ExtraSpace) { @@ -1175,7 +3131,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1193,10 +3149,11 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( 
parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); + ; } TEST_F(OptionsParserTest, DoubleDBOptions) { @@ -1222,7 +3179,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1250,7 +3207,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1280,7 +3237,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1309,7 +3266,7 @@ "[CFOptions \"something_else\"]\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1377,8 +3334,12 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->DeleteFile(kTestFileName); - env_->WriteToNewFile(kTestFileName, options_file_content); + auto s = fs_->FileExists(kTestFileName, IOOptions(), nullptr); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + 
if (s.ok()) { + ASSERT_OK(fs_->DeleteFile(kTestFileName, IOOptions(), nullptr)); + } + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1426,7 +3387,7 @@ snprintf(buffer, kLength - 1, file_template.c_str(), iv.c_str()); parser.Reset(); - env_->WriteToNewFile(iv, buffer); + ASSERT_OK(fs_->WriteToNewFile(iv, buffer)); ASSERT_NOK(parser.Parse(iv, fs_.get(), false, 0 /* readahead_size */)); } @@ -1435,7 +3396,7 @@ for (auto vv : valid_versions) { snprintf(buffer, kLength - 1, file_template.c_str(), vv.c_str()); parser.Reset(); - env_->WriteToNewFile(vv, buffer); + ASSERT_OK(fs_->WriteToNewFile(vv, buffer)); ASSERT_OK(parser.Parse(vv, fs_.get(), false, 0 /* readahead_size */)); } } @@ -1444,41 +3405,43 @@ ColumnFamilyOptions* base_cf_opt, const ColumnFamilyOptions* new_cf_opt, const std::unordered_map* new_cf_opt_map) { std::string name_buffer; - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ConfigOptions config_options; + config_options.input_strings_escaped = false; + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, *base_cf_opt, + *new_cf_opt, new_cf_opt_map)); // change the name of merge operator back-and-forth { - auto* merge_operator = dynamic_cast( - base_cf_opt->merge_operator.get()); + auto* merge_operator = base_cf_opt->merge_operator + ->CheckedCast(); if (merge_operator != nullptr) { name_buffer = merge_operator->Name(); // change the name and expect non-ok status merge_operator->SetName("some-other-name"); ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // change the name back and expect ok status merge_operator->SetName(name_buffer); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } // change the name of the compaction filter factory back-and-forth { auto* compaction_filter_factory = - dynamic_cast( - base_cf_opt->compaction_filter_factory.get()); + base_cf_opt->compaction_filter_factory + ->CheckedCast(); if (compaction_filter_factory != nullptr) { name_buffer = compaction_filter_factory->Name(); // change the name and expect non-ok status compaction_filter_factory->SetName("some-other-name"); ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // change the name back and expect ok status compaction_filter_factory->SetName(name_buffer); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } @@ -1489,11 +3452,11 @@ base_cf_opt->compaction_filter = nullptr; // set compaction_filter to nullptr and expect non-ok status ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // set the value back and expect ok status base_cf_opt->compaction_filter = tmp_compaction_filter; - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } @@ -1504,11 +3467,11 @@ base_cf_opt->table_factory.reset(); // set table_factory to nullptr and expect non-ok status ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // set the value back and expect ok status base_cf_opt->table_factory = tmp_table_factory; - 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } @@ -1519,11 +3482,11 @@ base_cf_opt->memtable_factory.reset(); // set memtable_factory to nullptr and expect non-ok status ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // set the value back and expect ok status base_cf_opt->memtable_factory = tmp_memtable_factory; - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } } @@ -1542,37 +3505,37 @@ kOptionsFileName, fs_.get())); uint64_t file_size = 0; - ASSERT_OK(env_->GetFileSize(kOptionsFileName, &file_size)); + ASSERT_OK( + fs_->GetFileSize(kOptionsFileName, IOOptions(), &file_size, nullptr)); assert(file_size > 0); - + RocksDBOptionsParser parser; - env_->num_seq_file_read_ = 0; + fs_->num_seq_file_read_ = 0; size_t readahead_size = 128 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); readahead_size = 1024 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); // Tiny readahead. 8 KB is read each time. 
- env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 1 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), file_size / (8 * 1024)); - ASSERT_LT(env_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); + ASSERT_GE(fs_->num_seq_file_read_.load(), file_size / (8 * 1024)); + ASSERT_LT(fs_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); // Disable readahead means 512KB readahead. - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), - (file_size - 1) / (512 * 1024) + 1); + ASSERT_GE(fs_->num_seq_file_read_.load(), (file_size - 1) / (512 * 1024) + 1); } TEST_F(OptionsParserTest, DumpAndParse) { @@ -1607,32 +3570,35 @@ } const std::string kOptionsFileName = "test-persisted-options.ini"; + // Use default for escaped(true), unknown(false) and check (exact) + ConfigOptions config_options; ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts, kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; - ASSERT_OK( - parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); + ASSERT_OK(parser.Parse(config_options, kOptionsFileName, fs_.get())); // Make sure block-based table factory options was deserialized correctly std::shared_ptr ttf = (*parser.cf_opts())[4].table_factory; - ASSERT_EQ(BlockBasedTableFactory::kName, std::string(ttf->Name())); - const BlockBasedTableOptions& parsed_bbto = - static_cast(ttf.get())->table_options(); - ASSERT_EQ(special_bbto.block_size, parsed_bbto.block_size); + ASSERT_EQ(TableFactory::kBlockBasedTableName(), std::string(ttf->Name())); + const auto parsed_bbto = ttf->GetOptions(); + ASSERT_NE(parsed_bbto, nullptr); + ASSERT_EQ(special_bbto.block_size, parsed_bbto->block_size); ASSERT_EQ(special_bbto.cache_index_and_filter_blocks, - 
parsed_bbto.cache_index_and_filter_blocks); + parsed_bbto->cache_index_and_filter_blocks); ASSERT_OK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - base_db_opt, cf_names, base_cf_opts, kOptionsFileName, fs_.get())); + config_options, base_db_opt, cf_names, base_cf_opts, kOptionsFileName, + fs_.get())); - ASSERT_OK( - RocksDBOptionsParser::VerifyDBOptions(*parser.db_opt(), base_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( + config_options, *parser.db_opt(), base_db_opt)); for (int c = 0; c < num_cf; ++c) { const auto* cf_opt = parser.GetCFOptions(cf_names[c]); ASSERT_NE(cf_opt, nullptr); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( - base_cf_opts[c], *cf_opt, &(parser.cf_opt_maps()->at(c)))); + config_options, base_cf_opts[c], *cf_opt, + &(parser.cf_opt_maps()->at(c)))); } // Further verify pointer-typed options @@ -1647,7 +3613,8 @@ base_db_opt.max_open_files++; ASSERT_NOK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - base_db_opt, cf_names, base_cf_opts, kOptionsFileName, fs_.get())); + config_options, base_db_opt, cf_names, base_cf_opts, kOptionsFileName, + fs_.get())); for (int c = 0; c < num_cf; ++c) { if (base_cf_opts[c].compaction_filter) { @@ -1671,8 +3638,8 @@ kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; - ASSERT_OK( - parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); + ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, + 4096 /* readahead_size */)); { Options old_default_opts; @@ -1747,38 +3714,94 @@ ASSERT_EQ(5000, small_opts.max_open_files); } -class OptionsSanityCheckTest : public OptionsParserTest { +class OptionsSanityCheckTest : public OptionsParserTest, + public ::testing::WithParamInterface { + protected: + ConfigOptions config_options_; + public: - OptionsSanityCheckTest() {} + OptionsSanityCheckTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = GetParam(); + config_options_.input_strings_escaped = true; + } 
protected: - Status SanityCheckCFOptions(const ColumnFamilyOptions& cf_opts, - OptionsSanityCheckLevel level) { + Status SanityCheckOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts, + ConfigOptions::SanityLevel level) { + config_options_.sanity_level = level; return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - DBOptions(), {"default"}, {cf_opts}, kOptionsFileName, fs_.get(), - level); + config_options_, db_opts, {"default"}, {cf_opts}, kOptionsFileName, + fs_.get()); } - Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { - Status s = env_->DeleteFile(kOptionsFileName); + Status SanityCheckCFOptions(const ColumnFamilyOptions& cf_opts, + ConfigOptions::SanityLevel level) { + return SanityCheckOptions(DBOptions(), cf_opts, level); + } + + void SanityCheckCFOptions(const ColumnFamilyOptions& opts, bool exact) { + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); + if (exact) { + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } else { + ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } + } + + Status SanityCheckDBOptions(const DBOptions& db_opts, + ConfigOptions::SanityLevel level) { + return SanityCheckOptions(db_opts, ColumnFamilyOptions(), level); + } + + void SanityCheckDBOptions(const DBOptions& opts, bool exact) { + ASSERT_OK(SanityCheckDBOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelNone)); + if (exact) { + ASSERT_OK( + SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } else { + ASSERT_NOK( + SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } + } + + Status PersistOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) { + Status s = fs_->DeleteFile(kOptionsFileName, IOOptions(), nullptr); if 
(!s.ok()) { return s; } - return PersistRocksDBOptions(DBOptions(), {"default"}, {cf_opts}, + return PersistRocksDBOptions(db_opts, {"default"}, {cf_opts}, kOptionsFileName, fs_.get()); } + Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { + return PersistOptions(DBOptions(), cf_opts); + } + + Status PersistDBOptions(const DBOptions& db_opts) { + return PersistOptions(db_opts, ColumnFamilyOptions()); + } + const std::string kOptionsFileName = "OPTIONS"; }; -TEST_F(OptionsSanityCheckTest, SanityCheck) { +TEST_P(OptionsSanityCheckTest, CFOptionsSanityCheck) { ColumnFamilyOptions opts; Random rnd(301); // default ColumnFamilyOptions { ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); } // prefix_extractor @@ -1786,59 +3809,69 @@ // Okay to change prefix_extractor form nullptr to non-nullptr ASSERT_EQ(opts.prefix_extractor.get(), nullptr); opts.prefix_extractor.reset(NewCappedPrefixTransform(10)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); // use same prefix extractor but with different parameter opts.prefix_extractor.reset(NewCappedPrefixTransform(15)); - // expect pass only in kSanityLevelLooselyCompatible - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + // expect pass only in + // ConfigOptions::kSanityLevelLooselyCompatible + 
ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // repeat the test with FixedPrefixTransform opts.prefix_extractor.reset(NewFixedPrefixTransform(10)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change of prefix_extractor ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); // use same prefix extractor but with different parameter opts.prefix_extractor.reset(NewFixedPrefixTransform(15)); - // expect pass only in kSanityLevelLooselyCompatible - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + // expect pass only in + // ConfigOptions::kSanityLevelLooselyCompatible + SanityCheckCFOptions(opts, false); // Change prefix extractor from non-nullptr to nullptr opts.prefix_extractor.reset(); // expect pass as it's safe to change prefix_extractor // from non-null to null - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); } // persist the change 
ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); // table_factory { for (int tb = 0; tb <= 2; ++tb) { // change the table factory opts.table_factory.reset(test::RandomTableFactory(&rnd, tb)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); } } @@ -1846,32 +3879,35 @@ { // Test when going from nullptr -> merge operator opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); for (int test = 0; test < 5; ++test) { // change the merge operator opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, 
kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); } // Test when going from merge operator -> nullptr opts.merge_operator = nullptr; - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, true); } // compaction_filter @@ -1879,12 +3915,11 @@ for (int test = 0; test < 5; ++test) { // change the compaction filter opts.compaction_filter = test::RandomCompactionFilter(&rnd); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + SanityCheckCFOptions(opts, false); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); delete opts.compaction_filter; opts.compaction_filter = nullptr; } @@ -1896,16 +3931,57 @@ // change the compaction filter factory opts.compaction_filter_factory.reset( test::RandomCompactionFilterFactory(&rnd)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + SanityCheckCFOptions(opts, false); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); } } } +TEST_P(OptionsSanityCheckTest, DBOptionsSanityCheck) { + DBOptions opts; + Random rnd(301); + + // default DBOptions + { + ASSERT_OK(PersistDBOptions(opts)); + ASSERT_OK( + SanityCheckDBOptions(opts, 
ConfigOptions::kSanityLevelExactMatch)); + } + + // File checksum generator + { + class MockFileChecksumGenFactory : public FileChecksumGenFactory { + public: + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& /*context*/) override { + return nullptr; + } + }; + + // Okay to change file_checksum_gen_factory form nullptr to non-nullptr + ASSERT_EQ(opts.file_checksum_gen_factory.get(), nullptr); + opts.file_checksum_gen_factory.reset(new MockFileChecksumGenFactory()); + + // persist the change + ASSERT_OK(PersistDBOptions(opts)); + SanityCheckDBOptions(opts, config_options_.ignore_unsupported_options); + + // Change file_checksum_gen_factory from non-nullptr to nullptr + opts.file_checksum_gen_factory.reset(); + // expect pass as it's safe to change file_checksum_gen_factory + // from non-null to null + SanityCheckDBOptions(opts, false); + } + // persist the change + ASSERT_OK(PersistDBOptions(opts)); + ASSERT_OK(SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch)); +} + namespace { bool IsEscapedString(const std::string& str) { for (size_t i = 0; i < str.size(); ++i) { @@ -1992,7 +4068,635 @@ "Escape \\# and # comment together ."), "Escape \\# and"); } + +static void TestAndCompareOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, void* base_ptr, + void* comp_ptr, bool strip = false) { + std::string result, mismatch; + ASSERT_OK(opt_info.Serialize(config_options, opt_name, base_ptr, &result)); + if (strip) { + ASSERT_EQ(result.at(0), '{'); + ASSERT_EQ(result.at(result.size() - 1), '}'); + result = result.substr(1, result.size() - 2); + } + ASSERT_OK(opt_info.Parse(config_options, opt_name, result, comp_ptr)); + ASSERT_TRUE(opt_info.AreEqual(config_options, opt_name, base_ptr, comp_ptr, + &mismatch)); +} + +static void TestParseAndCompareOption(const 
ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, + void* base_ptr, void* comp_ptr, + bool strip = false) { + ASSERT_OK(opt_info.Parse(config_options, opt_name, opt_value, base_ptr)); + TestAndCompareOption(config_options, opt_info, opt_name, base_ptr, comp_ptr, + strip); +} + +template +void TestOptInfo(const ConfigOptions& config_options, OptionType opt_type, + T* base, T* comp) { + std::string result; + OptionTypeInfo opt_info(0, opt_type); + ASSERT_FALSE(opt_info.AreEqual(config_options, "base", base, comp, &result)); + ASSERT_EQ(result, "base"); + ASSERT_NE(*base, *comp); + TestAndCompareOption(config_options, opt_info, "base", base, comp); + ASSERT_EQ(*base, *comp); +} + +class OptionTypeInfoTest : public testing::Test {}; + +TEST_F(OptionTypeInfoTest, BasicTypes) { + ConfigOptions config_options; + { + bool a = true, b = false; + TestOptInfo(config_options, OptionType::kBoolean, &a, &b); + } + { + int a = 100, b = 200; + TestOptInfo(config_options, OptionType::kInt, &a, &b); + } + { + int32_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kInt32T, &a, &b); + } + { + int64_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kInt64T, &a, &b); + } + { + unsigned int a = 100, b = 200; + TestOptInfo(config_options, OptionType::kUInt, &a, &b); + } + { + uint32_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kUInt32T, &a, &b); + } + { + uint64_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kUInt64T, &a, &b); + } + { + size_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kSizeT, &a, &b); + } + { + std::string a = "100", b = "200"; + TestOptInfo(config_options, OptionType::kString, &a, &b); + } + { + double a = 1.0, b = 2.0; + TestOptInfo(config_options, OptionType::kDouble, &a, &b); + } +} + +TEST_F(OptionTypeInfoTest, TestInvalidArgs) { + ConfigOptions config_options; + bool b; + int i; + int32_t i32; + int64_t i64; + 
unsigned int u; + int32_t u32; + int64_t u64; + size_t sz; + double d; + + ASSERT_NOK(OptionTypeInfo(0, OptionType::kBoolean) + .Parse(config_options, "b", "x", &b)); + ASSERT_NOK( + OptionTypeInfo(0, OptionType::kInt).Parse(config_options, "b", "x", &i)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kInt32T) + .Parse(config_options, "b", "x", &i32)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kInt64T) + .Parse(config_options, "b", "x", &i64)); + ASSERT_NOK( + OptionTypeInfo(0, OptionType::kUInt).Parse(config_options, "b", "x", &u)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kUInt32T) + .Parse(config_options, "b", "x", &u32)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kUInt64T) + .Parse(config_options, "b", "x", &u64)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kSizeT) + .Parse(config_options, "b", "x", &sz)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kDouble) + .Parse(config_options, "b", "x", &d)); + + // Don't know how to convert Unknowns to anything else + ASSERT_NOK(OptionTypeInfo(0, OptionType::kUnknown) + .Parse(config_options, "b", "x", &d)); + + // Verify that if the parse function throws an exception, it is also trapped + OptionTypeInfo func_info(0, OptionType::kUnknown, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions&, const std::string&, + const std::string& value, void* addr) { + auto ptr = static_cast(addr); + *ptr = ParseInt(value); + return Status::OK(); + }); + ASSERT_OK(func_info.Parse(config_options, "b", "1", &i)); + ASSERT_NOK(func_info.Parse(config_options, "b", "x", &i)); +} + +TEST_F(OptionTypeInfoTest, TestParseFunc) { + OptionTypeInfo opt_info( + 0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& /*opts*/, const std::string& name, + const std::string& value, void* addr) { + auto ptr = static_cast(addr); + if (name == "Oops") { + return Status::InvalidArgument(value); + } else { + *ptr = value + " " + name; + return Status::OK(); + } + }); + 
ConfigOptions config_options; + std::string base; + ASSERT_OK(opt_info.Parse(config_options, "World", "Hello", &base)); + ASSERT_EQ(base, "Hello World"); + ASSERT_NOK(opt_info.Parse(config_options, "Oops", "Hello", &base)); +} + +TEST_F(OptionTypeInfoTest, TestSerializeFunc) { + OptionTypeInfo opt_info( + 0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, nullptr, + [](const ConfigOptions& /*opts*/, const std::string& name, + const void* /*addr*/, std::string* value) { + if (name == "Oops") { + return Status::InvalidArgument(name); + } else { + *value = name; + return Status::OK(); + } + }, + nullptr); + ConfigOptions config_options; + std::string base; + std::string value; + ASSERT_OK(opt_info.Serialize(config_options, "Hello", &base, &value)); + ASSERT_EQ(value, "Hello"); + ASSERT_NOK(opt_info.Serialize(config_options, "Oops", &base, &value)); +} + +TEST_F(OptionTypeInfoTest, TestEqualsFunc) { + OptionTypeInfo opt_info( + 0, OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, nullptr, nullptr, + [](const ConfigOptions& /*opts*/, const std::string& name, + const void* addr1, const void* addr2, std::string* mismatch) { + auto i1 = *(static_cast(addr1)); + auto i2 = *(static_cast(addr2)); + if (name == "LT") { + return i1 < i2; + } else if (name == "GT") { + return i1 > i2; + } else if (name == "EQ") { + return i1 == i2; + } else { + *mismatch = name + "???"; + return false; + } + }); + + ConfigOptions config_options; + int int1 = 100; + int int2 = 200; + std::string mismatch; + ASSERT_TRUE(opt_info.AreEqual(config_options, "LT", &int1, &int2, &mismatch)); + ASSERT_EQ(mismatch, ""); + ASSERT_FALSE( + opt_info.AreEqual(config_options, "GT", &int1, &int2, &mismatch)); + ASSERT_EQ(mismatch, "GT"); + ASSERT_FALSE( + opt_info.AreEqual(config_options, "NO", &int1, &int2, &mismatch)); + ASSERT_EQ(mismatch, "NO???"); +} + +TEST_F(OptionTypeInfoTest, TestOptionFlags) { + OptionTypeInfo opt_none(0, OptionType::kString, 
+ OptionVerificationType::kNormal, + OptionTypeFlags::kDontSerialize); + OptionTypeInfo opt_never(0, OptionType::kString, + OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever); + OptionTypeInfo opt_alias(0, OptionType::kString, + OptionVerificationType::kAlias, + OptionTypeFlags::kNone); + OptionTypeInfo opt_deprecated(0, OptionType::kString, + OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone); + ConfigOptions config_options; + std::string opts_str; + std::string base = "base"; + std::string comp = "comp"; + + // If marked string none, the serialization returns not supported + ASSERT_NOK(opt_none.Serialize(config_options, "None", &base, &opts_str)); + // If marked never compare, they match even when they do not + ASSERT_TRUE(opt_never.AreEqual(config_options, "Never", &base, &comp, &base)); + ASSERT_FALSE(opt_none.AreEqual(config_options, "Never", &base, &comp, &base)); + + // An alias can change the value via parse, but does nothing on serialize on + // match + std::string result; + ASSERT_OK(opt_alias.Parse(config_options, "Alias", "Alias", &base)); + ASSERT_OK(opt_alias.Serialize(config_options, "Alias", &base, &result)); + ASSERT_TRUE( + opt_alias.AreEqual(config_options, "Alias", &base, &comp, &result)); + ASSERT_EQ(base, "Alias"); + ASSERT_NE(base, comp); + + // Deprecated options do nothing on any of the commands + ASSERT_OK(opt_deprecated.Parse(config_options, "Alias", "Deprecated", &base)); + ASSERT_OK(opt_deprecated.Serialize(config_options, "Alias", &base, &result)); + ASSERT_TRUE( + opt_deprecated.AreEqual(config_options, "Alias", &base, &comp, &result)); + ASSERT_EQ(base, "Alias"); + ASSERT_NE(base, comp); +} + +TEST_F(OptionTypeInfoTest, TestCustomEnum) { + enum TestEnum { kA, kB, kC }; + std::unordered_map enum_map = { + {"A", TestEnum::kA}, + {"B", TestEnum::kB}, + {"C", TestEnum::kC}, + }; + OptionTypeInfo opt_info = OptionTypeInfo::Enum(0, &enum_map); + TestEnum e1, e2; + ConfigOptions config_options; + std::string 
result, mismatch; + + e2 = TestEnum::kA; + + ASSERT_OK(opt_info.Parse(config_options, "", "B", &e1)); + ASSERT_OK(opt_info.Serialize(config_options, "", &e1, &result)); + ASSERT_EQ(e1, TestEnum::kB); + ASSERT_EQ(result, "B"); + + ASSERT_FALSE(opt_info.AreEqual(config_options, "Enum", &e1, &e2, &mismatch)); + ASSERT_EQ(mismatch, "Enum"); + + TestParseAndCompareOption(config_options, opt_info, "", "C", &e1, &e2); + ASSERT_EQ(e2, TestEnum::kC); + + ASSERT_NOK(opt_info.Parse(config_options, "", "D", &e1)); + ASSERT_EQ(e1, TestEnum::kC); +} + +TEST_F(OptionTypeInfoTest, TestBuiltinEnum) { + ConfigOptions config_options; + for (auto iter : OptionsHelper::compaction_style_string_map) { + CompactionStyle e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kCompactionStyle), + "CompactionStyle", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::compaction_pri_string_map) { + CompactionPri e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kCompactionPri), + "CompactionPri", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::compression_type_string_map) { + CompressionType e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kCompressionType), + "CompressionType", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::compaction_stop_style_string_map) { + CompactionStopStyle e1, e2; + TestParseAndCompareOption( + config_options, OptionTypeInfo(0, OptionType::kCompactionStopStyle), + "CompactionStopStyle", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::checksum_type_string_map) { + ChecksumType e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kChecksumType), + "CheckSumType", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : 
OptionsHelper::encoding_type_string_map) { + EncodingType e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kEncodingType), + "EncodingType", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } +} + +TEST_F(OptionTypeInfoTest, TestStruct) { + struct Basic { + int i = 42; + std::string s = "Hello"; + }; + + struct Extended { + int j = 11; + Basic b; + }; + + std::unordered_map basic_type_map = { + {"i", {offsetof(struct Basic, i), OptionType::kInt}}, + {"s", {offsetof(struct Basic, s), OptionType::kString}}, + }; + OptionTypeInfo basic_info = OptionTypeInfo::Struct( + "b", &basic_type_map, 0, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable); + + std::unordered_map extended_type_map = { + {"j", {offsetof(struct Extended, j), OptionType::kInt}}, + {"b", OptionTypeInfo::Struct( + "b", &basic_type_map, offsetof(struct Extended, b), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"m", OptionTypeInfo::Struct( + "m", &basic_type_map, offsetof(struct Extended, b), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}, + }; + OptionTypeInfo extended_info = OptionTypeInfo::Struct( + "e", &extended_type_map, 0, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable); + Extended e1, e2; + ConfigOptions config_options; + std::string mismatch; + TestParseAndCompareOption(config_options, basic_info, "b", "{i=33;s=33}", + &e1.b, &e2.b); + ASSERT_EQ(e1.b.i, 33); + ASSERT_EQ(e1.b.s, "33"); + + TestParseAndCompareOption(config_options, basic_info, "b.i", "44", &e1.b, + &e2.b); + ASSERT_EQ(e1.b.i, 44); + + TestParseAndCompareOption(config_options, basic_info, "i", "55", &e1.b, + &e2.b); + ASSERT_EQ(e1.b.i, 55); + + e1.b.i = 0; + + ASSERT_FALSE( + basic_info.AreEqual(config_options, "b", &e1.b, &e2.b, &mismatch)); + ASSERT_EQ(mismatch, "b.i"); + mismatch.clear(); + ASSERT_FALSE( + basic_info.AreEqual(config_options, "b.i", &e1.b, &e2.b, &mismatch)); + ASSERT_EQ(mismatch, "b.i"); + mismatch.clear(); + 
ASSERT_FALSE( + basic_info.AreEqual(config_options, "i", &e1.b, &e2.b, &mismatch)); + ASSERT_EQ(mismatch, "b.i"); + mismatch.clear(); + + e1 = e2; + ASSERT_NOK(basic_info.Parse(config_options, "b", "{i=33;s=33;j=44}", &e1.b)); + ASSERT_NOK(basic_info.Parse(config_options, "b.j", "44", &e1.b)); + ASSERT_NOK(basic_info.Parse(config_options, "j", "44", &e1.b)); + + TestParseAndCompareOption(config_options, extended_info, "e", + "b={i=55;s=55}; j=22;", &e1, &e2); + ASSERT_EQ(e1.b.i, 55); + ASSERT_EQ(e1.j, 22); + ASSERT_EQ(e1.b.s, "55"); + TestParseAndCompareOption(config_options, extended_info, "e.b", + "{i=66;s=66;}", &e1, &e2); + ASSERT_EQ(e1.b.i, 66); + ASSERT_EQ(e1.j, 22); + ASSERT_EQ(e1.b.s, "66"); + TestParseAndCompareOption(config_options, extended_info, "e.b.i", "77", &e1, + &e2); + ASSERT_EQ(e1.b.i, 77); + ASSERT_EQ(e1.j, 22); + ASSERT_EQ(e1.b.s, "66"); +} + +TEST_F(OptionTypeInfoTest, TestVectorType) { + OptionTypeInfo vec_info = OptionTypeInfo::Vector( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kString}); + std::vector vec1, vec2; + std::string mismatch; + + ConfigOptions config_options; + TestParseAndCompareOption(config_options, vec_info, "v", "a:b:c:d", &vec1, + &vec2); + ASSERT_EQ(vec1.size(), 4); + ASSERT_EQ(vec1[0], "a"); + ASSERT_EQ(vec1[1], "b"); + ASSERT_EQ(vec1[2], "c"); + ASSERT_EQ(vec1[3], "d"); + vec1[3] = "e"; + ASSERT_FALSE(vec_info.AreEqual(config_options, "v", &vec1, &vec2, &mismatch)); + ASSERT_EQ(mismatch, "v"); + + // Test vectors with inner brackets + TestParseAndCompareOption(config_options, vec_info, "v", "a:{b}:c:d", &vec1, + &vec2); + ASSERT_EQ(vec1.size(), 4); + ASSERT_EQ(vec1[0], "a"); + ASSERT_EQ(vec1[1], "b"); + ASSERT_EQ(vec1[2], "c"); + ASSERT_EQ(vec1[3], "d"); + + OptionTypeInfo bar_info = OptionTypeInfo::Vector( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kString}, '|'); + TestParseAndCompareOption(config_options, vec_info, "v", "x|y|z", &vec1, + &vec2); 
+ // Test vectors with inner vector + TestParseAndCompareOption(config_options, bar_info, "v", + "a|{b1|b2}|{c1|c2|{d1|d2}}", &vec1, &vec2, false); + ASSERT_EQ(vec1.size(), 3); + ASSERT_EQ(vec1[0], "a"); + ASSERT_EQ(vec1[1], "b1|b2"); + ASSERT_EQ(vec1[2], "c1|c2|{d1|d2}"); + + TestParseAndCompareOption(config_options, bar_info, "v", + "{a1|a2}|{b1|{c1|c2}}|d1", &vec1, &vec2, true); + ASSERT_EQ(vec1.size(), 3); + ASSERT_EQ(vec1[0], "a1|a2"); + ASSERT_EQ(vec1[1], "b1|{c1|c2}"); + ASSERT_EQ(vec1[2], "d1"); + + TestParseAndCompareOption(config_options, bar_info, "v", "{a1}", &vec1, &vec2, + false); + ASSERT_EQ(vec1.size(), 1); + ASSERT_EQ(vec1[0], "a1"); + + TestParseAndCompareOption(config_options, bar_info, "v", "{a1|a2}|{b1|b2}", + &vec1, &vec2, true); + ASSERT_EQ(vec1.size(), 2); + ASSERT_EQ(vec1[0], "a1|a2"); + ASSERT_EQ(vec1[1], "b1|b2"); +} + +TEST_F(OptionTypeInfoTest, TestStaticType) { + struct SimpleOptions { + size_t size = 0; + bool verify = true; + }; + + static std::unordered_map type_map = { + {"size", {offsetof(struct SimpleOptions, size), OptionType::kSizeT}}, + {"verify", + {offsetof(struct SimpleOptions, verify), OptionType::kBoolean}}, + }; + + ConfigOptions config_options; + SimpleOptions opts, copy; + opts.size = 12345; + opts.verify = false; + std::string str, mismatch; + + ASSERT_OK( + OptionTypeInfo::SerializeType(config_options, type_map, &opts, &str)); + ASSERT_FALSE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); + ASSERT_OK(OptionTypeInfo::ParseType(config_options, str, type_map, ©)); + ASSERT_TRUE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); +} + +class ConfigOptionsTest : public testing::Test {}; + +TEST_F(ConfigOptionsTest, EnvFromConfigOptions) { + ConfigOptions config_options; + DBOptions db_opts; + Options opts; + Env* mem_env = NewMemEnv(Env::Default()); + config_options.registry->AddLibrary("custom-env", RegisterCustomEnv, + kCustomEnvName); + + config_options.env 
= mem_env; + // First test that we can get the env as expected + ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(), kCustomEnvProp, + &db_opts)); + ASSERT_OK( + GetOptionsFromString(config_options, Options(), kCustomEnvProp, &opts)); + ASSERT_NE(config_options.env, db_opts.env); + ASSERT_EQ(opts.env, db_opts.env); + Env* custom_env = db_opts.env; + + // Now try a "bad" env" and check that nothing changed + config_options.ignore_unsupported_options = true; + ASSERT_OK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_OK(GetOptionsFromString(config_options, opts, "env=unknown", &opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + // Now try a "bad" env" ignoring unknown objects + config_options.ignore_unsupported_options = false; + ASSERT_NOK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + delete mem_env; +} +TEST_F(ConfigOptionsTest, MergeOperatorFromString) { + ConfigOptions config_options; + std::shared_ptr merge_op; + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "put", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("put")); + ASSERT_STREQ(merge_op->Name(), "PutOperator"); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "put_v1", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("PutOperator")); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "uint64add", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("uint64add")); + ASSERT_STREQ(merge_op->Name(), "UInt64AddOperator"); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "max", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("max")); + 
ASSERT_STREQ(merge_op->Name(), "MaxOperator"); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "bytesxor", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("bytesxor")); + ASSERT_STREQ(merge_op->Name(), BytesXOROperator::kClassName()); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "sortlist", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("sortlist")); + ASSERT_STREQ(merge_op->Name(), SortList::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "stringappend", + &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappend")); + ASSERT_STREQ(merge_op->Name(), StringAppendOperator::kClassName()); + auto delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, ","); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "stringappendtest", + &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappendtest")); + ASSERT_STREQ(merge_op->Name(), StringAppendTESTOperator::kClassName()); + delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, ","); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options, "id=stringappend; delimiter=||", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappend")); + ASSERT_STREQ(merge_op->Name(), StringAppendOperator::kClassName()); + delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, "||"); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options, "id=stringappendtest; delimiter=&&", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappendtest")); + ASSERT_STREQ(merge_op->Name(), StringAppendTESTOperator::kClassName()); + delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, 
nullptr); + ASSERT_EQ(*delimiter, "&&"); + + std::shared_ptr copy; + std::string mismatch; + std::string opts_str = merge_op->ToString(config_options); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, opts_str, ©)); + ASSERT_TRUE(merge_op->AreEquivalent(config_options, copy.get(), &mismatch)); + ASSERT_NE(copy, nullptr); + delimiter = copy->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, "&&"); +} + +INSTANTIATE_TEST_CASE_P(OptionsSanityCheckTest, OptionsSanityCheckTest, + ::testing::Bool()); #endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/plugin/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/plugin/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/plugin/README.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/plugin/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +## Building external plugins together with RocksDB + +RocksDB offers several plugin interfaces for developers to customize its behavior. One difficulty developers face is how to make their plugin available to end users. The approach discussed here involves building the external code together with the RocksDB code into a single binary. Note another approach we plan to support involves loading plugins dynamically from shared libraries. + +### Discovery + +We hope developers will mention their work in "PLUGINS.md" so users can easily discover and reuse solutions for customizing RocksDB. + +### Directory organization + +External plugins will be linked according to their name into a subdirectory of "plugin/". For example, a plugin called "dedupfs" would be linked into "plugin/dedupfs/". + +### Build standard + +Currently the only supported build system are make and cmake. + +For make, files in the plugin directory ending in the .mk extension can define the following variables. 
+ +* `$(PLUGIN_NAME)_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files. +* `$(PLUGIN_NAME)_HEADERS`: these files will be installed in the RocksDB header directory. Their paths will be prefixed by "rocksdb/plugin/$(PLUGIN_NAME)/". +* `$(PLUGIN_NAME)_LDFLAGS`: these flags will be passed to the final link step. For example, library dependencies can be propagated here, or symbols can be forcibly included, e.g., for static registration. +* `$(PLUGIN_NAME)_CXXFLAGS`: these flags will be passed to the compiler. For example, they can specify locations of header files in non-standard locations. + +Users will run the usual make commands from the RocksDB directory, specifying the plugins to include in a space-separated list in the variable `ROCKSDB_PLUGINS`. + +For CMake, the CMakeLists.txt file in the plugin directory can define the following variables. + +* `${PLUGIN_NAME}_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files. +* `${PLUGIN_NAME}_COMPILE_FLAGS`: these flags will be passed to the compiler. For example, they can specify locations of header files in non-standard locations. +* `${PLUGIN_NAME}_INCLUDE_PATHS`: paths to directories to search for plugin-specific header files during compilation. +* `${PLUGIN_NAME}_LIBS`: list of library names required to build the plugin, e.g. `dl`, `java`, `jvm`, `rados`, etc. CMake will generate proper flags for linking. +* `${PLUGIN_NAME}_LINK_PATHS`: list of paths for the linker to search for required libraries in additional to standard locations. +* `${PLUGIN_NAME}_CMAKE_SHARED_LINKER_FLAGS` additional linker flags used to generate shared libraries. For example, symbols can be forcibly included, e.g., for static registration. +* `${PLUGIN_NAME}_CMAKE_EXE_LINKER_FLAGS`: additional linker flags used to generate executables. For example, symbols can be forcibly included, e.g., for static registration. 
+ +Users will run the usual cmake commands, specifying the plugins to include in a space-separated list in the command line variable `ROCKSDB_PLUGINS` when invoking cmake. +``` +cmake .. -DROCKSDB_PLUGINS="dedupfs hdfs rados" +``` + +### Example + +For a working example, see [Dedupfs](https://github.com/ajkr/dedupfs). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/jemalloc_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/jemalloc_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/jemalloc_helper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/jemalloc_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,7 @@ #pragma once -#if defined(__clang__) +#if defined(__clang__) && defined(__GLIBC__) // glibc's `posix_memalign()` declaration specifies `throw()` while clang's // declaration does not. There is a hack in clang to make its re-declaration // compatible with glibc's if they are declared consecutively. That hack breaks @@ -38,25 +38,54 @@ #else +// definitions for compatibility with older versions of jemalloc +#if !defined(JEMALLOC_ALLOCATOR) +#define JEMALLOC_ALLOCATOR +#endif +#if !defined(JEMALLOC_RESTRICT_RETURN) +#define JEMALLOC_RESTRICT_RETURN +#endif +#if !defined(JEMALLOC_NOTHROW) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#endif +#if !defined(JEMALLOC_ALLOC_SIZE) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#endif +#endif + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. 
-extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); -extern "C" void* rallocx(void*, size_t, int) __attribute__((__weak__)); -extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__weak__)); -extern "C" size_t sallocx(const void*, int) __attribute__((__weak__)); -extern "C" void dallocx(void*, int) __attribute__((__weak__)); -extern "C" void sdallocx(void*, size_t, int) __attribute__((__weak__)); -extern "C" size_t nallocx(size_t, int) __attribute__((__weak__)); -extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) - __attribute__((__weak__)); -extern "C" int mallctlnametomib(const char*, size_t*, size_t*) - __attribute__((__weak__)); -extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, - size_t) __attribute__((__weak__)); -extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, - const char*) __attribute__((__weak__)); -extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) - JEMALLOC_CXX_THROW __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) + __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int) + JEMALLOC_ATTR(pure) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *, + size_t) __attribute__((__weak__)); +extern "C" int 
JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *, + size_t *) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *, + size_t *, void *, size_t) + __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW +malloc_stats_print(void (*)(void *, const char *), void *, const char *) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW +malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW + __attribute__((__weak__)); // Check if Jemalloc is linked with the binary. Note the main program might be // using a different memory allocator even this method return true. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/lang.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/lang.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/lang.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/lang.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifndef FALLTHROUGH_INTENDED +#if defined(__clang__) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#else +#define FALLTHROUGH_INTENDED do {} while (0) +#endif +#endif + +// ASAN (Address sanitizer) + +#if defined(__clang__) +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __has_feature(address_sanitizer) +#endif // defined(__has_feature) +#else // __clang__ +#ifdef __SANITIZE_ADDRESS__ +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __SANITIZE_ADDRESS__ +#endif // __clang__ + +#ifdef ROCKSDB_VALGRIND_RUN +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // ROCKSDB_VALGRIND_RUN + +// Coding guidelines say to avoid static objects with non-trivial destructors, +// because it's easy to cause trouble (UB) in static destruction. This +// macro makes it easier to define static objects that are normally never +// destructed, except are destructed when running under ASAN. This should +// avoid unexpected, unnecessary destruction behavior in production. 
+// Note that constructor arguments can be provided as in +// STATIC_AVOID_DESTRUCTION(Foo, foo)(arg1, arg2); +#ifdef MUST_FREE_HEAP_ALLOCATIONS +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type name +constexpr bool kMustFreeHeapAllocations = true; +#else +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type& name = *new Type +constexpr bool kMustFreeHeapAllocations = false; +#endif + +// TSAN (Thread sanitizer) + +// For simplicity, standardize on the GCC define +#if defined(__clang__) +#if defined(__has_feature) && __has_feature(thread_sanitizer) +#define __SANITIZE_THREAD__ 1 +#endif // __has_feature(thread_sanitizer) +#endif // __clang__ + +#ifdef __SANITIZE_THREAD__ +#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread"))) +#else +#define TSAN_SUPPRESSION +#endif // TSAN_SUPPRESSION diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_example.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_example.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_example.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_example.h 2025-05-19 16:14:27.000000000 +0000 @@ -70,7 +70,7 @@ // static void Initializer() { ... do something ...; } // ... // port::InitOnce(&init_control, &Initializer); -typedef intptr_t OnceType; +using OnceType = intptr_t; #define LEVELDB_ONCE_INIT 0 extern void InitOnce(port::OnceType*, void (*initializer)()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if !defined(OS_WIN) + #include "port/port_posix.h" #include @@ -21,8 +23,12 @@ #include #include #include + #include -#include "logging/logging.h" +#include +#include + +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -43,8 +49,8 @@ namespace port { static int PthreadCall(const char* label, int result) { - if (result != 0 && result != ETIMEDOUT) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + if (result != 0 && result != ETIMEDOUT && result != EBUSY) { + fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str()); abort(); } return result; @@ -86,6 +92,16 @@ PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } +bool Mutex::TryLock() { + bool ret = PthreadCall("trylock", pthread_mutex_trylock(&mu_)) == 0; +#ifndef NDEBUG + if (ret) { + locked_ = true; + } +#endif + return ret; +} + void Mutex::AssertHeld() { #ifndef NDEBUG assert(locked_); @@ -230,5 +246,50 @@ const size_t kPageSize = GetPageSize(); +void SetCpuPriority(ThreadId id, CpuPriority priority) { +#ifdef OS_LINUX + sched_param param; + param.sched_priority = 0; + switch (priority) { + case CpuPriority::kHigh: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, -20); + break; + case CpuPriority::kNormal: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, 0); + break; + case CpuPriority::kLow: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, 19); + break; + case CpuPriority::kIdle: + sched_setscheduler(id, SCHED_IDLE, ¶m); + break; + default: + assert(false); + } +#else + (void)id; + (void)priority; +#endif +} + +int64_t GetProcessID() { return getpid(); } + +bool GenerateRfcUuid(std::string* output) { + output->clear(); + std::ifstream f("/proc/sys/kernel/random/uuid"); + std::getline(f, /*&*/ *output); + if (output->size() == 36) { + return true; + } else { + output->clear(); + return false; + } +} + } // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include +#include "rocksdb/options.h" #include "rocksdb/rocksdb_namespace.h" // size_t printf formatting named in the manner of C99 standard formatting @@ -115,6 +116,9 @@ void Lock(); void Unlock(); + + bool TryLock(); + // this will assert if the mutex is not locked // it does NOT verify that mutex is held by a calling thread void AssertHeld(); @@ -123,7 +127,7 @@ friend class CondVar; pthread_mutex_t mu_; #ifndef NDEBUG - bool locked_; + bool locked_ = false; #endif }; @@ -166,7 +170,7 @@ #if defined(__i386__) || defined(__x86_64__) asm volatile("pause"); #elif defined(__aarch64__) - asm volatile("wfe"); + asm volatile("yield"); #elif defined(__powerpc64__) asm volatile("or 27,27,27"); #endif @@ -176,7 +180,7 @@ // Returns -1 if not available on this platform extern int PhysicalCoreID(); -typedef pthread_once_t OnceType; +using OnceType = pthread_once_t; #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT extern void InitOnce(OnceType* once, void (*initializer)()); @@ -189,7 +193,11 @@ #define ALIGN_AS(n) /*empty*/ #else #if defined(__s390__) +#if defined(__GNUC__) && __GNUC__ < 7 +#define CACHE_LINE_SIZE 64U +#else #define CACHE_LINE_SIZE 256U +#endif #elif defined(__powerpc__) || defined(__aarch64__) #define CACHE_LINE_SIZE 128U #else @@ -214,5 +222,15 @@ extern const size_t kPageSize; +using ThreadId = pid_t; + +extern void SetCpuPriority(ThreadId id, CpuPriority priority); + +int64_t GetProcessID(); + +// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns +// true on success or false on failure. 
+bool GenerateRfcUuid(std::string* output); + } // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,9 @@ // #include "port/stack_trace.h" -#if defined(ROCKSDB_LITE) || !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || \ - defined(CYGWIN) || defined(OS_FREEBSD) || defined(OS_SOLARIS) +#if defined(ROCKSDB_LITE) || \ + !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \ + defined(OS_SOLARIS) || defined(OS_WIN) // noop @@ -14,6 +15,10 @@ namespace port { void InstallStackTraceHandler() {} void PrintStack(int /*first_frames_to_skip*/) {} +void PrintAndFreeStack(void* /*callstack*/, int /*num_frames*/) {} +void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { + return nullptr; +} } // namespace port } // namespace ROCKSDB_NAMESPACE @@ -27,15 +32,22 @@ #include #include +#if defined(OS_FREEBSD) +#include +#endif + +#include "port/lang.h" + namespace ROCKSDB_NAMESPACE { namespace port { namespace { -#if defined(OS_LINUX) || defined(OS_FREEBSD) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) const char* GetExecutableName() { static char name[1024]; +#if !defined(OS_FREEBSD) char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); auto read = readlink(link, name, sizeof(name) - 1); @@ -45,6 +57,17 @@ name[read] = 0; return name; } +#else + int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; + size_t namesz = sizeof(name); + + auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0); + if (-1 == ret) { + return nullptr; + } else { + return name; + } +#endif } void PrintStackTraceLine(const char* symbol, void* frame) { @@ -99,18 +122,38 @@ } // 
namespace +void PrintStack(void* frames[], int num_frames) { + auto symbols = backtrace_symbols(frames, num_frames); + + for (int i = 0; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i); + PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]); + } + free(symbols); +} + void PrintStack(int first_frames_to_skip) { const int kMaxFrames = 100; void* frames[kMaxFrames]; auto num_frames = backtrace(frames, kMaxFrames); - auto symbols = backtrace_symbols(frames, num_frames); + PrintStack(&frames[first_frames_to_skip], num_frames - first_frames_to_skip); +} - for (int i = first_frames_to_skip; i < num_frames; ++i) { - fprintf(stderr, "#%-2d ", i - first_frames_to_skip); - PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]); - } - free(symbols); +void PrintAndFreeStack(void* callstack, int num_frames) { + PrintStack(static_cast(callstack), num_frames); + free(callstack); +} + +void* SaveStack(int* num_frames, int first_frames_to_skip) { + const int kMaxFrames = 100; + void* frames[kMaxFrames]; + + auto count = backtrace(frames, kMaxFrames); + *num_frames = count - first_frames_to_skip; + void* callstack = malloc(sizeof(void*) * *num_frames); + memcpy(callstack, &frames[first_frames_to_skip], sizeof(void*) * *num_frames); + return callstack; } static void StackTraceHandler(int sig) { @@ -119,6 +162,20 @@ fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); // skip the top three signal handler related frames PrintStack(3); + + // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of + // a signal" have failed, so just warn the user about them. +#ifdef __SANITIZE_THREAD__ + fprintf(stderr, + "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" + "==> ignorable, as they are expected when generating a stack\n" + "==> trace because of a signal under TSAN. 
Consider why the\n" + "==> signal was generated to begin with, and the stack trace\n" + "==> in the TSAN warning can be useful for that. (The stack\n" + "==> trace printed by the signal handler is likely obscured\n" + "==> by TSAN output.)\n"); +#endif + // re-signal to default handler (so we still get core dump if needed...) raise(sig); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.h 2025-05-19 16:14:27.000000000 +0000 @@ -18,5 +18,11 @@ // Prints stack, skips skip_first_frames frames void PrintStack(int first_frames_to_skip = 0); +// Prints the given callstack +void PrintAndFreeStack(void* callstack, int num_frames); + +// Save the current callstack +void* SaveStack(int* num_frame, int first_frames_to_skip = 0); + } // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/sys_time.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/sys_time.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/sys_time.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/sys_time.h 2025-05-19 16:14:27.000000000 +0000 @@ -23,10 +23,10 @@ namespace port { // Avoid including winsock2.h for this definition -typedef struct timeval { +struct timeval { long tv_sec; long tv_usec; -} timeval; +}; void gettimeofday(struct timeval* tv, struct timezone* tz); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_default.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_default.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_default.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_default.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,12 @@ // Use of this source code is governed by a 
BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#if defined(OS_WIN) + #include -#include #include "port/win/env_win.h" +#include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/compression_context_cache.h" #include "util/thread_local.h" @@ -24,18 +26,20 @@ // dead-lock. // in this manner any remaining threads are terminated OK. namespace { - std::once_flag winenv_once_flag; - Env* envptr; -}; -} +std::once_flag winenv_once_flag; +Env* envptr; +}; // namespace +} // namespace port Env* Env::Default() { - using namespace port; ThreadLocalPtr::InitSingletons(); CompressionContextCache::InitSingleton(); INIT_SYNC_POINT_SINGLETONS(); - std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); }); - return envptr; + std::call_once(port::winenv_once_flag, + []() { port::envptr = new port::WinEnv(); }); + return port::envptr; } } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,38 +7,40 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #include "port/win/env_win.h" -#include "port/win/win_thread.h" -#include -#include -#include +#include // _rmdir, _mkdir, _getcwd #include -#include // _getpid -#include // _access -#include // _rmdir, _mkdir, _getcwd -#include +#include // _access +#include // for uuid generation +#include #include +#include +#include +#include -#include "rocksdb/env.h" -#include "rocksdb/slice.h" - -#include "port/port.h" -#include "port/port_dirent.h" -#include "port/win/win_logger.h" -#include "port/win/io_win.h" +#include +#include +#include #include "monitoring/iostats_context_imp.h" - #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" - -#include // for uuid generation -#include -#include +#include "port/port.h" +#include "port/port_dirent.h" +#include "port/win/io_win.h" +#include "port/win/win_logger.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" #include "strsafe.h" +#include "util/string_util.h" -#include +// Undefine the functions windows might use (again)... 
+#undef GetCurrentTime +#undef DeleteFile +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { @@ -53,36 +55,26 @@ // RAII helpers for HANDLEs const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; -typedef std::unique_ptr UniqueCloseHandlePtr; +using UniqueCloseHandlePtr = std::unique_ptr; const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); }; -typedef std::unique_ptr UniqueFindClosePtr; +using UniqueFindClosePtr = std::unique_ptr; void WinthreadCall(const char* label, std::error_code result) { if (0 != result.value()) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); + fprintf(stderr, "Winthread %s: %s\n", label, + errnoStr(result.value()).c_str()); abort(); } } -} +} // namespace namespace port { - -WinEnvIO::WinEnvIO(Env* hosted_env) - : hosted_env_(hosted_env), - page_size_(4 * 1024), - allocation_granularity_(page_size_), - perf_counter_frequency_(0), +WinClock::WinClock() + : perf_counter_frequency_(0), nano_seconds_per_period_(0), GetSystemTimePreciseAsFileTime_(NULL) { - - SYSTEM_INFO sinfo; - GetSystemInfo(&sinfo); - - page_size_ = sinfo.dwPageSize; - allocation_granularity_ = sinfo.dwAllocationGranularity; - { LARGE_INTEGER qpf; BOOL ret __attribute__((__unused__)); @@ -97,39 +89,90 @@ HMODULE module = GetModuleHandle("kernel32.dll"); if (module != NULL) { - GetSystemTimePreciseAsFileTime_ = - (FnGetSystemTimePreciseAsFileTime)GetProcAddress( - module, "GetSystemTimePreciseAsFileTime"); + GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)( + void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime"); } } -WinEnvIO::~WinEnvIO() { +void WinClock::SleepForMicroseconds(int micros) { + std::this_thread::sleep_for(std::chrono::microseconds(micros)); } -Status WinEnvIO::DeleteFile(const std::string& fname) { - Status result; +std::string WinClock::TimeToString(uint64_t secondsSince1970) { + std::string result; - BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + const time_t seconds = secondsSince1970; + 
const int maxsize = 64; - if(!ret) { - auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to delete: " + fname, - lastError); + struct tm t; + errno_t ret = localtime_s(&t, &seconds); + + if (ret) { + result = std::to_string(seconds); + } else { + result.resize(maxsize); + char* p = &result[0]; + + int len = + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + assert(len > 0); + + result.resize(len); } return result; } -Status WinEnvIO::Truncate(const std::string& fname, size_t size) { - Status s; - int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); - if (result != 0) { - s = IOError("Failed to truncate: " + fname, errno); +uint64_t WinClock::NowMicros() { + if (GetSystemTimePreciseAsFileTime_ != NULL) { + // all std::chrono clocks on windows proved to return + // values that may repeat that is not good enough for some uses. + const int64_t c_UnixEpochStartTicks = 116444736000000000LL; + const int64_t c_FtToMicroSec = 10; + + // This interface needs to return system time and not + // just any microseconds because it is often used as an argument + // to TimedWait() on condition variable + FILETIME ftSystemTime; + GetSystemTimePreciseAsFileTime_(&ftSystemTime); + + LARGE_INTEGER li; + li.LowPart = ftSystemTime.dwLowDateTime; + li.HighPart = ftSystemTime.dwHighDateTime; + // Subtract unix epoch start + li.QuadPart -= c_UnixEpochStartTicks; + // Convert to microsecs + li.QuadPart /= c_FtToMicroSec; + return li.QuadPart; } - return s; + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); } -Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { +uint64_t WinClock::NowNanos() { + if (nano_seconds_per_period_ != 0) { + // all std::chrono clocks on windows have the same resolution that is only + // good enough for microseconds but not nanoseconds + // On Windows 8 and Windows 2012 Server + // 
GetSystemTimePreciseAsFileTime(¤t_time) can be used + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + // Convert performance counter to nanoseconds by precomputed ratio. + // Directly multiply nano::den with li.QuadPart causes overflow. + // Only do this when nano::den is divisible by perf_counter_frequency_, + // which most likely is the case in reality. If it's not, fall back to + // high_resolution_clock, which may be less precise under old compilers. + li.QuadPart *= nano_seconds_per_period_; + return li.QuadPart; + } + return std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); +} + +Status WinClock::GetCurrentTime(int64_t* unix_time) { time_t time = std::time(nullptr); if (time == (time_t)(-1)) { return Status::NotSupported("Failed to get time"); @@ -139,10 +182,55 @@ return Status::OK(); } -Status WinEnvIO::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status s; +WinFileSystem::WinFileSystem(const std::shared_ptr& clock) + : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) { + SYSTEM_INFO sinfo; + GetSystemInfo(&sinfo); + + page_size_ = sinfo.dwPageSize; + allocation_granularity_ = sinfo.dwAllocationGranularity; +} + +const std::shared_ptr& WinFileSystem::Default() { + static std::shared_ptr fs = + std::make_shared(WinClock::Default()); + return fs; +} + +WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {} + +WinEnvIO::~WinEnvIO() {} + +IOStatus WinFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError); + } + + return result; +} + +IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { 
+ IOStatus s; + int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); + if (result != 0) { + s = IOError("Failed to truncate: " + fname, errno); + } + return s; +} + +IOStatus WinFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -176,11 +264,11 @@ return s; } -Status WinEnvIO::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +IOStatus WinFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { result->reset(); - Status s; + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data @@ -197,10 +285,10 @@ HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = RX_CreateFile( - RX_FN(fname).c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, OPEN_EXISTING, fileFlags, NULL); + hFile = + RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -211,18 +299,18 @@ UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - // CAUTION! This will map the entire file into the process address space - if (options.use_mmap_reads && sizeof(void*) >= 8) { - // Use mmap when virtual address-space is plentiful. + // CAUTION! This will map the entire file into the process address space. + // Not recommended for 32-bit platforms. 
+ if (options.use_mmap_reads) { uint64_t fileSize; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, dbg); if (s.ok()) { // Will not map empty files if (fileSize == 0) { - return IOError( - "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); + return IOError("NewRandomAccessFile failed to map empty file: " + fname, + EINVAL); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, @@ -240,11 +328,11 @@ UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); const void* mapped_region = - MapViewOfFileEx(hMap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - static_cast(fileSize), - NULL); // Let the OS choose the mapping + MapViewOfFileEx(hMap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping if (!mapped_region) { auto lastError = GetLastError(); @@ -260,26 +348,21 @@ fileGuard.release(); } } else { - result->reset(new WinRandomAccessFile(fname, hFile, - std::max(GetSectorSize(fname), - page_size_), - options)); + result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options)); fileGuard.release(); } return s; } -Status WinEnvIO::OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen) { - +IOStatus WinFileSystem::OpenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, bool reopen) { const size_t c_BufferCapacity = 64 * 1024; EnvOptions local_options(options); result->reset(); - Status s; + IOStatus s; DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; @@ -316,11 +399,11 @@ RX_FN(fname).c_str(), desired_access, // Access desired shared_mode, - NULL, // Security attributes + NULL, // Security attributes // Posix env says (reopen) ? 
(O_CREATE | O_APPEND) : O_CREAT | O_TRUNC creation_disposition, - fileFlags, // Flags - NULL); // Template File + fileFlags, // Flags + NULL); // Template File } if (INVALID_HANDLE_VALUE == hFile) { @@ -350,25 +433,36 @@ } else { // Here we want the buffer allocation to be aligned by the SSD page size // and to be a multiple of it - result->reset(new WinWritableFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), + result->reset(new WinWritableFile(fname, hFile, GetPageSize(), c_BufferCapacity, local_options)); } return s; } -Status WinEnvIO::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { - - Status s; +IOStatus WinFileSystem::NewWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, false); +} + +IOStatus WinFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, true); +} + +IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data DWORD desired_access = GENERIC_READ | GENERIC_WRITE; DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist + DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; if (options.use_direct_reads && options.use_direct_writes) { @@ -380,36 +474,27 @@ HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = - RX_CreateFile(RX_FN(fname).c_str(), - desired_access, - shared_mode, - NULL, // Security attributes - 
creation_disposition, - file_flags, - NULL); + hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode, + NULL, // Security attributes + creation_disposition, file_flags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "NewRandomRWFile failed to Create/Open: " + fname, lastError); + "NewRandomRWFile failed to Create/Open: " + fname, lastError); } UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - result->reset(new WinRandomRWFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), - options)); + result->reset(new WinRandomRWFile(fname, hFile, GetPageSize(), options)); fileGuard.release(); return s; } -Status WinEnvIO::NewMemoryMappedFileBuffer( - const std::string & fname, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewMemoryMappedFileBuffer( + const std::string& fname, std::unique_ptr* result) { + IOStatus s; result->reset(); DWORD fileFlags = FILE_ATTRIBUTE_READONLY; @@ -419,11 +504,9 @@ IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile( RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, // Open only if it exists - fileFlags, - NULL); + fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -435,21 +518,21 @@ UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); uint64_t fileSize = 0; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, nullptr); if (!s.ok()) { return s; } // Will not map empty files if (fileSize == 0) { - return Status::NotSupported( + return IOStatus::NotSupported( "NewMemoryMappedFileBuffer can not map zero length files: " + fname); } // size_t is 32-bit with 32-bit builds if (fileSize > std::numeric_limits::max()) { - return Status::NotSupported( - "The specified file size does not fit into 32-bit memory addressing: " - + fname); + 
return IOStatus::NotSupported( + "The specified file size does not fit into 32-bit memory addressing: " + + fname); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, @@ -486,15 +569,16 @@ return s; } -Status WinEnvIO::NewDirectory(const std::string& name, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewDirectory(const std::string& name, + const IOOptions& /*options*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Must be nullptr on failure result->reset(); if (!DirExists(name)) { - s = IOErrorFromWindowsError( - "open folder: " + name, ERROR_DIRECTORY); + s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY); return s; } @@ -504,10 +588,9 @@ IOSTATS_TIMER_GUARD(open_nanos); handle = RX_CreateFile( RX_FN(name).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); } @@ -522,8 +605,10 @@ return s; } -Status WinEnvIO::FileExists(const std::string& fname) { - Status s; +IOStatus WinFileSystem::FileExists(const std::string& fname, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; // TODO: This does not follow symbolic links at this point // which is consistent with _access() impl on windows // but can be added @@ -532,70 +617,74 @@ GetFileExInfoStandard, &attrs)) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_ACCESS_DENIED: - case ERROR_NOT_FOUND: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - s = Status::NotFound(); - break; - default: - s = IOErrorFromWindowsError("Unexpected error for: " + fname, - lastError); - break; + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = IOStatus::NotFound(); + break; + default: + s = 
IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; } } return s; } -Status WinEnvIO::GetChildren(const std::string& dir, - std::vector* result) { - - Status status; +IOStatus WinFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*opts*/, + std::vector* result, + IODebugContext* /*dbg*/) { + IOStatus status; result->clear(); - std::vector output; RX_WIN32_FIND_DATA data; memset(&data, 0, sizeof(data)); std::string pattern(dir); pattern.append("\\").append("*"); - HANDLE handle = RX_FindFirstFileEx(RX_FN(pattern).c_str(), - // Do not want alternative name - FindExInfoBasic, - &data, - FindExSearchNameMatch, - NULL, // lpSearchFilter - 0); + HANDLE handle = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + // Do not want alternative name + FindExInfoBasic, &data, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); if (handle == INVALID_HANDLE_VALUE) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_NOT_FOUND: - case ERROR_ACCESS_DENIED: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - status = Status::NotFound(); - break; - default: - status = IOErrorFromWindowsError( - "Failed to GetChhildren for: " + dir, lastError); + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = IOStatus::NotFound(); + break; + default: + status = IOErrorFromWindowsError("Failed to GetChhildren for: " + dir, + lastError); } return status; } UniqueFindClosePtr fc(handle, FindCloseFunc); - if (result->capacity() > 0) { - output.reserve(result->capacity()); - } - // For safety data.cFileName[MAX_PATH - 1] = 0; while (true) { - auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); - output.emplace_back(FN_TO_RX(x)); - BOOL ret =- RX_FindNextFile(handle, &data); + // filter out '.' and '..' 
directory entries + // which appear only on some platforms + const bool ignore = + ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) && + (RX_FNCMP(data.cFileName, ".") == 0 || + RX_FNCMP(data.cFileName, "..") == 0); + if (!ignore) { + auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + result->push_back(FN_TO_RX(x)); + } + + BOOL ret = -RX_FindNextFile(handle, &data); // If the function fails the return value is zero // and non-zero otherwise. Not TRUE or FALSE. if (ret == FALSE) { @@ -604,24 +693,27 @@ } data.cFileName[MAX_PATH - 1] = 0; } - output.swap(*result); return status; } -Status WinEnvIO::CreateDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDir(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } return result; } -Status WinEnvIO::CreateDirIfMissing(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; if (DirExists(name)) { return result; @@ -631,30 +723,32 @@ if (!ret) { auto lastError = GetLastError(); if (lastError != ERROR_ALREADY_EXISTS) { - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } else { - result = - Status::IOError(name + ": exists but is not a directory"); + result = IOStatus::IOError(name + ": exists but is not a directory"); } } return result; } -Status WinEnvIO::DeleteDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::DeleteDir(const std::string& name, + const 
IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to remove dir: " + name, - lastError); + result = + IOErrorFromWindowsError("Failed to remove dir: " + name, lastError); } return result; } -Status WinEnvIO::GetFileSize(const std::string& fname, - uint64_t* size) { - Status s; +IOStatus WinFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* size, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, @@ -670,7 +764,7 @@ return s; } -uint64_t WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { +uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) { const uint64_t c_FileTimePerSecond = 10000000U; // UNIX epoch starts on 1970-01-01T00:00:00Z // Windows FILETIME starts on 1601-01-01T00:00:00Z @@ -684,31 +778,35 @@ li.LowPart = ftTime.dwLowDateTime; uint64_t result = - (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; + (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; return result; } -Status WinEnvIO::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - Status s; +IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*opts*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, - &attrs)) { + &attrs)) { *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); } else { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Can not get file modification time for: " + fname, lastError); + "Can not get file modification time for: " + fname, lastError); *file_mtime = 0; } return s; } -Status WinEnvIO::RenameFile(const std::string& src, - const 
std::string& target) { - Status result; +IOStatus WinFileSystem::RenameFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; // rename() is not capable of replacing the existing file as on Linux // so use OS API directly @@ -725,14 +823,16 @@ return result; } -Status WinEnvIO::LinkFile(const std::string& src, - const std::string& target) { - Status result; +IOStatus WinFileSystem::LinkFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; - if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { DWORD lastError = GetLastError(); if (lastError == ERROR_NOT_SAME_DEVICE) { - return Status::NotSupported("No cross FS links allowed"); + return IOStatus::NotSupported("No cross FS links allowed"); } std::string text("Failed to link: "); @@ -744,12 +844,14 @@ return result; } -Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { - Status s; - HANDLE handle = RX_CreateFile( - RX_FN(fname).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); +IOStatus WinFileSystem::NumFileLinks(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* count, + IODebugContext* /*dbg*/) { + IOStatus s; + HANDLE handle = + RX_CreateFile(RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); if (INVALID_HANDLE_VALUE == handle) { auto lastError = GetLastError(); @@ -770,26 +872,27 @@ return s; } -Status WinEnvIO::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { +IOStatus WinFileSystem::AreFilesSame(const std::string& first, + const std::string& second, + const IOOptions& /*opts*/, bool* res, + IODebugContext* /*dbg*/) { // For 
MinGW builds #if (_WIN32_WINNT == _WIN32_WINNT_VISTA) - Status s = Status::NotSupported(); + IOStatus s = IOStatus::NotSupported(); #else assert(res != nullptr); - Status s; + IOStatus s; if (res == nullptr) { - s = Status::InvalidArgument("res"); + s = IOStatus::InvalidArgument("res"); return s; } // 0 - for access means read metadata HANDLE file_1 = RX_CreateFile( RX_FN(first).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_1) { @@ -801,9 +904,9 @@ HANDLE file_2 = RX_CreateFile( RX_FN(second).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_2) { @@ -823,9 +926,9 @@ return s; } - FILE_ID_INFO FileInfo_2; - result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, - sizeof(FileInfo_2)); + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); if (!result) { auto lastError = GetLastError(); @@ -834,9 +937,9 @@ } if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { - *res = (0 == memcmp(FileInfo_1.FileId.Identifier, - FileInfo_2.FileId.Identifier, - sizeof(FileInfo_1.FileId.Identifier))); + *res = + (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); } else { *res = false; } @@ -844,12 +947,13 @@ return s; } -Status WinEnvIO::LockFile(const std::string& lockFname, - FileLock** lock) { +IOStatus WinFileSystem::LockFile(const std::string& lockFname, + const 
IOOptions& /*opts*/, FileLock** lock, + IODebugContext* /*dbg*/) { assert(lock != nullptr); *lock = NULL; - Status result; + IOStatus result; // No-sharing, this is a LOCK file const DWORD ExclusiveAccessON = 0; @@ -861,15 +965,14 @@ { IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile(RX_FN(lockFname).c_str(), - (GENERIC_READ | GENERIC_WRITE), - ExclusiveAccessON, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create lock file: " + lockFname, lastError); + result = IOErrorFromWindowsError("Failed to create lock file: " + lockFname, + lastError); } else { *lock = new WinFileLock(hFile); } @@ -877,8 +980,9 @@ return result; } -Status WinEnvIO::UnlockFile(FileLock* lock) { - Status result; +IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; assert(lock != nullptr); @@ -887,8 +991,9 @@ return result; } -Status WinEnvIO::GetTestDirectory(std::string* result) { - +IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts, + std::string* result, + IODebugContext* dbg) { std::string output; const char* env = getenv("TEST_TMPDIR"); @@ -903,21 +1008,23 @@ output = "c:\\tmp"; } } - CreateDir(output); + CreateDir(output, opts, dbg); output.append("\\testrocksdb-"); - output.append(std::to_string(_getpid())); + output.append(std::to_string(GetCurrentProcessId())); - CreateDir(output); + CreateDir(output, opts, dbg); output.swap(*result); - return Status::OK(); + return IOStatus::OK(); } -Status WinEnvIO::NewLogger(const std::string& fname, - std::shared_ptr* result) { - Status s; +IOStatus WinFileSystem::NewLogger(const std::string& fname, + const IOOptions& /*opts*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -950,64 
+1057,25 @@ // Set creation, last access and last write time to the same value SetFileTime(hFile, &ft, &ft, &ft); } - result->reset(new WinLogger(&WinEnvThreads::gettid, hosted_env_, hFile)); + result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile)); } return s; } -uint64_t WinEnvIO::NowMicros() { - - if (GetSystemTimePreciseAsFileTime_ != NULL) { - // all std::chrono clocks on windows proved to return - // values that may repeat that is not good enough for some uses. - const int64_t c_UnixEpochStartTicks = 116444736000000000LL; - const int64_t c_FtToMicroSec = 10; - - // This interface needs to return system time and not - // just any microseconds because it is often used as an argument - // to TimedWait() on condition variable - FILETIME ftSystemTime; - GetSystemTimePreciseAsFileTime_(&ftSystemTime); - - LARGE_INTEGER li; - li.LowPart = ftSystemTime.dwLowDateTime; - li.HighPart = ftSystemTime.dwHighDateTime; - // Subtract unix epoch start - li.QuadPart -= c_UnixEpochStartTicks; - // Convert to microsecs - li.QuadPart /= c_FtToMicroSec; - return li.QuadPart; +IOStatus WinFileSystem::IsDirectory(const std::string& path, + const IOOptions& /*opts*/, bool* is_dir, + IODebugContext* /*dbg*/) { + BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str()); + if (is_dir) { + *is_dir = ret ? true : false; } - using namespace std::chrono; - return duration_cast(system_clock::now().time_since_epoch()) - .count(); -} - -uint64_t WinEnvIO::NowNanos() { - if (nano_seconds_per_period_ != 0) { - // all std::chrono clocks on windows have the same resolution that is only - // good enough for microseconds but not nanoseconds - // On Windows 8 and Windows 2012 Server - // GetSystemTimePreciseAsFileTime(¤t_time) can be used - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - // Convert performance counter to nanoseconds by precomputed ratio. - // Directly multiply nano::den with li.QuadPart causes overflow. 
- // Only do this when nano::den is divisible by perf_counter_frequency_, - // which most likely is the case in reality. If it's not, fall back to - // high_resolution_clock, which may be less precise under old compilers. - li.QuadPart *= nano_seconds_per_period_; - return li.QuadPart; - } - using namespace std::chrono; - return duration_cast( - high_resolution_clock::now().time_since_epoch()).count(); + return IOStatus::OK(); } Status WinEnvIO::GetHostName(char* name, uint64_t len) { Status s; DWORD nSize = static_cast( - std::min(len, std::numeric_limits::max())); + std::min(len, std::numeric_limits::max())); if (!::GetComputerNameA(name, &nSize)) { auto lastError = GetLastError(); @@ -1019,15 +1087,17 @@ return s; } -Status WinEnvIO::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { +IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* dbg) { // Check if we already have an absolute path // For test compatibility we will consider starting slash as an // absolute path if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || - !RX_PathIsRelative(RX_FN(db_path).c_str())) { + !RX_PathIsRelative(RX_FN(db_path).c_str())) { *output_path = db_path; - return Status::OK(); + return IOStatus::OK(); } RX_FILESTRING result; @@ -1046,42 +1116,19 @@ std::string res = FN_TO_RX(result); res.swap(*output_path); - return Status::OK(); + return IOStatus::OK(); } -std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) { - std::string result; - - const time_t seconds = secondsSince1970; - const int maxsize = 64; - - struct tm t; - errno_t ret = localtime_s(&t, &seconds); - - if (ret) { - result = std::to_string(seconds); - } else { - result.resize(maxsize); - char* p = &result[0]; - - int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, - t.tm_min, t.tm_sec); - assert(len > 0); - - 
result.resize(len); - } - - return result; -} - -Status WinEnvIO::GetFreeSpace(const std::string& path, uint64_t* diskfree) { +IOStatus WinFileSystem::GetFreeSpace(const std::string& path, + const IOOptions& /*options*/, + uint64_t* diskfree, + IODebugContext* /*dbg*/) { assert(diskfree != nullptr); ULARGE_INTEGER freeBytes; BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL); if (f) { *diskfree = freeBytes.QuadPart; - return Status::OK(); + return IOStatus::OK(); } else { DWORD lastError = GetLastError(); return IOErrorFromWindowsError("Failed to get free space: " + path, @@ -1089,9 +1136,9 @@ } } -EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForLogWrite( + const FileOptions& file_options, const DBOptions& db_options) const { + FileOptions optimized(file_options); // These two the same as default optimizations optimized.bytes_per_sync = db_options.wal_bytes_per_sync; optimized.writable_file_max_buffer_size = @@ -1105,42 +1152,52 @@ return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestWrite( + const FileOptions& options) const { + FileOptions optimized(options); optimized.use_mmap_writes = false; optimized.use_direct_reads = false; return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestRead( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestRead( + const FileOptions& file_options) const { + FileOptions optimized(file_options); optimized.use_mmap_writes = false; optimized.use_direct_reads = false; return optimized; } // Returns true iff the named directory exists and is a directory. 
-bool WinEnvIO::DirExists(const std::string& dname) { +bool WinFileSystem::DirExists(const std::string& dname) { WIN32_FILE_ATTRIBUTE_DATA attrs; - if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), - GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard, + &attrs)) { return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); } return false; } -size_t WinEnvIO::GetSectorSize(const std::string& fname) { +size_t WinFileSystem::GetSectorSize(const std::string& fname) { size_t sector_size = kSectorSize; - if (RX_PathIsRelative(RX_FN(fname).c_str())) { - return sector_size; - } - // obtain device handle char devicename[7] = "\\\\.\\"; - int erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + int erresult = 0; + if (RX_PathIsRelative(RX_FN(fname).c_str())) { + RX_FILESTRING rx_current_dir; + rx_current_dir.resize(MAX_PATH); + DWORD len = RX_GetCurrentDirectory(MAX_PATH, &rx_current_dir[0]); + if (len == 0) { + return sector_size; + } + rx_current_dir.resize(len); + std::string current_dir = FN_TO_RX(rx_current_dir); + erresult = + strncat_s(devicename, sizeof(devicename), current_dir.c_str(), 2); + } else { + erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + } if (erresult) { assert(false); @@ -1161,21 +1218,21 @@ BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; DWORD output_bytes = 0; - BOOL ret = DeviceIoControl(hDevice, IOCTL_STORAGE_QUERY_PROPERTY, - &spropertyquery, sizeof(spropertyquery), - output_buffer, - sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), - &output_bytes, nullptr); + BOOL ret = DeviceIoControl( + hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery, + sizeof(spropertyquery), output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr); if (ret) { - sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR *)output_buffer)->BytesPerLogicalSector; + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer) 
+ ->BytesPerLogicalSector; } else { - // many devices do not support StorageProcessAlignmentProperty. Any failure here and we - // fall back to logical alignment + // many devices do not support StorageProcessAlignmentProperty. Any failure + // here and we fall back to logical alignment - DISK_GEOMETRY_EX geometry = { 0 }; - ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, - nullptr, 0, &geometry, sizeof(geometry), &output_bytes, nullptr); + DISK_GEOMETRY_EX geometry = {0}; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0, + &geometry, sizeof(geometry), &output_bytes, nullptr); if (ret) { sector_size = geometry.Geometry.BytesPerSector; } @@ -1193,17 +1250,15 @@ WinEnvThreads::WinEnvThreads(Env* hosted_env) : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { - for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( - static_cast(pool_id)); + static_cast(pool_id)); // This allows later initializing the thread-local-env of each thread. 
thread_pools_[pool_id].SetHostEnv(hosted_env); } } WinEnvThreads::~WinEnvThreads() { - WaitForJoin(); for (auto& thpool : thread_pools_) { @@ -1211,9 +1266,9 @@ } } -void WinEnvThreads::Schedule(void(*function)(void*), void* arg, +void WinEnvThreads::Schedule(void (*function)(void*), void* arg, Env::Priority pri, void* tag, - void(*unschedFunction)(void* arg)) { + void (*unschedFunction)(void* arg)) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); } @@ -1224,26 +1279,26 @@ namespace { - struct StartThreadState { - void(*user_function)(void*); - void* arg; - }; +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; - void* StartThreadWrapper(void* arg) { - std::unique_ptr state( +void* StartThreadWrapper(void* arg) { + std::unique_ptr state( reinterpret_cast(arg)); - state->user_function(state->arg); - return nullptr; - } - + state->user_function(state->arg); + return nullptr; } -void WinEnvThreads::StartThread(void(*function)(void* arg), void* arg) { +} // namespace + +void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) { std::unique_ptr state(new StartThreadState); state->user_function = function; state->arg = arg; try { - ROCKSDB_NAMESPACE::port::WindowsThread th(&StartThreadWrapper, state.get()); + Thread th(&StartThreadWrapper, state.get()); state.release(); std::lock_guard lg(mu_); @@ -1273,10 +1328,6 @@ uint64_t WinEnvThreads::GetThreadID() const { return gettid(); } -void WinEnvThreads::SleepForMicroseconds(int micros) { - std::this_thread::sleep_for(std::chrono::microseconds(micros)); -} - void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].SetBackgroundThreads(num); @@ -1295,12 +1346,14 @@ ///////////////////////////////////////////////////////////////////////// // WinEnv -WinEnv::WinEnv() : winenv_io_(this), 
winenv_threads_(this) { +WinEnv::WinEnv() + : CompositeEnv(WinFileSystem::Default(), WinClock::Default()), + winenv_io_(this), + winenv_threads_(this) { // Protected member of the base class thread_status_updater_ = CreateThreadStatusUpdater(); } - WinEnv::~WinEnv() { // All threads must be joined before the deletion of // thread_status_updater_. @@ -1312,151 +1365,12 @@ return thread_status_updater_->GetThreadList(thread_list); } -Status WinEnv::DeleteFile(const std::string& fname) { - return winenv_io_.DeleteFile(fname); -} - -Status WinEnv::Truncate(const std::string& fname, size_t size) { - return winenv_io_.Truncate(fname, size); -} - -Status WinEnv::GetCurrentTime(int64_t* unix_time) { - return winenv_io_.GetCurrentTime(unix_time); -} - -Status WinEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewSequentialFile(fname, result, options); -} - -Status WinEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewRandomAccessFile(fname, result, options); -} - -Status WinEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, false); -} - -Status WinEnv::ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, true); -} - -Status WinEnv::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { - return winenv_io_.NewRandomRWFile(fname, result, options); -} - -Status WinEnv::NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) { - return winenv_io_.NewMemoryMappedFileBuffer(fname, result); -} - -Status WinEnv::NewDirectory(const std::string& name, - std::unique_ptr* result) { - return winenv_io_.NewDirectory(name, result); -} - 
-Status WinEnv::FileExists(const std::string& fname) { - return winenv_io_.FileExists(fname); -} - -Status WinEnv::GetChildren(const std::string& dir, - std::vector* result) { - return winenv_io_.GetChildren(dir, result); -} - -Status WinEnv::CreateDir(const std::string& name) { - return winenv_io_.CreateDir(name); -} - -Status WinEnv::CreateDirIfMissing(const std::string& name) { - return winenv_io_.CreateDirIfMissing(name); -} - -Status WinEnv::DeleteDir(const std::string& name) { - return winenv_io_.DeleteDir(name); -} - -Status WinEnv::GetFileSize(const std::string& fname, - uint64_t* size) { - return winenv_io_.GetFileSize(fname, size); -} - -Status WinEnv::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - return winenv_io_.GetFileModificationTime(fname, file_mtime); -} - -Status WinEnv::RenameFile(const std::string& src, - const std::string& target) { - return winenv_io_.RenameFile(src, target); -} - -Status WinEnv::LinkFile(const std::string& src, - const std::string& target) { - return winenv_io_.LinkFile(src, target); -} - -Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { - return winenv_io_.NumFileLinks(fname, count); -} - -Status WinEnv::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { - return winenv_io_.AreFilesSame(first, second, res); -} - -Status WinEnv::LockFile(const std::string& lockFname, - FileLock** lock) { - return winenv_io_.LockFile(lockFname, lock); -} - -Status WinEnv::UnlockFile(FileLock* lock) { - return winenv_io_.UnlockFile(lock); -} - -Status WinEnv::GetTestDirectory(std::string* result) { - return winenv_io_.GetTestDirectory(result); -} - -Status WinEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - return winenv_io_.NewLogger(fname, result); -} - -uint64_t WinEnv::NowMicros() { - return winenv_io_.NowMicros(); -} - -uint64_t WinEnv::NowNanos() { - return winenv_io_.NowNanos(); -} - Status WinEnv::GetHostName(char* name, uint64_t 
len) { return winenv_io_.GetHostName(name, len); } -Status WinEnv::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { - return winenv_io_.GetAbsolutePath(db_path, output_path); -} - -std::string WinEnv::TimeToString(uint64_t secondsSince1970) { - return winenv_io_.TimeToString(secondsSince1970); -} - -void WinEnv::Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, - void(*unschedFunction)(void* arg)) { +void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction); } @@ -1464,32 +1378,20 @@ return winenv_threads_.UnSchedule(arg, pri); } -void WinEnv::StartThread(void(*function)(void* arg), void* arg) { +void WinEnv::StartThread(void (*function)(void* arg), void* arg) { return winenv_threads_.StartThread(function, arg); } -void WinEnv::WaitForJoin() { - return winenv_threads_.WaitForJoin(); -} +void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); } -unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { +unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { return winenv_threads_.GetThreadPoolQueueLen(pri); } -uint64_t WinEnv::GetThreadID() const { - return winenv_threads_.GetThreadID(); -} - -Status WinEnv::GetFreeSpace(const std::string& path, uint64_t* diskfree) { - return winenv_io_.GetFreeSpace(path, diskfree); -} - -void WinEnv::SleepForMicroseconds(int micros) { - return winenv_threads_.SleepForMicroseconds(micros); -} +uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); } // Allow increasing the number of worker threads. 
-void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { +void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { return winenv_threads_.SetBackgroundThreads(num, pri); } @@ -1497,44 +1399,21 @@ return winenv_threads_.GetBackgroundThreads(pri); } -void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { +void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); } -EnvOptions WinEnv::OptimizeForManifestRead( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestRead(env_options); -} - -EnvOptions WinEnv::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - return winenv_io_.OptimizeForLogWrite(env_options, db_options); -} - -EnvOptions WinEnv::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestWrite(env_options); -} - } // namespace port -std::string Env::GenerateUniqueId() { - std::string result; - - UUID uuid; - UuidCreateSequential(&uuid); - - RPC_CSTR rpc_str; - auto status = UuidToStringA(&uuid, &rpc_str); - (void)status; - assert(status == RPC_S_OK); - - result = reinterpret_cast(rpc_str); - - status = RpcStringFreeA(&rpc_str); - assert(status == RPC_S_OK); - - return result; +std::shared_ptr FileSystem::Default() { + return port::WinFileSystem::Default(); } +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr clock = + std::make_shared(); + return clock; +} } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,30 +15,30 @@ // multiple threads without any external synchronization. 
#pragma once - -#include "port/win/win_thread.h" -#include -#include "util/threadpool_imp.h" - #include #include #include -#include #include +#include +#include "env/composite_env_wrapper.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "util/threadpool_imp.h" #undef GetCurrentTime #undef DeleteFile -#undef GetTickCount +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { namespace port { // Currently not designed for inheritance but rather a replacement class WinEnvThreads { -public: - + public: explicit WinEnvThreads(Env* hosted_env); ~WinEnvThreads(); @@ -46,12 +46,12 @@ WinEnvThreads(const WinEnvThreads&) = delete; WinEnvThreads& operator=(const WinEnvThreads&) = delete; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)); + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)); int UnSchedule(void* arg, Env::Priority pri); - void StartThread(void(*function)(void* arg), void* arg); + void StartThread(void (*function)(void* arg), void* arg); void WaitForJoin(); @@ -61,287 +61,236 @@ uint64_t GetThreadID() const; - void SleepForMicroseconds(int micros); - // Allow increasing the number of worker threads. 
void SetBackgroundThreads(int num, Env::Priority pri); int GetBackgroundThreads(Env::Priority pri); void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri); -private: - + private: Env* hosted_env_; mutable std::mutex mu_; std::vector thread_pools_; - std::vector threads_to_join_; - + std::vector threads_to_join_; }; -// Designed for inheritance so can be re-used -// but certain parts replaced -class WinEnvIO { -public: - explicit WinEnvIO(Env* hosted_env); - - virtual ~WinEnvIO(); - - virtual Status DeleteFile(const std::string& fname); - - Status Truncate(const std::string& fname, size_t size); - - virtual Status GetCurrentTime(int64_t* unix_time); - - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // Helper for NewWritable and ReopenWritableFile - virtual Status OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen); - - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // The returned file will only be accessed by one thread at a time. 
- virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - virtual Status NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result); - - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result); - - virtual Status FileExists(const std::string& fname); - - virtual Status GetChildren(const std::string& dir, - std::vector* result); - - virtual Status CreateDir(const std::string& name); - - virtual Status CreateDirIfMissing(const std::string& name); - - virtual Status DeleteDir(const std::string& name); +class WinClock : public SystemClock { + public: + WinClock(); + virtual ~WinClock() {} + + static const char* kClassName() { return "WindowsClock"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } - virtual Status GetFileSize(const std::string& fname, uint64_t* size); - - static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); - - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime); - - virtual Status RenameFile(const std::string& src, const std::string& target); - - virtual Status LinkFile(const std::string& src, const std::string& target); - - virtual Status NumFileLinks(const std::string& /*fname*/, - uint64_t* /*count*/); - - virtual Status AreFilesSame(const std::string& first, - const std::string& second, bool* res); - - virtual Status LockFile(const std::string& lockFname, FileLock** lock); - - virtual Status UnlockFile(FileLock* lock); - - virtual Status GetTestDirectory(std::string* result); - - virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result); - - virtual uint64_t NowMicros(); - - virtual uint64_t NowNanos(); - - virtual Status GetHostName(char* name, uint64_t len); - - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path); - - // This seems to clash with a macro on 
Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - virtual Status GetFreeSpace(const std::string& path, uint64_t* diskfree); - - virtual std::string TimeToString(uint64_t secondsSince1970); - - virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const; - - virtual EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const; + uint64_t NowMicros() override; - virtual EnvOptions OptimizeForManifestRead( - const EnvOptions& env_options) const; + uint64_t NowNanos() override; - size_t GetPageSize() const { return page_size_; } + // 0 indicates not supported + uint64_t CPUMicros() override { return 0; } + void SleepForMicroseconds(int micros) override; - size_t GetAllocationGranularity() const { return allocation_granularity_; } + Status GetCurrentTime(int64_t* unix_time) override; + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time); uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } - static size_t GetSectorSize(const std::string& fname); + private: + using FnGetSystemTimePreciseAsFileTime = VOID(WINAPI*)(LPFILETIME); -private: - // Returns true iff the named directory exists and is a directory. 
- virtual bool DirExists(const std::string& dname); - - typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); - - Env* hosted_env_; - size_t page_size_; - size_t allocation_granularity_; uint64_t perf_counter_frequency_; uint64_t nano_seconds_per_period_; FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; -class WinEnv : public Env { -public: - WinEnv(); +class WinFileSystem : public FileSystem { + public: + static const std::shared_ptr& Default(); + WinFileSystem(const std::shared_ptr& clock); + ~WinFileSystem() {} + static const char* kClassName() { return "WinFS"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const { return kDefaultName(); } - ~WinEnv(); - - Status DeleteFile(const std::string& fname) override; - - Status Truncate(const std::string& fname, size_t size) override; - - Status GetCurrentTime(int64_t* unix_time) override; + static size_t GetSectorSize(const std::string& fname); + size_t GetPageSize() const { return page_size_; } + size_t GetAllocationGranularity() const { return allocation_granularity_; } - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. - // - // The returned file will only be accessed by one thread at a time. 
- Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // The returned file will only be accessed by one thread at a time. - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; - Status NewMemoryMappedFileBuffer( + // Truncate the named file to the specified size. + IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override; + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override; - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status FileExists(const std::string& fname) override; - - Status GetChildren(const std::string& dir, - std::vector* result) override; - - Status CreateDir(const std::string& name) override; - - Status CreateDirIfMissing(const std::string& name) override; - - Status DeleteDir(const std::string& name) override; - - Status GetFileSize(const std::string& fname, - uint64_t* size) override; - - Status GetFileModificationTime(const std::string& 
fname, - uint64_t* file_mtime) override; - - Status RenameFile(const std::string& src, - const std::string& target) override; + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& f, const IOOptions& io_opts, + IODebugContext* dbg) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + // Delete the specified directory. + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + // Store the size of fname in *file_size. + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + // Store the last modification time of fname in *file_mtime. + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + // Rename file src to target. + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + + // Hard Link file src to target. 
+ IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NumFileLinks(const std::string& /*fname*/, + const IOOptions& /*options*/, uint64_t* /*count*/, + IODebugContext* /*dbg*/) override; + IOStatus AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, + const IOOptions& /*options*/, bool* /*res*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can override to provide custom + // logger. + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. 
+ IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; + IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options, + bool* is_dir, IODebugContext* /*dgb*/) override; + // This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + IOStatus GetFreeSpace(const std::string& /*path*/, + const IOOptions& /*options*/, uint64_t* /*diskfree*/, + IODebugContext* /*dbg*/) override; + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override; + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override; + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override; - Status LinkFile(const std::string& src, - const std::string& target) override; + protected: + static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); + // Returns true iff the named directory exists and is a directory. 
- Status NumFileLinks(const std::string& fname, uint64_t* count) override; + virtual bool DirExists(const std::string& dname); + // Helper for NewWritable and ReopenWritableFile + virtual IOStatus OpenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + bool reopen); - Status AreFilesSame(const std::string& first, - const std::string& second, bool* res) override; + private: + std::shared_ptr clock_; + size_t page_size_; + size_t allocation_granularity_; +}; - Status LockFile(const std::string& lockFname, FileLock** lock) override; +// Designed for inheritance so can be re-used +// but certain parts replaced +class WinEnvIO { + public: + explicit WinEnvIO(Env* hosted_env); - Status UnlockFile(FileLock* lock) override; + virtual ~WinEnvIO(); - Status GetTestDirectory(std::string* result) override; + virtual Status GetHostName(char* name, uint64_t len); - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; + private: + Env* hosted_env_; +}; - uint64_t NowMicros() override; +class WinEnv : public CompositeEnv { + public: + WinEnv(); - uint64_t NowNanos() override; + ~WinEnv(); + static const char* kClassName() { return "WinEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } Status GetHostName(char* name, uint64_t len) override; - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override; - - std::string TimeToString(uint64_t secondsSince1970) override; - Status GetThreadList(std::vector* thread_list) override; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)) override; + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) override; int UnSchedule(void* arg, Env::Priority pri) override; - void StartThread(void(*function)(void* arg), void* arg) override; + 
void StartThread(void (*function)(void* arg), void* arg) override; - void WaitForJoin(); + void WaitForJoin() override; unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override; uint64_t GetThreadID() const override; - // This seems to clash with a macro on Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override; - - void SleepForMicroseconds(int micros) override; - // Allow increasing the number of worker threads. void SetBackgroundThreads(int num, Env::Priority pri) override; int GetBackgroundThreads(Env::Priority pri) override; void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; - EnvOptions OptimizeForManifestRead( - const EnvOptions& env_options) const override; - - EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const override; - - EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const override; - - -private: - + private: WinEnvIO winenv_io_; WinEnvThreads winenv_threads_; }; -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #include "port/win/io_win.h" +#include "env_win.h" #include "monitoring/iostats_context_imp.h" #include "test_util/sync_point.h" #include "util/aligned_buffer.h" @@ -18,36 +21,28 @@ namespace port { /* -* DirectIOHelper -*/ + * DirectIOHelper + */ namespace { const size_t kSectorSize = 512; -inline -bool IsPowerOfTwo(const size_t alignment) { +inline bool IsPowerOfTwo(const size_t alignment) { return ((alignment) & (alignment - 1)) == 0; } -inline -bool IsSectorAligned(const size_t off) { - return (off & (kSectorSize - 1)) == 0; -} - -inline -bool IsAligned(size_t alignment, const void* ptr) { +inline bool IsAligned(size_t alignment, const void* ptr) { return ((uintptr_t(ptr)) & (alignment - 1)) == 0; } -} - +} // namespace std::string GetWindowsErrSz(DWORD err) { LPSTR lpMsgBuf; FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, - 0, // Default language - reinterpret_cast(&lpMsgBuf), 0, NULL); + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast(&lpMsgBuf), 0, NULL); std::string Err = lpMsgBuf; LocalFree(lpMsgBuf); @@ -67,21 +62,20 @@ // Because all the reads/writes happen by the specified offset, the caller in // theory should not // rely on the current file offset. 
-Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written) { - - Status s; +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + IOStatus s; bytes_written = 0; size_t num_bytes = data.size(); if (num_bytes > std::numeric_limits::max()) { // May happen in 64-bit builds where size_t is 64-bits but // long is still 32-bit, but that's the API here at the moment - return Status::InvalidArgument("num_bytes is too large for a single write: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single write: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -90,11 +84,12 @@ DWORD bytesWritten = 0; - if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast(num_bytes), - &bytesWritten, &overlapped)) { + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), + static_cast(num_bytes), &bytesWritten, + &overlapped)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), - lastError); + lastError); } else { bytes_written = bytesWritten; } @@ -103,18 +98,17 @@ } // See comments for pwrite above -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read) { - - Status s; +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + IOStatus s; bytes_read = 0; if (num_bytes > std::numeric_limits::max()) { - return Status::InvalidArgument("num_bytes is too large for a single read: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single read: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -123,13 
+117,14 @@ DWORD bytesRead = 0; - if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast(num_bytes), - &bytesRead, &overlapped)) { + if (FALSE == ReadFile(file_data->GetFileHandle(), src, + static_cast(num_bytes), &bytesRead, + &overlapped)) { auto lastError = GetLastError(); // EOF is OK with zero bytes read if (lastError != ERROR_HANDLE_EOF) { s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), - lastError); + lastError); } } else { bytes_read = bytesRead; @@ -141,35 +136,34 @@ // SetFileInformationByHandle() is capable of fast pre-allocates. // However, this does not change the file end position unless the file is // truncated and the pre-allocated space is not considered filled with zeros. -Status fallocate(const std::string& filename, HANDLE hFile, - uint64_t to_size) { - Status status; +IOStatus fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + IOStatus status; FILE_ALLOCATION_INFO alloc_info; alloc_info.AllocationSize.QuadPart = to_size; if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, - sizeof(FILE_ALLOCATION_INFO))) { + sizeof(FILE_ALLOCATION_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError( - "Failed to pre-allocate space: " + filename, lastError); + "Failed to pre-allocate space: " + filename, lastError); } return status; } -Status ftruncate(const std::string& filename, HANDLE hFile, - uint64_t toSize) { - Status status; +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) { + IOStatus status; FILE_END_OF_FILE_INFO end_of_file; end_of_file.EndOfFile.QuadPart = toSize; if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, - sizeof(FILE_END_OF_FILE_INFO))) { + sizeof(FILE_END_OF_FILE_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, - lastError); + lastError); } return status; @@ -189,6 +183,17 @@ return 0; } 
+WinFileData::WinFileData(const std::string& filename, HANDLE hFile, + bool direct_io) + : filename_(filename), + hFile_(hFile), + use_direct_io_(direct_io), + sector_size_(WinFileSystem::GetSectorSize(filename)) {} + +bool WinFileData::IsSectorAligned(const size_t off) const { + return (off & (sector_size_ - 1)) == 0; +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // WinMmapReadableFile @@ -210,9 +215,11 @@ assert(ret); } -Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; +IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + IOStatus s; if (offset > length_) { *result = Slice(); @@ -220,13 +227,12 @@ } else if (offset + n > length_) { n = length_ - static_cast(offset); } - *result = - Slice(reinterpret_cast(mapped_region_)+offset, n); + *result = Slice(reinterpret_cast(mapped_region_) + offset, n); return s; } -Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { @@ -236,20 +242,19 @@ /////////////////////////////////////////////////////////////////////////////// /// WinMmapFile - // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O -Status WinMmapFile::TruncateFile(uint64_t toSize) { +IOStatus WinMmapFile::TruncateFile(uint64_t toSize) { return ftruncate(filename_, hFile_, toSize); } -Status WinMmapFile::UnmapCurrentRegion() { - Status status; +IOStatus WinMmapFile::UnmapCurrentRegion() { + IOStatus status; if (mapped_begin_ != nullptr) { if (!::UnmapViewOfFile(mapped_begin_)) { status = IOErrorFromWindowsError( - "Failed to unmap file view: " + 
filename_, GetLastError()); + "Failed to unmap file view: " + filename_, GetLastError()); } // Move on to the next portion of the file @@ -269,16 +274,16 @@ return status; } -Status WinMmapFile::MapNewRegion() { - - Status status; +IOStatus WinMmapFile::MapNewRegion(const IOOptions& options, + IODebugContext* dbg) { + IOStatus status; assert(mapped_begin_ == nullptr); size_t minDiskSize = static_cast(file_offset_) + view_size_; if (minDiskSize > reserved_size_) { - status = Allocate(file_offset_, view_size_); + status = Allocate(file_offset_, view_size_, options, dbg); if (!status.ok()) { return status; } @@ -286,7 +291,6 @@ // Need to remap if (hMap_ == NULL || reserved_size_ > mapping_size_) { - if (hMap_ != NULL) { // Unmap the previous one BOOL ret __attribute__((__unused__)); @@ -299,18 +303,18 @@ mappingSize.QuadPart = reserved_size_; hMap_ = CreateFileMappingA( - hFile_, - NULL, // Security attributes - PAGE_READWRITE, // There is not a write only mode for mapping - mappingSize.HighPart, // Enable mapping the whole file but the actual - // amount mapped is determined by MapViewOfFile - mappingSize.LowPart, - NULL); // Mapping name + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name if (NULL == hMap_) { return IOErrorFromWindowsError( - "WindowsMmapFile failed to create file mapping for: " + filename_, - GetLastError()); + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); } mapping_size_ = reserved_size_; @@ -321,13 +325,13 @@ // View must begin at the granularity aligned offset mapped_begin_ = reinterpret_cast( - MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, - view_size_, NULL)); + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); if 
(!mapped_begin_) { status = IOErrorFromWindowsError( - "WindowsMmapFile failed to map file view: " + filename_, - GetLastError()); + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); } else { mapped_end_ = mapped_begin_ + view_size_; dst_ = mapped_begin_; @@ -337,15 +341,15 @@ return status; } -Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { +IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { return fallocate(filename_, hFile_, spaceToReserve); } WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, size_t allocation_granularity, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, false), - WritableFile(options), + FSWritableFile(options), hMap_(NULL), page_size_(page_size), allocation_granularity_(allocation_granularity), @@ -371,17 +375,19 @@ // View size must be both the multiple of allocation_granularity AND the // page size and the granularity is usually a multiple of a page size. 
- const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + const size_t viewSize = + 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode view_size_ = Roundup(viewSize, allocation_granularity_); } WinMmapFile::~WinMmapFile() { if (hFile_) { - this->Close(); + this->Close(IOOptions(), nullptr); } } -Status WinMmapFile::Append(const Slice& data) { +IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { const char* src = data.data(); size_t left = data.size(); @@ -390,9 +396,9 @@ size_t avail = mapped_end_ - dst_; if (avail == 0) { - Status s = UnmapCurrentRegion(); + IOStatus s = UnmapCurrentRegion(); if (s.ok()) { - s = MapNewRegion(); + s = MapNewRegion(options, dbg); } if (!s.ok()) { @@ -414,30 +420,31 @@ memset(dst_, 0, bytesToPad); } - return Status::OK(); + return IOStatus::OK(); } // Means Close() will properly take care of truncate // and it does not need any additional information -Status WinMmapFile::Truncate(uint64_t size) { - return Status::OK(); +IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinMmapFile::Close() { - Status s; +IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) { + IOStatus s; assert(NULL != hFile_); // We truncate to the precise size so no // uninitialized data at the end. SetEndOfFile // which we use does not write zeros and it is good. 
- uint64_t targetSize = GetFileSize(); + uint64_t targetSize = GetFileSize(options, dbg); if (mapped_begin_ != nullptr) { // Sync before unmapping to make sure everything // is on disk and there is not a lazy writing // so we are deterministic with the tests - Sync(); + Sync(options, dbg); s = UnmapCurrentRegion(); } @@ -446,14 +453,13 @@ if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to Close mapping for file: " + filename_, lastError); + "Failed to Close mapping for file: " + filename_, lastError); } hMap_ = NULL; } if (hFile_ != NULL) { - TruncateFile(targetSize); BOOL ret = ::CloseHandle(hFile_); @@ -462,18 +468,22 @@ if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to close file map handle: " + filename_, lastError); + "Failed to close file map handle: " + filename_, lastError); } } return s; } -Status WinMmapFile::Flush() { return Status::OK(); } +IOStatus WinMmapFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} // Flush only data -Status WinMmapFile::Sync() { - Status s; +IOStatus WinMmapFile::Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; // Some writes occurred since last sync if (dst_ > last_sync_) { @@ -483,15 +493,15 @@ assert(dst_ < mapped_end_); size_t page_begin = - TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); size_t page_end = - TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); // Flush only the amount of that is a multiple of pages if (!::FlushViewOfFile(mapped_begin_ + page_begin, - (page_end - page_begin) + page_size_)) { + (page_end - page_begin) + page_size_)) { s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, - GetLastError()); + GetLastError()); } else { last_sync_ = dst_; } @@ -501,16 +511,16 
@@ } /** -* Flush data as well as metadata to stable storage. -*/ -Status WinMmapFile::Fsync() { - Status s = Sync(); + * Flush data as well as metadata to stable storage. + */ +IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + IOStatus s = Sync(options, dbg); // Flush metadata if (s.ok() && pending_sync_) { if (!::FlushFileBuffers(hFile_)) { s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, - GetLastError()); + GetLastError()); } pending_sync_ = false; } @@ -519,27 +529,31 @@ } /** -* Get the size of valid data in the file. This will not match the -* size that is returned from the filesystem because we use mmap -* to extend file by map_size every time. -*/ -uint64_t WinMmapFile::GetFileSize() { + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ +uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { size_t used = dst_ - mapped_begin_; return file_offset_ + used; } -Status WinMmapFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } -Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds); +IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus status; + TEST_KILL_RANDOM("WinMmapFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); + size_t spaceToReserve = + Roundup(static_cast(offset + len), view_size_); // Nothing to do if (spaceToReserve <= reserved_size_) { 
return status; @@ -561,31 +575,34 @@ // WinSequentialFile WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, f, options.use_direct_reads) {} WinSequentialFile::~WinSequentialFile() { assert(hFile_ != INVALID_HANDLE_VALUE); } -Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { - Status s; +IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + IOStatus s; size_t r = 0; assert(result != nullptr); if (WinFileData::use_direct_io()) { - return Status::NotSupported("Read() does not support direct_io"); + return IOStatus::NotSupported("Read() does not support direct_io"); } // Windows ReadFile API accepts a DWORD. // While it is possible to read in a loop if n is too big // it is an unlikely case. if (n > std::numeric_limits::max()) { - return Status::InvalidArgument("n is too big for a single ReadFile: " - + filename_); + return IOStatus::InvalidArgument("n is too big for a single ReadFile: " + + filename_); } - DWORD bytesToRead = static_cast(n); //cast is safe due to the check above + DWORD bytesToRead = + static_cast(n); // cast is safe due to the check above DWORD bytesRead = 0; BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); if (ret != FALSE) { @@ -593,8 +610,7 @@ } else { auto lastError = GetLastError(); if (lastError != ERROR_HANDLE_EOF) { - s = IOErrorFromWindowsError("ReadFile failed: " + filename_, - lastError); + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError); } } @@ -602,99 +618,86 @@ return s; } -Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const { +IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { return pread(this, src, numBytes, offset, bytes_read); } -Status 
WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) { - - Status s; - +IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { if (!WinFileData::use_direct_io()) { - return Status::NotSupported("This function is only used for direct_io"); + return IOStatus::NotSupported("This function is only used for direct_io"); } - if (!IsSectorAligned(static_cast(offset)) || - !IsSectorAligned(n)) { - return Status::InvalidArgument( - "WinSequentialFile::PositionedRead: offset is not properly aligned"); - } + assert(IsSectorAligned(static_cast(offset))); + assert(IsSectorAligned(static_cast(n))); - size_t bytes_read = 0; // out param - s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); + size_t bytes_read = 0; // out param + IOStatus s = PositionedReadInternal(scratch, static_cast(n), offset, + bytes_read); *result = Slice(scratch, bytes_read); return s; } - -Status WinSequentialFile::Skip(uint64_t n) { - // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit - // integer. As such it is a highly unlikley case to have n so large. +IOStatus WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed + // 64-bit integer. As such it is a highly unlikley case to have n so large. 
if (n > static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" + - filename_); + return IOStatus::InvalidArgument( + "n is too large for a single SetFilePointerEx() call" + filename_); } LARGE_INTEGER li; - li.QuadPart = static_cast(n); //cast is safe due to the check above + li.QuadPart = static_cast(n); // cast is safe due to the check + // above BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); if (ret == FALSE) { auto lastError = GetLastError(); - return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, - lastError); + return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, + lastError); } - return Status::OK(); + return IOStatus::OK(); } -Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } ////////////////////////////////////////////////////////////////////////////////////////////////// /// WinRandomAccessBase -inline -Status WinRandomAccessImpl::PositionedReadInternal(char* src, - size_t numBytes, - uint64_t offset, - size_t& bytes_read) const { +inline IOStatus WinRandomAccessImpl::PositionedReadInternal( + char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const { return pread(file_base_, src, numBytes, offset, bytes_read); } -inline -WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, - size_t alignment, - const EnvOptions& options) : - file_base_(file_base), - alignment_(alignment) { - +inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const FileOptions& options) + : file_base_(file_base), + alignment_(std::max(alignment, file_base->GetSectorSize())) { assert(!options.use_mmap_reads); } -inline -Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - - Status s; - +inline 
IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, + Slice* result, + char* scratch) const { // Check buffer alignment if (file_base_->use_direct_io()) { - if (!IsSectorAligned(static_cast(offset)) || - !IsAligned(alignment_, scratch)) { - return Status::InvalidArgument( - "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); - } + assert(file_base_->IsSectorAligned(static_cast(offset))); + assert(IsAligned(alignment_, scratch)); } if (n == 0) { *result = Slice(scratch, 0); - return s; + return IOStatus::OK(); } size_t bytes_read = 0; - s = PositionedReadInternal(scratch, n, offset, bytes_read); + IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read); *result = Slice(scratch, bytes_read); return s; } @@ -704,20 +707,21 @@ WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads), WinRandomAccessImpl(this, alignment, options) {} -WinRandomAccessFile::~WinRandomAccessFile() { -} +WinRandomAccessFile::~WinRandomAccessFile() {} -Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { @@ -732,27 +736,26 @@ // WinWritableImpl // -inline -Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { - return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve); +inline IOStatus 
WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), + spaceToReserve); } -inline -WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) - : file_data_(file_data), - alignment_(alignment), - next_write_offset_(0), - reservedsize_(0) { - +inline WinWritableImpl::WinWritableImpl(WinFileData* file_data, + size_t alignment) + : file_data_(file_data), + alignment_(std::max(alignment, file_data->GetSectorSize())), + next_write_offset_(0), + reservedsize_(0) { // Query current position in case ReopenWritableFile is called // This position is only important for buffered writes // for unbuffered writes we explicitely specify the position. LARGE_INTEGER zero_move; - zero_move.QuadPart = 0; // Do not move + zero_move.QuadPart = 0; // Do not move LARGE_INTEGER pos; pos.QuadPart = 0; BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, - FILE_CURRENT); + FILE_CURRENT); // Querying no supped to fail if (ret != 0) { next_write_offset_ = pos.QuadPart; @@ -761,74 +764,62 @@ } } -inline -Status WinWritableImpl::AppendImpl(const Slice& data) { - - Status s; +inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) { + IOStatus s; if (data.size() > std::numeric_limits::max()) { - return Status::InvalidArgument("data is too long for a single write" + - file_data_->GetName()); + return IOStatus::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); } - size_t bytes_written = 0; // out param + size_t bytes_written = 0; // out param if (file_data_->use_direct_io()) { // With no offset specified we are appending // to the end of the file - assert(IsSectorAligned(next_write_offset_)); - if (!IsSectorAligned(data.size()) || - !IsAligned(static_cast(GetAlignement()), data.data())) { - s = Status::InvalidArgument( - "WriteData must be page aligned, size must be sector aligned"); - } else { - s = pwrite(file_data_, data, 
next_write_offset_, bytes_written); - } + assert(file_data_->IsSectorAligned(next_write_offset_)); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast(GetAlignment()), data.data())); + s = pwrite(file_data_, data, next_write_offset_, bytes_written); } else { - DWORD bytesWritten = 0; if (!WriteFile(file_data_->GetFileHandle(), data.data(), - static_cast(data.size()), &bytesWritten, NULL)) { + static_cast(data.size()), &bytesWritten, NULL)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to WriteFile: " + file_data_->GetName(), - lastError); + "Failed to WriteFile: " + file_data_->GetName(), lastError); } else { bytes_written = bytesWritten; } } - if(s.ok()) { + if (s.ok()) { if (bytes_written == data.size()) { // This matters for direct_io cases where // we rely on the fact that next_write_offset_ // is sector aligned next_write_offset_ += bytes_written; } else { - s = Status::IOError("Failed to write all bytes: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all bytes: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) { - - if(file_data_->use_direct_io()) { - if (!IsSectorAligned(static_cast(offset)) || - !IsSectorAligned(data.size()) || - !IsAligned(static_cast(GetAlignement()), data.data())) { - return Status::InvalidArgument( - "Data and offset must be page aligned, size must be sector aligned"); - } +inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data, + uint64_t offset) { + if (file_data_->use_direct_io()) { + assert(file_data_->IsSectorAligned(static_cast(offset))); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast(GetAlignment()), data.data())); } size_t bytes_written = 0; - Status s = pwrite(file_data_, data, offset, bytes_written); + IOStatus s = pwrite(file_data_, data, offset, bytes_written); - if(s.ok()) { + if (s.ok()) { if 
(bytes_written == data.size()) { // For sequential write this would be simple // size extension by data.size() @@ -837,23 +828,21 @@ next_write_offset_ = write_end; } } else { - s = Status::IOError("Failed to write all of the requested data: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::TruncateImpl(uint64_t size) { - +inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) { // It is tempting to check for the size for sector alignment // but truncation may come at the end and there is not a requirement // for this to be sector aligned so long as we do not attempt to write // after that. The interface docs state that the behavior is undefined // in that case. - Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), - size); + IOStatus s = + ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size); if (s.ok()) { next_write_offset_ = size; @@ -861,50 +850,48 @@ return s; } -inline -Status WinWritableImpl::CloseImpl() { - - Status s; +inline IOStatus WinWritableImpl::CloseImpl() { + IOStatus s; auto hFile = file_data_->GetFileHandle(); assert(INVALID_HANDLE_VALUE != hFile); if (!::FlushFileBuffers(hFile)) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " + - file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Close() for: " + file_data_->GetName(), + lastError); } - if(!file_data_->CloseFile() && s.ok()) { + if (!file_data_->CloseFile() && s.ok()) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "CloseHandle failed for: " + file_data_->GetName(), lastError); } return s; } -inline -Status WinWritableImpl::SyncImpl() { - Status s; - if (!::FlushFileBuffers (file_data_->GetFileHandle())) { 
+inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!::FlushFileBuffers(file_data_->GetFileHandle())) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError); + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), + lastError); } return s; } - -inline -Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); +inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { + IOStatus status; + TEST_KILL_RANDOM("WinWritableFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); + size_t spaceToReserve = Roundup(static_cast(offset + len), + static_cast(alignment_)); // Nothing to do if (spaceToReserve <= reservedsize_) { return status; @@ -918,66 +905,78 @@ return status; } - //////////////////////////////////////////////////////////////////////////////// /// WinWritableFile WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, size_t /* capacity */, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_writes), WinWritableImpl(this, alignment), - WritableFile(options) { + FSWritableFile(options) { assert(!options.use_mmap_writes); } -WinWritableFile::~WinWritableFile() { -} +WinWritableFile::~WinWritableFile() {} // Indicates if the class makes use of direct I/O -bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinWritableFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinWritableFile::GetRequiredBufferAlignment() 
const { - return static_cast(GetAlignement()); + return static_cast(GetAlignment()); } -Status WinWritableFile::Append(const Slice& data) { +IOStatus WinWritableFile::Append(const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AppendImpl(data); } -Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { +IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode -Status WinWritableFile::Truncate(uint64_t size) { +IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return TruncateImpl(size); } -Status WinWritableFile::Close() { +IOStatus WinWritableFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return CloseImpl(); } - // write out the cached data to the OS cache - // This is now taken care of the WritableFileWriter -Status WinWritableFile::Flush() { - return Status::OK(); +// write out the cached data to the OS cache +// This is now taken care of the WritableFileWriter +IOStatus WinWritableFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinWritableFile::Sync() { - return SyncImpl(); +IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinWritableFile::Fsync() { return SyncImpl(); } +IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} bool WinWritableFile::IsSyncThreadSafe() const { return true; } -uint64_t WinWritableFile::GetFileSize() { +uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return GetFileNextWriteOffset(); } -Status WinWritableFile::Allocate(uint64_t offset, 
uint64_t len) { +IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AllocateImpl(offset, len); } @@ -989,36 +988,45 @@ /// WinRandomRWFile WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, - size_t alignment, const EnvOptions& options) + size_t alignment, const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads && options.use_direct_writes), WinRandomAccessImpl(this, alignment, options), WinWritableImpl(this, alignment) {} -bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinRandomRWFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinRandomRWFile::GetRequiredBufferAlignment() const { - return static_cast(GetAlignement()); + assert(WinRandomAccessImpl::GetAlignment() == + WinWritableImpl::GetAlignment()); + return static_cast(WinRandomAccessImpl::GetAlignment()); } -Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { +IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } -Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomRWFile::Flush() { - return Status::OK(); +IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinRandomRWFile::Sync() { - return SyncImpl(); +IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinRandomRWFile::Close() { +IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { 
return CloseImpl(); } @@ -1027,9 +1035,9 @@ WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { BOOL ret #if defined(_MSC_VER) - = FALSE; + = FALSE; #else - __attribute__((__unused__)); + __attribute__((__unused__)); #endif if (base_ != nullptr) { ret = ::UnmapViewOfFile(base_); @@ -1051,7 +1059,10 @@ ////////////////////////////////////////////////////////////////////////// /// WinDirectory -Status WinDirectory::Fsync() { return Status::OK(); } +IOStatus WinDirectory::Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { return GetUniqueIdFromFile(handle_, id, max_size); @@ -1065,5 +1076,7 @@ assert(ret); } -} +} // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,51 +9,53 @@ #pragma once #include +#include + #include #include +#include "rocksdb/file_system.h" #include "rocksdb/status.h" -#include "rocksdb/env.h" #include "util/aligned_buffer.h" - -#include +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace port { std::string GetWindowsErrSz(DWORD err); -inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { +inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) { return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) - ? Status::NoSpace(context, GetWindowsErrSz(err)) + ? IOStatus::NoSpace(context, GetWindowsErrSz(err)) : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) - ? Status::PathNotFound(context, GetWindowsErrSz(err)) - : Status::IOError(context, GetWindowsErrSz(err)); + ? 
IOStatus::PathNotFound(context, GetWindowsErrSz(err)) + : IOStatus::IOError(context, GetWindowsErrSz(err)); } -inline Status IOErrorFromLastWindowsError(const std::string& context) { +inline IOStatus IOErrorFromLastWindowsError(const std::string& context) { return IOErrorFromWindowsError(context, GetLastError()); } -inline Status IOError(const std::string& context, int err_number) { +inline IOStatus IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? IOStatus::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? IOStatus::PathNotFound(context, + errnoStr(err_number).c_str()) + : IOStatus::IOError(context, errnoStr(err_number).c_str()); } class WinFileData; -Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written); +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read); +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); -Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); +IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); -Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); @@ -65,12 +67,12 @@ // will need to be aligned (not sure there is a guarantee that the buffer // passed in is aligned). 
const bool use_direct_io_; + const size_t sector_size_; public: // We want this class be usable both for inheritance (prive // or protected) and for containment so __ctor and __dtor public - WinFileData(const std::string& filename, HANDLE hFile, bool direct_io) - : filename_(filename), hFile_(hFile), use_direct_io_(direct_io) {} + WinFileData(const std::string& filename, HANDLE hFile, bool direct_io); virtual ~WinFileData() { this->CloseFile(); } @@ -91,38 +93,46 @@ bool use_direct_io() const { return use_direct_io_; } + size_t GetSectorSize() const { return sector_size_; } + + bool IsSectorAligned(const size_t off) const; + WinFileData(const WinFileData&) = delete; WinFileData& operator=(const WinFileData&) = delete; }; -class WinSequentialFile : protected WinFileData, public SequentialFile { - +class WinSequentialFile : protected WinFileData, public FSSequentialFile { // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; -public: + public: WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options); + const FileOptions& options); ~WinSequentialFile(); WinSequentialFile(const WinSequentialFile&) = delete; WinSequentialFile& operator=(const WinSequentialFile&) = delete; - virtual Status Read(size_t n, Slice* result, char* scratch) override; - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override; + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; - virtual Status Skip(uint64_t n) override; + IOStatus Skip(uint64_t n) override; - virtual Status 
InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } }; // mmap() based random-access -class WinMmapReadableFile : private WinFileData, public RandomAccessFile { +class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { HANDLE hMap_; const void* mapped_region_; @@ -138,10 +148,11 @@ WinMmapReadableFile(const WinMmapReadableFile&) = delete; WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -150,7 +161,7 @@ // data to the file. This is safe since we either properly close the // file before reading from it, or for log files, the reading code // knows enough to skip zero suffixes. 
-class WinMmapFile : private WinFileData, public WritableFile { +class WinMmapFile : private WinFileData, public FSWritableFile { private: HANDLE hMap_; @@ -179,51 +190,59 @@ // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O - Status TruncateFile(uint64_t toSize); + IOStatus TruncateFile(uint64_t toSize); - Status UnmapCurrentRegion(); + IOStatus UnmapCurrentRegion(); - Status MapNewRegion(); + IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg); - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); public: WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, - size_t allocation_granularity, const EnvOptions& options); + size_t allocation_granularity, const FileOptions& options); ~WinMmapFile(); WinMmapFile(const WinMmapFile&) = delete; WinMmapFile& operator=(const WinMmapFile&) = delete; - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Means Close() will properly take care of truncate // and it does not need any additional information - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; // Flush only data - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; /** - * Flush data as well as metadata to stable storage. 
- */ - virtual Status Fsync() override; + * Flush data as well as metadata to stable storage. + */ + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; /** - * Get the size of valid data in the file. This will not match the - * size that is returned from the filesystem because we use mmap - * to extend file by map_size every time. - */ - virtual uint64_t GetFileSize() override; + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -231,24 +250,24 @@ class WinRandomAccessImpl { protected: WinFileData* file_base_; - size_t alignment_; + size_t alignment_; // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; WinRandomAccessImpl(WinFileData* file_base, size_t alignment, - const EnvOptions& options); + const FileOptions& options); virtual ~WinRandomAccessImpl() {} - Status ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const; + IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; size_t GetAlignment() const { return alignment_; } public: - WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; }; @@ 
-258,21 +277,24 @@ : private WinFileData, protected WinRandomAccessImpl, // Want to be able to override // PositionedReadInternal - public RandomAccessFile { + public FSRandomAccessFile { public: WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomAccessFile(); - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetRequiredBufferAlignment() const override; }; @@ -293,28 +315,29 @@ protected: WinFileData* file_data_; const uint64_t alignment_; - uint64_t next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t + next_write_offset_; // Needed because Windows does not support O_APPEND uint64_t reservedsize_; // how far we have reserved space - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); WinWritableImpl(WinFileData* file_data, size_t alignment); ~WinWritableImpl() {} - uint64_t GetAlignement() const { return alignment_; } + uint64_t GetAlignment() const { return alignment_; } - Status AppendImpl(const Slice& data); + IOStatus AppendImpl(const Slice& data); // Requires that the data is aligned as specified by // GetRequiredBufferAlignment() - Status PositionedAppendImpl(const Slice& data, uint64_t offset); + IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset); - Status 
TruncateImpl(uint64_t size); + IOStatus TruncateImpl(uint64_t size); - Status CloseImpl(); + IOStatus CloseImpl(); - Status SyncImpl(); + IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg); uint64_t GetFileNextWriteOffset() { // Double accounting now here with WritableFileWriter @@ -326,7 +349,7 @@ return next_write_offset_; } - Status AllocateImpl(uint64_t offset, uint64_t len); + IOStatus AllocateImpl(uint64_t offset, uint64_t len); public: WinWritableImpl(const WinWritableImpl&) = delete; @@ -335,32 +358,47 @@ class WinWritableFile : private WinFileData, protected WinWritableImpl, - public WritableFile { + public FSWritableFile { public: WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, - size_t capacity, const EnvOptions& options); + size_t capacity, const FileOptions& options); ~WinWritableFile(); - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Requires that the data is aligned as specified by // GetRequiredBufferAlignment() - virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + 
IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; // write out the cached data to the OS cache // This is now taken care of the WritableFileWriter - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override; + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; virtual bool IsSyncThreadSafe() const override; @@ -370,9 +408,10 @@ virtual size_t GetRequiredBufferAlignment() const override; - virtual uint64_t GetFileSize() override; + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -380,10 +419,10 @@ class WinRandomRWFile : private WinFileData, protected WinRandomAccessImpl, protected WinWritableImpl, - public RandomRWFile { + public FSRandomRWFile { public: WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomRWFile() {} @@ -397,45 +436,50 @@ // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - virtual Status Write(uint64_t offset, const Slice& data) override; + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. 
- virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() { return Sync(); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return Sync(options, dbg); + } - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { -private: - HANDLE file_handle_; - HANDLE map_handle_; -public: - WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, size_t size) : - MemoryMappedFileBuffer(base, size), - file_handle_(file_handle), - map_handle_(map_handle) {} + private: + HANDLE file_handle_; + HANDLE map_handle_; + + public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, + size_t size) + : MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} ~WinMemoryMappedBuffer() override; }; -class WinDirectory : public Directory { +class WinDirectory : public FSDirectory { HANDLE handle_; + public: explicit WinDirectory(HANDLE h) noexcept : handle_(h) { assert(handle_ != INVALID_HANDLE_VALUE); } - ~WinDirectory() { - ::CloseHandle(handle_); - } - virtual Status Fsync() override; + ~WinDirectory() { ::CloseHandle(handle_); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -452,5 +496,5 @@ private: HANDLE hFile_; }; -} +} // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,30 +7,29 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#if !defined(OS_WIN) && !defined(WIN32) && !defined(_WIN32) -#error Windows Specific Code -#endif +#if defined(OS_WIN) #include "port/win/port_win.h" +#include #include -#include "port/port_dirent.h" -#include "port/sys_time.h" - -#include +#include #include -#include #include -#include -#include #include +#include +#include +#include + +#include "port/port_dirent.h" +#include "port/sys_time.h" #ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES // utf8 <-> utf16 -#include -#include #include +#include +#include #endif #include "logging/logging.h" @@ -43,7 +42,7 @@ #ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES std::string utf16_to_utf8(const std::wstring& utf16) { - std::wstring_convert,wchar_t> convert; + std::wstring_convert, wchar_t> convert; return convert.to_bytes(utf16); } @@ -54,16 +53,17 @@ #endif void gettimeofday(struct timeval* tv, struct timezone* /* tz */) { - using namespace std::chrono; - - microseconds usNow( - duration_cast(system_clock::now().time_since_epoch())); + std::chrono::microseconds usNow( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch())); - seconds secNow(duration_cast(usNow)); + std::chrono::seconds secNow( + std::chrono::duration_cast(usNow)); tv->tv_sec = static_cast(secNow.count()); - tv->tv_usec = static_cast(usNow.count() - - duration_cast(secNow).count()); + tv->tv_usec = static_cast( + usNow.count() - + std::chrono::duration_cast(secNow).count()); } Mutex::~Mutex() {} @@ -86,20 +86,28 @@ } bool CondVar::TimedWait(uint64_t abs_time_us) { - - using namespace std::chrono; - // MSVC++ library 
implements wait_until in terms of wait_for so // we need to convert absolute wait into relative wait. - microseconds usAbsTime(abs_time_us); + std::chrono::microseconds usAbsTime(abs_time_us); - microseconds usNow( - duration_cast(system_clock::now().time_since_epoch())); - microseconds relTimeUs = - (usAbsTime > usNow) ? (usAbsTime - usNow) : microseconds::zero(); + std::chrono::microseconds usNow( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch())); + std::chrono::microseconds relTimeUs = (usAbsTime > usNow) + ? (usAbsTime - usNow) + : std::chrono::microseconds::zero(); // Caller must ensure that mutex is held prior to calling this method std::unique_lock lk(mu_->getLock(), std::adopt_lock); + + // Work around https://github.com/microsoft/STL/issues/369 +#if defined(_MSC_VER) && \ + (!defined(_MSVC_STL_UPDATE) || _MSVC_STL_UPDATE < 202008L) + if (relTimeUs == std::chrono::microseconds::zero()) { + lk.unlock(); + lk.lock(); + } +#endif #ifndef NDEBUG mu_->locked_ = false; #endif @@ -130,13 +138,12 @@ // Private structure, exposed only by pointer struct DIR { - HANDLE handle_; - bool firstread_; + HANDLE handle_; + bool firstread_; RX_WIN32_FIND_DATA data_; dirent entry_; - DIR() : handle_(INVALID_HANDLE_VALUE), - firstread_(true) {} + DIR() : handle_(INVALID_HANDLE_VALUE), firstread_(true) {} DIR(const DIR&) = delete; DIR& operator=(const DIR&) = delete; @@ -159,20 +166,19 @@ std::unique_ptr dir(new DIR); - dir->handle_ = RX_FindFirstFileEx(RX_FN(pattern).c_str(), - FindExInfoBasic, // Do not want alternative name - &dir->data_, - FindExSearchNameMatch, - NULL, // lpSearchFilter - 0); + dir->handle_ = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + FindExInfoBasic, // Do not want alternative name + &dir->data_, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); if (dir->handle_ == INVALID_HANDLE_VALUE) { return nullptr; } RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName)); - strcpy_s(dir->entry_.d_name, 
sizeof(dir->entry_.d_name), - FN_TO_RX(x).c_str()); + strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), FN_TO_RX(x).c_str()); return dir.release(); } @@ -195,7 +201,7 @@ } RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName)); - strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), + strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), FN_TO_RX(x).c_str()); return &dirp->entry_; @@ -215,7 +221,6 @@ } int Truncate(std::string path, int64_t len) { - if (len < 0) { errno = EINVAL; return -1; @@ -223,10 +228,10 @@ HANDLE hFile = RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, // Security attrs - OPEN_EXISTING, // Truncate existing file only - FILE_ATTRIBUTE_NORMAL, NULL); + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, // Security attrs + OPEN_EXISTING, // Truncate existing file only + FILE_ATTRIBUTE_NORMAL, NULL); if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); @@ -265,5 +270,34 @@ // Assume 4KB page size const size_t kPageSize = 4U * 1024U; +void SetCpuPriority(ThreadId id, CpuPriority priority) { + // Not supported + (void)id; + (void)priority; +} + +int64_t GetProcessID() { return GetCurrentProcessId(); } + +bool GenerateRfcUuid(std::string* output) { + UUID uuid; + UuidCreateSequential(&uuid); + + RPC_CSTR rpc_str; + auto status = UuidToStringA(&uuid, &rpc_str); + if (status != RPC_S_OK) { + return false; + } + + // rpc_str is nul-terminated + *output = reinterpret_cast(rpc_str); + + status = RpcStringFreeA(&rpc_str); + assert(status == RPC_S_OK); + + return true; +} + } // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.h 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -18,12 +18,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include @@ -45,7 +47,7 @@ #undef DeleteFile #ifndef _SSIZE_T_DEFINED -typedef SSIZE_T ssize_t; +using ssize_t = SSIZE_T; #endif // size_t printf formatting named in the manner of C99 standard formatting @@ -146,6 +148,16 @@ mutex_.unlock(); } + bool TryLock() { + bool ret = mutex_.try_lock(); +#ifndef NDEBUG + if (ret) { + locked_ = true; + } +#endif + return ret; + } + // this will assert if the mutex is not locked // it does NOT verify that mutex is held by a calling thread void AssertHeld() { @@ -217,9 +229,14 @@ Mutex* mu_; }; + +#ifdef _POSIX_THREADS +using Thread = std::thread; +#else // Wrapper around the platform efficient // or otherwise preferrable implementation using Thread = WindowsThread; +#endif // OnceInit type helps emulate // Posix semantics with initialization @@ -276,7 +293,7 @@ #endif static inline void AsmVolatilePause() { -#if defined(_M_IX86) || defined(_M_X64) +#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) YieldProcessor(); #endif // it would be nice to get "wfe" on ARM here @@ -285,7 +302,7 @@ extern int PhysicalCoreID(); // For Thread Local Storage abstraction -typedef DWORD pthread_key_t; +using pthread_key_t = DWORD; inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) { // Not used @@ -336,6 +353,16 @@ std::string utf16_to_utf8(const std::wstring& utf16); std::wstring utf8_to_utf16(const std::string& utf8); +using ThreadId = int; + +extern void SetCpuPriority(ThreadId id, CpuPriority priority); + +int64_t GetProcessID(); + +// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns +// true on success or false on failure. 
+bool GenerateRfcUuid(std::string* output); + } // namespace port @@ -344,6 +371,7 @@ #define RX_FILESTRING std::wstring #define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a) #define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a) +#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str()) #define RX_FNLEN(a) ::wcslen(a) #define RX_DeleteFile DeleteFileW @@ -361,12 +389,14 @@ #define RX_PathIsRelative PathIsRelativeW #define RX_GetCurrentDirectory GetCurrentDirectoryW #define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExW +#define RX_PathIsDirectory PathIsDirectoryW #else #define RX_FILESTRING std::string #define RX_FN(a) a #define FN_TO_RX(a) a +#define RX_FNCMP(a, b) strcmp(a, b) #define RX_FNLEN(a) strlen(a) #define RX_DeleteFile DeleteFileA @@ -376,7 +406,7 @@ #define RX_FindFirstFileEx FindFirstFileExA #define RX_CreateDirectory CreateDirectoryA #define RX_FindNextFile FindNextFileA -#define RX_WIN32_FIND_DATA WIN32_FIND_DATA +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA #define RX_CreateDirectory CreateDirectoryA #define RX_RemoveDirectory RemoveDirectoryA #define RX_GetFileAttributesEx GetFileAttributesExA @@ -385,6 +415,7 @@ #define RX_PathIsRelative PathIsRelativeA #define RX_GetCurrentDirectory GetCurrentDirectoryA #define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExA +#define RX_PathIsDirectory PathIsDirectoryA #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #ifndef ROCKSDB_JEMALLOC # error This file can only be part of jemalloc aware build #endif @@ -73,3 +75,5 @@ je_free(p); } } + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,32 +10,36 @@ // Logger implementation that can be shared by all environments // where enough posix functionality is available. +#if defined(OS_WIN) + #include "port/win/win_logger.h" -#include "port/win/io_win.h" -#include +#include #include #include -#include -#include -#include "rocksdb/env.h" +#include +#include #include "monitoring/iostats_context_imp.h" #include "port/sys_time.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { namespace port { -WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, +WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level) : Logger(log_level), file_(file), gettid_(gettid), log_size_(0), last_flush_micros_(0), - env_(env), + clock_(clock), flush_pending_(false) { assert(file_ != NULL); assert(file_ != INVALID_HANDLE_VALUE); @@ -47,13 +51,11 @@ BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL); if (ret == FALSE) { std::string errSz = GetWindowsErrSz(GetLastError()); - fprintf(stderr, errSz.c_str()); + fprintf(stderr, "%s", errSz.c_str()); } } -WinLogger::~WinLogger() { - CloseInternal(); -} +WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); } Status WinLogger::CloseImpl() { return CloseInternal(); @@ -65,15 +67,13 @@ BOOL ret = FlushFileBuffers(file_); if (ret == 0) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("Failed to 
flush LOG on Close() ", - lastError); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); } ret = CloseHandle(file_); // On error the return value is zero if (ret == 0 && s.ok()) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", - lastError); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); } file_ = INVALID_HANDLE_VALUE; closed_ = true; @@ -90,7 +90,7 @@ // for perf reasons. } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void WinLogger::Logv(const char* format, va_list ap) { @@ -163,7 +163,7 @@ &bytesWritten, NULL); if (ret == FALSE) { std::string errSz = GetWindowsErrSz(GetLastError()); - fprintf(stderr, errSz.c_str()); + fprintf(stderr, "%s", errSz.c_str()); } flush_pending_ = true; @@ -190,3 +190,5 @@ } } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,22 +12,21 @@ #pragma once +#include +#include + #include +#include #include "rocksdb/env.h" -#include -#include - namespace ROCKSDB_NAMESPACE { - -class Env; +class SystemClock; namespace port { - class WinLogger : public ROCKSDB_NAMESPACE::Logger { public: - WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, + WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL); virtual ~WinLogger(); @@ -54,7 +53,7 @@ uint64_t (*gettid_)(); // Return the thread id for the current thread std::atomic_size_t log_size_; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; + SystemClock* clock_; bool flush_pending_; Status CloseInternal(); diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#if defined(OS_WIN) +// Most Mingw builds support std::thread only when using posix threads. +// In that case, some of these functions will be unavailable. +// Note that we're using either WindowsThread or std::thread, depending on +// which one is available. +#ifndef _POSIX_THREADS + #include "port/win/win_thread.h" #include @@ -177,3 +184,6 @@ } } // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif // !_POSIX_THREADS +#endif // OS_WIN diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,8 @@ #pragma once +#ifndef _POSIX_THREADS + #include #include #include @@ -23,11 +25,10 @@ // -- is that it dynamically allocates its internals that are automatically // freed when the thread terminates and not on the destruction of the // object. This makes it difficult to control the source of memory -// allocation +// allocation // - This implements Pimpl so we can easily replace the guts of the // object in our private version if necessary. 
class WindowsThread { - struct Data; std::shared_ptr data_; @@ -35,15 +36,14 @@ void Init(std::function&&); -public: - - typedef void* native_handle_type; + public: + using native_handle_type = void*; // Construct with no thread WindowsThread(); // Template constructor - // + // // This templated constructor accomplishes several things // // - Allows the class as whole to be not a template @@ -66,17 +66,12 @@ // dependent type that both checks the signature conformance to ensure // that all of the necessary arguments are provided and allows pimpl // implementation. - template::type, - WindowsThread>::value>::type> - explicit WindowsThread(Fn&& fx, Args&&... ax) : - WindowsThread() { - + template ::type, WindowsThread>::value>::type> + explicit WindowsThread(Fn&& fx, Args&&... ax) : WindowsThread() { // Use binder to create a single callable entity - auto binder = std::bind(std::forward(fx), - std::forward(ax)...); + auto binder = std::bind(std::forward(fx), std::forward(ax)...); // Use std::function to take advantage of the type erasure // so we can still hide implementation within pimpl // This also makes sure that the binder signature is compliant @@ -85,7 +80,6 @@ Init(std::move(target)); } - ~WindowsThread(); WindowsThread(const WindowsThread&) = delete; @@ -120,3 +114,4 @@ } } // namespace std +#endif // !_POSIX_THREADS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #include "port/win/xpress_win.h" #include @@ -127,10 +129,9 @@ } char* Decompress(const char* input_data, size_t input_length, - int* decompress_size) { - + size_t* uncompressed_size) { assert(input_data != nullptr); - assert(decompress_size != nullptr); + assert(uncompressed_size != nullptr); if (input_length == 0) { return nullptr; @@ -183,14 +184,6 @@ assert(decompressedBufferSize > 0); - // On Windows we are limited to a 32-bit int for the - // output data size argument - // so we hopefully never get here - if (decompressedBufferSize > std::numeric_limits::max()) { - assert(false); - return nullptr; - } - // The callers are deallocating using delete[] // thus we must allocate with new[] std::unique_ptr outputBuffer(new char[decompressedBufferSize]); @@ -214,7 +207,7 @@ return nullptr; } - *decompress_size = static_cast(decompressedDataSize); + *uncompressed_size = decompressedDataSize; // Return the raw buffer to the caller supporting the tradition return outputBuffer.release(); @@ -224,3 +217,5 @@ } // namespace ROCKSDB_NAMESPACE #endif + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,8 +20,7 @@ bool Compress(const char* input, size_t length, std::string* output); char* Decompress(const char* input_data, size_t input_length, - int* decompress_size); - + size_t* uncompressed_size); } } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/src.mk mariadb-10.11.13/storage/rocksdb/rocksdb/src.mk --- mariadb-10.11.11/storage/rocksdb/rocksdb/src.mk 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/src.mk 2025-05-19 16:14:27.000000000 +0000 @@ -1,22 +1,39 @@ # These are the 
sources from which librocksdb.a is built: LIB_SOURCES = \ + cache/cache.cc \ + cache/cache_entry_roles.cc \ + cache/cache_key.cc \ + cache/cache_reservation_manager.cc \ cache/clock_cache.cc \ cache/lru_cache.cc \ cache/sharded_cache.cc \ db/arena_wrapped_db_iter.cc \ + db/blob/blob_fetcher.cc \ + db/blob/blob_file_addition.cc \ + db/blob/blob_file_builder.cc \ + db/blob/blob_file_cache.cc \ + db/blob/blob_file_garbage.cc \ + db/blob/blob_file_meta.cc \ + db/blob/blob_file_reader.cc \ + db/blob/blob_garbage_meter.cc \ + db/blob/blob_log_format.cc \ + db/blob/blob_log_sequential_reader.cc \ + db/blob/blob_log_writer.cc \ + db/blob/prefetch_buffer_collection.cc \ db/builder.cc \ db/c.cc \ db/column_family.cc \ - db/compacted_db_impl.cc \ - db/compaction/compaction.cc \ + db/compaction/compaction.cc \ db/compaction/compaction_iterator.cc \ db/compaction/compaction_job.cc \ db/compaction/compaction_picker.cc \ db/compaction/compaction_picker_fifo.cc \ db/compaction/compaction_picker_level.cc \ - db/compaction/compaction_picker_universal.cc \ + db/compaction/compaction_picker_universal.cc \ + db/compaction/sst_partitioner.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ + db/db_impl/compacted_db_impl.cc \ db/db_impl/db_impl.cc \ db/db_impl/db_impl_compaction_flush.cc \ db/db_impl/db_impl_debug.cc \ @@ -29,7 +46,7 @@ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ - db/error_handler.cc \ + db/error_handler.cc \ db/event_helpers.cc \ db/experimental.cc \ db/external_sst_file_ingestion_job.cc \ @@ -47,6 +64,8 @@ db/memtable_list.cc \ db/merge_helper.cc \ db/merge_operator.cc \ + db/output_validator.cc \ + db/periodic_work_scheduler.cc \ db/range_del_aggregator.cc \ db/range_tombstone_fragmenter.cc \ db/repair.cc \ @@ -57,25 +76,32 @@ db/trim_history_scheduler.cc \ db/version_builder.cc \ db/version_edit.cc \ + db/version_edit_handler.cc \ db/version_set.cc \ + db/wal_edit.cc \ db/wal_manager.cc \ db/write_batch.cc \ db/write_batch_base.cc \ db/write_controller.cc \ 
db/write_thread.cc \ + env/composite_env.cc \ env/env.cc \ env/env_chroot.cc \ env/env_encryption.cc \ env/env_hdfs.cc \ env/env_posix.cc \ env/file_system.cc \ - env/fs_posix.cc \ + env/fs_posix.cc \ + env/fs_remap.cc \ + env/file_system_tracer.cc \ env/io_posix.cc \ env/mock_env.cc \ + env/unique_id_gen.cc \ file/delete_scheduler.cc \ file/file_prefetch_buffer.cc \ file/file_util.cc \ file/filename.cc \ + file/line_file_reader.cc \ file/random_access_file_reader.cc \ file/read_write_util.cc \ file/readahead_raf.cc \ @@ -88,6 +114,8 @@ memory/arena.cc \ memory/concurrent_arena.cc \ memory/jemalloc_nodump_allocator.cc \ + memory/memkind_kmem_allocator.cc \ + memory/memory_allocator.cc \ memtable/alloc_tracker.cc \ memtable/hash_linklist_rep.cc \ memtable/hash_skiplist_rep.cc \ @@ -109,20 +137,30 @@ monitoring/thread_status_util.cc \ monitoring/thread_status_util_debug.cc \ options/cf_options.cc \ + options/configurable.cc \ + options/customizable.cc \ options/db_options.cc \ options/options.cc \ options/options_helper.cc \ options/options_parser.cc \ - options/options_sanity_check.cc \ port/port_posix.cc \ + port/win/env_default.cc \ + port/win/env_win.cc \ + port/win/io_win.cc \ + port/win/port_win.cc \ + port/win/win_logger.cc \ + port/win/win_thread.cc \ port/stack_trace.cc \ table/adaptive/adaptive_table_factory.cc \ + table/block_based/binary_search_index_reader.cc \ table/block_based/block.cc \ table/block_based/block_based_filter_block.cc \ table/block_based/block_based_table_builder.cc \ table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_iterator.cc \ table/block_based/block_based_table_reader.cc \ table/block_based/block_builder.cc \ + table/block_based/block_prefetcher.cc \ table/block_based/block_prefix_index.cc \ table/block_based/data_block_hash_index.cc \ table/block_based/data_block_footer.cc \ @@ -130,11 +168,16 @@ table/block_based/filter_policy.cc \ table/block_based/flush_block_policy.cc \ 
table/block_based/full_filter_block.cc \ + table/block_based/hash_index_reader.cc \ table/block_based/index_builder.cc \ + table/block_based/index_reader_common.cc \ table/block_based/parsed_full_filter_block.cc \ table/block_based/partitioned_filter_block.cc \ + table/block_based/partitioned_index_iterator.cc \ + table/block_based/partitioned_index_reader.cc \ + table/block_based/reader_common.cc \ table/block_based/uncompression_dict_reader.cc \ - table/block_fetcher.cc \ + table/block_fetcher.cc \ table/cuckoo/cuckoo_table_builder.cc \ table/cuckoo/cuckoo_table_factory.cc \ table/cuckoo/cuckoo_table_reader.cc \ @@ -150,16 +193,23 @@ table/plain/plain_table_index.cc \ table/plain/plain_table_key_coding.cc \ table/plain/plain_table_reader.cc \ + table/sst_file_dumper.cc \ table/sst_file_reader.cc \ table/sst_file_writer.cc \ + table/table_factory.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ + table/unique_id.cc \ test_util/sync_point.cc \ test_util/sync_point_impl.cc \ test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ + trace_replay/trace_record_handler.cc \ + trace_replay/trace_record_result.cc \ + trace_replay/trace_record.cc \ trace_replay/trace_replay.cc \ trace_replay/block_cache_tracer.cc \ + trace_replay/io_tracer.cc \ util/build_version.cc \ util/coding.cc \ util/compaction_job_stats_impl.cc \ @@ -167,13 +217,16 @@ util/compression_context_cache.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ + util/crc32c_arm64.cc \ util/dynamic_bloom.cc \ util/hash.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ + util/ribbon_config.cc \ + util/regex.cc \ util/slice.cc \ - util/file_checksum_helper.cc \ + util/file_checksum_helper.cc \ util/status.cc \ util/string_util.cc \ util/thread_local.cc \ @@ -185,23 +238,27 @@ utilities/blob_db/blob_db_impl.cc \ utilities/blob_db/blob_db_impl_filesnapshot.cc \ utilities/blob_db/blob_file.cc \ - utilities/blob_db/blob_log_format.cc \ - 
utilities/blob_db/blob_log_reader.cc \ - utilities/blob_db/blob_log_writer.cc \ + utilities/cache_dump_load.cc \ + utilities/cache_dump_load_impl.cc \ utilities/cassandra/cassandra_compaction_filter.cc \ utilities/cassandra/format.cc \ utilities/cassandra/merge_operator.cc \ utilities/checkpoint/checkpoint_impl.cc \ + utilities/compaction_filters.cc \ utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \ utilities/convenience/info_log_finder.cc \ utilities/debug.cc \ utilities/env_mirror.cc \ utilities/env_timed.cc \ + utilities/fault_injection_env.cc \ + utilities/fault_injection_fs.cc \ + utilities/fault_injection_secondary_cache.cc \ utilities/leveldb_options/leveldb_options.cc \ utilities/memory/memory_util.cc \ + utilities/merge_operators.cc \ utilities/merge_operators/max.cc \ utilities/merge_operators/put.cc \ - utilities/merge_operators/sortlist.cc \ + utilities/merge_operators/sortlist.cc \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ @@ -218,6 +275,10 @@ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ + utilities/trace/replayer_impl.cc \ + utilities/transactions/lock/lock_manager.cc \ + utilities/transactions/lock/point/point_lock_tracker.cc \ + utilities/transactions/lock/point/point_lock_manager.cc \ utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/pessimistic_transaction.cc \ @@ -225,21 +286,16 @@ utilities/transactions/snapshot_checker.cc \ utilities/transactions/transaction_base.cc \ utilities/transactions/transaction_db_mutex_impl.cc \ - utilities/transactions/transaction_lock_mgr.cc \ utilities/transactions/transaction_util.cc \ utilities/transactions/write_prepared_txn.cc \ utilities/transactions/write_prepared_txn_db.cc \ 
utilities/transactions/write_unprepared_txn.cc \ utilities/transactions/write_unprepared_txn_db.cc \ utilities/ttl/db_ttl_impl.cc \ + utilities/wal_filter.cc \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ -ifeq ($(ARMCRC_SOURCE),1) -LIB_SOURCES +=\ - util/crc32c_arm64.cc -endif - ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) LIB_SOURCES_ASM =\ util/crc32c_ppc_asm.S @@ -250,7 +306,24 @@ LIB_SOURCES_C = endif +RANGE_TREE_SOURCES =\ + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc \ + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \ + utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \ + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc + TOOL_LIB_SOURCES = \ + tools/io_tracer_parser_tool.cc \ tools/ldb_cmd.cc \ tools/ldb_tool.cc \ tools/sst_dump_tool.cc \ @@ -262,24 +335,32 @@ MOCK_LIB_SOURCES = \ table/mock_table.cc \ - test_util/fault_injection_test_env.cc BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ + tools/simulated_hybrid_file_system.cc \ + +CACHE_BENCH_LIB_SOURCES = \ + cache/cache_bench_tool.cc \ STRESS_LIB_SOURCES = \ 
db_stress_tool/batched_ops_stress.cc \ db_stress_tool/cf_consistency_stress.cc \ db_stress_tool/db_stress_common.cc \ db_stress_tool/db_stress_driver.cc \ - db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_gflags.cc \ + db_stress_tool/db_stress_listener.cc \ db_stress_tool/db_stress_shared_state.cc \ + db_stress_tool/db_stress_stat.cc \ + db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_tool.cc \ + db_stress_tool/expected_state.cc \ db_stress_tool/no_batched_ops_stress.cc \ + db_stress_tool/multi_ops_txns_stress.cc \ TEST_LIB_SOURCES = \ db/db_test_util.cc \ + test_util/mock_time_env.cc \ test_util/testharness.cc \ test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ @@ -291,21 +372,59 @@ third-party/folly/folly/synchronization/ParkingLot.cpp \ third-party/folly/folly/synchronization/WaitOptions.cpp \ -MAIN_SOURCES = \ +TOOLS_MAIN_SOURCES = \ + db_stress_tool/db_stress.cc \ + tools/blob_dump.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ + tools/db_repl_stress.cc \ + tools/db_sanity_test.cc \ + tools/ldb.cc \ + tools/io_tracer_parser.cc \ + tools/sst_dump.cc \ + tools/write_stress.cc \ + tools/dump/rocksdb_dump.cc \ + tools/dump/rocksdb_undump.cc \ + tools/trace_analyzer.cc \ + tools/io_tracer_parser_tool.cc \ + +BENCH_MAIN_SOURCES = \ cache/cache_bench.cc \ + db/range_del_aggregator_bench.cc \ + memtable/memtablerep_bench.cc \ + table/table_reader_bench.cc \ + tools/db_bench.cc \ + util/filter_bench.cc \ + utilities/persistent_cache/persistent_cache_bench.cc \ + #util/log_write_bench.cc \ + +TEST_MAIN_SOURCES = \ cache/cache_test.cc \ - db_stress_tool/db_stress.cc \ + cache/cache_reservation_manager_test.cc \ + cache/lru_cache_test.cc \ + db/blob/blob_counting_iterator_test.cc \ + db/blob/blob_file_addition_test.cc \ + db/blob/blob_file_builder_test.cc \ + db/blob/blob_file_cache_test.cc \ + db/blob/blob_file_garbage_test.cc \ + db/blob/blob_file_reader_test.cc \ + db/blob/blob_garbage_meter_test.cc \ 
+ db/blob/db_blob_basic_test.cc \ + db/blob/db_blob_compaction_test.cc \ + db/blob/db_blob_corruption_test.cc \ + db/blob/db_blob_index_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ + db/compaction/clipping_iterator_test.cc \ db/compaction/compaction_iterator_test.cc \ db/compaction/compaction_job_test.cc \ db/compaction/compaction_job_stats_test.cc \ db/compaction/compaction_picker_test.cc \ + db/compaction/compaction_service_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ db/db_basic_test.cc \ - db/db_blob_index_test.cc \ + db/db_with_timestamp_basic_test.cc \ db/db_block_cache_test.cc \ db/db_bloom_filter_test.cc \ db/db_compaction_filter_test.cc \ @@ -313,62 +432,58 @@ db/db_dynamic_level_test.cc \ db/db_encryption_test.cc \ db/db_flush_test.cc \ + db/import_column_family_test.cc \ db/db_inplace_update_test.cc \ db/db_io_failure_test.cc \ db/db_iter_test.cc \ db/db_iter_stress_test.cc \ db/db_iterator_test.cc \ + db/db_kv_checksum_test.cc \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ - db/db_merge_operand_test.cc \ + db/db_merge_operand_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_impl/db_secondary_test.cc \ + db/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ db/db_tailing_iter_test.cc \ db/db_test.cc \ db/db_test2.cc \ + db/db_logical_block_size_cache_test.cc \ db/db_universal_compaction_test.cc \ db/db_wal_test.cc \ + db/db_with_timestamp_compaction_test.cc \ + db/db_write_buffer_manager_test.cc \ db/db_write_test.cc \ db/dbformat_test.cc \ db/deletefile_test.cc \ - db/env_timed_test.cc \ - db/error_handler_test.cc \ + db/error_handler_fs_test.cc \ db/external_sst_file_basic_test.cc \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ db/file_indexer_test.cc \ - db/file_reader_writer_test.cc \ db/filename_test.cc \ db/flush_job_test.cc \ - 
db/hash_table_test.cc \ - db/hash_test.cc \ - db/heap_test.cc \ db/listener_test.cc \ db/log_test.cc \ - db/lru_cache_test.cc \ db/manual_compaction_test.cc \ db/memtable_list_test.cc \ db/merge_helper_test.cc \ db/merge_test.cc \ - db/obsolete_files_test.cc \ - db/options_settable_test.cc \ + db/obsolete_files_test.cc \ db/options_file_test.cc \ db/perf_context_test.cc \ - db/persistent_cache_test.cc \ + db/periodic_work_scheduler_test.cc \ db/plain_table_db_test.cc \ db/prefix_test.cc \ db/repair_test.cc \ db/range_del_aggregator_test.cc \ - db/range_del_aggregator_bench.cc \ db/range_tombstone_fragmenter_test.cc \ db/table_properties_collector_test.cc \ - db/util_merge_operators_test.cc \ db/version_builder_test.cc \ db/version_edit_test.cc \ db/version_set_test.cc \ @@ -378,21 +493,29 @@ db/write_controller_test.cc \ env/env_basic_test.cc \ env/env_test.cc \ + env/io_posix_test.cc \ env/mock_env_test.cc \ + file/delete_scheduler_test.cc \ + file/prefetch_test.cc \ + file/random_access_file_reader_test.cc \ logging/auto_roll_logger_test.cc \ logging/env_logger_test.cc \ logging/event_logger_test.cc \ memory/arena_test.cc \ + memory/memory_allocator_test.cc \ memtable/inlineskiplist_test.cc \ - memtable/memtablerep_bench.cc \ memtable/skiplist_test.cc \ memtable/write_buffer_manager_test.cc \ monitoring/histogram_test.cc \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ monitoring/stats_history_test.cc \ + options/configurable_test.cc \ + options/customizable_test.cc \ + options/options_settable_test.cc \ options/options_test.cc \ table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_based_table_reader_test.cc \ table/block_based/block_test.cc \ table/block_based/data_block_hash_index_test.cc \ table/block_based/full_filter_block_test.cc \ @@ -402,19 +525,17 @@ table/cuckoo/cuckoo_table_reader_test.cc \ table/merger_test.cc \ table/sst_file_reader_test.cc \ - table/table_reader_bench.cc \ table/table_test.cc \ - 
third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + table/block_fetcher_test.cc \ + test_util/testutil_test.cc \ tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc \ - tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ - tools/db_bench.cc \ - tools/db_bench_tool_test.cc \ - tools/db_sanity_test.cc \ + tools/io_tracer_parser_test.cc \ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ - tools/trace_analyzer_test.cc \ + tools/trace_analyzer_test.cc \ trace_replay/block_cache_tracer_test.cc \ + trace_replay/io_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ @@ -422,15 +543,20 @@ util/defer_test.cc \ util/dynamic_bloom_test.cc \ util/filelock_test.cc \ - util/log_write_bench.cc \ - util/rate_limiter_test.cc \ + util/file_reader_writer_test.cc \ + util/hash_test.cc \ + util/heap_test.cc \ util/random_test.cc \ + util/rate_limiter_test.cc \ util/repeatable_thread_test.cc \ + util/ribbon_test.cc \ util/slice_test.cc \ util/slice_transform_test.cc \ util/timer_queue_test.cc \ + util/timer_test.cc \ util/thread_list_test.cc \ util/thread_local_test.cc \ + util/work_queue_test.cc \ utilities/backupable/backupable_db_test.cc \ utilities/blob_db/blob_db_test.cc \ utilities/cassandra/cassandra_format_test.cc \ @@ -438,26 +564,39 @@ utilities/cassandra/cassandra_row_merge_test.cc \ utilities/cassandra/cassandra_serialize_test.cc \ utilities/checkpoint/checkpoint_test.cc \ + utilities/env_timed_test.cc \ utilities/memory/memory_test.cc \ utilities/merge_operators/string_append/stringappend_test.cc \ utilities/object_registry_test.cc \ utilities/option_change_migration/option_change_migration_test.cc \ utilities/options/options_util_test.cc \ + utilities/persistent_cache/hash_table_test.cc \ + utilities/persistent_cache/persistent_cache_test.cc \ utilities/simulator_cache/cache_simulator_test.cc \ utilities/simulator_cache/sim_cache_test.cc \ 
utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ + utilities/transactions/lock/range/range_locking_test.cc \ utilities/transactions/transaction_test.cc \ + utilities/transactions/lock/point/point_lock_manager_test.cc \ utilities/transactions/write_prepared_transaction_test.cc \ utilities/transactions/write_unprepared_transaction_test.cc \ utilities/ttl/ttl_test.cc \ + utilities/util_merge_operators_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \ +TEST_MAIN_SOURCES_C = \ + db/c_test.c \ + +MICROBENCH_SOURCES = \ + microbench/ribbon_bench.cc \ + JNI_NATIVE_SOURCES = \ java/rocksjni/backupenginejni.cc \ java/rocksjni/backupablejni.cc \ java/rocksjni/checkpoint.cc \ java/rocksjni/clock_cache.cc \ + java/rocksjni/cache.cc \ java/rocksjni/columnfamilyhandle.cc \ java/rocksjni/compact_range_options.cc \ java/rocksjni/compaction_filter.cc \ @@ -471,8 +610,12 @@ java/rocksjni/comparator.cc \ java/rocksjni/comparatorjnicallback.cc \ java/rocksjni/compression_options.cc \ + java/rocksjni/concurrent_task_limiter.cc \ + java/rocksjni/config_options.cc \ java/rocksjni/env.cc \ java/rocksjni/env_options.cc \ + java/rocksjni/event_listener.cc \ + java/rocksjni/event_listener_jnicallback.cc \ java/rocksjni/ingest_external_file_options.cc \ java/rocksjni/filter.cc \ java/rocksjni/iterator.cc \ @@ -502,6 +645,7 @@ java/rocksjni/sst_file_writerjni.cc \ java/rocksjni/sst_file_readerjni.cc \ java/rocksjni/sst_file_reader_iterator.cc \ + java/rocksjni/sst_partitioner.cc \ java/rocksjni/statistics.cc \ java/rocksjni/statisticsjni.cc \ java/rocksjni/table.cc \ @@ -518,6 +662,7 @@ java/rocksjni/transaction_notifier.cc \ java/rocksjni/transaction_notifier_jnicallback.cc \ java/rocksjni/ttl.cc \ + java/rocksjni/testable_event_listener.cc \ java/rocksjni/wal_filter.cc \ java/rocksjni/wal_filter_jnicallback.cc \ java/rocksjni/write_batch.cc \ diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -42,12 +42,13 @@ extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, - bool /*prefetch_index_and_filter_in_cache*/) const { + bool prefetch_index_and_filter_in_cache) const { Footer footer; - auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */, + IOOptions opts; + auto s = ReadFooterFromFile(opts, file.get(), nullptr /* prefetch_buffer */, file_size, &footer); if (!s.ok()) { return s; @@ -57,9 +58,10 @@ return plain_table_factory_->NewTableReader( table_reader_options, std::move(file), file_size, table); } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || - footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { + footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( - table_reader_options, std::move(file), file_size, table); + ro, table_reader_options, std::move(file), file_size, table, + prefetch_index_and_filter_in_cache); } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { return cuckoo_table_factory_->NewTableReader( table_reader_options, std::move(file), file_size, table); @@ -69,13 +71,12 @@ } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* 
file) const { - return table_factory_to_write_->NewTableBuilder(table_builder_options, - column_family_id, file); + return table_factory_to_write_->NewTableBuilder(table_builder_options, file); } -std::string AdaptiveTableFactory::GetPrintableTableOptions() const { +std::string AdaptiveTableFactory::GetPrintableOptions() const { std::string ret; ret.reserve(20000); const int kBufferSize = 200; @@ -85,13 +86,13 @@ snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n", (table_factory_to_write_->Name() ? table_factory_to_write_->Name() : ""), - table_factory_to_write_->GetPrintableTableOptions().c_str()); + table_factory_to_write_->GetPrintableOptions().c_str()); ret.append(buffer); } if (plain_table_factory_) { snprintf(buffer, kBufferSize, " %s options:\n%s\n", plain_table_factory_->Name() ? plain_table_factory_->Name() : "", - plain_table_factory_->GetPrintableTableOptions().c_str()); + plain_table_factory_->GetPrintableOptions().c_str()); ret.append(buffer); } if (block_based_table_factory_) { @@ -99,13 +100,13 @@ buffer, kBufferSize, " %s options:\n%s\n", (block_based_table_factory_->Name() ? block_based_table_factory_->Name() : ""), - block_based_table_factory_->GetPrintableTableOptions().c_str()); + block_based_table_factory_->GetPrintableOptions().c_str()); ret.append(buffer); } if (cuckoo_table_factory_) { snprintf(buffer, kBufferSize, " %s options:\n%s\n", cuckoo_table_factory_->Name() ? 
cuckoo_table_factory_->Name() : "", - cuckoo_table_factory_->GetPrintableTableOptions().c_str()); + cuckoo_table_factory_->GetPrintableOptions().c_str()); ret.append(buffer); } return ret; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -33,24 +33,18 @@ const char* Name() const override { return "AdaptiveTableFactory"; } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; - // Sanitizes the specified DB Options. 
- Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } - - std::string GetPrintableTableOptions() const override; + std::string GetPrintableOptions() const override; private: std::shared_ptr table_factory_to_write_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,73 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/binary_search_index_reader.h" + +namespace ROCKSDB_NAMESPACE { +Status BinarySearchIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase* BinarySearchIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,6 @@ #include #include -#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" @@ -127,22 +126,48 @@ } }; -void DataBlockIter::Next() { - assert(Valid()); - ParseNextDataKey(); +struct DecodeEntryV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + assert(value_length); + + *value_length = 0; + return DecodeKeyV4()(p, limit, shared, non_shared); + } +}; +void DataBlockIter::NextImpl() { + bool is_shared = false; + ParseNextDataKey(&is_shared); } -void DataBlockIter::NextOrReport() { - assert(Valid()); - ParseNextDataKey(); +void 
MetaBlockIter::NextImpl() { + bool is_shared = false; + ParseNextKey(&is_shared); } -void IndexBlockIter::Next() { +void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } + +void IndexBlockIter::PrevImpl() { assert(Valid()); - ParseNextIndexKey(); + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + SeekToRestartPoint(restart_index_); + // Loop until end of current entry hits the start of original entry + while (ParseNextIndexKey() && NextEntryOffset() < original) { + } } -void IndexBlockIter::Prev() { +void MetaBlockIter::PrevImpl() { assert(Valid()); // Scan backwards to a restart point before current_ const uint32_t original = current_; @@ -156,13 +181,15 @@ restart_index_--; } SeekToRestartPoint(restart_index_); + bool is_shared = false; // Loop until end of current entry hits the start of original entry - while (ParseNextIndexKey() && NextEntryOffset() < original) { + while (ParseNextKey(&is_shared) && + NextEntryOffset() < original) { } } -// Similar to IndexBlockIter::Prev but also caches the prev entries -void DataBlockIter::Prev() { +// Similar to IndexBlockIter::PrevImpl but also caches the prev entries +void DataBlockIter::PrevImpl() { assert(Valid()); assert(prev_entries_idx_ == -1 || @@ -176,19 +203,25 @@ prev_entries_[prev_entries_idx_]; const char* key_ptr = nullptr; + bool raw_key_cached; if (current_prev_entry.key_ptr != nullptr) { // The key is not delta encoded and stored in the data block key_ptr = current_prev_entry.key_ptr; - key_pinned_ = true; + raw_key_cached = false; } else { // The key is delta encoded and stored in prev_entries_keys_buff_ key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; - key_pinned_ = false; + raw_key_cached = true; } const Slice current_key(key_ptr, 
current_prev_entry.key_size); current_ = current_prev_entry.offset; - key_.SetKey(current_key, false /* copy */); + // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience, + // not necessity. It is convenient since this class treats keys as pinned + // when `raw_key_` points to an outside buffer. So we cannot allow + // `raw_key_` point into Prev cache as it is a transient outside buffer + // (i.e., keys in it are not actually pinned). + raw_key_.SetKey(current_key, raw_key_cached /* copy */); value_ = current_prev_entry.value; return; @@ -214,12 +247,13 @@ SeekToRestartPoint(restart_index_); do { - if (!ParseNextDataKey()) { + bool is_shared = false; + if (!ParseNextDataKey(&is_shared)) { break; } - Slice current_key = key(); + Slice current_key = raw_key_.GetKey(); - if (key_.IsKeyPinned()) { + if (raw_key_.IsKeyPinned()) { // The key is not delta encoded prev_entries_.emplace_back(current_, current_key.data(), 0, current_key.size(), value()); @@ -236,24 +270,36 @@ prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; } -void DataBlockIter::Seek(const Slice& target) { +void DataBlockIter::SeekImpl(const Slice& target) { Slice seek_key = target; PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } - SeekToRestartPoint(index); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} - // Linear search (within restart block) for first key >= target - while (ParseNextDataKey() && Compare(key_, seek_key) < 0) { +void MetaBlockIter::SeekImpl(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); 
+ + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } // Optimized Seek for point lookup for an internal key `target` @@ -273,8 +319,8 @@ // // If the return value is TRUE, iter location has two possibilies: // 1) If iter is valid, it is set to a location as if set by BinarySeek. In -// this case, it points to the first key_ with a larger user_key or a -// matching user_key with a seqno no greater than the seeking seqno. +// this case, it points to the first key with a larger user_key or a matching +// user_key with a seqno no greater than the seeking seqno. // 2) If the iter is invalid, it means that either all the user_key is less // than the seek_user_key, or the block ends with a matching user_key but // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno @@ -287,21 +333,21 @@ if (entry == kCollision) { // HashSeek not effective, falling back - Seek(target); + SeekImpl(target); return true; } if (entry == kNoEntry) { // Even if we cannot find the user_key in this block, the result may - // exist in the next block. Consider this exmpale: + // exist in the next block. Consider this example: // // Block N: [aab@100, ... , app@120] - // bounary key: axy@50 (we make minimal assumption about a boundary key) + // boundary key: axy@50 (we make minimal assumption about a boundary key) // Block N+1: [axy@10, ... ] // // If seek_key = axy@60, the search will starts from Block N. // Even if the user_key is not found in the hash map, the caller still - // have to conntinue searching the next block. + // have to continue searching the next block. // // In this case, we pretend the key is the the last restart interval. 
// The while-loop below will search the last restart interval for the @@ -315,22 +361,21 @@ // check if the key is in the restart_interval assert(restart_index < num_restarts_); SeekToRestartPoint(restart_index); + current_ = GetRestartPoint(restart_index); - const char* limit = nullptr; - if (restart_index_ + 1 < num_restarts_) { - limit = data_ + GetRestartPoint(restart_index_ + 1); - } else { - limit = data_ + restarts_; + uint32_t limit = restarts_; + if (restart_index + 1 < num_restarts_) { + limit = GetRestartPoint(restart_index + 1); } - - while (true) { + while (current_ < limit) { + bool shared; // Here we only linear seek the target key inside the restart interval. // If a key does not exist inside a restart interval, we avoid - // further searching the block content accross restart interval boundary. + // further searching the block content across restart interval boundary. // - // TODO(fwu): check the left and write boundary of the restart interval + // TODO(fwu): check the left and right boundary of the restart interval // to avoid linear seek a target key that is out of range. - if (!ParseNextDataKey(limit) || Compare(key_, target) >= 0) { + if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) { // we stop at the first potential matching user key. break; } @@ -341,7 +386,7 @@ // 1) there is only one user_key match in the block (otherwise collsion). // the matching user_key resides in the last restart interval, and it // is the last key of the restart interval and of the block as well. - // ParseNextDataKey() skiped it as its [ type | seqno ] is smaller. + // ParseNextKey() skiped it as its [ type | seqno ] is smaller. // // 2) The seek_key is not found in the HashIndex Lookup(), i.e. 
kNoEntry, // AND all existing user_keys in the restart interval are smaller than @@ -355,18 +400,18 @@ return true; } - if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) { + if (ucmp().Compare(raw_key_.GetUserKey(), target_user_key) != 0) { // the key is not in this block and cannot be at the next block either. return false; } // Here we are conservative and only support a limited set of cases - ValueType value_type = ExtractValueType(key_.GetKey()); + ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); if (value_type != ValueType::kTypeValue && value_type != ValueType::kTypeDeletion && value_type != ValueType::kTypeSingleDeletion && value_type != ValueType::kTypeBlobIndex) { - Seek(target); + SeekImpl(target); return true; } @@ -374,18 +419,19 @@ return true; } -void IndexBlockIter::Seek(const Slice& target) { +void IndexBlockIter::SeekImpl(const Slice& target) { TEST_SYNC_POINT("IndexBlockIter::Seek:0"); - Slice seek_key = target; - if (!key_includes_seq_) { - seek_key = ExtractUserKey(target); - } PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } + Slice seek_key = target; + if (raw_key_.IsUserKey()) { + seek_key = ExtractUserKey(target); + } status_ = Status::OK(); uint32_t index = 0; + bool skip_linear_scan = false; bool ok = false; if (prefix_index_) { bool prefix_may_exist = true; @@ -397,68 +443,88 @@ current_ = restarts_; status_ = Status::NotFound(); } + // restart interval must be one when hash search is enabled so the binary + // search simply lands at the right place. 
+ skip_linear_scan = true; } else if (value_delta_encoded_) { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + ok = BinarySeek(seek_key, &index, &skip_linear_scan); } else { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + ok = BinarySeek(seek_key, &index, &skip_linear_scan); } if (!ok) { return; } - SeekToRestartPoint(index); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} - // Linear search (within restart block) for first key >= target - while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) { +void DataBlockIter::SeekForPrevImpl(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); + + if (!Valid()) { + SeekToLastImpl(); + } else { + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); + } } } -void DataBlockIter::SeekForPrev(const Slice& target) { +void MetaBlockIter::SeekForPrevImpl(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); Slice seek_key = target; if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } - SeekToRestartPoint(index); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); - // Linear search (within restart block) for first key >= seek_key - while (ParseNextDataKey() && Compare(key_, seek_key) < 0) { - } if (!Valid()) { - SeekToLast(); + SeekToLastImpl(); } else { - while (Valid() && Compare(key_, seek_key) > 0) { - Prev(); + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); } } } -void DataBlockIter::SeekToFirst() { +void 
DataBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); - ParseNextDataKey(); + bool is_shared = false; + ParseNextDataKey(&is_shared); } -void DataBlockIter::SeekToFirstOrReport() { +void MetaBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); - ParseNextDataKey(); + bool is_shared = false; + ParseNextKey(&is_shared); } -void IndexBlockIter::SeekToFirst() { +void IndexBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -467,17 +533,30 @@ ParseNextIndexKey(); } -void DataBlockIter::SeekToLast() { +void DataBlockIter::SeekToLastImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + bool is_shared = false; + while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +void MetaBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(num_restarts_ - 1); - while (ParseNextDataKey() && NextEntryOffset() < restarts_) { + bool is_shared = false; + while (ParseNextKey(&is_shared) && + NextEntryOffset() < restarts_) { // Keep skipping } } -void IndexBlockIter::SeekToLast() { +void IndexBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -493,17 +572,16 @@ current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); - key_.Clear(); + raw_key_.Clear(); value_.clear(); } +template template -bool DataBlockIter::ParseNextDataKey(const char* limit) { +bool BlockIter::ParseNextKey(bool* is_shared) { current_ = NextEntryOffset(); const char* p = data_ + current_; - if (!limit) { - limit = data_ + restarts_; // Restarts come right after data - } + const char* limit = data_ + restarts_; // Restarts come right after data if (p >= limit) { // No more entries to return. Mark as invalid. 
@@ -511,50 +589,23 @@ restart_index_ = num_restarts_; return false; } - // Decode next entry uint32_t shared, non_shared, value_length; p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); - if (p == nullptr || key_.Size() < shared) { + if (p == nullptr || raw_key_.Size() < shared) { CorruptionError(); return false; } else { if (shared == 0) { - // If this key dont share any bytes with prev key then we dont need - // to decode it and can use it's address in the block directly. - key_.SetKey(Slice(p, non_shared), false /* copy */); - key_pinned_ = true; + *is_shared = false; + // If this key doesn't share any bytes with prev key then we don't need + // to decode it and can use its address in the block directly. + raw_key_.SetKey(Slice(p, non_shared), false /* copy */); } else { // This key share `shared` bytes with prev key, we need to decode it - key_.TrimAppend(shared, p, non_shared); - key_pinned_ = false; - } - - if (global_seqno_ != kDisableGlobalSequenceNumber) { - // If we are reading a file with a global sequence number we should - // expect that all encoded sequence numbers are zeros and any value - // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion. - assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); - - ValueType value_type = ExtractValueType(key_.GetKey()); - assert(value_type == ValueType::kTypeValue || - value_type == ValueType::kTypeMerge || - value_type == ValueType::kTypeDeletion || - value_type == ValueType::kTypeRangeDeletion); - - if (key_pinned_) { - // TODO(tec): Investigate updating the seqno in the loaded block - // directly instead of doing a copy and update. - - // We cannot use the key address in the block directly because - // we have a global_seqno_ that will overwrite the encoded one. 
- key_.OwnKey(); - key_pinned_ = false; - } - - key_.UpdateInternalKey(global_seqno_, value_type); + *is_shared = true; + raw_key_.TrimAppend(shared, p, non_shared); } - value_ = Slice(p + non_shared, value_length); if (shared == 0) { while (restart_index_ + 1 < num_restarts_ && @@ -568,52 +619,42 @@ } } -bool IndexBlockIter::ParseNextIndexKey() { - current_ = NextEntryOffset(); - const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data - if (p >= limit) { - // No more entries to return. Mark as invalid. - current_ = restarts_; - restart_index_ = num_restarts_; - return false; - } - - // Decode next entry - uint32_t shared, non_shared, value_length; - if (value_delta_encoded_) { - p = DecodeKeyV4()(p, limit, &shared, &non_shared); - value_length = 0; +bool DataBlockIter::ParseNextDataKey(bool* is_shared) { + if (ParseNextKey(is_shared)) { +#ifndef NDEBUG + if (global_seqno_ != kDisableGlobalSequenceNumber) { + // If we are reading a file with a global sequence number we should + // expect that all encoded sequence numbers are zeros and any value + // type is kTypeValue, kTypeMerge, kTypeDeletion, + // kTypeDeletionWithTimestamp, or kTypeRangeDeletion. 
+ uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey()); + SequenceNumber seqno; + ValueType value_type; + UnPackSequenceAndType(packed, &seqno, &value_type); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeDeletionWithTimestamp || + value_type == ValueType::kTypeRangeDeletion); + assert(seqno == 0); + } +#endif // NDEBUG + return true; } else { - p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); - } - if (p == nullptr || key_.Size() < shared) { - CorruptionError(); return false; } - if (shared == 0) { - // If this key dont share any bytes with prev key then we dont need - // to decode it and can use it's address in the block directly. - key_.SetKey(Slice(p, non_shared), false /* copy */); - key_pinned_ = true; - } else { - // This key share `shared` bytes with prev key, we need to decode it - key_.TrimAppend(shared, p, non_shared); - key_pinned_ = false; - } - value_ = Slice(p + non_shared, value_length); - if (shared == 0) { - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; - } - } - // else we are in the middle of a restart interval and the restart_index_ - // thus has not changed - if (value_delta_encoded_ || global_seqno_state_ != nullptr) { - DecodeCurrentValue(shared); +} + +bool IndexBlockIter::ParseNextIndexKey() { + bool is_shared = false; + bool ok = (value_delta_encoded_) ? ParseNextKey(&is_shared) + : ParseNextKey(&is_shared); + if (ok) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + DecodeCurrentValue(is_shared); + } } - return true; + return ok; } // The format: @@ -623,16 +664,16 @@ // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // where, k is key, v is value, and its encoding is in parenthesis. 
// The format of each key is (shared_size, non_shared_size, shared, non_shared) -// The format of each value, i.e., block hanlde, is (offset, size) whenever the -// shared_size is 0, which included the first entry in each restart point. +// The format of each value, i.e., block handle, is (offset, size) whenever the +// is_shared is false, which included the first entry in each restart point. // Otherwise the format is delta-size = block handle size - size of last block // handle. -void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { +void IndexBlockIter::DecodeCurrentValue(bool is_shared) { Slice v(value_.data(), data_ + restarts_ - value_.data()); // Delta encoding is used if `shared` != 0. Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( &v, have_first_key_, - (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr); assert(decode_s.ok()); value_ = Slice(value_.data(), v.data() - value_.data()); @@ -657,20 +698,78 @@ } } -// Binary search in restart array to find the first restart point that -// is either the last restart point with a key less than target, -// which means the key of next restart point is larger than target, or -// the first restart point with a key = target +template +void BlockIter::FindKeyAfterBinarySeek(const Slice& target, + uint32_t index, + bool skip_linear_scan) { + // SeekToRestartPoint() only does the lookup in the restart block. We need + // to follow it up with NextImpl() to position the iterator at the restart + // key. + SeekToRestartPoint(index); + NextImpl(); + + if (!skip_linear_scan) { + // Linear search (within restart block) for first key >= target + uint32_t max_offset; + if (index + 1 < num_restarts_) { + // We are in a non-last restart interval. 
Since `BinarySeek()` guarantees + // the next restart key is strictly greater than `target`, we can + // terminate upon reaching it without any additional key comparison. + max_offset = GetRestartPoint(index + 1); + } else { + // We are in the last restart interval. The while-loop will terminate by + // `Valid()` returning false upon advancing past the block's last key. + max_offset = port::kMaxUint32; + } + while (true) { + NextImpl(); + if (!Valid()) { + break; + } + if (current_ == max_offset) { + assert(CompareCurrentKey(target) > 0); + break; + } else if (CompareCurrentKey(target) >= 0) { + break; + } + } + } +} + +// Binary searches in restart array to find the starting restart point for the +// linear scan, and stores it in `*index`. Assumes restart array does not +// contain duplicate keys. It is guaranteed that the restart key at `*index + 1` +// is strictly greater than `target` or does not exist (this can be used to +// elide a comparison when linear scan reaches all the way to the next restart +// key). Furthermore, `*skip_linear_scan` is set to indicate whether the +// `*index`th restart key is the final result so that key does not need to be +// compared again later. template template -bool BlockIter::BinarySeek(const Slice& target, uint32_t left, - uint32_t right, uint32_t* index, - const Comparator* comp) { - assert(left <= right); +bool BlockIter::BinarySeek(const Slice& target, uint32_t* index, + bool* skip_linear_scan) { + if (restarts_ == 0) { + // SST files dedicated to range tombstones are written with index blocks + // that have no keys while also having `num_restarts_ == 1`. This would + // cause a problem for `BinarySeek()` as it'd try to access the first key + // which does not exist. We identify such blocks by the offset at which + // their restarts are stored, and return false to prevent any attempted + // key accesses. 
+ return false; + } - while (left < right) { - uint32_t mid = (left + right + 1) / 2; - uint32_t region_offset = GetRestartPoint(mid); + *skip_linear_scan = false; + // Loop invariants: + // - Restart key at index `left` is less than or equal to the target key. The + // sentinel index `-1` is considered to have a key that is less than all + // keys. + // - Any restart keys after index `right` are strictly greater than the target + // key. + int64_t left = -1, right = num_restarts_ - 1; + while (left != right) { + // The `mid` is computed by rounding up so it lands in (`left`, `right`]. + int64_t mid = left + (right - left + 1) / 2; + uint32_t region_offset = GetRestartPoint(static_cast(mid)); uint32_t shared, non_shared; const char* key_ptr = DecodeKeyFunc()( data_ + region_offset, data_ + restarts_, &shared, &non_shared); @@ -679,7 +778,8 @@ return false; } Slice mid_key(key_ptr, non_shared); - int cmp = comp->Compare(mid_key, target); + raw_key_.SetKey(mid_key, false /* copy */); + int cmp = CompareCurrentKey(target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all // blocks before "mid" are uninteresting. @@ -689,11 +789,19 @@ // after "mid" are uninteresting. right = mid - 1; } else { + *skip_linear_scan = true; left = right = mid; } } - *index = left; + if (left == -1) { + // All keys in the block were strictly greater than `target`. So the very + // first key in the block is the final seek result. 
+ *skip_linear_scan = true; + *index = 0; + } else { + *index = static_cast(left); + } return true; } @@ -713,7 +821,8 @@ return 1; // Return target is smaller } Slice block_key(key_ptr, non_shared); - return Compare(block_key, target); + raw_key_.SetKey(block_key, false /* copy */); + return CompareCurrentKey(target); } // Binary search in block_ids to find the first block @@ -807,7 +916,7 @@ assert(prefix_index_); *prefix_may_exist = true; Slice seek_key = target; - if (!key_includes_seq_) { + if (raw_key_.IsUserKey()) { seek_key = ExtractUserKey(target); } uint32_t* block_ids = nullptr; @@ -865,14 +974,13 @@ // TEST_SYNC_POINT("Block::~Block"); } -Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, - size_t read_amp_bytes_per_bit, Statistics* statistics) +Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics) : contents_(std::move(contents)), data_(contents_.data.data()), size_(contents_.data.size()), restart_offset_(0), - num_restarts_(0), - global_seqno_(_global_seqno) { + num_restarts_(0) { TEST_SYNC_POINT("Block::Block:0"); if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker @@ -923,8 +1031,23 @@ } } -DataBlockIter* Block::NewDataIterator(const Comparator* cmp, - const Comparator* ucmp, +MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { + MetaBlockIter* iter = new MetaBlockIter(); + if (size_ < 2 * sizeof(uint32_t)) { + iter->Invalidate(Status::Corruption("bad block contents")); + return iter; + } else if (num_restarts_ == 0) { + // Empty block. 
+ iter->Invalidate(Status::OK()); + } else { + iter->Initialize(data_, restart_offset_, num_restarts_, + block_contents_pinned); + } + return iter; +} + +DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, DataBlockIter* iter, Statistics* stats, bool block_contents_pinned) { DataBlockIter* ret_iter; @@ -943,7 +1066,7 @@ return ret_iter; } else { ret_iter->Initialize( - cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, + raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, read_amp_bitmap_.get(), block_contents_pinned, data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); if (read_amp_bitmap_) { @@ -958,10 +1081,10 @@ } IndexBlockIter* Block::NewIndexIterator( - const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, - Statistics* /*stats*/, bool total_order_seek, bool have_first_key, - bool key_includes_seq, bool value_is_full, bool block_contents_pinned, - BlockPrefixIndex* prefix_index) { + const Comparator* raw_ucmp, SequenceNumber global_seqno, + IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, + bool have_first_key, bool key_includes_seq, bool value_is_full, + bool block_contents_pinned, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -979,8 +1102,8 @@ } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index; - ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - global_seqno_, prefix_index_ptr, have_first_key, + ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_, + global_seqno, prefix_index_ptr, have_first_key, key_includes_seq, value_is_full, block_contents_pinned); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,6 @@ #include #include -#include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "port/malloc.h" #include "rocksdb/iterator.h" @@ -35,6 +34,7 @@ class BlockIter; class DataBlockIter; class IndexBlockIter; +class MetaBlockIter; class BlockPrefixIndex; // BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data @@ -151,8 +151,7 @@ class Block { public: // Initialize the block with the specified contents. - explicit Block(BlockContents&& contents, SequenceNumber _global_seqno, - size_t read_amp_bytes_per_bit = 0, + explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0, Statistics* statistics = nullptr); // No copying allowed Block(const Block&) = delete; @@ -169,8 +168,8 @@ BlockBasedTableOptions::DataBlockIndexType IndexType() const; - // If comparator is InternalKeyComparator, user_comparator is its user - // comparator; they are equal otherwise. + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. 
// // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* @@ -188,12 +187,30 @@ // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. - DataBlockIter* NewDataIterator(const Comparator* comparator, - const Comparator* user_comparator, + DataBlockIter* NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, DataBlockIter* iter = nullptr, Statistics* stats = nullptr, bool block_contents_pinned = false); + // Returns an MetaBlockIter for iterating over blocks containing metadata + // (like Properties blocks). Unlike data blocks, the keys for these blocks + // do not contain sequence numbers, do not use a user-define comparator, and + // do not track read amplification/statistics. Additionally, MetaBlocks will + // not assert if the block is formatted improperly. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false); + + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. + // // key_includes_seq, default true, means that the keys are in internal key // format. // value_is_full, default true, means that no delta encoding is @@ -206,8 +223,8 @@ // first_internal_key. It affects data serialization format, so the same value // have_first_key must be used when writing and reading index. // It is determined by IndexType property of the table. 
- IndexBlockIter* NewIndexIterator(const Comparator* comparator, - const Comparator* user_comparator, + IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, IndexBlockIter* iter, Statistics* stats, bool total_order_seek, bool have_first_key, bool key_includes_seq, bool value_is_full, @@ -217,8 +234,6 @@ // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; - SequenceNumber global_seqno() const { return global_seqno_; } - private: BlockContents contents_; const char* data_; // contents_.data.data() @@ -226,23 +241,38 @@ uint32_t restart_offset_; // Offset in data_ of restart array uint32_t num_restarts_; std::unique_ptr read_amp_bitmap_; - // All keys in the block will have seqno = global_seqno_, regardless of - // the encoded value (kDisableGlobalSequenceNumber means disabled) - const SequenceNumber global_seqno_; - DataBlockHashIndex data_block_hash_index_; }; +// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The +// format of this data buffer is an uncompressed, sorted sequence of key-value +// pairs (see `Block` API for more details). +// +// Notably, the keys may either be in internal key format or user key format. +// Subclasses are responsible for configuring the key format. +// +// `BlockIter` intends to provide final overrides for all of +// `InternalIteratorBase` functions that can move the iterator. It does +// this to guarantee `UpdateKey()` is called exactly once after each key +// movement potentially visible to users. In this step, the key is prepared +// (e.g., serialized if global seqno is in effect) so it can be returned +// immediately when the user asks for it via calling `key() const`. +// +// For its subclasses, it provides protected variants of the above-mentioned +// final-overridden methods. They are named with the "Impl" suffix, e.g., +// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. 
These +// "Impl" functions are responsible for positioning `raw_key_` but not +// invoking `UpdateKey()`. template class BlockIter : public InternalIteratorBase { public: - void InitializeBase(const Comparator* comparator, const char* data, + void InitializeBase(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, bool block_contents_pinned) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid - comparator_ = comparator; + raw_ucmp_ = raw_ucmp; data_ = data; restarts_ = restarts; num_restarts_ = num_restarts; @@ -255,10 +285,9 @@ // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do // nothing. Calls cleanup functions. - void InvalidateBase(Status s) { + virtual void Invalidate(const Status& s) { // Assert that the BlockIter is never deleted while Pinning is Enabled. - assert(!pinned_iters_mgr_ || - (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); data_ = nullptr; current_ = restarts_; @@ -269,10 +298,47 @@ } bool Valid() const override { return current_ < restarts_; } + + virtual void SeekToFirst() override final { + SeekToFirstImpl(); + UpdateKey(); + } + + virtual void SeekToLast() override final { + SeekToLastImpl(); + UpdateKey(); + } + + virtual void Seek(const Slice& target) override final { + SeekImpl(target); + UpdateKey(); + } + + virtual void SeekForPrev(const Slice& target) override final { + SeekForPrevImpl(target); + UpdateKey(); + } + + virtual void Next() override final { + NextImpl(); + UpdateKey(); + } + + virtual bool NextAndGetResult(IterateResult* result) override final { + // This does not need to call `UpdateKey()` as the parent class only has + // access to the `UpdateKey()`-invoking functions. 
+ return InternalIteratorBase::NextAndGetResult(result); + } + + virtual void Prev() override final { + PrevImpl(); + UpdateKey(); + } + Status status() const override { return status_; } Slice key() const override { assert(Valid()); - return key_.GetKey(); + return key_; } #ifndef NDEBUG @@ -280,6 +346,7 @@ // Assert that the BlockIter is never deleted while Pinning is Enabled. assert(!pinned_iters_mgr_ || (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + status_.PermitUncheckedError(); } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; @@ -304,9 +371,6 @@ Cache::Handle* cache_handle() { return cache_handle_; } protected: - // Note: The type could be changed to InternalKeyComparator but we see a weird - // performance drop by that. - const Comparator* comparator_; const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array @@ -315,9 +379,14 @@ uint32_t restarts_; // Offset of restart array (list of fixed32) // current_ is offset in data_ of current entry. >= restarts_ if !Valid uint32_t current_; - IterKey key_; + // Raw key from block. + IterKey raw_key_; + // Buffer for key data when global seqno assignment is enabled. + IterKey key_buf_; Slice value_; Status status_; + // Key to be exposed to users. 
+ Slice key_; bool key_pinned_; // Whether the block data is guaranteed to outlive this iterator, and // as long as the cleanup functions are transferred to another class, @@ -325,7 +394,62 @@ bool block_contents_pinned_; SequenceNumber global_seqno_; + virtual void SeekToFirstImpl() = 0; + virtual void SeekToLastImpl() = 0; + virtual void SeekImpl(const Slice& target) = 0; + virtual void SeekForPrevImpl(const Slice& target) = 0; + virtual void NextImpl() = 0; + + virtual void PrevImpl() = 0; + + template + inline bool ParseNextKey(bool* is_shared); + + InternalKeyComparator icmp() { + return InternalKeyComparator(raw_ucmp_, false /* named */); + } + + UserComparatorWrapper ucmp() { return UserComparatorWrapper(raw_ucmp_); } + + // Must be called every time a key is found that needs to be returned to user, + // and may be called when no key is found (as a no-op). Updates `key_`, + // `key_buf_`, and `key_pinned_` with info about the found key. + void UpdateKey() { + key_buf_.Clear(); + if (!Valid()) { + return; + } + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + key_ = raw_key_.GetUserKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + key_ = raw_key_.GetInternalKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else { + key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_, + ExtractValueType(raw_key_.GetInternalKey())); + key_ = key_buf_.GetInternalKey(); + key_pinned_ = false; + } + } + + // Returns the result of `Comparator::Compare()`, where the appropriate + // comparator is used for the block contents, the LHS argument is the current + // key with global seqno applied, and the RHS argument is `other`. 
+ int CompareCurrentKey(const Slice& other) { + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + return ucmp().Compare(raw_key_.GetUserKey(), other); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + return icmp().Compare(raw_key_.GetInternalKey(), other); + } + return icmp().Compare(raw_key_.GetInternalKey(), global_seqno_, other, + kDisableGlobalSequenceNumber); + } + private: + const Comparator* raw_ucmp_; // Store the cache handle, if the block is cached. We need this since the // only other place the handle is stored is as an argument to the Cleanable // function callback, which is hard to retrieve. When multiple value @@ -346,7 +470,7 @@ } void SeekToRestartPoint(uint32_t index) { - key_.Clear(); + raw_key_.Clear(); restart_index_ = index; // current_ will be fixed by ParseNextKey(); @@ -357,36 +481,36 @@ void CorruptionError(); + protected: template - inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp); + inline bool BinarySeek(const Slice& target, uint32_t* index, + bool* is_index_key_result); + + void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, + bool is_index_key_result); }; class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} - DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, - const char* data, uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, + DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts, + uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, DataBlockHashIndex* data_block_hash_index) : DataBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - global_seqno, read_amp_bitmap, block_contents_pinned, - data_block_hash_index); + Initialize(raw_ucmp, data, 
restarts, num_restarts, global_seqno, + read_amp_bitmap, block_contents_pinned, data_block_hash_index); } - void Initialize(const Comparator* comparator, - const Comparator* user_comparator, const char* data, + void Initialize(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, DataBlockHashIndex* data_block_hash_index) { - InitializeBase(comparator, data, restarts, num_restarts, global_seqno, + InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, block_contents_pinned); - user_comparator_ = user_comparator; - key_.SetIsUserKey(false); + raw_key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; data_block_hash_index_ = data_block_hash_index; @@ -403,45 +527,35 @@ return value_; } - void Seek(const Slice& target) override; - inline bool SeekForGet(const Slice& target) { if (!data_block_hash_index_) { - Seek(target); + SeekImpl(target); + UpdateKey(); return true; } - - return SeekForGetImpl(target); + bool res = SeekForGetImpl(target); + UpdateKey(); + return res; } - void SeekForPrev(const Slice& target) override; - - void Prev() override; - - void Next() final override; - - // Try to advance to the next entry in the block. If there is data corruption - // or error, report it to the caller instead of aborting the process. May - // incur higher CPU overhead because we need to perform check on every entry. - void NextOrReport(); - - void SeekToFirst() override; - - // Try to seek to the first entry in the block. If there is data corruption - // or error, report it to caller instead of aborting the process. May incur - // higher CPU overhead because we need to perform check on every entry. 
- void SeekToFirstOrReport(); - - void SeekToLast() override; - - void Invalidate(Status s) { - InvalidateBase(s); + void Invalidate(const Status& s) override { + BlockIter::Invalidate(s); // Clear prev entries cache. prev_entries_keys_buff_.clear(); prev_entries_.clear(); prev_entries_idx_ = -1; } + protected: + friend Block; + inline bool ParseNextDataKey(bool* is_shared); + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; + private: // read-amp bitmap BlockReadAmpBitmap* read_amp_bitmap_; @@ -472,41 +586,57 @@ int32_t prev_entries_idx_ = -1; DataBlockHashIndex* data_block_hash_index_; - const Comparator* user_comparator_; - template - inline bool ParseNextDataKey(const char* limit = nullptr); + bool SeekForGetImpl(const Slice& target); +}; - inline int Compare(const IterKey& ikey, const Slice& b) const { - return comparator_->Compare(ikey.GetInternalKey(), b); +// Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and +// are used to store Properties associated with table. +// Meta blocks always store user keys (no sequence number) and always +// use the BytewiseComparator. Additionally, MetaBlock accesses are +// not recorded in the Statistics or for Read-Amplification. +class MetaBlockIter final : public BlockIter { + public: + MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } + void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, + bool block_contents_pinned) { + // Initializes the iterator with a BytewiseComparator and + // the raw key being a user key. 
+ InitializeBase(BytewiseComparator(), data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned); + raw_key_.SetIsUserKey(true); } - bool SeekForGetImpl(const Slice& target); + Slice value() const override { + assert(Valid()); + return value_; + } + + protected: + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; }; class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} - Slice key() const override { - assert(Valid()); - return key_.GetKey(); - } // key_includes_seq, default true, means that the keys are in internal key // format. // value_is_full, default true, means that no delta encoding is // applied to values. - void Initialize(const Comparator* comparator, - const Comparator* user_comparator, const char* data, + void Initialize(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, bool have_first_key, bool key_includes_seq, bool value_is_full, bool block_contents_pinned) { - InitializeBase(key_includes_seq ? 
comparator : user_comparator, data, - restarts, num_restarts, kDisableGlobalSequenceNumber, - block_contents_pinned); - key_includes_seq_ = key_includes_seq; - key_.SetIsUserKey(!key_includes_seq_); + InitializeBase(raw_ucmp, data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned); + raw_key_.SetIsUserKey(!key_includes_seq); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; have_first_key_ = have_first_key; @@ -518,10 +648,8 @@ } Slice user_key() const override { - if (key_includes_seq_) { - return ExtractUserKey(key()); - } - return key(); + assert(Valid()); + return raw_key_.GetUserKey(); } IndexValue value() const override { @@ -538,6 +666,11 @@ } } + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + + protected: // IndexBlockIter follows a different contract for prefix iterator // from data iterators. // If prefix of the seek key `target` exists in the file, it must @@ -545,36 +678,28 @@ // If the prefix of `target` doesn't exist in the file, it can either // return the result of total order seek, or set both of Valid() = false // and status() = NotFound(). - void Seek(const Slice& target) override; + void SeekImpl(const Slice& target) override; - void SeekForPrev(const Slice&) override { + void SeekForPrevImpl(const Slice&) override { assert(false); current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::InvalidArgument( "RocksDB internal error: should never call SeekForPrev() on index " "blocks"); - key_.Clear(); + raw_key_.Clear(); value_.clear(); } - void Prev() override; - - void Next() override; + void PrevImpl() override; - void SeekToFirst() override; + void NextImpl() override; - void SeekToLast() override; + void SeekToFirstImpl() override; - void Invalidate(Status s) { InvalidateBase(s); } - - bool IsValuePinned() const override { - return global_seqno_state_ != nullptr ? 
false : BlockIter::IsValuePinned(); - } + void SeekToLastImpl() override; private: - // Key is in InternalKey format - bool key_includes_seq_; bool value_delta_encoded_; bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; @@ -613,19 +738,11 @@ bool* prefix_may_exist); inline int CompareBlockKey(uint32_t block_index, const Slice& target); - inline int Compare(const Slice& a, const Slice& b) const { - return comparator_->Compare(a, b); - } - - inline int Compare(const IterKey& ikey, const Slice& b) const { - return comparator_->Compare(ikey.GetKey(), b); - } - inline bool ParseNextIndexKey(); // When value_delta_encoded_ is enabled it decodes the value which is assumed // to be BlockHandle and put it to decoded_value_ - inline void DecodeCurrentValue(uint32_t shared); + inline void DecodeCurrentValue(bool is_shared); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ whole_key_filtering_(table_opt.whole_key_filtering), prev_prefix_start_(0), prev_prefix_size_(0), - num_added_(0) { + total_added_in_built_(0) { assert(policy_); } @@ -80,19 +80,22 @@ } } -void BlockBasedFilterBlockBuilder::Add(const Slice& key) { - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { - AddPrefix(key); +size_t BlockBasedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + start_.size(); +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key_without_ts) { + if (prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { + AddPrefix(key_without_ts); } if 
(whole_key_filtering_) { - AddKey(key); + AddKey(key_without_ts); } } // Add key to filter if needed inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { - num_added_++; start_.push_back(entries_.size()); entries_.append(key.data(), key.size()); } @@ -114,10 +117,12 @@ } } -Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, - Status* status) { - // In this impl we ignore BlockHandle +Slice BlockBasedFilterBlockBuilder::Finish( + const BlockHandle& /*tmp*/, Status* status, + std::unique_ptr* /* filter_data */) { + // In this impl we ignore BlockHandle and filter_data *status = Status::OK(); + if (!start_.empty()) { GenerateFilter(); } @@ -140,6 +145,7 @@ filter_offsets_.push_back(static_cast(result_.size())); return; } + total_added_in_built_ += num_entries; // Make list of keys from flattened key structure start_.push_back(entries_.size()); // Simplify length computation @@ -171,19 +177,20 @@ } std::unique_ptr BlockBasedFilterBlockReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context) { + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { assert(table); assert(table->get_rep()); assert(!pin || prefetch); CachableEntry filter_block; if (prefetch || !use_cache) { - const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), - use_cache, nullptr /* get_context */, - lookup_context, &filter_block); + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::unique_ptr(); } @@ -251,6 +258,7 @@ const Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -309,6 
+317,7 @@ GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, nullptr /* lookup_context */, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::string("Unable to retrieve filter block"); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -44,9 +44,14 @@ virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { + return start_.empty() && filter_offsets_.empty(); + } + virtual size_t EstimateEntriesAdded() override; + virtual Slice Finish( + const BlockHandle& tmp, Status* status, + std::unique_ptr* filter_data = nullptr) override; using FilterBlockBuilder::Finish; private: @@ -70,7 +75,7 @@ std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; - size_t num_added_; // Number of keys added + uint64_t total_added_in_built_; // Total keys added to filters built so far }; // A FilterBlockReader is used to parse filter from SST table. 
@@ -85,9 +90,9 @@ void operator=(const BlockBasedFilterBlockReader&) = delete; static std::unique_ptr Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context); + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return true; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -76,17 +76,26 @@ TEST_F(FilterBlockTest, SingleChunk) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.StartBlock(100); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); + builder.Add("bar"); builder.Add("bar"); builder.Add("box"); builder.StartBlock(200); builder.Add("box"); builder.StartBlock(300); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice(builder.Finish()); + // XXX: "bar" should only count once but is counted twice. This actually + // indicates a serious space usage bug in old block-based filter. Good + // that it is deprecated. + // "box" counts twice, because it's in distinct blocks. 
+ ASSERT_EQ(6, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,53 +11,61 @@ #include #include + +#include #include #include #include +#include #include #include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "cache/cache_reservation_manager.h" #include "db/dbformat.h" #include "index_builder.h" - +#include "logging/logging.h" +#include "memory/memory_allocator.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" - +#include "rocksdb/types.h" #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" +#include "table/block_based/block_like_traits.h" #include "table/block_based/filter_block.h" #include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/partitioned_filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "table/table_builder.h" - -#include "memory/memory_allocator.h" #include "util/coding.h" #include "util/compression.h" 
-#include "util/crc32c.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/xxhash.h" +#include "util/work_queue.h" namespace ROCKSDB_NAMESPACE { extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTableOptions::IndexType IndexType; // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { +constexpr size_t kBlockTrailerSize = BlockBasedTable::kBlockTrailerSize; + // Create a filter block builder based on its type. FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, @@ -65,7 +73,7 @@ const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { const BlockBasedTableOptions& table_opt = context.table_options; - if (table_opt.filter_policy == nullptr) return nullptr; + assert(table_opt.filter_policy); // precondition FilterBitsBuilder* filter_bits_builder = BloomFilterPolicy::GetBuilderFromContext(context); @@ -76,8 +84,9 @@ if (table_opt.partition_filters) { assert(p_index_builder != nullptr); // Since after partition cut request from filter builder it takes time - // until index builder actully cuts the partition, we take the lower bound - // as partition size. + // until index builder actully cuts the partition, until the end of a + // data block potentially with many keys, we take the lower bound as + // partition size. assert(table_opt.block_size_deviation <= 100); auto partition_size = static_cast(((table_opt.metadata_block_size * @@ -102,48 +111,6 @@ return compressed_size < raw_size - (raw_size / 8u); } -bool CompressBlockInternal(const Slice& raw, - const CompressionInfo& compression_info, - uint32_t format_version, - std::string* compressed_output) { - // Will return compressed block contents if (1) the compression method is - // supported in this platform and (2) the compression rate is "good enough". 
- switch (compression_info.type()) { - case kSnappyCompression: - return Snappy_Compress(compression_info, raw.data(), raw.size(), - compressed_output); - case kZlibCompression: - return Zlib_Compress( - compression_info, - GetCompressFormatForVersion(kZlibCompression, format_version), - raw.data(), raw.size(), compressed_output); - case kBZip2Compression: - return BZip2_Compress( - compression_info, - GetCompressFormatForVersion(kBZip2Compression, format_version), - raw.data(), raw.size(), compressed_output); - case kLZ4Compression: - return LZ4_Compress( - compression_info, - GetCompressFormatForVersion(kLZ4Compression, format_version), - raw.data(), raw.size(), compressed_output); - case kLZ4HCCompression: - return LZ4HC_Compress( - compression_info, - GetCompressFormatForVersion(kLZ4HCCompression, format_version), - raw.data(), raw.size(), compressed_output); - case kXpressCompression: - return XPRESS_Compress(raw.data(), raw.size(), compressed_output); - case kZSTD: - case kZSTDNotFinalCompression: - return ZSTD_Compress(compression_info, raw.data(), raw.size(), - compressed_output); - default: - // Do not recognize this compression type - return false; - } -} - } // namespace // format_version is the block format as defined in include/rocksdb/table.h @@ -152,11 +119,9 @@ bool do_sample, std::string* compressed_output, std::string* sampled_output_fast, std::string* sampled_output_slow) { - *type = info.type(); - - if (info.type() == kNoCompression && !info.SampleForCompression()) { - return raw; - } + assert(type); + assert(compressed_output); + assert(compressed_output->empty()); // If requested, we sample one in every N block with a // fast and slow compression algorithm and report the stats. @@ -164,10 +129,10 @@ // enabling compression and they also get a hint about which // compression algorithm wil be beneficial. 
if (do_sample && info.SampleForCompression() && - Random::GetTLSInstance()->OneIn((int)info.SampleForCompression()) && - sampled_output_fast && sampled_output_slow) { + Random::GetTLSInstance()->OneIn( + static_cast(info.SampleForCompression()))) { // Sampling with a fast compression algorithm - if (LZ4_Supported() || Snappy_Supported()) { + if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) { CompressionType c = LZ4_Supported() ? kLZ4Compression : kSnappyCompression; CompressionContext context(c); @@ -176,33 +141,46 @@ CompressionDict::GetEmptyDict(), c, info.SampleForCompression()); - CompressBlockInternal(raw, info_tmp, format_version, sampled_output_fast); + CompressData(raw, info_tmp, GetCompressFormatForVersion(format_version), + sampled_output_fast); } // Sampling with a slow but high-compression algorithm - if (ZSTD_Supported() || Zlib_Supported()) { + if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) { CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression; CompressionContext context(c); CompressionOptions options; CompressionInfo info_tmp(options, context, CompressionDict::GetEmptyDict(), c, info.SampleForCompression()); - CompressBlockInternal(raw, info_tmp, format_version, sampled_output_slow); + + CompressData(raw, info_tmp, GetCompressFormatForVersion(format_version), + sampled_output_slow); } } - // Actually compress the data - if (*type != kNoCompression) { - if (CompressBlockInternal(raw, info, format_version, compressed_output) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } + if (info.type() == kNoCompression) { + *type = kNoCompression; + return raw; + } + + // Actually compress the data; if the compression method is not supported, + // or the compression fails etc., just fall back to uncompressed + if (!CompressData(raw, info, GetCompressFormatForVersion(format_version), + compressed_output)) { + *type = kNoCompression; + return raw; + } + + // Check 
the compression ratio; if it's not good enough, just fall back to + // uncompressed + if (!GoodCompressionRatio(compressed_output->size(), raw.size())) { + *type = kNoCompression; + return raw; } - // Compression method is not supported, or not good - // compression ratio, so just fall back to uncompressed form. - *type = kNoCompression; - return raw; + *type = info.type(); + return *compressed_output; } // kBlockBasedTableMagicNumber was picked by running @@ -240,9 +218,9 @@ return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. return; @@ -276,22 +254,18 @@ }; struct BlockBasedTableBuilder::Rep { - const ImmutableCFOptions ioptions; + const ImmutableOptions ioptions; const MutableCFOptions moptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFileWriter* file; - uint64_t offset = 0; - Status status; + std::atomic offset; size_t alignment; BlockBuilder data_block; - // Buffers uncompressed data blocks and keys to replay later. Needed when + // Buffers uncompressed data blocks to replay later. Needed when // compression dictionary is enabled so we can finalize the dictionary before // compressing any data blocks. - // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data - // blocks as it's redundant, but it's easier to implement for now. 
- std::vector>> - data_block_and_keys_buffers; + std::vector data_block_buffers; BlockBuilder range_del_block; InternalKeySliceTransform internal_prefix_transform; @@ -299,12 +273,18 @@ PartitionedIndexBuilder* p_index_builder_ = nullptr; std::string last_key; + const Slice* first_key_in_next_block = nullptr; CompressionType compression_type; uint64_t sample_for_compression; + std::atomic compressible_input_data_bytes; + std::atomic uncompressible_input_data_bytes; + std::atomic sampled_input_data_bytes; + std::atomic sampled_output_slow_data_bytes; + std::atomic sampled_output_fast_data_bytes; CompressionOptions compression_opts; std::unique_ptr compression_dict; - CompressionContext compression_ctx; - std::unique_ptr verify_ctx; + std::vector> compression_ctxs; + std::vector> verify_ctxs; std::unique_ptr verify_dict; size_t data_begin_offset = 0; @@ -335,77 +315,149 @@ kClosed, }; State state; - + // `kBuffered` state is allowed only as long as the buffering of uncompressed + // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`. 
+ uint64_t buffer_limit; + std::unique_ptr + compression_dict_buffer_cache_res_mgr; const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; - char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; - size_t compressed_cache_key_prefix_size; + OffsetableCacheKey base_cache_key; + const TableFileCreationReason reason; BlockHandle pending_handle; // Handle to add to index block std::string compressed_output; std::unique_ptr flush_block_policy; - int level_at_creation; - uint32_t column_family_id; - const std::string& column_family_name; - uint64_t creation_time = 0; - uint64_t oldest_key_time = 0; - const uint64_t target_file_size; - uint64_t file_creation_time = 0; std::vector> table_properties_collectors; - Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, - const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& icomparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t _column_family_id, WritableFileWriter* f, - const CompressionType _compression_type, - const uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, const bool skip_filters, - const int _level_at_creation, const std::string& _column_family_name, - const uint64_t _creation_time, const uint64_t _oldest_key_time, - const uint64_t _target_file_size, const uint64_t _file_creation_time) - : ioptions(_ioptions), - moptions(_moptions), + std::unique_ptr pc_rep; + + uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } + void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } + + bool IsParallelCompressionEnabled() const { + return compression_opts.parallel_threads > 1; + } + + Status GetStatus() { + // We need to make modifications of status visible when status_ok is set + // to false, and this is ensured by status_mutex, so no special memory + // order for status_ok is required. 
+ if (status_ok.load(std::memory_order_relaxed)) { + return Status::OK(); + } else { + return CopyStatus(); + } + } + + Status CopyStatus() { + std::lock_guard lock(status_mutex); + return status; + } + + IOStatus GetIOStatus() { + // We need to make modifications of io_status visible when status_ok is set + // to false, and this is ensured by io_status_mutex, so no special memory + // order for io_status_ok is required. + if (io_status_ok.load(std::memory_order_relaxed)) { + return IOStatus::OK(); + } else { + return CopyIOStatus(); + } + } + + IOStatus CopyIOStatus() { + std::lock_guard lock(io_status_mutex); + return io_status; + } + + // Never erase an existing status that is not OK. + void SetStatus(Status s) { + if (!s.ok() && status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard lock(status_mutex); + status = s; + status_ok.store(false, std::memory_order_relaxed); + } + } + + // Never erase an existing I/O status that is not OK. + void SetIOStatus(IOStatus ios) { + if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard lock(io_status_mutex); + io_status = ios; + io_status_ok.store(false, std::memory_order_relaxed); + } + } + + Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, + WritableFileWriter* f) + : ioptions(tbo.ioptions), + moptions(tbo.moptions), table_options(table_opt), - internal_comparator(icomparator), + internal_comparator(tbo.internal_comparator), file(f), + offset(0), alignment(table_options.block_align - ? std::min(table_options.block_size, kDefaultPageSize) + ? 
std::min(static_cast(table_options.block_size), + kDefaultPageSize) : 0), data_block(table_options.block_restart_interval, table_options.use_delta_encoding, false /* use_value_delta_encoding */, - icomparator.user_comparator() + tbo.internal_comparator.user_comparator() ->CanKeysWithDifferentByteContentsBeEqual() ? BlockBasedTableOptions::kDataBlockBinarySearch : table_options.data_block_index_type, table_options.data_block_hash_table_util_ratio), range_del_block(1 /* block_restart_interval */), - internal_prefix_transform(_moptions.prefix_extractor.get()), - compression_type(_compression_type), - sample_for_compression(_sample_for_compression), - compression_opts(_compression_opts), + internal_prefix_transform(tbo.moptions.prefix_extractor.get()), + compression_type(tbo.compression_type), + sample_for_compression(tbo.moptions.sample_for_compression), + compressible_input_data_bytes(0), + uncompressible_input_data_bytes(0), + sampled_input_data_bytes(0), + sampled_output_slow_data_bytes(0), + sampled_output_fast_data_bytes(0), + compression_opts(tbo.compression_opts), compression_dict(), - compression_ctx(_compression_type), + compression_ctxs(tbo.compression_opts.parallel_threads), + verify_ctxs(tbo.compression_opts.parallel_threads), verify_dict(), - state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered - : State::kUnbuffered), + state((tbo.compression_opts.max_dict_bytes > 0) ? 
State::kBuffered + : State::kUnbuffered), use_delta_encoding_for_index_values(table_opt.format_version >= 4 && !table_opt.block_align), - compressed_cache_key_prefix_size(0), + reason(tbo.reason), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), - level_at_creation(_level_at_creation), - column_family_id(_column_family_id), - column_family_name(_column_family_name), - creation_time(_creation_time), - oldest_key_time(_oldest_key_time), - target_file_size(_target_file_size), - file_creation_time(_file_creation_time) { + status_ok(true), + io_status_ok(true) { + if (tbo.target_file_size == 0) { + buffer_limit = compression_opts.max_dict_buffer_bytes; + } else if (compression_opts.max_dict_buffer_bytes == 0) { + buffer_limit = tbo.target_file_size; + } else { + buffer_limit = std::min(tbo.target_file_size, + compression_opts.max_dict_buffer_bytes); + } + if (table_options.no_block_cache || table_options.block_cache == nullptr) { + compression_dict_buffer_cache_res_mgr.reset(nullptr); + } else { + compression_dict_buffer_cache_res_mgr.reset( + new CacheReservationManager(table_options.block_cache)); + } + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + compression_ctxs[i].reset(new CompressionContext(compression_type)); + } if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( @@ -418,57 +470,407 @@ &this->internal_prefix_transform, use_delta_encoding_for_index_values, table_options)); } - if (skip_filters) { - filter_builder = nullptr; + if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { + // Apply optimize_filters_for_hits setting here when applicable by + // skipping filter generation + filter_builder.reset(); + } else if (tbo.skip_filters) { + // For SstFileWriter skip_filters + filter_builder.reset(); + } else if (!table_options.filter_policy) { + // Null filter_policy -> no filter 
+ filter_builder.reset(); } else { - FilterBuildingContext context(table_options); - context.column_family_name = column_family_name; - context.compaction_style = ioptions.compaction_style; - context.level_at_creation = level_at_creation; - context.info_log = ioptions.info_log; + FilterBuildingContext filter_context(table_options); + + filter_context.info_log = ioptions.logger; + filter_context.column_family_name = tbo.column_family_name; + filter_context.reason = reason; + + // Only populate other fields if known to be in LSM rather than + // generating external SST file + if (reason != TableFileCreationReason::kMisc) { + filter_context.compaction_style = ioptions.compaction_style; + filter_context.num_levels = ioptions.num_levels; + filter_context.level_at_creation = tbo.level_at_creation; + filter_context.is_bottommost = tbo.is_bottommost; + assert(filter_context.level_at_creation < filter_context.num_levels); + } + filter_builder.reset(CreateFilterBlockBuilder( - ioptions, moptions, context, use_delta_encoding_for_index_values, - p_index_builder_)); + ioptions, moptions, filter_context, + use_delta_encoding_for_index_values, p_index_builder_)); } - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + assert(tbo.int_tbl_prop_collector_factories); + for (auto& factory : *tbo.int_tbl_prop_collector_factories) { + assert(factory); + table_properties_collectors.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + factory->CreateIntTblPropCollector(tbo.column_family_id, + tbo.level_at_creation)); } table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( table_options.index_type, table_options.whole_key_filtering, - _moptions.prefix_extractor != nullptr)); + moptions.prefix_extractor != nullptr)); + const Comparator* ucmp = tbo.internal_comparator.user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + table_properties_collectors.emplace_back( + new 
TimestampTablePropertiesCollector(ucmp)); + } if (table_options.verify_compression) { - verify_ctx.reset(new UncompressionContext(UncompressionContext::NoCache(), - compression_type)); + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + verify_ctxs[i].reset(new UncompressionContext(compression_type)); + } + } + + // These are only needed for populating table properties + props.column_family_id = tbo.column_family_id; + props.column_family_name = tbo.column_family_name; + props.creation_time = tbo.creation_time; + props.oldest_key_time = tbo.oldest_key_time; + props.file_creation_time = tbo.file_creation_time; + props.orig_file_number = tbo.cur_file_num; + props.db_id = tbo.db_id; + props.db_session_id = tbo.db_session_id; + props.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); } } Rep(const Rep&) = delete; Rep& operator=(const Rep&) = delete; - ~Rep() {} + private: + // Synchronize status & io_status accesses across threads from main thread, + // compression thread and write thread in parallel compression. 
+ std::mutex status_mutex; + std::atomic status_ok; + Status status; + std::mutex io_status_mutex; + std::atomic io_status_ok; + IOStatus io_status; +}; + +struct BlockBasedTableBuilder::ParallelCompressionRep { + // Keys is a wrapper of vector of strings avoiding + // releasing string memories during vector clear() + // in order to save memory allocation overhead + class Keys { + public: + Keys() : keys_(kKeysInitSize), size_(0) {} + void PushBack(const Slice& key) { + if (size_ == keys_.size()) { + keys_.emplace_back(key.data(), key.size()); + } else { + keys_[size_].assign(key.data(), key.size()); + } + size_++; + } + void SwapAssign(std::vector& keys) { + size_ = keys.size(); + std::swap(keys_, keys); + } + void Clear() { size_ = 0; } + size_t Size() { return size_; } + std::string& Back() { return keys_[size_ - 1]; } + std::string& operator[](size_t idx) { + assert(idx < size_); + return keys_[idx]; + } + + private: + const size_t kKeysInitSize = 32; + std::vector keys_; + size_t size_; + }; + std::unique_ptr curr_block_keys; + + class BlockRepSlot; + + // BlockRep instances are fetched from and recycled to + // block_rep_pool during parallel compression. + struct BlockRep { + Slice contents; + Slice compressed_contents; + std::unique_ptr data; + std::unique_ptr compressed_data; + CompressionType compression_type; + std::unique_ptr first_key_in_next_block; + std::unique_ptr keys; + std::unique_ptr slot; + Status status; + }; + // Use a vector of BlockRep as a buffer for a determined number + // of BlockRep structures. All data referenced by pointers in + // BlockRep will be freed when this vector is destructed. + using BlockRepBuffer = std::vector; + BlockRepBuffer block_rep_buf; + // Use a thread-safe queue for concurrent access from block + // building thread and writer thread. + using BlockRepPool = WorkQueue; + BlockRepPool block_rep_pool; + + // Use BlockRepSlot to keep block order in write thread. 
+ // slot_ will pass references to BlockRep + class BlockRepSlot { + public: + BlockRepSlot() : slot_(1) {} + template + void Fill(T&& rep) { + slot_.push(std::forward(rep)); + }; + void Take(BlockRep*& rep) { slot_.pop(rep); } + + private: + // slot_ will pass references to BlockRep in block_rep_buf, + // and those references are always valid before the destruction of + // block_rep_buf. + WorkQueue slot_; + }; + + // Compression queue will pass references to BlockRep in block_rep_buf, + // and those references are always valid before the destruction of + // block_rep_buf. + using CompressQueue = WorkQueue; + CompressQueue compress_queue; + std::vector compress_thread_pool; + + // Write queue will pass references to BlockRep::slot in block_rep_buf, + // and those references are always valid before the corresponding + // BlockRep::slot is destructed, which is before the destruction of + // block_rep_buf. + using WriteQueue = WorkQueue; + WriteQueue write_queue; + std::unique_ptr write_thread; + + // Estimate output file size when parallel compression is enabled. This is + // necessary because compression & flush are no longer synchronized, + // and BlockBasedTableBuilder::FileSize() is no longer accurate. + // memory_order_relaxed suffices because accurate statistics is not required. 
+ class FileSizeEstimator { + public: + explicit FileSizeEstimator() + : raw_bytes_compressed(0), + raw_bytes_curr_block(0), + raw_bytes_curr_block_set(false), + raw_bytes_inflight(0), + blocks_inflight(0), + curr_compression_ratio(0), + estimated_file_size(0) {} + + // Estimate file size when a block is about to be emitted to + // compression thread + void EmitBlock(uint64_t raw_block_size, uint64_t curr_file_size) { + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_add(raw_block_size, + std::memory_order_relaxed) + + raw_block_size; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + } + + // Estimate file size when a block is already reaped from + // compression thread + void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) { + assert(raw_bytes_curr_block_set); + + uint64_t new_raw_bytes_compressed = + raw_bytes_compressed + raw_bytes_curr_block; + assert(new_raw_bytes_compressed > 0); + + curr_compression_ratio.store( + (curr_compression_ratio.load(std::memory_order_relaxed) * + raw_bytes_compressed + + compressed_block_size) / + static_cast(new_raw_bytes_compressed), + std::memory_order_relaxed); + raw_bytes_compressed = new_raw_bytes_compressed; + + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_sub(raw_bytes_curr_block, + std::memory_order_relaxed) - + raw_bytes_curr_block; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + + raw_bytes_curr_block_set = 
false; + } + + void SetEstimatedFileSize(uint64_t size) { + estimated_file_size.store(size, std::memory_order_relaxed); + } + + uint64_t GetEstimatedFileSize() { + return estimated_file_size.load(std::memory_order_relaxed); + } + + void SetCurrBlockRawSize(uint64_t size) { + raw_bytes_curr_block = size; + raw_bytes_curr_block_set = true; + } + + private: + // Raw bytes compressed so far. + uint64_t raw_bytes_compressed; + // Size of current block being appended. + uint64_t raw_bytes_curr_block; + // Whether raw_bytes_curr_block has been set for next + // ReapBlock call. + bool raw_bytes_curr_block_set; + // Raw bytes under compression and not appended yet. + std::atomic raw_bytes_inflight; + // Number of blocks under compression and not appended yet. + std::atomic blocks_inflight; + // Current compression ratio, maintained by BGWorkWriteRawBlock. + std::atomic curr_compression_ratio; + // Estimated SST file size. + std::atomic estimated_file_size; + }; + FileSizeEstimator file_size_estimator; + + // Facilities used for waiting first block completion. Need to Wait for + // the completion of first block compression and flush to get a non-zero + // compression ratio. 
+ std::atomic first_block_processed; + std::condition_variable first_block_cond; + std::mutex first_block_mutex; + + explicit ParallelCompressionRep(uint32_t parallel_threads) + : curr_block_keys(new Keys()), + block_rep_buf(parallel_threads), + block_rep_pool(parallel_threads), + compress_queue(parallel_threads), + write_queue(parallel_threads), + first_block_processed(false) { + for (uint32_t i = 0; i < parallel_threads; i++) { + block_rep_buf[i].contents = Slice(); + block_rep_buf[i].compressed_contents = Slice(); + block_rep_buf[i].data.reset(new std::string()); + block_rep_buf[i].compressed_data.reset(new std::string()); + block_rep_buf[i].compression_type = CompressionType(); + block_rep_buf[i].first_key_in_next_block.reset(new std::string()); + block_rep_buf[i].keys.reset(new Keys()); + block_rep_buf[i].slot.reset(new BlockRepSlot()); + block_rep_buf[i].status = Status::OK(); + block_rep_pool.push(&block_rep_buf[i]); + } + } + + ~ParallelCompressionRep() { block_rep_pool.finish(); } + + // Make a block prepared to be emitted to compression thread + // Used in non-buffered mode + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + BlockBuilder* data_block) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + data_block->SwapAndReset(*(block_rep->data)); + block_rep->contents = *(block_rep->data); + std::swap(block_rep->keys, curr_block_keys); + curr_block_keys->Clear(); + return block_rep; + } + + // Used in EnterUnbuffered + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + std::string* data_block, + std::vector* keys) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + std::swap(*(block_rep->data), *data_block); + block_rep->contents = *(block_rep->data); + block_rep->keys->SwapAssign(*keys); + return block_rep; + } 
+ + // Emit a block to compression thread + void EmitBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + assert(block_rep->status.ok()); + if (!write_queue.push(block_rep->slot.get())) { + return; + } + if (!compress_queue.push(block_rep)) { + return; + } + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::unique_lock lock(first_block_mutex); + first_block_cond.wait(lock, [this] { + return first_block_processed.load(std::memory_order_relaxed); + }); + } + } + + // Reap a block from compression thread + void ReapBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + block_rep->compressed_data->clear(); + block_rep_pool.push(block_rep); + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::lock_guard lock(first_block_mutex); + first_block_processed.store(true, std::memory_order_relaxed); + first_block_cond.notify_one(); + } + } + + private: + BlockRep* PrepareBlockInternal(CompressionType compression_type, + const Slice* first_key_in_next_block) { + BlockRep* block_rep = nullptr; + block_rep_pool.pop(block_rep); + assert(block_rep != nullptr); + + assert(block_rep->data); + + block_rep->compression_type = compression_type; + + if (first_key_in_next_block == nullptr) { + block_rep->first_key_in_next_block.reset(nullptr); + } else { + block_rep->first_key_in_next_block->assign( + first_key_in_next_block->data(), first_key_in_next_block->size()); + } + + return block_rep; + } }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int 
level_at_creation, - const uint64_t creation_time, const uint64_t oldest_key_time, - const uint64_t target_file_size, const uint64_t file_creation_time) { + const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo, + WritableFileWriter* file) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { ROCKS_LOG_WARN( - ioptions.info_log, + tbo.ioptions.logger, "Silently converting format_version to 1 because checksum is " "non-default"); // silently convert format_version to 1 to keep consistent with current @@ -476,21 +878,25 @@ sanitized_table_options.format_version = 1; } - rep_ = new Rep(ioptions, moptions, sanitized_table_options, - internal_comparator, int_tbl_prop_collector_factories, - column_family_id, file, compression_type, - sample_for_compression, compression_opts, skip_filters, - level_at_creation, column_family_name, creation_time, - oldest_key_time, target_file_size, file_creation_time); + rep_ = new Rep(sanitized_table_options, tbo, file); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); } - if (table_options.block_cache_compressed.get() != nullptr) { - BlockBasedTable::GenerateCachePrefix( - table_options.block_cache_compressed.get(), file->writable_file(), - &rep_->compressed_cache_key_prefix[0], - &rep_->compressed_cache_key_prefix_size); + + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", + const_cast(&rep_->props)); + + // Extremely large files use atypical cache key encoding, and we don't + // know ahead of time how big the file will be. But assuming it's less + // than 4TB, we will correctly predict the cache keys. 
+ BlockBasedTable::SetupBaseCacheKey( + &rep_->props, tbo.db_session_id, tbo.cur_file_num, + BlockBasedTable::kMaxFileSizeStandardEncoding, &rep_->base_cache_key); + + if (rep_->IsParallelCompressionEnabled()) { + StartParallelCompression(); } } @@ -510,16 +916,33 @@ if (r->props.num_entries > r->props.num_range_deletions) { assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); } -#endif // NDEBUG +#endif // !NDEBUG auto should_flush = r->flush_block_policy->Update(key, value); if (should_flush) { assert(!r->data_block.empty()); + r->first_key_in_next_block = &key; Flush(); + if (r->state == Rep::State::kBuffered) { + bool exceeds_buffer_limit = + (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit); + bool exceeds_global_block_cache_limit = false; + + // Increase cache reservation for the last buffered data block + // only if the block is not going to be unbuffered immediately + // and there exists a cache reservation manager + if (!exceeds_buffer_limit && + r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = + r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>( + r->data_begin_offset); + exceeds_global_block_cache_limit = s.IsIncomplete(); + } - if (r->state == Rep::State::kBuffered && - r->data_begin_offset > r->target_file_size) { - EnterUnbuffered(); + if (exceeds_buffer_limit || exceeds_global_block_cache_limit) { + EnterUnbuffered(); + } } // Add item to index block. @@ -531,38 +954,50 @@ // entries in the first block and < all entries in subsequent // blocks. 
if (ok() && r->state == Rep::State::kUnbuffered) { - r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle); + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->Clear(); + } else { + r->index_builder->AddIndexEntry(&r->last_key, &key, + r->pending_handle); + } } } // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. - if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { - size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + if (r->state == Rep::State::kUnbuffered) { + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->PushBack(key); + } else { + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + } } + r->data_block.AddWithLastKey(key, value, r->last_key); r->last_key.assign(key.data(), key.size()); - r->data_block.Add(key, value); if (r->state == Rep::State::kBuffered) { - // Buffer keys to be replayed during `Finish()` once compression - // dictionary has been finalized. - if (r->data_block_and_keys_buffers.empty() || should_flush) { - r->data_block_and_keys_buffers.emplace_back(); - } - r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + // Buffered keys will be replayed from data_block_buffers during + // `Finish()` once compression dictionary has been finalized. 
} else { - r->index_builder->OnKeyAdded(key); + if (!r->IsParallelCompressionEnabled()) { + r->index_builder->OnKeyAdded(key); + } } - NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else if (value_type == kTypeRangeDeletion) { r->range_del_block.Add(key, value); - NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else { assert(false); } @@ -585,44 +1020,108 @@ assert(rep_->state != Rep::State::kClosed); if (!ok()) return; if (r->data_block.empty()) return; - WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); + if (r->IsParallelCompressionEnabled() && + r->state == Rep::State::kUnbuffered) { + r->data_block.Finish(); + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, r->first_key_in_next_block, &(r->data_block)); + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData); + } } void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle, - bool is_data_block) { - WriteBlock(block->Finish(), handle, is_data_block); - block->Reset(); + BlockType block_type) { + block->Finish(); + std::string raw_block_contents; + raw_block_contents.reserve(rep_->table_options.block_size); + block->SwapAndReset(raw_block_contents); + if (rep_->state == Rep::State::kBuffered) { + assert(block_type == BlockType::kData); + 
rep_->data_block_buffers.emplace_back(std::move(raw_block_contents)); + rep_->data_begin_offset += rep_->data_block_buffers.back().size(); + return; + } + WriteBlock(raw_block_contents, handle, block_type); } void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, BlockHandle* handle, - bool is_data_block) { + BlockType block_type) { + Rep* r = rep_; + assert(r->state == Rep::State::kUnbuffered); + Slice block_contents; + CompressionType type; + Status compress_status; + bool is_data_block = block_type == BlockType::kData; + CompressAndVerifyBlock(raw_block_contents, is_data_block, + *(r->compression_ctxs[0]), r->verify_ctxs[0].get(), + &(r->compressed_output), &(block_contents), &type, + &compress_status); + r->SetStatus(compress_status); + if (!ok()) { + return; + } + + WriteRawBlock(block_contents, type, handle, block_type, &raw_block_contents); + r->compressed_output.clear(); + if (is_data_block) { + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->get_offset()); + } + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + } +} + +void BlockBasedTableBuilder::BGWorkCompression( + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx) { + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (rep_->pc_rep->compress_queue.pop(block_rep)) { + assert(block_rep != nullptr); + CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/ + compression_ctx, verify_ctx, + block_rep->compressed_data.get(), + &block_rep->compressed_contents, + &(block_rep->compression_type), &block_rep->status); + block_rep->slot->Fill(block_rep); + } +} + +void BlockBasedTableBuilder::CompressAndVerifyBlock( + const Slice& raw_block_contents, bool is_data_block, + const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, + std::string* compressed_output, Slice* block_contents, + CompressionType* type, Status* out_status) { // File format contains a sequence of blocks where 
each block has: // block_data: uint8[n] // type: uint8 // crc: uint32 - assert(ok()); Rep* r = rep_; + bool is_status_ok = ok(); + if (!r->IsParallelCompressionEnabled()) { + assert(is_status_ok); + } - auto type = r->compression_type; + *type = r->compression_type; uint64_t sample_for_compression = r->sample_for_compression; - Slice block_contents; bool abort_compression = false; StopWatchNano timer( - r->ioptions.env, - ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); - - if (r->state == Rep::State::kBuffered) { - assert(is_data_block); - assert(!r->data_block_and_keys_buffers.empty()); - r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); - r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); - return; - } + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); - if (raw_block_contents.size() < kCompressionSizeLimit) { + if (is_status_ok && raw_block_contents.size() < kCompressionSizeLimit) { + if (is_data_block) { + r->compressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } const CompressionDict* compression_dict; if (!is_data_block || r->compression_dict == nullptr) { compression_dict = &CompressionDict::GetEmptyDict(); @@ -630,17 +1129,27 @@ compression_dict = r->compression_dict.get(); } assert(compression_dict != nullptr); - CompressionInfo compression_info(r->compression_opts, r->compression_ctx, - *compression_dict, type, + CompressionInfo compression_info(r->compression_opts, compression_ctx, + *compression_dict, *type, sample_for_compression); std::string sampled_output_fast; std::string sampled_output_slow; - block_contents = CompressBlock( - raw_block_contents, compression_info, &type, + *block_contents = CompressBlock( + raw_block_contents, compression_info, type, r->table_options.format_version, is_data_block /* do_sample */, - &r->compressed_output, &sampled_output_fast, &sampled_output_slow); + 
compressed_output, &sampled_output_fast, &sampled_output_slow); + if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { + // Currently compression sampling is only enabled for data block. + assert(is_data_block); + r->sampled_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(), + std::memory_order_relaxed); + r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(), + std::memory_order_relaxed); + } // notify collectors on block add NotifyCollectTableCollectorsOnBlockAdd( r->table_properties_collectors, raw_block_contents.size(), @@ -649,7 +1158,7 @@ // Some of the compression algorithms are known to be unreliable. If // the verify_compression flag is set then try to de-compress the // compressed data and compare to the input. - if (type != kNoCompression && r->table_options.verify_compression) { + if (*type != kNoCompression && r->table_options.verify_compression) { // Retrieve the uncompressed contents into a new buffer const UncompressionDict* verify_dict; if (!is_data_block || r->verify_dict == nullptr) { @@ -659,10 +1168,10 @@ } assert(verify_dict != nullptr); BlockContents contents; - UncompressionInfo uncompression_info(*r->verify_ctx, *verify_dict, + UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, r->compression_type); Status stat = UncompressBlockContentsForCompressionType( - uncompression_info, block_contents.data(), block_contents.size(), + uncompression_info, block_contents->data(), block_contents->size(), &contents, r->table_options.format_version, r->ioptions); if (stat.ok()) { @@ -670,140 +1179,250 @@ if (!compressed_ok) { // The result of the compression was invalid. abort. 
abort_compression = true; - ROCKS_LOG_ERROR(r->ioptions.info_log, + ROCKS_LOG_ERROR(r->ioptions.logger, "Decompressed block did not match raw block"); - r->status = + *out_status = Status::Corruption("Decompressed block did not match raw block"); } } else { // Decompression reported an error. abort. - r->status = Status::Corruption("Could not decompress"); + *out_status = Status::Corruption(std::string("Could not decompress: ") + + stat.getState()); abort_compression = true; } } } else { // Block is too big to be compressed. + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } abort_compression = true; } + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, + std::memory_order_relaxed); + } // Abort compression if the block is too big, or did not pass // verification. if (abort_compression) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); - type = kNoCompression; - block_contents = raw_block_contents; - } else if (type != kNoCompression) { - if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { - RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); + *type = kNoCompression; + *block_contents = raw_block_contents; + } else if (*type != kNoCompression) { + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED, raw_block_contents.size()); - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); - } else if (type != r->compression_type) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); - } - - WriteRawBlock(block_contents, type, handle, is_data_block); - r->compressed_output.clear(); - if 
(is_data_block) { - if (r->filter_builder != nullptr) { - r->filter_builder->StartBlock(r->offset); - } - r->props.data_size = r->offset; - ++r->props.num_data_blocks; + RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED); + } else if (*type != r->compression_type) { + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); } } void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle, - bool is_data_block) { + BlockType block_type, + const Slice* raw_block_contents, + bool is_top_level_filter_block) { Rep* r = rep_; - StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); - handle->set_offset(r->offset); + bool is_data_block = block_type == BlockType::kData; + Status s = Status::OK(); + IOStatus io_s = IOStatus::OK(); + StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->get_offset()); handle->set_size(block_contents.size()); - assert(r->status.ok()); - r->status = r->file->Append(block_contents); - if (r->status.ok()) { - char trailer[kBlockTrailerSize]; + assert(status().ok()); + assert(io_status().ok()); + io_s = r->file->Append(block_contents); + if (io_s.ok()) { + std::array trailer; trailer[0] = type; - char* trailer_without_type = trailer + 1; - switch (r->table_options.checksum) { - case kNoChecksum: - EncodeFixed32(trailer_without_type, 0); - break; - case kCRC32c: { - auto crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); - break; - } - case kxxHash: { - XXH32_state_t* const state = XXH32_createState(); - XXH32_reset(state, 0); - XXH32_update(state, block_contents.data(), - static_cast(block_contents.size())); - XXH32_update(state, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, XXH32_digest(state)); - XXH32_freeState(state); - break; - } - case 
kxxHash64: { - XXH64_state_t* const state = XXH64_createState(); - XXH64_reset(state, 0); - XXH64_update(state, block_contents.data(), - static_cast(block_contents.size())); - XXH64_update(state, trailer, 1); // Extend to cover block type - EncodeFixed32( - trailer_without_type, - static_cast(XXH64_digest(state) & // lower 32 bits - uint64_t{0xffffffff})); - XXH64_freeState(state); - break; - } - } + uint32_t checksum = ComputeBuiltinChecksumWithLastByte( + r->table_options.checksum, block_contents.data(), block_contents.size(), + /*last_byte*/ type); + EncodeFixed32(trailer.data() + 1, checksum); - assert(r->status.ok()); + assert(io_s.ok()); TEST_SYNC_POINT_CALLBACK( "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", - static_cast(trailer)); - r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); - if (r->status.ok()) { - r->status = InsertBlockInCache(block_contents, type, handle); + trailer.data()); + io_s = r->file->Append(Slice(trailer.data(), trailer.size())); + if (io_s.ok()) { + assert(s.ok()); + bool warm_cache; + switch (r->table_options.prepopulate_block_cache) { + case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly: + warm_cache = (r->reason == TableFileCreationReason::kFlush); + break; + case BlockBasedTableOptions::PrepopulateBlockCache::kDisable: + warm_cache = false; + break; + default: + // missing case + assert(false); + warm_cache = false; + } + if (warm_cache) { + if (type == kNoCompression) { + s = InsertBlockInCacheHelper(block_contents, handle, block_type, + is_top_level_filter_block); + } else if (raw_block_contents != nullptr) { + s = InsertBlockInCacheHelper(*raw_block_contents, handle, block_type, + is_top_level_filter_block); + } + if (!s.ok()) { + r->SetStatus(s); + } + } + // TODO:: Should InsertBlockInCompressedCache take into account error from + // InsertBlockInCache or ignore and overwrite it. 
+ s = InsertBlockInCompressedCache(block_contents, type, handle); + if (!s.ok()) { + r->SetStatus(s); + } + } else { + r->SetIOStatus(io_s); } - if (r->status.ok()) { - r->offset += block_contents.size() + kBlockTrailerSize; + if (s.ok() && io_s.ok()) { + r->set_offset(r->get_offset() + block_contents.size() + + kBlockTrailerSize); if (r->table_options.block_align && is_data_block) { size_t pad_bytes = (r->alignment - ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) & (r->alignment - 1); - r->status = r->file->Pad(pad_bytes); - if (r->status.ok()) { - r->offset += pad_bytes; + io_s = r->file->Pad(pad_bytes); + if (io_s.ok()) { + r->set_offset(r->get_offset() + pad_bytes); + } else { + r->SetIOStatus(io_s); } } + if (r->IsParallelCompressionEnabled()) { + if (is_data_block) { + r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(), + r->get_offset()); + } else { + r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset()); + } + } + } + } else { + r->SetIOStatus(io_s); + } + if (!io_s.ok() && s.ok()) { + r->SetStatus(io_s); + } +} + +void BlockBasedTableBuilder::BGWorkWriteRawBlock() { + Rep* r = rep_; + ParallelCompressionRep::BlockRepSlot* slot = nullptr; + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (r->pc_rep->write_queue.pop(slot)) { + assert(slot != nullptr); + slot->Take(block_rep); + assert(block_rep != nullptr); + if (!block_rep->status.ok()) { + r->SetStatus(block_rep->status); + // Reap block so that blocked Flush() can finish + // if there is one, and Flush() will notice !ok() next time. 
+ block_rep->status = Status::OK(); + r->pc_rep->ReapBlock(block_rep); + continue; + } + + for (size_t i = 0; i < block_rep->keys->Size(); i++) { + auto& key = (*block_rep->keys)[i]; + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + + r->pc_rep->file_size_estimator.SetCurrBlockRawSize(block_rep->data->size()); + WriteRawBlock(block_rep->compressed_contents, block_rep->compression_type, + &r->pending_handle, BlockType::kData, &block_rep->contents); + if (!ok()) { + break; + } + + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->get_offset()); + } + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + + if (block_rep->first_key_in_next_block == nullptr) { + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), nullptr, + r->pending_handle); + } else { + Slice first_key_in_next_block = + Slice(*block_rep->first_key_in_next_block); + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), + &first_key_in_next_block, + r->pending_handle); } + + r->pc_rep->ReapBlock(block_rep); + } +} + +void BlockBasedTableBuilder::StartParallelCompression() { + rep_->pc_rep.reset( + new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); + rep_->pc_rep->compress_thread_pool.reserve( + rep_->compression_opts.parallel_threads); + for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { + rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { + BGWorkCompression(*(rep_->compression_ctxs[i]), + rep_->verify_ctxs[i].get()); + }); + } + rep_->pc_rep->write_thread.reset( + new port::Thread([this] { BGWorkWriteRawBlock(); })); +} + +void BlockBasedTableBuilder::StopParallelCompression() { + rep_->pc_rep->compress_queue.finish(); + for (auto& thread : rep_->pc_rep->compress_thread_pool) { + thread.join(); } + 
rep_->pc_rep->write_queue.finish(); + rep_->pc_rep->write_thread->join(); } -Status BlockBasedTableBuilder::status() const { return rep_->status; } +Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); } -static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { - BlockContents* bc = reinterpret_cast(value); - delete bc; +IOStatus BlockBasedTableBuilder::io_status() const { + return rep_->GetIOStatus(); } +namespace { +// Delete the entry resided in the cache. +template +void DeleteEntryCached(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} +} // namespace + // // Make a copy of the block contents and insert into compressed block cache // -Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, - const CompressionType type, - const BlockHandle* handle) { +Status BlockBasedTableBuilder::InsertBlockInCompressedCache( + const Slice& block_contents, const CompressionType type, + const BlockHandle* handle) { Rep* r = rep_; Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); - + Status s; if (type != kNoCompression && block_cache_compressed != nullptr) { size_t size = block_contents.size(); @@ -818,39 +1437,133 @@ block_contents_to_cache->is_raw_block = true; #endif // NDEBUG - // make cache key by appending the file offset to the cache prefix id - char* end = EncodeVarint64( - r->compressed_cache_key_prefix + r->compressed_cache_key_prefix_size, - handle->offset()); - Slice key(r->compressed_cache_key_prefix, - static_cast(end - r->compressed_cache_key_prefix)); - - // Insert into compressed block cache. 
- block_cache_compressed->Insert( - key, block_contents_to_cache, - block_contents_to_cache->ApproximateMemoryUsage(), - &DeleteCachedBlockContents); + CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); + s = block_cache_compressed->Insert( + key.AsSlice(), block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteEntryCached); + if (s.ok()) { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } // Invalidate OS cache. - r->file->InvalidateCache(static_cast(r->offset), size); + r->file->InvalidateCache(static_cast(r->get_offset()), size) + .PermitUncheckedError(); + } + return s; +} + +Status BlockBasedTableBuilder::InsertBlockInCacheHelper( + const Slice& block_contents, const BlockHandle* handle, + BlockType block_type, bool is_top_level_filter_block) { + Status s; + if (block_type == BlockType::kData || block_type == BlockType::kIndex) { + s = InsertBlockInCache(block_contents, handle, block_type); + } else if (block_type == BlockType::kFilter) { + if (rep_->filter_builder->IsBlockBased()) { + // for block-based filter which is deprecated. + s = InsertBlockInCache(block_contents, handle, block_type); + } else if (is_top_level_filter_block) { + // for top level filter block in partitioned filter. + s = InsertBlockInCache(block_contents, handle, block_type); + } else { + // for second level partitioned filters and full filters. 
+ s = InsertBlockInCache(block_contents, handle, + block_type); + } + } else if (block_type == BlockType::kCompressionDictionary) { + s = InsertBlockInCache(block_contents, handle, + block_type); + } + return s; +} + +template +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const BlockHandle* handle, + BlockType block_type) { + // Uncompressed regular block cache + Cache* block_cache = rep_->table_options.block_cache.get(); + Status s; + if (block_cache != nullptr) { + size_t size = block_contents.size(); + auto buf = AllocateBlock(size, block_cache->memory_allocator()); + memcpy(buf.get(), block_contents.data(), size); + BlockContents results(std::move(buf), size); + + CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); + + const size_t read_amp_bytes_per_bit = + rep_->table_options.read_amp_bytes_per_bit; + + // TODO akanksha:: Dedup below code by calling + // BlockBasedTable::PutDataBlockToCache. + std::unique_ptr block_holder( + BlocklikeTraits::Create( + std::move(results), read_amp_bytes_per_bit, + rep_->ioptions.statistics.get(), + false /*rep_->blocks_definitely_zstd_compressed*/, + rep_->table_options.filter_policy.get())); + + assert(block_holder->own_bytes()); + size_t charge = block_holder->ApproximateMemoryUsage(); + s = block_cache->Insert( + key.AsSlice(), block_holder.get(), + BlocklikeTraits::GetCacheItemHelper(block_type), charge, + nullptr, Cache::Priority::LOW); + + if (s.ok()) { + // Release ownership of block_holder. 
+ block_holder.release(); + BlockBasedTable::UpdateCacheInsertionMetrics( + block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(), + rep_->ioptions.stats); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES); + } } - return Status::OK(); + return s; } void BlockBasedTableBuilder::WriteFilterBlock( MetaIndexBuilder* meta_index_builder) { BlockHandle filter_block_handle; - bool empty_filter_block = (rep_->filter_builder == nullptr || - rep_->filter_builder->NumAdded() == 0); + bool empty_filter_block = + (rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty()); if (ok() && !empty_filter_block) { + rep_->props.num_filter_entries += + rep_->filter_builder->EstimateEntriesAdded(); Status s = Status::Incomplete(); while (ok() && s.IsIncomplete()) { + // filter_data is used to store the transferred filter data payload from + // FilterBlockBuilder and deallocate the payload by going out of scope. + // Otherwise, the payload will unnecessarily remain until + // BlockBasedTableBuilder is deallocated. + // + // See FilterBlockBuilder::Finish() for more on the difference in + // transferred filter data payload among different FilterBlockBuilder + // subtypes. + std::unique_ptr filter_data; Slice filter_content = - rep_->filter_builder->Finish(filter_block_handle, &s); + rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data); assert(s.ok() || s.IsIncomplete()); rep_->props.filter_size += filter_content.size(); - WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); + + // TODO: Refactor code so that BlockType can determine both the C++ type + // of a block cache entry (TBlocklike) and the CacheEntryRole while + // inserting blocks in cache. 
+ bool top_level_filter_block = false; + if (s.ok() && rep_->table_options.partition_filters && + !rep_->filter_builder->IsBlockBased()) { + top_level_filter_block = true; + } + WriteRawBlock(filter_content, kNoCompression, &filter_block_handle, + BlockType::kFilter, nullptr /*raw_contents*/, + top_level_filter_block); } + rep_->filter_builder->ResetFilterBitsBuilder(); } if (ok() && !empty_filter_block) { // Add mapping from ".Name" to location @@ -878,12 +1591,12 @@ // HashIndexBuilder which is not multi-partition. assert(index_blocks.meta_blocks.empty()); } else if (ok() && !index_builder_status.ok()) { - rep_->status = index_builder_status; + rep_->SetStatus(index_builder_status); } if (ok()) { for (const auto& item : index_blocks.meta_blocks) { BlockHandle block_handle; - WriteBlock(item.second, &block_handle, false /* is_data_block */); + WriteBlock(item.second, &block_handle, BlockType::kIndex); if (!ok()) { break; } @@ -892,27 +1605,39 @@ } if (ok()) { if (rep_->table_options.enable_index_compression) { - WriteBlock(index_blocks.index_block_contents, index_block_handle, false); + WriteBlock(index_blocks.index_block_contents, index_block_handle, + BlockType::kIndex); } else { WriteRawBlock(index_blocks.index_block_contents, kNoCompression, - index_block_handle); + index_block_handle, BlockType::kIndex); } } // If there are more index partitions, finish them and write them out - Status s = index_builder_status; - while (ok() && s.IsIncomplete()) { - s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); - if (!s.ok() && !s.IsIncomplete()) { - rep_->status = s; - return; - } - if (rep_->table_options.enable_index_compression) { - WriteBlock(index_blocks.index_block_contents, index_block_handle, false); - } else { - WriteRawBlock(index_blocks.index_block_contents, kNoCompression, - index_block_handle); + if (index_builder_status.IsIncomplete()) { + bool index_building_finished = false; + while (ok() && !index_building_finished) { + Status s = + 
rep_->index_builder->Finish(&index_blocks, *index_block_handle); + if (s.ok()) { + index_building_finished = true; + } else if (s.IsIncomplete()) { + // More partitioned index after this one + assert(!index_building_finished); + } else { + // Error + rep_->SetStatus(s); + return; + } + + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, + BlockType::kIndex); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle, BlockType::kIndex); + } + // The last index_block_handle will be for the partition index block } - // The last index_block_handle will be for the partition index block } } @@ -921,8 +1646,6 @@ BlockHandle properties_block_handle; if (ok()) { PropertyBlockBuilder property_block_builder; - rep_->props.column_family_id = rep_->column_family_id; - rep_->props.column_family_name = rep_->column_family_name; rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr ? rep_->table_options.filter_policy->Name() @@ -942,9 +1665,8 @@ CompressionOptionsToString(rep_->compression_opts); rep_->props.prefix_extractor_name = rep_->moptions.prefix_extractor != nullptr - ? rep_->moptions.prefix_extractor->Name() + ? 
rep_->moptions.prefix_extractor->AsString() : "nullptr"; - std::string property_collectors_names = "["; for (size_t i = 0; i < rep_->ioptions.table_properties_collector_factories.size(); ++i) { @@ -967,20 +1689,41 @@ !rep_->index_builder->seperator_is_key_plus_seq(); rep_->props.index_value_is_delta_encoded = rep_->use_delta_encoding_for_index_values; - rep_->props.creation_time = rep_->creation_time; - rep_->props.oldest_key_time = rep_->oldest_key_time; - rep_->props.file_creation_time = rep_->file_creation_time; + if (rep_->sampled_input_data_bytes > 0) { + rep_->props.slow_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_slow_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + rep_->props.fast_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_fast_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + } else if (rep_->sample_for_compression > 0) { + // We tried to sample but none were found. Assume worst-case (compression + // ratio 1.0) so data is complete and aggregatable. 
+ rep_->props.slow_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + rep_->props.fast_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + } // Add basic properties property_block_builder.AddTableProperty(rep_->props); // Add use collected properties NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, - rep_->ioptions.info_log, + rep_->ioptions.logger, &property_block_builder); - WriteRawBlock(property_block_builder.Finish(), kNoCompression, - &properties_block_handle); + Slice block_data = property_block_builder.Finish(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", &block_data); + WriteRawBlock(block_data, kNoCompression, &properties_block_handle, + BlockType::kProperties); } if (ok()) { #ifndef NDEBUG @@ -995,7 +1738,12 @@ &props_block_size); } #endif // !NDEBUG - meta_index_builder->Add(kPropertiesBlock, properties_block_handle); + + const std::string* properties_block_meta = &kPropertiesBlockName; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:Meta", + &properties_block_meta); + meta_index_builder->Add(*properties_block_meta, properties_block_handle); } } @@ -1006,7 +1754,8 @@ BlockHandle compression_dict_block_handle; if (ok()) { WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression, - &compression_dict_block_handle); + &compression_dict_block_handle, + BlockType::kCompressionDictionary); #ifndef NDEBUG Slice compression_dict = rep_->compression_dict->GetRawDict(); TEST_SYNC_POINT_CALLBACK( @@ -1015,7 +1764,7 @@ #endif // NDEBUG } if (ok()) { - meta_index_builder->Add(kCompressionDictBlock, + meta_index_builder->Add(kCompressionDictBlockName, compression_dict_block_handle); } } @@ -1026,37 +1775,29 @@ if (ok() && !rep_->range_del_block.empty()) { BlockHandle range_del_block_handle; WriteRawBlock(rep_->range_del_block.Finish(), 
kNoCompression, - &range_del_block_handle); - meta_index_builder->Add(kRangeDelBlock, range_del_block_handle); + &range_del_block_handle, BlockType::kRangeDeletion); + meta_index_builder->Add(kRangeDelBlockName, range_del_block_handle); } } void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, BlockHandle& index_block_handle) { Rep* r = rep_; - // No need to write out new footer if we're using default checksum. - // We're writing legacy magic number because we want old versions of RocksDB - // be able to read files generated with new release (just in case if - // somebody wants to roll back after an upgrade) - // TODO(icanadi) at some point in the future, when we're absolutely sure - // nobody will roll back to RocksDB 2.x versions, retire the legacy magic - // number and always write new table files with new magic number - bool legacy = (r->table_options.format_version == 0); // this is guaranteed by BlockBasedTableBuilder's constructor assert(r->table_options.checksum == kCRC32c || r->table_options.format_version != 0); - Footer footer( - legacy ? 
kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber, - r->table_options.format_version); - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(index_block_handle); - footer.set_checksum(r->table_options.checksum); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - assert(r->status.ok()); - r->status = r->file->Append(footer_encoding); - if (r->status.ok()) { - r->offset += footer_encoding.size(); + assert(ok()); + + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version, + r->get_offset(), r->table_options.checksum, + metaindex_block_handle, index_block_handle); + IOStatus ios = r->file->Append(footer.GetSlice()); + if (ios.ok()) { + r->set_offset(r->get_offset() + footer.GetSlice().size()); + } else { + r->SetIOStatus(ios); + r->SetStatus(ios); } } @@ -1067,20 +1808,45 @@ const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0 ? r->compression_opts.zstd_max_train_bytes : r->compression_opts.max_dict_bytes; - Random64 generator{r->creation_time}; + const size_t kNumBlocksBuffered = r->data_block_buffers.size(); + if (kNumBlocksBuffered == 0) { + // The below code is neither safe nor necessary for handling zero data + // blocks. + return; + } + + // Abstract algebra teaches us that a finite cyclic group (such as the + // additive group of integers modulo N) can be generated by a number that is + // coprime with N. Since N is variable (number of buffered data blocks), we + // must then pick a prime number in order to guarantee coprimeness with any N. + // + // One downside of this approach is the spread will be poor when + // `kPrimeGeneratorRemainder` is close to zero or close to + // `kNumBlocksBuffered`. + // + // Picked a random number between one and one trillion and then chose the + // next prime number greater than or equal to it. 
+ const uint64_t kPrimeGenerator = 545055921143ull; + // Can avoid repeated division by just adding the remainder repeatedly. + const size_t kPrimeGeneratorRemainder = static_cast( + kPrimeGenerator % static_cast(kNumBlocksBuffered)); + const size_t kInitSampleIdx = kNumBlocksBuffered / 2; + std::string compression_dict_samples; std::vector compression_dict_sample_lens; - if (!r->data_block_and_keys_buffers.empty()) { - while (compression_dict_samples.size() < kSampleBytes) { - size_t rand_idx = - static_cast( - generator.Uniform(r->data_block_and_keys_buffers.size())); - size_t copy_len = - std::min(kSampleBytes - compression_dict_samples.size(), - r->data_block_and_keys_buffers[rand_idx].first.size()); - compression_dict_samples.append( - r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); - compression_dict_sample_lens.emplace_back(copy_len); + size_t buffer_idx = kInitSampleIdx; + for (size_t i = 0; + i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes; + ++i) { + size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), + r->data_block_buffers[buffer_idx].size()); + compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0, + copy_len); + compression_dict_sample_lens.emplace_back(copy_len); + + buffer_idx += kPrimeGeneratorRemainder; + if (buffer_idx >= kNumBlocksBuffered) { + buffer_idx -= kNumBlocksBuffered; } } @@ -1100,45 +1866,114 @@ dict, r->compression_type == kZSTD || r->compression_type == kZSTDNotFinalCompression)); - for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { - const auto& data_block = r->data_block_and_keys_buffers[i].first; - auto& keys = r->data_block_and_keys_buffers[i].second; + auto get_iterator_for_block = [&r](size_t i) { + auto& data_block = r->data_block_buffers[i]; assert(!data_block.empty()); - assert(!keys.empty()); - for (const auto& key : keys) { - if (r->filter_builder != nullptr) { - size_t ts_sz = - 
r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + Block reader{BlockContents{data_block}}; + DataBlockIter* iter = reader.NewDataIterator( + r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber); + + iter->SeekToFirst(); + assert(iter->Valid()); + return std::unique_ptr(iter); + }; + + std::unique_ptr iter = nullptr, next_block_iter = nullptr; + + for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) { + if (iter == nullptr) { + iter = get_iterator_for_block(i); + assert(iter != nullptr); + }; + + if (i + 1 < r->data_block_buffers.size()) { + next_block_iter = get_iterator_for_block(i + 1); + } + + auto& data_block = r->data_block_buffers[i]; + + if (r->IsParallelCompressionEnabled()) { + Slice first_key_in_next_block; + const Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + if (i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + first_key_in_next_block = next_block_iter->key(); + } else { + first_key_in_next_block_ptr = r->first_key_in_next_block; + } + + std::vector keys; + for (; iter->Valid(); iter->Next()) { + keys.emplace_back(iter->key().ToString()); + } + + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, first_key_in_next_block_ptr, &data_block, &keys); + + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData); + if (ok() && i + 1 < r->data_block_buffers.size()) { + 
assert(next_block_iter != nullptr); + Slice first_key_in_next_block = next_block_iter->key(); + + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + + iter->SeekToLast(); + std::string last_key = iter->key().ToString(); + r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr, + r->pending_handle); } - r->index_builder->OnKeyAdded(key); - } - WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); - if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { - Slice first_key_in_next_block = - r->data_block_and_keys_buffers[i + 1].second.front(); - Slice* first_key_in_next_block_ptr = &first_key_in_next_block; - r->index_builder->AddIndexEntry(&keys.back(), first_key_in_next_block_ptr, - r->pending_handle); } + std::swap(iter, next_block_iter); + } + r->data_block_buffers.clear(); + r->data_begin_offset = 0; + // Release all reserved cache for data block buffers + if (r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>( + r->data_begin_offset); + s.PermitUncheckedError(); } - r->data_block_and_keys_buffers.clear(); } Status BlockBasedTableBuilder::Finish() { Rep* r = rep_; assert(r->state != Rep::State::kClosed); bool empty_data_block = r->data_block.empty(); + r->first_key_in_next_block = nullptr; Flush(); if (r->state == Rep::State::kBuffered) { EnterUnbuffered(); } - // To make sure properties block is able to keep the accurate size of index - // block, we will finish writing all index entries first. 
- if (ok() && !empty_data_block) { - r->index_builder->AddIndexEntry( - &r->last_key, nullptr /* no next data block */, r->pending_handle); + if (r->IsParallelCompressionEnabled()) { + StopParallelCompression(); +#ifndef NDEBUG + for (const auto& br : r->pc_rep->block_rep_buf) { + assert(br.status.ok()); + } +#endif // !NDEBUG + } else { + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } } // Write meta blocks, metaindex block and footer in the following order. @@ -1159,29 +1994,48 @@ if (ok()) { // flush the meta index block WriteRawBlock(meta_index_builder.Finish(), kNoCompression, - &metaindex_block_handle); + &metaindex_block_handle, BlockType::kMetaIndex); } if (ok()) { WriteFooter(metaindex_block_handle, index_block_handle); } - if (r->file != nullptr) { - file_checksum_ = r->file->GetFileChecksum(); - } r->state = Rep::State::kClosed; - return r->status; + r->SetStatus(r->CopyIOStatus()); + Status ret_status = r->CopyStatus(); + assert(!ret_status.ok() || io_status().ok()); + return ret_status; } void BlockBasedTableBuilder::Abandon() { assert(rep_->state != Rep::State::kClosed); + if (rep_->IsParallelCompressionEnabled()) { + StopParallelCompression(); + } rep_->state = Rep::State::kClosed; + rep_->CopyStatus().PermitUncheckedError(); + rep_->CopyIOStatus().PermitUncheckedError(); } uint64_t BlockBasedTableBuilder::NumEntries() const { return rep_->props.num_entries; } +bool BlockBasedTableBuilder::IsEmpty() const { + return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0; +} + uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } +uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { + if (rep_->IsParallelCompressionEnabled()) { + // Use compression ratio so far and inflight raw 
bytes to estimate + // final SST size. + return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize(); + } else { + return FileSize(); + } +} + bool BlockBasedTableBuilder::NeedCompact() const { for (const auto& collector : rep_->table_properties_collectors) { if (collector->NeedCompact()) { @@ -1197,16 +2051,24 @@ for (const auto& prop : collector->GetReadableProperties()) { ret.readable_properties.insert(prop); } - collector->Finish(&ret.user_collected_properties); + collector->Finish(&ret.user_collected_properties).PermitUncheckedError(); } return ret; } +std::string BlockBasedTableBuilder::GetFileChecksum() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { if (rep_->file != nullptr) { return rep_->file->GetFileChecksumFuncName(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,8 @@ #pragma once #include + +#include #include #include #include @@ -19,6 +21,7 @@ #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/status.h" +#include "rocksdb/table.h" #include "table/meta_blocks.h" #include "table/table_builder.h" #include "util/compression.h" @@ -38,20 +41,9 @@ // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). 
- BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int level_at_creation, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, - const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0); + BlockBasedTableBuilder(const BlockBasedTableOptions& table_options, + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file); // No copying allowed BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; @@ -68,6 +60,9 @@ // Return non-ok iff some error has been detected. Status status() const override; + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override; + // Finish building the table. Stops using the file passed to the // constructor after this function returns. // REQUIRES: Finish(), Abandon() have not been called @@ -83,17 +78,24 @@ // Number of calls to Add() so far. uint64_t NumEntries() const override; + bool IsEmpty() const override; + // Size of the file generated so far. If invoked after a successful // Finish() call, returns the size of the final generated file. uint64_t FileSize() const override; + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. 
+ uint64_t EstimatedFileSize() const override; + bool NeedCompact() const override; // Get table properties TableProperties GetTableProperties() const override; // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } + std::string GetFileChecksum() const override; // Get file checksum function name const char* GetFileChecksumFuncName() const override; @@ -106,19 +108,34 @@ // REQUIRES: `rep_->state == kBuffered` void EnterUnbuffered(); - // Call block's Finish() method - // and then write the compressed block contents to file. - void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); + // Call block's Finish() method and then + // - in buffered mode, buffer the uncompressed block contents. + // - in unbuffered mode, write the compressed block contents to file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle, + BlockType blocktype); // Compress and write block content to the file. void WriteBlock(const Slice& block_contents, BlockHandle* handle, - bool is_data_block); + BlockType block_type); // Directly write data to the file. 
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, - bool is_data_block = false); + BlockType block_type, const Slice* raw_data = nullptr, + bool is_top_level_filter_block = false); + + void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); + + template Status InsertBlockInCache(const Slice& block_contents, - const CompressionType type, - const BlockHandle* handle); + const BlockHandle* handle, BlockType block_type); + + Status InsertBlockInCacheHelper(const Slice& block_contents, + const BlockHandle* handle, + BlockType block_type, + bool is_top_level_filter_block); + + Status InsertBlockInCompressedCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, @@ -134,6 +151,8 @@ class BlockBasedTablePropertiesCollector; Rep* rep_; + struct ParallelCompressionRep; + // Advanced operation: flush any buffered key/value pairs to file. // Can be used to ensure that two adjacent entries never live in // the same data block. Most clients should not need to use this method. @@ -144,8 +163,31 @@ // uncompressed size is bigger than kCompressionSizeLimit, don't compress it const uint64_t kCompressionSizeLimit = std::numeric_limits::max(); - // Store file checksum. If checksum is disabled, its value is "0". - std::string file_checksum_ = kUnknownFileChecksum; + // Get blocks from mem-table walking thread, compress them and + // pass them to the write thread. 
Used in parallel compression mode only + void BGWorkCompression(const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx); + + // Given raw block content, try to compress it and return result and + // compression type + void CompressAndVerifyBlock(const Slice& raw_block_contents, + bool is_data_block, + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx, + std::string* compressed_output, + Slice* result_block_contents, + CompressionType* result_compression_type, + Status* out_status); + + // Get compressed blocks from BGWorkCompression and write them into SST + void BGWorkWriteRawBlock(); + + // Initialize parallel compression context and + // start BGWorkCompression and BGWorkWriteRawBlock threads + void StartParallelCompression(); + + // Stop BGWorkCompression and BGWorkWriteRawBlock threads + void StopParallelCompression(); }; Slice CompressBlock(const Slice& raw, const CompressionInfo& info, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,19 +7,26 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/block_based_table_factory.h" + #include -#include +#include #include #include +#include "cache/cache_entry_roles.h" +#include "logging/logging.h" #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/block_based_table_builder.h" -#include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "util/mutexlock.h" @@ -157,11 +164,294 @@ return std::min(kMaxPrefetchSize, max_qualified_size); } +#ifndef ROCKSDB_LITE + +const std::string kOptNameMetadataCacheOpts = "metadata_cache_options"; + +static std::unordered_map + pinning_tier_type_string_map = { + {"kFallback", PinningTier::kFallback}, + {"kNone", PinningTier::kNone}, + {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar}, + {"kAll", PinningTier::kAll}}; + +static std::unordered_map + block_base_table_index_type_string_map = { + {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, + {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, + {"kTwoLevelIndexSearch", + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; + +static std::unordered_map + block_base_table_data_block_index_type_string_map = { + {"kDataBlockBinarySearch", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, + {"kDataBlockBinaryAndHash", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; + +static std::unordered_map + block_base_table_index_shortening_mode_string_map = { + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + 
BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; + +static std::unordered_map + metadata_cache_options_type_info = { + {"top_level_index_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, top_level_index_pinning), + &pinning_tier_type_string_map)}, + {"partition_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, partition_pinning), + &pinning_tier_type_string_map)}, + {"unpartitioned_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, unpartitioned_pinning), + &pinning_tier_type_string_map)}}; + +static std::unordered_map + block_base_table_prepopulate_block_cache_string_map = { + {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable}, + {"kFlushOnly", + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}}; + +#endif // ROCKSDB_LITE + +static std::unordered_map + block_based_table_type_info = { +#ifndef ROCKSDB_LITE + /* currently not supported + std::shared_ptr block_cache = nullptr; + std::shared_ptr block_cache_compressed = nullptr; + */ + {"flush_block_policy_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct BlockBasedTableOptions, + flush_block_policy_factory), + OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, 
OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_type", OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, index_type), + &block_base_table_index_type_string_map)}, + {"hash_index_allow_collision", + {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"data_block_index_type", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, data_block_index_type), + &block_base_table_data_block_index_type_string_map)}, + {"index_shortening", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, index_shortening), + &block_base_table_index_shortening_mode_string_map)}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_per_partition", + {0, 
OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"optimize_filters_for_memory", + {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"filter_policy", + {offsetof(struct BlockBasedTableOptions, filter_policy), + OptionType::kUnknown, OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kNone, + // Parses the Filter policy + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* policy = + static_cast*>(addr); + return FilterPolicy::CreateFromString(opts, value, policy); + }, + // Converts the FilterPolicy to its string representation + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* policy = + static_cast*>(addr); + if (policy->get()) { + *value = (*policy)->Name(); + } else { + *value = kNullptrString; + } + return Status::OK(); + }, + // Compares two FilterPolicy objects for equality + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + const auto* policy1 = + static_cast*>(addr1) + ->get(); + const auto* policy2 = + static_cast*>(addr2)->get(); + if (policy1 == policy2) { + return true; + } else if (policy1 != nullptr && policy2 != nullptr) { + return (strcmp(policy1->Name(), policy2->Name()) == 0); + } else { + return false; + } + }}}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"reserve_table_builder_memory", + {offsetof(struct BlockBasedTableOptions, reserve_table_builder_memory), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13 + // and 6.14. The bug will write out 8 bytes to OPTIONS file from the + // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit + // which is actually a uint32. Consequently, the value of + // read_amp_bytes_per_bit written in the OPTIONS file is wrong. + // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit + // from OPTIONS file as a uint32. To be able to load OPTIONS file + // generated by affected releases before the fix, we need to + // manually parse read_amp_bytes_per_bit with this special hack. 
+ uint64_t read_amp_bytes_per_bit = ParseUint64(value); + *(static_cast(addr)) = + static_cast(read_amp_bytes_per_bit); + return Status::OK(); + }}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {kOptNameMetadataCacheOpts, + OptionTypeInfo::Struct( + kOptNameMetadataCacheOpts, &metadata_cache_options_type_info, + offsetof(struct BlockBasedTableOptions, metadata_cache_options), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"block_cache", + {offsetof(struct BlockBasedTableOptions, block_cache), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input vsalue as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"block_cache_compressed", + {offsetof(struct BlockBasedTableOptions, block_cache_compressed), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input vsalue as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"max_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + 
OptionTypeFlags::kMutable}}, + {"prepopulate_block_cache", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), + &block_base_table_prepopulate_block_cache_string_map, + OptionTypeFlags::kMutable)}, + +#endif // ROCKSDB_LITE +}; + // TODO(myabandeh): We should return an error instead of silently changing the // options BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { + InitializeOptions(); + RegisterOptions(&table_options_, &block_based_table_type_info); +} + +void BlockBasedTableFactory::InitializeOptions() { if (table_options_.flush_block_policy_factory == nullptr) { table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); @@ -199,42 +489,148 @@ } } +Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) { + InitializeOptions(); + return TableFactory::PrepareOptions(opts); +} + +namespace { +// Different cache kinds use the same keys for physically different values, so +// they must not share an underlying key space with each other. +Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { + int cache_count = (bbto.block_cache != nullptr) + + (bbto.block_cache_compressed != nullptr) + + (bbto.persistent_cache != nullptr); + if (cache_count <= 1) { + // Nothing to share / overlap + return Status::OK(); + } + + // Simple pointer equality + if (bbto.block_cache == bbto.block_cache_compressed) { + return Status::InvalidArgument( + "block_cache same as block_cache_compressed not currently supported, " + "and would be bad for performance anyway"); + } + + // More complex test of shared key space, in case the instances are wrappers + // for some shared underlying cache. 
+ std::string sentinel_key(size_t{1}, '\0'); + static char kRegularBlockCacheMarker = 'b'; + static char kCompressedBlockCacheMarker = 'c'; + static char kPersistentCacheMarker = 'p'; + if (bbto.block_cache) { + bbto.block_cache + ->Insert(Slice(sentinel_key), &kRegularBlockCacheMarker, 1, + GetNoopDeleterForRole()) + .PermitUncheckedError(); + } + if (bbto.block_cache_compressed) { + bbto.block_cache_compressed + ->Insert(Slice(sentinel_key), &kCompressedBlockCacheMarker, 1, + GetNoopDeleterForRole()) + .PermitUncheckedError(); + } + if (bbto.persistent_cache) { + // Note: persistent cache copies the data, not keeping the pointer + bbto.persistent_cache + ->Insert(Slice(sentinel_key), &kPersistentCacheMarker, 1) + .PermitUncheckedError(); + } + // If we get something different from what we inserted, that indicates + // dangerously overlapping key spaces. + if (bbto.block_cache) { + auto handle = bbto.block_cache->Lookup(Slice(sentinel_key)); + if (handle) { + auto v = static_cast(bbto.block_cache->Value(handle)); + char c = *v; + bbto.block_cache->Release(handle); + if (v == &kCompressedBlockCacheMarker) { + return Status::InvalidArgument( + "block_cache and block_cache_compressed share the same key space, " + "which is not supported"); + } else if (c == kPersistentCacheMarker) { + return Status::InvalidArgument( + "block_cache and persistent_cache share the same key space, " + "which is not supported"); + } else if (v != &kRegularBlockCacheMarker) { + return Status::Corruption("Unexpected mutation to block_cache"); + } + } + } + if (bbto.block_cache_compressed) { + auto handle = bbto.block_cache_compressed->Lookup(Slice(sentinel_key)); + if (handle) { + auto v = static_cast(bbto.block_cache_compressed->Value(handle)); + char c = *v; + bbto.block_cache_compressed->Release(handle); + if (v == &kRegularBlockCacheMarker) { + return Status::InvalidArgument( + "block_cache_compressed and block_cache share the same key space, " + "which is not supported"); + } else if 
(c == kPersistentCacheMarker) { + return Status::InvalidArgument( + "block_cache_compressed and persistent_cache share the same key " + "space, " + "which is not supported"); + } else if (v != &kCompressedBlockCacheMarker) { + return Status::Corruption( + "Unexpected mutation to block_cache_compressed"); + } + } + } + if (bbto.persistent_cache) { + std::unique_ptr data; + size_t size = 0; + bbto.persistent_cache->Lookup(Slice(sentinel_key), &data, &size) + .PermitUncheckedError(); + if (data && size > 0) { + if (data[0] == kRegularBlockCacheMarker) { + return Status::InvalidArgument( + "persistent_cache and block_cache share the same key space, " + "which is not supported"); + } else if (data[0] == kCompressedBlockCacheMarker) { + return Status::InvalidArgument( + "persistent_cache and block_cache_compressed share the same key " + "space, " + "which is not supported"); + } else if (data[0] != kPersistentCacheMarker) { + return Status::Corruption("Unexpected mutation to persistent_cache"); + } + } + } + return Status::OK(); +} + +} // namespace + Status BlockBasedTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache) const { return BlockBasedTable::Open( - table_reader_options.ioptions, table_reader_options.env_options, + ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, - table_reader_options.largest_seqno, &tail_prefetch_stats_, - table_reader_options.block_cache_tracer); + table_reader_options.largest_seqno, + table_reader_options.force_direct_prefetch, 
&tail_prefetch_stats_, + table_reader_options.block_cache_tracer, + table_reader_options.max_file_size_for_l0_meta_pin, + table_reader_options.cur_db_session_id, + table_reader_options.cur_file_num); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - auto table_builder = new BlockBasedTableBuilder( - table_builder_options.ioptions, table_builder_options.moptions, - table_options_, table_builder_options.internal_comparator, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_builder_options.compression_type, - table_builder_options.sample_for_compression, - table_builder_options.compression_opts, - table_builder_options.skip_filters, - table_builder_options.column_family_name, table_builder_options.level, - table_builder_options.creation_time, - table_builder_options.oldest_key_time, - table_builder_options.target_file_size, - table_builder_options.file_creation_time); - - return table_builder; + return new BlockBasedTableBuilder(table_options_, table_builder_options, + file); } -Status BlockBasedTableFactory::SanitizeOptions( +Status BlockBasedTableFactory::ValidateOptions( const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { @@ -254,7 +650,7 @@ "Enable pin_l0_filter_and_index_blocks_in_cache, " ", but block cache is disabled"); } - if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { + if (!IsSupportedFormatVersion(table_options_.format_version)) { return Status::InvalidArgument( "Unsupported BlockBasedTable format_version. 
Please check " "include/rocksdb/table.h for more info"); @@ -286,10 +682,24 @@ "max_successive_merges larger than 0 is currently inconsistent with " "unordered_write"); } - return Status::OK(); + { + Status s = CheckCacheOptionCompatibility(table_options_); + if (!s.ok()) { + return s; + } + } + std::string garbage; + if (!SerializeEnum(checksum_type_string_map, + table_options_.checksum, &garbage)) { + return Status::InvalidArgument( + "Unrecognized ChecksumType for checksum: " + + ROCKSDB_NAMESPACE::ToString( + static_cast(table_options_.checksum))); + } + return TableFactory::ValidateOptions(db_opts, cf_opts); } -std::string BlockBasedTableFactory::GetPrintableTableOptions() const { +std::string BlockBasedTableFactory::GetPrintableOptions() const { std::string ret; ret.reserve(20000); const int kBufferSize = 200; @@ -368,7 +778,7 @@ ret.append(buffer); ret.append(table_options_.persistent_cache->GetPrintableOptions()); } - snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n", + snprintf(buffer, kBufferSize, " block_size: %" PRIu64 "\n", table_options_.block_size); ret.append(buffer); snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", @@ -412,149 +822,112 @@ snprintf(buffer, kBufferSize, " block_align: %d\n", table_options_.block_align); ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.max_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n", + static_cast(table_options_.prepopulate_block_cache)); + ret.append(buffer); return ret; } -#ifndef ROCKSDB_LITE -namespace { -bool SerializeSingleBlockBasedTableOption( - std::string* opt_string, const BlockBasedTableOptions& bbt_options, - const std::string& name, const std::string& delimiter) { - auto iter = block_based_table_type_info.find(name); - if (iter == block_based_table_type_info.end()) { - return false; - } - auto& opt_info = iter->second; - const char* 
opt_address = - reinterpret_cast(&bbt_options) + opt_info.offset; - std::string value; - bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); - if (result) { - *opt_string = name + "=" + value + delimiter; - } - return result; -} -} // namespace - -Status BlockBasedTableFactory::GetOptionString( - std::string* opt_string, const std::string& delimiter) const { - assert(opt_string); - opt_string->clear(); - for (auto iter = block_based_table_type_info.begin(); - iter != block_based_table_type_info.end(); ++iter) { - if (iter->second.verification == OptionVerificationType::kDeprecated) { - // If the option is no longer used in rocksdb and marked as deprecated, - // we skip it in the serialization. - continue; - } - std::string single_output; - bool result = SerializeSingleBlockBasedTableOption( - &single_output, table_options_, iter->first, delimiter); - assert(result); - if (result) { - opt_string->append(single_output); +const void* BlockBasedTableFactory::GetOptionsPtr( + const std::string& name) const { + if (name == kBlockCacheOpts()) { + if (table_options_.no_block_cache) { + return nullptr; + } else { + return table_options_.block_cache.get(); } + } else { + return TableFactory::GetOptionsPtr(name); } - return Status::OK(); -} -#else -Status BlockBasedTableFactory::GetOptionString( - std::string* /*opt_string*/, const std::string& /*delimiter*/) const { - return Status::OK(); -} -#endif // !ROCKSDB_LITE - -const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { - return table_options_; } #ifndef ROCKSDB_LITE -namespace { -std::string ParseBlockBasedTableOption(const std::string& name, - const std::string& org_value, - BlockBasedTableOptions* new_options, - bool input_strings_escaped = false, - bool ignore_unknown_options = false) { - const std::string& value = - input_strings_escaped ? 
UnescapeOptionString(org_value) : org_value; - if (!input_strings_escaped) { - // if the input string is not escaped, it means this function is - // invoked from SetOptions, which takes the old format. - if (name == "block_cache" || name == "block_cache_compressed") { - // cache options can be specified in the following format - // "block_cache={capacity=1M;num_shard_bits=4; - // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" - // To support backward compatibility, the following format - // is also supported. - // "block_cache=1M" - std::shared_ptr cache; - // block_cache is specified in format block_cache=. - if (value.find('=') == std::string::npos) { - cache = NewLRUCache(ParseSizeT(value)); - } else { - LRUCacheOptions cache_opts; - if (!ParseOptionHelper(reinterpret_cast(&cache_opts), - OptionType::kLRUCacheOptions, value)) { - return "Invalid cache options"; - } - cache = NewLRUCache(cache_opts); - } - - if (name == "block_cache") { - new_options->block_cache = cache; - } else { - new_options->block_cache_compressed = cache; - } - return ""; - } else if (name == "filter_policy") { - // Expect the following format - // bloomfilter:int:bool - const std::string kName = "bloomfilter:"; - if (value.compare(0, kName.size(), kName) != 0) { - return "Invalid filter policy name"; - } - size_t pos = value.find(':', kName.size()); - if (pos == std::string::npos) { - return "Invalid filter policy config, missing bits_per_key"; - } - double bits_per_key = - ParseDouble(trim(value.substr(kName.size(), pos - kName.size()))); - bool use_block_based_builder = - ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); - new_options->filter_policy.reset( - NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); - return ""; +// Take a default BlockBasedTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// BlockBasedTableOptions "new_table_options". 
+// +// Below are the instructions of how to config some non-primitive-typed +// options in BlockBasedTableOptions: +// +// * filter_policy: +// We currently only support the following FilterPolicy in the convenience +// functions: +// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]" +// to specify BloomFilter. The above string is equivalent to calling +// NewBloomFilterPolicy(bits_per_key, use_block_based_builder). +// [Example]: +// - Pass {"filter_policy", "bloomfilter:4:true"} in +// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits +// per key and use_block_based_builder enabled. +// +// * block_cache / block_cache_compressed: +// We currently only support LRU cache in the GetOptions API. The LRU +// cache can be set by directly specifying its size. +// [Example]: +// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is +// equivalent to setting block_cache using NewLRUCache(1024 * 1024). +// +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. +// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". 
+Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, + void* opt_ptr) { + Status status = TableFactory::ParseOption(config_options, opt_info, opt_name, + opt_value, opt_ptr); + if (config_options.input_strings_escaped && !status.ok()) { // Got an error + // !input_strings_escaped indicates the old API, where everything is + // parsable. + if (opt_info.IsByName()) { + status = Status::OK(); } } - const auto iter = block_based_table_type_info.find(name); - if (iter == block_based_table_type_info.end()) { - if (ignore_unknown_options) { - return ""; - } else { - return "Unrecognized option"; - } - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return "Invalid value"; - } - return ""; + return status; } -} // namespace Status GetBlockBasedTableOptionsFromString( const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + return GetBlockBasedTableOptionsFromString(config_options, table_options, + opts_str, new_table_options); +} +Status GetBlockBasedTableOptionsFromString( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { std::unordered_map opts_map; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { return s; } - - return GetBlockBasedTableOptionsFromMap(table_options, opts_map, - new_table_options); + s = GetBlockBasedTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors 
(NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } } Status GetBlockBasedTableOptionsFromMap( @@ -562,69 +935,29 @@ const std::unordered_map& opts_map, BlockBasedTableOptions* new_table_options, bool input_strings_escaped, bool ignore_unknown_options) { - assert(new_table_options); - *new_table_options = table_options; - for (const auto& o : opts_map) { - auto error_message = ParseBlockBasedTableOption( - o.first, o.second, new_table_options, input_strings_escaped, - ignore_unknown_options); - if (error_message != "") { - const auto iter = block_based_table_type_info.find(o.first); - if (iter == block_based_table_type_info.end() || - !input_strings_escaped || // !input_strings_escaped indicates - // the old API, where everything is - // parsable. - (iter->second.verification != OptionVerificationType::kByName && - iter->second.verification != - OptionVerificationType::kByNameAllowNull && - iter->second.verification != - OptionVerificationType::kByNameAllowFromNull && - iter->second.verification != OptionVerificationType::kDeprecated)) { - // Restore "new_options" to the default "base_options". 
- *new_table_options = table_options; - return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", - o.first + " " + error_message); - } - } - } - return Status::OK(); -} + ConfigOptions config_options; + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.invoke_prepare_options = false; -Status VerifyBlockBasedTableFactory( - const BlockBasedTableFactory* base_tf, - const BlockBasedTableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level) { - if ((base_tf != nullptr) != (file_tf != nullptr) && - sanity_check_level > kSanityLevelNone) { - return Status::Corruption( - "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); - } - if (base_tf == nullptr) { - return Status::OK(); - } - assert(file_tf != nullptr); - - const auto& base_opt = base_tf->table_options(); - const auto& file_opt = file_tf->table_options(); + return GetBlockBasedTableOptionsFromMap(config_options, table_options, + opts_map, new_table_options); +} - for (auto& pair : block_based_table_type_info) { - if (pair.second.verification == OptionVerificationType::kDeprecated) { - // We skip checking deprecated variables as they might - // contain random values since they might not be initialized - continue; - } - if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { - if (!AreEqualOptions(reinterpret_cast(&base_opt), - reinterpret_cast(&file_opt), - pair.second, pair.first, nullptr)) { - return Status::Corruption( - "[RocksDBOptionsParser]: " - "failed the verification on BlockBasedTableOptions::", - pair.first); - } - } +Status GetBlockBasedTableOptionsFromMap( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options) { + assert(new_table_options); + BlockBasedTableFactory bbtf(table_options); + Status s = bbtf.ConfigureFromMap(config_options, opts_map); + if 
(s.ok()) { + *new_table_options = *(bbtf.GetOptions()); + } else { + *new_table_options = table_options; } - return Status::OK(); + return s; } #endif // !ROCKSDB_LITE @@ -633,7 +966,6 @@ return new BlockBasedTableFactory(_table_options); } -const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; const std::string BlockBasedTablePropertyNames::kIndexType = "rocksdb.block.based.table.index.type"; const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,17 +13,19 @@ #include #include -#include "db/dbformat.h" -#include "options/options_helper.h" -#include "options/options_parser.h" +#include "port/port.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { - +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; struct EnvOptions; class BlockBasedTableBuilder; +class RandomAccessFileReader; +class WritableFileWriter; // A class used to track actual bytes written from the tail in the recent SST // file opens, and provide a suggestion for following open. 
@@ -48,34 +50,42 @@ ~BlockBasedTableFactory() {} - const char* Name() const override { return kName.c_str(); } + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kBlockBasedTableName(); } + + const char* Name() const override { return kBlockBasedTableName(); } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; - // Sanitizes the specified DB Options. - Status SanitizeOptions(const DBOptions& db_opts, + // Valdates the specified DB Options. + Status ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const override; + Status PrepareOptions(const ConfigOptions& opts) override; - std::string GetPrintableTableOptions() const override; - - Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const override; - - const BlockBasedTableOptions& table_options() const; - - void* GetOptions() override { return &table_options_; } + std::string GetPrintableOptions() const override; bool IsDeleteRangeSupported() const override { return true; } - static const std::string kName; + TailPrefetchStats* tail_prefetch_stats() { return &tail_prefetch_stats_; } + + protected: + const void* GetOptionsPtr(const std::string& name) const override; +#ifndef ROCKSDB_LITE + Status ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const std::string& opt_value, + void* opt_ptr) override; +#endif + void InitializeOptions(); private: BlockBasedTableOptions table_options_; @@ 
-86,110 +96,4 @@ extern const std::string kHashIndexPrefixesMetadataBlock; extern const std::string kPropTrue; extern const std::string kPropFalse; - -#ifndef ROCKSDB_LITE -extern Status VerifyBlockBasedTableFactory( - const BlockBasedTableFactory* base_tf, - const BlockBasedTableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level); - -static std::unordered_map - block_based_table_type_info = { - /* currently not supported - std::shared_ptr block_cache = nullptr; - std::shared_ptr block_cache_compressed = nullptr; - */ - {"flush_block_policy_factory", - {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), - OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName, - false, 0}}, - {"cache_index_and_filter_blocks", - {offsetof(struct BlockBasedTableOptions, - cache_index_and_filter_blocks), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"cache_index_and_filter_blocks_with_high_priority", - {offsetof(struct BlockBasedTableOptions, - cache_index_and_filter_blocks_with_high_priority), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"pin_l0_filter_and_index_blocks_in_cache", - {offsetof(struct BlockBasedTableOptions, - pin_l0_filter_and_index_blocks_in_cache), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"index_type", - {offsetof(struct BlockBasedTableOptions, index_type), - OptionType::kBlockBasedTableIndexType, - OptionVerificationType::kNormal, false, 0}}, - {"hash_index_allow_collision", - {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"data_block_index_type", - {offsetof(struct BlockBasedTableOptions, data_block_index_type), - OptionType::kBlockBasedTableDataBlockIndexType, - OptionVerificationType::kNormal, false, 0}}, - {"index_shortening", - {offsetof(struct BlockBasedTableOptions, index_shortening), - OptionType::kBlockBasedTableIndexShorteningMode, - 
OptionVerificationType::kNormal, false, 0}}, - {"data_block_hash_table_util_ratio", - {offsetof(struct BlockBasedTableOptions, - data_block_hash_table_util_ratio), - OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, - {"checksum", - {offsetof(struct BlockBasedTableOptions, checksum), - OptionType::kChecksumType, OptionVerificationType::kNormal, false, - 0}}, - {"no_block_cache", - {offsetof(struct BlockBasedTableOptions, no_block_cache), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"block_size", - {offsetof(struct BlockBasedTableOptions, block_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"block_size_deviation", - {offsetof(struct BlockBasedTableOptions, block_size_deviation), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"block_restart_interval", - {offsetof(struct BlockBasedTableOptions, block_restart_interval), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"index_block_restart_interval", - {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"index_per_partition", - {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, false, - 0}}, - {"metadata_block_size", - {offsetof(struct BlockBasedTableOptions, metadata_block_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"partition_filters", - {offsetof(struct BlockBasedTableOptions, partition_filters), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"filter_policy", - {offsetof(struct BlockBasedTableOptions, filter_policy), - OptionType::kFilterPolicy, OptionVerificationType::kByName, false, - 0}}, - {"whole_key_filtering", - {offsetof(struct BlockBasedTableOptions, whole_key_filtering), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_table_builder_flush", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 
- 0}}, - {"format_version", - {offsetof(struct BlockBasedTableOptions, format_version), - OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, - {"verify_compression", - {offsetof(struct BlockBasedTableOptions, verify_compression), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"read_amp_bytes_per_bit", - {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"enable_index_compression", - {offsetof(struct BlockBasedTableOptions, enable_index_compression), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"block_align", - {offsetof(struct BlockBasedTableOptions, block_align), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"pin_top_level_index_and_filter", - {offsetof(struct BlockBasedTableOptions, - pin_top_level_index_and_filter), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,382 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/block_based_table_iterator.h" + +namespace ROCKSDB_NAMESPACE { +void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target); } + +void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr); } + +void BlockBasedTableIterator::SeekImpl(const Slice* target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { + ResetDataIter(); + return; + } + + bool need_seek_index = true; + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + // Reseek. + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } + } + } + + if (need_seek_index) { + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + allow_unprepared_value_) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + // ResetDataIter() will invalidate block_iter_. 
Thus, there is no need to + // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound + // as that will be done later when the data block is actually read. + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } else { + // When the user does a reseek, the iterate_upper_bound might have + // changed. CheckDataBlockWithinUpperBound() needs to be called + // explicitly if the reseek ends up in the same data block. + // If the reseek ends up in a different block, InitDataBlock() will do + // the iterator upper bound check. + CheckDataBlockWithinUpperBound(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } + + CheckOutOfBound(); + + if (target) { + assert(!Valid() || icomp_.Compare(*target, key()) <= 0); + } +} + +void BlockBasedTableIterator::SeekForPrev(const Slice& target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + // For now totally disable prefix seek in auto prefix mode because we don't + // have logic + if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely to contain the position for `target`, the + // same as Seek(), rather than than before. + // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is difference is when they seek to a position + // in the boundary. For example, if they SeekForPrev(5), we should go to the + // first block, rather than the second. However, we don't have the information + // to distinguish the two unless we read the second block. 
In this case, we'll + // end up with reading two blocks. + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { + ResetDataIter(); + return; + } + + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = !is_at_first_key_from_index_; + } + return is_valid; +} + +void BlockBasedTableIterator::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + 
+ FindKeyBackward(); +} + +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + bool is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + block_prefetcher_.PrefetchIfNeeded(rep, data_block_handle, + read_options_.readahead_size, + is_for_compaction); + Status s; + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, BlockType::kData, + /*get_context=*/nullptr, &lookup_context_, s, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction); + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + } +} + +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.status().ok()) { + return false; + } + + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be 
inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +void BlockBasedTableIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && + block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock; + assert(!next_block_is_out_of_bound || + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, + index_iter_->user_key(), /*b_has_ts=*/true) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + if (!v.first_internal_key.empty() && allow_unprepared_value_) { + // Index contains the first key of the block. Defer reading the block. 
+ is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void BlockBasedTableIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have check lower bound here too, but we opt not to do it for + // code simplicity. +} + +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock && + Valid()) { + is_out_of_bound_ = + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(), + /*b_has_ts=*/true) <= 0; + } +} + +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, + /*a_has_ts=*/false, index_iter_->user_key(), + /*b_has_ts=*/true) > 0) + ? BlockUpperBound::kUpperBoundBeyondCurBlock + : BlockUpperBound::kUpperBoundInCurBlock; + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,273 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterates over the contents of BlockBasedTable. +class BlockBasedTableIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + // @param read_options Must outlive this iterator. + public: + BlockBasedTableIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr>&& index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, TableReaderCaller caller, + size_t compaction_readahead_size = 0, bool allow_unprepared_value = false) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor), + lookup_context_(caller), + block_prefetcher_(compaction_readahead_size), + allow_unprepared_value_(allow_unprepared_value), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check) {} + + ~BlockBasedTableIterator() {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + 
bool NextAndGetResult(IterateResult* result) override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); + } + Slice key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } + } + Slice user_key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } + } + bool PrepareValue() override { + assert(Valid()); + + if (!is_at_first_key_from_index_) { + return true; + } + + return const_cast(this) + ->MaterializeCurrentBlock(); + } + Slice value() const override { + // PrepareValue() must have been called. + assert(!is_at_first_key_from_index_); + assert(Valid()); + + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + + inline IterBoundCheck UpperBoundCheckResult() override { + if (is_out_of_bound_) { + return IterBoundCheck::kOutOfBound; + } else if (block_upper_bound_check_ == + BlockUpperBound::kUpperBoundBeyondCurBlock) { + assert(!is_out_of_bound_); + return IterBoundCheck::kInbound; + } else { + return IterBoundCheck::kUnknown; + } + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. 
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); + } + bool IsValuePinned() const override { + assert(!is_at_first_key_from_index_); + assert(Valid()); + + // BlockIter::IsValuePinned() is always true. No need to check + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_; + } + + void ResetDataIter() { + if (block_iter_points_to_real_block_) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { + block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + } + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + block_upper_bound_check_ = BlockUpperBound::kUnknown; + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->GetReadaheadState(readahead_file_info); + } + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->SetReadaheadState(readahead_file_info); + } + } + } + + std::unique_ptr> index_iter_; + + private: + enum class IterDirection { + kForward, + kBackward, + }; + // This enum indicates whether the upper bound falls into current block + // or beyond. 
+ // +-------------+ + // | cur block | <-- (1) + // +-------------+ + // <-- (2) + // --- --- + // <-- (3) + // +-------------+ + // | next block | <-- (4) + // ...... + // + // When the block is smaller than , kUpperBoundInCurBlock + // is the value to use. The examples are (1) or (2) in the graph. It means + // all keys in the next block or beyond will be out of bound. Keys within + // the current block may or may not be out of bound. + // When the block is larger or equal to , + // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4) + // in the graph. It means that all keys in the current block is within the + // upper bound and keys in the next block may or may not be within the uppder + // bound. + // If the boundary key hasn't been checked against the upper bound, + // kUnknown can be used. + enum class BlockUpperBound { + kUpperBoundInCurBlock, + kUpperBoundBeyondCurBlock, + kUnknown, + }; + + const BlockBasedTable* table_; + const ReadOptions& read_options_; + const InternalKeyComparator& icomp_; + UserComparatorWrapper user_comparator_; + PinnedIteratorsManager* pinned_iters_mgr_; + DataBlockIter block_iter_; + const SliceTransform* prefix_extractor_; + uint64_t prev_block_offset_ = std::numeric_limits::max(); + BlockCacheLookupContext lookup_context_; + + BlockPrefetcher block_prefetcher_; + + const bool allow_unprepared_value_; + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). + bool is_out_of_bound_ = false; + // How current data block's boundary key with the next block is compared with + // iterate upper bound. + BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to PrepareValue() will trigger loading the block. 
+ bool is_at_first_key_from_index_ = false; + bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitDataBlock(); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. + void CheckDataBlockWithinUpperBound(); + + bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { + if (need_upper_bound_check_ && direction == IterDirection::kBackward) { + // Upper bound check isn't sufficient for backward direction to + // guarantee the same result as total order, so disable prefix + // check. + return true; + } + if (check_filter_ && + !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, + need_upper_bound_check_, &lookup_context_)) { + // TODO remember the iterator is invalidated because of prefix + // match. This can avoid the upper level file iterator to falsely + // believe the position is the end of the SST file and move to + // the first key of the next file. + ResetDataIter(); + return false; + } + return true; + } +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include "table/block_based/block_based_table_reader.h" + #include #include #include @@ -14,30 +15,45 @@ #include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "cache/sharded_cache.h" +#include "db/compaction/compaction_picker.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" - #include "file/file_prefetch_buffer.h" +#include "file/file_util.h" #include "file/random_access_file_reader.h" - +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "port/lang.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/filter_policy.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/snapshot.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" - +#include "rocksdb/trace_record.h" +#include "table/block_based/binary_search_index_reader.h" #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_like_traits.h" #include "table/block_based/block_prefix_index.h" +#include "table/block_based/block_type.h" #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" +#include "table/block_based/hash_index_reader.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/partitioned_index_reader.h" #include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" @@ -45,17 +61,14 @@ #include "table/meta_blocks.h" #include "table/multiget_context.h" #include "table/persistent_cache_helper.h" +#include "table/persistent_cache_options.h" #include 
"table/sst_file_writer_collectors.h" #include "table/two_level_iterator.h" - -#include "monitoring/perf_context_imp.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/util.h" -#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -63,89 +76,10 @@ extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTable::IndexReader IndexReader; - -// Found that 256 KB readahead size provides the best performance, based on -// experiments, for auto readahead. Experiment data is in PR #3282. -const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; - BlockBasedTable::~BlockBasedTable() { delete rep_; } -std::atomic BlockBasedTable::next_cache_key_id_(0); - -template -class BlocklikeTraits; - -template <> -class BlocklikeTraits { - public: - static BlockContents* Create(BlockContents&& contents, - SequenceNumber /* global_seqno */, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new BlockContents(std::move(contents)); - } - - static uint32_t GetNumRestarts(const BlockContents& /* contents */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static ParsedFullFilterBlock* Create(BlockContents&& contents, - SequenceNumber /* global_seqno */, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* filter_policy) { - return new ParsedFullFilterBlock(filter_policy, std::move(contents)); - } - - static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit, Statistics* statistics, - bool /* using_zstd */, - const 
FilterPolicy* /* filter_policy */) { - return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, - statistics); - } - - static uint32_t GetNumRestarts(const Block& block) { - return block.NumRestarts(); - } -}; - -template <> -class BlocklikeTraits { - public: - static UncompressionDict* Create(BlockContents&& contents, - SequenceNumber /* global_seqno */, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool using_zstd, - const FilterPolicy* /* filter_policy */) { - return new UncompressionDict(contents.data, std::move(contents.allocation), - using_zstd); - } - - static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { - return 0; - } -}; - namespace { // Read the block identified by "handle" from "file". // The only relevant option is options.verify_checksums for now. @@ -157,12 +91,12 @@ Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, const ImmutableCFOptions& ioptions, + std::unique_ptr* result, const ImmutableOptions& ioptions, bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, - const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) { + const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit, + MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd, + const FilterPolicy* filter_policy) { assert(result); BlockContents contents; @@ -173,41 +107,13 @@ Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { result->reset(BlocklikeTraits::Create( - std::move(contents), global_seqno, read_amp_bytes_per_bit, - ioptions.statistics, using_zstd, filter_policy)); + std::move(contents), read_amp_bytes_per_bit, 
ioptions.stats, using_zstd, + filter_policy)); } return s; } -inline MemoryAllocator* GetMemoryAllocator( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache.get() - ? table_options.block_cache->memory_allocator() - : nullptr; -} - -inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache_compressed.get() - ? table_options.block_cache_compressed->memory_allocator() - : nullptr; -} - -// Delete the entry resided in the cache. -template -void DeleteCachedEntry(const Slice& /*key*/, void* value) { - auto entry = reinterpret_cast(value); - delete entry; -} - -// Release the cached entry and decrement its ref count. -void ForceReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle, true /* force_erase */); -} - // Release the cached entry and decrement its ref count. // Do not force erase void ReleaseCachedEntry(void* arg, void* h) { @@ -219,8 +125,9 @@ // For hash based index, return true if prefix_extractor and // prefix_extractor_block mismatch, false otherwise. This flag will be used // as total_order_seek via NewIndexIterator -bool PrefixExtractorChanged(const TableProperties* table_properties, - const SliceTransform* prefix_extractor) { +inline bool PrefixExtractorChangedHelper( + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. 
// Turn off hash index in prefix_extractor is not set; if prefix_extractor // is set but prefix_extractor_block is not set, also disable hash index @@ -230,8 +137,7 @@ } // prefix_extractor and prefix_extractor_block are both non-empty - if (table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) != 0) { + if (table_properties->prefix_extractor_name != prefix_extractor->AsString()) { return true; } else { return false; @@ -244,553 +150,12 @@ memcpy(heap_buf.get(), buf.data(), buf.size()); return heap_buf; } - } // namespace -// Encapsulates common functionality for the various index reader -// implementations. Provides access to the index block regardless of whether -// it is owned by the reader or stored in the cache, or whether it is pinned -// in the cache or not. -class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { - public: - IndexReaderCommon(const BlockBasedTable* t, - CachableEntry&& index_block) - : table_(t), index_block_(std::move(index_block)) { - assert(table_ != nullptr); - } - - protected: - static Status ReadIndexBlock(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, - const ReadOptions& read_options, bool use_cache, - GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block); - - const BlockBasedTable* table() const { return table_; } - - const InternalKeyComparator* internal_comparator() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - - return &table_->get_rep()->internal_comparator; - } - - bool index_has_first_key() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - return table_->get_rep()->index_has_first_key; - } - - bool index_key_includes_seq() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - return table_->get_rep()->index_key_includes_seq; - } - - bool index_value_is_full() const { - assert(table_ != nullptr); - assert(table_->get_rep() != 
nullptr); - return table_->get_rep()->index_value_is_full; - } - - bool cache_index_blocks() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - return table_->get_rep()->table_options.cache_index_and_filter_blocks; - } - - Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) const; - - size_t ApproximateIndexBlockMemoryUsage() const { - assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); - return index_block_.GetOwnValue() - ? index_block_.GetValue()->ApproximateMemoryUsage() - : 0; - } - - private: - const BlockBasedTable* table_; - CachableEntry index_block_; -}; - -Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - const ReadOptions& read_options, bool use_cache, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) { - PERF_TIMER_GUARD(read_index_block_nanos); - - assert(table != nullptr); - assert(index_block != nullptr); - assert(index_block->IsEmpty()); - - const Rep* const rep = table->get_rep(); - assert(rep != nullptr); - - const Status s = table->RetrieveBlock( - prefetch_buffer, read_options, rep->footer.index_handle(), - UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, - get_context, lookup_context, /* for_compaction */ false, use_cache); - - return s; -} - -Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( - bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) const { - assert(index_block != nullptr); - - if (!index_block_.IsEmpty()) { - index_block->SetUnownedValue(index_block_.GetValue()); - return Status::OK(); - } - - ReadOptions read_options; - if (no_io) { - read_options.read_tier = kBlockCacheTier; - } - - return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, - cache_index_blocks(), 
get_context, lookup_context, - index_block); -} - -// Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { - public: - // Read the partition index from the file and create an instance for - // `PartitionIndexReader`. - // On success, index_reader will be populated; otherwise it will remain - // unmodified. - static Status Create(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, - std::unique_ptr* index_reader) { - assert(table != nullptr); - assert(table->get_rep()); - assert(!pin || prefetch); - assert(index_reader != nullptr); - - CachableEntry index_block; - if (prefetch || !use_cache) { - const Status s = - ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, - /*get_context=*/nullptr, lookup_context, &index_block); - if (!s.ok()) { - return s; - } - - if (use_cache && !pin) { - index_block.Reset(); - } - } - - index_reader->reset( - new PartitionIndexReader(table, std::move(index_block))); - - return Status::OK(); - } - - // return a two-level iterator: first level is on the partition index - InternalIteratorBase* NewIterator( - const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context, - BlockCacheLookupContext* lookup_context) override { - const bool no_io = (read_options.read_tier == kBlockCacheTier); - CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); - if (!s.ok()) { - if (iter != nullptr) { - iter->Invalidate(s); - return iter; - } - - return NewErrorInternalIterator(s); - } - - InternalIteratorBase* it = nullptr; - - Statistics* kNullStats = nullptr; - // Filters are already checked before seeking the index - if (!partition_map_.empty()) { - // We don't return pinned data from index blocks, so no need - // to set 
`block_contents_pinned`. - it = NewTwoLevelIterator( - new BlockBasedTable::PartitionedIndexIteratorState(table(), - &partition_map_), - index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_has_first_key(), - index_key_includes_seq(), index_value_is_full())); - } else { - ReadOptions ro; - ro.fill_cache = read_options.fill_cache; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - it = new BlockBasedTableIterator( - table(), ro, *internal_comparator(), - index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_has_first_key(), - index_key_includes_seq(), index_value_is_full()), - false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, - lookup_context ? lookup_context->caller - : TableReaderCaller::kUncategorized); - } - - assert(it != nullptr); - index_block.TransferTo(it); - - return it; - - // TODO(myabandeh): Update TwoLevelIterator to be able to make use of - // on-stack BlockIter while the state is on heap. Currentlly it assumes - // the first level iter is always on heap and will attempt to delete it - // in its destructor. 
- } - - void CacheDependencies(bool pin) override { - // Before read partitions, prefetch them to avoid lots of IOs - BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; - const BlockBasedTable::Rep* rep = table()->rep_; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - - CachableEntry index_block; - Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, - &lookup_context, &index_block); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error retrieving top-level index block while trying to " - "cache index partitions: %s", - s.ToString().c_str()); - return; - } - - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), &biter, - kNullStats, true, index_has_first_key(), index_key_includes_seq(), - index_value_is_full()); - // Index partitions are assumed to be consecuitive. Prefetch them all. - // Read the first block offset - biter.SeekToFirst(); - if (!biter.Valid()) { - // Empty index. - return; - } - handle = biter.value().handle; - uint64_t prefetch_off = handle.offset(); - - // Read the last block's offset - biter.SeekToLast(); - if (!biter.Valid()) { - // Empty index. 
- return; - } - handle = biter.value().handle; - uint64_t last_off = handle.offset() + block_size(handle); - uint64_t prefetch_len = last_off - prefetch_off; - std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer); - s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, - static_cast(prefetch_len)); - - // After prefetch, read the partitions one by one - biter.SeekToFirst(); - auto ro = ReadOptions(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value().handle; - CachableEntry block; - // TODO: Support counter batch update for partitioned index and - // filter blocks - s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, - /*contents=*/nullptr); - - assert(s.ok() || block.GetValue() == nullptr); - if (s.ok() && block.GetValue() != nullptr) { - if (block.IsCached()) { - if (pin) { - partition_map_[handle.offset()] = std::move(block); - } - } - } - } - } - - size_t ApproximateMemoryUsage() const override { - size_t usage = ApproximateIndexBlockMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size(const_cast(this)); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - // TODO(myabandeh): more accurate estimate of partition_map_ mem usage - return usage; - } - - private: - PartitionIndexReader(const BlockBasedTable* t, - CachableEntry&& index_block) - : IndexReaderCommon(t, std::move(index_block)) {} - - std::unordered_map> partition_map_; -}; - -// Index that allows binary search lookup for the first key of each block. -// This class can be viewed as a thin wrapper for `Block` class which already -// supports binary search. -class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { - public: - // Read index from the file and create an intance for - // `BinarySearchIndexReader`. 
- // On success, index_reader will be populated; otherwise it will remain - // unmodified. - static Status Create(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, - std::unique_ptr* index_reader) { - assert(table != nullptr); - assert(table->get_rep()); - assert(!pin || prefetch); - assert(index_reader != nullptr); - - CachableEntry index_block; - if (prefetch || !use_cache) { - const Status s = - ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, - /*get_context=*/nullptr, lookup_context, &index_block); - if (!s.ok()) { - return s; - } - - if (use_cache && !pin) { - index_block.Reset(); - } - } - - index_reader->reset( - new BinarySearchIndexReader(table, std::move(index_block))); - - return Status::OK(); - } - - InternalIteratorBase* NewIterator( - const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context, - BlockCacheLookupContext* lookup_context) override { - const bool no_io = (read_options.read_tier == kBlockCacheTier); - CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); - if (!s.ok()) { - if (iter != nullptr) { - iter->Invalidate(s); - return iter; - } - - return NewErrorInternalIterator(s); - } - - Statistics* kNullStats = nullptr; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. 
- auto it = index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, true, index_has_first_key(), index_key_includes_seq(), - index_value_is_full()); - - assert(it != nullptr); - index_block.TransferTo(it); - - return it; - } - - size_t ApproximateMemoryUsage() const override { - size_t usage = ApproximateIndexBlockMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size(const_cast(this)); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - BinarySearchIndexReader(const BlockBasedTable* t, - CachableEntry&& index_block) - : IndexReaderCommon(t, std::move(index_block)) {} -}; - -// Index that leverages an internal hash table to quicken the lookup for a given -// key. -class HashIndexReader : public BlockBasedTable::IndexReaderCommon { - public: - static Status Create(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_index_iter, bool use_cache, - bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, - std::unique_ptr* index_reader) { - assert(table != nullptr); - assert(index_reader != nullptr); - assert(!pin || prefetch); - - const BlockBasedTable::Rep* rep = table->get_rep(); - assert(rep != nullptr); - - CachableEntry index_block; - if (prefetch || !use_cache) { - const Status s = - ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, - /*get_context=*/nullptr, lookup_context, &index_block); - if (!s.ok()) { - return s; - } - - if (use_cache && !pin) { - index_block.Reset(); - } - } - - // Note, failure to create prefix hash index does not need to be a - // hard error. We can still fall back to the original binary search index. - // So, Create will succeed regardless, from this point on. 
- - index_reader->reset(new HashIndexReader(table, std::move(index_block))); - - // Get prefixes block - BlockHandle prefixes_handle; - Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, - &prefixes_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - // Get index metadata block - BlockHandle prefixes_meta_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, - &prefixes_meta_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - RandomAccessFileReader* const file = rep->file.get(); - const Footer& footer = rep->footer; - const ImmutableCFOptions& ioptions = rep->ioptions; - const PersistentCacheOptions& cache_options = rep->persistent_cache_options; - MemoryAllocator* const memory_allocator = - GetMemoryAllocator(rep->table_options); - - // Read contents for the blocks - BlockContents prefixes_contents; - BlockFetcher prefixes_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - s = prefixes_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - return s; - } - BlockContents prefixes_meta_contents; - BlockFetcher prefixes_meta_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, - &prefixes_meta_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, BlockType::kHashIndexMetadata, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - s = prefixes_meta_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - BlockPrefixIndex* prefix_index = nullptr; - assert(rep->internal_prefix_transform.get() != nullptr); - s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), - prefixes_contents.data, - prefixes_meta_contents.data, &prefix_index); - // TODO: log 
error - if (s.ok()) { - HashIndexReader* const hash_index_reader = - static_cast(index_reader->get()); - hash_index_reader->prefix_index_.reset(prefix_index); - } - - return Status::OK(); - } - - InternalIteratorBase* NewIterator( - const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* iter, GetContext* get_context, - BlockCacheLookupContext* lookup_context) override { - const bool no_io = (read_options.read_tier == kBlockCacheTier); - CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); - if (!s.ok()) { - if (iter != nullptr) { - iter->Invalidate(s); - return iter; - } - - return NewErrorInternalIterator(s); - } - - Statistics* kNullStats = nullptr; - const bool total_order_seek = - read_options.total_order_seek || disable_prefix_seek; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - auto it = index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, total_order_seek, index_has_first_key(), - index_key_includes_seq(), index_value_is_full(), - false /* block_contents_pinned */, prefix_index_.get()); - - assert(it != nullptr); - index_block.TransferTo(it); - - return it; - } - - size_t ApproximateMemoryUsage() const override { - size_t usage = ApproximateIndexBlockMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size(const_cast(this)); -#else - if (prefix_index_) { - usage += prefix_index_->ApproximateMemoryUsage(); - } - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - HashIndexReader(const BlockBasedTable* t, CachableEntry&& index_block) - : IndexReaderCommon(t, std::move(index_block)) {} - - std::unique_ptr prefix_index_; -}; - void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const { - Statistics* const statistics = 
rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; PERF_COUNTER_ADD(block_cache_hit_count, 1); PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, @@ -848,7 +213,7 @@ void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const { - Statistics* const statistics = rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; // TODO: introduce aggregate (not per-level) block cache miss count PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, @@ -898,17 +263,21 @@ } } -void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, - size_t usage) const { - Statistics* const statistics = rep_->ioptions.statistics; - +void BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType block_type, GetContext* get_context, size_t usage, bool redundant, + Statistics* const statistics) { // TODO: introduce perf counters for block cache insertions if (get_context) { ++get_context->get_context_stats_.num_cache_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_add_redundant; + } get_context->get_context_stats_.num_cache_bytes_write += usage; } else { RecordTick(statistics, BLOCK_CACHE_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); } @@ -916,9 +285,15 @@ case BlockType::kFilter: if (get_context) { ++get_context->get_context_stats_.num_cache_filter_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_filter_add_redundant; + } get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); } break; @@ -926,10 +301,17 @@ case BlockType::kCompressionDictionary: if (get_context) { 
++get_context->get_context_stats_.num_cache_compression_dict_add; + if (redundant) { + ++get_context->get_context_stats_ + .num_cache_compression_dict_add_redundant; + } get_context->get_context_stats_ .num_cache_compression_dict_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); } @@ -938,9 +320,15 @@ case BlockType::kIndex: if (get_context) { ++get_context->get_context_stats_.num_cache_index_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_index_add_redundant; + } get_context->get_context_stats_.num_cache_index_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); } break; @@ -950,9 +338,15 @@ // for range tombstones if (get_context) { ++get_context->get_context_stats_.num_cache_data_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_data_add_redundant; + } get_context->get_context_stats_.num_cache_data_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); } break; @@ -960,9 +354,17 @@ } Cache::Handle* BlockBasedTable::GetEntryFromCache( - Cache* block_cache, const Slice& key, BlockType block_type, - GetContext* get_context) const { - auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + const CacheTier& cache_tier, Cache* block_cache, const Slice& key, + BlockType block_type, const bool wait, GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, Cache::Priority priority) const { + Cache::Handle* cache_handle = 
nullptr; + if (cache_tier == CacheTier::kNonVolatileBlockTier) { + cache_handle = block_cache->Lookup(key, cache_helper, create_cb, priority, + wait, rep_->ioptions.statistics.get()); + } else { + cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics.get()); + } if (cache_handle != nullptr) { UpdateCacheHitMetrics(block_type, get_context, @@ -974,51 +376,21 @@ return cache_handle; } -// Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { - assert(kMaxCacheKeyPrefixSize >= 10); - rep->cache_key_prefix_size = 0; - rep->compressed_cache_key_prefix_size = 0; - if (rep->table_options.block_cache != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), - &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); - } - if (rep->table_options.persistent_cache != nullptr) { - GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), - &rep->persistent_cache_key_prefix[0], - &rep->persistent_cache_key_prefix_size); - } - if (rep->table_options.block_cache_compressed != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), - rep->file->file(), &rep->compressed_cache_key_prefix[0], - &rep->compressed_cache_key_prefix_size); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. - if (cc != nullptr && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. 
- if (cc != nullptr && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); +template +Status BlockBasedTable::InsertEntryToCache( + const CacheTier& cache_tier, Cache* block_cache, const Slice& key, + const Cache::CacheItemHelper* cache_helper, + std::unique_ptr& block_holder, size_t charge, + Cache::Handle** cache_handle, Cache::Priority priority) const { + Status s = Status::OK(); + if (cache_tier == CacheTier::kNonVolatileBlockTier) { + s = block_cache->Insert(key, block_holder.get(), cache_helper, charge, + cache_handle, priority); + } else { + s = block_cache->Insert(key, block_holder.get(), charge, + cache_helper->del_cb, cache_handle, priority); } + return s; } namespace { @@ -1115,46 +487,110 @@ } } // namespace -Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, - size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = - EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); - return Slice(cache_key, static_cast(end - cache_key)); +void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, + const std::string& cur_db_session_id, + uint64_t cur_file_number, + uint64_t file_size, + OffsetableCacheKey* out_base_cache_key, + bool* out_is_stable) { + // Use a stable cache key if sufficient data is in table properties + std::string db_session_id; + uint64_t file_num; + std::string db_id; + if (properties && !properties->db_session_id.empty() && + properties->orig_file_number > 0) { + // (Newer SST file case) + // We must have both properties to get a stable unique id because + // CreateColumnFamilyWithImport or IngestExternalFiles can change the + // file numbers on a file. 
+ db_session_id = properties->db_session_id; + file_num = properties->orig_file_number; + // Less critical, populated in earlier release than above + db_id = properties->db_id; + if (out_is_stable) { + *out_is_stable = true; + } + } else { + // (Old SST file case) + // We use (unique) cache keys based on current identifiers. These are at + // least stable across table file close and re-open, but not across + // different DBs nor DB close and re-open. + db_session_id = cur_db_session_id; + file_num = cur_file_number; + // Plumbing through the DB ID to here would be annoying, and of limited + // value because of the case of VersionSet::Recover opening some table + // files and later setting the DB ID. So we just rely on uniqueness + // level provided by session ID. + db_id = "unknown"; + if (out_is_stable) { + *out_is_stable = false; + } + } + + // Too many tests to update to get these working + // assert(file_num > 0); + // assert(!db_session_id.empty()); + // assert(!db_id.empty()); + + // Minimum block size is 5 bytes; therefore we can trim off two lower bits + // from offets. See GetCacheKey. + *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num, + /*max_offset*/ file_size >> 2); +} + +CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key, + const BlockHandle& handle) { + // Minimum block size is 5 bytes; therefore we can trim off two lower bits + // from offet. 
+ return base_cache_key.WithOffset(handle.offset() >> 2); } Status BlockBasedTable::Open( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, + const ReadOptions& read_options, const ImmutableOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, + const std::shared_ptr& prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, const int level, const bool immortal_table, - const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, - BlockCacheTracer* const block_cache_tracer) { + const SequenceNumber largest_seqno, const bool force_direct_prefetch, + TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer, + size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id, + uint64_t cur_file_num) { table_reader->reset(); Status s; Footer footer; std::unique_ptr prefetch_buffer; + // Only retain read_options.deadline and read_options.io_timeout. + // In future, we may retain more + // options. Specifically, w ignore verify_checksums and default to + // checksum verification anyway when creating the index and filter + // readers. 
+ ReadOptions ro; + ro.deadline = read_options.deadline; + ro.io_timeout = read_options.io_timeout; + // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; const bool preload_all = !table_options.cache_index_and_filter_blocks; if (!ioptions.allow_mmap_reads) { - s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, - preload_all, &prefetch_buffer); + s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, + tail_prefetch_stats, prefetch_all, preload_all, + &prefetch_buffer); + // Return error in prefetch path to users. + if (!s.ok()) { + return s; + } } else { // Should not prefetch for mmap mode. prefetch_buffer.reset(new FilePrefetchBuffer( - nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); + 0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */, + true /* track_min_offset */)); } // Read in the following order: @@ -1165,12 +601,16 @@ // 5. [meta block: compression dictionary] // 6. [meta block: index] // 7. [meta block: filter] - s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, - kBlockBasedTableMagicNumber); + IOOptions opts; + s = file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size, + &footer, kBlockBasedTableMagicNumber); + } if (!s.ok()) { return s; } - if (!BlockBasedTableSupportedVersion(footer.version())) { + if (!IsSupportedFormatVersion(footer.format_version())) { return Status::Corruption( "Unknown Footer version. Maybe this file was created with newer " "version of RocksDB?"); @@ -1182,8 +622,8 @@ // access a dangling pointer. 
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, - internal_comparator, skip_filters, level, - immortal_table); + internal_comparator, skip_filters, + file_size, level, immortal_table); rep->file = std::move(file); rep->footer = footer; rep->hash_index_allow_collision = table_options.hash_index_allow_collision; @@ -1191,18 +631,13 @@ // handle prefix correctly. if (prefix_extractor != nullptr) { rep->internal_prefix_transform.reset( - new InternalKeySliceTransform(prefix_extractor)); + new InternalKeySliceTransform(prefix_extractor.get())); } - SetupCacheKeyPrefix(rep); - std::unique_ptr new_table( - new BlockBasedTable(rep, block_cache_tracer)); - // page cache options - rep->persistent_cache_options = - PersistentCacheOptions(rep->table_options.persistent_cache, - std::string(rep->persistent_cache_key_prefix, - rep->persistent_cache_key_prefix_size), - rep->ioptions.statistics); + // For fully portable/stable cache keys, we need to read the properties + // block before setting up cache keys. TODO: consider setting up a bootstrap + // cache key for PersistentCache to use for metaindex and properties blocks. + rep->persistent_cache_options = PersistentCacheOptions(); // Meta-blocks are not dictionary compressed. Explicitly set the dictionary // handle to null, otherwise it may be seen as uninitialized during the below @@ -1210,9 +645,11 @@ rep->compression_dict_handle = BlockHandle::NullBlockHandle(); // Read metaindex + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex, + s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex, &metaindex_iter); if (!s.ok()) { return s; @@ -1220,19 +657,54 @@ // Populates table_properties and some fields that depend on it, // such as index_type. 
- s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), + s = new_table->ReadPropertiesBlock(ro, prefetch_buffer.get(), metaindex_iter.get(), largest_seqno); if (!s.ok()) { return s; } - s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(), - internal_comparator, &lookup_context); + if (!PrefixExtractorChangedHelper(rep->table_properties.get(), + prefix_extractor.get())) { + // Establish fast path for unchanged prefix_extractor + rep->table_prefix_extractor = prefix_extractor; + } else { + // Current prefix_extractor doesn't match table +#ifndef ROCKSDB_LITE + if (rep->table_properties) { + //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions + // will need to use it + ConfigOptions config_options; + Status st = SliceTransform::CreateFromString( + config_options, rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + if (!st.ok()) { + //**TODO: Should this be error be returned or swallowed? + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Failed to create prefix extractor[%s]: %s", + rep->table_properties->prefix_extractor_name.c_str(), + st.ToString().c_str()); + } + } +#endif // ROCKSDB_LITE + } + + // With properties loaded, we can set up portable/stable cache keys + SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id, + cur_file_num, file_size, &rep->base_cache_key); + + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + rep->base_cache_key, rep->ioptions.stats); + + s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(), + metaindex_iter.get(), internal_comparator, + &lookup_context); if (!s.ok()) { return s; } s = new_table->PrefetchIndexAndFilterBlocks( - prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), - prefetch_all, table_options, level, &lookup_context); + ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), + prefetch_all, table_options, level, file_size, + max_file_size_for_l0_meta_pin, 
&lookup_context); if (s.ok()) { // Update tail prefetch stats @@ -1250,9 +722,9 @@ } Status BlockBasedTable::PrefetchTail( - RandomAccessFileReader* file, uint64_t file_size, - TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, - const bool preload_all, + const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, + bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, + const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer) { size_t tail_prefetch_size = 0; if (tail_prefetch_stats != nullptr) { @@ -1280,121 +752,58 @@ } TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", &tail_prefetch_size); - Status s; - // TODO should not have this special logic in the future. - if (!file->use_direct_io()) { - prefetch_buffer->reset(new FilePrefetchBuffer( - nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); - s = file->Prefetch(prefetch_off, prefetch_len); - } else { - prefetch_buffer->reset(new FilePrefetchBuffer( - nullptr, 0, 0, true /* enable */, true /* track_min_offset */)); - s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); - } - return s; -} -Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, - uint32_t expected) { - Status s; - uint32_t actual = 0; - switch (type) { - case kNoChecksum: - break; - case kCRC32c: - expected = crc32c::Unmask(expected); - actual = crc32c::Value(buf, len); - break; - case kxxHash: - actual = XXH32(buf, static_cast(len), 0); - break; - case kxxHash64: - actual = static_cast(XXH64(buf, static_cast(len), 0) & - uint64_t{0xffffffff}); - break; - default: - s = Status::Corruption("unknown checksum type"); - } - if (s.ok() && actual != expected) { - s = Status::Corruption("properties block checksum mismatched"); + // Try file system prefetch + if (!file->use_direct_io() && !force_direct_prefetch) { + if (!file->Prefetch(prefetch_off, prefetch_len).IsNotSupported()) { + prefetch_buffer->reset(new 
FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, + false /* enable */, true /* track_min_offset */)); + return Status::OK(); + } } - return s; -} -Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( - FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties) { - assert(table_properties != nullptr); - // If this is an external SST file ingested with write_global_seqno set to - // true, then we expect the checksum mismatch because checksum was written - // by SstFileWriter, but its global seqno in the properties block may have - // been changed during ingestion. In this case, we read the properties - // block, copy it to a memory buffer, change the global seqno to its - // original value, i.e. 0, and verify the checksum again. - BlockHandle props_block_handle; - CacheAllocationPtr tmp_buf; - Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, - rep_->footer, rep_->ioptions, table_properties, - false /* verify_checksum */, &props_block_handle, - &tmp_buf, false /* compression_type_missing */, - nullptr /* memory_allocator */); - if (s.ok() && tmp_buf) { - const auto seqno_pos_iter = - (*table_properties) - ->properties_offsets.find( - ExternalSstFilePropertyNames::kGlobalSeqno); - size_t block_size = static_cast(props_block_handle.size()); - if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { - uint64_t global_seqno_offset = seqno_pos_iter->second; - EncodeFixed64( - tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); - } - uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); - s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(), - tmp_buf.get(), block_size + 1, value); + // Use `FilePrefetchBuffer` + prefetch_buffer->reset( + new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */, + true /* enable */, true /* track_min_offset */)); + IOOptions opts; + Status s = file->PrepareIOOptions(ro, 
opts); + if (s.ok()) { + s = (*prefetch_buffer)->Prefetch(opts, file, prefetch_off, prefetch_len); } return s; } Status BlockBasedTable::ReadPropertiesBlock( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const SequenceNumber largest_seqno) { - bool found_properties_block = true; + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, const SequenceNumber largest_seqno) { Status s; - s = SeekToPropertiesBlock(meta_iter, &found_properties_block); + BlockHandle handle; + s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle); if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Error when seeking to properties block from file: %s", s.ToString().c_str()); - } else if (found_properties_block) { + } else if (!handle.IsNull()) { s = meta_iter->status(); - TableProperties* table_properties = nullptr; + std::unique_ptr table_properties; if (s.ok()) { - s = ReadProperties( - meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, - rep_->ioptions, &table_properties, true /* verify_checksum */, - nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, - false /* compression_type_missing */, nullptr /* memory_allocator */); - } - - if (s.IsCorruption()) { - s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), - &table_properties); - } - std::unique_ptr props_guard; - if (table_properties != nullptr) { - props_guard.reset(table_properties); + s = ReadTablePropertiesHelper( + ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, nullptr /* memory_allocator */); } + IGNORE_STATUS_IF_ERROR(s); if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Encountered error while reading data from properties " "block %s", s.ToString().c_str()); } else { assert(table_properties != nullptr); - rep_->table_properties.reset(props_guard.release()); 
+ rep_->table_properties = std::move(table_properties); rep_->blocks_maybe_compressed = rep_->table_properties->compression_name != CompressionTypeToString(kNoCompression); @@ -1405,26 +814,19 @@ CompressionTypeToString(kZSTDNotFinalCompression)); } } else { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Cannot find Properties block from file."); } -#ifndef ROCKSDB_LITE - if (rep_->table_properties) { - ParseSliceTransform(rep_->table_properties->prefix_extractor_name, - &(rep_->table_prefix_extractor)); - } -#endif // ROCKSDB_LITE // Read the table properties, if provided. if (rep_->table_properties) { rep_->whole_key_filtering &= IsFeatureSupported(*(rep_->table_properties), BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep_->ioptions.info_log); - rep_->prefix_filtering &= - IsFeatureSupported(*(rep_->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, - rep_->ioptions.info_log); + rep_->ioptions.logger); + rep_->prefix_filtering &= IsFeatureSupported( + *(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger); rep_->index_key_includes_seq = rep_->table_properties->index_key_is_user_key == 0; @@ -1447,27 +849,26 @@ s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, &(rep_->global_seqno)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); } } return s; } Status BlockBasedTable::ReadRangeDelBlock( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const ReadOptions& read_options, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context) { Status s; - bool found_range_del_block; BlockHandle range_del_handle; - s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); + s = 
FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Error when seeking to range delete tombstones block from file: %s", s.ToString().c_str()); - } else if (found_range_del_block && !range_del_handle.IsNull()) { - ReadOptions read_options; + } else if (!range_del_handle.IsNull()) { std::unique_ptr iter(NewDataBlockIterator( read_options, range_del_handle, /*input_iter=*/nullptr, BlockType::kRangeDeletion, @@ -1476,9 +877,10 @@ s = iter->status(); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Encountered error while reading data from range del block %s", s.ToString().c_str()); + IGNORE_STATUS_IF_ERROR(s); } else { rep_->fragmented_range_dels = std::make_shared(std::move(iter), @@ -1489,9 +891,10 @@ } Status BlockBasedTable::PrefetchIndexAndFilterBlocks( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, bool prefetch_all, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, const BlockBasedTableOptions& table_options, const int level, + size_t file_size, size_t max_file_size_for_l0_meta_pin, BlockCacheLookupContext* lookup_context) { Status s; @@ -1523,11 +926,13 @@ } } } + // Partition filters cannot be enabled without partition indexes + assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter || + rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // Find compression dictionary handle - bool found_compression_dict = false; - s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep_->compression_dict_handle); + s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName, + &rep_->compression_dict_handle); if (!s.ok()) { return s; } @@ -1536,22 +941,58 @@ const bool use_cache = table_options.cache_index_and_filter_blocks; - // pin both index and 
filters, down to all partitions - const bool pin_all = - rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + const bool maybe_flushed = + level == 0 && file_size <= max_file_size_for_l0_meta_pin; + std::function is_pinned = + [maybe_flushed, &is_pinned](PinningTier pinning_tier, + PinningTier fallback_pinning_tier) { + // Fallback to fallback would lead to infinite recursion. Disallow it. + assert(fallback_pinning_tier != PinningTier::kFallback); + + switch (pinning_tier) { + case PinningTier::kFallback: + return is_pinned(fallback_pinning_tier, + PinningTier::kNone /* fallback_pinning_tier */); + case PinningTier::kNone: + return false; + case PinningTier::kFlushedAndSimilar: + return maybe_flushed; + case PinningTier::kAll: + return true; + }; + + // In GCC, this is needed to suppress `control reaches end of non-void + // function [-Werror=return-type]`. + assert(false); + return false; + }; + const bool pin_top_level_index = is_pinned( + table_options.metadata_cache_options.top_level_index_pinning, + table_options.pin_top_level_index_and_filter ? PinningTier::kAll + : PinningTier::kNone); + const bool pin_partition = + is_pinned(table_options.metadata_cache_options.partition_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? PinningTier::kFlushedAndSimilar + : PinningTier::kNone); + const bool pin_unpartitioned = + is_pinned(table_options.metadata_cache_options.unpartitioned_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? 
PinningTier::kFlushedAndSimilar + : PinningTier::kNone); - // prefetch the first level of index - const bool prefetch_index = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // pin the first level of index const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of index + // WART: this might be redundant (unnecessary cache hit) if !pin_index, + // depending on prepopulate_block_cache option + const bool prefetch_index = prefetch_all || pin_index; std::unique_ptr index_reader; - s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache, prefetch_index, pin_index, lookup_context, &index_reader); if (!s.ok()) { @@ -1563,41 +1004,45 @@ // The partitions of partitioned index are always stored in cache. 
They // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks - if (prefetch_all) { - rep_->index_reader->CacheDependencies(pin_all); + if (prefetch_all || pin_partition) { + s = rep_->index_reader->CacheDependencies(ro, pin_partition); + } + if (!s.ok()) { + return s; } - // prefetch the first level of filter - const bool prefetch_filter = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); - // Partition fitlers cannot be enabled without partition indexes - assert(!prefetch_filter || prefetch_index); // pin the first level of filter const bool pin_filter = - pin_all || (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); + rep_->filter_type == Rep::FilterType::kPartitionedFilter + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of filter + // WART: this might be redundant (unnecessary cache hit) if !pin_filter, + // depending on prepopulate_block_cache option + const bool prefetch_filter = prefetch_all || pin_filter; if (rep_->filter_policy) { auto filter = new_table->CreateFilterBlockReader( - prefetch_buffer, use_cache, prefetch_filter, pin_filter, + ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter, lookup_context); + if (filter) { // Refer to the comment above about paritioned indexes always being cached - if (prefetch_all) { - filter->CacheDependencies(pin_all); + if (prefetch_all || pin_partition) { + s = filter->CacheDependencies(ro, pin_partition); + if (!s.ok()) { + return s; + } } - rep_->filter = std::move(filter); } } if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr uncompression_dict_reader; - s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache, - prefetch_all, pin_all, lookup_context, - &uncompression_dict_reader); + s = UncompressionDictReader::Create( + this, ro, prefetch_buffer, 
use_cache, prefetch_all || pin_unpartitioned, + pin_unpartitioned, lookup_context, &uncompression_dict_reader); if (!s.ok()) { return s; } @@ -1650,23 +1095,23 @@ // metaindex // block and its iterator. Status BlockBasedTable::ReadMetaIndexBlock( - FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block, std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. std::unique_ptr metaindex; Status s = ReadBlockFromFile( - rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->file.get(), prefetch_buffer, rep_->footer, ro, rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, - GetMemoryAllocator(rep_->table_options), false /* for_compaction */, - rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */); + 0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options), + false /* for_compaction */, rep_->blocks_definitely_zstd_compressed, + nullptr /* filter_policy */); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Encountered error while reading data from properties" " block %s", s.ToString().c_str()); @@ -1675,33 +1120,48 @@ *metaindex_block = std::move(metaindex); // meta block uses bytewise comparator. 
- iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(), - BytewiseComparator())); + iter->reset(metaindex_block->get()->NewMetaIterator()); return Status::OK(); } template Status BlockBasedTable::GetDataBlockFromCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, + const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const { + const bool wait, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0; assert(block); assert(block->IsEmpty()); + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW; Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; + Statistics* statistics = rep_->ioptions.statistics.get(); + bool using_zstd = rep_->blocks_definitely_zstd_compressed; + const FilterPolicy* filter_policy = rep_->filter_policy; + Cache::CreateCallback create_cb = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); // Lookup uncompressed cache first if (block_cache != nullptr) { - auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, - block_type, get_context); + assert(!cache_key.empty()); + Cache::Handle* cache_handle = nullptr; + cache_handle = GetEntryFromCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + block_type, wait, get_context, + BlocklikeTraits::GetCacheItemHelper(block_type), create_cb, + priority); if (cache_handle != nullptr) { block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), @@ -1717,11 +1177,20 @@ return s; } - assert(!compressed_block_cache_key.empty()); - block_cache_compressed_handle = - block_cache_compressed->Lookup(compressed_block_cache_key); - - Statistics* statistics = rep_->ioptions.statistics; + assert(!cache_key.empty()); + BlockContents contents; + if (rep_->ioptions.lowest_used_cache_tier == + CacheTier::kNonVolatileBlockTier) { + Cache::CreateCallback create_cb_special = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); + block_cache_compressed_handle = block_cache_compressed->Lookup( + cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + create_cb_special, priority, true); + } else { + block_cache_compressed_handle = + block_cache_compressed->Lookup(cache_key, statistics); + } // if we found in the compressed cache, then uncompress and insert into // uncompressed cache @@ -1734,11 +1203,10 @@ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); compressed_block = reinterpret_cast( 
block_cache_compressed->Value(block_cache_compressed_handle)); - CompressionType compression_type = compressed_block->get_compression_type(); + CompressionType compression_type = GetBlockCompressionType(*compressed_block); assert(compression_type != kNoCompression); // Retrieve the uncompressed contents into a new buffer - BlockContents contents; UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); s = UncompressBlockContents( @@ -1746,12 +1214,12 @@ &contents, rep_->table_options.format_version, rep_->ioptions, GetMemoryAllocator(rep_->table_options)); - // Insert uncompressed block into block cache + // Insert uncompressed block into block cache, the priority is based on the + // data block type. if (s.ok()) { std::unique_ptr block_holder( BlocklikeTraits::Create( - std::move(contents), rep_->get_global_seqno(block_type), - read_amp_bytes_per_bit, statistics, + std::move(contents), read_amp_bytes_per_bit, statistics, rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get())); // uncompressed block @@ -1759,14 +1227,17 @@ read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle); + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + block_holder, charge, &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - UpdateCacheInsertionMetrics(block_type, get_context, charge); + UpdateCacheInsertionMetrics(block_type, get_context, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1782,14 +1253,13 @@ template Status BlockBasedTable::PutDataBlockToCache( - const Slice& 
block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, + const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, CachableEntry* cached_block, BlockContents* raw_block_contents, CompressionType raw_block_comp_type, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + const UncompressionDict& uncompression_dict, MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const { - const ImmutableCFOptions& ioptions = rep_->ioptions; + const ImmutableOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; const size_t read_amp_bytes_per_bit = block_type == BlockType::kData @@ -1806,7 +1276,7 @@ assert(cached_block->IsEmpty()); Status s; - Statistics* statistics = ioptions.statistics; + Statistics* statistics = ioptions.stats; std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { @@ -1823,13 +1293,13 @@ } block_holder.reset(BlocklikeTraits::Create( - std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + std::move(uncompressed_block_contents), read_amp_bytes_per_bit, statistics, rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get())); } else { block_holder.reset(BlocklikeTraits::Create( - std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, - statistics, rep_->blocks_definitely_zstd_compressed, + std::move(*raw_block_contents), read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get())); } @@ -1838,24 +1308,28 @@ if (block_cache_compressed != nullptr && raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && raw_block_contents->own_bytes()) { -#ifndef NDEBUG assert(raw_block_contents->is_raw_block); -#endif // NDEBUG + assert(!cache_key.empty()); // We cannot directly put raw_block_contents because this could point to // an object in the stack. 
- BlockContents* block_cont_for_comp_cache = - new BlockContents(std::move(*raw_block_contents)); - s = block_cache_compressed->Insert( - compressed_block_cache_key, block_cont_for_comp_cache, - block_cont_for_comp_cache->ApproximateMemoryUsage(), - &DeleteCachedEntry); + std::unique_ptr block_cont_for_comp_cache( + new BlockContents(std::move(*raw_block_contents))); + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache_compressed, + cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), nullptr, + Cache::Priority::LOW); + + BlockContents* block_cont_raw_ptr = block_cont_for_comp_cache.release(); if (s.ok()) { // Avoid the following code to delete this cached block. RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); } else { RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); - delete block_cont_for_comp_cache; + delete block_cont_raw_ptr; } } @@ -1863,15 +1337,17 @@ if (block_cache != nullptr && block_holder->own_bytes()) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle, - priority); + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + block_holder, charge, &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); cached_block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - UpdateCacheInsertionMetrics(block_type, get_context, charge); + UpdateCacheInsertionMetrics(block_type, get_context, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1883,8 +1359,8 @@ } std::unique_ptr BlockBasedTable::CreateFilterBlockReader( - FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, - bool pin, 
BlockCacheLookupContext* lookup_context) { + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, BlockCacheLookupContext* lookup_context) { auto& rep = rep_; auto filter_type = rep->filter_type; if (filter_type == Rep::FilterType::kNoFilter) { @@ -1896,14 +1372,14 @@ switch (filter_type) { case Rep::FilterType::kPartitionedFilter: return PartitionedFilterBlockReader::Create( - this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); case Rep::FilterType::kBlockFilter: return BlockBasedFilterBlockReader::Create( - this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); case Rep::FilterType::kFullFilter: - return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache, + return FullFilterBlockReader::Create(this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); default: @@ -1930,195 +1406,25 @@ lookup_context); } -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -// If input_iter is null, new a iterator -// If input_iter is not null, update this iter and return it -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, - BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context, Status s, - FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { - PERF_TIMER_GUARD(new_table_block_iter_nanos); - - TBlockIter* iter = input_iter != nullptr ? 
input_iter : new TBlockIter; - if (!s.ok()) { - iter->Invalidate(s); - return iter; - } - - CachableEntry uncompression_dict; - if (rep_->uncompression_dict_reader) { - const bool no_io = (ro.read_tier == kBlockCacheTier); - s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - prefetch_buffer, no_io, get_context, lookup_context, - &uncompression_dict); - if (!s.ok()) { - iter->Invalidate(s); - return iter; - } - } - - const UncompressionDict& dict = uncompression_dict.GetValue() - ? *uncompression_dict.GetValue() - : UncompressionDict::GetEmptyDict(); - - CachableEntry block; - s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, - get_context, lookup_context, for_compaction, - /* use_cache */ true); - - if (!s.ok()) { - assert(block.IsEmpty()); - iter->Invalidate(s); - return iter; - } - - assert(block.GetValue() != nullptr); - - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. - const bool block_contents_pinned = - block.IsCached() || - (!block.GetValue()->own_bytes() && rep_->immortal_table); - iter = InitBlockIterator(rep_, block.GetValue(), iter, - block_contents_pinned); - - if (!block.IsCached()) { - if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache* const block_cache = rep_->table_options.block_cache.get(); - Cache::Handle* cache_handle = nullptr; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep_->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep_->cache_key_prefix_size != 0); - assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - const Slice unique_key(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); - - if (s.ok()) { - assert(cache_handle != nullptr); - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } else { - iter->SetCacheHandle(block.GetCacheHandle()); - } - - block.TransferTo(iter); - - return iter; -} - template <> DataBlockIter* BlockBasedTable::InitBlockIterator( - const Rep* rep, Block* block, DataBlockIter* input_iter, - bool block_contents_pinned) { - return block->NewDataIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - input_iter, rep->ioptions.statistics, block_contents_pinned); + const Rep* rep, Block* block, BlockType block_type, + DataBlockIter* input_iter, bool block_contents_pinned) { + return block->NewDataIterator(rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, + rep->ioptions.stats, block_contents_pinned); } template <> IndexBlockIter* BlockBasedTable::InitBlockIterator( - const Rep* rep, Block* block, IndexBlockIter* input_iter, - bool block_contents_pinned) { + const Rep* rep, Block* block, 
BlockType block_type, + IndexBlockIter* input_iter, bool block_contents_pinned) { return block->NewIndexIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - input_iter, rep->ioptions.statistics, /* total_order_seek */ true, - rep->index_has_first_key, rep->index_key_includes_seq, - rep->index_value_is_full, block_contents_pinned); -} - -// Convert an uncompressed data block (i.e CachableEntry) -// into an iterator over the contents of the corresponding block. -// If input_iter is null, new a iterator -// If input_iter is not null, update this iter and return it -template -TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, - CachableEntry& block, - TBlockIter* input_iter, - Status s) const { - PERF_TIMER_GUARD(new_table_block_iter_nanos); - - TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; - if (!s.ok()) { - iter->Invalidate(s); - return iter; - } - - assert(block.GetValue() != nullptr); - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. 
- const bool block_contents_pinned = - block.IsCached() || - (!block.GetValue()->own_bytes() && rep_->immortal_table); - iter = InitBlockIterator(rep_, block.GetValue(), iter, - block_contents_pinned); - - if (!block.IsCached()) { - if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache* const block_cache = rep_->table_options.block_cache.get(); - Cache::Handle* cache_handle = nullptr; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep_->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep_->cache_key_prefix_size != 0); - assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - const Slice unique_key(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); - if (s.ok()) { - assert(cache_handle != nullptr); - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } else { - iter->SetCacheHandle(block.GetCacheHandle()); - } - - block.TransferTo(iter); - return iter; + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, + /* total_order_seek */ true, rep->index_has_first_key, + 
rep->index_key_includes_seq, rep->index_value_is_full, + block_contents_pinned); } // If contents is nullptr, this function looks up the block caches for the @@ -2130,54 +1436,54 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const bool wait, const bool for_compaction, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); - // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = - rep_->immortal_table ? nullptr - : rep_->table_options.block_cache_compressed.get(); + rep_->table_options.block_cache_compressed.get(); // First, try to get the block from the cache // // If either block cache is enabled, we'll try to read from it. 
Status s; - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice key /* key to the block cache */; - Slice ckey /* key to the compressed block cache */; + CacheKey key_data; + Slice key; bool is_cache_hit = false; if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache - if (block_cache != nullptr) { - key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - handle, cache_key); - } - - if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(rep_->compressed_cache_key_prefix, - rep_->compressed_cache_key_prefix_size, handle, - compressed_cache_key); - } + key_data = GetCacheKey(rep_->base_cache_key, handle); + key = key_data.AsSlice(); if (!contents) { - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - ro, block_entry, uncompression_dict, block_type, - get_context); - if (block_entry->GetValue()) { + s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro, + block_entry, uncompression_dict, block_type, + wait, get_context); + // Value could still be null at this point, so check the cache handle + // and update the read pattern for prefetching + if (block_entry->GetValue() || block_entry->GetCacheHandle()) { // TODO(haoyu): Differentiate cache hit on uncompressed block cache and // compressed block cache. is_cache_hit = true; + if (prefetch_buffer) { + // Update the block details so that PrefetchBuffer can use the read + // pattern to determine if reads are sequential or not for + // prefetching. It should also take in account blocks read from cache. + prefetch_buffer->UpdateReadPattern(handle.offset(), + BlockSizeWithTrailer(handle), + ro.adaptive_readahead); + } } } // Can't find the block from the cache. If I/O is allowed, read from the // file. 
- if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep_->ioptions.statistics; + if (block_entry->GetValue() == nullptr && + block_entry->GetCacheHandle() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.stats; const bool maybe_compressed = block_type != BlockType::kFilter && block_type != BlockType::kCompressionDictionary && @@ -2186,7 +1492,9 @@ CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { - StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS + : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, statistics, histogram); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &raw_block_contents, rep_->ioptions, do_uncompress, @@ -2197,17 +1505,31 @@ s = block_fetcher.ReadBlockContents(); raw_block_comp_type = block_fetcher.get_compression_type(); contents = &raw_block_contents; + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++get_context->get_context_stats_.num_index_read; + break; + case BlockType::kFilter: + ++get_context->get_context_stats_.num_filter_read; + break; + case BlockType::kData: + ++get_context->get_context_stats_.num_data_read; + break; + default: + break; + } + } } else { - raw_block_comp_type = contents->get_compression_type(); + raw_block_comp_type = GetBlockCompressionType(*contents); } if (s.ok()) { - SequenceNumber seq_no = rep_->get_global_seqno(block_type); // If filling cache is allowed and a cache is configured, try to put the // block to the cache. 
s = PutDataBlockToCache( - key, ckey, block_cache, block_cache_compressed, block_entry, - contents, raw_block_comp_type, uncompression_dict, seq_no, + key, block_cache, block_cache_compressed, block_entry, contents, + raw_block_comp_type, uncompression_dict, GetMemoryAllocator(rep_->table_options), block_type, get_context); } } @@ -2261,7 +1583,7 @@ // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", trace_block_type, /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -2269,9 +1591,11 @@ no_insert, lookup_context->get_id, lookup_context->get_from_user_specified_snapshot, /*referenced_key=*/""); - block_cache_tracer_->WriteBlockAccess(access_record, key, - rep_->cf_name_for_tracing(), - lookup_context->referenced_key); + // TODO: Should handle this error? + block_cache_tracer_ + ->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), + lookup_context->referenced_key) + .PermitUncheckedError(); } } @@ -2304,12 +1628,11 @@ char* scratch, const UncompressionDict& uncompression_dict) const { RandomAccessFileReader* file = rep_->file.get(); const Footer& footer = rep_->footer; - const ImmutableCFOptions& ioptions = rep_->ioptions; - SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + const ImmutableOptions& ioptions = rep_->ioptions; size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); - if (file->use_direct_io() || ioptions.allow_mmap_reads) { + if (ioptions.allow_mmap_reads) { size_t idx_in_batch = 0; for (auto mget_iter = batch->begin(); mget_iter != batch->end(); ++mget_iter, ++idx_in_batch) { @@ -2324,11 +1647,16 @@ RetrieveBlock(nullptr, options, handle, uncompression_dict, &(*results)[idx_in_batch], BlockType::kData, 
mget_iter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); } return; } + // In direct IO mode, blocks share the direct io buffer. + // Otherwise, blocks share the scratch buffer. + const bool use_shared_buffer = file->use_direct_io() || scratch != nullptr; + autovector read_reqs; size_t buf_offset = 0; size_t idx_in_batch = 0; @@ -2349,9 +1677,13 @@ // If current block is adjacent to the previous one, at the same time, // compression is enabled and there is no compressed cache, we combine // the two block read as one. - if (scratch != nullptr && prev_end == handle.offset()) { + // We don't combine block reads here in direct IO mode, because when doing + // direct IO read, the block requests will be realigned and merged when + // necessary. + if (use_shared_buffer && !file->use_direct_io() && + prev_end == handle.offset()) { req_offset_for_block.emplace_back(prev_len); - prev_len += block_size(handle); + prev_len += BlockSizeWithTrailer(handle); } else { // No compression or current block and previous one is not adjacent: // Step 1, create a new request for previous blocks @@ -2359,38 +1691,58 @@ FSReadRequest req; req.offset = prev_offset; req.len = prev_len; - if (scratch == nullptr) { - req.scratch = new char[req.len]; - } else { + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { req.scratch = scratch + buf_offset; buf_offset += req.len; + } else { + req.scratch = new char[req.len]; } - req.status = IOStatus::OK(); read_reqs.emplace_back(req); } // Step 2, remeber the previous block info prev_offset = handle.offset(); - prev_len = block_size(handle); + prev_len = BlockSizeWithTrailer(handle); req_offset_for_block.emplace_back(0); } req_idx_for_block.emplace_back(read_reqs.size()); + + PERF_COUNTER_ADD(block_read_count, 1); + PERF_COUNTER_ADD(block_read_byte, BlockSizeWithTrailer(handle)); } // 
Handle the last block and process the pending last request if (prev_len != 0) { FSReadRequest req; req.offset = prev_offset; req.len = prev_len; - if (scratch == nullptr) { - req.scratch = new char[req.len]; - } else { + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { req.scratch = scratch + buf_offset; + } else { + req.scratch = new char[req.len]; } - req.status = IOStatus::OK(); read_reqs.emplace_back(req); } - file->MultiRead(&read_reqs[0], read_reqs.size()); + AlignedBuf direct_io_buf; + { + IOOptions opts; + IOStatus s = file->PrepareIOOptions(options, opts); + if (s.ok()) { + s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), + &direct_io_buf); + } + if (!s.ok()) { + // Discard all the results in this batch if there is any time out + // or overall MultiRead error + for (FSReadRequest& req : read_reqs) { + req.status = s; + } + } + } idx_in_batch = 0; size_t valid_batch_idx = 0; @@ -2408,10 +1760,14 @@ size_t& req_idx = req_idx_for_block[valid_batch_idx]; size_t& req_offset = req_offset_for_block[valid_batch_idx]; valid_batch_idx++; + if (mget_iter->get_context) { + ++(mget_iter->get_context->get_context_stats_.num_data_read); + } FSReadRequest& req = read_reqs[req_idx]; Status s = req.status; if (s.ok()) { - if (req.result.size() != req.len) { + if ((req.result.size() != req.len) || + (req_offset + BlockSizeWithTrailer(handle) > req.result.size())) { s = Status::Corruption( "truncated block read from " + rep_->file->file_name() + " offset " + ToString(handle.offset()) + ", expected " + @@ -2420,60 +1776,63 @@ } BlockContents raw_block_contents; - size_t cur_read_end = req_offset + block_size(handle); - if (cur_read_end > req.result.size()) { - s = Status::Corruption( - "truncated block read from " + rep_->file->file_name() + " offset " + - ToString(handle.offset()) + ", expected " + ToString(req.len) + - " bytes, got " + ToString(req.result.size())); - } - - bool blocks_share_read_buffer = (req.result.size() != 
block_size(handle)); if (s.ok()) { - if (scratch == nullptr && !blocks_share_read_buffer) { + if (!use_shared_buffer) { // We allocated a buffer for this block. Give ownership of it to // BlockContents so it can free the memory assert(req.result.data() == req.scratch); - std::unique_ptr raw_block(req.scratch + req_offset); + assert(req.result.size() == BlockSizeWithTrailer(handle)); + assert(req_offset == 0); + std::unique_ptr raw_block(req.scratch); raw_block_contents = BlockContents(std::move(raw_block), handle.size()); } else { - // We used the scratch buffer which are shared by the blocks. + // We used the scratch buffer or direct io buffer + // which are shared by the blocks. // raw_block_contents does not have the ownership. raw_block_contents = - BlockContents(Slice(req.scratch + req_offset, handle.size())); + BlockContents(Slice(req.result.data() + req_offset, handle.size())); } - #ifndef NDEBUG raw_block_contents.is_raw_block = true; #endif + if (options.verify_checksums) { PERF_TIMER_GUARD(block_checksum_time); const char* data = req.result.data(); - uint32_t expected = - DecodeFixed32(data + req_offset + handle.size() + 1); - // Since the scratch might be shared. the offset of the data block in + // Since the scratch might be shared, the offset of the data block in // the buffer might not be 0. req.result.data() only point to the // begin address of each read request, we need to add the offset // in each read request. Checksum is stored in the block trailer, - // which is handle.size() + 1. - s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(), - req.result.data() + req_offset, - handle.size() + 1, expected); + // beyond the payload size. + s = VerifyBlockChecksum(footer.checksum_type(), data + req_offset, + handle.size(), rep_->file->file_name(), + handle.offset()); TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); } + } else if (!use_shared_buffer) { + // Free the allocated scratch buffer. 
+ delete[] req.scratch; } if (s.ok()) { - // It handles a rare case: compression is set and these is no compressed - // cache (enable combined read). In this case, the scratch != nullptr. - // At the same time, some blocks are actually not compressed, - // since its compression space saving is smaller than the threshold. In - // this case, if the block shares the scratch memory, we need to copy it - // to the heap such that it can be added to the regular block cache. + // When the blocks share the same underlying buffer (scratch or direct io + // buffer), we may need to manually copy the block into heap if the raw + // block has to be inserted into a cache. That falls into th following + // cases - + // 1. Raw block is not compressed, it needs to be inserted into the + // uncompressed block cache if there is one + // 2. If the raw block is compressed, it needs to be inserted into the + // compressed block cache if there is one + // + // In all other cases, the raw block is either uncompressed into a heap + // buffer or there is no cache at all. CompressionType compression_type = - raw_block_contents.get_compression_type(); - if (scratch != nullptr && compression_type == kNoCompression) { - Slice raw = Slice(req.scratch + req_offset, block_size(handle)); + GetBlockCompressionType(raw_block_contents); + if (use_shared_buffer && (compression_type == kNoCompression || + (compression_type != kNoCompression && + rep_->table_options.block_cache_compressed))) { + Slice raw = + Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle)); raw_block_contents = BlockContents( CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), handle.size()); @@ -2492,40 +1851,43 @@ // necessary. 
Since we're passing the raw block contents, it will // avoid looking up the block cache s = MaybeReadBlockAndLoadToCache( - nullptr, options, handle, uncompression_dict, block_entry, - BlockType::kData, mget_iter->get_context, - &lookup_data_block_context, &raw_block_contents); + nullptr, options, handle, uncompression_dict, /*wait=*/true, + /*for_compaction=*/false, block_entry, BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + &raw_block_contents); // block_entry value could be null if no block cache is present, i.e // BlockBasedTableOptions::no_block_cache is true and no compressed // block cache is configured. In that case, fall // through and set up the block explicitly if (block_entry->GetValue() != nullptr) { + s.PermitUncheckedError(); continue; } } CompressionType compression_type = - raw_block_contents.get_compression_type(); + GetBlockCompressionType(raw_block_contents); BlockContents contents; if (compression_type != kNoCompression) { UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressBlockContents(info, req.result.data() + req_offset, - handle.size(), &contents, footer.version(), - rep_->ioptions, memory_allocator); + s = UncompressBlockContents( + info, req.result.data() + req_offset, handle.size(), &contents, + footer.format_version(), rep_->ioptions, memory_allocator); } else { - // There are two cases here: 1) caller uses the scratch buffer; 2) we - // use the requst buffer. If scratch buffer is used, we ensure that + // There are two cases here: + // 1) caller uses the shared buffer (scratch or direct io buffer); + // 2) we use the requst buffer. + // If scratch buffer or direct io buffer is used, we ensure that // all raw blocks are copyed to the heap as single blocks. If scratch // buffer is not used, we also have no combined read, so the raw // block can be used directly. 
contents = std::move(raw_block_contents); } if (s.ok()) { - (*results)[idx_in_batch].SetOwnedValue( - new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, - ioptions.statistics)); + (*results)[idx_in_batch].SetOwnedValue(new Block( + std::move(contents), read_amp_bytes_per_bit, ioptions.stats)); } } (*statuses)[idx_in_batch] = s; @@ -2538,22 +1900,23 @@ const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const { + bool for_compaction, bool use_cache, bool wait_for_cache) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; if (use_cache) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, - uncompression_dict, block_entry, - block_type, get_context, lookup_context, - /*contents=*/nullptr); + s = MaybeReadBlockAndLoadToCache( + prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache, + for_compaction, block_entry, block_type, get_context, lookup_context, + /*contents=*/nullptr); if (!s.ok()) { return s; } - if (block_entry->GetValue() != nullptr) { + if (block_entry->GetValue() != nullptr || + block_entry->GetCacheHandle() != nullptr) { assert(s.ok()); return s; } @@ -2574,19 +1937,35 @@ std::unique_ptr block; { - StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, - READ_BLOCK_GET_MICROS); + Histograms histogram = + for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, rep_->ioptions, do_uncompress, maybe_compressed, block_type, uncompression_dict, rep_->persistent_cache_options, - rep_->get_global_seqno(block_type), block_type == BlockType::kData ? 
rep_->table_options.read_amp_bytes_per_bit : 0, GetMemoryAllocator(rep_->table_options), for_compaction, rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get()); + + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++(get_context->get_context_stats_.num_index_read); + break; + case BlockType::kFilter: + ++(get_context->get_context_stats_.num_filter_read); + break; + case BlockType::kData: + ++(get_context->get_context_stats_.num_data_read); + break; + default: + break; + } + } } if (!s.ok()) { @@ -2606,28 +1985,28 @@ const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool 
for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, @@ -2639,22 +2018,23 @@ const BlockHandle& handle) { // Return a block iterator on the index partition auto block = block_map_->find(handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (block != block_map_->end()) { - const Rep* rep = table_->get_rep(); - assert(rep); - - Statistics* kNullStats = nullptr; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - return block->second.GetValue()->NewIndexIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, rep->index_has_first_key, - rep->index_key_includes_seq, rep->index_value_is_full); + // block_map_ must be exhaustive + if (block == block_map_->end()) { + assert(false); + // Signal problem to caller + return nullptr; } - // Create an empty iterator - return new IndexBlockIter(); + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.GetValue()->NewIndexIterator( + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full); } // This will be broken if the user specifies an unusual implementation @@ -2666,7 +2046,9 @@ // 2) Compare(prefix(key), key) <= 0. // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 // -// Otherwise, this method guarantees no I/O will be incurred. 
+// If read_options.read_tier == kBlockCacheTier, this method will do no I/O and +// will return true if the filter block is not in memory and not found in block +// cache. // // REQUIRES: this method shouldn't be called while the DB lock is held. bool BlockBasedTable::PrefixMayMatch( @@ -2688,30 +2070,34 @@ } else { prefix_extractor = rep_->table_prefix_extractor.get(); } - auto user_key = ExtractUserKey(internal_key); - if (!prefix_extractor->InDomain(user_key)) { + auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + auto user_key_without_ts = + ExtractUserKeyAndStripTimestamp(internal_key, ts_sz); + if (!prefix_extractor->InDomain(user_key_without_ts)) { return true; } bool may_match = true; - Status s; // First, try check with full filter FilterBlockReader* const filter = rep_->filter.get(); bool filter_checked = true; if (filter != nullptr) { + const bool no_io = read_options.read_tier == kBlockCacheTier; + if (!filter->IsBlockBased()) { const Slice* const const_ikey_ptr = &internal_key; may_match = filter->RangeMayExist( - read_options.iterate_upper_bound, user_key, prefix_extractor, - rep_->internal_comparator.user_comparator(), const_ikey_ptr, - &filter_checked, need_upper_bound_check, lookup_context); + read_options.iterate_upper_bound, user_key_without_ts, + prefix_extractor, rep_->internal_comparator.user_comparator(), + const_ikey_ptr, &filter_checked, need_upper_bound_check, no_io, + lookup_context); } else { // if prefix_extractor changed for block based filter, skip filter if (need_upper_bound_check) { return true; } - auto prefix = prefix_extractor->Transform(user_key); + auto prefix = prefix_extractor->Transform(user_key_without_ts); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -2758,14 +2144,14 @@ // is the only on could potentially contain the prefix. 
BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( - prefix, prefix_extractor, handle.offset(), /*no_io=*/false, + prefix, prefix_extractor, handle.offset(), no_io, /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context); } } } if (filter_checked) { - Statistics* statistics = rep_->ioptions.statistics; + Statistics* statistics = rep_->ioptions.stats; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); @@ -2775,465 +2161,45 @@ return may_match; } -template -void BlockBasedTableIterator::Seek(const Slice& target) { - SeekImpl(&target); -} - -template -void BlockBasedTableIterator::SeekToFirst() { - SeekImpl(nullptr); -} - -template -void BlockBasedTableIterator::SeekImpl( - const Slice* target) { - is_out_of_bound_ = false; - is_at_first_key_from_index_ = false; - if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { - ResetDataIter(); - return; - } - - bool need_seek_index = true; - if (block_iter_points_to_real_block_ && block_iter_.Valid()) { - // Reseek. - prev_block_offset_ = index_iter_->value().handle.offset(); - - if (target) { - // We can avoid an index seek if: - // 1. The new seek key is larger than the current key - // 2. The new seek key is within the upper bound of the block - // Since we don't necessarily know the internal key for either - // the current key or the upper bound, we check user keys and - // exclude the equality case. Considering internal keys can - // improve for the boundary cases, but it would complicate the - // code. 
- if (user_comparator_.Compare(ExtractUserKey(*target), - block_iter_.user_key()) > 0 && - user_comparator_.Compare(ExtractUserKey(*target), - index_iter_->user_key()) < 0) { - need_seek_index = false; - } - } - } - - if (need_seek_index) { - if (target) { - index_iter_->Seek(*target); - } else { - index_iter_->SeekToFirst(); - } - - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - } - - IndexValue v = index_iter_->value(); - const bool same_block = block_iter_points_to_real_block_ && - v.handle.offset() == prev_block_offset_; - - // TODO(kolmike): Remove the != kBlockCacheTier condition. - if (!v.first_internal_key.empty() && !same_block && - (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && - read_options_.read_tier != kBlockCacheTier) { - // Index contains the first key of the block, and it's >= target. - // We can defer reading the block. - is_at_first_key_from_index_ = true; - // ResetDataIter() will invalidate block_iter_. Thus, there is no need to - // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound - // as that will be done later when the data block is actually read. - ResetDataIter(); - } else { - // Need to use the data block. - if (!same_block) { - InitDataBlock(); - } else { - // When the user does a reseek, the iterate_upper_bound might have - // changed. CheckDataBlockWithinUpperBound() needs to be called - // explicitly if the reseek ends up in the same data block. - // If the reseek ends up in a different block, InitDataBlock() will do - // the iterator upper bound check. - CheckDataBlockWithinUpperBound(); - } - - if (target) { - block_iter_.Seek(*target); - } else { - block_iter_.SeekToFirst(); - } - FindKeyForward(); - } - - CheckOutOfBound(); - - if (target) { - assert(!Valid() || ((block_type_ == BlockType::kIndex && - !table_->get_rep()->index_key_includes_seq) - ? 
(user_comparator_.Compare(ExtractUserKey(*target), - key()) <= 0) - : (icomp_.Compare(*target, key()) <= 0))); - } -} - -template -void BlockBasedTableIterator::SeekForPrev( - const Slice& target) { - is_out_of_bound_ = false; - is_at_first_key_from_index_ = false; - // For now totally disable prefix seek in auto prefix mode because we don't - // have logic - if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { - ResetDataIter(); - return; - } - - SavePrevIndexValue(); - - // Call Seek() rather than SeekForPrev() in the index block, because the - // target data block will likely to contain the position for `target`, the - // same as Seek(), rather than than before. - // For example, if we have three data blocks, each containing two keys: - // [2, 4] [6, 8] [10, 12] - // (the keys in the index block would be [4, 8, 12]) - // and the user calls SeekForPrev(7), we need to go to the second block, - // just like if they call Seek(7). - // The only case where the block is difference is when they seek to a position - // in the boundary. For example, if they SeekForPrev(5), we should go to the - // first block, rather than the second. However, we don't have the information - // to distinguish the two unless we read the second block. In this case, we'll - // end up with reading two blocks. 
- index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - auto seek_status = index_iter_->status(); - // Check for IO error - if (!seek_status.IsNotFound() && !seek_status.ok()) { - ResetDataIter(); - return; - } - - // With prefix index, Seek() returns NotFound if the prefix doesn't exist - if (seek_status.IsNotFound()) { - // Any key less than the target is fine for prefix seek - ResetDataIter(); - return; - } else { - index_iter_->SeekToLast(); - } - // Check for IO error - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - } - - InitDataBlock(); - - block_iter_.SeekForPrev(target); - - FindKeyBackward(); - CheckDataBlockWithinUpperBound(); - assert(!block_iter_.Valid() || - icomp_.Compare(target, block_iter_.key()) >= 0); -} - -template -void BlockBasedTableIterator::SeekToLast() { - is_out_of_bound_ = false; - is_at_first_key_from_index_ = false; - SavePrevIndexValue(); - index_iter_->SeekToLast(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToLast(); - FindKeyBackward(); - CheckDataBlockWithinUpperBound(); -} - -template -void BlockBasedTableIterator::Next() { - if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { - return; - } - assert(block_iter_points_to_real_block_); - block_iter_.Next(); - FindKeyForward(); - CheckOutOfBound(); -} - -template -bool BlockBasedTableIterator::NextAndGetResult( - IterateResult* result) { - Next(); - bool is_valid = Valid(); - if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); - } - return is_valid; -} - -template -void BlockBasedTableIterator::Prev() { - if (is_at_first_key_from_index_) { - is_at_first_key_from_index_ = false; - - index_iter_->Prev(); - if (!index_iter_->Valid()) { - return; - } - - InitDataBlock(); - block_iter_.SeekToLast(); - } else { - assert(block_iter_points_to_real_block_); - block_iter_.Prev(); - } - - FindKeyBackward(); -} - -template -void 
BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value().handle; - if (!block_iter_points_to_real_block_ || - data_block_handle.offset() != prev_block_offset_ || - // if previous attempt of reading the block missed cache, try again - block_iter_.status().IsIncomplete()) { - if (block_iter_points_to_real_block_) { - ResetDataIter(); - } - auto* rep = table_->get_rep(); - - // Prefetch additional data for range scans (iterators). Enabled only for - // user reads. - // Implicit auto readahead: - // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. - // Explicit user requested readahead: - // Enabled from the very first IO when ReadOptions.readahead_size is set. - if (lookup_context_.caller != TableReaderCaller::kCompaction) { - if (read_options_.readahead_size == 0) { - // Implicit auto readahead - num_file_reads_++; - if (num_file_reads_ > - BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { - if (!rep->file->use_direct_io() && - (data_block_handle.offset() + - static_cast(block_size(data_block_handle)) > - readahead_limit_)) { - // Buffered I/O - // Discarding the return status of Prefetch calls intentionally, as - // we can fallback to reading from disk if Prefetch fails. - rep->file->Prefetch(data_block_handle.offset(), readahead_size_); - readahead_limit_ = static_cast(data_block_handle.offset() + - readahead_size_); - // Keep exponentially increasing readahead size until - // kMaxAutoReadaheadSize. - readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize, - readahead_size_ * 2); - } else if (rep->file->use_direct_io() && !prefetch_buffer_) { - // Direct I/O - // Let FilePrefetchBuffer take care of the readahead. 
- rep->CreateFilePrefetchBuffer( - BlockBasedTable::kInitAutoReadaheadSize, - BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); - } - } - } else if (!prefetch_buffer_) { - // Explicit user requested readahead - // The actual condition is: - // if (read_options_.readahead_size != 0 && !prefetch_buffer_) - rep->CreateFilePrefetchBuffer(read_options_.readahead_size, - read_options_.readahead_size, - &prefetch_buffer_); - } - } else if (!prefetch_buffer_) { - rep->CreateFilePrefetchBuffer(compaction_readahead_size_, - compaction_readahead_size_, - &prefetch_buffer_); - } - - Status s; - table_->NewDataBlockIterator( - read_options_, data_block_handle, &block_iter_, block_type_, - /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), - /*for_compaction=*/lookup_context_.caller == - TableReaderCaller::kCompaction); - block_iter_points_to_real_block_ = true; - CheckDataBlockWithinUpperBound(); - } -} - -template -bool BlockBasedTableIterator::MaterializeCurrentBlock() { - assert(is_at_first_key_from_index_); - assert(!block_iter_points_to_real_block_); - assert(index_iter_->Valid()); - - is_at_first_key_from_index_ = false; - InitDataBlock(); - assert(block_iter_points_to_real_block_); - block_iter_.SeekToFirst(); - - if (!block_iter_.Valid() || - icomp_.Compare(block_iter_.key(), - index_iter_->value().first_internal_key) != 0) { - // Uh oh. - block_iter_.Invalidate(Status::Corruption( - "first key in index doesn't match first key in block")); +bool BlockBasedTable::PrefixExtractorChanged( + const SliceTransform* prefix_extractor) const { + if (prefix_extractor == nullptr) { + return true; + } else if (prefix_extractor == rep_->table_prefix_extractor.get()) { return false; - } - - return true; -} - -template -void BlockBasedTableIterator::FindKeyForward() { - // This method's code is kept short to make it likely to be inlined. 
- - assert(!is_out_of_bound_); - assert(block_iter_points_to_real_block_); - - if (!block_iter_.Valid()) { - // This is the only call site of FindBlockForward(), but it's extracted into - // a separate method to keep FindKeyForward() short and likely to be - // inlined. When transitioning to a different block, we call - // FindBlockForward(), which is much longer and is probably not inlined. - FindBlockForward(); } else { - // This is the fast path that avoids a function call. - } -} - -template -void BlockBasedTableIterator::FindBlockForward() { - // TODO the while loop inherits from two-level-iterator. We don't know - // whether a block can be empty so it can be replaced by an "if". - do { - if (!block_iter_.status().ok()) { - return; - } - // Whether next data block is out of upper bound, if there is one. - const bool next_block_is_out_of_bound = - read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && !data_block_within_upper_bound_; - assert(!next_block_is_out_of_bound || - user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); - ResetDataIter(); - index_iter_->Next(); - if (next_block_is_out_of_bound) { - // The next block is out of bound. No need to read it. - TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); - // We need to make sure this is not the last data block before setting - // is_out_of_bound_, since the index key for the last data block can be - // larger than smallest key of the next file on the same level. - if (index_iter_->Valid()) { - is_out_of_bound_ = true; - } - return; - } - - if (!index_iter_->Valid()) { - return; - } - - IndexValue v = index_iter_->value(); - - // TODO(kolmike): Remove the != kBlockCacheTier condition. - if (!v.first_internal_key.empty() && - read_options_.read_tier != kBlockCacheTier) { - // Index contains the first key of the block. Defer reading the block. 
- is_at_first_key_from_index_ = true; - return; - } - - InitDataBlock(); - block_iter_.SeekToFirst(); - } while (!block_iter_.Valid()); -} - -template -void BlockBasedTableIterator::FindKeyBackward() { - while (!block_iter_.Valid()) { - if (!block_iter_.status().ok()) { - return; - } - - ResetDataIter(); - index_iter_->Prev(); - - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToLast(); - } else { - return; - } - } - - // We could have check lower bound here too, but we opt not to do it for - // code simplicity. -} - -template -void BlockBasedTableIterator::CheckOutOfBound() { - if (read_options_.iterate_upper_bound != nullptr && Valid()) { - is_out_of_bound_ = user_comparator_.Compare( - *read_options_.iterate_upper_bound, user_key()) <= 0; - } -} - -template -void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - data_block_within_upper_bound_ = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) > 0); + return PrefixExtractorChangedHelper(rep_->table_properties.get(), + prefix_extractor); } } InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size) { + size_t compaction_readahead_size, bool allow_unprepared_value) { BlockCacheLookupContext lookup_context{caller}; bool need_upper_bound_check = - read_options.auto_prefix_mode || - PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor); + std::unique_ptr> index_iter(NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); if (arena == nullptr) { - return new 
BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator( - read_options, - need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch, - /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + return new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, caller, - compaction_readahead_size); + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); } else { - auto* mem = - arena->AllocateAligned(sizeof(BlockBasedTableIterator)); - return new (mem) BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator( - read_options, - need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch, - /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + return new (mem) BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, caller, - compaction_readahead_size); + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); } } @@ -3261,25 +2227,23 @@ Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; + size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); if (rep_->whole_key_filtering) { - size_t ts_sz = - rep_->internal_comparator.user_comparator()->timestamp_size(); - Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); 
may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, no_io, const_ikey_ptr, get_context, lookup_context); - } else if (!read_options.total_order_seek && prefix_extractor && - rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0 && - prefix_extractor->InDomain(user_key) && - !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, no_io, - const_ikey_ptr, get_context, - lookup_context)) { + } else if (!read_options.total_order_seek && + !PrefixExtractorChanged(prefix_extractor) && + prefix_extractor->InDomain(user_key_without_ts) && + !filter->PrefixMayMatch( + prefix_extractor->Transform(user_key_without_ts), + prefix_extractor, kNotValid, no_io, const_ikey_ptr, + get_context, lookup_context)) { may_match = false; } if (may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); } return may_match; @@ -3293,14 +2257,34 @@ if (filter == nullptr || filter->IsBlockBased()) { return; } + uint64_t before_keys = range->KeysLeft(); + assert(before_keys > 0); // Caller should ensure if (rep_->whole_key_filtering) { filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, lookup_context); - } else if (!read_options.total_order_seek && prefix_extractor && - rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0) { + uint64_t after_keys = range->KeysLeft(); + if (after_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys, + rep_->level); + } + uint64_t filtered_keys = before_keys - after_keys; + if (filtered_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys, + rep_->level); + } + } else if 
(!read_options.total_order_seek && + !PrefixExtractorChanged(prefix_extractor)) { filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, lookup_context); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys); + uint64_t after_keys = range->KeysLeft(); + uint64_t filtered_keys = before_keys - after_keys; + if (filtered_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL, + filtered_keys); + } } } @@ -3328,11 +2312,13 @@ lookup_context.get_from_user_specified_snapshot = read_options.snapshot != nullptr; } + TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch"); const bool may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, get_context, &lookup_context); + TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch"); if (!may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); } else { IndexBlockIter iiter_on_stack; @@ -3340,8 +2326,7 @@ // BlockPrefixIndex. Only do this check when index_type is kHashSearch. bool need_upper_bound_check = false; if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); + need_upper_bound_check = PrefixExtractorChanged(prefix_extractor); } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, @@ -3353,7 +2338,7 @@ size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); - bool matched = false; // if such user key mathced a key in SST + bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { IndexValue v = iiter->value(); @@ -3369,15 +2354,16 @@ // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. 
- RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. break; @@ -3400,6 +2386,7 @@ // Update Saver.state to Found because we are only looking for // whether we can guarantee the key is not there when "no_io" is set get_context->MarkKeyMayExist(); + s = biter.status(); break; } if (!biter.status().ok()) { @@ -3420,8 +2407,10 @@ // Call the *saver function on each entry/block until it returns false for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter.key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (!get_context->SaveValue( @@ -3448,7 +2437,7 @@ referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -3460,9 +2449,12 @@ /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); - block_cache_tracer_->WriteBlockAccess( - access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), referenced_key); + // TODO: Should handle status here? 
+ block_cache_tracer_ + ->WriteBlockAccess(access_record, + lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key) + .PermitUncheckedError(); } if (done) { @@ -3471,7 +2463,7 @@ } } if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } @@ -3488,6 +2480,12 @@ const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { + if (mget_range->empty()) { + // Caller should ensure non-empty (performance bug) + assert(false); + return; // Nothing to do + } + FilterBlockReader* const filter = !skip_filters ? rep_->filter.get() : nullptr; MultiGetRange sst_file_range(*mget_range, mget_range->begin(), @@ -3497,7 +2495,7 @@ // If full filter not useful, Then go into each block const bool no_io = read_options.read_tier == kBlockCacheTier; uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; - if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + if (sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } BlockCacheLookupContext lookup_context{ @@ -3506,14 +2504,13 @@ FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, prefix_extractor, &lookup_context); - if (skip_filters || !sst_file_range.empty()) { + if (!sst_file_range.empty()) { IndexBlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
bool need_upper_bound_check = false; if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); + need_upper_bound_check = PrefixExtractorChanged(prefix_extractor); } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, @@ -3532,21 +2529,13 @@ { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); + std::vector cache_handles; + bool wait_for_cache_results = false; CachableEntry uncompression_dict; Status uncompression_dict_status; - if (rep_->uncompression_dict_reader) { - uncompression_dict_status = - rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, no_io, - sst_file_range.begin()->get_context, &lookup_context, - &uncompression_dict); - } - - const UncompressionDict& dict = uncompression_dict.GetValue() - ? *uncompression_dict.GetValue() - : UncompressionDict::GetEmptyDict(); - + uncompression_dict_status.PermitUncheckedError(); + bool uncompression_dict_inited = false; size_t total_len = 0; ReadOptions ro = read_options; ro.read_tier = kBlockCacheTier; @@ -3563,17 +2552,30 @@ if (!iiter->Valid() || (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0)) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { // The requested key falls between highest key in previous block and // lowest key in current block. 
- *(miter->s) = iiter->status(); + if (!iiter->status().IsNotFound()) { + *(miter->s) = iiter->status(); + } data_block_range.SkipKey(miter); sst_file_range.SkipKey(miter); continue; } + if (!uncompression_dict_inited && rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + uncompression_dict_inited = true; + } + if (!uncompression_dict_status.ok()) { + assert(!uncompression_dict_status.IsNotFound()); *(miter->s) = uncompression_dict_status; data_block_range.SkipKey(miter); sst_file_range.SkipKey(miter); @@ -3595,25 +2597,75 @@ BlockHandle handle = v.handle; BlockCacheLookupContext lookup_data_block_context( TableReaderCaller::kUserMultiGet); + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); Status s = RetrieveBlock( nullptr, ro, handle, dict, &(results.back()), BlockType::kData, miter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ false); if (s.IsIncomplete()) { s = Status::OK(); } if (s.ok() && !results.back().IsEmpty()) { - // Found it in the cache. Add NULL handle to indicate there is - // nothing to read from disk - block_handles.emplace_back(BlockHandle::NullBlockHandle()); + // Since we have a valid handle, check the value. If its nullptr, + // it means the cache is waiting for the final result and we're + // supposed to call WaitAll() to wait for the result. + if (results.back().GetValue() != nullptr) { + // Found it in the cache. Add NULL handle to indicate there is + // nothing to read from disk. 
+ if (results.back().GetCacheHandle()) { + results.back().UpdateCachedValue(); + } + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + // We have to wait for the cache lookup to finish in the + // background, and then we may have to read the block from disk + // anyway + assert(results.back().GetCacheHandle()); + wait_for_cache_results = true; + block_handles.emplace_back(handle); + cache_handles.emplace_back(results.back().GetCacheHandle()); + } } else { block_handles.emplace_back(handle); - total_len += block_size(handle); + total_len += BlockSizeWithTrailer(handle); + } + } + + if (wait_for_cache_results) { + Cache* block_cache = rep_->table_options.block_cache.get(); + block_cache->WaitAll(cache_handles); + for (size_t i = 0; i < block_handles.size(); ++i) { + // If this block was a success or failure or not needed because + // the corresponding key is in the same block as a prior key, skip + if (block_handles[i] == BlockHandle::NullBlockHandle() || + results[i].IsEmpty()) { + continue; + } + results[i].UpdateCachedValue(); + void* val = results[i].GetValue(); + if (!val) { + // The async cache lookup failed - could be due to an error + // or a false positive. We need to read the data block from + // the SST file + results[i].Reset(); + total_len += BlockSizeWithTrailer(block_handles[i]); + } else { + block_handles[i] = BlockHandle::NullBlockHandle(); + } } } if (total_len) { char* scratch = nullptr; + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + assert(uncompression_dict_inited || !rep_->uncompression_dict_reader); + assert(uncompression_dict_status.ok()); + // If using direct IO, then scratch is not used, so keep it nullptr. // If the blocks need to be uncompressed and we don't need the // compressed blocks, then we can use a contiguous block of // memory to read in all the blocks as it will be temporary @@ -3623,7 +2675,8 @@ // 2. 
If blocks are uncompressed, alloc heap bufs // 3. If blocks are compressed and no compressed block cache, use // stack buf - if (rep_->table_options.block_cache_compressed == nullptr && + if (!rep_->file->use_direct_io() && + rep_->table_options.block_cache_compressed == nullptr && rep_->blocks_maybe_compressed) { if (total_len <= kMultiGetReadStackBufSize) { scratch = stack_buf; @@ -3634,6 +2687,10 @@ } RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles, &statuses, &results, scratch, dict); + if (sst_file_range.begin()->get_context) { + ++(sst_file_range.begin() + ->get_context->get_context_stats_.num_sst_read); + } } } @@ -3665,6 +2722,10 @@ read_options, results[idx_in_batch], &first_biter, statuses[idx_in_batch]); reusing_block = false; + } else { + // If handler is null and result is empty, then the status is never + // set, which should be the initial value: ok(). + assert(statuses[idx_in_batch].ok()); } biter = &first_biter; idx_in_batch++; @@ -3672,8 +2733,9 @@ IndexValue v = iiter->value(); if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. 
break; @@ -3715,8 +2777,10 @@ ParsedInternalKey parsed_key; Cleanable dummy; Cleanable* value_pinner = nullptr; - if (!ParseInternalKey(biter->key(), &parsed_key)) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter->key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (biter->IsValuePinned()) { if (reusing_block) { @@ -3753,7 +2817,7 @@ referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -3765,9 +2829,12 @@ /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); - block_cache_tracer_->WriteBlockAccess( - access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), referenced_key); + // TODO: Should handle status here? + block_cache_tracer_ + ->WriteBlockAccess(access_record, + lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key) + .PermitUncheckedError(); } s = biter->status(); if (done) { @@ -3782,15 +2849,21 @@ } while (iiter->Valid()); if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } - if (s.ok()) { + if (s.ok() && !iiter->status().IsNotFound()) { s = iiter->status(); } *(miter->s) = s; } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + // Not sure why we need to do it. Should investigate more. 
+ for (auto& st : statuses) { + st.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED } } @@ -3860,7 +2933,8 @@ // Check Meta blocks std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex, + ReadOptions ro; + s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex, &metaindex_iter); if (s.ok()) { s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); @@ -3896,11 +2970,11 @@ // increasing of the buffer size. size_t readahead_size = (read_options.readahead_size != 0) ? read_options.readahead_size - : kMaxAutoReadaheadSize; + : rep_->table_options.max_auto_readahead_size; // FilePrefetchBuffer doesn't work in mmap mode and readahead is not // needed there. FilePrefetchBuffer prefetch_buffer( - rep_->file.get(), readahead_size /* readadhead_size */, + readahead_size /* readahead_size */, readahead_size /* max_readahead_size */, !rep_->ioptions.allow_mmap_reads /* enable */); @@ -3921,6 +2995,12 @@ break; } } + if (s.ok()) { + // In the case of two level indexes, we would have exited the above loop + // by checking index_iter->Valid(), but Valid() might have returned false + // due to an IO error. 
So check the index_iter status + s = index_iter->status(); + } return s; } @@ -3932,15 +3012,15 @@ return BlockType::kFilter; } - if (meta_block_name == kPropertiesBlock) { + if (meta_block_name == kPropertiesBlockName) { return BlockType::kProperties; } - if (meta_block_name == kCompressionDictBlock) { + if (meta_block_name == kCompressionDictBlockName) { return BlockType::kCompressionDictionary; } - if (meta_block_name == kRangeDelBlock) { + if (meta_block_name == kRangeDelBlockName) { return BlockType::kRangeDeletion; } @@ -3969,19 +3049,22 @@ s = handle.DecodeFrom(&input); BlockContents contents; const Slice meta_block_name = index_iter->key(); - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, - GetBlockTypeForMetaBlockByName(meta_block_name), - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (s.IsCorruption() && meta_block_name == kPropertiesBlock) { - TableProperties* table_properties; - s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, - index_iter->value(), - &table_properties); - delete table_properties; + if (meta_block_name == kPropertiesBlockName) { + // Unfortunate special handling for properties block checksum w/ + // global seqno + std::unique_ptr table_properties; + s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(), + nullptr /* prefetch_buffer */, rep_->footer, + rep_->ioptions, &table_properties, + nullptr /* memory_allocator */); + } else { + s = BlockFetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) + .ReadBlockContents(); } if (!s.ok()) { break; @@ 
-3998,12 +3081,9 @@ return false; } - char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); + CacheKey key = GetCacheKey(rep_->base_cache_key, handle); - Cache::Handle* const cache_handle = cache->Lookup(cache_key); + Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice()); if (cache_handle == nullptr) { return false; } @@ -4031,9 +3111,9 @@ // 4. internal_comparator // 5. index_type Status BlockBasedTable::CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, - bool pin, BlockCacheLookupContext* lookup_context, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, std::unique_ptr* index_reader) { // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. 
@@ -4043,47 +3123,34 @@ switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { - return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context, index_reader); } case BlockBasedTableOptions::kBinarySearch: FALLTHROUGH_INTENDED; case BlockBasedTableOptions::kBinarySearchWithFirstKey: { - return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, lookup_context, - index_reader); + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr metaindex_guard; std::unique_ptr metaindex_iter_guard; - auto meta_index_iter = preloaded_meta_index_iter; bool should_fallback = false; if (rep_->internal_prefix_transform.get() == nullptr) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "No prefix extractor passed in. Fall back to binary" " search index."); should_fallback = true; - } else if (meta_index_iter == nullptr) { - auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard, - &metaindex_iter_guard); - if (!s.ok()) { - // we simply fall back to binary search in case there is any - // problem with prefix hash index loading. - ROCKS_LOG_WARN(rep_->ioptions.info_log, - "Unable to read the metaindex block." 
- " Fall back to binary search index."); - should_fallback = true; - } - meta_index_iter = metaindex_iter_guard.get(); } if (should_fallback) { - return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, lookup_context, - index_reader); + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); } else { - return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter, use_cache, prefetch, pin, lookup_context, index_reader); } @@ -4096,30 +3163,38 @@ } } -uint64_t BlockBasedTable::ApproximateOffsetOf( - const InternalIteratorBase& index_iter) const { - uint64_t result = 0; +uint64_t BlockBasedTable::ApproximateDataOffsetOf( + const InternalIteratorBase& index_iter, + uint64_t data_size) const { + assert(index_iter.status().ok()); if (index_iter.Valid()) { BlockHandle handle = index_iter.value().handle; - result = handle.offset(); + return handle.offset(); } else { - // The iterator is past the last key in the file. If table_properties is not - // available, approximate the offset by returning the offset of the - // metaindex block (which is right near the end of the file). - if (rep_->table_properties) { - result = rep_->table_properties->data_size; - } - // table_properties is not present in the table. - if (result == 0) { - result = rep_->footer.metaindex_handle().offset(); - } + // The iterator is past the last key in the file. 
+ return data_size; } +} - return result; +uint64_t BlockBasedTable::GetApproximateDataSize() { + // Should be in table properties unless super old version + if (rep_->table_properties) { + return rep_->table_properties->data_size; + } + // Fall back to rough estimate from footer + return rep_->footer.metaindex_handle().offset(); } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) { + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Let's just split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } + BlockCacheLookupContext context(caller); IndexBlockIter iiter_on_stack; ReadOptions ro; @@ -4134,13 +3209,37 @@ } index_iter->Seek(key); - return ApproximateOffsetOf(*index_iter); + uint64_t offset; + if (index_iter->status().ok()) { + offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } + + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. + double size_ratio = + static_cast(offset) / static_cast(data_size); + return static_cast(size_ratio * + static_cast(rep_->file_size)); } uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, TableReaderCaller caller) { assert(rep_->internal_comparator.Compare(start, end) <= 0); + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Assume whole file is involved, since we have lower and upper + // bound. This likely skews the estimate if we consider that this function + // is typically called with `[start, end]` fully contained in the file's + // key-range. 
+ return rep_->file_size; + } + BlockCacheLookupContext context(caller); IndexBlockIter iiter_on_stack; ReadOptions ro; @@ -4155,17 +3254,38 @@ } index_iter->Seek(start); - uint64_t start_offset = ApproximateOffsetOf(*index_iter); + uint64_t start_offset; + if (index_iter->status().ok()) { + start_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved from the start. This likely skews the estimate + // but is consistent with the above error handling. + start_offset = 0; + } + index_iter->Seek(end); - uint64_t end_offset = ApproximateOffsetOf(*index_iter); + uint64_t end_offset; + if (index_iter->status().ok()) { + end_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved until the end. This likely skews the estimate + // but is consistent with the above error handling. + end_offset = data_size; + } assert(end_offset >= start_offset); - return end_offset - start_offset; + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. 
+ double size_ratio = static_cast(end_offset - start_offset) / + static_cast(data_size); + return static_cast(size_ratio * + static_cast(rep_->file_size)); } bool BlockBasedTable::TEST_FilterBlockInCache() const { assert(rep_ != nullptr); - return TEST_BlockInCache(rep_->filter_handle); + return rep_->filter_type != Rep::FilterType::kNoFilter && + TEST_BlockInCache(rep_->filter_handle); } bool BlockBasedTable::TEST_IndexBlockInCache() const { @@ -4230,21 +3350,20 @@ } Status BlockBasedTable::DumpTable(WritableFile* out_file) { + WritableFileStringStreamAdapter out_file_wrapper(out_file); + std::ostream out_stream(&out_file_wrapper); // Output Footer - out_file->Append( - "Footer Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->footer.ToString().c_str()); - out_file->Append("\n"); + out_stream << "Footer Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->footer.ToString() << "\n"; // Output MetaIndex - out_file->Append( - "Metaindex Details:\n" - "--------------------------------------\n"); + out_stream << "Metaindex Details:\n" + "--------------------------------------\n"; std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex, + ReadOptions ro; + Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); if (s.ok()) { for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); @@ -4253,27 +3372,22 @@ if (!s.ok()) { return s; } - if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) { - out_file->Append(" Properties block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (metaindex_iter->key() == - ROCKSDB_NAMESPACE::kCompressionDictBlock) { - out_file->Append(" Compression dictionary block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); + 
if (metaindex_iter->key() == kPropertiesBlockName) { + out_stream << " Properties block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == kCompressionDictBlockName) { + out_stream << " Compression dictionary block handle: " + << metaindex_iter->value().ToString(true) << "\n"; } else if (strstr(metaindex_iter->key().ToString().c_str(), "filter.rocksdb.") != nullptr) { - out_file->Append(" Filter block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) { - out_file->Append(" Range deletion block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); + out_stream << " Filter block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == kRangeDelBlockName) { + out_stream << " Range deletion block handle: " + << metaindex_iter->value().ToString(true) << "\n"; } } - out_file->Append("\n"); + out_stream << "\n"; } else { return s; } @@ -4283,25 +3397,19 @@ table_properties = rep_->table_properties.get(); if (table_properties != nullptr) { - out_file->Append( - "Table Properties:\n" - "--------------------------------------\n" - " "); - out_file->Append(table_properties->ToString("\n ", ": ").c_str()); - out_file->Append("\n"); + out_stream << "Table Properties:\n" + "--------------------------------------\n"; + out_stream << " " << table_properties->ToString("\n ", ": ") << "\n"; } if (rep_->filter) { - out_file->Append( - "Filter Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->filter->ToString().c_str()); - out_file->Append("\n"); + out_stream << "Filter Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->filter->ToString() << "\n"; } // Output Index block - s = DumpIndexBlock(out_file); + s = DumpIndexBlock(out_stream); if 
(!s.ok()) { return s; } @@ -4320,15 +3428,10 @@ assert(uncompression_dict.GetValue()); const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); - out_file->Append( - "Compression Dictionary:\n" - "--------------------------------------\n"); - out_file->Append(" size (bytes): "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size())); - out_file->Append("\n\n"); - out_file->Append(" HEX "); - out_file->Append(raw_dict.ToString(true).c_str()); - out_file->Append("\n\n"); + out_stream << "Compression Dictionary:\n" + "--------------------------------------\n"; + out_stream << " size (bytes): " << raw_dict.size() << "\n\n"; + out_stream << " HEX " << raw_dict.ToString(true) << "\n\n"; } // Output range deletions block @@ -4336,39 +3439,44 @@ if (range_del_iter != nullptr) { range_del_iter->SeekToFirst(); if (range_del_iter->Valid()) { - out_file->Append( - "Range deletions:\n" - "--------------------------------------\n" - " "); + out_stream << "Range deletions:\n" + "--------------------------------------\n"; for (; range_del_iter->Valid(); range_del_iter->Next()) { - DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), + out_stream); } - out_file->Append("\n"); + out_stream << "\n"; } delete range_del_iter; } // Output Data blocks - s = DumpDataBlocks(out_file); + s = DumpDataBlocks(out_stream); - return s; + if (!s.ok()) { + return s; + } + + if (!out_stream.good()) { + return Status::IOError("Failed to write to output file"); + } + return Status::OK(); } -Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { - out_file->Append( - "Index Details:\n" - "--------------------------------------\n"); +Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { + out_stream << "Index Details:\n" + "--------------------------------------\n"; std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, 
/*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); + out_stream << "Can not read Index Block \n\n"; return s; } - out_file->Append(" Block key hex dump: Data block handle\n"); - out_file->Append(" Block key ascii\n\n"); + out_stream << " Block key hex dump: Data block handle\n"; + out_stream << " Block key ascii\n\n"; for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); blockhandles_iter->Next()) { s = blockhandles_iter->status(); @@ -4385,13 +3493,10 @@ user_key = ikey.user_key(); } - out_file->Append(" HEX "); - out_file->Append(user_key.ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(blockhandles_iter->value() - .ToString(true, rep_->index_has_first_key) - .c_str()); - out_file->Append("\n"); + out_stream << " HEX " << user_key.ToString(true) << ": " + << blockhandles_iter->value().ToString(true, + rep_->index_has_first_key) + << "\n"; std::string str_key = user_key.ToString(); std::string res_key(""); @@ -4400,22 +3505,21 @@ res_key.append(&str_key[i], 1); res_key.append(1, cspace); } - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append("\n ------\n"); + out_stream << " ASCII " << res_key << "\n"; + out_stream << " ------\n"; } - out_file->Append("\n"); + out_stream << "\n"; return Status::OK(); } -Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { +Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); + out_stream << "Can not read Index Block \n\n"; return s; } @@ -4437,12 +3541,9 @@ datablock_size_max = std::max(datablock_size_max, datablock_size); 
datablock_size_sum += datablock_size; - out_file->Append("Data Block # "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id)); - out_file->Append(" @ "); - out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); - out_file->Append("\n"); - out_file->Append("--------------------------------------\n"); + out_stream << "Data Block # " << block_id << " @ " + << blockhandles_iter->value().handle.ToString(true) << "\n"; + out_stream << "--------------------------------------\n"; std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( @@ -4453,7 +3554,7 @@ s = datablock_iter->status(); if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n\n"); + out_stream << "Error reading the block - Skipped \n\n"; continue; } @@ -4461,44 +3562,37 @@ datablock_iter->Next()) { s = datablock_iter->status(); if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n"); + out_stream << "Error reading the block - Skipped \n"; break; } - DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream); } - out_file->Append("\n"); + out_stream << "\n"; } uint64_t num_datablocks = block_id - 1; if (num_datablocks) { double datablock_size_avg = static_cast(datablock_size_sum) / num_datablocks; - out_file->Append("Data Block Summary:\n"); - out_file->Append("--------------------------------------"); - out_file->Append("\n # data blocks: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks)); - out_file->Append("\n min data block size: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min)); - out_file->Append("\n max data block size: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max)); - out_file->Append("\n avg data block size: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg)); - out_file->Append("\n"); + out_stream << "Data Block Summary:\n"; + out_stream << 
"--------------------------------------\n"; + out_stream << " # data blocks: " << num_datablocks << "\n"; + out_stream << " min data block size: " << datablock_size_min << "\n"; + out_stream << " max data block size: " << datablock_size_max << "\n"; + out_stream << " avg data block size: " << ToString(datablock_size_avg) + << "\n"; } return Status::OK(); } void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, - WritableFile* out_file) { + std::ostream& out_stream) { InternalKey ikey; ikey.DecodeFrom(key); - out_file->Append(" HEX "); - out_file->Append(ikey.user_key().ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(value.ToString(true).c_str()); - out_file->Append("\n"); + out_stream << " HEX " << ikey.user_key().ToString(true) << ": " + << value.ToString(true) << "\n"; std::string str_key = ikey.user_key().ToString(); std::string str_value = value.ToString(); @@ -4521,11 +3615,8 @@ res_value.append(1, cspace); } - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append(": "); - out_file->Append(res_value.c_str()); - out_file->Append("\n ------\n"); + out_stream << " ASCII " << res_key << ": " << res_value << "\n"; + out_stream << " ------\n"; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,22 +9,13 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include +#include "cache/cache_key.h" #include "db/range_tombstone_fragmenter.h" #include "file/filename.h" -#include "file/random_access_file_reader.h" -#include "options/cf_options.h" -#include 
"rocksdb/options.h" -#include "rocksdb/persistent_cache.h" -#include "rocksdb/statistics.h" -#include "rocksdb/status.h" -#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table_properties.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_type.h" @@ -32,15 +23,11 @@ #include "table/block_based/filter_block.h" #include "table/block_based/uncompression_dict_reader.h" #include "table/format.h" -#include "table/get_context.h" -#include "table/multiget_context.h" -#include "table/persistent_cache_helper.h" +#include "table/persistent_cache_options.h" #include "table/table_properties_internal.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" #include "trace_replay/block_cache_tracer.h" -#include "util/coding.h" -#include "util/user_comparator_wrapper.h" namespace ROCKSDB_NAMESPACE { @@ -60,7 +47,7 @@ struct ReadOptions; class GetContext; -typedef std::vector> KVPairBlock; +using KVPairBlock = std::vector>; // Reader class for BlockBasedTable format. // For the format of BlockBasedTable refer to @@ -78,17 +65,14 @@ static const std::string kFilterBlockPrefix; static const std::string kFullFilterBlockPrefix; static const std::string kPartitionedFilterBlockPrefix; - // The longest prefix of the cache key used to identify blocks. - // For Posix files the unique ID is three varints. - static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1; // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; - // Found that 256 KB readahead size provides the best performance, based on - // experiments, for auto readahead. Experiment data is in PR #3282. 
- static const size_t kMaxAutoReadaheadSize; static const int kMinNumFileReadsToStartAutoReadahead = 2; + // 1-byte compression type + 32-bit checksum + static constexpr size_t kBlockTrailerSize = 5; + // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow // retrieving data from the table. @@ -105,20 +89,24 @@ // @param skip_filters Disables loading/accessing the filter block. Overrides // prefetch_index_and_filter_in_cache, so filter will be skipped if both // are set. - static Status Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_key_comparator, - std::unique_ptr&& file, - uint64_t file_size, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor = nullptr, - bool prefetch_index_and_filter_in_cache = true, - bool skip_filters = false, int level = -1, - const bool immortal_table = false, - const SequenceNumber largest_seqno = 0, - TailPrefetchStats* tail_prefetch_stats = nullptr, - BlockCacheTracer* const block_cache_tracer = nullptr); + // @param force_direct_prefetch if true, always prefetching to RocksDB + // buffer, rather than calling RandomAccessFile::Prefetch(). 
+ static Status Open( + const ReadOptions& ro, const ImmutableOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + const std::shared_ptr& prefix_extractor = nullptr, + bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, + int level = -1, const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + bool force_direct_prefetch = false, + TailPrefetchStats* tail_prefetch_stats = nullptr, + BlockCacheTracer* const block_cache_tracer = nullptr, + size_t max_file_size_for_l0_meta_pin = 0, + const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -129,6 +117,7 @@ // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). + // @param read_options Must outlive the returned iterator. // @param skip_filters Disables loading/accessing the filter block // compaction_readahead_size: its value will only be used if caller = // kCompaction. @@ -136,7 +125,8 @@ const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -220,14 +210,52 @@ virtual size_t ApproximateMemoryUsage() const = 0; // Cache the dependencies of the index reader (e.g. the partitions // of a partitioned index). 
- virtual void CacheDependencies(bool /* pin */) {} + virtual Status CacheDependencies(const ReadOptions& /*ro*/, + bool /* pin */) { + return Status::OK(); + } }; class IndexReaderCommon; - static Slice GetCacheKey(const char* cache_key_prefix, - size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key); + // Maximum SST file size that uses standard CacheKey encoding scheme. + // See GetCacheKey to explain << 2. + 3 is permitted because it is trimmed + // off by >> 2 in GetCacheKey. + static constexpr uint64_t kMaxFileSizeStandardEncoding = + (OffsetableCacheKey::kMaxOffsetStandardEncoding << 2) + 3; + + static void SetupBaseCacheKey(const TableProperties* properties, + const std::string& cur_db_session_id, + uint64_t cur_file_number, uint64_t file_size, + OffsetableCacheKey* out_base_cache_key, + bool* out_is_stable = nullptr); + + static CacheKey GetCacheKey(const OffsetableCacheKey& base_cache_key, + const BlockHandle& handle); + + static void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage, + bool redundant, + Statistics* const statistics); + + // Get the size to read from storage for a BlockHandle. size_t because we + // are about to load into memory. + static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) { + return static_cast(handle.size() + kBlockTrailerSize); + } + + // It's the caller's responsibility to make sure that this is + // for raw block contents, which contains the compression + // byte in the end. + static inline CompressionType GetBlockCompressionType(const char* block_data, + size_t block_size) { + return static_cast(block_data[block_size]); + } + static inline CompressionType GetBlockCompressionType( + const BlockContents& contents) { + assert(contents.is_raw_block); + return GetBlockCompressionType(contents.data.data(), contents.data.size()); + } // Retrieve all key value pairs from data blocks in the table. // The key retrieved are internal keys. 
@@ -271,22 +299,34 @@ private: friend class MockedBlockBasedTable; - static std::atomic next_cache_key_id_; + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; BlockCacheTracer* const block_cache_tracer_; void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const; void UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const; - void UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, size_t usage) const; - Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - BlockType block_type, - GetContext* get_context) const; + + Cache::Handle* GetEntryFromCache(const CacheTier& cache_tier, + Cache* block_cache, const Slice& key, + BlockType block_type, const bool wait, + GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority) const; + + template + Status InsertEntryToCache(const CacheTier& cache_tier, Cache* block_cache, + const Slice& key, + const Cache::CacheItemHelper* cache_helper, + std::unique_ptr& block_holder, + size_t charge, Cache::Handle** cache_handle, + Cache::Priority priority) const; // Either Block::NewDataIterator() or Block::NewIndexIterator(). 
template static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + BlockType block_type, TBlockIter* input_iter, bool block_contents_pinned); @@ -303,6 +343,7 @@ Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const bool wait, const bool for_compaction, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const; @@ -317,7 +358,8 @@ CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, + bool wait_for_cache) const; void RetrieveMultipleBlocks( const ReadOptions& options, const MultiGetRange* batch, @@ -352,12 +394,13 @@ // @param uncompression_dict Data for presetting the compression library's // dictionary. template - Status GetDataBlockFromCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const; + Status GetDataBlockFromCache(const Slice& cache_key, Cache* block_cache, + Cache* block_cache_compressed, + const ReadOptions& read_options, + CachableEntry* block, + const UncompressionDict& uncompression_dict, + BlockType block_type, const bool wait, + GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -370,14 +413,15 @@ // @param uncompression_dict Data for presetting the compression library's // dictionary. 
template - Status PutDataBlockToCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, - BlockContents* raw_block_contents, CompressionType raw_block_comp_type, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - MemoryAllocator* memory_allocator, BlockType block_type, - GetContext* get_context) const; + Status PutDataBlockToCache(const Slice& cache_key, Cache* block_cache, + Cache* block_cache_compressed, + CachableEntry* cached_block, + BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, + MemoryAllocator* memory_allocator, + BlockType block_type, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -389,7 +433,8 @@ // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter // helps avoid re-reading meta index block if caller already created one. - Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, + Status CreateIndexReader(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, BlockCacheLookupContext* lookup_context, @@ -408,28 +453,31 @@ const SliceTransform* prefix_extractor, BlockCacheLookupContext* lookup_context) const; + // If force_direct_prefetch is true, always prefetching to RocksDB + // buffer, rather than calling RandomAccessFile::Prefetch(). 
static Status PrefetchTail( - RandomAccessFileReader* file, uint64_t file_size, - TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, - const bool preload_all, + const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, + bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, + const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer); - Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer, + Status ReadMetaIndexBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block, std::unique_ptr* iter); - Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer, - const Slice& handle_value, - TableProperties** table_properties); - Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer, + Status ReadPropertiesBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const SequenceNumber largest_seqno); - Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, + Status ReadRangeDelBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context); Status PrefetchIndexAndFilterBlocks( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, bool prefetch_all, - const BlockBasedTableOptions& table_options, const int level, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, + bool prefetch_all, const BlockBasedTableOptions& table_options, + const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin, BlockCacheLookupContext* lookup_context); static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); @@ -440,26 +488,27 @@ // Create the filter from the filter block. 
std::unique_ptr CreateFilterBlockReader( - FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, - bool pin, BlockCacheLookupContext* lookup_context); + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); - static void SetupCacheKeyPrefix(Rep* rep); + // Size of all data blocks, maybe approximate + uint64_t GetApproximateDataSize(); - // Generate a cache key prefix from the file - static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file, - char* buffer, size_t* size); - static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer, - size_t* size); - - // Given an iterator return its offset in file. - uint64_t ApproximateOffsetOf( - const InternalIteratorBase& index_iter) const; + // Given an iterator return its offset in data block section of file. + uint64_t ApproximateDataOffsetOf( + const InternalIteratorBase& index_iter, + uint64_t data_size) const; // Helper functions for DumpTable() - Status DumpIndexBlock(WritableFile* out_file); - Status DumpDataBlocks(WritableFile* out_file); + Status DumpIndexBlock(std::ostream& out_stream); + Status DumpDataBlocks(std::ostream& out_stream); void DumpKeyValue(const Slice& key, const Slice& value, - WritableFile* out_file); + std::ostream& out_stream); + + // Returns true if prefix_extractor is compatible with that used in building + // the table file. + bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const; // A cumulative data block file read in MultiGet lower than this size will // use a stack buffer @@ -470,7 +519,7 @@ friend class DBBasicTest_MultiGetIOBufferOverrun_Test; }; -// Maitaning state of a two-level iteration on a partitioned index structure. +// Maintaining state of a two-level iteration on a partitioned index structure. 
class BlockBasedTable::PartitionedIndexIteratorState : public TwoLevelIteratorState { public: @@ -489,10 +538,10 @@ // Stores all the properties associated with a BlockBasedTable. // These are immutable. struct BlockBasedTable::Rep { - Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, + Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, const InternalKeyComparator& _internal_comparator, bool skip_filters, - int _level, const bool _immortal_table) + uint64_t _file_size, int _level, const bool _immortal_table) : ioptions(_ioptions), env_options(_env_options), table_options(_table_opt), @@ -504,22 +553,18 @@ whole_key_filtering(_table_opt.whole_key_filtering), prefix_filtering(true), global_seqno(kDisableGlobalSequenceNumber), + file_size(_file_size), level(_level), immortal_table(_immortal_table) {} - - const ImmutableCFOptions& ioptions; + ~Rep() { status.PermitUncheckedError(); } + const ImmutableOptions& ioptions; const EnvOptions& env_options; const BlockBasedTableOptions table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; Status status; std::unique_ptr file; - char cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t cache_key_prefix_size = 0; - char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t persistent_cache_key_prefix_size = 0; - char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t compressed_cache_key_prefix_size = 0; + OffsetableCacheKey base_cache_key; PersistentCacheOptions persistent_cache_options; // Footer contains the fixed table information @@ -561,6 +606,9 @@ // and every key have it's own seqno. SequenceNumber global_seqno; + // Size of the table file on disk + uint64_t file_size; + // the level when the table is opened, could potentially change when trivial // move is involved int level; @@ -606,219 +654,68 @@ uint64_t sst_number_for_tracing() const { return file ? 
TableFileNameToNumber(file->file_name()) : UINT64_MAX; } - void CreateFilePrefetchBuffer( - size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb) const { - fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size, - max_readahead_size, - !ioptions.allow_mmap_reads /* enable */)); - } -}; - -// Iterates over the contents of BlockBasedTable. -template -class BlockBasedTableIterator : public InternalIteratorBase { - // compaction_readahead_size: its value will only be used if for_compaction = - // true - public: - BlockBasedTableIterator(const BlockBasedTable* table, - const ReadOptions& read_options, - const InternalKeyComparator& icomp, - InternalIteratorBase* index_iter, - bool check_filter, bool need_upper_bound_check, - const SliceTransform* prefix_extractor, - BlockType block_type, TableReaderCaller caller, - size_t compaction_readahead_size = 0) - : table_(table), - read_options_(read_options), - icomp_(icomp), - user_comparator_(icomp.user_comparator()), - index_iter_(index_iter), - pinned_iters_mgr_(nullptr), - block_iter_points_to_real_block_(false), - check_filter_(check_filter), - need_upper_bound_check_(need_upper_bound_check), - prefix_extractor_(prefix_extractor), - block_type_(block_type), - lookup_context_(caller), - compaction_readahead_size_(compaction_readahead_size) {} - - ~BlockBasedTableIterator() { delete index_iter_; } - - void Seek(const Slice& target) override; - void SeekForPrev(const Slice& target) override; - void SeekToFirst() override; - void SeekToLast() override; - void Next() final override; - bool NextAndGetResult(IterateResult* result) override; - void Prev() override; - bool Valid() const override { - return !is_out_of_bound_ && - (is_at_first_key_from_index_ || - (block_iter_points_to_real_block_ && block_iter_.Valid())); - } - Slice key() const override { - assert(Valid()); - if (is_at_first_key_from_index_) { - return index_iter_->value().first_internal_key; - } else { - return block_iter_.key(); - } - } 
- Slice user_key() const override { - assert(Valid()); - if (is_at_first_key_from_index_) { - return ExtractUserKey(index_iter_->value().first_internal_key); - } else { - return block_iter_.user_key(); - } + void CreateFilePrefetchBuffer(size_t readahead_size, + size_t max_readahead_size, + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { + fpb->reset(new FilePrefetchBuffer(readahead_size, max_readahead_size, + !ioptions.allow_mmap_reads /* enable */, + false /* track_min_offset */, + implicit_auto_readahead)); } - TValue value() const override { - assert(Valid()); - // Load current block if not loaded. - if (is_at_first_key_from_index_ && - !const_cast(this) - ->MaterializeCurrentBlock()) { - // Oops, index is not consistent with block contents, but we have - // no good way to report error at this point. Let's return empty value. - return TValue(); - } - - return block_iter_.value(); - } - Status status() const override { - // Prefix index set status to NotFound when the prefix does not exist - if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { - return index_iter_->status(); - } else if (block_iter_points_to_real_block_) { - return block_iter_.status(); - } else { - return Status::OK(); + void CreateFilePrefetchBufferIfNotExists( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { + if (!(*fpb)) { + CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, + implicit_auto_readahead); } } +}; - // Whether iterator invalidated for being out of bound. 
- bool IsOutOfBound() override { return is_out_of_bound_; } - - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return !data_block_within_upper_bound_; - } - - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { - pinned_iters_mgr_ = pinned_iters_mgr; - } - bool IsKeyPinned() const override { - // Our key comes either from block_iter_'s current key - // or index_iter_'s current *value*. - return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || - (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); - } - bool IsValuePinned() const override { - // Load current block if not loaded. - if (is_at_first_key_from_index_) { - const_cast(this)->MaterializeCurrentBlock(); - } - // BlockIter::IsValuePinned() is always true. No need to check - return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_; - } +// This is an adapter class for `WritableFile` to be used for `std::ostream`. +// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream` +// constructor for storing streaming data. +// Note: +// * This adapter doesn't provide any buffering, each write is forwarded to +// `WritableFile->Append()` directly. +// * For a failed write, the user needs to check the status by `ostream.good()` +class WritableFileStringStreamAdapter : public std::stringbuf { + public: + explicit WritableFileStringStreamAdapter(WritableFile* writable_file) + : file_(writable_file) {} - void ResetDataIter() { - if (block_iter_points_to_real_block_) { - if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { - block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + // Override overflow() to handle `sputc()`. There are cases that will not go + // through `xsputn()` e.g. 
`std::endl` or an unsigned long long is written by + // `os.put()` directly and will call `sputc()` By internal implementation: + // int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) { // put a character + // return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) : + // overflow(_Traits::to_int_type(_Ch)); + // } + // As we explicitly disabled buffering (_Pnavail() is always 0), every write, + // not captured by xsputn(), becomes an overflow here. + int overflow(int ch = EOF) override { + if (ch != EOF) { + Status s = file_->Append(Slice((char*)&ch, 1)); + if (s.ok()) { + return ch; } - block_iter_.Invalidate(Status::OK()); - block_iter_points_to_real_block_ = false; } + return EOF; } - void SavePrevIndexValue() { - if (block_iter_points_to_real_block_) { - // Reseek. If they end up with the same data block, we shouldn't re-fetch - // the same data block. - prev_block_offset_ = index_iter_->value().handle.offset(); + std::streamsize xsputn(char const* p, std::streamsize n) override { + Status s = file_->Append(Slice(p, n)); + if (!s.ok()) { + return 0; } + return n; } private: - enum class IterDirection { - kForward, - kBackward, - }; - - const BlockBasedTable* table_; - const ReadOptions read_options_; - const InternalKeyComparator& icomp_; - UserComparatorWrapper user_comparator_; - InternalIteratorBase* index_iter_; - PinnedIteratorsManager* pinned_iters_mgr_; - TBlockIter block_iter_; - - // True if block_iter_ is initialized and points to the same block - // as index iterator. - bool block_iter_points_to_real_block_; - // See InternalIteratorBase::IsOutOfBound(). - bool is_out_of_bound_ = false; - // Whether current data block being fully within iterate upper bound. - bool data_block_within_upper_bound_ = false; - // True if we're standing at the first key of a block, and we haven't loaded - // that block yet. A call to value() will trigger loading the block. 
- bool is_at_first_key_from_index_ = false; - bool check_filter_; - // TODO(Zhongyi): pick a better name - bool need_upper_bound_check_; - const SliceTransform* prefix_extractor_; - BlockType block_type_; - uint64_t prev_block_offset_ = std::numeric_limits::max(); - BlockCacheLookupContext lookup_context_; - // Readahead size used in compaction, its value is used only if - // lookup_context_.caller = kCompaction. - size_t compaction_readahead_size_; - - size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; - size_t readahead_limit_ = 0; - int64_t num_file_reads_ = 0; - std::unique_ptr prefetch_buffer_; - - // If `target` is null, seek to first. - void SeekImpl(const Slice* target); - - void InitDataBlock(); - bool MaterializeCurrentBlock(); - void FindKeyForward(); - void FindBlockForward(); - void FindKeyBackward(); - void CheckOutOfBound(); - - // Check if data block is fully within iterate_upper_bound. - // - // Note MyRocks may update iterate bounds between seek. To workaround it, - // we need to check and update data_block_within_upper_bound_ accordingly. - void CheckDataBlockWithinUpperBound(); - - bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { - if (need_upper_bound_check_ && direction == IterDirection::kBackward) { - // Upper bound check isn't sufficnet for backward direction to - // guarantee the same result as total order, so disable prefix - // check. - return true; - } - if (check_filter_ && - !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, - need_upper_bound_check_, &lookup_context_)) { - // TODO remember the iterator is invalidated because of prefix - // match. This can avoid the upper level file iterator to falsely - // believe the position is the end of the SST file and move to - // the first key of the next file. 
- ResetDataIter(); - return false; - } - return true; - } + WritableFile* file_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,163 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/reader_common.h" + +// The file contains some member functions of BlockBasedTable that +// cannot be implemented in block_based_table_reader.cc because +// it's called by other files (e.g. block_based_iterator.h) and +// are templates. + +namespace ROCKSDB_NAMESPACE { +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. 
+// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template +TBlockIter* BlockBasedTable::NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + CachableEntry block; + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, + get_context, lookup_context, for_compaction, + /* use_cache */ true, /* wait_for_cache */ true); + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), block_type, iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + Cache* const block_cache = rep_->table_options.block_cache.get(); + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); + s = block_cache->Insert(key.AsSlice(), nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + + return iter; +} + +// Convert an uncompressed data block (i.e CachableEntry) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template +TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), BlockType::kData, + iter, block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + Cache* const block_cache = rep_->table_options.block_cache.get(); + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); + s = block_cache->Insert(key.AsSlice(), nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,357 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "table/block_based/block_based_table_reader.h" + +#include "db/table_properties_collector.h" +#include "file/file_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/partitioned_index_iterator.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTableReaderTest + : public testing::Test, + public testing::WithParamInterface> { + protected: + CompressionType compression_type_; + bool use_direct_reads_; + + void SetUp() override { + BlockBasedTableOptions::IndexType index_type; + bool no_block_cache; + std::tie(compression_type_, use_direct_reads_, index_type, no_block_cache) = + GetParam(); + + SetupSyncPointsToMockDirectIO(); + test_dir_ = test::PerThreadDBPath("block_based_table_reader_test"); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + + BlockBasedTableOptions opts; + opts.index_type = index_type; + opts.no_block_cache = no_block_cache; + table_factory_.reset( + static_cast(NewBlockBasedTableFactory(opts))); + } + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + // Creates a table with the specificied key value pairs (kv). + void CreateTable(const std::string& table_name, + const CompressionType& compression_type, + const std::map& kv) { + std::unique_ptr writer; + NewFileWriter(table_name, &writer); + + // Create table builder. 
+ Options options; + ImmutableOptions ioptions(options); + InternalKeyComparator comparator(options.comparator); + ColumnFamilyOptions cf_options; + MutableCFOptions moptions(cf_options); + IntTblPropCollectorFactories factories; + std::unique_ptr table_builder(table_factory_->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, comparator, &factories, + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); + + // Build table. + for (auto it = kv.begin(); it != kv.end(); it++) { + std::string k = ToInternalKey(it->first); + std::string v = it->second; + table_builder->Add(k, v); + } + ASSERT_OK(table_builder->Finish()); + } + + void NewBlockBasedTableReader(const FileOptions& foptions, + const ImmutableOptions& ioptions, + const InternalKeyComparator& comparator, + const std::string& table_name, + std::unique_ptr* table) { + std::unique_ptr file; + NewFileReader(table_name, foptions, &file); + + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); + + std::unique_ptr table_reader; + ReadOptions ro; + const auto* table_options = + table_factory_->GetOptions(); + ASSERT_NE(table_options, nullptr); + ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, + comparator, std::move(file), file_size, + &table_reader)); + + table->reset(reinterpret_cast(table_reader.release())); + } + + std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; } + + const std::shared_ptr& fs() const { return fs_; } + + private: + std::string test_dir_; + Env* env_; + std::shared_ptr fs_; + std::unique_ptr table_factory_; + + void WriteToFile(const std::string& content, const std::string& filename) { + std::unique_ptr f; + ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void NewFileWriter(const 
std::string& filename, + std::unique_ptr* writer) { + std::string path = Path(filename); + EnvOptions env_options; + FileOptions foptions; + std::unique_ptr file; + ASSERT_OK(fs_->NewWritableFile(path, foptions, &file, nullptr)); + writer->reset(new WritableFileWriter(std::move(file), path, env_options)); + } + + void NewFileReader(const std::string& filename, const FileOptions& opt, + std::unique_ptr* reader) { + std::string path = Path(filename); + std::unique_ptr f; + ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); + } + + std::string ToInternalKey(const std::string& key) { + InternalKey internal_key(key, 0, ValueType::kTypeValue); + return internal_key.Encode().ToString(); + } +}; + +// Tests MultiGet in both direct IO and non-direct IO mode. +// The keys should be in cache after MultiGet. +TEST_P(BlockBasedTableReaderTest, MultiGet) { + // Prepare key-value pairs to occupy multiple blocks. + // Each value is 256B, every 16 pairs constitute 1 block. + // Adjacent blocks contain values with different compression complexity: + // human readable strings are easier to compress than random strings. + std::map kv; + { + Random rnd(101); + uint32_t key = 0; + for (int block = 0; block < 100; block++) { + for (int i = 0; i < 16; i++) { + char k[9] = {0}; + // Internal key is constructed directly from this key, + // and internal key size is required to be >= 8 bytes, + // so use %08u as the format string. + sprintf(k, "%08u", key); + std::string v; + if (block % 2) { + v = rnd.HumanReadableString(256); + } else { + v = rnd.RandomString(256); + } + kv[std::string(k)] = v; + key++; + } + } + } + + // Prepare keys, values, and statuses for MultiGet. 
+ autovector keys; + autovector values; + autovector statuses; + { + const int step = + static_cast(kv.size()) / MultiGetContext::MAX_BATCH_SIZE; + auto it = kv.begin(); + for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE; i++) { + keys.emplace_back(it->first); + values.emplace_back(); + statuses.emplace_back(); + std::advance(it, step); + } + } + + std::string table_name = + "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); + CreateTable(table_name, compression_type_, kv); + + std::unique_ptr table; + Options options; + ImmutableOptions ioptions(options); + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + + // Ensure that keys are not in cache before MultiGet. + for (auto& key : keys) { + ASSERT_FALSE(table->TEST_KeyInCache(ReadOptions(), key)); + } + + // Prepare MultiGetContext. + autovector get_context; + autovector key_context; + autovector sorted_keys; + for (size_t i = 0; i < keys.size(); ++i) { + get_context.emplace_back( + BytewiseComparator(), nullptr, nullptr, nullptr, GetContext::kNotFound, + keys[i], &values[i], nullptr, nullptr, nullptr, true /* do_merge */, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); + key_context.emplace_back(nullptr, keys[i], &values[i], nullptr, + &statuses.back()); + key_context.back().get_context = &get_context.back(); + } + for (auto& key_ctx : key_context) { + sorted_keys.emplace_back(&key_ctx); + } + MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions()); + + // Execute MultiGet. 
+ MultiGetContext::Range range = ctx.GetMultiGetRange(); + PerfContext* perf_ctx = get_perf_context(); + perf_ctx->Reset(); + table->MultiGet(ReadOptions(), &range, nullptr); + + ASSERT_GE(perf_ctx->block_read_count - perf_ctx->index_block_read_count - + perf_ctx->filter_block_read_count - + perf_ctx->compression_dict_block_read_count, + 1); + ASSERT_GE(perf_ctx->block_read_byte, 1); + + for (const Status& status : statuses) { + ASSERT_OK(status); + } + // Check that keys are in cache after MultiGet. + for (size_t i = 0; i < keys.size(); i++) { + ASSERT_TRUE(table->TEST_KeyInCache(ReadOptions(), keys[i])); + ASSERT_EQ(values[i].ToString(), kv[keys[i].ToString()]); + } +} + +class BlockBasedTableReaderTestVerifyChecksum + : public BlockBasedTableReaderTest { + public: + BlockBasedTableReaderTestVerifyChecksum() : BlockBasedTableReaderTest() {} +}; + +TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { + // Prepare key-value pairs to occupy multiple blocks. + // Each value is 256B, every 16 pairs constitute 1 block. + // Adjacent blocks contain values with different compression complexity: + // human readable strings are easier to compress than random strings. + Random rnd(101); + std::map kv; + { + uint32_t key = 0; + for (int block = 0; block < 800; block++) { + for (int i = 0; i < 16; i++) { + char k[9] = {0}; + // Internal key is constructed directly from this key, + // and internal key size is required to be >= 8 bytes, + // so use %08u as the format string. 
+ sprintf(k, "%08u", key); + std::string v = rnd.RandomString(256); + kv[std::string(k)] = v; + key++; + } + } + } + + std::string table_name = + "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); + CreateTable(table_name, compression_type_, kv); + + std::unique_ptr table; + Options options; + ImmutableOptions ioptions(options); + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + + // Use the top level iterator to find the offset/size of the first + // 2nd level index block and corrupt the block + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum}; + InternalIteratorBase* iiter = table->NewIndexIterator( + ReadOptions(), /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + ASSERT_OK(iiter->status()); + iiter->SeekToFirst(); + BlockHandle handle = static_cast(iiter) + ->index_iter_->value() + .handle; + table.reset(); + + // Corrupt the block pointed to by handle + ASSERT_OK(test::CorruptFile(options.env, Path(table_name), + static_cast(handle.offset()), 128)); + + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + Status s = table->VerifyChecksum(ReadOptions(), + TableReaderCaller::kUserVerifyChecksum); + ASSERT_EQ(s.code(), Status::kCorruption); +} + +// Param 1: compression type +// Param 2: whether to use direct reads +// Param 3: Block Based Table Index type +// Param 4: BBTO no_block_cache option +#ifdef ROCKSDB_LITE +// Skip direct I/O tests in lite mode since direct I/O is unsupported. 
+INSTANTIATE_TEST_CASE_P( + MultiGet, BlockBasedTableReaderTest, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(false), + ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), + ::testing::Values(false))); +#else // ROCKSDB_LITE +INSTANTIATE_TEST_CASE_P( + MultiGet, BlockBasedTableReaderTest, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), + ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), + ::testing::Values(false))); +#endif // ROCKSDB_LITE +INSTANTIATE_TEST_CASE_P( + VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(false), + ::testing::Values( + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch), + ::testing::Values(true))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -50,7 +50,7 @@ : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), use_value_delta_encoding_(use_value_delta_encoding), - restarts_(), + restarts_(1, 0), // First restart point is at offset 0 counter_(0), finished_(false) { switch (index_type) { @@ -64,14 +64,13 @@ assert(0); } assert(block_restart_interval_ >= 1); - restarts_.push_back(0); // First restart point is at offset 0 estimate_ = sizeof(uint32_t) + sizeof(uint32_t); } void BlockBuilder::Reset() { buffer_.clear(); - 
restarts_.clear(); - restarts_.push_back(0); // First restart point is at offset 0 + restarts_.resize(1); // First restart point is at offset 0 + assert(restarts_[0] == 0); estimate_ = sizeof(uint32_t) + sizeof(uint32_t); counter_ = 0; finished_ = false; @@ -79,6 +78,14 @@ if (data_block_hash_index_builder_.Valid()) { data_block_hash_index_builder_.Reset(); } +#ifndef NDEBUG + add_with_last_key_called_ = false; +#endif +} + +void BlockBuilder::SwapAndReset(std::string& buffer) { + std::swap(buffer_, buffer); + Reset(); } size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, @@ -134,33 +141,62 @@ void BlockBuilder::Add(const Slice& key, const Slice& value, const Slice* const delta_value) { + // Ensure no unsafe mixing of Add and AddWithLastKey + assert(!add_with_last_key_called_); + + AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size()); + if (use_delta_encoding_) { + // Update state + // We used to just copy the changed data, but it appears to be + // faster to just copy the whole thing. + last_key_.assign(key.data(), key.size()); + } +} + +void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, + const Slice& last_key_param, + const Slice* const delta_value) { + // Ensure no unsafe mixing of Add and AddWithLastKey + assert(last_key_.empty()); +#ifndef NDEBUG + add_with_last_key_called_ = false; +#endif + + // Here we make sure to use an empty `last_key` on first call after creation + // or Reset. This is more convenient for the caller and we can be more + // clever inside BlockBuilder. On this hot code path, we want to avoid + // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a + // fast min operation instead, with an assertion to be sure our logic is + // sound. 
+ size_t buffer_size = buffer_.size(); + size_t last_key_size = last_key_param.size(); + assert(buffer_size == 0 || buffer_size >= last_key_size); + + Slice last_key(last_key_param.data(), std::min(buffer_size, last_key_size)); + + AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size); +} + +inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, + const Slice& value, + const Slice& last_key, + const Slice* const delta_value, + size_t buffer_size) { assert(!finished_); assert(counter_ <= block_restart_interval_); assert(!use_value_delta_encoding_ || delta_value); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression - restarts_.push_back(static_cast(buffer_.size())); + restarts_.push_back(static_cast(buffer_size)); estimate_ += sizeof(uint32_t); counter_ = 0; - - if (use_delta_encoding_) { - // Update state - last_key_.assign(key.data(), key.size()); - } } else if (use_delta_encoding_) { - Slice last_key_piece(last_key_); // See how much sharing to do with previous string - shared = key.difference_offset(last_key_piece); - - // Update state - // We used to just copy the changed data here, but it appears to be - // faster to just copy the whole thing. 
- last_key_.assign(key.data(), key.size()); + shared = key.difference_offset(last_key); } const size_t non_shared = key.size() - shared; - const size_t curr_size = buffer_.size(); if (use_value_delta_encoding_) { // Add "" to buffer_ @@ -190,7 +226,7 @@ } counter_++; - estimate_ += buffer_.size() - curr_size; + estimate_ += buffer_.size() - buffer_size; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -32,11 +32,29 @@ // Reset the contents as if the BlockBuilder was just constructed. void Reset(); + // Swap the contents in BlockBuilder with buffer, then reset the BlockBuilder. + void SwapAndReset(std::string& buffer); + // REQUIRES: Finish() has not been called since the last call to Reset(). // REQUIRES: key is larger than any previously added key + // DO NOT mix with AddWithLastKey() between Resets. For efficiency, use + // AddWithLastKey() in contexts where previous added key is already known + // and delta encoding might be used. void Add(const Slice& key, const Slice& value, const Slice* const delta_value = nullptr); + // A faster version of Add() if the previous key is already known for all + // Add()s. + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + // REQUIRES: if AddWithLastKey has been called since last Reset(), last_key + // is the key from most recent AddWithLastKey. (For convenience, last_key + // is ignored on first call after creation or Reset().) + // DO NOT mix with Add() between Resets. 
+ void AddWithLastKey(const Slice& key, const Slice& value, + const Slice& last_key, + const Slice* const delta_value = nullptr); + // Finish building the block and return a slice that refers to the // block contents. The returned slice will remain valid for the // lifetime of this builder or until Reset() is called. @@ -57,6 +75,11 @@ bool empty() const { return buffer_.empty(); } private: + inline void AddWithLastKeyImpl(const Slice& key, const Slice& value, + const Slice& last_key, + const Slice* const delta_value, + size_t buffer_size); + const int block_restart_interval_; // TODO(myabandeh): put it into a separate IndexBlockBuilder const bool use_delta_encoding_; @@ -70,6 +93,9 @@ bool finished_; // Has Finish() been called? std::string last_key_; DataBlockHashIndexBuilder data_block_hash_index_builder_; +#ifndef NDEBUG + bool add_with_last_key_called_ = false; +#endif }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,225 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "cache/cache_entry_roles.h" +#include "port/lang.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +template +class BlocklikeTraits; + +template +Cache::CacheItemHelper* GetCacheItemHelperForRole(); + +template +Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit, + Statistics* statistics, bool using_zstd, + const FilterPolicy* filter_policy) { + return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy]( + void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + assert(buf != nullptr); + std::unique_ptr buf_data(new char[size]()); + memcpy(buf_data.get(), buf, size); + BlockContents bc = BlockContents(std::move(buf_data), size); + TBlocklike* ucd_ptr = BlocklikeTraits::Create( + std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd, + filter_policy); + *out_obj = reinterpret_cast(ucd_ptr); + *charge = size; + return Status::OK(); + }; +} + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + BlockContents* ptr = static_cast(obj); + return ptr->data.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + BlockContents* ptr = static_cast(from_obj); + const char* buf = ptr->data.data(); + assert(length == ptr->data.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* 
GetCacheItemHelper(BlockType block_type) { + if (block_type == BlockType::kFilter) { + return GetCacheItemHelperForRole< + BlockContents, CacheEntryRole::kDeprecatedFilterBlock>(); + } else { + // E.g. compressed cache + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(obj); + return ptr->GetBlockContentsData().size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(from_obj); + const char* buf = ptr->GetBlockContentsData().data(); + assert(length == ptr->GetBlockContentsData().size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kFilter); + return GetCacheItemHelperForRole(); + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics, bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + Block* ptr = static_cast(obj); + return ptr->size(); + } + + static Status SaveToCallback(void* from_obj, size_t 
from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + Block* ptr = static_cast(from_obj); + const char* buf = ptr->data(); + assert(length == ptr->size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + switch (block_type) { + case BlockType::kData: + return GetCacheItemHelperForRole(); + case BlockType::kIndex: + return GetCacheItemHelperForRole(); + case BlockType::kFilter: + return GetCacheItemHelperForRole(); + default: + // Not a recognized combination + assert(false); + FALLTHROUGH_INTENDED; + case BlockType::kRangeDeletion: + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static UncompressionDict* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + UncompressionDict* ptr = static_cast(obj); + return ptr->slice_.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + UncompressionDict* ptr = static_cast(from_obj); + const char* buf = ptr->slice_.data(); + assert(length == ptr->slice_.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kCompressionDictionary); + return GetCacheItemHelperForRole(); + } +}; + +// Get an CacheItemHelper pointer for value type T and role R. 
+template +Cache::CacheItemHelper* GetCacheItemHelperForRole() { + static Cache::CacheItemHelper cache_helper( + BlocklikeTraits::SizeCallback, BlocklikeTraits::SaveToCallback, + GetCacheEntryDeleterForRole()); + return &cache_helper; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_prefetcher.h" + +#include "table/block_based/block_based_table_reader.h" + +namespace ROCKSDB_NAMESPACE { +void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep, + const BlockHandle& handle, + size_t readahead_size, + bool is_for_compaction) { + if (is_for_compaction) { + rep->CreateFilePrefetchBufferIfNotExists(compaction_readahead_size_, + compaction_readahead_size_, + &prefetch_buffer_, false); + return; + } + + // Explicit user requested readahead. + if (readahead_size > 0) { + rep->CreateFilePrefetchBufferIfNotExists(readahead_size, readahead_size, + &prefetch_buffer_, false); + return; + } + + // Implicit readahead. + + // If max_auto_readahead_size is set to be 0 by user, no data will be + // prefetched. 
+ size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size; + if (max_auto_readahead_size == 0) { + return; + } + + size_t len = BlockBasedTable::BlockSizeWithTrailer(handle); + size_t offset = handle.offset(); + + // If FS supports prefetching (readahead_limit_ will be non zero in that case) + // and current block exists in prefetch buffer then return. + if (offset + len <= readahead_limit_) { + UpdateReadPattern(offset, len); + return; + } + + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, len); + ResetValues(); + return; + } + UpdateReadPattern(offset, len); + + // Implicit auto readahead, which will be enabled if the number of reads + // reached `kMinNumFileReadsToStartAutoReadahead` (default: 2) and scans are + // sequential. + num_file_reads_++; + if (num_file_reads_ <= + BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { + return; + } + + if (initial_auto_readahead_size_ > max_auto_readahead_size) { + initial_auto_readahead_size_ = max_auto_readahead_size; + } + + if (rep->file->use_direct_io()) { + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size_, + max_auto_readahead_size, + &prefetch_buffer_, true); + return; + } + + if (readahead_size_ > max_auto_readahead_size) { + readahead_size_ = max_auto_readahead_size; + } + + // If prefetch is not supported, fall back to use internal prefetch buffer. + // Discarding other return status of Prefetch calls intentionally, as + // we can fallback to reading from disk if Prefetch fails. + Status s = rep->file->Prefetch( + handle.offset(), + BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_); + if (s.IsNotSupported()) { + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size_, + max_auto_readahead_size, + &prefetch_buffer_, true); + return; + } + + readahead_limit_ = offset + len + readahead_size_; + // Keep exponentially increasing readahead size until + // max_auto_readahead_size. 
+ readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +namespace ROCKSDB_NAMESPACE { +class BlockPrefetcher { + public: + explicit BlockPrefetcher(size_t compaction_readahead_size) + : compaction_readahead_size_(compaction_readahead_size) {} + void PrefetchIfNeeded(const BlockBasedTable::Rep* rep, + const BlockHandle& handle, size_t readahead_size, + bool is_for_compaction); + FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); } + + void UpdateReadPattern(const uint64_t& offset, const size_t& len) { + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const uint64_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + void ResetValues() { + num_file_reads_ = 1; + // Since initial_auto_readahead_size_ can be different from + // kInitAutoReadaheadSize in case of adaptive_readahead, so fallback the + // readahead_size_ to kInitAutoReadaheadSize in case of reset. 
+ initial_auto_readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + readahead_size_ = initial_auto_readahead_size_; + readahead_limit_ = 0; + return; + } + + void SetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) { + num_file_reads_ = readahead_info->num_file_reads; + initial_auto_readahead_size_ = readahead_info->readahead_size; + TEST_SYNC_POINT_CALLBACK("BlockPrefetcher::SetReadaheadState", + &initial_auto_readahead_size_); + } + + private: + // Readahead size used in compaction, its value is used only if + // lookup_context_.caller = kCompaction. + size_t compaction_readahead_size_; + + // readahead_size_ is used if underlying FS supports prefetching. + size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + size_t readahead_limit_ = 0; + // initial_auto_readahead_size_ is used if RocksDB uses internal prefetch + // buffer. + uint64_t initial_auto_readahead_size_ = + BlockBasedTable::kInitAutoReadaheadSize; + int64_t num_file_reads_ = 0; + uint64_t prev_offset_ = 0; + size_t prev_len_ = 0; + std::unique_ptr prefetch_buffer_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,7 +4,10 @@ // (found in the LICENSE.Apache file in the root directory). 
// +#include "table/block_based/block.h" + #include + #include #include #include @@ -20,7 +23,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" #include "table/format.h" #include "test_util/testharness.h" @@ -29,20 +32,16 @@ namespace ROCKSDB_NAMESPACE { -static std::string RandomString(Random *rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} -std::string GenerateKey(int primary_key, int secondary_key, int padding_size, - Random *rnd) { +std::string GenerateInternalKey(int primary_key, int secondary_key, + int padding_size, Random *rnd) { char buf[50]; char *p = &buf[0]; snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); std::string k(p); if (padding_size) { - k += RandomString(rnd, padding_size); + k += rnd->RandomString(padding_size); } + AppendInternalKeyFooter(&k, 0 /* seqno */, kTypeValue); return k; } @@ -61,10 +60,11 @@ for (int i = from; i < from + len; i += step) { // generating keys that shares the prefix for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + // `DataBlockIter` assumes it reads only internal keys. 
+ keys->emplace_back(GenerateInternalKey(i, j, padding_size, &rnd)); // 100 bytes values - values->emplace_back(RandomString(&rnd, 100)); + values->emplace_back(rnd.RandomString(100)); } } } @@ -93,12 +93,12 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); // read contents of block sequentially int count = 0; InternalIterator *iter = - reader.NewDataIterator(options.comparator, options.comparator); + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -111,7 +111,8 @@ delete iter; // read block contents randomly - iter = reader.NewDataIterator(options.comparator, options.comparator); + iter = + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -151,14 +152,14 @@ const size_t prefix_size = 6; // create block reader BlockContents contents_ref(contents.data); - Block reader1(std::move(contents), kDisableGlobalSequenceNumber); - Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber); + Block reader1(std::move(contents)); + Block reader2(std::move(contents_ref)); std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); - std::unique_ptr regular_iter( - reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator())); + std::unique_ptr regular_iter(reader2.NewDataIterator( + BytewiseComparator(), kDisableGlobalSequenceNumber)); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -175,7 +176,8 @@ // simply be set as invalid; whereas the binary search based iterator will // return the one that is closest. 
for (int i = 1; i < max_key - 1; i += 2) { - auto key = GenerateKey(i, 0, 0, nullptr); + // `DataBlockIter` assumes its APIs receive only internal keys. + auto key = GenerateInternalKey(i, 0, 0, nullptr); regular_iter->Seek(key); ASSERT_TRUE(regular_iter->Valid()); } @@ -375,13 +377,12 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber, - kBytesPerBit, stats.get()); + Block reader(std::move(contents), kBytesPerBit, stats.get()); // read contents of block sequentially size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -408,12 +409,11 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber, - kBytesPerBit, stats.get()); + Block reader(std::move(contents), kBytesPerBit, stats.get()); size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -443,12 +443,11 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber, - kBytesPerBit, stats.get()); + Block reader(std::move(contents), kBytesPerBit, stats.get()); size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -524,7 +523,7 
@@ separators->emplace_back(*it++); uint64_t size = rnd.Uniform(1024 * 16); BlockHandle handle(offset, size); - offset += size + kBlockTrailerSize; + offset += size + BlockBasedTable::kBlockTrailerSize; block_handles->emplace_back(handle); } } @@ -563,7 +562,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); const bool kTotalOrderSeek = true; const bool kIncludesSeq = true; @@ -572,7 +571,7 @@ Statistics *kNullStats = nullptr; // read contents of block sequentially InternalIteratorBase *iter = reader.NewIndexIterator( - options.comparator, options.comparator, kNullIter, kNullStats, + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); iter->SeekToFirst(); for (int index = 0; index < num_records; ++index) { @@ -592,9 +591,9 @@ delete iter; // read block contents randomly - iter = reader.NewIndexIterator(options.comparator, options.comparator, - kNullIter, kNullStats, kTotalOrderSeek, - includeFirstKey(), kIncludesSeq, kValueIsFull); + iter = reader.NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); for (int i = 0; i < num_records * 2; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_type.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_type.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_type.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_type.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ #include +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // Represents the types of blocks used in the block based 
table format. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h 2025-05-19 16:14:27.000000000 +0000 @@ -162,7 +162,6 @@ } void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { - assert(value != nullptr); assert(cache != nullptr); assert(cache_handle != nullptr); @@ -179,6 +178,22 @@ assert(!own_value_); } + void UpdateCachedValue() { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + + value_ = static_cast(cache_->Value(cache_handle_)); + } + + bool IsReady() { + if (!own_value_) { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + return cache_->IsReady(cache_handle_); + } + return true; + } + private: void ReleaseResource() { if (LIKELY(cache_handle_ != nullptr)) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/data_block_hash_index.h" + #include #include #include @@ -12,11 +14,11 @@ #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#include "table/block_based/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -35,12 +37,6 @@ return entry == restart_point; } -// Random KV generator similer to block_test -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} std::string GenerateKey(int primary_key, int secondary_key, int padding_size, Random* rnd) { char buf[50]; @@ -48,7 +44,7 @@ snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); std::string k(p); if (padding_size) { - k += RandomString(rnd, padding_size); + k += rnd->RandomString(padding_size); } return k; @@ -71,7 +67,7 @@ keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); // 100 bytes values - values->emplace_back(RandomString(&rnd, 100)); + values->emplace_back(rnd.RandomString(100)); } } } @@ -284,7 +280,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); ASSERT_EQ(reader.IndexType(), BlockBasedTableOptions::kDataBlockBinaryAndHash); @@ -306,7 +302,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); ASSERT_EQ(reader.IndexType(), BlockBasedTableOptions::kDataBlockBinarySearch); @@ -337,7 +333,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); 
ASSERT_EQ(reader.IndexType(), BlockBasedTableOptions::kDataBlockBinaryAndHash); @@ -361,7 +357,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); // the index type have fallen back to binary when build finish. ASSERT_EQ(reader.IndexType(), @@ -388,10 +384,11 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); bool may_exist; // search in block for the key just inserted { @@ -469,12 +466,13 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); const InternalKeyComparator icmp(BytewiseComparator()); // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "1" /* existing key marker */); @@ -511,7 +509,8 @@ // C true false for (int i = 0; i < num_records; i++) { - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "0" /* non-existing key marker */); @@ -540,26 +539,27 @@ int level_ = -1; std::vector keys; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions 
ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); EnvOptions soptions; soptions.use_mmap_reads = ioptions.allow_mmap_reads; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr f(sink); file_writer.reset( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + new WritableFileWriter(std::move(f), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - CompressionOptions(), false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions( + ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, options.compression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, level_), file_writer.get())); builder->Add(ik1.Encode().ToString(), v1); @@ -570,23 +570,20 @@ file_writer->Flush(); EXPECT_TRUE(s.ok()) << s.ToString(); - EXPECT_EQ( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - builder->FileSize()); + EXPECT_EQ(sink->contents().size(), builder->FileSize()); // Open the table - file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents(), - 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + test::StringSource* source = new test::StringSource( + sink->contents(), 0 /*uniq_id*/, ioptions.allow_mmap_reads); + std::unique_ptr file(source); + file_reader.reset(new RandomAccessFileReader(std::move(file), "test")); const bool 
kSkipFilters = true; const bool kImmortal = true; - ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + ASSERT_OK(ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, internal_comparator, !kSkipFilters, !kImmortal, level_), - std::move(file_reader), - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - &table_reader); + std::move(file_reader), sink->contents().size(), &table_reader)); // Search using Get() ReadOptions ro; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,10 +20,11 @@ #include #include + #include #include #include -#include "db/dbformat.h" + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" @@ -60,8 +61,11 @@ virtual bool IsBlockBased() = 0; // If is blockbased filter virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter - virtual void Add(const Slice& key) = 0; // Add a key to current filter - virtual size_t NumAdded() const = 0; // Number of keys added + virtual void Add( + const Slice& key_without_ts) = 0; // Add a key to current filter + virtual bool IsEmpty() const = 0; // Empty == none added + // For reporting stats on how many entries the builder considered unique + virtual size_t EstimateEntriesAdded() = 0; Slice Finish() { // Generate Filter const BlockHandle empty_handle; Status dont_care_status; @@ -69,7 +73,19 @@ assert(dont_care_status.ok()); return ret; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0; + // If filter_data is not nullptr, Finish() may transfer ownership of 
+ // underlying filter data to the caller, so that it can be freed as soon as + // possible. BlockBasedFilterBlock will ignore this parameter. + // + virtual Slice Finish( + const BlockHandle& tmp /* only used in PartitionedFilterBlock as + last_partition_block_handle */ + , + Status* status, std::unique_ptr* filter_data = nullptr) = 0; + + // It is for releasing the memory usage and cache reservation of filter bits + // builder in FullFilter and PartitionedFilter + virtual void ResetFilterBitsBuilder() {} }; // A FilterBlockReader is used to parse filter from SST table. @@ -108,11 +124,11 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, - get_context, lookup_context)) { + if (!KeyMayMatch(ukey_without_ts, prefix_extractor, block_offset, no_io, + &ikey, get_context, lookup_context)) { range->SkipKey(iter); } } @@ -133,13 +149,13 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (prefix_extractor->InDomain(ukey) && - !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey, get_context, - lookup_context)) { + if (prefix_extractor->InDomain(ukey_without_ts) && + !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), + prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -153,21 +169,24 @@ return error_msg; } - virtual void CacheDependencies(bool /*pin*/) 
{} + virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + return Status::OK(); + } virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, - const Slice& user_key, + const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, + bool no_io, BlockCacheLookupContext* lookup_context) { if (need_upper_bound_check) { return true; } *filter_checked = true; - Slice prefix = prefix_extractor->Transform(user_key); - return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + Slice prefix = prefix_extractor->Transform(user_key_without_ts); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io, const_ikey_ptr, /* get_context */ nullptr, lookup_context); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,7 +30,8 @@ table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); return s; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc 2025-05-19 16:14:27.000000000 +0000 @@ 
-7,30 +7,239 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "rocksdb/filter_policy.h" + #include #include +#include +#include -#include "rocksdb/filter_policy.h" - +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "logging/logging.h" #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" -#include "table/block_based/full_filter_block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" #include "third-party/folly/folly/ConstexprMath.h" #include "util/bloom_impl.h" #include "util/coding.h" #include "util/hash.h" +#include "util/ribbon_config.h" +#include "util/ribbon_impl.h" namespace ROCKSDB_NAMESPACE { namespace { +// Metadata trailer size for built-in filters. (This is separate from +// block-based table block trailer.) +// +// Originally this was 1 byte for num_probes and 4 bytes for number of +// cache lines in the Bloom filter, but now the first trailer byte is +// usually an implementation marker and remaining 4 bytes have various +// meanings. +static constexpr uint32_t kMetadataLen = 5; + +Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { + // Missing metadata, treated as zero entries + return Slice(nullptr, 0); +} + +// Base class for filter builders using the XXH3 preview hash, +// also known as Hash64 or GetSliceHash64. 
+class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit XXPH3FilterBitsBuilder( + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr) + : aggregate_rounding_balance_(aggregate_rounding_balance), + cache_res_mgr_(cache_res_mgr) {} + + ~XXPH3FilterBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. + if (hash_entries_.empty() || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + if (cache_res_mgr_ && + // Traditional rounding to whole bucket size + ((hash_entries_.size() % kUint64tHashEntryCacheResBucketSize) == + kUint64tHashEntryCacheResBucketSize / 2)) { + hash_entry_cache_res_bucket_handles_.emplace_back(nullptr); + Status s = + cache_res_mgr_ + ->MakeCacheReservation( + kUint64tHashEntryCacheResBucketSize * sizeof(hash), + &hash_entry_cache_res_bucket_handles_.back()); + s.PermitUncheckedError(); + } + } + } + + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); + } + + protected: + static constexpr uint32_t kMetadataLen = 5; + + // Number of hash entries to accumulate before charging their memory usage to + // the cache when cache reservation is available + static const std::size_t kUint64tHashEntryCacheResBucketSize = + CacheReservationManager::GetDummyEntrySize() / sizeof(uint64_t); + + // For delegating between XXPH3FilterBitsBuilders + void SwapEntriesWith(XXPH3FilterBitsBuilder* other) { + std::swap(hash_entries_, other->hash_entries_); + if (cache_res_mgr_) { + std::swap(hash_entry_cache_res_bucket_handles_, + other->hash_entry_cache_res_bucket_handles_); + } + } + + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + + // To choose size using malloc_usable_size, we have to 
actually allocate. + size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr* buf) { + // Return value set to a default; overwritten in some cases + size_t rv = target_len_with_metadata; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + if (aggregate_rounding_balance_ != nullptr) { + // Do optimize_filters_for_memory, using malloc_usable_size. + // Approach: try to keep FP rate balance better than or on + // target (negative aggregate_rounding_balance_). We can then select a + // lower bound filter size (within reasonable limits) that gets us as + // close to on target as possible. We request allocation for that filter + // size and use malloc_usable_size to "round up" to the actual + // allocation size. + + // Although it can be considered bad practice to use malloc_usable_size + // to access an object beyond its original size, this approach should be + // quite general: working for all allocators that properly support + // malloc_usable_size. + + // Race condition on balance is OK because it can only cause temporary + // skew in rounding up vs. rounding down, as long as updates are atomic + // and relative. + int64_t balance = aggregate_rounding_balance_->load(); + + double target_fp_rate = + EstimatedFpRate(num_entries, target_len_with_metadata); + double rv_fp_rate = target_fp_rate; + + if (balance < 0) { + // See formula for BloomFilterPolicy::aggregate_rounding_balance_ + double for_balance_fp_rate = + -balance / double{0x100000000} + target_fp_rate; + + // To simplify, we just try a few modified smaller sizes. This also + // caps how much we vary filter size vs. target, to avoid outlier + // behavior from excessive variance. 
+ size_t target_len = target_len_with_metadata - kMetadataLen; + assert(target_len < target_len_with_metadata); // check underflow + for (uint64_t maybe_len_rough : + {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16, + uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) { + size_t maybe_len_with_metadata = + RoundDownUsableSpace(maybe_len_rough + kMetadataLen); + double maybe_fp_rate = + EstimatedFpRate(num_entries, maybe_len_with_metadata); + if (maybe_fp_rate <= for_balance_fp_rate) { + rv = maybe_len_with_metadata; + rv_fp_rate = maybe_fp_rate; + break; + } + } + } + + // Filter blocks are loaded into block cache with their block trailer. + // We need to make sure that's accounted for in choosing a + // fragmentation-friendly size. + const size_t kExtraPadding = BlockBasedTable::kBlockTrailerSize; + size_t requested = rv + kExtraPadding; + + // Allocate and get usable size + buf->reset(new char[requested]); + size_t usable = malloc_usable_size(buf->get()); + + if (usable - usable / 4 > requested) { + // Ratio greater than 4/3 is too much for utilizing, if it's + // not a buggy or mislinked malloc_usable_size implementation. + // Non-linearity of FP rates with bits/key means rapidly + // diminishing returns in overall accuracy for additional + // storage on disk. + // Nothing to do, except assert that the result is accurate about + // the usable size. (Assignment never used.) 
+ assert(((*buf)[usable - 1] = 'x')); + } else if (usable > requested) { + rv = RoundDownUsableSpace(usable - kExtraPadding); + assert(rv <= usable - kExtraPadding); + rv_fp_rate = EstimatedFpRate(num_entries, rv); + } else { + // Too small means bad malloc_usable_size + assert(usable == requested); + } + memset(buf->get(), 0, rv); + + // Update balance + int64_t diff = static_cast((rv_fp_rate - target_fp_rate) * + double{0x100000000}); + *aggregate_rounding_balance_ += diff; + } else { + buf->reset(new char[rv]()); + } +#else + (void)num_entries; + buf->reset(new char[rv]()); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return rv; + } + + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque hash_entries_; + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. + std::atomic* aggregate_rounding_balance_; + + // For reserving memory used in (new) Bloom and Ribbon Filter construction + std::shared_ptr cache_res_mgr_; + + // For managing cache reservation for buckets of hash entry in (new) Bloom and + // Ribbon Filter construction + std::deque>> + hash_entry_cache_res_bucket_handles_; + + // For managing cache reservation for final filter in (new) Bloom and Ribbon + // Filter construction + std::deque>> + final_filter_cache_res_handles_; +}; + +// #################### FastLocalBloom implementation ################## // +// ############## also known as format_version=5 Bloom filter ########## // + // See description in FastLocalBloomImpl -class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { +class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { public: - explicit FastLocalBloomBitsBuilder(const int millibits_per_key) - : millibits_per_key_(millibits_per_key), - num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit 
FastLocalBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr), + millibits_per_key_(millibits_per_key) { assert(millibits_per_key >= 1000); } @@ -40,70 +249,126 @@ ~FastLocalBloomBitsBuilder() override {} - virtual void AddKey(const Slice& key) override { - uint64_t hash = GetSliceHash64(key); - if (hash_entries_.empty() || hash != hash_entries_.back()) { - hash_entries_.push_back(hash); + virtual Slice Finish(std::unique_ptr* buf) override { + size_t num_entries = hash_entries_.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr mutable_buf; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache reservation for mutable_buf + if (cache_res_mgr_) { + std::unique_ptr< + CacheReservationHandle> + final_filter_cache_res_handle; + Status s = + cache_res_mgr_ + ->MakeCacheReservation( + len_with_metadata * sizeof(char), + &final_filter_cache_res_handle); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + s.PermitUncheckedError(); } - } - virtual Slice Finish(std::unique_ptr* buf) override { - uint32_t len_with_metadata = - CalculateSpace(static_cast(hash_entries_.size())); - char* data = new char[len_with_metadata]; - memset(data, 0, len_with_metadata); + assert(mutable_buf); + assert(len_with_metadata >= kMetadataLen); + + // Max size supported by implementation + assert(len_with_metadata <= 0xffffffffU); - assert(data); - assert(len_with_metadata >= 5); + // Compute num_probes after any rounding / adjustments + int num_probes = GetNumProbes(num_entries, len_with_metadata); - uint32_t len = len_with_metadata - 5; + uint32_t len = static_cast(len_with_metadata - kMetadataLen); if (len > 0) { - AddAllEntries(data, len); + AddAllEntries(mutable_buf.get(), len, num_probes); } + assert(hash_entries_.empty()); + // 
Release cache for hash entries + hash_entry_cache_res_bucket_handles_.clear(); + // See BloomFilterPolicy::GetBloomBitsReader re: metadata // -1 = Marker for newer Bloom implementations - data[len] = static_cast(-1); + mutable_buf[len] = static_cast(-1); // 0 = Marker for this sub-implementation - data[len + 1] = static_cast(0); + mutable_buf[len + 1] = static_cast(0); // num_probes (and 0 in upper bits for 64-byte block size) - data[len + 2] = static_cast(num_probes_); + mutable_buf[len + 2] = static_cast(num_probes); // rest of metadata stays zero - const char* const_data = data; - buf->reset(const_data); - assert(hash_entries_.empty()); + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + return rv; + } + + size_t ApproximateNumEntries(size_t bytes) override { + size_t bytes_no_meta = + bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0; + return static_cast(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + size_t CalculateSpace(size_t num_entries) override { + // If not for cache line blocks in the filter, what would the target + // length in bytes be? + size_t raw_target_len = static_cast( + (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000); + + if (raw_target_len >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + raw_target_len = size_t{0xffffffc0}; + } - return Slice(data, len_with_metadata); + // Round up to nearest multiple of 64 (block size). This adjustment is + // used for target FP rate only so that we don't receive complaints about + // lower FP rate vs. historic Bloom filter behavior. + return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen; } - int CalculateNumEntry(const uint32_t bytes) override { - uint32_t bytes_no_meta = bytes >= 5u ? 
bytes - 5u : 0; - return static_cast(uint64_t{8000} * bytes_no_meta / - millibits_per_key_); + double EstimatedFpRate(size_t keys, size_t len_with_metadata) override { + int num_probes = GetNumProbes(keys, len_with_metadata); + return FastLocalBloomImpl::EstimatedFpRate( + keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64); } - uint32_t CalculateSpace(const int num_entry) override { - uint32_t num_cache_lines = 0; - if (millibits_per_key_ > 0 && num_entry > 0) { - num_cache_lines = static_cast( - (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + if (rv >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + rv = size_t{0xffffffc0}; } - return num_cache_lines * 64 + /*metadata*/ 5; - } - double EstimatedFpRate(size_t keys, size_t bytes) override { - return FastLocalBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, - num_probes_, /*hash bits*/ 64); + // round down to multiple of 64 (block size) + rv &= ~size_t{63}; + + return rv + kMetadataLen; } private: - void AddAllEntries(char* data, uint32_t len) { + // Compute num_probes after any rounding / adjustments + int GetNumProbes(size_t keys, size_t len_with_metadata) { + uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000; + int actual_millibits_per_key = + static_cast(millibits / std::max(keys, size_t{1})); + // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to + // minimize unit test churn. Remove this some time. 
+ if (!aggregate_rounding_balance_) { + actual_millibits_per_key = millibits_per_key_; + } + // END XXX/TODO + return FastLocalBloomImpl::ChooseNumProbes(actual_millibits_per_key); + } + + void AddAllEntries(char* data, uint32_t len, int num_probes) { // Simple version without prefetching: // // for (auto h : hash_entries_) { // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, - // num_probes_, data); + // num_probes, data); // } const size_t num_entries = hash_entries_.size(); @@ -129,7 +394,7 @@ uint32_t& hash_ref = hashes[i & kBufferMask]; uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; // Process (add) - FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_, + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes, data + byte_offset_ref); // And buffer uint64_t h = hash_entries_.front(); @@ -141,16 +406,13 @@ // Finish processing for (i = 0; i <= kBufferMask && i < num_entries; ++i) { - FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_, + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes, data + byte_offsets[i]); } } + // Target allocation per added key, in thousandths of a bit. int millibits_per_key_; - int num_probes_; - // A deque avoids unnecessary copying of already-saved values - // and has near-minimal peak memory use. - std::deque hash_entries_; }; // See description in FastLocalBloomImpl @@ -195,6 +457,405 @@ const uint32_t len_bytes_; }; +// ##################### Ribbon filter implementation ################### // + +// Implements concept RehasherTypesAndSettings in ribbon_impl.h +struct Standard128RibbonRehasherTypesAndSettings { + // These are schema-critical. Any change almost certainly changes + // underlying data. 
+ static constexpr bool kIsFilter = true; + static constexpr bool kHomogeneous = false; + static constexpr bool kFirstCoeffAlwaysOne = true; + static constexpr bool kUseSmash = false; + using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; + using Hash = uint64_t; + using Seed = uint32_t; + // Changing these doesn't necessarily change underlying data, + // but might affect supported scalability of those dimensions. + using Index = uint32_t; + using ResultRow = uint32_t; + // Save a conditional in Ribbon queries + static constexpr bool kAllowZeroStarts = false; +}; + +using Standard128RibbonTypesAndSettings = + ribbon::StandardRehasherAdapter; + +class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { + public: + explicit Standard128RibbonBitsBuilder( + double desired_one_in_fp_rate, int bloom_millibits_per_key, + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, Logger* info_log) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr), + desired_one_in_fp_rate_(desired_one_in_fp_rate), + info_log_(info_log), + bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance, + cache_res_mgr) { + assert(desired_one_in_fp_rate >= 1.0); + } + + // No Copy allowed + Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete; + void operator=(const Standard128RibbonBitsBuilder&) = delete; + + ~Standard128RibbonBitsBuilder() override {} + + virtual Slice Finish(std::unique_ptr* buf) override { + if (hash_entries_.size() > kMaxRibbonEntries) { + ROCKS_LOG_WARN(info_log_, "Too many keys for Ribbon filter: %llu", + static_cast(hash_entries_.size())); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + if (hash_entries_.size() == 0) { + // Save a conditional in Ribbon queries by using alternate reader + // for zero entries added. 
+ return FinishAlwaysFalse(buf); + } + uint32_t num_entries = static_cast(hash_entries_.size()); + uint32_t num_slots; + size_t len_with_metadata; + + CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots); + + // Bloom fall-back indicator + if (num_slots == 0) { + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Lower32of64(hash_entries_.front()); + } + + BandingType banding; + std::size_t bytes_banding = ribbon::StandardBanding< + Standard128RibbonTypesAndSettings>::EstimateMemoryUsage(num_slots); + Status status_banding_cache_res = Status::OK(); + + // Cache reservation for banding + std::unique_ptr> + banding_res_handle; + if (cache_res_mgr_) { + status_banding_cache_res = + cache_res_mgr_ + ->MakeCacheReservation( + bytes_banding, &banding_res_handle); + } + + if (status_banding_cache_res.IsIncomplete()) { + ROCKS_LOG_WARN(info_log_, + "Cache reservation for Ribbon filter banding failed due " + "to cache full"); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + // Release cache for banding since the banding won't be allocated + banding_res_handle.reset(); + return bloom_fallback_.Finish(buf); + } + + bool success = banding.ResetAndFindSeedToSolve( + num_slots, hash_entries_.begin(), hash_entries_.end(), + /*starting seed*/ entropy & 255, /*seed mask*/ 255); + if (!success) { + ROCKS_LOG_WARN(info_log_, + "Too many re-seeds (256) for Ribbon filter, %llu / %llu", + static_cast(hash_entries_.size()), + static_cast(num_slots)); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + hash_entries_.clear(); + // Release cache for hash entries + hash_entry_cache_res_bucket_handles_.clear(); + + uint32_t seed = banding.GetOrdinalSeed(); + assert(seed < 256); + + std::unique_ptr mutable_buf; + len_with_metadata = + 
AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache reservation for mutable_buf + if (cache_res_mgr_) { + std::unique_ptr< + CacheReservationHandle> + final_filter_cache_res_handle; + Status s = + cache_res_mgr_ + ->MakeCacheReservation( + len_with_metadata * sizeof(char), + &final_filter_cache_res_handle); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + s.PermitUncheckedError(); + } + + SolnType soln(mutable_buf.get(), len_with_metadata); + soln.BackSubstFrom(banding); + uint32_t num_blocks = soln.GetNumBlocks(); + // This should be guaranteed: + // num_entries < 2^30 + // => (overhead_factor < 2.0) + // num_entries * overhead_factor == num_slots < 2^31 + // => (num_blocks = num_slots / 128) + // num_blocks < 2^24 + assert(num_blocks < 0x1000000U); + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -2 = Marker for Standard128 Ribbon + mutable_buf[len_with_metadata - 5] = static_cast(-2); + // Hash seed + mutable_buf[len_with_metadata - 4] = static_cast(seed); + // Number of blocks, in 24 bits + // (Along with bytes, we can derive other settings) + mutable_buf[len_with_metadata - 3] = static_cast(num_blocks & 255); + mutable_buf[len_with_metadata - 2] = + static_cast((num_blocks >> 8) & 255); + mutable_buf[len_with_metadata - 1] = + static_cast((num_blocks >> 16) & 255); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + return rv; + } + + // Setting num_slots to 0 means "fall back on Bloom filter." + // And note this implementation does not support num_entries or num_slots + // beyond uint32_t; see kMaxRibbonEntries. 
+ void CalculateSpaceAndSlots(size_t num_entries, + size_t* target_len_with_metadata, + uint32_t* num_slots) { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries); + return; + } + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Upper32of64(hash_entries_.front()); + } + + *num_slots = NumEntriesToNumSlots(static_cast(num_entries)); + *target_len_with_metadata = + SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_, + /*rounding*/ entropy) + + kMetadataLen; + + // Consider possible Bloom fallback for small filters + if (*num_slots < 1024) { + size_t bloom = bloom_fallback_.CalculateSpace(num_entries); + if (bloom < *target_len_with_metadata) { + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom; + return; + } + } + } + + size_t CalculateSpace(size_t num_entries) override { + if (num_entries == 0) { + // See FinishAlwaysFalse + return 0; + } + size_t target_len_with_metadata; + uint32_t num_slots; + CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots); + (void)num_slots; + return target_len_with_metadata; + } + + // This is a somewhat ugly but reasonably fast and reasonably accurate + // reversal of CalculateSpace. + size_t ApproximateNumEntries(size_t bytes) override { + size_t len_no_metadata = + RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) - + kMetadataLen; + + if (!(desired_one_in_fp_rate_ > 1.0)) { + // Effectively asking for 100% FP rate, or NaN etc. 
+ // Note that NaN is neither < 1.0 nor > 1.0 + return kMaxRibbonEntries; + } + + // Find a slight under-estimate for actual average bits per slot + double min_real_bits_per_slot; + if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits::max()) { + // Max of 32 solution columns (result bits) + min_real_bits_per_slot = 32.0; + } else { + // Account for mix of b and b+1 solution columns being slightly + // suboptimal vs. ideal log2(1/fp_rate) bits. + uint32_t rounded = static_cast(desired_one_in_fp_rate_); + int upper_bits_per_key = 1 + FloorLog2(rounded); + double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key); + double portion_lower = + (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) / + fp_rate_for_upper; + min_real_bits_per_slot = upper_bits_per_key - portion_lower; + assert(min_real_bits_per_slot > 0.0); + assert(min_real_bits_per_slot <= 32.0); + } + + // An overestimate, but this should only be O(1) slots away from truth. + double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot; + + // Let's not bother accounting for overflow to Bloom filter + // (Includes NaN case) + if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) { + return kMaxRibbonEntries; + } + + // Set up for short iteration + uint32_t slots = static_cast(max_slots); + slots = SolnType::RoundUpNumSlots(slots); + + // Assert that we have a valid upper bound on slots + assert(SolnType::GetBytesForOneInFpRate( + SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_, + /*rounding*/ 0) > len_no_metadata); + + // Iterate up to a few times to rather precisely account for small effects + for (int i = 0; slots > 0; ++i) { + size_t reqd_bytes = + SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_, + /*rounding*/ 0); + if (reqd_bytes <= len_no_metadata) { + break; // done + } + if (i >= 2) { + // should have been enough iterations + assert(false); + break; + } + slots = SolnType::RoundDownNumSlots(slots - 1); + } + + uint32_t num_entries = 
ConfigHelper::GetNumToAdd(slots); + + // Consider possible Bloom fallback for small filters + if (slots < 1024) { + size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes); + if (bloom > num_entries) { + return bloom; + } else { + return num_entries; + } + } else { + return std::min(num_entries, kMaxRibbonEntries); + } + } + + double EstimatedFpRate(size_t num_entries, + size_t len_with_metadata) override { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata); + } + uint32_t num_slots = + NumEntriesToNumSlots(static_cast(num_entries)); + SolnType fake_soln(nullptr, len_with_metadata); + fake_soln.ConfigureForNumSlots(num_slots); + return fake_soln.ExpectedFpRate(); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + // round down to multiple of 16 (segment size) + rv &= ~size_t{15}; + + return rv + kMetadataLen; + } + + private: + using TS = Standard128RibbonTypesAndSettings; + using SolnType = ribbon::SerializableInterleavedSolution; + using BandingType = ribbon::StandardBanding; + using ConfigHelper = ribbon::BandingConfigHelper1TS; + + static uint32_t NumEntriesToNumSlots(uint32_t num_entries) { + uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries); + return SolnType::RoundUpNumSlots(num_slots1); + } + + // Approximate num_entries to ensure number of bytes fits in 32 bits, which + // is not an inherent limitation but does ensure somewhat graceful Bloom + // fallback for crazy high number of entries, since the Bloom implementation + // does not support number of bytes bigger than fits in 32 bits. This is + // within an order of magnitude of implementation limit on num_slots + // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits + // (for filter metadata). 
+ static constexpr uint32_t kMaxRibbonEntries = 950000000; // ~ 1 billion + + // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + + // For warnings, or can be nullptr + Logger* info_log_; + + // For falling back on Bloom filter in some exceptional cases and + // very small filter cases + FastLocalBloomBitsBuilder bloom_fallback_; +}; + +// for the linker, at least with DEBUG_LEVEL=2 +constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries; + +class Standard128RibbonBitsReader : public FilterBitsReader { + public: + Standard128RibbonBitsReader(const char* data, size_t len_bytes, + uint32_t num_blocks, uint32_t seed) + : soln_(const_cast(data), len_bytes) { + soln_.ConfigureForNumBlocks(num_blocks); + hasher_.SetOrdinalSeed(seed); + } + + // No Copy allowed + Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete; + void operator=(const Standard128RibbonBitsReader&) = delete; + + ~Standard128RibbonBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + return soln_.FilterQuery(h, hasher_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + struct SavedData { + uint64_t seeded_hash; + uint32_t segment_num; + uint32_t num_columns; + uint32_t start_bits; + }; + std::array saved; + for (int i = 0; i < num_keys; ++i) { + ribbon::InterleavedPrepareQuery( + GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash, + &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = ribbon::InterleavedFilterQuery( + saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns, + saved[i].start_bits, hasher_, soln_); + } + } + + private: + using TS = Standard128RibbonTypesAndSettings; + ribbon::SerializableInterleavedSolution soln_; + ribbon::StandardHasher hasher_; +}; + +// ##################### Legacy Bloom implementation 
################### // + using LegacyBloomImpl = LegacyLocalityBloomImpl; class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { @@ -209,21 +870,25 @@ void AddKey(const Slice& key) override; - Slice Finish(std::unique_ptr* buf) override; + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); + } - int CalculateNumEntry(const uint32_t bytes) override; + Slice Finish(std::unique_ptr* buf) override; - uint32_t CalculateSpace(const int num_entry) override { + size_t CalculateSpace(size_t num_entries) override { uint32_t dont_care1; uint32_t dont_care2; - return CalculateSpace(num_entry, &dont_care1, &dont_care2); + return CalculateSpace(num_entries, &dont_care1, &dont_care2); } double EstimatedFpRate(size_t keys, size_t bytes) override { - return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen, num_probes_); } + size_t ApproximateNumEntries(size_t bytes) override; + private: int bits_per_key_; int num_probes_; @@ -234,11 +899,11 @@ uint32_t GetTotalBitsForLocality(uint32_t total_bits); // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, + char* ReserveSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Implementation-specific variant of public CalculateSpace - uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Assuming single threaded access to this function. 
@@ -306,7 +971,29 @@ buf->reset(const_data); hash_entries_.clear(); - return Slice(data, total_bits / 8 + 5); + return Slice(data, total_bits / 8 + kMetadataLen); +} + +size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + + uint64_t total_bits_tmp = bytes * 8; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000}); + + uint32_t high = static_cast(total_bits_tmp) / + static_cast(bits_per_key_) + + 1; + uint32_t low = 1; + uint32_t n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + return n; } uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { @@ -321,14 +1008,18 @@ return num_lines * (CACHE_LINE_SIZE * 8); } -uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, +uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines) { assert(bits_per_key_); - if (num_entry != 0) { - uint32_t total_bits_tmp = static_cast(num_entry * bits_per_key_); + if (num_entries != 0) { + size_t total_bits_tmp = num_entries * bits_per_key_; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000}); - *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *total_bits = + GetTotalBitsForLocality(static_cast(total_bits_tmp)); *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); assert(*total_bits > 0 && *total_bits % 8 == 0); } else { @@ -339,34 +1030,19 @@ // Reserve space for Filter uint32_t sz = *total_bits / 8; - sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + sz += kMetadataLen; // 4 bytes for num_lines, 1 byte for num_probes return sz; } -char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, +char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries, uint32_t* 
total_bits, uint32_t* num_lines) { - uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines); char* data = new char[sz]; memset(data, 0, sz); return data; } -int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { - assert(bits_per_key_); - assert(bytes > 0); - int high = static_cast(bytes * 8 / bits_per_key_ + 1); - int low = 1; - int n = high; - for (; n >= low; n--) { - if (CalculateSpace(n) <= bytes) { - break; - } - } - assert(n < high); // High should be an overestimation - return n; -} - inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits) { @@ -449,15 +1125,17 @@ kLegacyBloom, kDeprecatedBlock, kFastLocalBloom, + kStandard128Ribbon, }; const std::vector BloomFilterPolicy::kAllUserModes = { kDeprecatedBlock, - kAuto, + kAutoBloom, + kStandard128Ribbon, }; BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) - : mode_(mode), warned_(false) { + : mode_(mode), warned_(false), aggregate_rounding_balance_(0) { // Sanitize bits_per_key if (bits_per_key < 1.0) { bits_per_key = 1.0; @@ -470,6 +1148,15 @@ // point are interpreted accurately. millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); + // For now configure Ribbon filter to match Bloom FP rate and save + // memory. (Ribbon bits per key will be ~30% less than Bloom bits per key + // for same FP rate.) + desired_one_in_fp_rate_ = + 1.0 / BloomMath::CacheLocalFpRate( + bits_per_key, + FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_), + /*cache_line_bits*/ 512); + // For better or worse, this is a rounding up of a nudged rounding up, // e.g. 7.4999999999999 will round up to 8, but that provides more // predictability against small arithmetic errors in floating point. 
@@ -478,7 +1165,7 @@ BloomFilterPolicy::~BloomFilterPolicy() {} -const char* BloomFilterPolicy::Name() const { +const char* BuiltinFilterPolicy::Name() const { return "rocksdb.BuiltinBloomFilter"; } @@ -511,8 +1198,8 @@ } } -bool BloomFilterPolicy::KeyMayMatch(const Slice& key, - const Slice& bloom_filter) const { +bool BuiltinFilterPolicy::KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const { const size_t len = bloom_filter.size(); if (len < 2 || len > 0xffffffffU) { return false; @@ -534,7 +1221,7 @@ array); } -FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { +FilterBitsBuilder* BuiltinFilterPolicy::GetFilterBitsBuilder() const { // This code path should no longer be used, for the built-in // BloomFilterPolicy. Internal to RocksDB and outside // BloomFilterPolicy, only get a FilterBitsBuilder with @@ -549,11 +1236,20 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( const FilterBuildingContext& context) const { Mode cur = mode_; + bool offm = context.table_options.optimize_filters_for_memory; + bool reserve_filter_construction_mem = + (context.table_options.reserve_table_builder_memory && + context.table_options.block_cache); + std::shared_ptr cache_res_mgr; + if (reserve_filter_construction_mem) { + cache_res_mgr = std::make_shared( + context.table_options.block_cache); + } // Unusual code construction so that we can have just // one exhaustive switch without (risky) recursion for (int i = 0; i < 2; ++i) { switch (cur) { - case kAuto: + case kAutoBloom: if (context.table_options.format_version < 5) { cur = kLegacyBloom; } else { @@ -561,9 +1257,18 @@ } break; case kDeprecatedBlock: + if (context.info_log && !warned_.load(std::memory_order_relaxed)) { + warned_ = true; + ROCKS_LOG_WARN(context.info_log, + "Using deprecated block-based Bloom filter is " + "inefficient (%d bits per key).", + whole_bits_per_key_); + } return nullptr; case kFastLocalBloom: - return new FastLocalBloomBitsBuilder(millibits_per_key_); + 
return new FastLocalBloomBitsBuilder( + millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr, + cache_res_mgr); case kLegacyBloom: if (whole_bits_per_key_ >= 14 && context.info_log && !warned_.load(std::memory_order_relaxed)) { @@ -585,6 +1290,11 @@ } return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); + case kStandard128Ribbon: + return new Standard128RibbonBitsBuilder( + desired_one_in_fp_rate_, millibits_per_key_, + offm ? &aggregate_rounding_balance_ : nullptr, cache_res_mgr, + context.info_log); } } assert(false); @@ -602,10 +1312,10 @@ // Read metadata to determine what kind of FilterBitsReader is needed // and return a new one. -FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( +FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - if (len_with_meta <= 5) { + if (len_with_meta <= kMetadataLen) { // filter is empty or broken. Treat like zero keys added. 
return new AlwaysFalseFilter(); } @@ -623,7 +1333,7 @@ // len_with_meta +-----------------------------------+ int8_t raw_num_probes = - static_cast(contents.data()[len_with_meta - 5]); + static_cast(contents.data()[len_with_meta - kMetadataLen]); // NB: *num_probes > 30 and < 128 probably have not been used, because of // BloomFilterPolicy::initialize, unless directly calling // LegacyBloomBitsBuilder as an API, but we are leaving those cases in @@ -632,13 +1342,20 @@ if (raw_num_probes < 1) { // Note: < 0 (or unsigned > 127) indicate special new implementations // (or reserved for future use) - if (raw_num_probes == -1) { - // Marker for newer Bloom implementations - return GetBloomBitsReader(contents); + switch (raw_num_probes) { + case 0: + // Treat as zero probes (always FP) + return new AlwaysTrueFilter(); + case -1: + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + case -2: + // Marker for Ribbon implementations + return GetRibbonBitsReader(contents); + default: + // Reserved (treat as zero probes, always FP, for now) + return new AlwaysTrueFilter(); } - // otherwise - // Treat as zero probes (always FP) for now. 
- return new AlwaysTrueFilter(); } // else attempt decode for LegacyBloomBitsReader @@ -646,7 +1363,7 @@ assert(num_probes >= 1); assert(num_probes <= 127); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); @@ -676,11 +1393,34 @@ log2_cache_line_size); } +FilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + uint32_t seed = static_cast(contents.data()[len + 1]); + uint32_t num_blocks = static_cast(contents.data()[len + 2]); + num_blocks |= static_cast(contents.data()[len + 3]) << 8; + num_blocks |= static_cast(contents.data()[len + 4]) << 16; + if (num_blocks < 2) { + // Not supported + // num_blocks == 1 is not used because num_starts == 1 is problematic + // for the hashing scheme. num_blocks == 0 is unused because there's + // already a concise encoding of an "always false" filter. 
+ // Return something safe: + return new AlwaysTrueFilter(); + } + return new Standard128RibbonBitsReader(contents.data(), len, num_blocks, + seed); +} + // For newer Bloom filter implementations -FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( +FilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); // precondition @@ -742,7 +1482,7 @@ if (use_block_based_builder) { m = BloomFilterPolicy::kDeprecatedBlock; } else { - m = BloomFilterPolicy::kAuto; + m = BloomFilterPolicy::kAutoBloom; } assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), BloomFilterPolicy::kAllUserModes.end(), @@ -750,10 +1490,125 @@ return new BloomFilterPolicy(bits_per_key, m); } +// Chooses between two filter policies based on LSM level, but +// only for Level and Universal compaction styles. Flush is treated +// as level -1. Policy b is considered fallback / primary policy. 
+LevelThresholdFilterPolicy::LevelThresholdFilterPolicy( + std::unique_ptr&& a, + std::unique_ptr&& b, int starting_level_for_b) + : policy_a_(std::move(a)), + policy_b_(std::move(b)), + starting_level_for_b_(starting_level_for_b) { + // Don't use this wrapper class if you were going to set to -1 + assert(starting_level_for_b_ >= 0); +} + +// Deprecated block-based filter only +void LevelThresholdFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + policy_b_->CreateFilter(keys, n, dst); +} + +FilterBitsBuilder* LevelThresholdFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + switch (context.compaction_style) { + case kCompactionStyleLevel: + case kCompactionStyleUniversal: { + int levelish; + if (context.reason == TableFileCreationReason::kFlush) { + // Treat flush as level -1 + assert(context.level_at_creation == 0); + levelish = -1; + } else if (context.level_at_creation == -1) { + // Unknown level + // Policy b considered fallback / primary + return policy_b_->GetBuilderWithContext(context); + } else { + levelish = context.level_at_creation; + } + if (levelish >= starting_level_for_b_) { + return policy_b_->GetBuilderWithContext(context); + } else { + return policy_a_->GetBuilderWithContext(context); + } + } + case kCompactionStyleFIFO: + case kCompactionStyleNone: + break; + } + // Policy b considered fallback / primary + return policy_b_->GetBuilderWithContext(context); +} + +const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level) { + std::unique_ptr ribbon_only{new BloomFilterPolicy( + bloom_equivalent_bits_per_key, BloomFilterPolicy::kStandard128Ribbon)}; + if (bloom_before_level > -1) { + // Could also use Bloom policy + std::unique_ptr bloom_only{new BloomFilterPolicy( + bloom_equivalent_bits_per_key, BloomFilterPolicy::kFastLocalBloom)}; + return new LevelThresholdFilterPolicy( + std::move(bloom_only), std::move(ribbon_only), 
bloom_before_level); + } else { + return ribbon_only.release(); + } +} + FilterBuildingContext::FilterBuildingContext( const BlockBasedTableOptions& _table_options) : table_options(_table_options) {} FilterPolicy::~FilterPolicy() { } +Status FilterPolicy::CreateFromString( + const ConfigOptions& /*options*/, const std::string& value, + std::shared_ptr* policy) { + const std::string kBloomName = "bloomfilter:"; + const std::string kExpRibbonName = "experimental_ribbon:"; + const std::string kRibbonName = "ribbonfilter:"; + if (value == kNullptrString || value == "rocksdb.BuiltinBloomFilter") { + policy->reset(); +#ifndef ROCKSDB_LITE + } else if (value.compare(0, kBloomName.size(), kBloomName) == 0) { + size_t pos = value.find(':', kBloomName.size()); + if (pos == std::string::npos) { + return Status::InvalidArgument( + "Invalid filter policy config, missing bits_per_key"); + } else { + double bits_per_key = ParseDouble( + trim(value.substr(kBloomName.size(), pos - kBloomName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); + policy->reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + } + } else if (value.compare(0, kExpRibbonName.size(), kExpRibbonName) == 0) { + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kExpRibbonName.size()))); + policy->reset( + NewExperimentalRibbonFilterPolicy(bloom_equivalent_bits_per_key)); + } else if (value.compare(0, kRibbonName.size(), kRibbonName) == 0) { + size_t pos = value.find(':', kRibbonName.size()); + int bloom_before_level; + if (pos == std::string::npos) { + pos = value.size(); + bloom_before_level = 0; + } else { + bloom_before_level = ParseInt(trim(value.substr(pos + 1))); + } + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kRibbonName.size(), pos))); + policy->reset(NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, + bloom_before_level)); + } else { + return 
Status::NotFound("Invalid filter policy name ", value); +#else + } else { + return Status::NotSupported("Cannot load filter policy in LITE mode ", + value); +#endif // ROCKSDB_LITE + } + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,20 +25,52 @@ class BuiltinFilterBitsBuilder : public FilterBitsBuilder { public: // Calculate number of bytes needed for a new filter, including - // metadata. Passing the result to CalculateNumEntry should - // return >= the num_entry passed in. - virtual uint32_t CalculateSpace(const int num_entry) = 0; + // metadata. Passing the result to ApproximateNumEntries should + // (ideally, usually) return >= the num_entry passed in. + // When optimize_filters_for_memory is enabled, this function + // is not authoritative but represents a target size that should + // be close to the average size. + virtual size_t CalculateSpace(size_t num_entries) = 0; // Returns an estimate of the FP rate of the returned filter if - // `keys` keys are added and the filter returned by Finish is `bytes` - // bytes. - virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0; + // `num_entries` keys are added and the filter returned by Finish + // is `bytes` bytes. + virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; -// RocksDB built-in filter policy for Bloom or Bloom-like filters. +// Abstract base class for RocksDB built-in filter policies. // This class is considered internal API and subject to change. -// See NewBloomFilterPolicy. 
-class BloomFilterPolicy : public FilterPolicy { +class BuiltinFilterPolicy : public FilterPolicy { + public: + // Shared name because any built-in policy can read filters from + // any other + const char* Name() const override; + + // Deprecated block-based filter only + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; + + // Old API + FilterBitsBuilder* GetFilterBitsBuilder() const override; + + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; + + private: + // For Bloom filter implementation(s) (except deprecated block-based filter) + FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; + + // For Ribbon filter implementation(s) + FilterBitsReader* GetRibbonBitsReader(const Slice& contents) const; +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters including +// Ribbon filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy and NewRibbonFilterPolicy. +class BloomFilterPolicy : public BuiltinFilterPolicy { public: // An internal marker for operating modes of BloomFilterPolicy, in terms // of selecting an implementation. This makes it easier for tests to track @@ -64,10 +96,12 @@ // FastLocalBloomImpl. // NOTE: TESTING ONLY as this mode does not check format_version kFastLocalBloom = 2, - // Automatically choose from the above (except kDeprecatedBlock) based on + // A Bloom alternative saving about 30% space for ~3-4x construction + // CPU time. See ribbon_alg.h and ribbon_impl.h. 
+ kStandard128Ribbon = 3, + // Automatically choose between kLegacyBloom and kFastLocalBloom based on // context at build time, including compatibility with format_version. - // NOTE: This is currently the only recommended mode that is user exposed. - kAuto = 100, + kAutoBloom = 100, }; // All the different underlying implementations that a BloomFilterPolicy // might use, as a mode that says "always use this implementation." @@ -83,16 +117,9 @@ ~BloomFilterPolicy() override; - const char* Name() const override; - // Deprecated block-based filter only void CreateFilter(const Slice* keys, int n, std::string* dst) const override; - // Deprecated block-based filter only - bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; - - FilterBitsBuilder* GetFilterBitsBuilder() const override; - // To use this function, call GetBuilderFromContext(). // // Neither the context nor any objects therein should be saved beyond @@ -105,18 +132,16 @@ // (An internal convenience function to save boilerplate.) static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); - // Read metadata to determine what kind of FilterBitsReader is needed - // and return a new one. This must successfully process any filter data - // generated by a built-in FilterBitsBuilder, regardless of the impl - // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. - FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; - // Essentially for testing only: configured millibits/key int GetMillibitsPerKey() const { return millibits_per_key_; } // Essentially for testing only: legacy whole bits/key int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + // Testing only + Mode GetMode() const { return mode_; } private: + // Bits per key settings are for configuring Bloom filters. + // Newer filters support fractional bits per key. 
For predictable behavior // of 0.001-precision values across floating point implementations, we // round to thousandths of a bit (on average) per key. @@ -127,6 +152,10 @@ // behavior with format_version < 5 just in case.) int whole_bits_per_key_; + // For configuring Ribbon filter: a desired value for 1/fp_rate. For + // example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + // Selected mode (a specific implementation or way of selecting an // implementation) for building new SST filters. Mode mode_; @@ -135,8 +164,42 @@ // only report once per BloomFilterPolicy instance, to keep the noise down.) mutable std::atomic warned_; - // For newer Bloom filter implementation(s) - FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; + // State for implementing optimize_filters_for_memory. Essentially, this + // tracks a surplus or deficit in total FP rate of filters generated by + // builders under this policy vs. what would have been generated without + // optimize_filters_for_memory. + // + // To avoid floating point weirdness, the actual value is + // Sum over all generated filters f: + // (predicted_fp_rate(f) - predicted_fp_rate(f|o_f_f_m=false)) * 2^32 + mutable std::atomic aggregate_rounding_balance_; +}; + +// Chooses between two filter policies based on LSM level, but +// only for Level and Universal compaction styles. Flush is treated +// as level -1. Policy b is considered fallback / primary policy. 
+class LevelThresholdFilterPolicy : public BuiltinFilterPolicy { + public: + LevelThresholdFilterPolicy(std::unique_ptr&& a, + std::unique_ptr&& b, + int starting_level_for_b); + + // Deprecated block-based filter only + void CreateFilter(const Slice* keys, int n, std::string* dst) const override; + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + inline int TEST_GetStartingLevelForB() const { return starting_level_for_b_; } + + inline const FilterPolicy* TEST_GetPolicyA() const { return policy_a_.get(); } + + inline const FilterPolicy* TEST_GetPolicyB() const { return policy_b_.get(); } + + private: + const std::unique_ptr policy_a_; + const std::unique_ptr policy_b_; + int starting_level_for_b_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,12 +4,18 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "rocksdb/flush_block_policy.h" + +#include +#include + #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/utilities/customizable_util.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy.h" #include "table/format.h" -#include namespace ROCKSDB_NAMESPACE { @@ -57,7 +63,7 @@ data_block_builder_.EstimateSizeAfterKV(key, value); if (align_) { - estimated_size_after += kBlockTrailerSize; + estimated_size_after += BlockBasedTable::kBlockTrailerSize; return estimated_size_after > block_size_; } @@ -85,4 +91,58 @@ return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); } +#ifndef ROCKSDB_LITE +static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + FlushBlockBySizePolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FlushBlockBySizePolicyFactory()); + return guard->get(); + }); + library.AddFactory( + FlushBlockEveryKeyPolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FlushBlockEveryKeyPolicyFactory()); + return guard->get(); + }); + return 2; +} +#endif // ROCKSDB_LITE + +static bool LoadFlushPolicyFactory( + const std::string& id, std::shared_ptr* result) { + if (id.empty()) { + result->reset(new FlushBlockBySizePolicyFactory()); +#ifdef ROCKSDB_LITE + } else if (id == FlushBlockBySizePolicyFactory::kClassName()) { + result->reset(new FlushBlockBySizePolicyFactory()); + } else if (id == FlushBlockEveryKeyPolicyFactory::kClassName()) { + result->reset(new FlushBlockEveryKeyPolicyFactory()); +#endif // ROCKSDB_LITE + } else { + return false; + } + return true; +} + +FlushBlockBySizePolicyFactory::FlushBlockBySizePolicyFactory() + : FlushBlockPolicyFactory() {} + +Status 
FlushBlockPolicyFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* factory) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterFlushBlockPolicyFactories(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject( + config_options, value, LoadFlushPolicyFactory, factory); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h 2025-05-19 16:14:27.000000000 +0000 @@ -27,9 +27,8 @@ public: explicit FlushBlockEveryKeyPolicyFactory() {} - const char* Name() const override { - return "FlushBlockEveryKeyPolicyFactory"; - } + static const char* kClassName() { return "FlushBlockEveryKeyPolicyFactory"; } + const char* Name() const override { return kClassName(); } FlushBlockPolicy* NewFlushBlockPolicy( const BlockBasedTableOptions& /*table_options*/, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -22,42 +22,63 @@ whole_key_filtering_(whole_key_filtering), last_whole_key_recorded_(false), last_prefix_recorded_(false), - num_added_(0) { + last_key_in_domain_(false), + any_added_(false) { assert(filter_bits_builder != nullptr); filter_bits_builder_.reset(filter_bits_builder); } -void FullFilterBlockBuilder::Add(const Slice& key) { - 
const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); +size_t FullFilterBlockBuilder::EstimateEntriesAdded() { + return filter_bits_builder_->EstimateEntriesAdded(); +} + +void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { + const bool add_prefix = + prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts); + + if (!last_prefix_recorded_ && last_key_in_domain_) { + // We can reach here when a new filter partition starts in partitioned + // filter. The last prefix in the previous partition should be added if + // necessary regardless of key_without_ts, to support prefix SeekForPrev. + AddKey(last_prefix_str_); + last_prefix_recorded_ = true; + } + if (whole_key_filtering_) { if (!add_prefix) { - AddKey(key); + AddKey(key_without_ts); } else { // if both whole_key and prefix are added to bloom then we will have whole - // key and prefix addition being interleaved and thus cannot rely on the - // bits builder to properly detect the duplicates by comparing with the - // last item. + // key_without_ts and prefix addition being interleaved and thus cannot + // rely on the bits builder to properly detect the duplicates by comparing + // with the last item. 
Slice last_whole_key = Slice(last_whole_key_str_); - if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { - AddKey(key); + if (!last_whole_key_recorded_ || + last_whole_key.compare(key_without_ts) != 0) { + AddKey(key_without_ts); last_whole_key_recorded_ = true; - last_whole_key_str_.assign(key.data(), key.size()); + last_whole_key_str_.assign(key_without_ts.data(), + key_without_ts.size()); } } } if (add_prefix) { - AddPrefix(key); + last_key_in_domain_ = true; + AddPrefix(key_without_ts); + } else { + last_key_in_domain_ = false; } } // Add key to filter if needed inline void FullFilterBlockBuilder::AddKey(const Slice& key) { filter_bits_builder_->AddKey(key); - num_added_++; + any_added_ = true; } // Add prefix to filter if needed void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + assert(prefix_extractor_ && prefix_extractor_->InDomain(key)); Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole @@ -80,14 +101,17 @@ last_prefix_recorded_ = false; } -Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, - Status* status) { +Slice FullFilterBlockBuilder::Finish( + const BlockHandle& /*tmp*/, Status* status, + std::unique_ptr* filter_data) { Reset(); // In this impl we ignore BlockHandle *status = Status::OK(); - if (num_added_ != 0) { - num_added_ = 0; - return filter_bits_builder_->Finish(&filter_data_); + if (any_added_) { + any_added_ = false; + Slice filter_content = + filter_bits_builder_->Finish(filter_data ? 
filter_data : &filter_data_); + return filter_content; } return Slice(); } @@ -119,19 +143,20 @@ } std::unique_ptr FullFilterBlockReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context) { + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { assert(table); assert(table->get_rep()); assert(!pin || prefetch); CachableEntry filter_block; if (prefetch || !use_cache) { - const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), - use_cache, nullptr /* get_context */, - lookup_context, &filter_block); + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::unique_ptr(); } @@ -164,6 +189,7 @@ const Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -189,7 +215,6 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG - (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); @@ -206,7 +231,6 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG - (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); @@ -221,6 +245,7 @@ const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, lookup_context, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return; } @@ -244,9 +269,9 @@ MultiGetRange filter_range(*range, range->begin(), range->end()); for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { if (!prefix_extractor) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor->InDomain(iter->ukey)) { - 
prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &iter->ukey_without_ts; + } else if (prefix_extractor->InDomain(iter->ukey_without_ts)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey_without_ts)); keys[num_keys++] = &prefixes.back(); } else { filter_range.SkipKey(iter); @@ -282,22 +307,23 @@ } bool FullFilterBlockReader::RangeMayExist( - const Slice* iterate_upper_bound, const Slice& user_key, + const Slice* iterate_upper_bound, const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) { - if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + bool need_upper_bound_check, bool no_io, + BlockCacheLookupContext* lookup_context) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) { *filter_checked = false; return true; } - Slice prefix = prefix_extractor->Transform(user_key); + Slice prefix = prefix_extractor->Transform(user_key_without_ts); if (need_upper_bound_check && !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { *filter_checked = false; return true; } else { *filter_checked = true; - return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io, const_ikey_ptr, /* get_context */ nullptr, lookup_context); } @@ -316,7 +342,8 @@ } Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix - if (!comparator->Equal(prefix, upper_bound_xform)) { + if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform, + false) != 0) { // second check if user_key's prefix is the immediate predecessor of // upper_bound and have the same length. 
If so, we know for sure all // keys in the range [user_key, upper_bound) share the same prefix. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,11 +7,12 @@ #include #include + #include #include #include -#include "db/dbformat.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" @@ -50,17 +51,25 @@ virtual bool IsBlockBased() override { return false; } virtual void StartBlock(uint64_t /*block_offset*/) override {} - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { return !any_added_; } + virtual size_t EstimateEntriesAdded() override; + virtual Slice Finish( + const BlockHandle& tmp, Status* status, + std::unique_ptr* filter_data = nullptr) override; using FilterBlockBuilder::Finish; + virtual void ResetFilterBitsBuilder() override { + filter_bits_builder_.reset(); + } + protected: virtual void AddKey(const Slice& key); std::unique_ptr filter_bits_builder_; virtual void Reset(); void AddPrefix(const Slice& key); const SliceTransform* prefix_extractor() { return prefix_extractor_; } + const std::string& last_prefix_str() const { return last_prefix_str_; } private: // important: all of these might point to invalid addresses @@ -72,10 +81,13 @@ std::string last_whole_key_str_; bool last_prefix_recorded_; std::string last_prefix_str_; - - uint32_t num_added_; + // Whether 
prefix_extractor_->InDomain(last_whole_key_) is true. + // Used in partitioned filters so that the last prefix from the previous + // filter partition will be added to the current partition if + // last_key_in_domain_ is true, regardless of the current key. + bool last_key_in_domain_; + bool any_added_; std::unique_ptr filter_data_; - }; // A FilterBlockReader is used to parse filter from SST table. @@ -87,9 +99,9 @@ CachableEntry&& filter_block); static std::unique_ptr Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context); + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } @@ -119,7 +131,7 @@ const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check, + bool need_upper_bound_check, bool no_io, BlockCacheLookupContext* lookup_context) override; private: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,13 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/full_filter_block.h" + #include -#include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/status.h" #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/mock_block_based_table.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -224,8 +227,8 @@ return rv; } - int CalculateNumEntry(const uint32_t bytes) override { - return b_->CalculateNumEntry(bytes); + size_t ApproximateNumEntries(size_t bytes) override { + return b_->ApproximateNumEntries(bytes); } size_t CountUnique() { return uniq_.size(); } @@ -239,11 +242,9 @@ const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); ASSERT_EQ(0, bits_builder->CountUnique()); // adds key and empty prefix; both abstractions count them builder.Add("key1"); - ASSERT_EQ(2, builder.NumAdded()); ASSERT_EQ(2, bits_builder->CountUnique()); // Add different key (unique) and also empty prefix (not unique). 
// From here in this test, it's immaterial whether the block builder @@ -262,7 +263,6 @@ const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); builder.Add(""); // test with empty key too builder.Add("prefix1key1"); builder.Add("prefix1key1"); @@ -275,14 +275,19 @@ TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); builder.Add("bar"); builder.Add("box"); builder.Add("box"); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice = builder.Finish(); + // "box" only counts once + ASSERT_EQ(4, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new ParsedFullFilterBlock(table_options_.filter_policy.get(), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,147 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/hash_index_reader.h" + +#include "table/block_fetcher.h" +#include "table/meta_blocks.h" + +namespace ROCKSDB_NAMESPACE { +Status HashIndexReader::Create(const BlockBasedTable* table, + const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. 
+ + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = + FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + assert(rep->internal_prefix_transform.get() != nullptr); + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log 
error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); +} + +InternalIteratorBase* HashIndexReader::NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + const bool total_order_seek = + read_options.total_order_seek || disable_prefix_seek; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, + total_order_seek, index_has_first_key(), index_key_includes_seq(), + index_value_is_full(), false /* block_contents_pinned */, + prefix_index_.get()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const BlockBasedTable* t, CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unique_ptr prefix_index_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.cc 2025-01-30 
11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -20,9 +20,8 @@ #include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -// Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace ROCKSDB_NAMESPACE { -// using namespace rocksdb; + // Create a index builder based on its type. IndexBuilder* IndexBuilder::CreateIndexBuilder( BlockBasedTableOptions::IndexType index_type, @@ -37,7 +36,8 @@ comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, table_opt.index_shortening, /* include_first_key */ false); - } break; + break; + } case BlockBasedTableOptions::kHashSearch: { // Currently kHashSearch is incompatible with index_block_restart_interval // > 1 @@ -46,20 +46,24 @@ comparator, int_key_slice_transform, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, table_opt.index_shortening); - } break; + break; + } case BlockBasedTableOptions::kTwoLevelIndexSearch: { result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); - } break; + break; + } case BlockBasedTableOptions::kBinarySearchWithFirstKey: { result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, table_opt.index_shortening, /* include_first_key */ true); - } break; + break; + } default: { assert(!"Do not recognize the index type "); - } break; + break; + } } return result; } @@ -104,6 +108,15 @@ comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, table_opt_.index_shortening, /* include_first_key */ false); + + // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if + // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by + // default on Creation) so that flush policy 
can point to + // sub_index_builder_->index_block_builder_ + if (seperator_is_key_plus_seq_) { + sub_index_builder_->seperator_is_key_plus_seq_ = true; + } + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset @@ -129,9 +142,15 @@ } sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); - if (sub_index_builder_->seperator_is_key_plus_seq_) { - // then we need to apply it to all sub-index builders + if (!seperator_is_key_plus_seq_ && + sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders and reset + // flush_policy to point to Block Builder of sub_index_builder_ that store + // internal keys. seperator_is_key_plus_seq_ = true; + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + sub_index_builder_->index_block_builder_)); } sub_index_last_key_ = std::string(*last_key_in_current_block); entries_.push_back( @@ -161,9 +180,15 @@ sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); sub_index_last_key_ = std::string(*last_key_in_current_block); - if (sub_index_builder_->seperator_is_key_plus_seq_) { - // then we need to apply it to all sub-index builders + if (!seperator_is_key_plus_seq_ && + sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders and reset + // flush_policy to point to Block Builder of sub_index_builder_ that store + // internal keys. 
seperator_is_key_plus_seq_ = true; + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + sub_index_builder_->index_block_builder_)); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -307,12 +307,13 @@ if (pending_block_num_ != 0) { FlushPendingPrefix(); } - primary_index_builder_.Finish(index_blocks, last_partition_block_handle); + Status s = primary_index_builder_.Finish(index_blocks, + last_partition_block_handle); index_blocks->meta_blocks.insert( {kHashIndexPrefixesBlock.c_str(), prefix_block_}); index_blocks->meta_blocks.insert( {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); - return Status::OK(); + return s; } virtual size_t IndexSize() const override { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context, lookup_context, /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,85 @@ +// Copyright (c) 2011-present, 
Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Encapsulates common functionality for the various index reader +// implementations. Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + 
assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ? index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry index_block_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h 2025-05-19 16:14:27.000000000 +0000 @@ -23,7 +23,7 @@ public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; @@ -39,7 +39,7 @@ constexpr bool immortal_table = false; table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( ioptions_, env_options_, table_options_, icomp_, skip_filters, - kMockLevel, immortal_table))); + 12345 /*file_size*/, kMockLevel, immortal_table))); } FilterBitsBuilder* GetBuilder() const { @@ -47,7 +47,7 @@ context.column_family_name = "mock_cf"; context.compaction_style = ioptions_.compaction_style; context.level_at_creation = kMockLevel; - context.info_log = ioptions_.info_log; + context.info_log = ioptions_.logger; return 
BloomFilterPolicy::GetBuilderFromContext(context); } }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -32,6 +32,8 @@ bool own_bytes() const { return block_contents_.own_bytes(); } + const Slice GetBlockContentsData() const { return block_contents_.data; } + private: BlockContents block_contents_; std::unique_ptr filter_bits_reader_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ #include +#include "file/random_access_file_reader.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/malloc.h" #include "port/port.h" @@ -32,9 +34,30 @@ true /*use_delta_encoding*/, use_value_delta_encoding), p_index_builder_(p_index_builder), - keys_added_to_partition_(0) { - keys_per_partition_ = - filter_bits_builder_->CalculateNumEntry(partition_size); + keys_added_to_partition_(0), + total_added_in_built_(0) { + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(partition_size)); + if (keys_per_partition_ < 1) { + // partition_size (minus buffer, ~10%) might be smaller than minimum + // filter size, sometimes based on cache line size. Try to find that + // minimum size without CalculateSpace (not necessarily available). 
+ uint32_t larger = std::max(partition_size + 4, uint32_t{16}); + for (;;) { + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(larger)); + if (keys_per_partition_ >= 1) { + break; + } + larger += larger / 4; + if (larger > 100000) { + // might be a broken implementation. substitute something reasonable: + // 1 key / byte. + keys_per_partition_ = partition_size; + break; + } + } + } } PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} @@ -50,20 +73,24 @@ if (!p_index_builder_->ShouldCutFilterBlock()) { return; } - filter_gc.push_back(std::unique_ptr(nullptr)); - // Add the prefix of the next key before finishing the partition. This hack, - // fixes a bug with format_verison=3 where seeking for the prefix would lead - // us to the previous partition. - const bool add_prefix = + // Add the prefix of the next key before finishing the partition without + // updating last_prefix_str_. This hack, fixes a bug with format_verison=3 + // where seeking for the prefix would lead us to the previous partition. 
+ const bool maybe_add_prefix = next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); - if (add_prefix) { - FullFilterBlockBuilder::AddPrefix(*next_key); + if (maybe_add_prefix) { + const Slice next_key_prefix = prefix_extractor()->Transform(*next_key); + if (next_key_prefix.compare(last_prefix_str()) != 0) { + AddKey(next_key_prefix); + } } - Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); + total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded(); + std::unique_ptr filter_data; + Slice filter = filter_bits_builder_->Finish(&filter_data); std::string& index_key = p_index_builder_->GetPartitionKey(); - filters.push_back({index_key, filter}); + filters.push_back({index_key, filter, std::move(filter_data)}); keys_added_to_partition_ = 0; Reset(); } @@ -78,11 +105,15 @@ keys_added_to_partition_++; } +size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded(); +} + Slice PartitionedFilterBlockBuilder::Finish( - const BlockHandle& last_partition_block_handle, Status* status) { + const BlockHandle& last_partition_block_handle, Status* status, + std::unique_ptr* filter_data) { if (finishing_filters == true) { // Record the handle of the last written filter block in the index - FilterEntry& last_entry = filters.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); std::string handle_delta_encoding; @@ -91,14 +122,13 @@ last_partition_block_handle.size() - last_encoded_handle_.size()); last_encoded_handle_ = last_partition_block_handle; const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding, &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( - 
ExtractUserKey(last_entry.key), handle_encoding, + ExtractUserKey(last_filter_entry_key), handle_encoding, &handle_delta_encoding_slice); } - filters.pop_front(); } else { MaybeCutAFilterBlock(nullptr); } @@ -106,7 +136,10 @@ // partitions if (UNLIKELY(filters.empty())) { *status = Status::OK(); + last_filter_data.reset(); if (finishing_filters) { + // Simplest to just add them all at the end + total_added_in_built_ = 0; if (p_index_builder_->seperator_is_key_plus_seq()) { return index_on_filter_block_builder_.Finish(); } else { @@ -121,7 +154,15 @@ // indicate we expect more calls to Finish *status = Status::Incomplete(); finishing_filters = true; - return filters.front().filter; + + last_filter_entry_key = filters.front().key; + Slice filter = filters.front().filter; + last_filter_data = std::move(filters.front().filter_data); + if (filter_data != nullptr) { + *filter_data = std::move(last_filter_data); + } + filters.pop_front(); + return filter; } } @@ -130,19 +171,20 @@ : FilterBlockReaderCommon(t, std::move(filter_block)) {} std::unique_ptr PartitionedFilterBlockReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context) { + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { assert(table); assert(table->get_rep()); assert(!pin || prefetch); CachableEntry filter_block; if (prefetch || !use_cache) { - const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), - use_cache, nullptr /* get_context */, - lookup_context, &filter_block); + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::unique_ptr(); } @@ -170,13 +212,23 @@ &FullFilterBlockReader::KeyMayMatch); } +void 
PartitionedFilterBlockReader::KeysMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return; // Any/all may match + } + + MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context, + &FullFilterBlockReader::KeysMayMatch); +} + bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) { -#ifdef NDEBUG - (void)block_offset; -#endif assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); if (!table_prefix_extractor() && !prefix_extractor) { @@ -188,14 +240,28 @@ &FullFilterBlockReader::PrefixMayMatch); } +void PartitionedFilterBlockReader::PrefixesMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + assert(block_offset == kNotValid); + if (!table_prefix_extractor() && !prefix_extractor) { + return; // Any/all may match + } + + MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context, + &FullFilterBlockReader::PrefixesMayMatch); +} + BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const CachableEntry& filter_block, const Slice& entry) const { IndexBlockIter iter; const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( - comparator, comparator->user_comparator(), &iter, kNullStats, - true /* total_order_seek */, false /* have_first_key */, + comparator->user_comparator(), + table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter, + kNullStats, true /* total_order_seek */, false /* have_first_key */, index_key_includes_seq(), 
index_value_is_full()); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { @@ -239,7 +305,8 @@ table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); return s; } @@ -253,6 +320,7 @@ Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -270,6 +338,7 @@ no_io, get_context, lookup_context, &filter_partition_block); if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -280,6 +349,79 @@ lookup_context); } +void PartitionedFilterBlockReader::MayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const { + CachableEntry filter_block; + Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return; // Any/all may match + } + + auto start_iter_same_handle = range->begin(); + BlockHandle prev_filter_handle = BlockHandle::NullBlockHandle(); + + // For all keys mapping to same partition (must be adjacent in sorted order) + // share block cache lookup and use full filter multiget on the partition + // filter. 
+ for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) { + // TODO: re-use one top-level index iterator + BlockHandle this_filter_handle = + GetFilterPartitionHandle(filter_block, iter->ikey); + if (!prev_filter_handle.IsNull() && + this_filter_handle != prev_filter_handle) { + MultiGetRange subrange(*range, start_iter_same_handle, iter); + MayMatchPartition(&subrange, prefix_extractor, block_offset, + prev_filter_handle, no_io, lookup_context, + filter_function); + range->AddSkipsFrom(subrange); + start_iter_same_handle = iter; + } + if (UNLIKELY(this_filter_handle.size() == 0)) { // key is out of range + // Not reachable with current behavior of GetFilterPartitionHandle + assert(false); + range->SkipKey(iter); + prev_filter_handle = BlockHandle::NullBlockHandle(); + } else { + prev_filter_handle = this_filter_handle; + } + } + if (!prev_filter_handle.IsNull()) { + MultiGetRange subrange(*range, start_iter_same_handle, range->end()); + MayMatchPartition(&subrange, prefix_extractor, block_offset, + prev_filter_handle, no_io, lookup_context, + filter_function); + range->AddSkipsFrom(subrange); + } +} + +void PartitionedFilterBlockReader::MayMatchPartition( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, BlockHandle filter_handle, bool no_io, + BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const { + CachableEntry filter_partition_block; + Status s = GetFilterPartitionBlock( + nullptr /* prefetch_buffer */, filter_handle, no_io, + range->begin()->get_context, lookup_context, &filter_partition_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + (filter_partition.*filter_function)(range, prefix_extractor, block_offset, + no_io, lookup_context); +} + size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { size_t usage = 
ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE @@ -292,7 +434,8 @@ } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies(bool pin) { +Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, + bool pin) { assert(table()); const BlockBasedTable::Rep* const rep = table()->get_rep(); @@ -305,11 +448,11 @@ Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, &lookup_context, &filter_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error retrieving top-level filter block while trying to " - "cache filter partitions: %s", - s.ToString().c_str()); - return; + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return s; } // Before read partitions, prefetch them to avoid lots of IOs @@ -319,9 +462,10 @@ const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( - comparator, comparator->user_comparator(), &biter, kNullStats, - true /* total_order_seek */, false /* have_first_key */, - index_key_includes_seq(), index_value_is_full()); + comparator->user_comparator(), rep->get_global_seqno(BlockType::kFilter), + &biter, kNullStats, true /* total_order_seek */, + false /* have_first_key */, index_key_includes_seq(), + index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. 
// Read the first block offset biter.SeekToFirst(); @@ -331,16 +475,24 @@ // Read the last block's offset biter.SeekToLast(); handle = biter.value().handle; - uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t last_off = + handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /* Implicit autoreadahead */); - prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, - static_cast(prefetch_len)); + IOOptions opts; + s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + } + if (!s.ok()) { + return s; + } // After prefetch, read the partitions one by one - ReadOptions read_options; for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; @@ -348,12 +500,15 @@ // TODO: Support counter batch update for partitioned index and // filter blocks s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), read_options, handle, - UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + /* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter, nullptr /* get_context */, &lookup_context, nullptr /* contents */); - + if (!s.ok()) { + return s; + } assert(s.ok() || block.GetValue() == nullptr); - if (s.ok() && block.GetValue() != nullptr) { + + if (block.GetValue() != nullptr) { if (block.IsCached()) { if (pin) { filter_map_[handle.offset()] = std::move(block); @@ -361,6 +516,7 @@ } } } + return biter.status(); } const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,20 +5,22 @@ #pragma once +#include #include #include #include -#include "db/dbformat.h" -#include "index_builder.h" + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/block_based/block.h" #include "table/block_based/filter_block_reader_common.h" #include "table/block_based/full_filter_block.h" +#include "table/block_based/index_builder.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +class InternalKeyComparator; class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { public: @@ -33,9 +35,11 @@ void AddKey(const Slice& key) override; void Add(const Slice& key) override; + size_t EstimateEntriesAdded() override; - virtual Slice Finish(const BlockHandle& last_partition_block_handle, - Status* status) override; + virtual Slice Finish( + const BlockHandle& last_partition_block_handle, Status* status, + std::unique_ptr* filter_data = nullptr) override; private: // Filter data @@ -45,10 +49,13 @@ struct FilterEntry { std::string key; Slice filter; + std::unique_ptr filter_data; }; - std::list filters; // list of partitioned indexes and their keys + std::deque filters; // list of partitioned filters and keys used + // in building the index + std::string last_filter_entry_key; + std::unique_ptr last_filter_data; std::unique_ptr value; - std::vector> filter_gc; bool finishing_filters = false; // true if Finish is called once but not complete yet. 
// The policy of when cut a filter block and Finish it @@ -62,6 +69,9 @@ uint32_t keys_per_partition_; // The number of keys added to the last partition so far uint32_t keys_added_to_partition_; + // According to the bits builders, how many keys/prefixes added + // in all the filters we have fully built + uint64_t total_added_in_built_; BlockHandle last_encoded_handle_; }; @@ -71,21 +81,30 @@ CachableEntry&& filter_block); static std::unique_ptr Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context); + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; size_t ApproximateMemoryUsage() const override; @@ -108,13 +127,28 @@ GetContext* get_context, BlockCacheLookupContext* lookup_context, FilterFunction filter_function) const; - void CacheDependencies(bool pin) override; + using FilterManyFunction = void (FullFilterBlockReader::*)( + MultiGetRange* range, const SliceTransform* 
prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context); + void MayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, + BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const; + void MayMatchPartition(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, BlockHandle filter_handle, + bool no_io, BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const; + Status CacheDependencies(const ReadOptions& ro, bool pin) override; const InternalKeyComparator* internal_comparator() const; bool index_key_includes_seq() const; bool index_value_is_full() const; protected: + // For partition blocks pinned in cache. Can be a subset of blocks + // in case some fail insertion on attempt to pin. std::unordered_map> filter_map_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,16 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/partitioned_filter_block.h" + #include +#include "index_builder.h" #include "rocksdb/filter_policy.h" - #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/partitioned_filter_block.h" #include "table/block_based/filter_policy_internal.h" - -#include "index_builder.h" -#include "logging/logging.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -59,7 +58,7 @@ virtual public ::testing::WithParamInterface { public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; @@ -137,22 +136,24 @@ BlockHandle bh; Status status; Slice slice; + std::unique_ptr filter_data; do { - slice = builder->Finish(bh, &status); + slice = builder->Finish(bh, &status, &filter_data); bh = Write(slice); } while (status.IsIncomplete()); constexpr bool skip_filters = false; + constexpr uint64_t file_size = 12345; constexpr int level = 0; constexpr bool immortal_table = false; table_.reset(new MockedBlockBasedTable( new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, - icomp_, skip_filters, level, immortal_table), + icomp_, skip_filters, file_size, level, + immortal_table), pib)); BlockContents contents(slice); CachableEntry block( - new Block(std::move(contents), kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, nullptr), + new Block(std::move(contents), 0 /* read_amp_bytes_per_bit */, nullptr), nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); auto reader = new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); @@ -291,10 +292,11 @@ } }; -INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, - testing::Values(test::kDefaultFormatVersion)); -INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, - testing::Values(test::kLatestFormatVersion)); +// Format versions 
potentially intersting to partitioning +INSTANTIATE_TEST_CASE_P(FormatVersions, PartitionedFilterBlockTest, + testing::ValuesIn(std::set{ + 2, 3, 4, test::kDefaultFormatVersion, + kLatestFormatVersion})); TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { std::unique_ptr pib(NewIndexBuilder()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +void PartitionedIndexIterator::Seek(const Slice& target) { SeekImpl(&target); } + +void PartitionedIndexIterator::SeekToFirst() { SeekImpl(nullptr); } + +void PartitionedIndexIterator::SeekImpl(const Slice* target) { + SavePrevIndexValue(); + + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetPartitionedIndexIter(); + return; + } + + InitPartitionedIndexBlock(); + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + + // We could check upper bound here, but that would be too complicated + // and checking index upper bound is less useful than for data blocks. + + if (target) { + assert(!Valid() || (table_->get_rep()->index_key_includes_seq + ? (icomp_.Compare(*target, key()) <= 0) + : (user_comparator_.Compare(ExtractUserKey(*target), + key()) <= 0))); + } +} + +void PartitionedIndexIterator::SeekToLast() { + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetPartitionedIndexIter(); + return; + } + InitPartitionedIndexBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); +} + +void PartitionedIndexIterator::Next() { + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); +} + +void PartitionedIndexIterator::Prev() { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + + FindKeyBackward(); +} + +void PartitionedIndexIterator::InitPartitionedIndexBlock() { + BlockHandle partitioned_index_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + partitioned_index_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetPartitionedIndexIter(); + } + auto* rep = table_->get_rep(); + bool 
is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + block_prefetcher_.PrefetchIfNeeded(rep, partitioned_index_handle, + read_options_.readahead_size, + is_for_compaction); + Status s; + table_->NewDataBlockIterator( + read_options_, partitioned_index_handle, &block_iter_, + BlockType::kIndex, + /*get_context=*/nullptr, &lookup_context_, s, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction); + block_iter_points_to_real_block_ = true; + // We could check upper bound here but it is complicated to reason about + // upper bound in index iterator. On the other than, in large scans, index + // iterators are moved much less frequently compared to data blocks. So + // the upper bound check is skipped for simplicity. + } +} + +void PartitionedIndexIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +void PartitionedIndexIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". 
+ do { + if (!block_iter_.status().ok()) { + return; + } + ResetPartitionedIndexIter(); + index_iter_->Next(); + + if (!index_iter_->Valid()) { + return; + } + + InitPartitionedIndexBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void PartitionedIndexIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetPartitionedIndexIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitPartitionedIndexBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,159 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterator that iterates over partitioned index. 
+// Some upper and lower bound tricks played in block based table iterators +// could be played here, but it's too complicated to reason about index +// keys with upper or lower bound, so we skip it for simplicity. +class PartitionedIndexIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + public: + PartitionedIndexIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr>&& index_iter, + TableReaderCaller caller, size_t compaction_readahead_size = 0) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), +#ifndef NDEBUG + icomp_(icomp), +#endif + user_comparator_(icomp.user_comparator()), + block_iter_points_to_real_block_(false), + lookup_context_(caller), + block_prefetcher_(compaction_readahead_size) { + } + + ~PartitionedIndexIterator() override {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice&) override { + // Shouldn't be called. 
+ assert(false); + } + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult*) override { + assert(false); + return false; + } + void Prev() override; + bool Valid() const override { + return block_iter_points_to_real_block_ && block_iter_.Valid(); + } + Slice key() const override { + assert(Valid()); + return block_iter_.key(); + } + Slice user_key() const override { + assert(Valid()); + return block_iter_.user_key(); + } + IndexValue value() const override { + assert(Valid()); + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + inline IterBoundCheck UpperBoundCheckResult() override { + // Shouldn't be called. + assert(false); + return IterBoundCheck::kUnknown; + } + void SetPinnedItersMgr(PinnedIteratorsManager*) override { + // Shouldn't be called. + assert(false); + } + bool IsKeyPinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + bool IsValuePinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + + void ResetPartitionedIndexIter() { + if (block_iter_points_to_real_block_) { + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. 
+ prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + std::unique_ptr> index_iter_; + + private: + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; + const BlockBasedTable* table_; + const ReadOptions read_options_; +#ifndef NDEBUG + const InternalKeyComparator& icomp_; +#endif + UserComparatorWrapper user_comparator_; + IndexBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + uint64_t prev_block_offset_ = std::numeric_limits::max(); + BlockCacheLookupContext lookup_context_; + BlockPrefetcher block_prefetcher_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitPartitionedIndexBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,207 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/partitioned_index_reader.h" + +#include "file/random_access_file_reader.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +Status PartitionIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset(new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase* PartitionIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + const 
BlockBasedTable::Rep* rep = table()->rep_; + InternalIteratorBase* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), + index_value_is_full())); + } else { + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + ro.deadline = read_options.deadline; + ro.io_timeout = read_options.io_timeout; + ro.adaptive_readahead = read_options.adaptive_readahead; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + std::unique_ptr> index_iter( + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), + index_value_is_full())); + + it = new PartitionedIndexIterator( + table(), ro, *internal_comparator(), std::move(index_iter), + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); + } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currentlly it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. 
+} +Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, + bool pin) { + // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + const BlockBasedTable::Rep* rep = table()->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + + CachableEntry index_block; + { + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &index_block); + if (!s.ok()) { + return s; + } + } + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return biter.status(); + } + handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index. 
+ return biter.status(); + } + handle = biter.value().handle; + uint64_t last_off = + handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /*Implicit auto readahead*/); + IOOptions opts; + { + Status s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + } + if (!s.ok()) { + return s; + } + } + + // For saving "all or nothing" to partition_map_ + std::unordered_map> map_in_progress; + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + size_t partition_count = 0; + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry block; + ++partition_count; + // TODO: Support counter batch update for partitioned index and + // filter blocks + Status s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + /*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex, + /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr); + + if (!s.ok()) { + return s; + } + if (block.GetValue() != nullptr) { + // Might need to "pin" some mmap-read blocks (GetOwnValue) if some + // partitions are successfully compressed (cached) and some are not + // compressed (mmap eligible) + if (block.IsCached() || block.GetOwnValue()) { + if (pin) { + map_in_progress[handle.offset()] = std::move(block); + } + } + } + } + Status s = biter.status(); + // Save (pin) them only if everything checks out + if (map_in_progress.size() == partition_count && s.ok()) { + std::swap(partition_map_, map_in_progress); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup in a two-level index structure. +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + Status CacheDependencies(const ReadOptions& ro, bool pin) override; + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + // For partition blocks pinned in cache. This is expected to be "all or + // none" so that !partition_map_.empty() can use an iterator expecting + // all partitions to be saved here. + std::unordered_map> partition_map_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/reader_common.h" + +#include "monitoring/perf_context_imp.h" +#include "rocksdb/table.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle, true /* force_erase */); +} + +// WART: this is specific to block-based table +Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, const std::string& file_name, + uint64_t offset) { + PERF_TIMER_GUARD(block_checksum_time); + // After block_size bytes is compression type (1 byte), which is part of + // the checksummed section. + size_t len = block_size + 1; + // And then the stored checksum value (4 bytes). 
+ uint32_t stored = DecodeFixed32(data + len); + + uint32_t computed = ComputeBuiltinChecksum(type, data, len); + if (stored == computed) { + return Status::OK(); + } else { + // Unmask for people who might look for reference crc value + if (type == kCRC32c) { + stored = crc32c::Unmask(stored); + computed = crc32c::Unmask(computed); + } + return Status::Corruption( + "block checksum mismatch: stored = " + ToString(stored) + + ", computed = " + ToString(computed) + ", type = " + ToString(type) + + " in " + file_name + " offset " + ToString(offset) + " size " + + ToString(block_size)); + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "rocksdb/cache.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { +// Release the cached entry and decrement its ref count. +extern void ForceReleaseCachedEntry(void* arg, void* h); + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? 
table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Assumes block has a trailer as in format.h. file_name and offset provided +// for generating a diagnostic message in returned status. +extern Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, + const std::string& file_name, + uint64_t offset); +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ // #include "table/block_based/uncompression_dict_reader.h" + +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "table/block_based/block_based_table_reader.h" #include "util/compression.h" @@ -12,9 +14,9 @@ namespace ROCKSDB_NAMESPACE { Status UncompressionDictReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, std::unique_ptr* uncompression_dict_reader) { assert(table); assert(table->get_rep()); @@ -24,8 +26,8 @@ CachableEntry uncompression_dict; if (prefetch || !use_cache) { const Status s = ReadUncompressionDictionary( - table, prefetch_buffer, ReadOptions(), use_cache, - nullptr 
/* get_context */, lookup_context, &uncompression_dict); + table, prefetch_buffer, ro, use_cache, nullptr /* get_context */, + lookup_context, &uncompression_dict); if (!s.ok()) { return s; } @@ -60,11 +62,11 @@ prefetch_buffer, read_options, rep->compression_dict_handle, UncompressionDict::GetEmptyDict(), uncompression_dict, BlockType::kCompressionDictionary, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, /* wait_for_cache */ true); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep->ioptions.logger, "Encountered error while reading data from compression dictionary " "block %s", s.ToString().c_str()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,9 +25,9 @@ class UncompressionDictReader { public: static Status Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, std::unique_ptr* uncompression_dict_reader); Status GetOrReadUncompressionDictionary( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,54 +15,32 @@ #include 
"logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/compression_type.h" #include "rocksdb/env.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_type.h" +#include "table/block_based/reader_common.h" #include "table/format.h" #include "table/persistent_cache_helper.h" -#include "util/coding.h" #include "util/compression.h" -#include "util/crc32c.h" #include "util/stop_watch.h" -#include "util/string_util.h" -#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { -inline void BlockFetcher::CheckBlockChecksum() { - // Check the crc of the type and the block contents - if (read_options_.verify_checksums) { - const char* data = slice_.data(); // Pointer to where Read put the data - PERF_TIMER_GUARD(block_checksum_time); - uint32_t value = DecodeFixed32(data + block_size_ + 1); - uint32_t actual = 0; - switch (footer_.checksum()) { - case kNoChecksum: - break; - case kCRC32c: - value = crc32c::Unmask(value); - actual = crc32c::Value(data, block_size_ + 1); - break; - case kxxHash: - actual = XXH32(data, static_cast(block_size_) + 1, 0); - break; - case kxxHash64: - actual = static_cast( - XXH64(data, static_cast(block_size_) + 1, 0) & - uint64_t{0xffffffff}); - break; - default: - status_ = Status::Corruption( - "unknown checksum type " + ToString(footer_.checksum()) + " in " + - file_->file_name() + " offset " + ToString(handle_.offset()) + - " size " + ToString(block_size_)); - } - if (status_.ok() && actual != value) { - status_ = Status::Corruption( - "block checksum mismatch: expected " + ToString(actual) + ", got " + - ToString(value) + " in " + file_->file_name() + " offset " + - ToString(handle_.offset()) + " size " + ToString(block_size_)); +inline void BlockFetcher::ProcessTrailerIfPresent() { + if (footer_.GetBlockTrailerSize() > 0) { + assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize); + if 
(read_options_.verify_checksums) { + io_status_ = status_to_io_status(VerifyBlockChecksum( + footer_.checksum_type(), slice_.data(), block_size_, + file_->file_name(), handle_.offset())); } + compression_type_ = + BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_); + } else { + // E.g. plain table or cuckoo table + compression_type_ = kNoCompression; } } @@ -76,9 +54,9 @@ return true; } else { // uncompressed page is not found - if (ioptions_.info_log && !status.IsNotFound()) { + if (ioptions_.logger && !status.IsNotFound()) { assert(!status.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. %s", status.ToString().c_str()); } @@ -88,18 +66,23 @@ } inline bool BlockFetcher::TryGetFromPrefetchBuffer() { - if (prefetch_buffer_ != nullptr && - prefetch_buffer_->TryReadFromCache( - handle_.offset(), - static_cast(handle_.size()) + kBlockTrailerSize, &slice_, - for_compaction_)) { - block_size_ = static_cast(handle_.size()); - CheckBlockChecksum(); - if (!status_.ok()) { + if (prefetch_buffer_ != nullptr) { + IOOptions opts; + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + if (io_s.ok() && + prefetch_buffer_->TryReadFromCache(opts, file_, handle_.offset(), + block_size_with_trailer_, &slice_, + &io_s, for_compaction_)) { + ProcessTrailerIfPresent(); + if (!io_status_.ok()) { + return true; + } + got_from_prefetch_buffer_ = true; + used_buf_ = const_cast(slice_.data()); + } else if (!io_s.ok()) { + io_status_ = io_s; return true; } - got_from_prefetch_buffer_ = true; - used_buf_ = const_cast(slice_.data()); } return got_from_prefetch_buffer_; } @@ -109,18 +92,19 @@ cache_options_.persistent_cache->IsCompressed()) { // lookup uncompressed cache mode p-cache std::unique_ptr raw_data; - status_ = PersistentCacheHelper::LookupRawPage( - cache_options_, handle_, &raw_data, block_size_ + kBlockTrailerSize); - if (status_.ok()) { + io_status_ = 
status_to_io_status(PersistentCacheHelper::LookupRawPage( + cache_options_, handle_, &raw_data, block_size_with_trailer_)); + if (io_status_.ok()) { heap_buf_ = CacheAllocationPtr(raw_data.release()); used_buf_ = heap_buf_.get(); slice_ = Slice(heap_buf_.get(), block_size_); + ProcessTrailerIfPresent(); return true; - } else if (!status_.IsNotFound() && ioptions_.info_log) { - assert(!status_.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + } else if (!io_status_.IsNotFound() && ioptions_.logger) { + assert(!io_status_.ok()); + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. %s", - status_.ToString().c_str()); + io_status_.ToString().c_str()); } } return false; @@ -128,35 +112,53 @@ inline void BlockFetcher::PrepareBufferForBlockFromFile() { // cache miss read from device - if (do_uncompress_ && - block_size_ + kBlockTrailerSize < kDefaultStackBufferSize) { + if ((do_uncompress_ || ioptions_.allow_mmap_reads) && + block_size_with_trailer_ < kDefaultStackBufferSize) { // If we've got a small enough hunk of data, read it in to the // trivially allocated stack buffer instead of needing a full malloc() + // + // `GetBlockContents()` cannot return this data as its lifetime is tied to + // this `BlockFetcher`'s lifetime. That is fine because this is only used + // in cases where we do not expect the `GetBlockContents()` result to be the + // same buffer we are assigning here. If we guess incorrectly, there will be + // a heap allocation and memcpy in `GetBlockContents()` to obtain the final + // result. Considering we are eliding a heap allocation here by using the + // stack buffer, the cost of guessing incorrectly here is one extra memcpy. + // + // When `do_uncompress_` is true, we expect the uncompression step will + // allocate heap memory for the final result. However this expectation will + // be wrong if the block turns out to already be uncompressed, which we + // won't know for sure until after reading it. 
+ // + // When `ioptions_.allow_mmap_reads` is true, we do not expect the file + // reader to use the scratch buffer at all, but instead return a pointer + // into the mapped memory. This expectation will be wrong when using a + // file reader that does not implement mmap reads properly. used_buf_ = &stack_buf_[0]; } else if (maybe_compressed_ && !do_uncompress_) { - compressed_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, + compressed_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_); used_buf_ = compressed_buf_.get(); } else { heap_buf_ = - AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); + AllocateBlock(block_size_with_trailer_, memory_allocator_); used_buf_ = heap_buf_.get(); } } inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && read_options_.fill_cache && + if (io_status_.ok() && read_options_.fill_cache && cache_options_.persistent_cache && cache_options_.persistent_cache->IsCompressed()) { // insert to raw cache PersistentCacheHelper::InsertRawPage(cache_options_, handle_, used_buf_, - block_size_ + kBlockTrailerSize); + block_size_with_trailer_); } } inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache && - cache_options_.persistent_cache && + if (io_status_.ok() && !got_from_prefetch_buffer_ && + read_options_.fill_cache && cache_options_.persistent_cache && !cache_options_.persistent_cache->IsCompressed()) { // insert to uncompressed cache PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_, @@ -164,12 +166,35 @@ } } -inline void BlockFetcher::CopyBufferToHeap() { +inline void BlockFetcher::CopyBufferToHeapBuf() { assert(used_buf_ != heap_buf_.get()); - heap_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); - memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); + heap_buf_ = 
AllocateBlock(block_size_with_trailer_, memory_allocator_); + memcpy(heap_buf_.get(), used_buf_, block_size_with_trailer_); +#ifndef NDEBUG + num_heap_buf_memcpy_++; +#endif +} + +inline void BlockFetcher::CopyBufferToCompressedBuf() { + assert(used_buf_ != compressed_buf_.get()); + compressed_buf_ = AllocateBlock(block_size_with_trailer_, + memory_allocator_compressed_); + memcpy(compressed_buf_.get(), used_buf_, block_size_with_trailer_); +#ifndef NDEBUG + num_compressed_buf_memcpy_++; +#endif } +// Entering this method means the block is not compressed or do not need to be +// uncompressed. The block can be in one of the following buffers: +// 1. prefetch buffer if prefetch is enabled and the block is prefetched before +// 2. stack_buf_ if block size is smaller than the stack_buf_ size and block +// is not compressed +// 3. heap_buf_ if the block is not compressed +// 4. compressed_buf_ if the block is compressed +// 5. direct_io_buf_ if direct IO is enabled +// After this method, if the block is compressed, it should be in +// compressed_buf_, otherwise should be in heap_buf_. inline void BlockFetcher::GetBlockContents() { if (slice_.data() != used_buf_) { // the slice content is not the buffer provided @@ -178,12 +203,19 @@ // page can be either uncompressed or compressed, the buffer either stack // or heap provided. 
Refer to https://github.com/facebook/rocksdb/pull/4096 if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { - CopyBufferToHeap(); + CopyBufferToHeapBuf(); } else if (used_buf_ == compressed_buf_.get()) { if (compression_type_ == kNoCompression && memory_allocator_ != memory_allocator_compressed_) { - CopyBufferToHeap(); + CopyBufferToHeapBuf(); + } else { + heap_buf_ = std::move(compressed_buf_); + } + } else if (direct_io_buf_.get() != nullptr) { + if (compression_type_ == kNoCompression) { + CopyBufferToHeapBuf(); } else { + CopyBufferToCompressedBuf(); heap_buf_ = std::move(compressed_buf_); } } @@ -194,31 +226,48 @@ #endif } -Status BlockFetcher::ReadBlockContents() { - block_size_ = static_cast(handle_.size()); - +IOStatus BlockFetcher::ReadBlockContents() { if (TryGetUncompressBlockFromPersistentCache()) { compression_type_ = kNoCompression; #ifndef NDEBUG contents_->is_raw_block = true; #endif // NDEBUG - return Status::OK(); + return IOStatus::OK(); } if (TryGetFromPrefetchBuffer()) { - if (!status_.ok()) { - return status_; + if (!io_status_.ok()) { + return io_status_; } } else if (!TryGetCompressedBlockFromPersistentCache()) { - PrepareBufferForBlockFromFile(); - Status s; - - { - PERF_TIMER_GUARD(block_read_time); - // Actual file read - status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, - &slice_, used_buf_, for_compaction_); + IOOptions opts; + io_status_ = file_->PrepareIOOptions(read_options_, opts); + // Actual file read + if (io_status_.ok()) { + if (file_->use_direct_io()) { + PERF_TIMER_GUARD(block_read_time); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, nullptr, &direct_io_buf_, for_compaction_); + PERF_COUNTER_ADD(block_read_count, 1); + used_buf_ = const_cast(slice_.data()); + } else { + PrepareBufferForBlockFromFile(); + PERF_TIMER_GUARD(block_read_time); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, used_buf_, nullptr, 
for_compaction_); + PERF_COUNTER_ADD(block_read_count, 1); +#ifndef NDEBUG + if (slice_.data() == &stack_buf_[0]) { + num_stack_buf_memcpy_++; + } else if (slice_.data() == heap_buf_.get()) { + num_heap_buf_memcpy_++; + } else if (slice_.data() == compressed_buf_.get()) { + num_compressed_buf_memcpy_++; + } +#endif + } } - PERF_COUNTER_ADD(block_read_count, 1); // TODO: introduce dedicated perf counter for range tombstones switch (block_type_) { @@ -239,38 +288,38 @@ break; } - PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); - if (!status_.ok()) { - return status_; + PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); + if (!io_status_.ok()) { + return io_status_; } - if (slice_.size() != block_size_ + kBlockTrailerSize) { - return Status::Corruption("truncated block read from " + - file_->file_name() + " offset " + - ToString(handle_.offset()) + ", expected " + - ToString(block_size_ + kBlockTrailerSize) + - " bytes, got " + ToString(slice_.size())); + if (slice_.size() != block_size_with_trailer_) { + return IOStatus::Corruption("truncated block read from " + + file_->file_name() + " offset " + + ToString(handle_.offset()) + ", expected " + + ToString(block_size_with_trailer_) + + " bytes, got " + ToString(slice_.size())); } - CheckBlockChecksum(); - if (status_.ok()) { + ProcessTrailerIfPresent(); + if (io_status_.ok()) { InsertCompressedBlockToPersistentCacheIfNeeded(); } else { - return status_; + return io_status_; } } - PERF_TIMER_GUARD(block_decompress_time); - - compression_type_ = get_block_compression_type(slice_.data(), block_size_); - if (do_uncompress_ && compression_type_ != kNoCompression) { + PERF_TIMER_GUARD(block_decompress_time); // compressed page, uncompress, update cache UncompressionContext context(compression_type_); UncompressionInfo info(context, uncompression_dict_, compression_type_); - status_ = UncompressBlockContents(info, slice_.data(), block_size_, - contents_, footer_.version(), ioptions_, - 
memory_allocator_); + io_status_ = status_to_io_status(UncompressBlockContents( + info, slice_.data(), block_size_, contents_, footer_.format_version(), + ioptions_, memory_allocator_)); +#ifndef NDEBUG + num_heap_buf_memcpy_++; +#endif compression_type_ = kNoCompression; } else { GetBlockContents(); @@ -278,7 +327,7 @@ InsertUncompressedBlockToPersistentCacheIfNeeded(); - return status_; + return io_status_; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include "table/block_based/block.h" #include "table/block_based/block_type.h" #include "table/format.h" +#include "table/persistent_cache_options.h" namespace ROCKSDB_NAMESPACE { @@ -37,12 +38,15 @@ class BlockFetcher { public: BlockFetcher(RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ReadOptions& read_options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, + FilePrefetchBuffer* prefetch_buffer, + const Footer& footer /* ref retained */, + const ReadOptions& read_options, + const BlockHandle& handle /* ref retained */, + BlockContents* contents, + const ImmutableOptions& ioptions /* ref retained */, bool do_uncompress, bool maybe_compressed, BlockType block_type, - const UncompressionDict& uncompression_dict, - const PersistentCacheOptions& cache_options, + const UncompressionDict& uncompression_dict /* ref retained */, + const PersistentCacheOptions& cache_options /* ref retained */, MemoryAllocator* memory_allocator = nullptr, MemoryAllocator* memory_allocator_compressed = nullptr, bool for_compaction = false) @@ -56,16 +60,39 @@ do_uncompress_(do_uncompress), 
maybe_compressed_(maybe_compressed), block_type_(block_type), + block_size_(static_cast(handle_.size())), + block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()), uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), memory_allocator_compressed_(memory_allocator_compressed), - for_compaction_(for_compaction) {} - - Status ReadBlockContents(); - CompressionType get_compression_type() const { return compression_type_; } + for_compaction_(for_compaction) { + io_status_.PermitUncheckedError(); // TODO(AR) can we improve on this? + } + + IOStatus ReadBlockContents(); + inline CompressionType get_compression_type() const { + return compression_type_; + } + inline size_t GetBlockSizeWithTrailer() const { + return block_size_with_trailer_; + } + +#ifndef NDEBUG + int TEST_GetNumStackBufMemcpy() const { return num_stack_buf_memcpy_; } + int TEST_GetNumHeapBufMemcpy() const { return num_heap_buf_memcpy_; } + int TEST_GetNumCompressedBufMemcpy() const { + return num_compressed_buf_memcpy_; + } +#endif private: +#ifndef NDEBUG + int num_stack_buf_memcpy_ = 0; + int num_heap_buf_memcpy_ = 0; + int num_compressed_buf_memcpy_ = 0; + +#endif static const uint32_t kDefaultStackBufferSize = 5000; RandomAccessFileReader* file_; @@ -74,23 +101,25 @@ const ReadOptions read_options_; const BlockHandle& handle_; BlockContents* contents_; - const ImmutableCFOptions& ioptions_; - bool do_uncompress_; - bool maybe_compressed_; - BlockType block_type_; + const ImmutableOptions& ioptions_; + const bool do_uncompress_; + const bool maybe_compressed_; + const BlockType block_type_; + const size_t block_size_; + const size_t block_size_with_trailer_; const UncompressionDict& uncompression_dict_; const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; MemoryAllocator* memory_allocator_compressed_; - Status status_; + IOStatus io_status_; Slice slice_; char* used_buf_ = nullptr; - size_t block_size_; + 
AlignedBuf direct_io_buf_; CacheAllocationPtr heap_buf_; CacheAllocationPtr compressed_buf_; char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; - ROCKSDB_NAMESPACE::CompressionType compression_type_; + CompressionType compression_type_; bool for_compaction_ = false; // return true if found @@ -99,11 +128,13 @@ bool TryGetFromPrefetchBuffer(); bool TryGetCompressedBlockFromPersistentCache(); void PrepareBufferForBlockFromFile(); - // Copy content from used_buf_ to new heap buffer. - void CopyBufferToHeap(); + // Copy content from used_buf_ to new heap_buf_. + void CopyBufferToHeapBuf(); + // Copy content from used_buf_ to new compressed_buf_. + void CopyBufferToCompressedBuf(); void GetBlockContents(); void InsertCompressedBlockToPersistentCacheIfNeeded(); void InsertUncompressedBlockToPersistentCacheIfNeeded(); - void CheckBlockChecksum(); + void ProcessTrailerIfPresent(); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,521 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "table/block_fetcher.h" + +#include "db/table_properties_collector.h" +#include "file/file_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "table/block_based/binary_search_index_reader.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +struct MemcpyStats { + int num_stack_buf_memcpy; + int num_heap_buf_memcpy; + int num_compressed_buf_memcpy; +}; + +struct BufAllocationStats { + int num_heap_buf_allocations; + int num_compressed_buf_allocations; +}; + +struct TestStats { + MemcpyStats memcpy_stats; + BufAllocationStats buf_allocation_stats; +}; + +class BlockFetcherTest : public testing::Test { + public: + enum class Mode { + kBufferedRead = 0, + kBufferedMmap, + kDirectRead, + kNumModes, + }; + // use NumModes as array size to avoid "size of array '...' has non-integral + // type" errors. + const static int NumModes = static_cast(Mode::kNumModes); + + protected: + void SetUp() override { + SetupSyncPointsToMockDirectIO(); + test_dir_ = test::PerThreadDBPath("block_fetcher_test"); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + } + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + void AssertSameBlock(const std::string& block1, const std::string& block2) { + ASSERT_EQ(block1, block2); + } + + // Creates a table with kv pairs (i, i) where i ranges from 0 to 9, inclusive. + void CreateTable(const std::string& table_name, + const CompressionType& compression_type) { + std::unique_ptr writer; + NewFileWriter(table_name, &writer); + + // Create table builder. 
+ ImmutableOptions ioptions(options_); + InternalKeyComparator comparator(options_.comparator); + ColumnFamilyOptions cf_options(options_); + MutableCFOptions moptions(cf_options); + IntTblPropCollectorFactories factories; + std::unique_ptr table_builder(table_factory_.NewTableBuilder( + TableBuilderOptions(ioptions, moptions, comparator, &factories, + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); + + // Build table. + for (int i = 0; i < 9; i++) { + std::string key = ToInternalKey(std::to_string(i)); + // Append "00000000" to string value to enhance compression ratio + std::string value = "00000000" + std::to_string(i); + table_builder->Add(key, value); + } + ASSERT_OK(table_builder->Finish()); + } + + void FetchIndexBlock(const std::string& table_name, + CountedMemoryAllocator* heap_buf_allocator, + CountedMemoryAllocator* compressed_buf_allocator, + MemcpyStats* memcpy_stats, BlockContents* index_block, + std::string* result) { + FileOptions fopt(options_); + std::unique_ptr file; + NewFileReader(table_name, fopt, &file); + + // Get handle of the index block. + Footer footer; + ReadFooter(file.get(), &footer); + const BlockHandle& index_handle = footer.index_handle(); + + CompressionType compression_type; + FetchBlock(file.get(), index_handle, BlockType::kIndex, + false /* compressed */, false /* do_uncompress */, + heap_buf_allocator, compressed_buf_allocator, index_block, + memcpy_stats, &compression_type); + ASSERT_EQ(compression_type, CompressionType::kNoCompression); + result->assign(index_block->data.ToString()); + } + + // Fetches the first data block in both direct IO and non-direct IO mode. + // + // compressed: whether the data blocks are compressed; + // do_uncompress: whether the data blocks should be uncompressed on fetching. + // compression_type: the expected compression type. + // + // Expects: + // Block contents are the same. 
+ // Bufferr allocation and memory copy statistics are expected. + void TestFetchDataBlock( + const std::string& table_name_prefix, bool compressed, bool do_uncompress, + std::array expected_stats_by_mode) { + for (CompressionType compression_type : GetSupportedCompressions()) { + bool do_compress = compression_type != kNoCompression; + if (compressed != do_compress) continue; + std::string compression_type_str = + CompressionTypeToString(compression_type); + + std::string table_name = table_name_prefix + compression_type_str; + CreateTable(table_name, compression_type); + + CompressionType expected_compression_type_after_fetch = + (compressed && !do_uncompress) ? compression_type : kNoCompression; + + BlockContents blocks[NumModes]; + std::string block_datas[NumModes]; + MemcpyStats memcpy_stats[NumModes]; + CountedMemoryAllocator heap_buf_allocators[NumModes]; + CountedMemoryAllocator compressed_buf_allocators[NumModes]; + for (int i = 0; i < NumModes; ++i) { + SetMode(static_cast(i)); + FetchFirstDataBlock(table_name, compressed, do_uncompress, + expected_compression_type_after_fetch, + &heap_buf_allocators[i], + &compressed_buf_allocators[i], &blocks[i], + &block_datas[i], &memcpy_stats[i]); + } + + for (int i = 0; i < NumModes - 1; ++i) { + AssertSameBlock(block_datas[i], block_datas[i + 1]); + } + + // Check memcpy and buffer allocation statistics. 
+ for (int i = 0; i < NumModes; ++i) { + const TestStats& expected_stats = expected_stats_by_mode[i]; + + ASSERT_EQ(memcpy_stats[i].num_stack_buf_memcpy, + expected_stats.memcpy_stats.num_stack_buf_memcpy); + ASSERT_EQ(memcpy_stats[i].num_heap_buf_memcpy, + expected_stats.memcpy_stats.num_heap_buf_memcpy); + ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy, + expected_stats.memcpy_stats.num_compressed_buf_memcpy); + + if (kXpressCompression == compression_type) { + // XPRESS allocates memory internally, thus does not support for + // custom allocator verification + continue; + } else { + ASSERT_EQ( + heap_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + + // The allocated buffers are not deallocated until + // the block content is deleted. + ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0); + blocks[i].allocation.reset(); + ASSERT_EQ( + heap_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + } + } + } + } + + void SetMode(Mode mode) { + switch (mode) { + case Mode::kBufferedRead: + options_.use_direct_reads = false; + options_.allow_mmap_reads = false; + break; + case Mode::kBufferedMmap: + options_.use_direct_reads = false; + options_.allow_mmap_reads = true; + break; + case Mode::kDirectRead: + options_.use_direct_reads = true; + options_.allow_mmap_reads = false; + break; + case Mode::kNumModes: + assert(false); + } + } + + private: + std::string test_dir_; + Env* env_; + std::shared_ptr fs_; + BlockBasedTableFactory table_factory_; + Options options_; + + std::string Path(const std::string& fname) { 
return test_dir_ + "/" + fname; } + + void WriteToFile(const std::string& content, const std::string& filename) { + std::unique_ptr f; + ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void NewFileWriter(const std::string& filename, + std::unique_ptr* writer) { + std::string path = Path(filename); + FileOptions file_options; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path, + file_options, writer, nullptr)); + } + + void NewFileReader(const std::string& filename, const FileOptions& opt, + std::unique_ptr* reader) { + std::string path = Path(filename); + std::unique_ptr f; + ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); + } + + void NewTableReader(const ImmutableOptions& ioptions, + const FileOptions& foptions, + const InternalKeyComparator& comparator, + const std::string& table_name, + std::unique_ptr* table) { + std::unique_ptr file; + NewFileReader(table_name, foptions, &file); + + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); + + std::unique_ptr table_reader; + ReadOptions ro; + const auto* table_options = + table_factory_.GetOptions(); + ASSERT_NE(table_options, nullptr); + ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, + comparator, std::move(file), file_size, + &table_reader)); + + table->reset(reinterpret_cast(table_reader.release())); + } + + std::string ToInternalKey(const std::string& key) { + InternalKey internal_key(key, 0, ValueType::kTypeValue); + return internal_key.Encode().ToString(); + } + + void ReadFooter(RandomAccessFileReader* file, Footer* footer) { + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size)); + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* 
prefetch_buffer */, + file_size, footer, + kBlockBasedTableMagicNumber)); + } + + // NOTE: compression_type returns the compression type of the fetched block + // contents, so if the block is fetched and uncompressed, then it's + // kNoCompression. + void FetchBlock(RandomAccessFileReader* file, const BlockHandle& block, + BlockType block_type, bool compressed, bool do_uncompress, + MemoryAllocator* heap_buf_allocator, + MemoryAllocator* compressed_buf_allocator, + BlockContents* contents, MemcpyStats* stats, + CompressionType* compresstion_type) { + ImmutableOptions ioptions(options_); + ReadOptions roptions; + PersistentCacheOptions persistent_cache_options; + Footer footer; + ReadFooter(file, &footer); + std::unique_ptr fetcher(new BlockFetcher( + file, nullptr /* prefetch_buffer */, footer, roptions, block, contents, + ioptions, do_uncompress, compressed, block_type, + UncompressionDict::GetEmptyDict(), persistent_cache_options, + heap_buf_allocator, compressed_buf_allocator)); + + ASSERT_OK(fetcher->ReadBlockContents()); + + stats->num_stack_buf_memcpy = fetcher->TEST_GetNumStackBufMemcpy(); + stats->num_heap_buf_memcpy = fetcher->TEST_GetNumHeapBufMemcpy(); + stats->num_compressed_buf_memcpy = + fetcher->TEST_GetNumCompressedBufMemcpy(); + + *compresstion_type = fetcher->get_compression_type(); + } + + // NOTE: expected_compression_type is the expected compression + // type of the fetched block content, if the block is uncompressed, + // then the expected compression type is kNoCompression. 
+ void FetchFirstDataBlock(const std::string& table_name, bool compressed, + bool do_uncompress, + CompressionType expected_compression_type, + MemoryAllocator* heap_buf_allocator, + MemoryAllocator* compressed_buf_allocator, + BlockContents* block, std::string* result, + MemcpyStats* memcpy_stats) { + ImmutableOptions ioptions(options_); + InternalKeyComparator comparator(options_.comparator); + FileOptions foptions(options_); + + // Get block handle for the first data block. + std::unique_ptr table; + NewTableReader(ioptions, foptions, comparator, table_name, &table); + + std::unique_ptr index_reader; + ReadOptions ro; + ASSERT_OK(BinarySearchIndexReader::Create( + table.get(), ro, nullptr /* prefetch_buffer */, false /* use_cache */, + false /* prefetch */, false /* pin */, nullptr /* lookup_context */, + &index_reader)); + + std::unique_ptr> iter( + index_reader->NewIterator( + ReadOptions(), false /* disable_prefix_seek */, nullptr /* iter */, + nullptr /* get_context */, nullptr /* lookup_context */)); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + BlockHandle first_block_handle = iter->value().handle; + + // Fetch first data block. + std::unique_ptr file; + NewFileReader(table_name, foptions, &file); + CompressionType compression_type; + FetchBlock(file.get(), first_block_handle, BlockType::kData, compressed, + do_uncompress, heap_buf_allocator, compressed_buf_allocator, + block, memcpy_stats, &compression_type); + ASSERT_EQ(compression_type, expected_compression_type); + result->assign(block->data.ToString()); + } +}; + +// Skip the following tests in lite mode since direct I/O is unsupported. +#ifndef ROCKSDB_LITE + +// Fetch index block under both direct IO and non-direct IO. +// Expects: +// the index block contents are the same for both read modes. 
+TEST_F(BlockFetcherTest, FetchIndexBlock) { + for (CompressionType compression : GetSupportedCompressions()) { + std::string table_name = + "FetchIndexBlock" + CompressionTypeToString(compression); + CreateTable(table_name, compression); + + CountedMemoryAllocator allocator; + MemcpyStats memcpy_stats; + BlockContents indexes[NumModes]; + std::string index_datas[NumModes]; + for (int i = 0; i < NumModes; ++i) { + SetMode(static_cast(i)); + FetchIndexBlock(table_name, &allocator, &allocator, &memcpy_stats, + &indexes[i], &index_datas[i]); + } + for (int i = 0; i < NumModes - 1; ++i) { + AssertSameBlock(index_datas[i], index_datas[i + 1]); + } + } +} + +// Data blocks are not compressed, +// fetch data block under direct IO, mmap IO,and non-direct IO. +// Expects: +// 1. in non-direct IO mode, allocate a heap buffer and memcpy the block +// into the buffer; +// 2. in direct IO mode, allocate a heap buffer and memcpy from the +// direct IO buffer to the heap buffer. +TEST_F(BlockFetcherTest, FetchUncompressedDataBlock) { + TestStats expected_non_mmap_stats = { + { + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array expected_stats_by_mode{{ + expected_non_mmap_stats /* kBufferedRead */, + expected_mmap_stats /* kBufferedMmap */, + expected_non_mmap_stats /* kDirectRead */, + }}; + TestFetchDataBlock("FetchUncompressedDataBlock", false, false, + expected_stats_by_mode); +} + +// Data blocks are compressed, +// fetch data block under both direct IO and non-direct IO, +// but do not uncompress. +// Expects: +// 1. 
in non-direct IO mode, allocate a compressed buffer and memcpy the block +// into the buffer; +// 2. in direct IO mode, allocate a compressed buffer and memcpy from the +// direct IO buffer to the compressed buffer. +TEST_F(BlockFetcherTest, FetchCompressedDataBlock) { + TestStats expected_non_mmap_stats = { + { + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 1 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 1 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array expected_stats_by_mode{{ + expected_non_mmap_stats /* kBufferedRead */, + expected_mmap_stats /* kBufferedMmap */, + expected_non_mmap_stats /* kDirectRead */, + }}; + TestFetchDataBlock("FetchCompressedDataBlock", true, false, + expected_stats_by_mode); +} + +// Data blocks are compressed, +// fetch and uncompress data block under both direct IO and non-direct IO. +// Expects: +// 1. in non-direct IO mode, since the block is small, so it's first memcpyed +// to the stack buffer, then a heap buffer is allocated and the block is +// uncompressed into the heap. +// 2. in direct IO mode mode, allocate a heap buffer, then directly uncompress +// and memcpy from the direct IO buffer to the heap buffer. 
+TEST_F(BlockFetcherTest, FetchAndUncompressCompressedDataBlock) { + TestStats expected_buffered_read_stats = { + { + 1 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_direct_read_stats = { + { + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array expected_stats_by_mode{{ + expected_buffered_read_stats, + expected_mmap_stats, + expected_direct_read_stats, + }}; + TestFetchDataBlock("FetchAndUncompressCompressedDataBlock", true, true, + expected_stats_by_mode); +} + +#endif // ROCKSDB_LITE + +} // namespace +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -53,7 +53,9 @@ const Comparator* user_comparator, uint32_t cuckoo_block_size, bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), - uint32_t column_family_id, const std::string& column_family_name) + uint32_t column_family_id, const std::string& column_family_name, + const 
std::string& db_id, const std::string& db_session_id, + uint64_t file_number) : num_hash_func_(2), file_(file), max_hash_table_ratio_(max_hash_table_ratio), @@ -79,6 +81,11 @@ properties_.filter_size = 0; properties_.column_family_id = column_family_id; properties_.column_family_name = column_family_name; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.orig_file_number = file_number; + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { @@ -87,8 +94,11 @@ return; } ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - status_ = Status::Corruption("Unable to parse key into inernal key."); + Status pik_status = + ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + status_ = Status::Corruption("Unable to parse key into internal key. ", + pik_status.getState()); return; } if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { @@ -244,7 +254,6 @@ assert(!closed_); closed_ = true; std::vector buckets; - Status s; std::string unused_bucket; if (num_entries_ > 0) { // Calculate the real hash size if module hash is enabled. @@ -252,9 +261,9 @@ hash_table_size_ = static_cast(num_entries_ / max_hash_table_ratio_); } - s = MakeHashTable(&buckets); - if (!s.ok()) { - return s; + status_ = MakeHashTable(&buckets); + if (!status_.ok()) { + return status_; } // Determine unused_user_key to fill empty buckets. 
std::string unused_user_key = smallest_user_key_; @@ -301,18 +310,19 @@ uint32_t num_added = 0; for (auto& bucket : buckets) { if (bucket.vector_idx == kMaxVectorIdx) { - s = file_->Append(Slice(unused_bucket)); + io_status_ = file_->Append(Slice(unused_bucket)); } else { ++num_added; - s = file_->Append(GetKey(bucket.vector_idx)); - if (s.ok()) { + io_status_ = file_->Append(GetKey(bucket.vector_idx)); + if (io_status_.ok()) { if (value_size_ > 0) { - s = file_->Append(GetValue(bucket.vector_idx)); + io_status_ = file_->Append(GetValue(bucket.vector_idx)); } } } - if (!s.ok()) { - return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } } assert(num_added == NumEntries()); @@ -364,34 +374,31 @@ BlockHandle property_block_handle; property_block_handle.set_offset(offset); property_block_handle.set_size(property_block.size()); - s = file_->Append(property_block); + io_status_ = file_->Append(property_block); offset += property_block.size(); - if (!s.ok()) { - return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } - meta_index_builder.Add(kPropertiesBlock, property_block_handle); + meta_index_builder.Add(kPropertiesBlockName, property_block_handle); Slice meta_index_block = meta_index_builder.Finish(); BlockHandle meta_index_block_handle; meta_index_block_handle.set_offset(offset); meta_index_block_handle.set_size(meta_index_block.size()); - s = file_->Append(meta_index_block); - if (!s.ok()) { - return s; + io_status_ = file_->Append(meta_index_block); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } - Footer footer(kCuckooTableMagicNumber, 1); - footer.set_metaindex_handle(meta_index_block_handle); - footer.set_index_handle(BlockHandle::NullBlockHandle()); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - s = file_->Append(footer_encoding); - - if (file_ != nullptr) { - file_checksum_ = file_->GetFileChecksum(); - } - return s; + FooterBuilder footer; + 
footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset, + kNoChecksum, meta_index_block_handle); + io_status_ = file_->Append(footer.GetSlice()); + status_ = io_status_; + return status_; } void CuckooTableBuilder::Abandon() { @@ -516,11 +523,19 @@ return null_found; } +std::string CuckooTableBuilder::GetFileChecksum() const { + if (file_ != nullptr) { + return file_->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + const char* CuckooTableBuilder::GetFileChecksumFuncName() const { if (file_ != nullptr) { return file_->GetFileChecksumFuncName(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,15 +22,15 @@ class CuckooTableBuilder: public TableBuilder { public: - CuckooTableBuilder(WritableFileWriter* file, double max_hash_table_ratio, - uint32_t max_num_hash_func, uint32_t max_search_depth, - const Comparator* user_comparator, - uint32_t cuckoo_block_size, bool use_module_hash, - bool identity_as_first_hash, - uint64_t (*get_slice_hash)(const Slice&, uint32_t, - uint64_t), - uint32_t column_family_id, - const std::string& column_family_name); + CuckooTableBuilder( + WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_func, uint32_t max_search_depth, + const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool use_module_hash, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), + uint32_t column_family_id, const std::string& column_family_name, + const std::string& db_id = "", const std::string& 
db_session_id = "", + uint64_t file_number = 0); // No copying allowed CuckooTableBuilder(const CuckooTableBuilder&) = delete; void operator=(const CuckooTableBuilder&) = delete; @@ -46,6 +46,9 @@ // Return non-ok iff some error has been detected. Status status() const override { return status_; } + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return io_status_; } + // Finish building the table. Stops using the file passed to the // constructor after this function returns. // REQUIRES: Finish(), Abandon() have not been called @@ -68,7 +71,7 @@ TableProperties GetTableProperties() const override { return properties_; } // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } + std::string GetFileChecksum() const override; // Get file checksum function name const char* GetFileChecksumFuncName() const override; @@ -116,6 +119,7 @@ // Number of keys that contain value (non-deletion op) uint64_t num_values_; Status status_; + IOStatus io_status_; TableProperties properties_; const Comparator* ucomp_; bool use_module_hash_; @@ -126,9 +130,6 @@ std::string smallest_user_key_ = ""; bool closed_; // Either Finish() or Abandon() has been called. - - // Store file checksum. 
If checksum is disabled, its value is "0" - std::string file_checksum_ = kUnknownFileChecksum; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,14 +5,17 @@ #ifndef ROCKSDB_LITE -#include -#include +#include "table/cuckoo/cuckoo_table_builder.h" + #include +#include #include +#include #include "file/random_access_file_reader.h" #include "file/writable_file_writer.h" -#include "table/cuckoo/cuckoo_table_builder.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -35,7 +38,7 @@ env_ = Env::Default(); Options options; options.allow_mmap_reads = true; - env_options_ = EnvOptions(options); + file_options_ = FileOptions(options); } void CheckFileContents(const std::vector& keys, @@ -47,29 +50,27 @@ uint64_t num_deletions = 0; for (const auto& key : keys) { ParsedInternalKey parsed; - if (ParseInternalKey(key, &parsed) && parsed.type == kTypeDeletion) { + Status pik_status = + ParseInternalKey(key, &parsed, true /* log_err_key */); + if (pik_status.ok() && parsed.type == kTypeDeletion) { num_deletions++; } } // Read file - std::unique_ptr read_file; - ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); uint64_t read_file_size; ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env_->GetFileSystem(), fname, file_options_, &file_reader, nullptr)); - // @lint-ignore TXT2 T25377293 Grandfathered in - Options options; - options.allow_mmap_reads = true; - 
ImmutableCFOptions ioptions(options); + Options options; + options.allow_mmap_reads = true; + ImmutableOptions ioptions(options); // Assert Table Properties. - TableProperties* props = nullptr; - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); + std::unique_ptr props; ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, - kCuckooTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + kCuckooTableMagicNumber, ioptions, &props)); // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -106,15 +107,14 @@ ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); ASSERT_EQ(props->column_family_id, 0); ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName); - delete props; // Check contents of the bucket. std::vector keys_found(keys.size(), false); size_t bucket_size = expected_unused_bucket.size(); - for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { + for (uint32_t i = 0; i + 1 < table_size + cuckoo_block_size; ++i) { Slice read_slice; - ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice, - nullptr)); + ASSERT_OK(file_reader->Read(IOOptions(), i * bucket_size, bucket_size, + &read_slice, nullptr, nullptr)); size_t key_idx = std::find(expected_locations.begin(), expected_locations.end(), i) - expected_locations.begin(); @@ -157,7 +157,7 @@ Env* env_; - EnvOptions env_options_; + FileOptions file_options_; std::string fname; const double kHashTableRatio = 0.9; }; @@ -165,10 +165,9 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { std::unique_ptr writable_file; fname = test::PerThreadDBPath("EmptyFile"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + 
ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -206,12 +205,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("NoCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -256,12 +253,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -305,13 +300,11 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; uint32_t cuckoo_block_size = 2; fname = test::PerThreadDBPath("WithCollisionFullKey2"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new 
WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, @@ -360,12 +353,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -411,12 +402,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash, 0 /* column_family_id */, @@ -455,12 +444,11 @@ std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = 
test::PerThreadDBPath("NoCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -500,12 +488,11 @@ std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -547,12 +534,11 @@ std::vector expected_locations = {0, 1, 3, 4, 2}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* 
column_family_id */, @@ -593,12 +579,10 @@ }; hash_map = std::move(hm); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -622,12 +606,10 @@ uint32_t num_hash_fun = 4; std::string user_key = "repeatedkey"; - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,13 +7,15 @@ #include "table/cuckoo/cuckoo_table_factory.h" #include "db/dbformat.h" +#include "options/configurable_helper.h" +#include "rocksdb/utilities/options_type.h" #include "table/cuckoo/cuckoo_table_builder.h" 
#include "table/cuckoo/cuckoo_table_reader.h" namespace ROCKSDB_NAMESPACE { Status CuckooTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const { @@ -28,11 +30,8 @@ } TableBuilder* CuckooTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - // Ignore the skipFIlters flag. Does not apply to this file format - // - // TODO: change builder to take the option struct return new CuckooTableBuilder( file, table_options_.hash_table_ratio, 64, @@ -40,10 +39,12 @@ table_builder_options.internal_comparator.user_comparator(), table_options_.cuckoo_block_size, table_options_.use_module_hash, table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, - column_family_id, table_builder_options.column_family_name); + table_builder_options.column_family_id, + table_builder_options.column_family_name, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); } -std::string CuckooTableFactory::GetPrintableTableOptions() const { +std::string CuckooTableFactory::GetPrintableOptions() const { std::string ret; ret.reserve(2000); const int kBufferSize = 200; @@ -64,6 +65,37 @@ return ret; } +static std::unordered_map cuckoo_table_type_info = + { +#ifndef ROCKSDB_LITE + {"hash_table_ratio", + {offsetof(struct CuckooTableOptions, hash_table_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_search_depth", + {offsetof(struct CuckooTableOptions, max_search_depth), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cuckoo_block_size", + {offsetof(struct CuckooTableOptions, cuckoo_block_size), + 
OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"identity_as_first_hash", + {offsetof(struct CuckooTableOptions, identity_as_first_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_module_hash", + {offsetof(struct CuckooTableOptions, use_module_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) { + RegisterOptions(&table_options_, &cuckoo_table_type_info); +} + TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { return new CuckooTableFactory(table_options); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -52,37 +52,26 @@ // - Does not support prefix bloom filters. 
class CuckooTableFactory : public TableFactory { public: - explicit CuckooTableFactory(const CuckooTableOptions& table_options) - : table_options_(table_options) {} + explicit CuckooTableFactory( + const CuckooTableOptions& table_option = CuckooTableOptions()); ~CuckooTableFactory() {} - const char* Name() const override { return "CuckooTable"; } + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kCuckooTableName(); } + const char* Name() const override { return kCuckooTableName(); } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; - // Sanitizes the specified DB Options. 
- Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } - - std::string GetPrintableTableOptions() const override; - - void* GetOptions() override { return &table_options_; } - - Status GetOptionString(std::string* /*opt_string*/, - const std::string& /*delimiter*/) const override { - return Status::OK(); - } + std::string GetPrintableOptions() const override; private: CuckooTableOptions table_options_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,9 @@ #include #include #include + #include "memory/arena.h" +#include "options/cf_options.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" #include "table/cuckoo/cuckoo_table_factory.h" @@ -33,7 +35,7 @@ extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) @@ -54,15 +56,18 @@ get_slice_hash_(get_slice_hash) { if (!ioptions.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); - } - TableProperties* props = nullptr; - status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - ioptions, &props, true /* compression_type_missing */); - if (!status_.ok()) { return; } - table_props_.reset(props); - auto& user_props = props->user_collected_properties; + { + std::unique_ptr props; + status_ = ReadTableProperties(file_.get(), file_size, + kCuckooTableMagicNumber, ioptions, &props); + if 
(!status_.ok()) { + return; + } + table_props_ = std::move(props); + } + auto& user_props = table_props_->user_collected_properties; auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); if (hash_funs == user_props.end()) { status_ = Status::Corruption("Number of hash functions not found"); @@ -76,7 +81,7 @@ } unused_key_ = unused_key->second; - key_length_ = static_cast(props->fixed_key_len); + key_length_ = static_cast(table_props_->fixed_key_len); auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); if (user_key_len == user_props.end()) { status_ = Status::Corruption("User key length not found"); @@ -136,7 +141,8 @@ cuckoo_block_size_ = *reinterpret_cast( cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; - status_ = file_->Read(0, static_cast(file_size), &file_data_, nullptr); + status_ = file_->Read(IOOptions(), 0, static_cast(file_size), + &file_data_, nullptr, nullptr); } Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, @@ -170,7 +176,9 @@ } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; - ParseInternalKey(full_key, &found_ikey); + Status s = ParseInternalKey(full_key, &found_ikey, + false /* log_err_key */); // TODO + if (!s.ok()) return s; bool dont_care __attribute__((__unused__)); get_context->SaveValue(found_ikey, value, &dont_care); } @@ -378,7 +386,8 @@ const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, - size_t /*compaction_readahead_size*/) { + size_t /*compaction_readahead_size*/, + bool /* allow_unprepared_value */) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,9 +14,7 @@ #include #include -#include "db/dbformat.h" #include "file/random_access_file_reader.h" -#include "options/cf_options.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" @@ -25,10 +23,11 @@ class Arena; class TableReader; +struct ImmutableOptions; class CuckooTableReader: public TableReader { public: - CuckooTableReader(const ImmutableCFOptions& ioptions, + CuckooTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, @@ -52,7 +51,8 @@ const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,7 @@ #include #include "memory/arena.h" +#include "rocksdb/db.h" #include "table/cuckoo/cuckoo_table_builder.h" #include "table/cuckoo/cuckoo_table_factory.h" #include "table/cuckoo/cuckoo_table_reader.h" @@ -31,7 +32,6 @@ #include "util/string_util.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; -using GFLAGS_NAMESPACE::SetUsageMessage; DEFINE_string(file_dir, "", "Directory where the files will be created" " for benchmark. Added for using tmpfs."); @@ -69,7 +69,7 @@ CuckooReaderTest() { options.allow_mmap_reads = true; env = options.env; - env_options = EnvOptions(options); + file_options = FileOptions(options); } void SetUp(int num) { @@ -89,12 +89,9 @@ void CreateCuckooFileAndCheckReader( const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); - + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), fname, + file_options, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName); @@ -110,12 +107,10 @@ ASSERT_OK(file_writer->Close()); // Check reader now. 
- std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -140,12 +135,10 @@ } void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -212,9 +205,17 @@ uint64_t file_size; Options options; Env* env; - EnvOptions env_options; + FileOptions file_options; }; +TEST_F(CuckooReaderTest, FileNotMmaped) { + options.allow_mmap_reads = false; + ImmutableOptions ioptions(options); + CuckooTableReader reader(ioptions, nullptr, 0, nullptr, nullptr); + ASSERT_TRUE(reader.status().IsInvalidArgument()); + ASSERT_STREQ("File is not mmaped", reader.status().getState()); +} + TEST_F(CuckooReaderTest, WhenKeyExists) { SetUp(kNumHashFunc); fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists"); @@ -323,12 +324,12 @@ } auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - 
std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, GetSliceHash); ASSERT_OK(reader.status()); @@ -408,15 +409,13 @@ const uint64_t num, double hash_ratio) { Options options; options.allow_mmap_reads = true; - Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, file_options, &file_writer, + nullptr)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, @@ -433,14 +432,13 @@ ASSERT_OK(file_writer->Close()); uint64_t file_size; - env->GetFileSize(fname, &file_size); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, 
std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); @@ -462,18 +460,18 @@ Options options; options.allow_mmap_reads = true; Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); uint64_t file_size; - env->GetFileSize(fname, &file_size); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); @@ -492,7 +490,7 @@ for (uint64_t i = 0; i < num; ++i) { keys.push_back(2 * i); } - std::random_shuffle(keys.begin(), keys.end()); + RandomShuffle(keys.begin(), keys.end()); PinnableSlice value; // Assume only the fast path is triggered diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,19 +14,24 @@ #include "block_fetcher.h" #include "file/random_access_file_reader.h" -#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" +#include "options/options_helper.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" #include 
"table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/hash.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -41,6 +46,7 @@ const uint64_t kLegacyPlainTableMagicNumber = 0; const uint64_t kPlainTableMagicNumber = 0; #endif +const char* kHostnameForDbHostId = "__hostname__"; bool ShouldReportDetailedTime(Env* env, Statistics* stats) { return env != nullptr && stats != nullptr && @@ -49,11 +55,20 @@ void BlockHandle::EncodeTo(std::string* dst) const { // Sanity check that all fields have been set - assert(offset_ != ~static_cast(0)); - assert(size_ != ~static_cast(0)); + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); PutVarint64Varint64(dst, offset_, size_); } +char* BlockHandle::EncodeTo(char* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); + char* cur = EncodeVarint64(dst, offset_); + cur = EncodeVarint64(cur, size_); + return cur; +} + Status BlockHandle::DecodeFrom(Slice* input) { if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { return Status::OK(); @@ -93,8 +108,10 @@ void IndexValue::EncodeTo(std::string* dst, bool have_first_key, const BlockHandle* previous_handle) const { if (previous_handle) { + // WART: this is specific to Block-based table assert(handle.offset() == previous_handle->offset() + - previous_handle->size() + kBlockTrailerSize); + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize); PutVarsignedint64(dst, handle.size() - previous_handle->size()); } else { handle.EncodeTo(dst); @@ -113,9 +130,10 @@ if (!GetVarsignedint64(input, &delta)) { return Status::Corruption("bad delta-encoded index value"); } - handle = BlockHandle( - previous_handle->offset() + 
previous_handle->size() + kBlockTrailerSize, - previous_handle->size() + delta); + // WART: this is specific to Block-based table + handle = BlockHandle(previous_handle->offset() + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize, + previous_handle->size() + delta); } else { Status s = handle.DecodeFrom(input); if (!s.ok()) { @@ -155,107 +173,156 @@ return kPlainTableMagicNumber; } assert(false); - return 0; + return magic_number; +} +inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber) { + return kLegacyBlockBasedTableMagicNumber; + } + if (magic_number == kPlainTableMagicNumber) { + return kLegacyPlainTableMagicNumber; + } + assert(false); + return magic_number; } +inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber || + magic_number == kLegacyBlockBasedTableMagicNumber) { + return static_cast(BlockBasedTable::kBlockTrailerSize); + } else { + return 0; + } +} + +// Footer format, in three parts: +// * Part1 +// -> format_version == 0 (inferred from legacy magic number) +// (0 bytes) +// -> format_version >= 1 +// checksum type (char, 1 byte) +// * Part2 +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40 +// * Part3 +// -> format_version == 0 (inferred from legacy magic number) +// legacy magic number (8 bytes) +// -> format_version >= 1 (inferred from NOT legacy magic number) +// format_version (uint32LE, 4 bytes), also called "footer version" +// newer magic number (8 bytes) + +constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength; } // namespace -// legacy footer format: -// metaindex handle (varint64 offset, varint64 size) -// index handle (varint64 offset, varint64 size) -// to make the total size 2 * BlockHandle::kMaxEncodedLength -// table_magic_number (8 bytes) -// new footer 
format: -// checksum type (char, 1 byte) -// metaindex handle (varint64 offset, varint64 size) -// index handle (varint64 offset, varint64 size) -// to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 -// footer version (4 bytes) -// table_magic_number (8 bytes) -void Footer::EncodeTo(std::string* dst) const { - assert(HasInitializedTableMagicNumber()); - if (IsLegacyFooterFormat(table_magic_number())) { - // has to be default checksum with legacy footer - assert(checksum_ == kCRC32c); - const size_t original_size = dst->size(); - metaindex_handle_.EncodeTo(dst); - index_handle_.EncodeTo(dst); - dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding - PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); - PutFixed32(dst, static_cast(table_magic_number() >> 32)); - assert(dst->size() == original_size + kVersion0EncodedLength); +void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle) { + (void)footer_offset; // Future use + + assert(magic_number != Footer::kNullTableMagicNumber); + assert(IsSupportedFormatVersion(format_version)); + + char* part2; + char* part3; + if (format_version > 0) { + slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength); + // Generate parts 1 and 3 + char* cur = data_.data(); + // Part 1 + *(cur++) = checksum_type; + // Part 2 + part2 = cur; + // Skip over part 2 for now + cur += kFooterPart2Size; + // Part 3 + part3 = cur; + EncodeFixed32(cur, format_version); + cur += 4; + EncodeFixed64(cur, magic_number); + assert(cur + 8 == slice_.data() + slice_.size()); } else { - const size_t original_size = dst->size(); - dst->push_back(static_cast(checksum_)); - metaindex_handle_.EncodeTo(dst); - index_handle_.EncodeTo(dst); - dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding - PutFixed32(dst, version()); - PutFixed32(dst, 
static_cast(table_magic_number() & 0xffffffffu)); - PutFixed32(dst, static_cast(table_magic_number() >> 32)); - assert(dst->size() == original_size + kNewVersionsEncodedLength); + slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength); + // Legacy SST files use kCRC32c checksum but it's not stored in footer. + assert(checksum_type == kNoChecksum || checksum_type == kCRC32c); + // Generate part 3 (part 1 empty, skip part 2 for now) + part2 = data_.data(); + part3 = part2 + kFooterPart2Size; + char* cur = part3; + // Use legacy magic numbers to indicate format_version=0, for + // compatibility. No other cases should use format_version=0. + EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number)); + assert(cur + 8 == slice_.data() + slice_.size()); + } + + { + char* cur = part2; + cur = metaindex_handle.EncodeTo(cur); + cur = index_handle.EncodeTo(cur); + // Zero pad remainder + std::fill(cur, part3, char{0}); } } -Footer::Footer(uint64_t _table_magic_number, uint32_t _version) - : version_(_version), - checksum_(kCRC32c), - table_magic_number_(_table_magic_number) { - // This should be guaranteed by constructor callers - assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); -} +Status Footer::DecodeFrom(Slice input, uint64_t input_offset) { + (void)input_offset; // Future use -Status Footer::DecodeFrom(Slice* input) { - assert(!HasInitializedTableMagicNumber()); + // Only decode to unused Footer + assert(table_magic_number_ == kNullTableMagicNumber); assert(input != nullptr); - assert(input->size() >= kMinEncodedLength); + assert(input.size() >= kMinEncodedLength); - const char* magic_ptr = - input->data() + input->size() - kMagicNumberLengthByte; - const uint32_t magic_lo = DecodeFixed32(magic_ptr); - const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); - uint64_t magic = ((static_cast(magic_hi) << 32) | - (static_cast(magic_lo))); + const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte; + uint64_t magic = 
DecodeFixed64(magic_ptr); // We check for legacy formats here and silently upconvert them bool legacy = IsLegacyFooterFormat(magic); if (legacy) { magic = UpconvertLegacyFooterFormat(magic); } - set_table_magic_number(magic); + table_magic_number_ = magic; + block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic); + // Parse Part3 if (legacy) { // The size is already asserted to be at least kMinEncodedLength // at the beginning of the function - input->remove_prefix(input->size() - kVersion0EncodedLength); - version_ = 0 /* legacy */; - checksum_ = kCRC32c; + input.remove_prefix(input.size() - kVersion0EncodedLength); + format_version_ = 0 /* legacy */; + checksum_type_ = kCRC32c; } else { - version_ = DecodeFixed32(magic_ptr - 4); - // Footer version 1 and higher will always occupy exactly this many bytes. - // It consists of the checksum type, two block handles, padding, - // a version number, and a magic number - if (input->size() < kNewVersionsEncodedLength) { - return Status::Corruption("input is too short to be an sstable"); - } else { - input->remove_prefix(input->size() - kNewVersionsEncodedLength); + const char* part3_ptr = magic_ptr - 4; + format_version_ = DecodeFixed32(part3_ptr); + if (!IsSupportedFormatVersion(format_version_)) { + return Status::Corruption("Corrupt or unsupported format_version: " + + ROCKSDB_NAMESPACE::ToString(format_version_)); } - uint32_t chksum; - if (!GetVarint32(input, &chksum)) { - return Status::Corruption("bad checksum type"); + // All known format versions >= 1 occupy exactly this many bytes. 
+ if (input.size() < kNewVersionsEncodedLength) { + return Status::Corruption("Input is too short to be an SST file"); } - checksum_ = static_cast(chksum); - } + uint64_t adjustment = input.size() - kNewVersionsEncodedLength; + input.remove_prefix(adjustment); - Status result = metaindex_handle_.DecodeFrom(input); - if (result.ok()) { - result = index_handle_.DecodeFrom(input); + // Parse Part1 + char chksum = input.data()[0]; + checksum_type_ = lossless_cast(chksum); + if (!IsSupportedChecksumType(checksum_type())) { + return Status::Corruption( + "Corrupt or unsupported checksum type: " + + ROCKSDB_NAMESPACE::ToString(lossless_cast(chksum))); + } + // Consume checksum type field + input.remove_prefix(1); } + + // Parse Part2 + Status result = metaindex_handle_.DecodeFrom(&input); if (result.ok()) { - // We skip over any leftover data (just padding for now) in "input" - const char* end = magic_ptr + kMagicNumberLengthByte; - *input = Slice(end, input->data() + input->size() - end); + result = index_handle_.DecodeFrom(&input); } return result; + // Padding in part2 is ignored } std::string Footer::ToString() const { @@ -269,19 +336,17 @@ result.append("table_magic_number: " + ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); } else { - result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) + - "\n "); result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); result.append("index handle: " + index_handle_.ToString() + "\n "); - result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) + - "\n "); result.append("table_magic_number: " + ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); + result.append("format version: " + + ROCKSDB_NAMESPACE::ToString(format_version_) + "\n "); } return result; } -Status ReadFooterFromFile(RandomAccessFileReader* file, +Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, Footer* footer, 
uint64_t enforce_table_magic_number) { @@ -292,18 +357,30 @@ file->file_name()); } - char footer_space[Footer::kMaxEncodedLength]; + std::string footer_buf; + AlignedBuf internal_buf; Slice footer_input; - size_t read_offset = - (file_size > Footer::kMaxEncodedLength) - ? static_cast(file_size - Footer::kMaxEncodedLength) - : 0; + uint64_t read_offset = (file_size > Footer::kMaxEncodedLength) + ? file_size - Footer::kMaxEncodedLength + : 0; Status s; + // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now, + // there is no readahead for point lookups, so TryReadFromCache will fail if + // the required data is not in the prefetch buffer. Once deadline is enabled + // for iterator, TryReadFromCache might do a readahead. Revisit to see if we + // need to pass a timeout at that point if (prefetch_buffer == nullptr || - !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, - &footer_input)) { - s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, - footer_space); + !prefetch_buffer->TryReadFromCache(IOOptions(), file, read_offset, + Footer::kMaxEncodedLength, + &footer_input, nullptr)) { + if (file->use_direct_io()) { + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, nullptr, &internal_buf); + } else { + footer_buf.reserve(Footer::kMaxEncodedLength); + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, &footer_buf[0], nullptr); + } if (!s.ok()) return s; } @@ -316,7 +393,7 @@ file->file_name()); } - s = footer->DecodeFrom(&footer_input); + s = footer->DecodeFrom(footer_input, read_offset); if (!s.ok()) { return s; } @@ -330,117 +407,134 @@ return Status::OK(); } +namespace { +// Custom handling for the last byte of a block, to avoid invoking streaming +// API to get an effective block checksum. This function is its own inverse +// because it uses xor. 
+inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) { + // This strategy bears some resemblance to extending a CRC checksum by one + // more byte, except we don't need to re-mix the input checksum as long as + // we do this step only once (per checksum). + const uint32_t kRandomPrime = 0x6b9083d9; + return checksum ^ lossless_cast(last_byte) * kRandomPrime; +} +} // namespace + +uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, + size_t data_size) { + switch (type) { + case kCRC32c: + return crc32c::Mask(crc32c::Value(data, data_size)); + case kxxHash: + return XXH32(data, data_size, /*seed*/ 0); + case kxxHash64: + return Lower32of64(XXH64(data, data_size, /*seed*/ 0)); + case kXXH3: { + if (data_size == 0) { + // Special case because of special handling for last byte, not + // present in this case. Can be any value different from other + // small input size checksums. + return 0; + } else { + // See corresponding code in ComputeBuiltinChecksumWithLastByte + uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1)); + return ModifyChecksumForLastByte(v, data[data_size - 1]); + } + } + default: // including kNoChecksum + return 0; + } +} + +uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, + size_t data_size, char last_byte) { + switch (type) { + case kCRC32c: { + uint32_t crc = crc32c::Value(data, data_size); + // Extend to cover last byte (compression type) + crc = crc32c::Extend(crc, &last_byte, 1); + return crc32c::Mask(crc); + } + case kxxHash: { + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, data, data_size); + // Extend to cover last byte (compression type) + XXH32_update(state, &last_byte, 1); + uint32_t v = XXH32_digest(state); + XXH32_freeState(state); + return v; + } + case kxxHash64: { + XXH64_state_t* const state = XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, data, data_size); + // Extend to cover last 
byte (compression type) + XXH64_update(state, &last_byte, 1); + uint32_t v = Lower32of64(XXH64_digest(state)); + XXH64_freeState(state); + return v; + } + case kXXH3: { + // XXH3 is a complicated hash function that is extremely fast on + // contiguous input, but that makes its streaming support rather + // complex. It is worth custom handling of the last byte (`type`) + // in order to avoid allocating a large state object and bringing + // that code complexity into CPU working set. + uint32_t v = Lower32of64(XXH3_64bits(data, data_size)); + return ModifyChecksumForLastByte(v, last_byte); + } + default: // including kNoChecksum + return 0; + } +} + Status UncompressBlockContentsForCompressionType( const UncompressionInfo& uncompression_info, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { - CacheAllocationPtr ubuf; + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { + Status ret = Status::OK(); assert(uncompression_info.type() != kNoCompression && "Invalid compression type"); - StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( - ioptions.env, ioptions.statistics)); - int decompress_size = 0; - switch (uncompression_info.type()) { - case kSnappyCompression: { - size_t ulength = 0; - static char snappy_corrupt_msg[] = - "Snappy not supported or corrupted Snappy compressed block contents"; - if (!Snappy_GetUncompressedLength(data, n, &ulength)) { - return Status::Corruption(snappy_corrupt_msg); - } - ubuf = AllocateBlock(ulength, allocator); - if (!Snappy_Uncompress(data, n, ubuf.get())) { - return Status::Corruption(snappy_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), ulength); - break; + StopWatchNano timer(ioptions.clock, + ShouldReportDetailedTime(ioptions.env, ioptions.stats)); + size_t uncompressed_size = 0; + CacheAllocationPtr ubuf = + UncompressData(uncompression_info, data, n, &uncompressed_size, + 
GetCompressFormatForVersion(format_version), allocator); + if (!ubuf) { + if (!CompressionTypeSupported(uncompression_info.type())) { + return Status::NotSupported( + "Unsupported compression method for this build", + CompressionTypeToString(uncompression_info.type())); + } else { + return Status::Corruption( + "Corrupted compressed block contents", + CompressionTypeToString(uncompression_info.type())); } - case kZlibCompression: - ubuf = Zlib_Uncompress( - uncompression_info, data, n, &decompress_size, - GetCompressFormatForVersion(kZlibCompression, format_version), - allocator); - if (!ubuf) { - static char zlib_corrupt_msg[] = - "Zlib not supported or corrupted Zlib compressed block contents"; - return Status::Corruption(zlib_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kBZip2Compression: - ubuf = BZip2_Uncompress( - data, n, &decompress_size, - GetCompressFormatForVersion(kBZip2Compression, format_version), - allocator); - if (!ubuf) { - static char bzip2_corrupt_msg[] = - "Bzip2 not supported or corrupted Bzip2 compressed block contents"; - return Status::Corruption(bzip2_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kLZ4Compression: - ubuf = LZ4_Uncompress( - uncompression_info, data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4Compression, format_version), - allocator); - if (!ubuf) { - static char lz4_corrupt_msg[] = - "LZ4 not supported or corrupted LZ4 compressed block contents"; - return Status::Corruption(lz4_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kLZ4HCCompression: - ubuf = LZ4_Uncompress( - uncompression_info, data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4HCCompression, format_version), - allocator); - if (!ubuf) { - static char lz4hc_corrupt_msg[] = - "LZ4HC not supported or corrupted LZ4HC compressed block contents"; - return Status::Corruption(lz4hc_corrupt_msg); - 
} - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kXpressCompression: - // XPRESS allocates memory internally, thus no support for custom - // allocator. - ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); - if (!ubuf) { - static char xpress_corrupt_msg[] = - "XPRESS not supported or corrupted XPRESS compressed block " - "contents"; - return Status::Corruption(xpress_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kZSTD: - case kZSTDNotFinalCompression: - ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size, - allocator); - if (!ubuf) { - static char zstd_corrupt_msg[] = - "ZSTD not supported or corrupted ZSTD compressed block contents"; - return Status::Corruption(zstd_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - default: - return Status::Corruption("bad block type"); } - if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { - RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, + *contents = BlockContents(std::move(ubuf), uncompressed_size); + + if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { + RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, + RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED, contents->data.size()); - RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); + RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); - return Status::OK(); + TEST_SYNC_POINT_CALLBACK( + "UncompressBlockContentsForCompressionType:TamperWithReturnValue", + static_cast(&ret)); + TEST_SYNC_POINT_CALLBACK( + "UncompressBlockContentsForCompressionType:" + "TamperWithDecompressionOutput", + static_cast(contents)); + + return ret; } // @@ -453,13 +547,27 @@ Status UncompressBlockContents(const UncompressionInfo& uncompression_info, const char* data, size_t n, 
BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { assert(data[n] != kNoCompression); - assert(data[n] == uncompression_info.type()); + assert(data[n] == static_cast(uncompression_info.type())); return UncompressBlockContentsForCompressionType(uncompression_info, data, n, contents, format_version, ioptions, allocator); } +// Replace the contents of db_host_id with the actual hostname, if db_host_id +// matches the keyword kHostnameForDbHostId +Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { + assert(db_host_id); + if (*db_host_id == kHostnameForDbHostId) { + Status s = env->GetHostNameString(db_host_id); + if (!s.ok()) { + db_host_id->clear(); + } + return s; + } + + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,21 +8,21 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include + +#include +#include #include + #include "file/file_prefetch_buffer.h" #include "file/random_access_file_reader.h" - -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/status.h" -#include "rocksdb/table.h" - #include "memory/memory_allocator.h" #include "options/cf_options.h" #include "port/malloc.h" #include "port/port.h" // noexcept -#include "table/persistent_cache_options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "util/hash.h" namespace ROCKSDB_NAMESPACE { @@ -32,12 +32,14 @@ extern bool ShouldReportDetailedTime(Env* env, Statistics* stats); // the length of the magic number in bytes. 
-const int kMagicNumberLengthByte = 8; +constexpr uint32_t kMagicNumberLengthByte = 8; // BlockHandle is a pointer to the extent of a file that stores a data // block or a meta block. class BlockHandle { public: + // Creates a block handle with special values indicating "uninitialized," + // distinct from the "null" block handle. BlockHandle(); BlockHandle(uint64_t offset, uint64_t size); @@ -50,6 +52,7 @@ void set_size(uint64_t _size) { size_ = _size; } void EncodeTo(std::string* dst) const; + char* EncodeTo(char* dst) const; Status DecodeFrom(Slice* input); Status DecodeSizeFrom(uint64_t offset, Slice* input); @@ -63,7 +66,14 @@ static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; } // Maximum encoding length of a BlockHandle - enum { kMaxEncodedLength = 10 + 10 }; + static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length; + + inline bool operator==(const BlockHandle& rhs) const { + return offset_ == rhs.offset_ && size_ == rhs.size_; + } + inline bool operator!=(const BlockHandle& rhs) const { + return !(*this == rhs); + } private: uint64_t offset_; @@ -101,140 +111,160 @@ std::string ToString(bool hex, bool have_first_key) const; }; -inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, - uint32_t version) { -#ifdef NDEBUG - (void)compression_type; -#endif - // snappy is not versioned - assert(compression_type != kSnappyCompression && - compression_type != kXpressCompression && - compression_type != kNoCompression); - // As of version 2, we encode compressed block with +inline uint32_t GetCompressFormatForVersion(uint32_t format_version) { + // As of format_version 2, we encode compressed block with // compress_format_version == 2. Before that, the version is 1. // DO NOT CHANGE THIS FUNCTION, it affects disk format - return version >= 2 ? 2 : 1; + return format_version >= 2 ? 
2 : 1; } -inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 5; +constexpr uint32_t kLatestFormatVersion = 5; + +inline bool IsSupportedFormatVersion(uint32_t version) { + return version <= kLatestFormatVersion; } -// Footer encapsulates the fixed information stored at the tail -// end of every table file. +// Footer encapsulates the fixed information stored at the tail end of every +// SST file. In general, it should only include things that cannot go +// elsewhere under the metaindex block. For example, checksum_type is +// required for verifying metaindex block checksum (when applicable), but +// index block handle can easily go in metaindex block (possible future). +// See also FooterBuilder below. class Footer { public: - // Constructs a footer without specifying its table magic number. - // In such case, the table magic number of such footer should be - // initialized via @ReadFooterFromFile(). - // Use this when you plan to load Footer with DecodeFrom(). Never use this - // when you plan to EncodeTo. - Footer() : Footer(kInvalidTableMagicNumber, 0) {} - - // Use this constructor when you plan to write out the footer using - // EncodeTo(). Never use this constructor with DecodeFrom(). - Footer(uint64_t table_magic_number, uint32_t version); - - // The version of the footer in this file - uint32_t version() const { return version_; } - - // The checksum type used in this file - ChecksumType checksum() const { return checksum_; } - void set_checksum(const ChecksumType c) { checksum_ = c; } + // Create empty. Populate using DecodeFrom. + Footer() {} + + // Deserialize a footer (populate fields) from `input` and check for various + // corruptions. `input_offset` is the offset within the target file of + // `input` buffer (future use). + Status DecodeFrom(Slice input, uint64_t input_offset); + + // Table magic number identifies file as RocksDB SST file and which kind of + // SST format is use. 
+ uint64_t table_magic_number() const { return table_magic_number_; } + + // A version (footer and more) within a kind of SST. (It would add more + // unnecessary complexity to separate footer versions and + // BBTO::format_version.) + uint32_t format_version() const { return format_version_; } - // The block handle for the metaindex block of the table + // Block handle for metaindex block. const BlockHandle& metaindex_handle() const { return metaindex_handle_; } - void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } - // The block handle for the index block of the table + // Block handle for (top-level) index block. const BlockHandle& index_handle() const { return index_handle_; } - void set_index_handle(const BlockHandle& h) { index_handle_ = h; } + // Checksum type used in the file. + ChecksumType checksum_type() const { + return static_cast(checksum_type_); + } - uint64_t table_magic_number() const { return table_magic_number_; } + // Block trailer size used by file with this footer (e.g. 5 for block-based + // table and 0 for plain table). This is inferred from magic number so + // not in the serialized form. + inline size_t GetBlockTrailerSize() const { return block_trailer_size_; } - void EncodeTo(std::string* dst) const; + // Convert this object to a human readable form + std::string ToString() const; - // Set the current footer based on the input slice. + // Encoded lengths of Footers. Bytes for serialized Footer will always be + // >= kMinEncodedLength and <= kMaxEncodedLength. // - // REQUIRES: table_magic_number_ is not set (i.e., - // HasInitializedTableMagicNumber() is true). The function will initialize the - // magic number - Status DecodeFrom(Slice* input); - - // Encoded length of a Footer. Note that the serialization of a Footer will - // always occupy at least kMinEncodedLength bytes. If fields are changed - // the version number should be incremented and kMaxEncodedLength should be - // increased accordingly. 
- enum { - // Footer version 0 (legacy) will always occupy exactly this many bytes. - // It consists of two block handles, padding, and a magic number. - kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8, - // Footer of versions 1 and higher will always occupy exactly this many - // bytes. It consists of the checksum type, two block handles, padding, - // a version number (bigger than 1), and a magic number - kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, - kMinEncodedLength = kVersion0EncodedLength, - kMaxEncodedLength = kNewVersionsEncodedLength, - }; + // Footer version 0 (legacy) will always occupy exactly this many bytes. + // It consists of two block handles, padding, and a magic number. + static constexpr uint32_t kVersion0EncodedLength = + 2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte; + static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength; + + // Footer of versions 1 and higher will always occupy exactly this many + // bytes. It originally consisted of the checksum type, two block handles, + // padding (to maximum handle encoding size), a format version number, and a + // magic number. + static constexpr uint32_t kNewVersionsEncodedLength = + 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte; + static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength; - static const uint64_t kInvalidTableMagicNumber = 0; + static constexpr uint64_t kNullTableMagicNumber = 0; - // convert this object to a human readable form - std::string ToString() const; + static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU; private: - // REQUIRES: magic number wasn't initialized. 
- void set_table_magic_number(uint64_t magic_number) { - assert(!HasInitializedTableMagicNumber()); - table_magic_number_ = magic_number; - } + static constexpr int kInvalidChecksumType = + (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum; - // return true if @table_magic_number_ is set to a value different - // from @kInvalidTableMagicNumber. - bool HasInitializedTableMagicNumber() const { - return (table_magic_number_ != kInvalidTableMagicNumber); - } - - uint32_t version_; - ChecksumType checksum_; + uint64_t table_magic_number_ = kNullTableMagicNumber; + uint32_t format_version_ = kInvalidFormatVersion; BlockHandle metaindex_handle_; BlockHandle index_handle_; - uint64_t table_magic_number_ = 0; + int checksum_type_ = kInvalidChecksumType; + uint8_t block_trailer_size_ = 0; +}; + +// Builder for Footer +class FooterBuilder { + public: + // Run builder in inputs. This is a single step with lots of parameters for + // efficiency (based on perf testing). + // * table_magic_number identifies file as RocksDB SST file and which kind of + // SST format is use. + // * format_version is a version for the footer and can also apply to other + // aspects of the SST file (see BlockBasedTableOptions::format_version). + // NOTE: To save complexity in the caller, when format_version == 0 and + // there is a corresponding legacy magic number to the one specified, the + // legacy magic number will be written for forward compatibility. + // * footer_offset is the file offset where the footer will be written + // (for future use). + // * checksum_type is for formats using block checksums. + // * index_handle is optional for some kinds of SST files. + void Build(uint64_t table_magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle = BlockHandle::NullBlockHandle()); + + // After Builder, get a Slice for the serialized Footer, backed by this + // FooterBuilder. 
+ const Slice& GetSlice() const { + assert(slice_.size()); + return slice_; + } + + private: + Slice slice_; + std::array data_; }; // Read the footer from file // If enforce_table_magic_number != 0, ReadFooterFromFile() will return // corruption if table_magic number is not equal to enforce_table_magic_number -Status ReadFooterFromFile(RandomAccessFileReader* file, +Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number = 0); -// 1-byte type + 32-bit crc -static const size_t kBlockTrailerSize = 5; - -// Make block size calculation for IO less error prone -inline uint64_t block_size(const BlockHandle& handle) { - return handle.size() + kBlockTrailerSize; -} - -inline CompressionType get_block_compression_type(const char* block_data, - size_t block_size) { - return static_cast(block_data[block_size]); -} +// Computes a checksum using the given ChecksumType. Sometimes we need to +// include one more input byte logically at the end but not part of the main +// data buffer. If data_size >= 1, then +// ComputeBuiltinChecksum(type, data, size) +// == +// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1]) +uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, + size_t size); +uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, + size_t size, char last_byte); // Represents the contents of a block read from an SST file. Depending on how // it's created, it may or may not own the actual block bytes. As an example, // BlockContents objects representing data read from mmapped files only point // into the mmapped region. struct BlockContents { - Slice data; // Actual contents of data + // Points to block payload (without trailer) + Slice data; CacheAllocationPtr allocation; #ifndef NDEBUG - // Whether the block is a raw block, which contains compression type - // byte. 
It is only used for assertion. + // Whether there is a known trailer after what is pointed to by `data`. + // See BlockBasedTable::GetCompressionType. bool is_raw_block = false; #endif // NDEBUG @@ -256,14 +286,6 @@ // Returns whether the object has ownership of the underlying data bytes. bool own_bytes() const { return allocation.get() != nullptr; } - // It's the caller's responsibility to make sure that this is - // for raw block contents, which contains the compression - // byte in the end. - CompressionType get_compression_type() const { - assert(is_raw_block); - return get_block_compression_type(data.data(), data.size()); - } - // The additional memory space taken by the block data. size_t usable_size() const { if (allocation.get() != nullptr) { @@ -299,15 +321,6 @@ } }; -// Read the block identified by "handle" from "file". On failure -// return non-OK. On success fill *result and return OK. -extern Status ReadBlockContents( - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, - bool do_uncompress = true, const Slice& compression_dict = Slice(), - const PersistentCacheOptions& cache_options = PersistentCacheOptions()); - // The 'data' points to the raw block contents read in from file. // This method allocates a new heap buffer and the raw block // contents are uncompresed into this buffer. 
This buffer is @@ -319,7 +332,7 @@ const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); // This is an extension to UncompressBlockContents that accepts @@ -328,15 +341,17 @@ extern Status UncompressBlockContentsForCompressionType( const UncompressionInfo& info, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); + +// Replace db_host_id contents with the real hostname if necessary +extern Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id); // Implementation details follow. Clients should ignore, // TODO(andrewkr): we should prefer one way of representing a null/uninitialized // BlockHandle. Currently we use zeros for null and use negation-of-zeros for // uninitialized. -inline BlockHandle::BlockHandle() - : BlockHandle(~static_cast(0), ~static_cast(0)) {} +inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {} inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size) : offset_(_offset), size_(_size) {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,15 +4,17 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "table/get_context.h" + +#include "db/blob//blob_fetcher.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/read_callback.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" -#include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -38,13 +40,17 @@ } // namespace -GetContext::GetContext( - const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, const Slice& user_key, - PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + std::string* timestamp, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, + uint64_t tracing_get_id, BlobFetcher* blob_fetcher) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -52,23 +58,38 @@ state_(init_state), user_key_(user_key), pinnable_val_(pinnable_val), + timestamp_(timestamp), value_found_(value_found), merge_context_(merge_context), max_covering_tombstone_seq_(_max_covering_tombstone_seq), - env_(env), + clock_(clock), seq_(seq), replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), do_merge_(do_merge), is_blob_index_(is_blob_index), - tracing_get_id_(tracing_get_id) { + 
tracing_get_id_(tracing_get_id), + blob_fetcher_(blob_fetcher) { if (seq_) { *seq_ = kMaxSequenceNumber; } sample_ = should_sample_file_read(); } +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher) + : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key, + pinnable_val, nullptr, value_found, merge_context, do_merge, + _max_covering_tombstone_seq, clock, seq, _pinned_iters_mgr, + callback, is_blob_index, tracing_get_id, blob_fetcher) {} + // Called from TableCache::Get and Table::Get when file/block in which // key may exist are not there in TableCache/BlockCache respectively. 
In this // case we can't guarantee that key does not exist and are not permitted to do @@ -138,6 +159,10 @@ if (get_context_stats_.num_cache_add > 0) { RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add); } + if (get_context_stats_.num_cache_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD_REDUNDANT, + get_context_stats_.num_cache_add_redundant); + } if (get_context_stats_.num_cache_bytes_write > 0) { RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE, get_context_stats_.num_cache_bytes_write); @@ -146,6 +171,10 @@ RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD, get_context_stats_.num_cache_index_add); } + if (get_context_stats_.num_cache_index_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD_REDUNDANT, + get_context_stats_.num_cache_index_add_redundant); + } if (get_context_stats_.num_cache_index_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT, get_context_stats_.num_cache_index_bytes_insert); @@ -154,6 +183,10 @@ RecordTick(statistics_, BLOCK_CACHE_DATA_ADD, get_context_stats_.num_cache_data_add); } + if (get_context_stats_.num_cache_data_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD_REDUNDANT, + get_context_stats_.num_cache_data_add_redundant); + } if (get_context_stats_.num_cache_data_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, get_context_stats_.num_cache_data_bytes_insert); @@ -162,6 +195,10 @@ RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, get_context_stats_.num_cache_filter_add); } + if (get_context_stats_.num_cache_filter_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD_REDUNDANT, + get_context_stats_.num_cache_filter_add_redundant); + } if (get_context_stats_.num_cache_filter_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, get_context_stats_.num_cache_filter_bytes_insert); @@ -170,6 +207,10 @@ RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD, 
get_context_stats_.num_cache_compression_dict_add); } + if (get_context_stats_.num_cache_compression_dict_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + get_context_stats_.num_cache_compression_dict_add_redundant); + } if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, get_context_stats_.num_cache_compression_dict_bytes_insert); @@ -182,7 +223,7 @@ assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); - if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) { + if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) { *matched = true; // If the value is not in the snapshot, skip it if (!CheckCallback(parsed_key.sequence)) { @@ -211,9 +252,12 @@ assert(state_ == kNotFound || state_ == kMerge); if (type == kTypeBlobIndex && is_blob_index_ == nullptr) { // Blob value not supported. Stop. 
- state_ = kBlobIndex; + state_ = kUnexpectedBlobIndex; return false; } + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } if (kNotFound == state_) { state_ = kFound; if (do_merge_) { @@ -224,7 +268,6 @@ } else { TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); - // Otherwise copy the value pinnable_val_->PinSelf(value); } @@ -233,35 +276,57 @@ // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of // merge_context_->operand_list - push_operand(value, value_pinner); + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + push_operand(blob_value, nullptr); + } else { + push_operand(value, value_pinner); + } } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); - state_ = kFound; - if (do_merge_) { - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + state_ = kFound; + if (do_merge_) { + Merge(&blob_value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(blob_value, nullptr); } } else { - // It means this function is called as part of DB GetMergeOperands - // API and the current value should be part of - // merge_context_->operand_list - push_operand(value, value_pinner); + state_ = kFound; + if (do_merge_) { + Merge(&value); + } else { + // It means this function is called as 
part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); + } } } - if (is_blob_index_ != nullptr) { - *is_blob_index_ = (type == kTypeBlobIndex); + if (state_ == kFound) { + size_t ts_sz = ucmp_->timestamp_size(); + if (ts_sz > 0 && timestamp_ != nullptr) { + Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); + timestamp_->assign(ts.data(), ts.size()); + } } return false; case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: // TODO(noetzli): Verify correctness once merge of single-deletes @@ -271,20 +336,9 @@ state_ = kDeleted; } else if (kMerge == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - // If do_merge_ = false then the current value shouldn't be part of - // merge_context_->operand_list - } + Merge(nullptr); + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } return false; @@ -297,20 +351,7 @@ merge_operator_->ShouldMerge( merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - // do_merge_ = true this is the case where this function is called - // as part of DB Get API hence merge operators should be merged. 
- if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - } + Merge(nullptr); return false; } return true; @@ -325,6 +366,39 @@ return false; } +void GetContext::Merge(const Slice* value) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, value, merge_context_->GetOperands(), + pinnable_val_->GetSelf(), logger_, statistics_, clock_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + } +} + +bool GetContext::GetBlobValue(const Slice& blob_index, + PinnableSlice* blob_value) { + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr uint64_t* bytes_read = nullptr; + + Status status = blob_fetcher_->FetchBlob( + user_key_, blob_index, prefetch_buffer, blob_value, bytes_read); + if (!status.ok()) { + if (status.IsIncomplete()) { + MarkKeyMayExist(); + return false; + } + state_ = kCorrupt; + return false; + } + *is_blob_index_ = false; + return true; +} + void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && value_pinner != nullptr) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,17 +5,20 @@ #pragma once #include -#include "db/dbformat.h" -#include "db/merge_context.h" + #include "db/read_callback.h" -#include "rocksdb/env.h" -#include "rocksdb/statistics.h" #include "rocksdb/types.h" -#include "table/block_based/block.h" 
namespace ROCKSDB_NAMESPACE { +class BlobFetcher; +class Comparator; +class Logger; class MergeContext; +class MergeOperator; class PinnedIteratorsManager; +class Statistics; +class SystemClock; +struct ParsedInternalKey; // Data structure for accumulating statistics during a point lookup. At the // end of the point lookup, the corresponding ticker stats are updated. This @@ -33,15 +36,25 @@ uint64_t num_cache_bytes_read = 0; uint64_t num_cache_miss = 0; uint64_t num_cache_add = 0; + uint64_t num_cache_add_redundant = 0; uint64_t num_cache_bytes_write = 0; uint64_t num_cache_index_add = 0; + uint64_t num_cache_index_add_redundant = 0; uint64_t num_cache_index_bytes_insert = 0; uint64_t num_cache_data_add = 0; + uint64_t num_cache_data_add_redundant = 0; uint64_t num_cache_data_bytes_insert = 0; uint64_t num_cache_filter_add = 0; + uint64_t num_cache_filter_add_redundant = 0; uint64_t num_cache_filter_bytes_insert = 0; uint64_t num_cache_compression_dict_add = 0; + uint64_t num_cache_compression_dict_add_redundant = 0; uint64_t num_cache_compression_dict_bytes_insert = 0; + // MultiGet stats. 
+ uint64_t num_filter_read = 0; + uint64_t num_index_read = 0; + uint64_t num_data_read = 0; + uint64_t num_sst_read = 0; }; // A class to hold context about a point lookup, such as pointer to value @@ -61,7 +74,7 @@ kDeleted, kCorrupt, kMerge, // saver contains the current merge result (the operands) - kBlobIndex, + kUnexpectedBlobIndex, }; GetContextStats get_context_stats_; @@ -89,11 +102,21 @@ Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, MergeContext* merge_context, bool do_merge, - SequenceNumber* max_covering_tombstone_seq, Env* env, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, + SequenceNumber* seq = nullptr, + PinnedIteratorsManager* _pinned_iters_mgr = nullptr, + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); + GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* value, + std::string* timestamp, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, - uint64_t tracing_get_id = 0); + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); GetContext() = delete; @@ -150,6 +173,9 @@ void push_operand(const Slice& value, Cleanable* value_pinner); private: + void Merge(const Slice* value); + bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value); + const Comparator* ucmp_; const MergeOperator* merge_operator_; // the merge operations encountered; @@ -159,10 +185,11 @@ GetState state_; Slice user_key_; PinnableSlice* pinnable_val_; + std::string* timestamp_; bool* value_found_; // Is value set correctly? 
Used by KeyMayExist MergeContext* merge_context_; SequenceNumber* max_covering_tombstone_seq_; - Env* env_; + SystemClock* clock_; // If a key is found, seq_ will be set to the SequenceNumber of most recent // write to the key or kMaxSequenceNumber if unknown SequenceNumber* seq_; @@ -179,6 +206,7 @@ // Used for block cache tracing only. A tracing get id uniquely identifies a // Get or a MultiGet. const uint64_t tracing_get_id_; + BlobFetcher* blob_fetcher_; }; // Call this to replay a log and bring the get_context up to date. The replay diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/internal_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/internal_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/internal_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/internal_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,9 @@ #pragma once #include + #include "db/dbformat.h" +#include "file/readahead_file_info.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" @@ -17,9 +19,17 @@ class PinnedIteratorsManager; +enum class IterBoundCheck : char { + kUnknown = 0, + kOutOfBound, + kInbound, +}; + struct IterateResult { Slice key; - bool may_be_out_of_upper_bound; + IterBoundCheck bound_check_result = IterBoundCheck::kUnknown; + // If false, PrepareValue() needs to be called before value(). + bool value_prepared = true; }; template @@ -52,6 +62,7 @@ // All Seek*() methods clear any error status() that the iterator had prior to // the call; after the seek, status() indicates only the error (if any) that // happened during the seek, not any past errors. + // 'target' contains user timestamp if timestamp is enabled. virtual void Seek(const Slice& target) = 0; // Position at the first key in the source that at or before target @@ -66,7 +77,7 @@ // Moves to the next entry in the source, and return result. 
Iterator // implementation should override this method to help methods inline better, - // or when MayBeOutOfUpperBound() is non-trivial. + // or when UpperBoundCheckResult() is non-trivial. // REQUIRES: Valid() virtual bool NextAndGetResult(IterateResult* result) { Next(); @@ -74,10 +85,11 @@ if (is_valid) { result->key = key(); // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual - // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // call. If an implementation has non-trivial UpperBoundCheckResult(), // it should also override NextAndGetResult(). - result->may_be_out_of_upper_bound = true; - assert(MayBeOutOfUpperBound()); + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = false; + assert(UpperBoundCheckResult() != IterBoundCheck::kOutOfBound); } return is_valid; } @@ -101,6 +113,7 @@ // the returned slice is valid only until the next modification of // the iterator. // REQUIRES: Valid() + // REQUIRES: PrepareValue() has been called if needed (see PrepareValue()). virtual TValue value() const = 0; // If an error has occurred, return it. Else return an ok status. @@ -108,21 +121,32 @@ // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // True if the iterator is invalidated because it reached a key that is above - // the iterator upper bound. Used by LevelIterator to decide whether it should - // stop or move on to the next file. - // Important: if iterator reached the end of the file without encountering any - // keys above the upper bound, IsOutOfBound() must return false. - virtual bool IsOutOfBound() { return false; } + // For some types of iterators, sometimes Seek()/Next()/SeekForPrev()/etc may + // load key but not value (to avoid the IO cost of reading the value from disk + // if it won't be not needed). This method loads the value in such situation. 
+ // + // Needs to be called before value() at least once after each iterator + // movement (except if IterateResult::value_prepared = true), for iterators + // created with allow_unprepared_value = true. + // + // Returns false if an error occurred; in this case Valid() is also changed + // to false, and status() is changed to non-ok. + // REQUIRES: Valid() + virtual bool PrepareValue() { return true; } // Keys return from this iterator can be smaller than iterate_lower_bound. virtual bool MayBeOutOfLowerBound() { return true; } - // Keys return from this iterator can be larger or equal to - // iterate_upper_bound. - virtual bool MayBeOutOfUpperBound() { return true; } + // If the iterator has checked the key against iterate_upper_bound, returns + // the result here. The function can be used by user of the iterator to skip + // their own checks. If Valid() = true, IterBoundCheck::kUnknown is always + // a valid value. If Valid() = false, IterBoundCheck::kOutOfBound indicates + // that the iterator is filtered out by upper bound checks. + virtual IterBoundCheck UpperBoundCheckResult() { + return IterBoundCheck::kUnknown; + } - // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont + // Pass the PinnedIteratorsManager to the Iterator, most Iterators don't // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager // they will implement this function and use the passed pointer to communicate @@ -143,12 +167,25 @@ // If true, this means that the Slice returned by value() is valid as long as // PinnedIteratorsManager::ReleasePinnedData is not called and the // Iterator is not deleted. + // REQUIRES: Same as for value(). 
virtual bool IsValuePinned() const { return false; } virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { return Status::NotSupported(""); } + // When iterator moves from one file to another file at same level, new file's + // readahead state (details of last block read) is updated with previous + // file's readahead state. This way internal readahead_size of Prefetch Buffer + // doesn't start from scratch and can fall back to 8KB with no prefetch if + // reads are not sequential. + // + // Default implementation is no-op and its implemented by iterators. + virtual void GetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {} + + // Default implementation is no-op and its implemented by iterators. + virtual void SetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {} + protected: void SeekForPrevImpl(const Slice& target, const Comparator* cmp) { Seek(target); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/iterator_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/iterator_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/iterator_wrapper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/iterator_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -70,11 +70,32 @@ assert(iter_); return iter_->status(); } + bool PrepareValue() { + assert(Valid()); + if (result_.value_prepared) { + return true; + } + if (iter_->PrepareValue()) { + result_.value_prepared = true; + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } void Next() { assert(iter_); valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } + bool NextAndGetResult(IterateResult* result) { + assert(iter_); + valid_ = iter_->NextAndGetResult(&result_); + *result = result_; + assert(!valid_ || iter_->status().ok()); + return valid_; + } void Prev() { assert(iter_); iter_->Prev(); @@ -106,9 +127,9 @@ return iter_->MayBeOutOfLowerBound(); } - bool 
MayBeOutOfUpperBound() { + IterBoundCheck UpperBoundCheckResult() { assert(Valid()); - return result_.may_be_out_of_upper_bound; + return result_.bound_check_result; } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { @@ -124,13 +145,31 @@ return iter_->IsValuePinned(); } + bool IsValuePrepared() const { + return result_.value_prepared; + } + + Slice user_key() const { + assert(Valid()); + return iter_->user_key(); + } + + void UpdateReadaheadState(InternalIteratorBase* old_iter) { + if (old_iter && iter_) { + ReadaheadFileInfo readahead_file_info; + old_iter->GetReadaheadState(&readahead_file_info); + iter_->SetReadaheadState(&readahead_file_info); + } + } + private: void Update() { valid_ = iter_->Valid(); if (valid_) { assert(iter_->status().ok()); result_.key = iter_->key(); - result_.may_be_out_of_upper_bound = true; + result_.bound_check_result = IterBoundCheck::kUnknown; + result_.value_prepared = false; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/merger_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/merger_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/merger_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/merger_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,12 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include #include +#include #include "table/merging_iterator.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" +#include "util/vector_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -24,7 +26,7 @@ std::vector ret; for (size_t i = 0; i < len; ++i) { - InternalKey ik(test::RandomHumanReadableString(&rnd_, string_len), 0, + InternalKey ik(rnd_.HumanReadableString(string_len), 0, ValueType::kTypeValue); ret.push_back(ik.Encode().ToString(false)); } @@ -44,8 +46,7 @@ } void SeekToRandom() { - InternalKey ik(test::RandomHumanReadableString(&rnd_, 5), 0, - ValueType::kTypeValue); + InternalKey ik(rnd_.HumanReadableString(5), 0, ValueType::kTypeValue); Seek(ik.Encode().ToString(false)); } @@ -101,14 +102,14 @@ std::vector small_iterators; for (size_t i = 0; i < num_iterators; ++i) { auto strings = GenerateStrings(strings_per_iterator, letters_per_string); - small_iterators.push_back(new test::VectorIterator(strings)); + small_iterators.push_back(new VectorIterator(strings, strings, &icomp_)); all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); } merging_iterator_.reset( NewMergingIterator(&icomp_, &small_iterators[0], static_cast(small_iterators.size()))); - single_iterator_.reset(new test::VectorIterator(all_keys_)); + single_iterator_.reset(new VectorIterator(all_keys_, all_keys_, &icomp_)); } InternalKeyComparator icomp_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -28,8 +28,8 @@ namespace ROCKSDB_NAMESPACE { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { -typedef BinaryHeap MergerMaxIterHeap; -typedef BinaryHeap MergerMinIterHeap; +using MergerMaxIterHeap = 
BinaryHeap; +using MergerMinIterHeap = BinaryHeap; } // namespace const size_t kNumIterReserve = 4; @@ -40,20 +40,16 @@ InternalIterator** children, int n, bool is_arena_mode, bool prefix_seek_mode) : is_arena_mode_(is_arena_mode), + prefix_seek_mode_(prefix_seek_mode), + direction_(kForward), comparator_(comparator), current_(nullptr), - direction_(kForward), minHeap_(comparator_), - prefix_seek_mode_(prefix_seek_mode), pinned_iters_mgr_(nullptr) { children_.resize(n); for (int i = 0; i < n; i++) { children_[i].Set(children[i]); } - for (auto& child : children_) { - AddToMinHeapOrCheckStatus(&child); - } - current_ = CurrentForward(); } void considerStatus(Status s) { @@ -63,22 +59,20 @@ } virtual void AddIterator(InternalIterator* iter) { - assert(direction_ == kForward); children_.emplace_back(iter); if (pinned_iters_mgr_) { iter->SetPinnedItersMgr(pinned_iters_mgr_); } - auto new_wrapper = children_.back(); - AddToMinHeapOrCheckStatus(&new_wrapper); - if (new_wrapper.Valid()) { - current_ = CurrentForward(); - } + // Invalidate to ensure `Seek*()` is called to construct the heaps before + // use. + current_ = nullptr; } ~MergingIterator() override { for (auto& child : children_) { child.DeleteIter(is_arena_mode_); } + status_.PermitUncheckedError(); } bool Valid() const override { return current_ != nullptr && status_.ok(); } @@ -194,7 +188,8 @@ bool is_valid = Valid(); if (is_valid) { result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = current_->IsValuePrepared(); } return is_valid; } @@ -240,6 +235,17 @@ return current_->value(); } + bool PrepareValue() override { + assert(Valid()); + if (current_->PrepareValue()) { + return true; + } + + considerStatus(current_->status()); + assert(!status_.ok()); + return false; + } + // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result // from current child iterator. 
Potentially as long as one of child iterator // report out of bound is not possible, we know current key is within bound. @@ -249,9 +255,9 @@ return current_->MayBeOutOfLowerBound(); } - bool MayBeOutOfUpperBound() override { + IterBoundCheck UpperBoundCheckResult() override { assert(Valid()); - return current_->MayBeOutOfUpperBound(); + return current_->UpperBoundCheckResult(); } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { @@ -281,6 +287,10 @@ void InitMaxHeap(); bool is_arena_mode_; + bool prefix_seek_mode_; + // Which direction is the iterator moving? + enum Direction : uint8_t { kForward, kReverse }; + Direction direction_; const InternalKeyComparator* comparator_; autovector children_; @@ -290,14 +300,7 @@ IteratorWrapper* current_; // If any of the children have non-ok status, this is one of them. Status status_; - // Which direction is the iterator moving? - enum Direction { - kForward, - kReverse - }; - Direction direction_; MergerMinIterHeap minHeap_; - bool prefix_seek_mode_; // Max heap is used for reverse iteration, which is way less common than // forward. Lazily initialize it to save memory. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,14 +9,14 @@ #pragma once -#include "db/dbformat.h" +#include "rocksdb/slice.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -class Comparator; -class Env; class Arena; +class InternalKeyComparator; + template class InternalIteratorBase; using InternalIterator = InternalIteratorBase; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,18 +10,28 @@ #include "block_fetcher.h" #include "db/table_properties_collector.h" #include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "rocksdb/options.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block_based/block.h" +#include "table/block_based/reader_common.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" #include "table/table_properties_internal.h" #include "test_util/sync_point.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { +const std::string kPropertiesBlockName = "rocksdb.properties"; +// Old property block name for backward compatibility +const std::string kPropertiesBlockOldName = "rocksdb.stats"; +const std::string kCompressionDictBlockName = "rocksdb.compression_dict"; +const std::string kRangeDelBlockName = "rocksdb.range_del"; + MetaIndexBuilder::MetaIndexBuilder() : 
meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} @@ -71,6 +81,7 @@ TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start", const_cast(&props)); + Add(TablePropertiesNames::kOriginalFileNumber, props.orig_file_number); Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); Add(TablePropertiesNames::kDataSize, props.data_size); @@ -83,6 +94,7 @@ Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries); Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); @@ -96,6 +108,23 @@ if (props.file_creation_time > 0) { Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time); } + if (props.slow_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize, + props.slow_compression_estimated_data_size); + } + if (props.fast_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, + props.fast_compression_estimated_data_size); + } + if (!props.db_id.empty()) { + Add(TablePropertiesNames::kDbId, props.db_id); + } + if (!props.db_session_id.empty()) { + Add(TablePropertiesNames::kDbSessionId, props.db_session_id); + } + if (!props.db_host_id.empty()) { + Add(TablePropertiesNames::kDbHostId, props.db_host_id); + } if (!props.filter_policy_name.empty()) { Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); @@ -135,8 +164,8 @@ return properties_block_->Finish(); } -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name) { +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const 
std::string& name) { assert(method == "Add" || method == "Finish"); std::string msg = @@ -163,11 +192,11 @@ void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - const uint64_t blockRawBytes, const uint64_t blockCompressedBytesFast, - const uint64_t blockCompressedBytesSlow) { + const uint64_t block_raw_bytes, const uint64_t block_compressed_bytes_fast, + const uint64_t block_compressed_bytes_slow) { for (auto& collector : collectors) { - collector->BlockAdd(blockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + collector->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } } @@ -191,50 +220,48 @@ return all_succeeded; } -Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, - TableProperties** table_properties, bool verify_checksum, - BlockHandle* ret_block_handle, - CacheAllocationPtr* verification_buf, - bool /*compression_type_missing*/, - MemoryAllocator* memory_allocator) { +// FIXME: should be a parameter for reading table properties to use persistent +// cache? +Status ReadTablePropertiesHelper( + const ReadOptions& ro, const BlockHandle& handle, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ImmutableOptions& ioptions, + std::unique_ptr* table_properties, + MemoryAllocator* memory_allocator) { assert(table_properties); - Slice v = handle_value; - BlockHandle handle; - if (!handle.DecodeFrom(&v).ok()) { - return Status::InvalidArgument("Failed to decode properties block handle"); - } - + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. 
For this reason, we initially read + // and process without checksum verification, then later try checksum + // verification so that if it fails, we can copy to a temporary buffer with + // global seqno set to its original value, i.e. 0, and attempt checksum + // verification again. + ReadOptions modified_ro = ro; + modified_ro.verify_checksums = false; BlockContents block_contents; - ReadOptions read_options; - read_options.verify_checksums = verify_checksum; - Status s; - PersistentCacheOptions cache_options; - - BlockFetcher block_fetcher( - file, prefetch_buffer, footer, read_options, handle, &block_contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, - BlockType::kProperties, UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); - s = block_fetcher.ReadBlockContents(); - // property block is never compressed. Need to add uncompress logic if we are - // to compress it.. - + BlockFetcher block_fetcher(file, prefetch_buffer, footer, modified_ro, handle, + &block_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kProperties, + UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator); + Status s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } - Block properties_block(std::move(block_contents), - kDisableGlobalSequenceNumber); - DataBlockIter iter; - properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), - &iter); + // Unfortunately, Block::size() might not equal block_contents.data.size(), + // and Block hides block_contents + uint64_t block_size = block_contents.data.size(); + Block properties_block(std::move(block_contents)); + std::unique_ptr iter(properties_block.NewMetaIterator()); - auto new_table_properties = new TableProperties(); + std::unique_ptr new_table_properties{new TableProperties}; // All pre-defined properties of type uint64_t std::unordered_map predefined_uint64_properties = { + 
{TablePropertiesNames::kOriginalFileNumber, + &new_table_properties->orig_file_number}, {TablePropertiesNames::kDataSize, &new_table_properties->data_size}, {TablePropertiesNames::kIndexSize, &new_table_properties->index_size}, {TablePropertiesNames::kIndexPartitions, @@ -252,6 +279,8 @@ {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumFilterEntries, + &new_table_properties->num_filter_entries}, {TablePropertiesNames::kDeletedKeys, &new_table_properties->num_deletions}, {TablePropertiesNames::kMergeOperands, @@ -270,16 +299,20 @@ &new_table_properties->oldest_key_time}, {TablePropertiesNames::kFileCreationTime, &new_table_properties->file_creation_time}, + {TablePropertiesNames::kSlowCompressionEstimatedDataSize, + &new_table_properties->slow_compression_estimated_data_size}, + {TablePropertiesNames::kFastCompressionEstimatedDataSize, + &new_table_properties->fast_compression_estimated_data_size}, }; std::string last_key; - for (iter.SeekToFirstOrReport(); iter.Valid(); iter.NextOrReport()) { - s = iter.status(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); if (!s.ok()) { break; } - auto key = iter.key().ToString(); + auto key = iter->key().ToString(); // properties block should be strictly sorted with no duplicate key. 
if (!last_key.empty() && BytewiseComparator()->Compare(key, last_key) <= 0) { @@ -288,11 +321,13 @@ } last_key = key; - auto raw_val = iter.value(); + auto raw_val = iter->value(); auto pos = predefined_uint64_properties.find(key); - new_table_properties->properties_offsets.insert( - {key, handle.offset() + iter.ValueOffset()}); + if (key == ExternalSstFilePropertyNames::kGlobalSeqno) { + new_table_properties->external_sst_file_global_seqno_offset = + handle.offset() + iter->ValueOffset(); + } if (pos != predefined_uint64_properties.end()) { if (key == TablePropertiesNames::kDeletedKeys || @@ -308,10 +343,16 @@ auto error_msg = "Detect malformed value in properties meta-block:" "\tkey: " + key + "\tval: " + raw_val.ToString(); - ROCKS_LOG_ERROR(ioptions.info_log, "%s", error_msg.c_str()); + ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); continue; } *(pos->second) = val; + } else if (key == TablePropertiesNames::kDbId) { + new_table_properties->db_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbSessionId) { + new_table_properties->db_session_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbHostId) { + new_table_properties->db_host_id = raw_val.ToString(); } else if (key == TablePropertiesNames::kFilterPolicy) { new_table_properties->filter_policy_name = raw_val.ToString(); } else if (key == TablePropertiesNames::kColumnFamilyName) { @@ -334,21 +375,28 @@ {key, raw_val.ToString()}); } } - if (s.ok()) { - *table_properties = new_table_properties; - if (ret_block_handle != nullptr) { - *ret_block_handle = handle; - } - if (verification_buf != nullptr) { - size_t len = static_cast(handle.size() + kBlockTrailerSize); - *verification_buf = - ROCKSDB_NAMESPACE::AllocateBlock(len, memory_allocator); - if (verification_buf->get() != nullptr) { - memcpy(verification_buf->get(), block_contents.data.data(), len); + + // Modified version of BlockFetcher checksum verification + // (See write_global_seqno comment above) + if 
(s.ok() && footer.GetBlockTrailerSize() > 0) { + s = VerifyBlockChecksum(footer.checksum_type(), properties_block.data(), + block_size, file->file_name(), handle.offset()); + if (s.IsCorruption()) { + if (new_table_properties->external_sst_file_global_seqno_offset != 0) { + std::string tmp_buf(properties_block.data(), + block_fetcher.GetBlockSizeWithTrailer()); + uint64_t global_seqno_offset = + new_table_properties->external_sst_file_global_seqno_offset - + handle.offset(); + EncodeFixed64(&tmp_buf[static_cast(global_seqno_offset)], 0); + s = VerifyBlockChecksum(footer.checksum_type(), tmp_buf.data(), + block_size, file->file_name(), handle.offset()); } } - } else { - delete new_table_properties; + } + + if (s.ok()) { + *table_properties = std::move(new_table_properties); } return s; @@ -356,111 +404,101 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - TableProperties** properties, - bool compression_type_missing, - MemoryAllocator* memory_allocator) { - // -- Read metaindex block + const ImmutableOptions& ioptions, + std::unique_ptr* properties, + MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer) { + BlockHandle block_handle; Footer footer; - auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, - &footer, table_magic_number); + Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + kPropertiesBlockName, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!s.ok()) { return s; } - auto metaindex_handle = footer.metaindex_handle(); - BlockContents metaindex_contents; - ReadOptions read_options; - read_options.verify_checksums = false; - PersistentCacheOptions cache_options; - - BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, 
BlockType::kMetaIndex, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - return s; - } - // property blocks are never compressed. Need to add uncompress logic if we - // are to compress it. - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator())); - - // -- Read property block - bool found_properties_block = true; - s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block); - if (!s.ok()) { - return s; - } - - TableProperties table_properties; - if (found_properties_block == true) { - s = ReadProperties( - meta_iter->value(), file, nullptr /* prefetch_buffer */, footer, - ioptions, properties, false /* verify_checksum */, - nullptr /* ret_block_hanel */, nullptr /* ret_block_contents */, - compression_type_missing, memory_allocator); + if (!block_handle.IsNull()) { + s = ReadTablePropertiesHelper(ReadOptions(), block_handle, file, + prefetch_buffer, footer, ioptions, properties, + memory_allocator); } else { s = Status::NotFound(); } - return s; } +Status FindOptionalMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle) { + assert(block_handle != nullptr); + meta_index_iter->Seek(meta_block_name); + if (meta_index_iter->status().ok()) { + if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) { + Slice v = meta_index_iter->value(); + return block_handle->DecodeFrom(&v); + } else if (meta_block_name == kPropertiesBlockName) { + // Have to try old name for compatibility + meta_index_iter->Seek(kPropertiesBlockOldName); + if (meta_index_iter->status().ok() && meta_index_iter->Valid() && + meta_index_iter->key() == kPropertiesBlockOldName) { + Slice v = meta_index_iter->value(); + return block_handle->DecodeFrom(&v); + } + } + } + // else + *block_handle = 
BlockHandle::NullBlockHandle(); + return meta_index_iter->status(); +} + Status FindMetaBlock(InternalIterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle) { - meta_index_iter->Seek(meta_block_name); - if (meta_index_iter->status().ok() && meta_index_iter->Valid() && - meta_index_iter->key() == meta_block_name) { - Slice v = meta_index_iter->value(); - return block_handle->DecodeFrom(&v); - } else { + Status s = + FindOptionalMetaBlock(meta_index_iter, meta_block_name, block_handle); + if (s.ok() && block_handle->IsNull()) { return Status::Corruption("Cannot find the meta block", meta_block_name); + } else { + return s; } } -Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, - BlockHandle* block_handle, - bool /*compression_type_missing*/, - MemoryAllocator* memory_allocator) { +Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer, + Footer* footer_out) { Footer footer; - auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, - &footer, table_magic_number); + IOOptions opts; + auto s = ReadFooterFromFile(opts, file, prefetch_buffer, file_size, &footer, + table_magic_number); if (!s.ok()) { return s; } + if (footer_out) { + *footer_out = footer; + } auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; - ReadOptions read_options; - read_options.verify_checksums = false; - PersistentCacheOptions cache_options; - BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, - false /* do decompression */, false /*maybe_compressed*/, - 
BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); - s = block_fetcher.ReadBlockContents(); + s = BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, + false /* do decompression */, false /*maybe_compressed*/, + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator) + .ReadBlockContents(); if (!s.ok()) { return s; } // meta blocks are never compressed. Need to add uncompress logic if we are to // compress it. - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), - BytewiseComparator())); + meta_iter.reset(metaindex_block.NewMetaIterator()); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -468,58 +506,29 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, - BlockContents* contents, bool /*compression_type_missing*/, + BlockContents* contents, MemoryAllocator* memory_allocator) { - Status status; - Footer footer; - status = ReadFooterFromFile(file, prefetch_buffer, file_size, &footer, - table_magic_number); - if (!status.ok()) { - return status; - } - - // Reading metaindex block - auto metaindex_handle = footer.metaindex_handle(); - BlockContents metaindex_contents; - ReadOptions read_options; - read_options.verify_checksums = false; - PersistentCacheOptions cache_options; - - BlockFetcher block_fetcher( - file, prefetch_buffer, footer, read_options, metaindex_handle, - &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, BlockType::kMetaIndex, - 
UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - status = block_fetcher.ReadBlockContents(); - if (!status.ok()) { - return status; - } - // meta block is never compressed. Need to add uncompress logic if we are to - // compress it. - - // Finding metablock - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); - - std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), - BytewiseComparator())); + // TableProperties requires special handling because of checksum issues. + // Call ReadTableProperties instead for that case. + assert(block_type != BlockType::kProperties); BlockHandle block_handle; - status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); - + Footer footer; + Status status = FindMetaBlockInFile( + file, file_size, table_magic_number, ioptions, meta_block_name, + &block_handle, memory_allocator, prefetch_buffer, &footer); if (!status.ok()) { return status; } - // Reading metablock - BlockFetcher block_fetcher2( - file, prefetch_buffer, footer, read_options, block_handle, contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - return block_fetcher2.ReadBlockContents(); + return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + block_handle, contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator) + .ReadBlockContents(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,6 +30,12 @@ class 
RandomAccessFile; struct TableProperties; +// Meta block names for metaindex +extern const std::string kPropertiesBlockName; +extern const std::string kPropertiesBlockOldName; +extern const std::string kCompressionDictBlockName; +extern const std::string kRangeDelBlockName; + class MetaIndexBuilder { public: MetaIndexBuilder(const MetaIndexBuilder&) = delete; @@ -70,8 +76,8 @@ // Were we encounter any error occurs during user-defined statistics collection, // we'll write the warning message to info log. -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name); +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name); // Utility functions help table builder to trigger batch events for user // defined property collectors. @@ -86,8 +92,8 @@ void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow); // NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors. The collected properties will be added to `builder`. @@ -95,47 +101,49 @@ const std::vector>& collectors, Logger* info_log, PropertyBlockBuilder* builder); -// Read the properties from the table. +// Read table properties from a file using known BlockHandle. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. 
-Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, - TableProperties** table_properties, bool verify_checksum, - BlockHandle* block_handle, - CacheAllocationPtr* verification_buf, - bool compression_type_missing = false, - MemoryAllocator* memory_allocator = nullptr); +Status ReadTablePropertiesHelper( + const ReadOptions& ro, const BlockHandle& handle, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ImmutableOptions& ioptions, + std::unique_ptr* table_properties, + MemoryAllocator* memory_allocator = nullptr); -// Directly read the properties from the properties block of a plain table. +// Read table properties from the properties block of a plain table. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. -// certain tables do not have compression_type byte setup properly for -// uncompressed blocks, caller can request to reset compression type by -// passing compression_type_missing = true, the same applies to -// `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock` Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - TableProperties** properties, - bool compression_type_missing = false, - MemoryAllocator* memory_allocator = nullptr); + const ImmutableOptions& ioptions, + std::unique_ptr* properties, + MemoryAllocator* memory_allocator = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr); + +// Find the meta block from the meta index block. Returns OK and +// block_handle->IsNull() if not found. 
+Status FindOptionalMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle); -// Find the meta block from the meta index block. +// Find the meta block from the meta index block. Returns Corruption if not +// found. Status FindMetaBlock(InternalIterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle); // Find the meta block -Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, - BlockHandle* block_handle, - bool compression_type_missing = false, - MemoryAllocator* memory_allocator = nullptr); +Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + MemoryAllocator* memory_allocator = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr, + Footer* footer_out = nullptr); // Read the specified meta block with name meta_block_name // from `file` and initialize `contents` with contents of this block. 
@@ -143,10 +151,9 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, - bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,21 +16,187 @@ namespace ROCKSDB_NAMESPACE { namespace mock { -namespace { +KVVector MakeMockFile(std::initializer_list l) { return KVVector(l); } -const InternalKeyComparator icmp_(BytewiseComparator()); +void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) { + InternalKeyComparator icmp(ucmp); + std::sort(kv_vector->begin(), kv_vector->end(), + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); +} -} // namespace +class MockTableReader : public TableReader { + public: + explicit MockTableReader(const KVVector& table) : table_(table) {} + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } -stl_wrappers::KVMap MakeMockFile( - std::initializer_list> l) { - return stl_wrappers::KVMap(l, 
stl_wrappers::LessOfComparator(&icmp_)); -} + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const KVVector& table_; +}; + +class MockTableIterator : public InternalIterator { + public: + explicit MockTableIterator(const KVVector& table) : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const override { return itr_ != table_.end(); } + + void SeekToFirst() override { itr_ = table_.begin(); } + + void SeekToLast() override { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) override { + KVPair target_pair(target.ToString(), ""); + InternalKeyComparator icmp(BytewiseComparator()); + itr_ = std::lower_bound(table_.begin(), table_.end(), target_pair, + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); + } + + void SeekForPrev(const Slice& target) override { + KVPair target_pair(target.ToString(), ""); + InternalKeyComparator icmp(BytewiseComparator()); + itr_ = std::upper_bound(table_.begin(), table_.end(), target_pair, + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); + Prev(); + } + + void Next() override { ++itr_; } + + void Prev() override { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const override { return Slice(itr_->first); } + + Slice value() const override { return Slice(itr_->second); } + + Status status() const override { return Status::OK(); } + + private: + const KVVector& table_; + KVVector::const_iterator itr_; +}; + +class MockTableBuilder : public TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system, + MockTableFactory::MockCorruptionMode corrupt_mode = + 
MockTableFactory::kCorruptNone) + : id_(id), file_system_(file_system), corrupt_mode_(corrupt_mode) { + table_ = MakeMockFile({}); + } + + // REQUIRES: Either Finish() or Abandon() has been called. + ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + if (corrupt_mode_ == MockTableFactory::kCorruptValue) { + // Corrupt the value + table_.push_back({key.ToString(), value.ToString() + " "}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptKey) { + table_.push_back({key.ToString() + " ", value.ToString()}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptReorderKey) { + if (prev_key_.empty()) { + prev_key_ = key.ToString(); + prev_value_ = value.ToString(); + } else { + table_.push_back({key.ToString(), value.ToString()}); + table_.push_back({prev_key_, prev_value_}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } + } else { + table_.push_back({key.ToString(), value.ToString()}); + } + } + + // Return non-ok iff some error has been detected. + Status status() const override { return Status::OK(); } + + // Return non-ok iff some error happens during IO. 
+ IOStatus io_status() const override { return IOStatus::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size(); } + + TableProperties GetTableProperties() const override { + return TableProperties(); + } + + // Get file checksum + std::string GetFileChecksum() const override { return kUnknownFileChecksum; } + // Get file checksum function name + const char* GetFileChecksumFuncName() const override { + return kUnknownFileChecksumFuncName; + } + + private: + uint32_t id_; + std::string prev_key_; + std::string prev_value_; + MockTableFileSystem* file_system_; + int corrupt_mode_; + KVVector table_; +}; InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, - size_t /*compaction_readahead_size*/) { + size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { return new MockTableIterator(table_); } @@ -41,8 +207,10 @@ std::unique_ptr iter(new MockTableIterator(table_)); for (iter->Seek(key); iter->Valid(); iter->Next()) { ParsedInternalKey parsed_key; - if (!ParseInternalKey(iter->key(), &parsed_key)) { - return Status::Corruption(Slice()); + Status pik_status = + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + return pik_status; } bool dont_care __attribute__((__unused__)); @@ -58,14 +226,20 @@ return std::shared_ptr(new TableProperties()); } -MockTableFactory::MockTableFactory() : next_id_(1) {} +MockTableFactory::MockTableFactory() + : next_id_(1), corrupt_mode_(MockTableFactory::kCorruptNone) {} Status MockTableFactory::NewTableReader( + const ReadOptions& /*ro*/, const TableReaderOptions& /*table_reader_options*/, 
std::unique_ptr&& file, uint64_t /*file_size*/, std::unique_ptr* table_reader, bool /*prefetch_index_and_filter_in_cache*/) const { - uint32_t id = GetIDFromFile(file.get()); + uint32_t id; + Status s = GetIDFromFile(file.get(), &id); + if (!s.ok()) { + return s; + } MutexLock lock_guard(&file_system_.mutex); @@ -81,52 +255,54 @@ TableBuilder* MockTableFactory::NewTableBuilder( const TableBuilderOptions& /*table_builder_options*/, - uint32_t /*column_family_id*/, WritableFileWriter* file) const { - uint32_t id = GetAndWriteNextID(file); + WritableFileWriter* file) const { + uint32_t id; + Status s = GetAndWriteNextID(file, &id); + assert(s.ok()); - return new MockTableBuilder(id, &file_system_); + return new MockTableBuilder(id, &file_system_, corrupt_mode_); } Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, - stl_wrappers::KVMap file_contents) { - std::unique_ptr file; - auto s = env->NewWritableFile(fname, &file, EnvOptions()); + KVVector file_contents) { + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create(env->GetFileSystem(), fname, + FileOptions(), &file_writer, nullptr); if (!s.ok()) { return s; } - - WritableFileWriter file_writer(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); - - uint32_t id = GetAndWriteNextID(&file_writer); - file_system_.files.insert({id, std::move(file_contents)}); - return Status::OK(); + uint32_t id; + s = GetAndWriteNextID(file_writer.get(), &id); + if (s.ok()) { + file_system_.files.insert({id, std::move(file_contents)}); + } + return s; } -uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const { - uint32_t next_id = next_id_.fetch_add(1); +Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file, + uint32_t* next_id) const { + *next_id = next_id_.fetch_add(1); char buf[4]; - EncodeFixed32(buf, next_id); - file->Append(Slice(buf, 4)); - return next_id; + EncodeFixed32(buf, *next_id); + return file->Append(Slice(buf, 4)); } 
-uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { +Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, + uint32_t* id) const { char buf[4]; Slice result; - file->Read(0, 4, &result, buf); + Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr); assert(result.size() == 4); - return DecodeFixed32(buf); + *id = DecodeFixed32(buf); + return s; } -void MockTableFactory::AssertSingleFile( - const stl_wrappers::KVMap& file_contents) { +void MockTableFactory::AssertSingleFile(const KVVector& file_contents) { ASSERT_EQ(file_system_.files.size(), 1U); ASSERT_EQ(file_contents, file_system_.files.begin()->second); } -void MockTableFactory::AssertLatestFile( - const stl_wrappers::KVMap& file_contents) { +void MockTableFactory::AssertLatestFile(const KVVector& file_contents) { ASSERT_GE(file_system_.files.size(), 1U); auto latest = file_system_.files.end(); --latest; @@ -137,8 +313,9 @@ ParsedInternalKey ikey; std::string key, value; std::tie(key, value) = kv; - ParseInternalKey(Slice(key), &ikey); - std::cout << ikey.DebugString(false) << " -> " << value << std::endl; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; } FAIL(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,6 +15,7 @@ #include "db/version_edit.h" #include "port/port.h" #include "rocksdb/comparator.h" +#include "rocksdb/io_status.h" #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/table_builder.h" @@ -26,188 +27,63 @@ namespace ROCKSDB_NAMESPACE { namespace mock { +using KVPair = std::pair; +using KVVector = std::vector; 
-stl_wrappers::KVMap MakeMockFile( - std::initializer_list> l = {}); +KVVector MakeMockFile(std::initializer_list l = {}); +void SortKVVector(KVVector* kv_vector, + const Comparator* ucmp = BytewiseComparator()); struct MockTableFileSystem { port::Mutex mutex; - std::map files; -}; - -class MockTableReader : public TableReader { - public: - explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} - - InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, - TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; - - Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, const SliceTransform* prefix_extractor, - bool skip_filters = false) override; - - uint64_t ApproximateOffsetOf(const Slice& /*key*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - size_t ApproximateMemoryUsage() const override { return 0; } - - void SetupForCompaction() override {} - - std::shared_ptr GetTableProperties() const override; - - ~MockTableReader() {} - - private: - const stl_wrappers::KVMap& table_; -}; - -class MockTableIterator : public InternalIterator { - public: - explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) { - itr_ = table_.end(); - } - - bool Valid() const override { return itr_ != table_.end(); } - - void SeekToFirst() override { itr_ = table_.begin(); } - - void SeekToLast() override { - itr_ = table_.end(); - --itr_; - } - - void Seek(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.lower_bound(str_target); - } - - void SeekForPrev(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.upper_bound(str_target); - Prev(); - } - - void Next() override { 
++itr_; } - - void Prev() override { - if (itr_ == table_.begin()) { - itr_ = table_.end(); - } else { - --itr_; - } - } - - Slice key() const override { return Slice(itr_->first); } - - Slice value() const override { return Slice(itr_->second); } - - Status status() const override { return Status::OK(); } - - private: - const stl_wrappers::KVMap& table_; - stl_wrappers::KVMap::const_iterator itr_; -}; - -class MockTableBuilder : public TableBuilder { - public: - MockTableBuilder(uint32_t id, MockTableFileSystem* file_system) - : id_(id), file_system_(file_system) { - table_ = MakeMockFile({}); - } - - // REQUIRES: Either Finish() or Abandon() has been called. - ~MockTableBuilder() {} - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override { - table_.insert({key.ToString(), value.ToString()}); - } - - // Return non-ok iff some error has been detected. 
- Status status() const override { return Status::OK(); } - - Status Finish() override { - MutexLock lock_guard(&file_system_->mutex); - file_system_->files.insert({id_, table_}); - return Status::OK(); - } - - void Abandon() override {} - - uint64_t NumEntries() const override { return table_.size(); } - - uint64_t FileSize() const override { return table_.size(); } - - TableProperties GetTableProperties() const override { - return TableProperties(); - } - - // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } - // Get file checksum function name - const char* GetFileChecksumFuncName() const override { - return kUnknownFileChecksumFuncName.c_str(); - } - - private: - uint32_t id_; - MockTableFileSystem* file_system_; - stl_wrappers::KVMap table_; - std::string file_checksum_ = kUnknownFileChecksum; + std::map files; }; class MockTableFactory : public TableFactory { public: + enum MockCorruptionMode { + kCorruptNone, + kCorruptKey, + kCorruptValue, + kCorruptReorderKey, + }; + MockTableFactory(); - const char* Name() const override { return "MockTable"; } + static const char* kClassName() { return "MockTable"; } + const char* Name() const override { return kClassName(); } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_familly_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; // This function will directly create mock table instead of going through // MockTableBuilder. file_contents has to have a format of . Those key-value pairs will then be inserted into the mock table. 
Status CreateMockTable(Env* env, const std::string& fname, - stl_wrappers::KVMap file_contents); - - virtual Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } + KVVector file_contents); - virtual std::string GetPrintableTableOptions() const override { + virtual std::string GetPrintableOptions() const override { return std::string(); } + void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; } // This function will assert that only a single file exists and that the // contents are equal to file_contents - void AssertSingleFile(const stl_wrappers::KVMap& file_contents); - void AssertLatestFile(const stl_wrappers::KVMap& file_contents); + void AssertSingleFile(const KVVector& file_contents); + void AssertLatestFile(const KVVector& file_contents); private: - uint32_t GetAndWriteNextID(WritableFileWriter* file) const; - uint32_t GetIDFromFile(RandomAccessFileReader* file) const; + Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const; + Status GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const; mutable MockTableFileSystem file_system_; mutable std::atomic next_id_; + MockCorruptionMode corrupt_mode_; }; } // namespace mock diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/multiget_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/multiget_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/multiget_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/multiget_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,12 +7,15 @@ #include #include #include + +#include "db/dbformat.h" #include "db/lookup_key.h" #include "db/merge_context.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" #include "util/autovector.h" +#include "util/math.h" namespace ROCKSDB_NAMESPACE { class GetContext; @@ -20,27 +23,32 @@ struct KeyContext { const Slice* 
key; LookupKey* lkey; - Slice ukey; + Slice ukey_with_ts; + Slice ukey_without_ts; Slice ikey; ColumnFamilyHandle* column_family; Status* s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq; bool key_exists; + bool is_blob_index; void* cb_arg; PinnableSlice* value; + std::string* timestamp; GetContext* get_context; KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key, - PinnableSlice* val, Status* stat) + PinnableSlice* val, std::string* ts, Status* stat) : key(&user_key), lkey(nullptr), column_family(col_family), s(stat), max_covering_tombstone_seq(0), key_exists(false), + is_blob_index(false), cb_arg(nullptr), value(val), + timestamp(ts), get_context(nullptr) {} KeyContext() = default; @@ -84,16 +92,21 @@ class MultiGetContext { public: // Limit the number of keys in a batch to this number. Benchmarks show that - // there is negligible benefit for batches exceeding this. Keeping this < 64 + // there is negligible benefit for batches exceeding this. Keeping this < 32 // simplifies iteration, as well as reduces the amount of stack allocations - // htat need to be performed + // that need to be performed static const int MAX_BATCH_SIZE = 32; + static_assert(MAX_BATCH_SIZE < 64, "MAX_BATCH_SIZE cannot exceed 63"); + MultiGetContext(autovector* sorted_keys, - size_t begin, size_t num_keys, SequenceNumber snapshot) + size_t begin, size_t num_keys, SequenceNumber snapshot, + const ReadOptions& read_opts) : num_keys_(num_keys), value_mask_(0), + value_size_(0), lookup_key_ptr_(reinterpret_cast(lookup_key_stack_buf)) { + assert(num_keys <= MAX_BATCH_SIZE); if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) { lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]); lookup_key_ptr_ = reinterpret_cast( @@ -104,8 +117,11 @@ // autovector may not be contiguous storage, so make a copy sorted_keys_[iter] = (*sorted_keys)[begin + iter]; sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter]) - LookupKey(*sorted_keys_[iter]->key, snapshot); - 
sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key(); + LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp); + sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key(); + sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey( + sorted_keys_[iter]->lkey->user_key(), + read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size()); sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key(); } } @@ -123,6 +139,7 @@ std::array sorted_keys_; size_t num_keys_; uint64_t value_mask_; + uint64_t value_size_; std::unique_ptr lookup_key_heap_buf; LookupKey* lookup_key_ptr_; @@ -144,17 +161,17 @@ class Iterator { public: // -- iterator traits - typedef Iterator self_type; - typedef KeyContext value_type; - typedef KeyContext& reference; - typedef KeyContext* pointer; - typedef int difference_type; - typedef std::forward_iterator_tag iterator_category; + using self_type = Iterator; + using value_type = KeyContext; + using reference = KeyContext&; + using pointer = KeyContext*; + using difference_type = int; + using iterator_category = std::forward_iterator_tag; Iterator(const Range* range, size_t idx) : range_(range), ctx_(range->ctx_), index_(idx) { while (index_ < range_->end_ && - (1ull << index_) & + (uint64_t{1} << index_) & (range_->ctx_->value_mask_ | range_->skip_mask_)) index_++; } @@ -164,7 +181,7 @@ Iterator& operator++() { while (++index_ < range_->end_ && - (1ull << index_) & + (uint64_t{1} << index_) & (range_->ctx_->value_mask_ | range_->skip_mask_)) ; return *this; @@ -206,6 +223,8 @@ start_ = first.index_; end_ = last.index_; skip_mask_ = mget_range.skip_mask_; + assert(start_ < 64); + assert(end_ < 64); } Range() = default; @@ -214,33 +233,37 @@ Iterator end() const { return Iterator(this, end_); } - bool empty() { - return (((1ull << end_) - 1) & ~((1ull << start_) - 1) & - ~(ctx_->value_mask_ | skip_mask_)) == 0; - } + bool empty() const { return RemainingMask() == 0; } + + void SkipIndex(size_t 
index) { skip_mask_ |= uint64_t{1} << index; } - void SkipKey(const Iterator& iter) { skip_mask_ |= 1ull << iter.index_; } + void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); } + + bool IsKeySkipped(const Iterator& iter) const { + return skip_mask_ & (uint64_t{1} << iter.index_); + } // Update the value_mask_ in MultiGetContext so its // immediately reflected in all the Range Iterators void MarkKeyDone(Iterator& iter) { - ctx_->value_mask_ |= (1ull << iter.index_); + ctx_->value_mask_ |= (uint64_t{1} << iter.index_); } - bool CheckKeyDone(Iterator& iter) { - return ctx_->value_mask_ & (1ull << iter.index_); + bool CheckKeyDone(Iterator& iter) const { + return ctx_->value_mask_ & (uint64_t{1} << iter.index_); } - uint64_t KeysLeft() { - uint64_t new_val = skip_mask_ | ctx_->value_mask_; - uint64_t count = 0; - while (new_val) { - new_val = new_val & (new_val - 1); - count++; - } - return end_ - count; + uint64_t KeysLeft() const { return BitsSetToOne(RemainingMask()); } + + void AddSkipsFrom(const Range& other) { + assert(ctx_ == other.ctx_); + skip_mask_ |= other.skip_mask_; } + uint64_t GetValueSize() { return ctx_->value_size_; } + + void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; } + private: friend MultiGetContext; MultiGetContext* ctx_; @@ -249,7 +272,14 @@ uint64_t skip_mask_; Range(MultiGetContext* ctx, size_t num_keys) - : ctx_(ctx), start_(0), end_(num_keys), skip_mask_(0) {} + : ctx_(ctx), start_(0), end_(num_keys), skip_mask_(0) { + assert(num_keys < 64); + } + + uint64_t RemainingMask() const { + return (((uint64_t{1} << end_) - 1) & ~((uint64_t{1} << start_) - 1) & + ~(ctx_->value_mask_ | skip_mask_)); + } }; // Return the initial range that encompasses all the keys in the batch diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc 
2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,19 @@ namespace ROCKSDB_NAMESPACE { +const PersistentCacheOptions PersistentCacheOptions::kEmpty; + void PersistentCacheHelper::InsertRawPage( const PersistentCacheOptions& cache_options, const BlockHandle& handle, const char* data, const size_t size) { assert(cache_options.persistent_cache); assert(cache_options.persistent_cache->IsCompressed()); - // construct the page key - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // insert content to cache - cache_options.persistent_cache->Insert(key, data, size); + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache->Insert(key.AsSlice(), data, size) + .PermitUncheckedError(); } void PersistentCacheHelper::InsertUncompressedPage( @@ -33,14 +33,13 @@ // (1) content is cacheable // (2) content is not compressed - // construct the page key - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // insert block contents to page cache - cache_options.persistent_cache->Insert(key, contents.data.data(), - contents.data.size()); + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache + ->Insert(key.AsSlice(), contents.data.data(), contents.data.size()) + .PermitUncheckedError(); + ; } Status PersistentCacheHelper::LookupRawPage( @@ -52,14 +51,12 @@ assert(cache_options.persistent_cache); assert(cache_options.persistent_cache->IsCompressed()); - // construct the page key - char 
cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // Lookup page + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + size_t size; - Status s = cache_options.persistent_cache->Lookup(key, raw_data, &size); + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), raw_data, &size); if (!s.ok()) { // cache miss RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); @@ -67,7 +64,8 @@ } // cache hit - assert(raw_data_size == handle.size() + kBlockTrailerSize); + // Block-based table is assumed + assert(raw_data_size == handle.size() + BlockBasedTable::kBlockTrailerSize); assert(size == raw_data_size); RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); return Status::OK(); @@ -84,15 +82,13 @@ return Status::NotFound(); } - // construct the page key - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // Lookup page + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + std::unique_ptr data; size_t size; - Status s = cache_options.persistent_cache->Lookup(key, &data, &size); + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), &data, &size); if (!s.ok()) { // cache miss RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ #include +#include 
"cache/cache_key.h" #include "monitoring/statistics.h" #include "rocksdb/persistent_cache.h" @@ -19,16 +20,18 @@ PersistentCacheOptions() {} explicit PersistentCacheOptions( const std::shared_ptr& _persistent_cache, - const std::string _key_prefix, Statistics* const _statistics) + const OffsetableCacheKey& _base_cache_key, Statistics* const _statistics) : persistent_cache(_persistent_cache), - key_prefix(_key_prefix), + base_cache_key(_base_cache_key), statistics(_statistics) {} virtual ~PersistentCacheOptions() {} std::shared_ptr persistent_cache; - std::string key_prefix; + OffsetableCacheKey base_cache_key; Statistics* statistics = nullptr; + + static const PersistentCacheOptions kEmpty; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h 2025-05-19 16:14:27.000000000 +0000 @@ -132,4 +132,4 @@ PlainTableBloomV1 bloom_; }; -}; // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,12 +8,13 @@ #include -#include #include #include +#include #include "db/dbformat.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" @@ -36,16 +37,16 @@ // a utility that helps writing block content to the file // @offset will advance if 
@block_contents was successfully written. // @block_handle the block handle this particular block. -Status WriteBlock(const Slice& block_contents, WritableFileWriter* file, - uint64_t* offset, BlockHandle* block_handle) { +IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { block_handle->set_offset(*offset); block_handle->set_size(block_contents.size()); - Status s = file->Append(block_contents); + IOStatus io_s = file->Append(block_contents); - if (s.ok()) { + if (io_s.ok()) { *offset += block_contents.size(); } - return s; + return io_s; } } // namespace @@ -57,14 +58,14 @@ extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, - EncodingType encoding_type, size_t index_sparseness, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + uint32_t column_family_id, int level_at_creation, WritableFileWriter* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, const std::string& column_family_name, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, - bool store_index_in_file) + bool store_index_in_file, const std::string& db_id, + const std::string& db_session_id, uint64_t file_number) : ioptions_(ioptions), moptions_(moptions), bloom_block_(num_probes), @@ -97,22 +98,38 @@ properties_.format_version = (encoding_type == kPlain) ? 0 : 1; properties_.column_family_id = column_family_id; properties_.column_family_name = column_family_name; - properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr - ? 
moptions_.prefix_extractor->Name() - : "nullptr"; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set"); + } + properties_.orig_file_number = file_number; + properties_.prefix_extractor_name = + moptions_.prefix_extractor != nullptr + ? moptions_.prefix_extractor->AsString() + : "nullptr"; std::string val; PutFixed32(&val, static_cast(encoder_.GetEncodingType())); properties_.user_collected_properties [PlainTablePropertyNames::kEncodingType] = val; - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + assert(int_tbl_prop_collector_factories); + for (auto& factory : *int_tbl_prop_collector_factories) { + assert(factory); + table_properties_collectors_.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + factory->CreateIntTblPropCollector(column_family_id, + level_at_creation)); } } PlainTableBuilder::~PlainTableBuilder() { + // They are supposed to have been passed to users through Finish() + // if the file succeeds. 
+ status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); } void PlainTableBuilder::Add(const Slice& key, const Slice& value) { @@ -121,7 +138,8 @@ size_t meta_bytes_buf_size = 0; ParsedInternalKey internal_key; - if (!ParseInternalKey(key, &internal_key)) { + if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) + .ok()) { // TODO assert(false); return; } @@ -145,41 +163,46 @@ assert(offset_ <= std::numeric_limits::max()); auto prev_offset = static_cast(offset_); // Write out the key - encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, - &meta_bytes_buf_size); + io_status_ = encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, + &meta_bytes_buf_size); if (SaveIndexInFile()) { index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset); } // Write value length uint32_t value_size = static_cast(value.size()); - char* end_ptr = - EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); - assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); - meta_bytes_buf_size = end_ptr - meta_bytes_buf; - file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + if (io_status_.ok()) { + char* end_ptr = + EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); + assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); + meta_bytes_buf_size = end_ptr - meta_bytes_buf; + io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + } // Write value - file_->Append(value); - offset_ += value_size + meta_bytes_buf_size; + if (io_status_.ok()) { + io_status_ = file_->Append(value); + offset_ += value_size + meta_bytes_buf_size; + } - properties_.num_entries++; - properties_.raw_key_size += key.size(); - properties_.raw_value_size += value.size(); - if (internal_key.type == kTypeDeletion || - internal_key.type == kTypeSingleDeletion) { - properties_.num_deletions++; - } else if (internal_key.type == kTypeMerge) { - properties_.num_merge_operands++; + if (io_status_.ok()) { + properties_.num_entries++; + 
properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } } // notify property collectors NotifyCollectTableCollectorsOnAdd( - key, value, offset_, table_properties_collectors_, ioptions_.info_log); + key, value, offset_, table_properties_collectors_, ioptions_.logger); + status_ = io_status_; } -Status PlainTableBuilder::status() const { return status_; } - Status PlainTableBuilder::Finish() { assert(!closed_); closed_ = true; @@ -197,13 +220,12 @@ if (store_index_in_file_ && (properties_.num_entries > 0)) { assert(properties_.num_entries <= std::numeric_limits::max()); - Status s; BlockHandle bloom_block_handle; if (bloom_bits_per_key_ > 0) { bloom_block_.SetTotalBits( &arena_, static_cast(properties_.num_entries) * bloom_bits_per_key_, - ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -214,10 +236,12 @@ Slice bloom_finish_result = bloom_block_.Finish(); properties_.filter_size = bloom_finish_result.size(); - s = WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); + io_status_ = + WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); - if (!s.ok()) { - return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle); } @@ -225,10 +249,12 @@ Slice index_finish_result = index_builder_->Finish(); properties_.index_size = index_finish_result.size(); - s = WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); + io_status_ = + WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); - if (!s.ok()) { - 
return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock, @@ -243,51 +269,38 @@ property_block_builder.Add(properties_.user_collected_properties); // -- Add user collected properties - NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - ioptions_.info_log, - &property_block_builder); + NotifyCollectTableCollectorsOnFinish( + table_properties_collectors_, ioptions_.logger, &property_block_builder); // -- Write property block BlockHandle property_block_handle; - auto s = WriteBlock( - property_block_builder.Finish(), - file_, - &offset_, - &property_block_handle - ); + IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_, + &property_block_handle); if (!s.ok()) { - return s; + return std::move(s); } - meta_index_builer.Add(kPropertiesBlock, property_block_handle); + meta_index_builer.Add(kPropertiesBlockName, property_block_handle); // -- write metaindex block BlockHandle metaindex_block_handle; - s = WriteBlock( - meta_index_builer.Finish(), - file_, - &offset_, - &metaindex_block_handle - ); - if (!s.ok()) { - return s; + io_status_ = WriteBlock(meta_index_builer.Finish(), file_, &offset_, + &metaindex_block_handle); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } // Write Footer // no need to write out new footer if we're using default checksum - Footer footer(kLegacyPlainTableMagicNumber, 0); - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(BlockHandle::NullBlockHandle()); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - s = file_->Append(footer_encoding); - if (s.ok()) { - offset_ += footer_encoding.size(); + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_, + kNoChecksum, metaindex_block_handle); + io_status_ = file_->Append(footer.GetSlice()); + if (io_status_.ok()) { + offset_ += footer.GetSlice().size(); } - - if 
(file_ != nullptr) { - file_checksum_ = file_->GetFileChecksum(); - } - return s; + status_ = io_status_; + return status_; } void PlainTableBuilder::Abandon() { @@ -302,11 +315,19 @@ return offset_; } +std::string PlainTableBuilder::GetFileChecksum() const { + if (file_ != nullptr) { + return file_->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + const char* PlainTableBuilder::GetFileChecksumFuncName() const { if (file_ != nullptr) { return file_->GetFileChecksumFuncName(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -37,15 +37,16 @@ // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. 
PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - uint32_t user_key_size, EncodingType encoding_type, - size_t index_sparseness, uint32_t bloom_bits_per_key, - const std::string& column_family_name, uint32_t num_probes = 6, - size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, - bool store_index_in_file = false); + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + uint32_t column_family_id, int level_at_creation, + WritableFileWriter* file, uint32_t user_key_size, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, + double hash_table_ratio = 0, bool store_index_in_file = false, + const std::string& db_id = "", const std::string& db_session_id = "", + uint64_t file_number = 0); // No copying allowed PlainTableBuilder(const PlainTableBuilder&) = delete; void operator=(const PlainTableBuilder&) = delete; @@ -59,7 +60,10 @@ void Add(const Slice& key, const Slice& value) override; // Return non-ok iff some error has been detected. - Status status() const override; + Status status() const override { return status_; } + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return io_status_; } // Finish building the table. Stops using the file passed to the // constructor after this function returns. 
@@ -85,14 +89,14 @@ bool SaveIndexInFile() const { return store_index_in_file_; } // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } + std::string GetFileChecksum() const override; // Get file checksum function name const char* GetFileChecksumFuncName() const override; private: Arena arena_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const MutableCFOptions& moptions_; std::vector> table_properties_collectors_; @@ -105,6 +109,7 @@ uint32_t bloom_bits_per_key_; size_t huge_page_tlb_size_; Status status_; + IOStatus io_status_; TableProperties properties_; PlainTableKeyEncoder encoder_; @@ -115,9 +120,6 @@ const SliceTransform* prefix_extractor_; - // Store file checksum. If checksum is disabled, its value is "0". - std::string file_checksum_ = kUnknownFileChecksum; - Slice GetPrefix(const Slice& target) const { assert(target.size() >= 8); // target is internal key return GetPrefixFromUserKey(GetUserKey(target)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,23 +3,61 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE #include "table/plain/plain_table_factory.h" #include + #include + #include "db/dbformat.h" -#include "options/options_helper.h" #include "port/port.h" #include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "table/plain/plain_table_builder.h" #include "table/plain/plain_table_reader.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map plain_table_type_info = { + {"user_key_len", + {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bloom_bits_per_key", + {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"hash_table_ratio", + {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"index_sparseness", + {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"huge_page_tlb_size", + {offsetof(struct PlainTableOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"encoding_type", + {offsetof(struct PlainTableOptions, encoding_type), + OptionType::kEncodingType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"full_scan_mode", + {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"store_index_in_file", + {offsetof(struct PlainTableOptions, store_index_in_file), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +PlainTableFactory::PlainTableFactory(const PlainTableOptions& options) + : table_options_(options) { + 
RegisterOptions(&table_options_, &plain_table_type_info); +} Status PlainTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const { @@ -29,11 +67,11 @@ table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, table_options_.index_sparseness, table_options_.huge_page_tlb_size, table_options_.full_scan_mode, table_reader_options.immortal, - table_reader_options.prefix_extractor); + table_reader_options.prefix_extractor.get()); } TableBuilder* PlainTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { // Ignore the skip_filters flag. PlainTable format is optimized for small // in-memory dbs. The skip_filters optimization is not useful for plain @@ -41,15 +79,18 @@ // return new PlainTableBuilder( table_builder_options.ioptions, table_builder_options.moptions, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_options_.user_key_len, table_options_.encoding_type, + table_builder_options.int_tbl_prop_collector_factories, + table_builder_options.column_family_id, + table_builder_options.level_at_creation, file, + table_options_.user_key_len, table_options_.encoding_type, table_options_.index_sparseness, table_options_.bloom_bits_per_key, table_builder_options.column_family_name, 6, table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, - table_options_.store_index_in_file); + table_options_.store_index_in_file, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); } -std::string PlainTableFactory::GetPrintableTableOptions() const { +std::string PlainTableFactory::GetPrintableOptions() const { 
std::string ret; ret.reserve(20000); const int kBufferSize = 200; @@ -82,11 +123,19 @@ return ret; } -const PlainTableOptions& PlainTableFactory::table_options() const { - return table_options_; +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + return GetPlainTableOptionsFromString(config_options, table_options, opts_str, + new_table_options); } -Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, +Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, + const PlainTableOptions& table_options, const std::string& opts_str, PlainTableOptions* new_table_options) { std::unordered_map opts_map; @@ -94,128 +143,183 @@ if (!s.ok()) { return s; } - return GetPlainTableOptionsFromMap(table_options, opts_map, - new_table_options); -} -Status GetMemTableRepFactoryFromString( - const std::string& opts_str, - std::unique_ptr* new_mem_factory) { - std::vector opts_list = StringSplit(opts_str, ':'); - size_t len = opts_list.size(); - - if (opts_list.empty() || opts_list.size() > 2) { - return Status::InvalidArgument("Can't parse memtable_factory option ", - opts_str); - } - - MemTableRepFactory* mem_factory = nullptr; - - if (opts_list[0] == "skip_list") { - // Expecting format - // skip_list: - if (2 == len) { - size_t lookahead = ParseSizeT(opts_list[1]); - mem_factory = new SkipListFactory(lookahead); - } else if (1 == len) { - mem_factory = new SkipListFactory(); - } - } else if (opts_list[0] == "prefix_hash") { - // Expecting format - // prfix_hash: - if (2 == len) { - size_t hash_bucket_count = ParseSizeT(opts_list[1]); - mem_factory = NewHashSkipListRepFactory(hash_bucket_count); - } else if (1 == len) { - mem_factory = NewHashSkipListRepFactory(); - 
} - } else if (opts_list[0] == "hash_linkedlist") { - // Expecting format - // hash_linkedlist: - if (2 == len) { - size_t hash_bucket_count = ParseSizeT(opts_list[1]); - mem_factory = NewHashLinkListRepFactory(hash_bucket_count); - } else if (1 == len) { - mem_factory = NewHashLinkListRepFactory(); - } - } else if (opts_list[0] == "vector") { - // Expecting format - // vector: - if (2 == len) { - size_t count = ParseSizeT(opts_list[1]); - mem_factory = new VectorRepFactory(count); - } else if (1 == len) { - mem_factory = new VectorRepFactory(); - } - } else if (opts_list[0] == "cuckoo") { - return Status::NotSupported( - "cuckoo hash memtable is not supported anymore."); + s = GetPlainTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; } else { - return Status::InvalidArgument("Unrecognized memtable_factory option ", - opts_str); + return Status::InvalidArgument(s.getState()); } +} +#endif // ROCKSDB_LITE - if (mem_factory != nullptr) { - new_mem_factory->reset(mem_factory); - } +#ifndef ROCKSDB_LITE +static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, + const std::string& /*arg*/) { + // The MemTableRepFactory built-in classes will be either a class + // (VectorRepFactory) or a nickname (vector), followed optionally by ":#", + // where # is the "size" of the factory. 
+ auto AsPattern = [](const std::string& name, const std::string& alt) { + auto pattern = ObjectLibrary::PatternEntry(name, true); + pattern.AnotherName(alt); + pattern.AddNumber(":"); + return pattern; + }; + library.AddFactory( + AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new VectorRepFactory(count)); + } else { + guard->reset(new VectorRepFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t lookahead = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new SkipListFactory(lookahead)); + } else { + guard->reset(new SkipListFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern("HashLinkListRepFactory", "hash_linkedlist"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + // Expecting format: hash_linkedlist: + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashLinkListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashLinkListRepFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern("HashSkipListRepFactory", "prefix_hash"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + // Expecting format: prefix_hash: + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashSkipListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashSkipListRepFactory()); + } + 
return guard->get(); + }); + library.AddFactory( + "cuckoo", + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, std::string* errmsg) { + *errmsg = "cuckoo hash memtable is not supported anymore."; + return nullptr; + }); + + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE - return Status::OK(); +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, std::unique_ptr* result) { + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + return MemTableRepFactory::CreateFromString(config_options, opts_str, result); } -std::string ParsePlainTableOptions(const std::string& name, - const std::string& org_value, - PlainTableOptions* new_options, - bool input_strings_escaped = false, - bool ignore_unknown_options = false) { - const std::string& value = - input_strings_escaped ? UnescapeOptionString(org_value) : org_value; - const auto iter = plain_table_type_info.find(name); - if (iter == plain_table_type_info.end()) { - if (ignore_unknown_options) { - return ""; - } else { - return "Unrecognized option"; +Status MemTableRepFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (value.empty()) { + // No Id and no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. 
Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = NewUniqueObject(config_options, id, opt_map, + result); +#else + // To make it possible to configure the memtables in LITE mode, the ID + // is of the form :, where name is the name of the class and + // is the length of the object (e.g. skip_list:10). + std::vector opts_list = StringSplit(id, ':'); + if (opts_list.empty() || opts_list.size() > 2 || !opt_map.empty()) { + status = Status::InvalidArgument("Can't parse memtable_factory option ", + value); + } else if (opts_list[0] == "skip_list" || + opts_list[0] == SkipListFactory::kClassName()) { + // Expecting format + // skip_list: + if (opts_list.size() == 2) { + size_t lookahead = ParseSizeT(opts_list[1]); + result->reset(new SkipListFactory(lookahead)); + } else { + result->reset(new SkipListFactory()); + } + } else if (!config_options.ignore_unsupported_options) { + status = Status::NotSupported("Cannot load object in LITE mode ", id); } +#endif // ROCKSDB_LITE } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return "Invalid value"; - } - return ""; + return status; } +#ifndef ROCKSDB_LITE Status GetPlainTableOptionsFromMap( const PlainTableOptions& table_options, const std::unordered_map& opts_map, PlainTableOptions* new_table_options, bool input_strings_escaped, - bool /*ignore_unknown_options*/) { + bool ignore_unknown_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + return GetPlainTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); +} + +Status GetPlainTableOptionsFromMap( + const ConfigOptions& config_options, const PlainTableOptions& table_options, + const std::unordered_map& 
opts_map, + PlainTableOptions* new_table_options) { assert(new_table_options); - *new_table_options = table_options; - for (const auto& o : opts_map) { - auto error_message = ParsePlainTableOptions( - o.first, o.second, new_table_options, input_strings_escaped); - if (error_message != "") { - const auto iter = plain_table_type_info.find(o.first); - if (iter == plain_table_type_info.end() || - !input_strings_escaped || // !input_strings_escaped indicates - // the old API, where everything is - // parsable. - (iter->second.verification != OptionVerificationType::kByName && - iter->second.verification != - OptionVerificationType::kByNameAllowNull && - iter->second.verification != - OptionVerificationType::kByNameAllowFromNull && - iter->second.verification != OptionVerificationType::kDeprecated)) { - // Restore "new_options" to the default "base_options". - *new_table_options = table_options; - return Status::InvalidArgument("Can't parse PlainTableOptions:", - o.first + " " + error_message); - } - } + PlainTableFactory ptf(table_options); + Status s = ptf.ConfigureFromMap(config_options, opts_map); + if (s.ok()) { + *new_table_options = *(ptf.GetOptions()); + } else { + // Restore "new_options" to the default "base_options". 
+ *new_table_options = table_options; } - return Status::OK(); + return s; } extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { @@ -231,5 +335,5 @@ const std::string PlainTablePropertyNames::kNumBloomBlocks = "rocksdb.plain.table.bloom.numblocks"; -} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,8 +10,6 @@ #include #include -#include "options/options_helper.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { @@ -35,7 +33,7 @@ // 1. Data compression is not supported. // 2. Data is not checksumed. // it is not recommended to use this format on other type of file systems. -// +// // PlainTable requires fixed length key, configured as a constructor // parameter of the factory class. Output file format: // +-------------+-----------------+ @@ -156,68 +154,29 @@ // page TLB and the page size if allocating from there. See comments of // Arena::AllocateAligned() for details. 
explicit PlainTableFactory( - const PlainTableOptions& _table_options = PlainTableOptions()) - : table_options_(_table_options) {} + const PlainTableOptions& _table_options = PlainTableOptions()); - const char* Name() const override { return "PlainTable"; } - Status NewTableReader(const TableReaderOptions& table_reader_options, + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kPlainTableName(); } + const char* Name() const override { return kPlainTableName(); } + using TableFactory::NewTableReader; + Status NewTableReader(const ReadOptions& ro, + const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; - - std::string GetPrintableTableOptions() const override; - - const PlainTableOptions& table_options() const; + WritableFileWriter* file) const override; + std::string GetPrintableOptions() const override; static const char kValueTypeSeqId0 = char(~0); - // Sanitizes the specified DB Options. 
- Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } - - void* GetOptions() override { return &table_options_; } - - Status GetOptionString(std::string* /*opt_string*/, - const std::string& /*delimiter*/) const override { - return Status::OK(); - } - private: PlainTableOptions table_options_; }; -static std::unordered_map plain_table_type_info = { - {"user_key_len", - {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"bloom_bits_per_key", - {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"hash_table_ratio", - {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, - OptionVerificationType::kNormal, false, 0}}, - {"index_sparseness", - {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"huge_page_tlb_size", - {offsetof(struct PlainTableOptions, huge_page_tlb_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"encoding_type", - {offsetof(struct PlainTableOptions, encoding_type), - OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, - {"full_scan_mode", - {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"store_index_in_file", - {offsetof(struct PlainTableOptions, store_index_in_file), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,10 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE +#include "table/plain/plain_table_index.h" #include -#include "table/plain/plain_table_index.h" +#include "logging/logging.h" #include "util/coding.h" #include "util/hash.h" @@ -98,7 +99,7 @@ BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. @@ -153,12 +154,12 @@ Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "Reserving %" PRIu32 " bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); + total_allocate_size, huge_page_tlb_size_, ioptions_.logger); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -198,7 +199,7 @@ } assert(sub_index_offset == sub_index_size_); - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); @@ -206,6 +207,6 @@ const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = "PlainTableIndexBlock"; -}; // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include #include -#include "db/dbformat.h" #include "memory/arena.h" #include "monitoring/histogram.h" #include "options/cf_options.h" @@ -20,7 +19,7 @@ // The file contains two classes PlainTableIndex and PlainTableIndexBuilder // The two classes implement the index format of PlainTable. -// For descripton of PlainTable format, see comments of class +// For description of PlainTable format, see comments of class // PlainTableFactory // // @@ -131,7 +130,7 @@ // The class is used by PlainTableBuilder class. class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, size_t index_sparseness, double hash_table_ratio, size_t huge_page_tlb_size) @@ -222,7 +221,7 @@ const std::vector& entries_per_bucket); Arena* arena_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; @@ -244,6 +243,6 @@ static const size_t kRecordsPerGroup = 256; }; -}; // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc 2025-05-19 16:14:27.000000000 +0000 @@ -80,13 +80,15 @@ } } -Status PlainTableKeyEncoder::AppendKey(const Slice& key, - WritableFileWriter* file, - uint64_t* offset, char* 
meta_bytes_buf, - size_t* meta_bytes_buf_size) { +IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, + WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size) { ParsedInternalKey parsed_key; - if (!ParseInternalKey(key, &parsed_key)) { - return Status::Corruption(Slice()); + Status pik_status = + ParseInternalKey(key, &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + return IOStatus::Corruption(pik_status.getState()); } Slice key_to_write = key; // Portion of internal key to write out. @@ -99,9 +101,9 @@ char* ptr = EncodeVarint32(key_size_buf, user_key_size); assert(ptr <= key_size_buf + sizeof(key_size_buf)); auto len = ptr - key_size_buf; - Status s = file->Append(Slice(key_size_buf, len)); - if (!s.ok()) { - return s; + IOStatus io_s = file->Append(Slice(key_size_buf, len)); + if (!io_s.ok()) { + return io_s; } *offset += len; } @@ -117,9 +119,9 @@ key_count_for_prefix_ = 1; pre_prefix_.SetUserKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); - Status s = file->Append(Slice(size_bytes, size_bytes_pos)); - if (!s.ok()) { - return s; + IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!io_s.ok()) { + return io_s; } *offset += size_bytes_pos; } else { @@ -135,9 +137,9 @@ static_cast(pre_prefix_.GetUserKey().size()); size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, size_bytes + size_bytes_pos); - Status s = file->Append(Slice(size_bytes, size_bytes_pos)); - if (!s.ok()) { - return s; + IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!io_s.ok()) { + return io_s; } *offset += size_bytes_pos; key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len); @@ -149,20 +151,23 @@ // If the row is of value type with seqId 0, flush the special flag together // in this buffer to safe one file append call, which takes 1 byte. 
if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { - Status s = + IOStatus io_s = file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); - if (!s.ok()) { - return s; + if (!io_s.ok()) { + return io_s; } *offset += key_to_write.size() - 8; meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; *meta_bytes_buf_size += 1; } else { - file->Append(key_to_write); + IOStatus io_s = file->Append(key_to_write); + if (!io_s.ok()) { + return io_s; + } *offset += key_to_write.size(); } - return Status::OK(); + return IOStatus::OK(); } Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset, @@ -207,8 +212,9 @@ new_buffer->buf_len = 0; } Slice read_result; - Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, - new_buffer->buf.get()); + Status s = + file_info_->file->Read(IOOptions(), file_offset, size_to_read, + &read_result, new_buffer->buf.get(), nullptr); if (!s.ok()) { status_ = s; return false; @@ -275,9 +281,12 @@ return file_reader_.status(); } *internal_key_valid = true; - if (!ParseInternalKey(*internal_key, parsed_key)) { + Status pik_status = ParseInternalKey(*internal_key, parsed_key, + false /* log_err_key */); // TODO + if (!pik_status.ok()) { return Status::Corruption( - Slice("Incorrect value type found when reading the next key")); + Slice("Corrupted key found during next key read. 
"), + pik_status.getState()); } *bytes_read += user_key_size + 8; } @@ -483,7 +492,6 @@ if (seekable != nullptr) { *seekable = true; } - Status s; if (encoding_type_ == kPlain) { return NextPlainEncodingKey(start_offset, parsed_key, internal_key, bytes_read, seekable); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,7 +8,7 @@ #ifndef ROCKSDB_LITE #include -#include "db/dbformat.h" + #include "rocksdb/slice.h" #include "table/plain/plain_table_reader.h" @@ -44,8 +44,9 @@ // meta_bytes_buf: buffer for extra meta bytes // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated // if meta_bytes_buf is updated. - Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, - char* meta_bytes_buf, size_t* meta_bytes_buf_size); + IOStatus AppendKey(const Slice& key, WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size); // Return actual encoding type to be picked EncodingType GetEncodingType() { return encoding_type_; } @@ -67,6 +68,12 @@ public: explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) : file_info_(_file_info), num_buf_(0) {} + + ~PlainTableFileReader() { + // Should fix. + status_.PermitUncheckedError(); + } + // In mmaped mode, the results point to mmaped area of the file, which // means it is always valid before closing the file. // In non-mmap mode, the results point to an internal buffer. If the caller @@ -145,6 +152,7 @@ fixed_user_key_len_(user_key_len), prefix_extractor_(prefix_extractor), in_prefix_(false) {} + // Find the next key. // start: char array where the key starts. 
// limit: boundary of the char array diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -93,7 +93,7 @@ extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, @@ -113,10 +113,12 @@ table_properties_(nullptr) {} PlainTableReader::~PlainTableReader() { + // Should fix? + status_.PermitUncheckedError(); } Status PlainTableReader::Open( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, const int bloom_bits_per_key, @@ -127,11 +129,9 @@ return Status::NotSupported("File is too large for PlainTableReader!"); } - TableProperties* props_ptr = nullptr; + std::unique_ptr props; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props_ptr, - true /* compression_type_missing */); - std::shared_ptr props(props_ptr); + ioptions, &props); if (!s.ok()) { return s; } @@ -147,8 +147,7 @@ return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); - } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) != - 0) { + } else if (prefix_extractor_in_file != prefix_extractor->AsString()) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " 
"PlainTable"); @@ -185,7 +184,7 @@ new_reader->full_scan_mode_ = true; } // PopulateIndex can add to the props, so don't store them until now - new_reader->table_properties_ = props; + new_reader->table_properties_ = std::move(props); if (immortal_table && new_reader->file_info_.is_mmap_mode) { new_reader->dummy_cleanable_.reset(new Cleanable()); @@ -201,7 +200,8 @@ InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, - size_t /*compaction_readahead_size*/) { + size_t /*compaction_readahead_size*/, + bool /* allow_unprepared_value */) { // Not necessarily used here, but make sure this has been initialized assert(table_properties_); @@ -274,7 +274,7 @@ if (bloom_total_bits > 0) { enable_bloom_ = true; bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); + huge_page_tlb_size, ioptions_.logger); } } @@ -288,7 +288,9 @@ Status PlainTableReader::MmapDataIfNeeded() { if (file_info_.is_mmap_mode) { // Get mmapped memory. 
- return file_info_.file->Read(0, static_cast(file_size_), &file_info_.file_data, nullptr); + return file_info_.file->Read(IOOptions(), 0, + static_cast(file_size_), + &file_info_.file_data, nullptr, nullptr); } return Status::OK(); } @@ -304,8 +306,7 @@ Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, - BlockType::kIndex, &index_block_contents, - true /* compression_type_missing */); + BlockType::kIndex, &index_block_contents); bool index_in_file = s.ok(); @@ -316,8 +317,7 @@ s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, BloomBlockBuilder::kBloomBlock, BlockType::kFilter, - &bloom_block_contents, - true /* compression_type_missing */); + &bloom_block_contents); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -445,23 +445,23 @@ } // point to sub-index, need to do a binary search - uint32_t upper_bound; + uint32_t upper_bound = 0; const char* base_ptr = index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); uint32_t low = 0; uint32_t high = upper_bound; ParsedInternalKey mid_key; ParsedInternalKey parsed_target; - if (!ParseInternalKey(target, &parsed_target)) { - return Status::Corruption(Slice()); - } + Status s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; // The key is between [low, high). Do a binary search between it. 
while (high - low > 1) { uint32_t mid = (high + low) / 2; uint32_t file_offset = GetFixed32Element(base_ptr, mid); uint32_t tmp; - Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); + s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); if (!s.ok()) { return s; } @@ -486,7 +486,7 @@ ParsedInternalKey low_key; uint32_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); - Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); + s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); if (!s.ok()) { return s; } @@ -589,9 +589,10 @@ } ParsedInternalKey found_key; ParsedInternalKey parsed_target; - if (!ParseInternalKey(target, &parsed_target)) { - return Status::Corruption(Slice()); - } + s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; + Slice found_value; while (offset < file_info_.data_end_offset) { s = Next(&decoder, &offset, &found_key, nullptr, &found_value); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include -#include "db/dbformat.h" #include "file/random_access_file_reader.h" #include "memory/arena.h" #include "rocksdb/env.h" @@ -67,7 +66,7 @@ // whether it points to the data offset of the first key with the key prefix // or the offset of it. If there are too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk. 
- static Status Open(const ImmutableCFOptions& ioptions, + static Status Open(const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, @@ -84,7 +83,8 @@ const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; void Prepare(const Slice& target) override; @@ -109,7 +109,7 @@ return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const ImmutableCFOptions& ioptions, + PlainTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, @@ -162,7 +162,7 @@ CacheAllocationPtr index_block_alloc_; CacheAllocationPtr bloom_block_alloc_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; std::unique_ptr dummy_cleanable_; uint64_t file_size_; protected: // for testing diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,502 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE + +#include "table/sst_file_dumper.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_index.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "options/cf_options.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/utilities/ldb_cmd.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_factory.h" +#include "table/table_reader.h" +#include "util/compression.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +SstFileDumper::SstFileDumper(const Options& options, + const std::string& file_path, + size_t readahead_size, bool verify_checksum, + bool output_hex, bool decode_blob_index, + const EnvOptions& soptions, bool silent) + : file_name_(file_path), + read_num_(0), + output_hex_(output_hex), + decode_blob_index_(decode_blob_index), + soptions_(soptions), + silent_(silent), + options_(options), + ioptions_(options_), + moptions_(ColumnFamilyOptions(options_)), + read_options_(verify_checksum, false), + internal_comparator_(BytewiseComparator()) { + read_options_.readahead_size = readahead_size; + if (!silent_) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + } + init_result_ = GetTableReader(file_name_); +} + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; + +const char* testFileName = "test_file_name"; + +Status SstFileDumper::GetTableReader(const std::string& 
file_path) { + // Warning about 'magic_number' being uninitialized shows up only in UBsan + // builds. Though access is guarded by 's.ok()' checks, fix the issue to + // avoid any warnings. + uint64_t magic_number = Footer::kNullTableMagicNumber; + + // read table magic number + Footer footer; + + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file; + uint64_t file_size = 0; + Status s = fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); + if (s.ok()) { + s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); + } + + // check empty file + // if true, skip further processing of this file + if (file_size == 0) { + return Status::Aborted(file_path, "Empty file"); + } + + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); + + FilePrefetchBuffer prefetch_buffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, + false /* track_min_offset */); + if (s.ok()) { + const uint64_t kSstDumpTailPrefetchSize = 512 * 1024; + uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize) + ? 
kSstDumpTailPrefetchSize + : file_size; + uint64_t prefetch_off = file_size - prefetch_size; + IOOptions opts; + s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off, + static_cast(prefetch_size)); + + s = ReadFooterFromFile(opts, file_.get(), &prefetch_buffer, file_size, + &footer); + } + if (s.ok()) { + magic_number = footer.table_magic_number(); + } + + if (s.ok()) { + if (magic_number == kPlainTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + + fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); + } + options_.comparator = &internal_comparator_; + // For old sst format, ReadTableProperties might fail but file can be read + if (ReadTableProperties(magic_number, file_.get(), file_size, + (magic_number == kBlockBasedTableMagicNumber) + ? &prefetch_buffer + : nullptr) + .ok()) { + s = SetTableOptionsByMagicNumber(magic_number); + } else { + s = SetOldTableOptions(); + } + } + + if (s.ok()) { + s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size, + &table_reader_); + } + return s; +} + +Status SstFileDumper::NewTableReader( + const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, + const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, + std::unique_ptr* /*table_reader*/) { + auto t_opt = + TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_, + internal_comparator_, false /* skip_filters */, + false /* imortal */, true /* force_direct_prefetch */); + // Allow open file with global sequence number for backward compatibility. 
+ t_opt.largest_seqno = kMaxSequenceNumber; + + // We need to turn off pre-fetching of index and filter nodes for + // BlockBasedTable + if (options_.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_, + /*enable_prefetch=*/false); + } + + // For all other factory implementation + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_); +} + +Status SstFileDumper::VerifyChecksum() { + // We could pass specific readahead setting into read options if needed. + return table_reader_->VerifyChecksum(read_options_, + TableReaderCaller::kSSTDumpTool); +} + +Status SstFileDumper::DumpTable(const std::string& out_filename) { + std::unique_ptr out_file; + Env* env = options_.env; + Status s = env->NewWritableFile(out_filename, &out_file, soptions_); + if (s.ok()) { + s = table_reader_->DumpTable(out_file.get()); + } + if (!s.ok()) { + // close the file before return error, ignore the close error if there's any + out_file->Close().PermitUncheckedError(); + return s; + } + return out_file->Close(); +} + +Status SstFileDumper::CalculateCompressedTableSize( + const TableBuilderOptions& tb_options, size_t block_size, + uint64_t* num_data_blocks, uint64_t* compressed_table_size) { + std::unique_ptr env(NewMemEnv(options_.env)); + std::unique_ptr dest_writer; + Status s = + WritableFileWriter::Create(env->GetFileSystem(), testFileName, + FileOptions(soptions_), &dest_writer, nullptr); + if (!s.ok()) { + return s; + } + BlockBasedTableOptions table_options; + table_options.block_size = block_size; + BlockBasedTableFactory block_based_tf(table_options); + std::unique_ptr table_builder; + table_builder.reset(block_based_tf.NewTableBuilder( + tb_options, + dest_writer.get())); + std::unique_ptr iter(table_reader_->NewIterator( + read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, 
TableReaderCaller::kSSTDumpTool)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + table_builder->Add(iter->key(), iter->value()); + } + s = iter->status(); + if (!s.ok()) { + return s; + } + s = table_builder->Finish(); + if (!s.ok()) { + return s; + } + *compressed_table_size = table_builder->FileSize(); + assert(num_data_blocks != nullptr); + *num_data_blocks = table_builder->GetTableProperties().num_data_blocks; + return env->DeleteFile(testFileName); +} + +Status SstFileDumper::ShowAllCompressionSizes( + size_t block_size, + const std::vector>& + compression_types, + int32_t compress_level_from, int32_t compress_level_to, + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes) { + fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size); + for (auto& i : compression_types) { + if (CompressionTypeSupported(i.first)) { + fprintf(stdout, "Compression: %-24s\n", i.second); + CompressionOptions compress_opt; + compress_opt.max_dict_bytes = max_dict_bytes; + compress_opt.zstd_max_train_bytes = zstd_max_train_bytes; + compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes; + for (int32_t j = compress_level_from; j <= compress_level_to; j++) { + fprintf(stdout, "Compression level: %d", j); + compress_opt.level = j; + Status s = ShowCompressionSize(block_size, i.first, compress_opt); + if (!s.ok()) { + return s; + } + } + } else { + fprintf(stdout, "Unsupported compression type: %s.\n", i.second); + } + } + return Status::OK(); +} + +Status SstFileDumper::ShowCompressionSize( + size_t block_size, CompressionType compress_type, + const CompressionOptions& compress_opt) { + Options opts; + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + opts.statistics->set_stats_level(StatsLevel::kAll); + const ImmutableOptions imoptions(opts); + const ColumnFamilyOptions cfo(opts); + const MutableCFOptions moptions(cfo); + ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); + 
IntTblPropCollectorFactories block_based_table_factories; + + std::string column_family_name; + int unknown_level = -1; + TableBuilderOptions tb_opts( + imoptions, moptions, ikc, &block_based_table_factories, compress_type, + compress_opt, + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, unknown_level); + uint64_t num_data_blocks = 0; + std::chrono::steady_clock::time_point start = + std::chrono::steady_clock::now(); + uint64_t file_size; + Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks, + &file_size); + if (!s.ok()) { + return s; + } + + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + fprintf(stdout, " Size: %10" PRIu64, file_size); + fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks); + fprintf(stdout, " Time Taken: %10s microsecs", + std::to_string( + std::chrono::duration_cast(end - start) + .count()) + .c_str()); + const uint64_t compressed_blocks = + opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED); + const uint64_t not_compressed_blocks = + opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED); + // When the option enable_index_compression is true, + // NUMBER_BLOCK_COMPRESSED is incremented for index block(s). + if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) { + num_data_blocks = compressed_blocks + not_compressed_blocks; + } + + const uint64_t ratio_not_compressed_blocks = + (num_data_blocks - compressed_blocks) - not_compressed_blocks; + const double compressed_pcnt = + (0 == num_data_blocks) ? 0.0 + : ((static_cast(compressed_blocks) / + static_cast(num_data_blocks)) * + 100.0); + const double ratio_not_compressed_pcnt = + (0 == num_data_blocks) + ? 0.0 + : ((static_cast(ratio_not_compressed_blocks) / + static_cast(num_data_blocks)) * + 100.0); + const double not_compressed_pcnt = + (0 == num_data_blocks) ? 
0.0 + : ((static_cast(not_compressed_blocks) / + static_cast(num_data_blocks)) * + 100.0); + fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks, + compressed_pcnt); + fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)", + ratio_not_compressed_blocks, ratio_not_compressed_pcnt); + fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n", + not_compressed_blocks, not_compressed_pcnt); + return Status::OK(); +} + +// Reads TableProperties prior to opening table reader in order to set up +// options. +Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, + RandomAccessFileReader* file, + uint64_t file_size, + FilePrefetchBuffer* prefetch_buffer) { + Status s = ROCKSDB_NAMESPACE::ReadTableProperties( + file, file_size, table_magic_number, ioptions_, &table_properties_, + /* memory_allocator= */ nullptr, prefetch_buffer); + if (!s.ok()) { + if (!silent_) { + fprintf(stdout, "Not able to read table properties\n"); + } + } + return s; +} + +Status SstFileDumper::SetTableOptionsByMagicNumber( + uint64_t table_magic_number) { + assert(table_properties_); + if (table_magic_number == kBlockBasedTableMagicNumber || + table_magic_number == kLegacyBlockBasedTableMagicNumber) { + BlockBasedTableFactory* bbtf = new BlockBasedTableFactory(); + // To force tail prefetching, we fake reporting two useful reads of 512KB + // from the tail. + // It needs at least two data points to warm up the stats. 
+ bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); + bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); + + options_.table_factory.reset(bbtf); + if (!silent_) { + fprintf(stdout, "Sst file format: block-based\n"); + } + + auto& props = table_properties_->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + auto index_type_on_file = static_cast( + DecodeFixed32(pos->second.c_str())); + if (index_type_on_file == + BlockBasedTableOptions::IndexType::kHashSearch) { + options_.prefix_extractor.reset(NewNoopTransform()); + } + } + } else if (table_magic_number == kPlainTableMagicNumber || + table_magic_number == kLegacyPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 1; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + plain_table_options.full_scan_mode = true; + + options_.table_factory.reset(NewPlainTableFactory(plain_table_options)); + if (!silent_) { + fprintf(stdout, "Sst file format: plain table\n"); + } + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %lx", + (long)table_magic_number); + return Status::InvalidArgument(error_msg_buffer); + } + + return Status::OK(); +} + +Status SstFileDumper::SetOldTableOptions() { + assert(table_properties_ == nullptr); + options_.table_factory = std::make_shared(); + if (!silent_) { + fprintf(stdout, "Sst file format: block-based(old version)\n"); + } + + return Status::OK(); +} + +Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, + bool has_from, const std::string& from_key, + bool has_to, const std::string& to_key, + bool 
use_from_as_prefix) { + if (!table_reader_) { + return init_result_; + } + + InternalIterator* iter = table_reader_->NewIterator( + read_options_, moptions_.prefix_extractor.get(), + /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kSSTDumpTool); + uint64_t i = 0; + if (has_from) { + InternalKey ikey; + ikey.SetMinPossibleForUserKey(from_key); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) break; + + ParsedInternalKey ikey; + Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */); + if (!pik_status.ok()) { + std::cerr << pik_status.getState() << "\n"; + continue; + } + + // the key returned is not prefixed with out 'from' key + if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) { + break; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } else { + BlobIndex blob_index; + + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + fprintf(stderr, "%s => error decoding blob index\n", + ikey.DebugString(true, output_hex_).c_str()); + continue; + } + + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), + blob_index.DebugString(output_hex_).c_str()); + } + } + } + + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} + +// Provides TableProperties to API user +Status SstFileDumper::ReadTableProperties( + std::shared_ptr* table_properties) { + if (!table_reader_) { + return init_result_; + } + + *table_properties = table_reader_->GetTableProperties(); + return init_result_; +} +} // namespace ROCKSDB_NAMESPACE + 
+#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,97 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" + +namespace ROCKSDB_NAMESPACE { + +class SstFileDumper { + public: + explicit SstFileDumper(const Options& options, const std::string& file_name, + size_t readahead_size, bool verify_checksum, + bool output_hex, bool decode_blob_index, + const EnvOptions& soptions = EnvOptions(), + bool silent = false); + + Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from, + const std::string& from_key, bool has_to, + const std::string& to_key, + bool use_from_as_prefix = false); + + Status ReadTableProperties( + std::shared_ptr* table_properties); + uint64_t GetReadNumber() { return read_num_; } + TableProperties* GetInitTableProperties() { return table_properties_.get(); } + + Status VerifyChecksum(); + Status DumpTable(const std::string& out_filename); + Status getStatus() { return init_result_; } + + Status ShowAllCompressionSizes( + size_t block_size, + const std::vector>& + compression_types, + int32_t compress_level_from, int32_t compress_level_to, + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes); + + Status ShowCompressionSize(size_t block_size, CompressionType compress_type, + const CompressionOptions& 
compress_opt); + + private: + // Get the TableReader implementation for the sst file + Status GetTableReader(const std::string& file_path); + Status ReadTableProperties(uint64_t table_magic_number, + RandomAccessFileReader* file, uint64_t file_size, + FilePrefetchBuffer* prefetch_buffer); + + Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options, + size_t block_size, + uint64_t* num_data_blocks, + uint64_t* compressed_table_size); + + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); + Status SetOldTableOptions(); + + // Helper function to call the factory with settings specific to the + // factory implementation + Status NewTableReader(const ImmutableOptions& ioptions, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_size, + std::unique_ptr* table_reader); + + std::string file_name_; + uint64_t read_num_; + bool output_hex_; + bool decode_blob_index_; + EnvOptions soptions_; + // less verbose in stdout/stderr + bool silent_; + + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + + Status init_result_; + std::unique_ptr table_reader_; + std::unique_ptr file_; + + const ImmutableOptions ioptions_; + const MutableCFOptions moptions_; + ReadOptions read_options_; + InternalKeyComparator internal_comparator_; + std::unique_ptr table_properties_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,11 +7,13 @@ #include "rocksdb/sst_file_reader.h" +#include "db/arena_wrapped_db_iter.h" #include "db/db_iter.h" #include 
"db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "table/get_context.h" #include "table/table_builder.h" #include "table/table_reader.h" @@ -21,7 +23,7 @@ struct SstFileReader::Rep { Options options; EnvOptions soptions; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions moptions; std::unique_ptr table_reader; @@ -41,18 +43,20 @@ auto r = rep_.get(); Status s; uint64_t file_size = 0; - std::unique_ptr file; + std::unique_ptr file; std::unique_ptr file_reader; - s = r->options.env->GetFileSize(file_path, &file_size); + FileOptions fopts(r->soptions); + const auto& fs = r->options.env->GetFileSystem(); + + s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr); if (s.ok()) { - s = r->options.env->NewRandomAccessFile(file_path, &file, r->soptions); + s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); } if (s.ok()) { - file_reader.reset(new RandomAccessFileReader( - NewLegacyRandomAccessFileWrapper(file), file_path)); + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); } if (s.ok()) { - TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor.get(), + TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor, r->soptions, r->ioptions.internal_comparator); // Allow open file with global sequence number for backward compatibility. t_opt.largest_seqno = kMaxSequenceNumber; @@ -62,18 +66,24 @@ return s; } -Iterator* SstFileReader::NewIterator(const ReadOptions& options) { +Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { auto r = rep_.get(); - auto sequence = options.snapshot != nullptr - ? options.snapshot->GetSequenceNumber() + auto sequence = roptions.snapshot != nullptr + ? 
roptions.snapshot->GetSequenceNumber() : kMaxSequenceNumber; + ArenaWrappedDBIter* res = new ArenaWrappedDBIter(); + res->Init(r->options.env, roptions, r->ioptions, r->moptions, + nullptr /* version */, sequence, + r->moptions.max_sequential_skip_in_iterations, + 0 /* version_number */, nullptr /* read_callback */, + nullptr /* db_impl */, nullptr /* cfd */, + true /* expose_blob_index */, false /* allow_refresh */); auto internal_iter = r->table_reader->NewIterator( - options, r->moptions.prefix_extractor.get(), /*arena=*/nullptr, - /*skip_filters=*/false, TableReaderCaller::kSSTFileReader); - return NewDBIterator(r->options.env, options, r->ioptions, r->moptions, - r->ioptions.user_comparator, internal_iter, sequence, - r->moptions.max_sequential_skip_in_iterations, - nullptr /* read_callback */); + res->GetReadOptions(), r->moptions.prefix_extractor.get(), + res->GetArena(), false /* skip_filters */, + TableReaderCaller::kSSTFileReader); + res->SetIterUnderDBIter(internal_iter); + return res; } std::shared_ptr SstFileReader::GetTableProperties() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,13 @@ #ifndef ROCKSDB_LITE +#include "rocksdb/sst_file_reader.h" + #include +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" -#include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" #include "test_util/testharness.h" @@ -34,11 +37,18 @@ SstFileReaderTest() { options_.merge_operator = MergeOperators::CreateUInt64AddOperator(); sst_name_ = test::PerThreadDBPath("sst_file"); + + Env* base_env = Env::Default(); + EXPECT_OK( + 
test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(nullptr, base_env); + env_ = base_env; + options_.env = env_; } ~SstFileReaderTest() { - Status s = Env::Default()->DeleteFile(sst_name_); - assert(s.ok()); + Status s = env_->DeleteFile(sst_name_); + EXPECT_OK(s); } void CreateFile(const std::string& file_name, @@ -76,6 +86,9 @@ if (check_global_seqno) { auto properties = reader.GetTableProperties(); ASSERT_TRUE(properties); + std::string hostname; + ASSERT_OK(env_->GetHostNameString(&hostname)); + ASSERT_EQ(properties->db_host_id, hostname); auto& user_properties = properties->user_collected_properties; ASSERT_TRUE( user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno)); @@ -91,6 +104,8 @@ Options options_; EnvOptions soptions_; std::string sst_name_; + std::shared_ptr env_guard_; + Env* env_; }; const uint64_t kNumKeys = 100; @@ -112,6 +127,31 @@ CreateFileAndCheck(keys); } +TEST_F(SstFileReaderTest, ReadOptionsOutOfScope) { + // Repro a bug where the SstFileReader depended on its configured ReadOptions + // outliving it. + options_.comparator = test::Uint64Comparator(); + std::vector keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFile(sst_name_, keys); + + SstFileReader reader(options_); + ASSERT_OK(reader.Open(sst_name_)); + std::unique_ptr iter; + { + // Make sure ReadOptions go out of scope ASAP so we know the iterator + // operations do not depend on it. 
+ ReadOptions ropts; + iter.reset(reader.NewIterator(ropts)); + } + iter->SeekToFirst(); + while (iter->Valid()) { + iter->Next(); + } +} + TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) { std::vector keys; for (uint64_t i = 0; i < kNumKeys; i++) { @@ -155,10 +195,230 @@ ASSERT_OK(DestroyDB(db_name, options)); } +TEST_F(SstFileReaderTest, TimestampSizeMismatch) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Comparator is not timestamp-aware; calls to APIs taking timestamps should + // fail. + ASSERT_NOK(writer.Put("key", EncodeAsUint64(100), "value")); + ASSERT_NOK(writer.Delete("another_key", EncodeAsUint64(200))); +} + +class SstFileReaderTimestampTest : public testing::Test { + public: + SstFileReaderTimestampTest() { + Env* env = Env::Default(); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_)); + EXPECT_NE(nullptr, env); + + options_.env = env; + + options_.comparator = test::ComparatorWithU64Ts(); + + sst_name_ = test::PerThreadDBPath("sst_file_ts"); + } + + ~SstFileReaderTimestampTest() { + EXPECT_OK(options_.env->DeleteFile(sst_name_)); + } + + struct KeyValueDesc { + KeyValueDesc(std::string k, std::string ts, std::string v) + : key(std::move(k)), timestamp(std::move(ts)), value(std::move(v)) {} + + std::string key; + std::string timestamp; + std::string value; + }; + + struct InputKeyValueDesc : public KeyValueDesc { + InputKeyValueDesc(std::string k, std::string ts, std::string v, bool is_del, + bool use_contig_buf) + : KeyValueDesc(std::move(k), std::move(ts), std::move(v)), + is_delete(is_del), + use_contiguous_buffer(use_contig_buf) {} + + bool is_delete = false; + bool use_contiguous_buffer = false; + }; + + struct OutputKeyValueDesc : public KeyValueDesc { + OutputKeyValueDesc(std::string k, std::string ts, std::string v) + : KeyValueDesc(std::move(k), std::string(ts), std::string(v)) {} + }; + + void CreateFile(const std::vector& descs) { + SstFileWriter writer(soptions_, 
options_); + + ASSERT_OK(writer.Open(sst_name_)); + + for (const auto& desc : descs) { + if (desc.is_delete) { + if (desc.use_contiguous_buffer) { + std::string key_with_ts(desc.key + desc.timestamp); + ASSERT_OK(writer.Delete(Slice(key_with_ts.data(), desc.key.size()), + Slice(key_with_ts.data() + desc.key.size(), + desc.timestamp.size()))); + } else { + ASSERT_OK(writer.Delete(desc.key, desc.timestamp)); + } + } else { + if (desc.use_contiguous_buffer) { + std::string key_with_ts(desc.key + desc.timestamp); + ASSERT_OK(writer.Put(Slice(key_with_ts.data(), desc.key.size()), + Slice(key_with_ts.data() + desc.key.size(), + desc.timestamp.size()), + desc.value)); + } else { + ASSERT_OK(writer.Put(desc.key, desc.timestamp, desc.value)); + } + } + } + + ASSERT_OK(writer.Finish()); + } + + void CheckFile(const std::string& timestamp, + const std::vector& descs) { + SstFileReader reader(options_); + + ASSERT_OK(reader.Open(sst_name_)); + ASSERT_OK(reader.VerifyChecksum()); + + Slice ts_slice(timestamp); + + ReadOptions read_options; + read_options.timestamp = &ts_slice; + + std::unique_ptr iter(reader.NewIterator(read_options)); + iter->SeekToFirst(); + + for (const auto& desc : descs) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), desc.key); + ASSERT_EQ(iter->timestamp(), desc.timestamp); + ASSERT_EQ(iter->value(), desc.value); + + iter->Next(); + } + + ASSERT_FALSE(iter->Valid()); + } + + protected: + std::shared_ptr env_guard_; + Options options_; + EnvOptions soptions_; + std::string sst_name_; +}; + +TEST_F(SstFileReaderTimestampTest, Basic) { + std::vector input_descs; + + for (uint64_t k = 0; k < kNumKeys; k += 4) { + // A Put with key k, timestamp k that gets overwritten by a subsequent Put + // with timestamp (k + 1). Note that the comparator uses descending order + // for the timestamp part, so we add the later Put first. 
+ input_descs.emplace_back( + /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k + 1), + /* value */ EncodeAsString(k * 2), /* is_delete */ false, + /* use_contiguous_buffer */ false); + input_descs.emplace_back( + /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k), + /* value */ EncodeAsString(k * 3), /* is_delete */ false, + /* use_contiguous_buffer */ true); + + // A Put with key (k + 2), timestamp (k + 2) that gets cancelled out by a + // Delete with timestamp (k + 3). Note that the comparator uses descending + // order for the timestamp part, so we add the Delete first. + input_descs.emplace_back(/* key */ EncodeAsString(k + 2), + /* timestamp */ EncodeAsUint64(k + 3), + /* value */ std::string(), /* is_delete */ true, + /* use_contiguous_buffer */ (k % 8) == 0); + input_descs.emplace_back( + /* key */ EncodeAsString(k + 2), /* timestamp */ EncodeAsUint64(k + 2), + /* value */ EncodeAsString(k * 5), /* is_delete */ false, + /* use_contiguous_buffer */ (k % 8) != 0); + } + + CreateFile(input_descs); + + // Note: below, we check the results as of each timestamp in the range, + // updating the expected result as needed. 
+ std::vector output_descs; + + for (uint64_t ts = 0; ts < kNumKeys; ++ts) { + const uint64_t k = ts - (ts % 4); + + switch (ts % 4) { + case 0: // Initial Put for key k + output_descs.emplace_back(/* key */ EncodeAsString(k), + /* timestamp */ EncodeAsUint64(ts), + /* value */ EncodeAsString(k * 3)); + break; + + case 1: // Second Put for key k + assert(output_descs.back().key == EncodeAsString(k)); + assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1)); + assert(output_descs.back().value == EncodeAsString(k * 3)); + output_descs.back().timestamp = EncodeAsUint64(ts); + output_descs.back().value = EncodeAsString(k * 2); + break; + + case 2: // Put for key (k + 2) + output_descs.emplace_back(/* key */ EncodeAsString(k + 2), + /* timestamp */ EncodeAsUint64(ts), + /* value */ EncodeAsString(k * 5)); + break; + + case 3: // Delete for key (k + 2) + assert(output_descs.back().key == EncodeAsString(k + 2)); + assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1)); + assert(output_descs.back().value == EncodeAsString(k * 5)); + output_descs.pop_back(); + break; + } + + CheckFile(EncodeAsUint64(ts), output_descs); + } +} + +TEST_F(SstFileReaderTimestampTest, TimestampsOutOfOrder) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Note: KVs that have the same user key disregarding timestamps should be in + // descending order of timestamps. + ASSERT_OK(writer.Put("key", EncodeAsUint64(1), "value1")); + ASSERT_NOK(writer.Put("key", EncodeAsUint64(2), "value2")); +} + +TEST_F(SstFileReaderTimestampTest, TimestampSizeMismatch) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Comparator expects 64-bit timestamps; timestamps with other sizes as well + // as calls to the timestamp-less APIs should be rejected. 
+ ASSERT_NOK(writer.Put("key", "not_an_actual_64_bit_timestamp", "value")); + ASSERT_NOK(writer.Delete("another_key", "timestamp_of_unexpected_size")); + + ASSERT_NOK(writer.Put("key_without_timestamp", "value")); + ASSERT_NOK(writer.Merge("another_key_missing_a_timestamp", "merge_operand")); + ASSERT_NOK(writer.Delete("yet_another_key_still_no_timestamp")); + ASSERT_NOK(writer.DeleteRange("begin_key_timestamp_absent", + "end_key_with_a_complete_lack_of_timestamps")); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,9 +7,10 @@ #include +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/writable_file_writer.h" +#include "rocksdb/file_system.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" @@ -29,7 +30,8 @@ struct SstFileWriter::Rep { Rep(const EnvOptions& _env_options, const Options& options, Env::IOPriority _io_priority, const Comparator* _user_comparator, - ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters) + ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters, + std::string _db_session_id) : env_options(_env_options), ioptions(options), mutable_cf_options(options), @@ -37,13 +39,13 @@ internal_comparator(_user_comparator), cfh(_cfh), invalidate_page_cache(_invalidate_page_cache), - last_fadvise_size(0), - skip_filters(_skip_filters) {} + 
skip_filters(_skip_filters), + db_session_id(_db_session_id) {} std::unique_ptr file_writer; std::unique_ptr builder; EnvOptions env_options; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions mutable_cf_options; Env::IOPriority io_priority; InternalKeyComparator internal_comparator; @@ -56,10 +58,13 @@ bool invalidate_page_cache; // The size of the file during the last time we called Fadvise to remove // cached pages from page cache. - uint64_t last_fadvise_size; + uint64_t last_fadvise_size = 0; bool skip_filters; - Status Add(const Slice& user_key, const Slice& value, - const ValueType value_type) { + std::string db_session_id; + uint64_t next_file_number = 1; + + Status AddImpl(const Slice& user_key, const Slice& value, + ValueType value_type) { if (!builder) { return Status::InvalidArgument("File is not opened"); } @@ -75,23 +80,14 @@ } } - // TODO(tec) : For external SST files we could omit the seqno and type. - switch (value_type) { - case ValueType::kTypeValue: - ikey.Set(user_key, 0 /* Sequence Number */, - ValueType::kTypeValue /* Put */); - break; - case ValueType::kTypeMerge: - ikey.Set(user_key, 0 /* Sequence Number */, - ValueType::kTypeMerge /* Merge */); - break; - case ValueType::kTypeDeletion: - ikey.Set(user_key, 0 /* Sequence Number */, - ValueType::kTypeDeletion /* Delete */); - break; - default: - return Status::InvalidArgument("Value type is not supported"); - } + assert(value_type == kTypeValue || value_type == kTypeMerge || + value_type == kTypeDeletion || + value_type == kTypeDeletionWithTimestamp); + + constexpr SequenceNumber sequence_number = 0; + + ikey.Set(user_key, sequence_number, value_type); + builder->Add(ikey.Encode(), value); // update file info @@ -99,12 +95,46 @@ file_info.largest_key.assign(user_key.data(), user_key.size()); file_info.file_size = builder->FileSize(); - InvalidatePageCache(false /* closing */); - + InvalidatePageCache(false /* closing */).PermitUncheckedError(); return Status::OK(); 
} + Status Add(const Slice& user_key, const Slice& value, ValueType value_type) { + if (internal_comparator.timestamp_size() != 0) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + return AddImpl(user_key, value, value_type); + } + + Status Add(const Slice& user_key, const Slice& timestamp, const Slice& value, + ValueType value_type) { + const size_t timestamp_size = timestamp.size(); + + if (internal_comparator.timestamp_size() != timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + const size_t user_key_size = user_key.size(); + + if (user_key.data() + user_key_size == timestamp.data()) { + Slice user_key_with_ts(user_key.data(), user_key_size + timestamp_size); + return AddImpl(user_key_with_ts, value, value_type); + } + + std::string user_key_with_ts; + user_key_with_ts.reserve(user_key_size + timestamp_size); + user_key_with_ts.append(user_key.data(), user_key_size); + user_key_with_ts.append(timestamp.data(), timestamp_size); + + return AddImpl(user_key_with_ts, value, value_type); + } + Status DeleteRange(const Slice& begin_key, const Slice& end_key) { + if (internal_comparator.timestamp_size() != 0) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + if (!builder) { return Status::InvalidArgument("File is not opened"); } @@ -135,27 +165,32 @@ file_info.num_range_del_entries++; file_info.file_size = builder->FileSize(); - InvalidatePageCache(false /* closing */); - + InvalidatePageCache(false /* closing */).PermitUncheckedError(); return Status::OK(); } - void InvalidatePageCache(bool closing) { + Status InvalidatePageCache(bool closing) { + Status s = Status::OK(); if (invalidate_page_cache == false) { // Fadvise disabled - return; + return s; } uint64_t bytes_since_last_fadvise = builder->FileSize() - last_fadvise_size; if (bytes_since_last_fadvise > kFadviseTrigger || closing) { TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache", &(bytes_since_last_fadvise)); - // Tell 
the OS that we dont need this file in page cache - file_writer->InvalidateCache(0, 0); + // Tell the OS that we don't need this file in page cache + s = file_writer->InvalidateCache(0, 0); + if (s.IsNotSupported()) { + // NotSupported is fine as it could be a file type that doesn't use page + // cache. + s = Status::OK(); + } last_fadvise_size = builder->FileSize(); } + return s; } - }; SstFileWriter::SstFileWriter(const EnvOptions& env_options, @@ -165,7 +200,14 @@ bool invalidate_page_cache, Env::IOPriority io_priority, bool skip_filters) : rep_(new Rep(env_options, options, io_priority, user_comparator, - column_family, invalidate_page_cache, skip_filters)) { + column_family, invalidate_page_cache, skip_filters, + DBImpl::GenerateDbSessionId(options.env))) { + // SstFileWriter is used to create sst files that can be added to database + // later. Therefore, no real db_id and db_session_id are associated with it. + // Here we mimic the way db_session_id behaves by getting a db_session_id + // for each SstFileWriter, and (later below) assign unique file numbers + // in the table properties. The db_id is set to be "SST Writer" for clarity. 
+ rep_->file_info.file_size = 0; } @@ -180,8 +222,10 @@ Status SstFileWriter::Open(const std::string& file_path) { Rep* r = rep_.get(); Status s; - std::unique_ptr sst_file; - s = r->ioptions.env->NewWritableFile(file_path, &sst_file, r->env_options); + std::unique_ptr sst_file; + FileOptions cur_file_opts(r->env_options); + s = r->ioptions.env->GetFileSystem()->NewWritableFile( + file_path, cur_file_opts, &sst_file, nullptr); if (!s.ok()) { return s; } @@ -190,26 +234,24 @@ CompressionType compression_type; CompressionOptions compression_opts; - if (r->ioptions.bottommost_compression != kDisableCompressionOption) { - compression_type = r->ioptions.bottommost_compression; - if (r->ioptions.bottommost_compression_opts.enabled) { - compression_opts = r->ioptions.bottommost_compression_opts; + if (r->mutable_cf_options.bottommost_compression != + kDisableCompressionOption) { + compression_type = r->mutable_cf_options.bottommost_compression; + if (r->mutable_cf_options.bottommost_compression_opts.enabled) { + compression_opts = r->mutable_cf_options.bottommost_compression_opts; } else { - compression_opts = r->ioptions.compression_opts; + compression_opts = r->mutable_cf_options.compression_opts; } } else if (!r->ioptions.compression_per_level.empty()) { // Use the compression of the last level if we have per level compression compression_type = *(r->ioptions.compression_per_level.rbegin()); - compression_opts = r->ioptions.compression_opts; + compression_opts = r->mutable_cf_options.compression_opts; } else { compression_type = r->mutable_cf_options.compression; - compression_opts = r->ioptions.compression_opts; + compression_opts = r->mutable_cf_options.compression_opts; } - uint64_t sample_for_compression = - r->mutable_cf_options.sample_for_compression; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; // SstFileWriter properties collector to add SstFileWriter version. 
int_tbl_prop_collector_factories.emplace_back( @@ -236,21 +278,33 @@ r->column_family_name = ""; cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; } - TableBuilderOptions table_builder_options( r->ioptions, r->mutable_cf_options, r->internal_comparator, - &int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, r->skip_filters, - r->column_family_name, unknown_level); - r->file_writer.reset( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(sst_file)), - file_path, r->env_options, r->ioptions.env, - nullptr /* stats */, r->ioptions.listeners)); + &int_tbl_prop_collector_factories, compression_type, compression_opts, + cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, + TableFileCreationReason::kMisc, 0 /* creation_time */, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "SST Writer" /* db_id */, r->db_session_id, 0 /* target_file_size */, + r->next_file_number); + // External SST files used to each get a unique session id. Now for + // slightly better uniqueness probability in constructing cache keys, we + // assign fake file numbers to each file (into table properties) and keep + // the same session id for the life of the SstFileWriter. + r->next_file_number++; + // XXX: when we can remove skip_filters from the SstFileWriter public API + // we can remove it from TableBuilderOptions. + table_builder_options.skip_filters = r->skip_filters; + FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; + r->file_writer.reset(new WritableFileWriter( + std::move(sst_file), file_path, r->env_options, r->ioptions.clock, + nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, + r->ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. 
r->builder.reset(r->ioptions.table_factory->NewTableBuilder( - table_builder_options, cf_id, r->file_writer.get())); + table_builder_options, r->file_writer.get())); r->file_info = ExternalSstFileInfo(); r->file_info.file_path = file_path; @@ -266,6 +320,11 @@ return rep_->Add(user_key, value, ValueType::kTypeValue); } +Status SstFileWriter::Put(const Slice& user_key, const Slice& timestamp, + const Slice& value) { + return rep_->Add(user_key, timestamp, value, ValueType::kTypeValue); +} + Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) { return rep_->Add(user_key, value, ValueType::kTypeMerge); } @@ -274,6 +333,11 @@ return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion); } +Status SstFileWriter::Delete(const Slice& user_key, const Slice& timestamp) { + return rep_->Add(user_key, timestamp, Slice(), + ValueType::kTypeDeletionWithTimestamp); +} + Status SstFileWriter::DeleteRange(const Slice& begin_key, const Slice& end_key) { return rep_->DeleteRange(begin_key, end_key); @@ -294,11 +358,16 @@ if (s.ok()) { s = r->file_writer->Sync(r->ioptions.use_fsync); - r->InvalidatePageCache(true /* closing */); + r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); if (s.ok()) { s = r->file_writer->Close(); } } + if (s.ok()) { + r->file_info.file_checksum = r->file_writer->GetFileChecksum(); + r->file_info.file_checksum_func_name = + r->file_writer->GetFileChecksumFuncName(); + } if (!s.ok()) { r->ioptions.env->DeleteFile(r->file_info.file_path); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,9 +5,10 @@ #pragma once #include -#include "db/dbformat.h" + #include 
"db/table_properties_collector.h" #include "rocksdb/types.h" +#include "util/coding.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -35,9 +36,9 @@ return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. return; @@ -78,7 +79,7 @@ : version_(version), global_seqno_(global_seqno) {} virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t /*column_family_id*/) override { + uint32_t /*column_family_id*/, int /* level_at_creation */) override { return new SstFileWriterPropertiesCollector(version_, global_seqno_); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,11 @@ #pragma once #include + #include #include #include + #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "file/writable_file_writer.h" @@ -28,92 +30,133 @@ struct TableReaderOptions { // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, - const SliceTransform* _prefix_extractor, - const EnvOptions& _env_options, - const InternalKeyComparator& _internal_comparator, - bool _skip_filters = false, bool _immortal = false, - int _level = -1, - BlockCacheTracer* const _block_cache_tracer = nullptr) - : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, - _internal_comparator, _skip_filters, _immortal, - _level, 0 /* _largest_seqno 
*/, - _block_cache_tracer) {} + TableReaderOptions( + const ImmutableOptions& _ioptions, + const std::shared_ptr& _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters = false, bool _immortal = false, + bool _force_direct_prefetch = false, int _level = -1, + BlockCacheTracer* const _block_cache_tracer = nullptr, + size_t _max_file_size_for_l0_meta_pin = 0, + const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0) + : TableReaderOptions( + _ioptions, _prefix_extractor, _env_options, _internal_comparator, + _skip_filters, _immortal, _force_direct_prefetch, _level, + 0 /* _largest_seqno */, _block_cache_tracer, + _max_file_size_for_l0_meta_pin, _cur_db_session_id, _cur_file_num) { + } // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, - const SliceTransform* _prefix_extractor, - const EnvOptions& _env_options, - const InternalKeyComparator& _internal_comparator, - bool _skip_filters, bool _immortal, int _level, - SequenceNumber _largest_seqno, - BlockCacheTracer* const _block_cache_tracer) + TableReaderOptions( + const ImmutableOptions& _ioptions, + const std::shared_ptr& _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, bool _skip_filters, + bool _immortal, bool _force_direct_prefetch, int _level, + SequenceNumber _largest_seqno, + BlockCacheTracer* const _block_cache_tracer, + size_t _max_file_size_for_l0_meta_pin, + const std::string& _cur_db_session_id, uint64_t _cur_file_num) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), internal_comparator(_internal_comparator), skip_filters(_skip_filters), immortal(_immortal), + force_direct_prefetch(_force_direct_prefetch), level(_level), largest_seqno(_largest_seqno), - block_cache_tracer(_block_cache_tracer) {} + block_cache_tracer(_block_cache_tracer), + 
max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin), + cur_db_session_id(_cur_db_session_id), + cur_file_num(_cur_file_num) {} - const ImmutableCFOptions& ioptions; - const SliceTransform* prefix_extractor; + const ImmutableOptions& ioptions; + const std::shared_ptr& prefix_extractor; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; // This is only used for BlockBasedTable (reader) bool skip_filters; // Whether the table will be valid as long as the DB is open bool immortal; - // what level this table/file is on, -1 for "not set, don't know" + // When data prefetching is needed, even if direct I/O is off, read data to + // fetch into RocksDB's buffer, rather than relying + // RandomAccessFile::Prefetch(). + bool force_direct_prefetch; + // What level this table/file is on, -1 for "not set, don't know." Used + // for level-specific statistics. int level; // largest seqno in the table SequenceNumber largest_seqno; BlockCacheTracer* const block_cache_tracer; + // Largest L0 file size whose meta-blocks may be pinned (can be zero when + // unknown). 
+ const size_t max_file_size_for_l0_meta_pin; + + std::string cur_db_session_id; + + uint64_t cur_file_num; }; struct TableBuilderOptions { TableBuilderOptions( - const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, const InternalKeyComparator& _internal_comparator, - const std::vector>* - _int_tbl_prop_collector_factories, - CompressionType _compression_type, uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, bool _skip_filters, + const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, - const uint64_t _target_file_size = 0, - const uint64_t _file_creation_time = 0) + const uint64_t _file_creation_time = 0, const std::string& _db_id = "", + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) : ioptions(_ioptions), moptions(_moptions), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), compression_type(_compression_type), - sample_for_compression(_sample_for_compression), compression_opts(_compression_opts), - skip_filters(_skip_filters), + column_family_id(_column_family_id), column_family_name(_column_family_name), - level(_level), creation_time(_creation_time), oldest_key_time(_oldest_key_time), target_file_size(_target_file_size), - file_creation_time(_file_creation_time) {} - const ImmutableCFOptions& ioptions; + file_creation_time(_file_creation_time), + db_id(_db_id), + db_session_id(_db_session_id), + level_at_creation(_level), + is_bottommost(_is_bottommost), + 
reason(_reason), + cur_file_num(_cur_file_num) {} + + const ImmutableOptions& ioptions; const MutableCFOptions& moptions; const InternalKeyComparator& internal_comparator; - const std::vector>* - int_tbl_prop_collector_factories; - CompressionType compression_type; - uint64_t sample_for_compression; + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories; + const CompressionType compression_type; const CompressionOptions& compression_opts; - bool skip_filters; // only used by BlockBasedTableBuilder + const uint32_t column_family_id; const std::string& column_family_name; - int level; // what level this table/file is on, -1 for "not set, don't know" const uint64_t creation_time; const int64_t oldest_key_time; const uint64_t target_file_size; const uint64_t file_creation_time; + const std::string db_id; + const std::string db_session_id; + // BEGIN for FilterBuildingContext + const int level_at_creation; + const bool is_bottommost; + const TableFileCreationReason reason; + // END for FilterBuildingContext + + // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you + // want to skip filters, that should be (for example) null filter_policy + // in the table options of the ioptions.table_factory + bool skip_filters = false; + const uint64_t cur_file_num; }; // TableBuilder provides the interface used to build a Table @@ -136,6 +179,9 @@ // Return non-ok iff some error has been detected. virtual Status status() const = 0; + // Return non-ok iff some error happens during IO. + virtual IOStatus io_status() const = 0; + // Finish building the table. // REQUIRES: Finish(), Abandon() have not been called virtual Status Finish() = 0; @@ -149,10 +195,21 @@ // Number of calls to Add() so far. virtual uint64_t NumEntries() const = 0; + // Whether the output file is completely empty. It has neither entries + // or tombstones. 
+ virtual bool IsEmpty() const { + return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0; + } + // Size of the file generated so far. If invoked after a successful // Finish() call, returns the size of the final generated file. virtual uint64_t FileSize() const = 0; + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. + virtual uint64_t EstimatedFileSize() const { return FileSize(); } + // If the user defined table properties collector suggest the file to // be further compacted. virtual bool NeedCompact() const { return false; } @@ -161,7 +218,7 @@ virtual TableProperties GetTableProperties() const = 0; // Return file checksum - virtual const std::string& GetFileChecksum() const = 0; + virtual std::string GetFileChecksum() const = 0; // Return file checksum function name virtual const char* GetFileChecksumFuncName() const = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_factory.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/plain/plain_table_factory.h" + +namespace ROCKSDB_NAMESPACE { + +static void RegisterTableFactories(const std::string& /*arg*/) { +#ifndef ROCKSDB_LITE + static std::once_flag loaded; + std::call_once(loaded, []() { + auto library = ObjectLibrary::Default(); + library->AddFactory( + TableFactory::kBlockBasedTableName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new BlockBasedTableFactory()); + return guard->get(); + }); + library->AddFactory( + TableFactory::kPlainTableName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new PlainTableFactory()); + return guard->get(); + }); + library->AddFactory( + TableFactory::kCuckooTableName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CuckooTableFactory()); + return guard->get(); + }); + }); +#endif // ROCKSDB_LITE +} + +static bool LoadFactory(const std::string& name, + std::shared_ptr* factory) { + if (name == TableFactory::kBlockBasedTableName()) { + factory->reset(new BlockBasedTableFactory()); + return true; + } else { + return false; + } +} + +Status TableFactory::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* factory) { + RegisterTableFactories(""); + return LoadSharedObject(config_options, value, LoadFactory, + factory); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties.cc 2025-01-30 
11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,10 @@ #include "port/port.h" #include "rocksdb/env.h" -#include "rocksdb/iterator.h" -#include "table/block_based/block.h" -#include "table/internal_iterator.h" +#include "rocksdb/unique_id.h" #include "table/table_properties_internal.h" +#include "table/unique_id_impl.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -42,31 +42,6 @@ props, key, ToString(value), prop_delim, kv_delim ); } - - // Seek to the specified meta block. - // Return true if it successfully seeks to that block. - Status SeekToMetaBlock(InternalIterator* meta_iter, - const std::string& block_name, bool* is_found, - BlockHandle* block_handle = nullptr) { - if (block_handle != nullptr) { - *block_handle = BlockHandle::NullBlockHandle(); - } - *is_found = true; - meta_iter->Seek(block_name); - if (meta_iter->status().ok()) { - if (meta_iter->Valid() && meta_iter->key() == block_name) { - *is_found = true; - if (block_handle) { - Slice v = meta_iter->value(); - return block_handle->DecodeFrom(&v); - } - } else { - *is_found = false; - return Status::OK(); - } - } - return meta_iter->status(); - } } std::string TableProperties::ToString( @@ -111,6 +86,8 @@ } AppendProperty(result, "filter block size", filter_size, prop_delim, kv_delim); + AppendProperty(result, "# entries for filter", num_filter_entries, prop_delim, + kv_delim); AppendProperty(result, "(estimated) table size", data_size + index_size + filter_size, prop_delim, kv_delim); @@ -168,6 +145,26 @@ AppendProperty(result, "file creation time", file_creation_time, prop_delim, kv_delim); + AppendProperty(result, "slow compression estimated data size", + slow_compression_estimated_data_size, prop_delim, kv_delim); + AppendProperty(result, "fast compression estimated data size", + fast_compression_estimated_data_size, prop_delim, kv_delim); + + // DB identity 
and DB session ID + AppendProperty(result, "DB identity", db_id, prop_delim, kv_delim); + AppendProperty(result, "DB session identity", db_session_id, prop_delim, + kv_delim); + AppendProperty(result, "DB host id", db_host_id, prop_delim, kv_delim); + AppendProperty(result, "original file number", orig_file_number, prop_delim, + kv_delim); + + // Unique ID, when available + std::string id; + Status s = GetUniqueIdFromTableProperties(*this, &id); + AppendProperty(result, "unique ID", + s.ok() ? UniqueIdToHumanString(id) : "N/A", prop_delim, + kv_delim); + return result; } @@ -183,11 +180,46 @@ raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_filter_entries += tp.num_filter_entries; num_deletions += tp.num_deletions; num_merge_operands += tp.num_merge_operands; num_range_deletions += tp.num_range_deletions; + slow_compression_estimated_data_size += + tp.slow_compression_estimated_data_size; + fast_compression_estimated_data_size += + tp.fast_compression_estimated_data_size; } +std::map +TableProperties::GetAggregatablePropertiesAsMap() const { + std::map rv; + rv["data_size"] = data_size; + rv["index_size"] = index_size; + rv["index_partitions"] = index_partitions; + rv["top_level_index_size"] = top_level_index_size; + rv["filter_size"] = filter_size; + rv["raw_key_size"] = raw_key_size; + rv["raw_value_size"] = raw_value_size; + rv["num_data_blocks"] = num_data_blocks; + rv["num_entries"] = num_entries; + rv["num_filter_entries"] = num_filter_entries; + rv["num_deletions"] = num_deletions; + rv["num_merge_operands"] = num_merge_operands; + rv["num_range_deletions"] = num_range_deletions; + rv["slow_compression_estimated_data_size"] = + slow_compression_estimated_data_size; + rv["fast_compression_estimated_data_size"] = + fast_compression_estimated_data_size; + return rv; +} + +const std::string TablePropertiesNames::kDbId = "rocksdb.creating.db.identity"; +const std::string 
TablePropertiesNames::kDbSessionId = + "rocksdb.creating.session.identity"; +const std::string TablePropertiesNames::kDbHostId = + "rocksdb.creating.host.identity"; +const std::string TablePropertiesNames::kOriginalFileNumber = + "rocksdb.original.file.number"; const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size"; const std::string TablePropertiesNames::kIndexSize = @@ -210,6 +242,8 @@ "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumFilterEntries = + "rocksdb.num.filter_entries"; const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; const std::string TablePropertiesNames::kMergeOperands = "rocksdb.merge.operands"; @@ -240,33 +274,32 @@ "rocksdb.oldest.key.time"; const std::string TablePropertiesNames::kFileCreationTime = "rocksdb.file.creation.time"; +const std::string TablePropertiesNames::kSlowCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.slow.data.size"; +const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.fast.data.size"; + +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props) { + Random* r = Random::GetTLSInstance(); + // For now, TableProperties is composed of a number of uint64_t followed by + // a number of std::string, followed by some extras starting with + // user_collected_properties. 
+ uint64_t* pu = &props->orig_file_number; + assert(static_cast(pu) == static_cast(props)); + std::string* ps = &props->db_id; + const uint64_t* const pu_end = reinterpret_cast(ps); + const std::string* const ps_end = + reinterpret_cast(&props->user_collected_properties); -extern const std::string kPropertiesBlock = "rocksdb.properties"; -// Old property block name for backward compatibility -extern const std::string kPropertiesBlockOldName = "rocksdb.stats"; -extern const std::string kCompressionDictBlock = "rocksdb.compression_dict"; -extern const std::string kRangeDelBlock = "rocksdb.range_del"; - -// Seek to the properties block. -// Return true if it successfully seeks to the properties block. -Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) { - Status status = SeekToMetaBlock(meta_iter, kPropertiesBlock, is_found); - if (!*is_found && status.ok()) { - status = SeekToMetaBlock(meta_iter, kPropertiesBlockOldName, is_found); + for (; pu < pu_end; ++pu) { + *pu = r->Next64(); + } + assert(static_cast(pu) == static_cast(ps)); + for (; ps < ps_end; ++ps) { + *ps = r->RandomBinaryString(13); } - return status; -} - -// Seek to the compression dictionary block. -// Return true if it successfully seeks to that block. 
-Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle) { - return SeekToMetaBlock(meta_iter, kCompressionDictBlock, is_found, block_handle); -} - -Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle = nullptr) { - return SeekToMetaBlock(meta_iter, kRangeDelBlock, is_found, block_handle); } +#endif } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties_internal.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties_internal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties_internal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties_internal.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,26 +5,10 @@ #pragma once -#include "rocksdb/status.h" -#include "rocksdb/iterator.h" +#include "rocksdb/table_properties.h" namespace ROCKSDB_NAMESPACE { - -class BlockHandle; - -// Seek to the properties block. -// If it successfully seeks to the properties block, "is_found" will be -// set to true. -Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found); - -// Seek to the compression dictionary block. -// If it successfully seeks to the properties block, "is_found" will be -// set to true. 
-Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle); - -// TODO(andrewkr) should not put all meta block in table_properties.h/cc -Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle); - +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props); +#endif } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -39,6 +39,8 @@ // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). + // + // read_options: Must outlive the returned iterator. // arena: If not null, the arena needs to be used to allocate the Iterator. // When destroying the iterator, the caller will not call "delete" // but Iterator::~Iterator() directly. The destructor needs to destroy @@ -48,9 +50,10 @@ // compaction_readahead_size: its value will only be used if caller = // kCompaction virtual InternalIterator* NewIterator( - const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena, - bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) = 0; + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) = 0; virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { @@ -63,12 +66,19 @@ // bytes, and so includes effects like compression of the underlying data. 
// E.g., the approximate offset of the last key in the table will // be close to the file length. + // TODO(peterd): Since this function is only used for approximate size + // from beginning of file, reduce code duplication by removing this + // function and letting ApproximateSize take optional start and end, so + // that absolute start and end can be specified and optimized without + // key / index work. virtual uint64_t ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) = 0; // Given start and end keys, return the approximate data size in the file // between the keys. The returned value is in terms of file bytes, and so - // includes effects like compression of the underlying data. + // includes effects like compression of the underlying data and applicable + // portions of metadata including filters and indexes. Nullptr for start or + // end (or both) indicates absolute start or end of the table. virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, TableReaderCaller caller) = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,11 +13,12 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" @@ -50,8 +51,8 @@ return key.Encode().ToString(); } -uint64_t Now(Env* env, bool measured_by_nanosecond) { - return measured_by_nanosecond ? 
env->NowNanos() : env->NowMicros(); +uint64_t Now(SystemClock* clock, bool measured_by_nanosecond) { + return measured_by_nanosecond ? clock->NowNanos() : clock->NowMicros(); } } // namespace @@ -81,30 +82,28 @@ std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db"); WriteOptions wo; Env* env = Env::Default(); + auto* clock = env->GetSystemClock().get(); TableBuilder* tb = nullptr; DB* db = nullptr; Status s; - const ImmutableCFOptions ioptions(opts); + const ImmutableOptions ioptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); std::unique_ptr file_writer; if (!through_db) { - std::unique_ptr file; - env->NewWritableFile(file_name, &file, env_options); + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), file_name, + FileOptions(env_options), &file_writer, + nullptr)); - std::vector > - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; - file_writer.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), file_name, env_options)); int unknown_level = -1; tb = opts.table_factory->NewTableBuilder( TableBuilderOptions( ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, unknown_level), - 0 /* column_family_id */, file_writer.get()); + CompressionType::kNoCompression, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level), + file_writer.get()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -130,20 +129,22 @@ std::unique_ptr table_reader; if (!through_db) { - std::unique_ptr raf; - s = env->NewRandomAccessFile(file_name, &raf, env_options); + const auto& fs = env->GetFileSystem(); + FileOptions fopts(env_options); + + std::unique_ptr raf; + s = fs->NewRandomAccessFile(file_name, fopts, &raf, nullptr); if (!s.ok()) { fprintf(stderr, 
"Create File Error: %s\n", s.ToString().c_str()); exit(1); } uint64_t file_size; - env->GetFileSize(file_name, &file_size); + fs->GetFileSize(file_name, fopts.io_options, &file_size, nullptr); std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(raf), - file_name)); + new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), - env_options, ikc), + TableReaderOptions(ioptions, moptions.prefix_extractor, env_options, + ikc), std::move(file_reader), file_size, &table_reader); if (!s.ok()) { fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str()); @@ -168,21 +169,21 @@ if (!for_iterator) { // Query one existing key; std::string key = MakeKey(r1, r2, through_db); - uint64_t start_time = Now(env, measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); if (!through_db) { PinnableSlice value; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - GetContext get_context(ioptions.user_comparator, - ioptions.merge_operator, ioptions.info_log, - ioptions.statistics, GetContext::kNotFound, - Slice(key), &value, nullptr, &merge_context, - true, &max_covering_tombstone_seq, env); + GetContext get_context( + ioptions.user_comparator, ioptions.merge_operator.get(), + ioptions.logger, ioptions.stats, GetContext::kNotFound, + Slice(key), &value, nullptr, &merge_context, true, + &max_covering_tombstone_seq, clock); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); } - hist.Add(Now(env, measured_by_nanosecond) - start_time); + hist.Add(Now(clock, measured_by_nanosecond) - start_time); } else { int r2_len; if (if_query_empty_keys) { @@ -196,7 +197,7 @@ std::string start_key = MakeKey(r1, r2, through_db); std::string end_key = MakeKey(r1, r2 + r2_len, through_db); uint64_t total_time = 0; - uint64_t start_time = Now(env, 
measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); Iterator* iter = nullptr; InternalIterator* iiter = nullptr; if (!through_db) { @@ -214,10 +215,10 @@ break; } // verify key; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; assert(Slice(MakeKey(r1, r2 + count, through_db)) == (through_db ? iter->key() : iiter->key())); - start_time = Now(env, measured_by_nanosecond); + start_time = Now(clock, measured_by_nanosecond); if (++count >= r2_len) { break; } @@ -229,7 +230,7 @@ assert(false); } delete iter; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; hist.Add(total_time); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_caller.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_caller.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_caller.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_caller.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ #pragma once +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // A list of callers for a table reader. It is used to trace the caller that // accesses on a block. This is only used for block cache tracing and analysis. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,33 +7,44 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "rocksdb/table.h" + +#include +#include #include + #include #include #include #include #include +#include #include -#include "block_fetcher.h" #include "cache/lru_cache.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" -#include "meta_blocks.h" #include "monitoring/statistics.h" +#include "options/options_helper.h" #include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/cache.h" +#include "rocksdb/compression_type.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/iterator.h" #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/unique_id.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_builder.h" @@ -41,19 +52,24 @@ #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" #include "table/block_based/flush_block_policy.h" +#include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" +#include "table/meta_blocks.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding_lean.h" #include "util/compression.h" #include "util/file_checksum_helper.h" #include "util/random.h" #include "util/string_util.h" +#include "utilities/memory_allocators.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -70,7 +86,7 @@ // DummyPropertiesCollector used 
to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { public: - const char* Name() const override { return ""; } + const char* Name() const override { return "DummyPropertiesCollector"; } Status Finish(UserCollectedProperties* /*properties*/) override { return Status::OK(); @@ -92,7 +108,9 @@ TablePropertiesCollectorFactory::Context /*context*/) override { return new DummyPropertiesCollector(); } - const char* Name() const override { return "DummyPropertiesCollector1"; } + const char* Name() const override { + return "DummyPropertiesCollectorFactory1"; + } }; class DummyPropertiesCollectorFactory2 @@ -102,7 +120,9 @@ TablePropertiesCollectorFactory::Context /*context*/) override { return new DummyPropertiesCollector(); } - const char* Name() const override { return "DummyPropertiesCollector2"; } + const char* Name() const override { + return "DummyPropertiesCollectorFactory2"; + } }; // Return reverse of "key". @@ -151,6 +171,9 @@ } } +const auto kUnknownColumnFamily = + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + } // namespace // Helper class for tests to unify the interface between @@ -168,12 +191,12 @@ // Finish constructing the data structure with all the keys that have // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" - void Finish(const Options& options, const ImmutableCFOptions& ioptions, + void Finish(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector* keys, stl_wrappers::KVMap* kvmap) { - last_internal_key_ = &internal_comparator; + last_internal_comparator_ = &internal_comparator; *kvmap = data_; keys->clear(); for (const auto& kv : data_) { @@ -187,7 +210,7 @@ // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, @@ -205,52 +228,12 @@ virtual bool AnywayDeleteIterator() const { return false; } protected: - const InternalKeyComparator* last_internal_key_; + const InternalKeyComparator* last_internal_comparator_; private: stl_wrappers::KVMap data_; }; -class BlockConstructor: public Constructor { - public: - explicit BlockConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp), - block_(nullptr) { } - ~BlockConstructor() override { delete block_; } - Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, - const MutableCFOptions& /*moptions*/, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& /*internal_comparator*/, - const stl_wrappers::KVMap& kv_map) override { - delete block_; - block_ = nullptr; - BlockBuilder builder(table_options.block_restart_interval); - - for (const auto kv : kv_map) { - builder.Add(kv.first, kv.second); - } - // Open the block - data_ = builder.Finish().ToString(); - BlockContents contents; - contents.data = data_; - block_ = new Block(std::move(contents), 
kDisableGlobalSequenceNumber); - return Status::OK(); - } - InternalIterator* NewIterator( - const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewDataIterator(comparator_, comparator_); - } - - private: - const Comparator* comparator_; - std::string data_; - Block* block_; - - BlockConstructor(); -}; - // A helper class that converts internal format keys into user keys class KeyConvertingIterator : public InternalIterator { public: @@ -281,14 +264,18 @@ void SeekToLast() override { iter_->SeekToLast(); } void Next() override { iter_->Next(); } void Prev() override { iter_->Prev(); } - bool IsOutOfBound() override { return iter_->IsOutOfBound(); } + IterBoundCheck UpperBoundCheckResult() override { + return iter_->UpperBoundCheckResult(); + } Slice key() const override { assert(Valid()); ParsedInternalKey parsed_key; - if (!ParseInternalKey(iter_->key(), &parsed_key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); + Status pik_status = + ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + status_ = pik_status; + return Slice(status_.getState()); } return parsed_key.user_key; } @@ -308,7 +295,56 @@ void operator=(const KeyConvertingIterator&); }; -class TableConstructor: public Constructor { +// `BlockConstructor` APIs always accept/return user keys. 
+class BlockConstructor : public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), comparator_(cmp), block_(nullptr) {} + ~BlockConstructor() override { delete block_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete block_; + block_ = nullptr; + BlockBuilder builder(table_options.block_restart_interval); + + for (const auto& kv : kv_map) { + // `DataBlockIter` assumes it reads only internal keys. `BlockConstructor` + // clients provide user keys, so we need to convert to internal key format + // before writing the data block. + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder.Add(encoded, kv.second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + block_ = new Block(std::move(contents)); + return Status::OK(); + } + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + // `DataBlockIter` returns the internal keys it reads. + // `KeyConvertingIterator` converts them to user keys before they are + // exposed to the `BlockConstructor` clients. 
+ return new KeyConvertingIterator( + block_->NewDataIterator(comparator_, kDisableGlobalSequenceNumber)); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +class TableConstructor : public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, @@ -321,18 +357,18 @@ } ~TableConstructor() override { Reset(); } - Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& internal_comparator, const stl_wrappers::KVMap& kv_map) override { Reset(); soptions.use_mmap_reads = ioptions.allow_mmap_reads; - file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), - "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + file_writer_.reset(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (largest_seqno_ != 0) { // Pretend that it's an external file written by SstFileWriter. 
@@ -345,13 +381,11 @@ builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level_), file_writer_.get())); - for (const auto kv : kv_map) { + for (const auto& kv : kv_map) { if (convert_to_internal_key_) { ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); std::string encoded; @@ -360,34 +394,25 @@ } else { builder->Add(kv.first, kv.second); } - EXPECT_TRUE(builder->status().ok()); + EXPECT_OK(builder->status()); } Status s = builder->Finish(); - file_writer_->Flush(); + EXPECT_OK(file_writer_->Flush()); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); - const bool kSkipFilters = true; - const bool kImmortal = true; - return ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, - internal_comparator, !kSkipFilters, !kImmortal, - level_, largest_seqno_, &block_cache_tracer_), - std::move(file_reader_), TEST_GetSink()->contents().size(), - &table_reader_); + + return Reopen(ioptions, moptions); } InternalIterator* NewIterator( const SliceTransform* prefix_extractor) const override { - ReadOptions ro; InternalIterator* iter = table_reader_->NewIterator( - ro, prefix_extractor, /*arena=*/nullptr, /*skip_filters=*/false, - TableReaderCaller::kUncategorized); + read_options_, prefix_extractor, /*arena=*/nullptr, + /*skip_filters=*/false, 
TableReaderCaller::kUncategorized); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -406,13 +431,18 @@ key, TableReaderCaller::kUncategorized); } - virtual Status Reopen(const ImmutableCFOptions& ioptions, + virtual Status Reopen(const ImmutableOptions& ioptions, const MutableCFOptions& moptions) { - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + std::unique_ptr source(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)); + + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); return ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, - *last_internal_key_), + TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, + *last_internal_comparator_, /*skip_filters*/ false, + /*immortal*/ false, false, level_, largest_seqno_, + &block_cache_tracer_, moptions.write_buffer_size, "", + uniq_id_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -428,8 +458,7 @@ bool ConvertToInternalKey() { return convert_to_internal_key_; } test::StringSink* TEST_GetSink() { - return ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); + return static_cast(file_writer_->writable_file()); } BlockCacheTracer block_cache_tracer_; @@ -442,6 +471,7 @@ file_reader_.reset(); } + const ReadOptions read_options_; uint64_t uniq_id_; std::unique_ptr file_writer_; std::unique_ptr file_reader_; @@ -466,27 +496,31 @@ write_buffer_manager_(wb), table_factory_(new SkipListFactory) { options_.memtable_factory = table_factory_; - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); memtable_ = new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_), wb, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); } ~MemTableConstructor() 
override { delete memtable_->Unref(); } - Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options&, const ImmutableOptions& ioptions, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, const stl_wrappers::KVMap& kv_map) override { delete memtable_->Unref(); - ImmutableCFOptions mem_ioptions(ioptions); + ImmutableOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, MutableCFOptions(options_), write_buffer_manager_, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); int seq = 1; - for (const auto kv : kv_map) { - memtable_->Add(seq, kTypeValue, kv.first, kv.second); + for (const auto& kv : kv_map) { + Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second, + nullptr /* kv_prot_info */); + if (!s.ok()) { + return s; + } seq++; } return Status::OK(); @@ -538,7 +572,7 @@ } ~DBConstructor() override { delete db_; } Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, + const ImmutableOptions& /*ioptions*/, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, @@ -546,9 +580,9 @@ delete db_; db_ = nullptr; NewDB(); - for (const auto kv : kv_map) { + for (const auto& kv : kv_map) { WriteBatch batch; - batch.Put(kv.first, kv.second); + EXPECT_OK(batch.Put(kv.first, kv.second)); EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok()); } return Status::OK(); @@ -598,10 +632,22 @@ bool reverse_compare; int restart_interval; CompressionType compression; + uint32_t compression_parallel_threads; uint32_t format_version; bool use_mmap; }; +std::ostream& operator<<(std::ostream& os, const TestArgs& args) { + os << "type: " << args.type << " reverse_compare: " << args.reverse_compare + << " restart_interval: " << args.restart_interval + << " compression: " << 
args.compression + << " compression_parallel_threads: " << args.compression_parallel_threads + << " format_version: " << args.format_version + << " use_mmap: " << args.use_mmap; + + return os; +} + static std::vector GenerateArgList() { std::vector test_args; std::vector test_types = { @@ -615,6 +661,7 @@ MEMTABLE_TEST, DB_TEST}; std::vector reverse_compare_types = {false, true}; std::vector restart_intervals = {16, 1, 1024}; + std::vector compression_parallel_threads = {1, 4}; // Only add compression if it is supported std::vector> compression_types; @@ -657,6 +704,8 @@ one_arg.reverse_compare = reverse_compare; one_arg.restart_interval = restart_intervals[0]; one_arg.compression = compression_types[0].first; + one_arg.compression_parallel_threads = 1; + one_arg.format_version = 0; one_arg.use_mmap = true; test_args.push_back(one_arg); one_arg.use_mmap = false; @@ -667,14 +716,17 @@ for (auto restart_interval : restart_intervals) { for (auto compression_type : compression_types) { - TestArgs one_arg; - one_arg.type = test_type; - one_arg.reverse_compare = reverse_compare; - one_arg.restart_interval = restart_interval; - one_arg.compression = compression_type.first; - one_arg.format_version = compression_type.second ? 2 : 1; - one_arg.use_mmap = false; - test_args.push_back(one_arg); + for (auto num_threads : compression_parallel_threads) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type.first; + one_arg.compression_parallel_threads = num_threads; + one_arg.format_version = compression_type.second ? 
2 : 1; + one_arg.use_mmap = false; + test_args.push_back(one_arg); + } } } } @@ -715,41 +767,38 @@ class HarnessTest : public testing::Test { public: - HarnessTest() - : ioptions_(options_), + explicit HarnessTest(const TestArgs& args) + : args_(args), + ioptions_(options_), moptions_(options_), - constructor_(nullptr), - write_buffer_(options_.db_write_buffer_size) {} - - void Init(const TestArgs& args) { - delete constructor_; - constructor_ = nullptr; - options_ = Options(); - options_.compression = args.compression; + write_buffer_(options_.db_write_buffer_size), + support_prev_(true), + only_support_prefix_seek_(false) { + options_.compression = args_.compression; + options_.compression_opts.parallel_threads = + args_.compression_parallel_threads; // Use shorter block size for tests to exercise block boundary // conditions more. - if (args.reverse_compare) { + if (args_.reverse_compare) { options_.comparator = &reverse_key_comparator; } internal_comparator_.reset( new test::PlainInternalKeyComparator(options_.comparator)); - support_prev_ = true; - only_support_prefix_seek_ = false; - options_.allow_mmap_reads = args.use_mmap; - switch (args.type) { + options_.allow_mmap_reads = args_.use_mmap; + switch (args_.type) { case BLOCK_BASED_TABLE_TEST: table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); table_options_.block_size = 256; - table_options_.block_restart_interval = args.restart_interval; - table_options_.index_block_restart_interval = args.restart_interval; - table_options_.format_version = args.format_version; + table_options_.block_restart_interval = args_.restart_interval; + table_options_.index_block_restart_interval = args_.restart_interval; + table_options_.format_version = args_.format_version; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + 
options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -760,8 +809,8 @@ only_support_prefix_seek_ = true; options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2)); options_.table_factory.reset(NewPlainTableFactory()); - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -770,8 +819,8 @@ only_support_prefix_seek_ = true; options_.prefix_extractor.reset(NewNoopTransform()); options_.table_factory.reset(NewPlainTableFactory()); - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -789,8 +838,8 @@ options_.table_factory.reset( NewPlainTableFactory(plain_table_options)); } - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -799,28 +848,26 @@ table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new BlockConstructor(options_.comparator); + constructor_.reset(new BlockConstructor(options_.comparator)); break; case MEMTABLE_TEST: table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new MemTableConstructor(options_.comparator, - &write_buffer_); + constructor_.reset( + new MemTableConstructor(options_.comparator, 
&write_buffer_)); break; case DB_TEST: table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new DBConstructor(options_.comparator); + constructor_.reset(new DBConstructor(options_.comparator)); break; } - ioptions_ = ImmutableCFOptions(options_); + ioptions_ = ImmutableOptions(options_); moptions_ = MutableCFOptions(options_); } - ~HarnessTest() override { delete constructor_; } - void Add(const std::string& key, const std::string& value) { constructor_->Add(key, value); } @@ -843,12 +890,15 @@ InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); iter->SeekToFirst(); + ASSERT_OK(iter->status()); for (stl_wrappers::KVMap::const_iterator model_iter = data.begin(); model_iter != data.end(); ++model_iter) { ASSERT_EQ(ToString(data, model_iter), ToString(iter)); iter->Next(); + ASSERT_OK(iter->status()); } ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { iter->~InternalIterator(); } else { @@ -861,12 +911,15 @@ InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); iter->SeekToLast(); + ASSERT_OK(iter->status()); for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin(); model_iter != data.rend(); ++model_iter) { ASSERT_EQ(ToString(data, model_iter), ToString(iter)); iter->Prev(); + ASSERT_OK(iter->status()); } ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { iter->~InternalIterator(); } else { @@ -888,6 +941,7 @@ if (iter->Valid()) { if (kVerbose) fprintf(stderr, "Next\n"); iter->Next(); + ASSERT_OK(iter->status()); ++model_iter; ASSERT_EQ(ToString(data, model_iter), ToString(iter)); } @@ -897,6 +951,7 @@ case 1: { if (kVerbose) fprintf(stderr, "SeekToFirst\n"); iter->SeekToFirst(); + ASSERT_OK(iter->status()); model_iter = data.begin(); 
ASSERT_EQ(ToString(data, model_iter), ToString(iter)); break; @@ -908,6 +963,7 @@ if (kVerbose) fprintf(stderr, "Seek '%s'\n", EscapeString(key).c_str()); iter->Seek(Slice(key)); + ASSERT_OK(iter->status()); ASSERT_EQ(ToString(data, model_iter), ToString(iter)); break; } @@ -916,6 +972,7 @@ if (iter->Valid()) { if (kVerbose) fprintf(stderr, "Prev\n"); iter->Prev(); + ASSERT_OK(iter->status()); if (model_iter == data.begin()) { model_iter = data.end(); // Wrap around to invalid value } else { @@ -929,6 +986,7 @@ case 4: { if (kVerbose) fprintf(stderr, "SeekToLast\n"); iter->SeekToLast(); + ASSERT_OK(iter->status()); if (keys.empty()) { model_iter = data.end(); } else { @@ -1006,40 +1064,37 @@ // Returns nullptr if not running against a DB DB* db() const { return constructor_->db(); } - void RandomizedHarnessTest(size_t part, size_t total) { - std::vector args = GenerateArgList(); - assert(part); - assert(part <= total); - for (size_t i = 0; i < args.size(); i++) { - if ((i % total) + 1 != part) { - continue; - } - Init(args[i]); - Random rnd(test::RandomSeed() + 5); - for (int num_entries = 0; num_entries < 2000; - num_entries += (num_entries < 50 ? 
1 : 200)) { - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - } - } - } - private: - Options options_ = Options(); - ImmutableCFOptions ioptions_; + TestArgs args_; + Options options_; + ImmutableOptions ioptions_; MutableCFOptions moptions_; - BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); - Constructor* constructor_; + BlockBasedTableOptions table_options_; + std::unique_ptr constructor_; WriteBufferManager write_buffer_; bool support_prev_; bool only_support_prefix_seek_; std::shared_ptr internal_comparator_; }; +class ParameterizedHarnessTest : public HarnessTest, + public testing::WithParamInterface { + public: + ParameterizedHarnessTest() : HarnessTest(GetParam()) {} +}; + +INSTANTIATE_TEST_CASE_P(TableTest, ParameterizedHarnessTest, + ::testing::ValuesIn(GenerateArgList())); + +class DBHarnessTest : public HarnessTest { + public: + DBHarnessTest() + : HarnessTest(TestArgs{DB_TEST, /* reverse_compare */ false, + /* restart_interval */ 16, kNoCompression, + /* compression_parallel_threads */ 1, + /* format_version */ 0, /* use_mmap */ false}) {} +}; + static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { @@ -1091,7 +1146,11 @@ std::unique_ptr trace_writer; EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_, &trace_writer)); - c->block_cache_tracer_.StartTrace(env_, trace_opt, std::move(trace_writer)); + // Always return Status::OK(). 
+ assert(c->block_cache_tracer_ + .StartTrace(env_->GetSystemClock().get(), trace_opt, + std::move(trace_writer)) + .ok()); { std::string user_key = "k01"; InternalKey internal_key(user_key, 0, kTypeValue); @@ -1111,51 +1170,53 @@ const std::vector& expected_records) { c->block_cache_tracer_.EndTrace(); - std::unique_ptr trace_reader; - Status s = - NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); - EXPECT_OK(s); - BlockCacheTraceReader reader(std::move(trace_reader)); - BlockCacheTraceHeader header; - EXPECT_OK(reader.ReadHeader(&header)); - uint32_t index = 0; - while (s.ok()) { - BlockCacheTraceRecord access; - s = reader.ReadAccess(&access); - if (!s.ok()) { - break; - } - ASSERT_LT(index, expected_records.size()); - EXPECT_NE("", access.block_key); - EXPECT_EQ(access.block_type, expected_records[index].block_type); - EXPECT_GT(access.block_size, 0); - EXPECT_EQ(access.caller, expected_records[index].caller); - EXPECT_EQ(access.no_insert, expected_records[index].no_insert); - EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); - // Get - if (access.caller == TableReaderCaller::kUserGet) { - EXPECT_EQ(access.referenced_key, - expected_records[index].referenced_key); - EXPECT_EQ(access.get_id, expected_records[index].get_id); - EXPECT_EQ(access.get_from_user_specified_snapshot, - expected_records[index].get_from_user_specified_snapshot); - if (access.block_type == TraceType::kBlockTraceDataBlock) { - EXPECT_GT(access.referenced_data_size, 0); - EXPECT_GT(access.num_keys_in_block, 0); - EXPECT_EQ(access.referenced_key_exist_in_block, - expected_records[index].referenced_key_exist_in_block); + { + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + EXPECT_OK(s); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + EXPECT_OK(reader.ReadHeader(&header)); + uint32_t index = 0; + while (s.ok()) { + BlockCacheTraceRecord 
access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + break; } - } else { - EXPECT_EQ(access.referenced_key, ""); - EXPECT_EQ(access.get_id, 0); - EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); - EXPECT_EQ(access.referenced_data_size, 0); - EXPECT_EQ(access.num_keys_in_block, 0); - EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + ASSERT_LT(index, expected_records.size()); + EXPECT_NE("", access.block_key); + EXPECT_EQ(access.block_type, expected_records[index].block_type); + EXPECT_GT(access.block_size, 0); + EXPECT_EQ(access.caller, expected_records[index].caller); + EXPECT_EQ(access.no_insert, expected_records[index].no_insert); + EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); + // Get + if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.referenced_key, + expected_records[index].referenced_key); + EXPECT_EQ(access.get_id, expected_records[index].get_id); + EXPECT_EQ(access.get_from_user_specified_snapshot, + expected_records[index].get_from_user_specified_snapshot); + if (access.block_type == TraceType::kBlockTraceDataBlock) { + EXPECT_GT(access.referenced_data_size, 0); + EXPECT_GT(access.num_keys_in_block, 0); + EXPECT_EQ(access.referenced_key_exist_in_block, + expected_records[index].referenced_key_exist_in_block); + } + } else { + EXPECT_EQ(access.referenced_key, ""); + EXPECT_EQ(access.get_id, 0); + EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); + EXPECT_EQ(access.referenced_data_size, 0); + EXPECT_EQ(access.num_keys_in_block, 0); + EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + } + index++; } - index++; + EXPECT_EQ(index, expected_records.size()); } - EXPECT_EQ(index, expected_records.size()); EXPECT_OK(env_->DeleteFile(trace_file_path_)); EXPECT_OK(env_->DeleteDir(test_path_)); } @@ -1178,17 +1239,21 @@ public: FileChecksumTestHelper(bool convert_to_internal_key = false) : 
convert_to_internal_key_(convert_to_internal_key) { - sink_ = new test::StringSink(); } ~FileChecksumTestHelper() {} void CreateWriteableFile() { - file_writer_.reset(test::GetWritableFileWriter(sink_, "" /* don't care */)); + sink_ = new test::StringSink(); + std::unique_ptr holder(sink_); + file_writer_.reset(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); } - void SetFileChecksumFunc(FileChecksumFunc* checksum_func) { + void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) { if (file_writer_ != nullptr) { - file_writer_->TEST_SetFileChecksumFunc(checksum_func); + file_writer_->TEST_SetFileChecksumGenerator(checksum_generator); + } else { + delete checksum_generator; } } @@ -1203,14 +1268,13 @@ void AddKVtoKVMap(int num_entries) { Random rnd(test::RandomSeed()); for (int i = 0; i < num_entries; i++) { - std::string v; - test::RandomString(&rnd, 100, &v); + std::string v = rnd.RandomString(100); kv_map_[test::RandomKey(&rnd, 20)] = v; } } Status WriteKVAndFlushTable() { - for (const auto kv : kv_map_) { + for (const auto& kv : kv_map_) { if (convert_to_internal_key_) { ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); std::string encoded; @@ -1222,54 +1286,53 @@ EXPECT_TRUE(table_builder_->status().ok()); } Status s = table_builder_->Finish(); - file_writer_->Flush(); - EXPECT_TRUE(s.ok()); + EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(s); EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize()); return s; } - std::string GetFileChecksum() { return table_builder_->GetFileChecksum(); } + std::string GetFileChecksum() { + EXPECT_OK(file_writer_->Close()); + return table_builder_->GetFileChecksum(); + } const char* GetFileChecksumFuncName() { return table_builder_->GetFileChecksumFuncName(); } - Status CalculateFileChecksum(FileChecksumFunc* file_checksum_func, + Status CalculateFileChecksum(FileChecksumGenerator* file_checksum_generator, std::string* checksum) { - 
assert(file_checksum_func != nullptr); + assert(file_checksum_generator != nullptr); cur_uniq_id_ = checksum_uniq_id_++; test::StringSink* ss_rw = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); - file_reader_.reset(test::GetRandomAccessFileReader( - new test::StringSource(ss_rw->contents()))); + static_cast(file_writer_->writable_file()); + std::unique_ptr source( + new test::StringSource(ss_rw->contents())); + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); + std::unique_ptr scratch(new char[2048]); Slice result; uint64_t offset = 0; - std::string tmp_checksum; - bool first_read = true; Status s; - s = file_reader_->Read(offset, 2048, &result, scratch.get(), false); + s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), + nullptr, false); if (!s.ok()) { return s; } while (result.size() != 0) { - if (first_read) { - first_read = false; - tmp_checksum = file_checksum_func->Value(scratch.get(), result.size()); - } else { - tmp_checksum = file_checksum_func->Extend(tmp_checksum, scratch.get(), - result.size()); - } + file_checksum_generator->Update(scratch.get(), result.size()); offset += static_cast(result.size()); - s = file_reader_->Read(offset, 2048, &result, scratch.get(), false); + s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), + nullptr, false); if (!s.ok()) { return s; } } EXPECT_EQ(offset, static_cast(table_builder_->FileSize())); - *checksum = tmp_checksum; + file_checksum_generator->Finalize(); + *checksum = file_checksum_generator->GetChecksum(); return Status::OK(); } @@ -1280,17 +1343,15 @@ std::unique_ptr file_reader_; std::unique_ptr table_builder_; stl_wrappers::KVMap kv_map_; - test::StringSink* sink_; + test::StringSink* sink_ = nullptr; static uint64_t checksum_uniq_id_; }; uint64_t FileChecksumTestHelper::checksum_uniq_id_ = 1; -INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest, - testing::Values(test::kDefaultFormatVersion)); 
-INSTANTIATE_TEST_CASE_P(FormatLatest, BlockBasedTableTest, - testing::Values(test::kLatestFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); // This test serves as the living tutorial for the prefix scan of user collected // properties. @@ -1306,7 +1367,7 @@ {"num.555.3", "3"}, }; // prefixes that exist - for (const std::string& prefix : {"num.111", "num.333", "num.555"}) { + for (const std::string prefix : {"num.111", "num.333", "num.555"}) { int num = 0; for (auto pos = props.lower_bound(prefix); pos != props.end() && @@ -1321,7 +1382,7 @@ } // prefixes that don't exist - for (const std::string& prefix : + for (const std::string prefix : {"num.000", "num.222", "num.444", "num.666"}) { auto pos = props.lower_bound(prefix); ASSERT_TRUE(pos == props.end() || @@ -1329,6 +1390,257 @@ } } +namespace { +struct TestIds { + UniqueId64x3 internal_id; + UniqueId64x3 external_id; +}; + +inline bool operator==(const TestIds& lhs, const TestIds& rhs) { + return lhs.internal_id == rhs.internal_id && + lhs.external_id == rhs.external_id; +} + +std::ostream& operator<<(std::ostream& os, const TestIds& ids) { + return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x" + << ids.internal_id[1] << "U, 0x" << ids.internal_id[2] + << "U }}, {{ 0x" << ids.external_id[0] << "U, 0x" + << ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}"; +} + +TestIds GetUniqueId(TableProperties* tp, std::unordered_set* seen, + const std::string& db_id, const std::string& db_session_id, + uint64_t file_number) { + // First test session id logic + if (db_session_id.size() == 20) { + uint64_t upper; + uint64_t lower; + EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower)); + EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id); + } + + // Get external using public API + tp->db_id = db_id; + tp->db_session_id = db_session_id; + tp->orig_file_number = file_number; + TestIds t; + { + std::string 
uid; + EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid)); + EXPECT_EQ(uid.size(), 24U); + t.external_id[0] = DecodeFixed64(&uid[0]); + t.external_id[1] = DecodeFixed64(&uid[8]); + t.external_id[2] = DecodeFixed64(&uid[16]); + } + // All these should be effectively random + EXPECT_TRUE(seen->insert(t.external_id[0]).second); + EXPECT_TRUE(seen->insert(t.external_id[1]).second); + EXPECT_TRUE(seen->insert(t.external_id[2]).second); + + // Get internal with internal API + EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number, + &t.internal_id)); + + // Verify relationship + UniqueId64x3 tmp = t.internal_id; + InternalUniqueIdToExternal(&tmp); + EXPECT_EQ(tmp, t.external_id); + ExternalUniqueIdToInternal(&tmp); + EXPECT_EQ(tmp, t.internal_id); + return t; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TableProperties tp; + TEST_SetRandomTableProperties(&tp); + + // DB id is normally RFC-4122 + const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + // Allow other forms of DB id + const std::string db_id2 = "1728000184588763620"; + const std::string db_id3 = "x"; + + // DB session id is normally 20 chars in base-36, but 13 to 24 chars + // is ok, roughly 64 to 128 bits. + const std::string ses_id1 = "ABCDEFGHIJ0123456789"; + // Same trailing 13 digits + const std::string ses_id2 = "HIJ0123456789"; + const std::string ses_id3 = "0123ABCDEFGHIJ0123456789"; + // Different trailing 12 digits + const std::string ses_id4 = "ABCDEFGH888888888888"; + // And change length + const std::string ses_id5 = "ABCDEFGHIJ012"; + const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD"; + + using T = TestIds; + std::unordered_set seen; + // Establish a stable schema for the unique IDs. These values must not + // change for existing table files. + // (Note: parens needed for macro parsing, extra braces needed for some + // compilers.) 
+ EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}})); + // Only change internal_id[1] with file number + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 2), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}}, + {{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789), + T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}}, + {{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}})); + // Change internal_id[1] and internal_id[2] with db_id + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id2, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}}, + {{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id3, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 0x8b931b9cdebd9e8U}}, + {{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}})); + // Keeping same last 13 digits of ses_id keeps same internal_id[0] + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id2, 1), + T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}}, + {{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id3, 1), + T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}}, + {{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}})); + // Changing last 12 digits of ses_id only changes internal_id[0] + // (vs. db_id1, ses_id1, 1) + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id4, 1), + T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}})); + // ses_id can change everything. 
+ EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id5, 1), + T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}}, + {{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id6, 1), + T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}}, + {{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}})); + + // Now verify more thoroughly that any small change in inputs completely + // changes external unique id. + // (Relying on 'seen' checks etc. in GetUniqueId) + std::string db_id = "00000000-0000-0000-0000-000000000000"; + std::string ses_id = "000000000000000000000000"; + uint64_t file_num = 1; + // change db_id + for (size_t i = 0; i < db_id.size(); ++i) { + if (db_id[i] == '-') { + continue; + } + for (char alt : std::string("123456789abcdef")) { + db_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + db_id[i] = '0'; + } + // change ses_id + for (size_t i = 0; i < ses_id.size(); ++i) { + for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) { + ses_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + ses_id[i] = '0'; + } + // change file_num + for (int i = 1; i < 64; ++i) { + GetUniqueId(&tp, &seen, db_id, ses_id, file_num << i); + } + + // Verify that "all zeros" in first 128 bits is equivalent for internal and + // external IDs. This way, as long as we avoid "all zeros" in internal IDs, + // we avoid it in external IDs. 
+ { + UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}}; + UniqueId64x3 id2 = id1; + InternalUniqueIdToExternal(&id1); + EXPECT_EQ(id1, id2); + ExternalUniqueIdToInternal(&id2); + EXPECT_EQ(id1, id2); + } +} + +namespace { +void SetGoodTableProperties(TableProperties* tp) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TEST_SetRandomTableProperties(tp); + tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + tp->db_session_id = "ABCDEFGHIJ0123456789"; + tp->orig_file_number = 1; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdHumanStrings) { + TableProperties tp; + SetGoodTableProperties(&tp); + + std::string tmp; + EXPECT_OK(GetUniqueIdFromTableProperties(tp, &tmp)); + EXPECT_EQ(tmp, + (std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23', + '\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3', + '\x03', '\x93', '\x08', '\xca', '\x17', '\x28', + '\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}})); + EXPECT_EQ(UniqueIdToHumanString(tmp), + "6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B"); + + // including zero padding + tmp = std::string(24U, '\0'); + tmp[15] = '\x12'; + tmp[23] = '\xAB'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000000000-0000000000000012-00000000000000AB"); + + // And shortened + tmp = std::string(20U, '\0'); + tmp[5] = '\x12'; + tmp[10] = '\xAB'; + tmp[17] = '\xEF'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000120000-0000AB0000000000-00EF0000"); + + tmp.resize(16); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB0000000000"); + + tmp.resize(11); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB"); + + tmp.resize(6); + EXPECT_EQ(UniqueIdToHumanString(tmp), "000000000012"); +} + +TEST_F(TablePropertyTest, UniqueIdsFailure) { + TableProperties tp; + std::string tmp; + + // Missing DB id + SetGoodTableProperties(&tp); + tp.db_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing 
session id + SetGoodTableProperties(&tp); + tp.db_session_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing file number + SetGoodTableProperties(&tp); + tp.orig_file_number = 0; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); +} + // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { @@ -1355,9 +1667,8 @@ table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); @@ -1379,7 +1690,8 @@ block_builder.Add(item.first, item.second); } Slice content = block_builder.Finish(); - ASSERT_EQ(content.size() + kBlockTrailerSize + diff_internal_user_bytes, + ASSERT_EQ(content.size() + BlockBasedTable::kBlockTrailerSize + + diff_internal_user_bytes, props.data_size); c.ResetTableReader(); } @@ -1404,9 +1716,8 @@ table_options.enable_index_compression = compressed; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); c.ResetTableReader(); @@ -1431,7 +1742,7 @@ BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions 
ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1465,7 +1776,7 @@ options.table_properties_collector_factories.emplace_back( new DummyPropertiesCollectorFactory2()); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1475,8 +1786,9 @@ ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name); ASSERT_EQ("UInt64AddOperator", props.merge_operator_name); ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name); - ASSERT_EQ("[DummyPropertiesCollector1,DummyPropertiesCollector2]", - props.property_collectors_names); + ASSERT_EQ( + "[DummyPropertiesCollectorFactory1,DummyPropertiesCollectorFactory2]", + props.property_collectors_names); ASSERT_EQ("", props.filter_policy_name); // no filter policy is used c.ResetTableReader(); } @@ -1508,7 +1820,7 @@ table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); std::unique_ptr internal_cmp( new InternalKeyComparator(options.comparator)); @@ -1529,7 +1841,8 @@ for (size_t i = 0; i < expected_tombstones.size(); i++) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey parsed_key; - ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key)); + ASSERT_OK( + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */)); RangeTombstone t(parsed_key, iter->value()); const auto& expected_t = expected_tombstones[i]; ASSERT_EQ(t.start_key_, expected_t.start_key_); @@ -1551,7 +1864,7 @@ Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions 
ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1595,7 +1908,7 @@ // reset the cache and reopen the table table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); - const ImmutableCFOptions ioptions2(*opt); + const ImmutableOptions ioptions2(*opt); const MutableCFOptions moptions(*opt); ASSERT_OK(c->Reopen(ioptions2, moptions)); @@ -1653,7 +1966,7 @@ c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); c.ResetTableReader(); @@ -1754,7 +2067,7 @@ c.Add("cccc2", std::string('a', 56)); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1813,7 +2126,7 @@ c.Add(key.Encode().ToString(), "b"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -1851,19 +2164,20 @@ c.Add(key.Encode().ToString(), "test"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, 
internal_comparator, &keys, &kvmap); // TODO(Zhongyi): update test to use MutableCFOptions options.prefix_extractor.reset(NewFixedPrefixTransform(9)); - const ImmutableCFOptions new_ioptions(options); + const ImmutableOptions new_ioptions(options); const MutableCFOptions new_moptions(options); - c.Reopen(new_ioptions, new_moptions); + ASSERT_OK(c.Reopen(new_ioptions, new_moptions)); auto reader = c.GetTableReader(); + ReadOptions read_options; std::unique_ptr db_iter(reader->NewIterator( - ReadOptions(), new_moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, new_moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup @@ -1877,16 +2191,156 @@ } } -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; +TEST_P(BlockBasedTableTest, BadChecksumType) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(options.comparator); + InternalKey key("abc", 1, kTypeValue); + c.Add(key.Encode().ToString(), "test"); + std::vector keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + // Corrupt checksum type (123 is invalid) + auto& sink = *c.TEST_GetSink(); + size_t len = sink.contents_.size(); + ASSERT_EQ(sink.contents_[len - Footer::kNewVersionsEncodedLength], kCRC32c); + sink.contents_[len - Footer::kNewVersionsEncodedLength] = char{123}; + + // (Re-)Open table file with bad checksum type + const ImmutableOptions new_ioptions(options); + const MutableCFOptions new_moptions(options); + Status s = 
c.Reopen(new_ioptions, new_moptions); + ASSERT_NOK(s); + ASSERT_EQ(s.ToString(), + "Corruption: Corrupt or unsupported checksum type: 123"); +} + +namespace { +std::string ChecksumAsString(const std::string& data, + ChecksumType checksum_type) { + uint32_t v = ComputeBuiltinChecksum(checksum_type, data.data(), data.size()); + + // Verify consistency with other function + if (data.size() >= 1) { + EXPECT_EQ(v, ComputeBuiltinChecksumWithLastByte( + checksum_type, data.data(), data.size() - 1, data.back())); + } + // Little endian as in file + std::array raw_bytes; + EncodeFixed32(raw_bytes.data(), v); + return Slice(raw_bytes.data(), raw_bytes.size()).ToString(/*hex*/ true); +} + +std::string ChecksumAsString(std::string* data, char new_last_byte, + ChecksumType checksum_type) { + data->back() = new_last_byte; + return ChecksumAsString(*data, checksum_type); +} +} // namespace + +// Make sure that checksum values don't change in later versions, even if +// consistent within current version. 
+TEST_P(BlockBasedTableTest, ChecksumSchemas) { + std::string b0 = "x"; + std::string b1 = "This is a short block!x"; + std::string b2; + for (int i = 0; i < 100; ++i) { + b2.append("This is a long block!"); + } + b2.append("x"); + // Trailing 'x' will be replaced by compression type + + std::string empty; + + char ct1 = kNoCompression; + char ct2 = kSnappyCompression; + char ct3 = kZSTD; + + // Note: first byte of trailer is compression type, last 4 are checksum + + for (ChecksumType t : GetSupportedChecksums()) { + switch (t) { + case kNoChecksum: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000"); + break; + case kCRC32c: + EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63"); + break; + case kxxHash: + EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0"); + 
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338"); + break; + case kxxHash64: + EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1"); + break; + case kXXH3: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE"); + break; + default: + // Force this test to be updated on new ChecksumTypes + assert(false); + break; + } + } } void AddInternalKey(TableConstructor* c, const std::string& prefix, std::string value = "v", int /*suffix_len*/ = 800) { static Random rnd(1023); - InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); + InternalKey k(prefix + rnd.RandomString(800), 0, kTypeValue); c->Add(k.Encode().ToString(), value); } @@ -1920,7 +2374,7 @@ std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions 
ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); @@ -1930,14 +2384,16 @@ ASSERT_EQ(5u, props->num_data_blocks); // TODO(Zhongyi): update test to use MutableCFOptions + ReadOptions read_options; std::unique_ptr index_iter(reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // -- Find keys do not exist, but have common prefix. std::vector prefixes = {"001", "003", "005", "007", "009"}; - std::vector lower_bound = {keys[0], keys[1], keys[2], - keys[7], keys[9], }; + std::vector lower_bound = { + keys[0], keys[1], keys[2], keys[7], keys[9], + }; // find the lower bound of the prefix for (size_t i = 0; i < prefixes.size(); ++i) { @@ -2014,6 +2470,80 @@ ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0); } } + + { + // Test reseek case. It should impact partitioned index more. + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr index_iter2(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Things to cover in partitioned index: + // 1. Both of Seek() and SeekToLast() has optimization to prevent + // rereek leaf index block if it remains to the same one, and + // they reuse the same variable. + // 2. When Next() or Prev() is called, the block moves, so the + // optimization should kick in only with the current one. 
+ index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0075", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Next(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Next(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + 
index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + } + c.ResetTableReader(); } @@ -2047,7 +2577,7 @@ BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2092,7 +2622,8 @@ explicit CustomFlushBlockPolicy(std::vector keys_per_block) : keys_per_block_(keys_per_block) {} - const char* Name() const override { return "table_test"; } + const char* Name() const override { return "CustomFlushBlockPolicy"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, const BlockBuilder&) const override { return new CustomFlushBlockPolicy(keys_per_block_); @@ -2133,7 +2664,7 @@ Statistics* stats = options.statistics.get(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2164,9 +2695,11 @@ auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); ASSERT_EQ(4u, props->num_data_blocks); + ReadOptions read_options; std::unique_ptr iter(reader->NewIterator( - ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, - /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized, + /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true)); // Shouldn't have read data blocks before iterator is seeked. EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); @@ -2183,6 +2716,7 @@ EXPECT_EQ(keys[2], iter->key().ToString()); EXPECT_EQ(use_first_key ? 
0 : 1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v2", iter->value().ToString()); EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2193,6 +2727,7 @@ EXPECT_EQ(keys[4], iter->key().ToString()); EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v4", iter->value().ToString()); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2208,6 +2743,7 @@ ASSERT_TRUE(iter->Valid()); EXPECT_EQ(keys[5], iter->key().ToString()); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v5", iter->value().ToString()); EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2225,6 +2761,7 @@ ASSERT_TRUE(iter->Valid()); EXPECT_EQ(keys[7], iter->key().ToString()); EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v7", iter->value().ToString()); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2246,6 +2783,7 @@ EXPECT_EQ(keys[3], iter->key().ToString()); EXPECT_EQ(use_first_key ? 1 : 2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v3", iter->value().ToString()); EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); @@ -2265,6 +2803,7 @@ stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); // All blocks are in cache now, there'll be no more misses ever. EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v1", iter->value().ToString()); // Next into the next block again. @@ -2292,6 +2831,7 @@ EXPECT_EQ(keys[4], iter->key().ToString()); EXPECT_EQ(use_first_key ? 
3 : 6, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v4", iter->value().ToString()); EXPECT_EQ(use_first_key ? 3 : 6, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2301,6 +2841,7 @@ EXPECT_EQ(keys[7], iter->key().ToString()); EXPECT_EQ(use_first_key ? 4 : 7, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v7", iter->value().ToString()); EXPECT_EQ(use_first_key ? 4 : 7, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2321,7 +2862,7 @@ options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, @@ -2339,9 +2880,11 @@ auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); ASSERT_EQ(1u, props->num_data_blocks); + ReadOptions read_options; std::unique_ptr iter(reader->NewIterator( - ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, - /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized, + /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true)); iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); ASSERT_TRUE(iter->Valid()); @@ -2351,6 +2894,7 @@ // Key should have been served from index, without reading data blocks. 
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("x", iter->value().ToString()); EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2373,7 +2917,7 @@ std::vector keys; for (int i = 0; i < 100; ++i) { - keys.push_back(RandomString(&rnd, 10000)); + keys.push_back(rnd.RandomString(10000)); } // Each time we load one more key to the table. the table index block @@ -2393,7 +2937,7 @@ table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2417,12 +2961,12 @@ for (int i = 0; i < 10; ++i) { // the key/val are slightly smaller than block size, so that each block // holds roughly one key/value pair. 
- c.Add(RandomString(&rnd, 900), "val"); + c.Add(rnd.RandomString(900), "val"); } std::vector ks; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2443,7 +2987,7 @@ SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2517,7 +3061,7 @@ SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2561,14 +3105,15 @@ SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); for (uint32_t i = 1; i <= 2; i++) { + ReadOptions read_options; std::unique_ptr iter(c.GetTableReader()->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUserIterator)); iter->SeekToFirst(); while (iter->Valid()) { @@ -2690,7 +3235,7 @@ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, 
table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2712,8 +3257,8 @@ GetContext::kNotFound, Slice(), nullptr, nullptr, nullptr, true, nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. - reader->Get(ReadOptions(), "non-exist-key", &get_context, - moptions.prefix_extractor.get()); + ASSERT_OK(reader->Get(ReadOptions(), "non-exist-key", &get_context, + moptions.prefix_extractor.get())); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertIndexBlockStat(0, 0); props.AssertFilterBlockStat(0, 0); @@ -2742,7 +3287,7 @@ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2787,6 +3332,7 @@ // Only data block will be accessed { iter->SeekToFirst(); + ASSERT_OK(iter->status()); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, 1, 0 + 1, // data block miss 0); @@ -2801,6 +3347,7 @@ { iter.reset(c.NewIterator(moptions.prefix_extractor.get())); iter->SeekToFirst(); + ASSERT_OK(iter->status()); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, 1 + 1, /* index block hit */ 1, 0 + 1 /* data block hit */); @@ -2820,9 +3367,9 @@ table_options.block_cache = NewLRUCache(1, 4); options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(options); + const ImmutableOptions ioptions2(options); const MutableCFOptions moptions2(options); - c.Reopen(ioptions2, moptions2); + ASSERT_OK(c.Reopen(ioptions2, moptions2)); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss @@ -2848,6 +3395,7 @@ // SeekToFirst() 
accesses data block. With similar reason, we expect data // block's cache miss. iter->SeekToFirst(); + ASSERT_OK(iter->status()); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(2, 0, 0 + 1, // data block miss 0); @@ -2866,7 +3414,7 @@ std::string user_key = "k01"; InternalKey internal_key(user_key, 0, kTypeValue); c3.Add(internal_key.Encode().ToString(), "hello"); - ImmutableCFOptions ioptions3(options); + ImmutableOptions ioptions3(options); MutableCFOptions moptions3(options); // Generate table without filter policy c3.Finish(options, ioptions3, moptions3, table_options, @@ -2877,7 +3425,7 @@ table_options.filter_policy.reset(NewBloomFilterPolicy(1)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); - ImmutableCFOptions ioptions4(options); + ImmutableOptions ioptions4(options); MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); @@ -2900,7 +3448,7 @@ BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); const BlockBasedTableOptions* normalized_table_options = - (const BlockBasedTableOptions*)factory->GetOptions(); + factory->GetOptions(); ASSERT_EQ(normalized_table_options->block_size_deviation, expected); delete factory; @@ -2912,7 +3460,7 @@ BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); const BlockBasedTableOptions* normalized_table_options = - (const BlockBasedTableOptions*)factory->GetOptions(); + factory->GetOptions(); ASSERT_EQ(normalized_table_options->block_restart_interval, expected); delete factory; @@ -2961,7 +3509,7 @@ InternalKey internal_key(user_key, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c.Add(encoded_key, "hello"); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); // Generate table with filter policy c.Finish(options, 
ioptions, moptions, table_options, @@ -3049,7 +3597,7 @@ c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3064,7 +3612,7 @@ ASSERT_OK(iter->status()); iter.reset(); - const ImmutableCFOptions ioptions1(opt); + const ImmutableOptions ioptions1(opt); const MutableCFOptions moptions1(opt); ASSERT_OK(c.Reopen(ioptions1, moptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); @@ -3077,7 +3625,7 @@ // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(opt); + const ImmutableOptions ioptions2(opt); const MutableCFOptions moptions2(opt); ASSERT_OK(c.Reopen(ioptions2, moptions2)); table_reader = dynamic_cast(c.GetTableReader()); @@ -3088,30 +3636,10 @@ c.ResetTableReader(); } -namespace { -class CustomMemoryAllocator : public MemoryAllocator { - public: - const char* Name() const override { return "CustomMemoryAllocator"; } - - void* Allocate(size_t size) override { - ++numAllocations; - auto ptr = new char[size + 16]; - memcpy(ptr, "memory_allocator_", 16); // mangle first 16 bytes - return reinterpret_cast(ptr + 16); - } - void Deallocate(void* p) override { - ++numDeallocations; - char* ptr = reinterpret_cast(p) - 16; - delete[] ptr; - } - - std::atomic numAllocations; - std::atomic numDeallocations; -}; -} // namespace - TEST_P(BlockBasedTableTest, MemoryAllocator) { - auto custom_memory_allocator = std::make_shared(); + auto default_memory_allocator = std::make_shared(); + auto custom_memory_allocator = + std::make_shared(default_memory_allocator); { Options opt; std::unique_ptr ikc; @@ -3137,7 +3665,7 @@ c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap 
kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3154,31 +3682,22 @@ // out of scope, block cache should have been deleted, all allocations // deallocated - EXPECT_EQ(custom_memory_allocator->numAllocations.load(), - custom_memory_allocator->numDeallocations.load()); + EXPECT_EQ(custom_memory_allocator->GetNumAllocations(), + custom_memory_allocator->GetNumDeallocations()); // make sure that allocations actually happened through the cache allocator - EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); + EXPECT_GT(custom_memory_allocator->GetNumAllocations(), 0); } // Test the file checksum of block based table TEST_P(BlockBasedTableTest, NoFileChecksum) { Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - SequenceNumber largest_seqno = 0; int level = 0; - std::vector> - int_tbl_prop_collector_factories; - - if (largest_seqno != 0) { - // Pretend that it's an external file written by SstFileWriter. 
- int_tbl_prop_collector_factories.emplace_back( - new SstFileWriterPropertiesCollectorFactory(2 /* version */, - 0 /* global_seqno*/)); - } + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; FileChecksumTestHelper f(true); @@ -3187,61 +3706,66 @@ builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); - ASSERT_STREQ(f.GetFileChecksumFuncName(), - kUnknownFileChecksumFuncName.c_str()); - ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum.c_str()); + ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); + ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum); } -TEST_P(BlockBasedTableTest, Crc32FileChecksum) { +TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { + FileChecksumGenCrc32cFactory* file_checksum_gen_factory = + new FileChecksumGenCrc32cFactory(); Options options; - options.sst_file_checksum_func = - std::shared_ptr(CreateFileChecksumFuncCrc32c()); - ImmutableCFOptions ioptions(options); + options.file_checksum_gen_factory.reset(file_checksum_gen_factory); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - SequenceNumber largest_seqno = 0; int level = 0; - std::vector> - int_tbl_prop_collector_factories; - - if 
(largest_seqno != 0) { - // Pretend that it's an external file written by SstFileWriter. - int_tbl_prop_collector_factories.emplace_back( - new SstFileWriterPropertiesCollectorFactory(2 /* version */, - 0 /* global_seqno*/)); - } + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; + FileChecksumGenContext gen_context; + gen_context.file_name = "db/tmp"; + std::unique_ptr checksum_crc32c_gen1 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); FileChecksumTestHelper f(true); f.CreateWriteableFile(); - f.SetFileChecksumFunc(options.sst_file_checksum_func.get()); + f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); + + std::unique_ptr checksum_crc32c_gen2 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); std::string checksum; - ASSERT_OK( - f.CalculateFileChecksum(options.sst_file_checksum_func.get(), &checksum)); + ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum)); ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); + + // Unit test the generator itself for schema stability + std::unique_ptr checksum_crc32c_gen3 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + 
gen_context); + const char data[] = "here is some data"; + checksum_crc32c_gen3->Update(data, sizeof(data)); + checksum_crc32c_gen3->Finalize(); + checksum = checksum_crc32c_gen3->GetChecksum(); + ASSERT_STREQ(checksum.c_str(), "\345\245\277\110"); } // Plain table is not supported in ROCKSDB_LITE @@ -3253,23 +3777,21 @@ plain_table_options.hash_table_ratio = 0; PlainTableFactory factory(plain_table_options); - test::StringSink sink; - std::unique_ptr file_writer( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -3279,19 +3801,18 @@ builder->Add(key, value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::StringSink* ss = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter(file_writer.get()); + static_cast(file_writer->writable_file()); + std::unique_ptr source( + new 
test::StringSource(ss->contents(), 72242, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss->contents(), 72242, true))); + new RandomAccessFileReader(std::move(source), "test")); - TableProperties* props = nullptr; + std::unique_ptr props; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), - kPlainTableMagicNumber, ioptions, - &props, true /* compression_type_missing */); - std::unique_ptr props_guard(props); + kPlainTableMagicNumber, ioptions, &props); ASSERT_OK(s); ASSERT_EQ(0ul, props->index_size); @@ -3310,66 +3831,71 @@ PlainTableFactory factory(plain_table_options); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; FileChecksumTestHelper f(true); f.CreateWriteableFile(); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); - ASSERT_STREQ(f.GetFileChecksumFuncName(), - kUnknownFileChecksumFuncName.c_str()); - EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum.c_str()); + ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), 
kUnknownFileChecksumFuncName); + EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum); } -TEST_F(PlainTableTest, Crc32FileChecksum) { +TEST_F(PlainTableTest, Crc32cFileChecksum) { PlainTableOptions plain_table_options; plain_table_options.user_key_len = 20; plain_table_options.bloom_bits_per_key = 8; plain_table_options.hash_table_ratio = 0; PlainTableFactory factory(plain_table_options); + FileChecksumGenCrc32cFactory* file_checksum_gen_factory = + new FileChecksumGenCrc32cFactory(); Options options; - options.sst_file_checksum_func = - std::shared_ptr(CreateFileChecksumFuncCrc32c()); - const ImmutableCFOptions ioptions(options); + options.file_checksum_gen_factory.reset(file_checksum_gen_factory); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; + + FileChecksumGenContext gen_context; + gen_context.file_name = "db/tmp"; + std::unique_ptr checksum_crc32c_gen1 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); FileChecksumTestHelper f(true); f.CreateWriteableFile(); - f.SetFileChecksumFunc(options.sst_file_checksum_func.get()); + f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + 
ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); + + std::unique_ptr checksum_crc32c_gen2 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); std::string checksum; - ASSERT_OK( - f.CalculateFileChecksum(options.sst_file_checksum_func.get(), &checksum)); + ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum)); EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); } @@ -3387,11 +3913,12 @@ std::vector keys; stl_wrappers::KVMap kvmap; Options options; + options.db_host_id = ""; test::PlainInternalKeyComparator internal_comparator(options.comparator); options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, internal_comparator, &keys, &kvmap); @@ -3427,16 +3954,16 @@ options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7050)); c.ResetTableReader(); } @@ -3482,75 +4009,26 @@ } } -#ifndef ROCKSDB_VALGRIND_RUN -// RandomizedHarnessTest is very slow for certain combination of arguments -// Split into 8 pieces to reduce the time individual tests take. -TEST_F(HarnessTest, Randomized1) { - // part 1 out of 8 - const size_t part = 1; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized2) { - // part 2 out of 8 - const size_t part = 2; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized3) { - // part 3 out of 8 - const size_t part = 3; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized4) { - // part 4 out of 8 - const size_t part = 4; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized5) { - // part 5 out of 8 - const size_t part = 5; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized6) { - // part 6 out of 8 - const size_t part = 6; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized7) { - // part 7 out of 8 - const size_t part = 7; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized8) { - // part 8 out of 8 - const size_t part = 8; - const size_t total = 8; - RandomizedHarnessTest(part, total); +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(ParameterizedHarnessTest, RandomizedHarnessTest) { + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 
1 : 200)) { + for (int e = 0; e < num_entries; e++) { + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + rnd.RandomString(rnd.Skewed(5))); + } + Test(&rnd); + } } #ifndef ROCKSDB_LITE -TEST_F(HarnessTest, RandomizedLongDB) { +TEST_F(DBHarnessTest, RandomizedLongDB) { Random rnd(test::RandomSeed()); - TestArgs args = {DB_TEST, false, 16, kNoCompression, 0, false}; - Init(args); int num_entries = 100000; for (int e = 0; e < num_entries; e++) { std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + Add(test::RandomKey(&rnd, rnd.Skewed(4)), rnd.RandomString(rnd.Skewed(5))); } Test(&rnd); @@ -3566,30 +4044,44 @@ ASSERT_GT(files, 0); } #endif // ROCKSDB_LITE -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) -class MemTableTest : public testing::Test {}; +class MemTableTest : public testing::Test { + public: + MemTableTest() { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared(); + options_.memtable_factory = table_factory; + ImmutableOptions ioptions(options_); + wb_ = new WriteBufferManager(options_.db_write_buffer_size); + memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + } + + ~MemTableTest() { + delete memtable_->Unref(); + delete wb_; + } + + MemTable* GetMemTable() { return memtable_; } + + private: + MemTable* memtable_; + Options options_; + WriteBufferManager* wb_; +}; TEST_F(MemTableTest, Simple) { - InternalKeyComparator cmp(BytewiseComparator()); - auto table_factory = std::make_shared(); - Options options; - options.memtable_factory = table_factory; - ImmutableCFOptions ioptions(options); - WriteBufferManager wb(options.db_write_buffer_size); - MemTable* memtable = - new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, - kMaxSequenceNumber, 0 /* column_family_id */); - memtable->Ref(); WriteBatch 
batch; WriteBatchInternal::SetSequence(&batch, 100); - batch.Put(std::string("k1"), std::string("v1")); - batch.Put(std::string("k2"), std::string("v2")); - batch.Put(std::string("k3"), std::string("v3")); - batch.Put(std::string("largekey"), std::string("vlarge")); - batch.DeleteRange(std::string("chi"), std::string("xigua")); - batch.DeleteRange(std::string("begin"), std::string("end")); - ColumnFamilyMemTablesDefault cf_mems_default(memtable); + ASSERT_OK(batch.Put(std::string("k1"), std::string("v1"))); + ASSERT_OK(batch.Put(std::string("k2"), std::string("v2"))); + ASSERT_OK(batch.Put(std::string("k3"), std::string("v3"))); + ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge"))); + ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua"))); + ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end"))); + ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable()); ASSERT_TRUE( WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr) .ok()); @@ -3600,10 +4092,10 @@ std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = memtable->NewIterator(ReadOptions(), &arena); + iter = GetMemTable()->NewIterator(ReadOptions(), &arena); arena_iter_guard.set(iter); } else { - iter = memtable->NewRangeTombstoneIterator( + iter = GetMemTable()->NewRangeTombstoneIterator( ReadOptions(), kMaxSequenceNumber /* read_seq */); iter_guard.reset(iter); } @@ -3617,174 +4109,123 @@ iter->Next(); } } - - delete memtable->Unref(); } // Test the empty key -TEST_F(HarnessTest, SimpleEmptyKey) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleEmptyKey) { + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); } -TEST_F(HarnessTest, SimpleSingle) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 2); - 
Add("abc", "v"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleSingle) { + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); } -TEST_F(HarnessTest, SimpleMulti) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleMulti) { + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); } -TEST_F(HarnessTest, SimpleSpecialKey) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleSpecialKey) { + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); } -TEST_F(HarnessTest, FooterTests) { - { - // upconvert legacy block based - std::string encoded; - Footer footer(kLegacyBlockBasedTableMagicNumber, 0); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.EncodeTo(&encoded); - Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); - ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kCRC32c); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 0U); - } - { - // xxhash block based - std::string encoded; - Footer footer(kBlockBasedTableMagicNumber, 1); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - 
footer.set_checksum(kxxHash); - footer.EncodeTo(&encoded); - Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); - ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kxxHash); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 1U); - } +TEST(TableTest, FooterTests) { + Random* r = Random::GetTLSInstance(); + uint64_t data_size = (uint64_t{1} << r->Uniform(40)) + r->Uniform(100); + uint64_t index_size = r->Uniform(1000000000); + uint64_t metaindex_size = r->Uniform(1000000); + // 5 == block trailer size + BlockHandle index(data_size + 5, index_size); + BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size); + uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5; { - // xxhash64 block based - std::string encoded; - Footer footer(kBlockBasedTableMagicNumber, 1); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.set_checksum(kxxHash64); - footer.EncodeTo(&encoded); + // legacy block based + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0, + footer_offset, kCRC32c, meta_index, index); Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kxxHash64); + ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); 
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 1U); + ASSERT_EQ(decoded_footer.format_version(), 0U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); + // Ensure serialized with legacy magic + ASSERT_EQ( + DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), + kLegacyBlockBasedTableMagicNumber); + } + // block based, various checksums, various versions + for (auto t : GetSupportedChecksums()) { + for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) { + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t, + meta_index, index); + Footer decoded_footer; + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); + ASSERT_EQ(decoded_footer.table_magic_number(), + kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum_type(), t); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), + meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.format_version(), fv); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); + } } // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE { - // upconvert legacy plain table - std::string encoded; - Footer footer(kLegacyPlainTableMagicNumber, 0); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.EncodeTo(&encoded); + // legacy plain table + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 0, footer_offset, + kNoChecksum, meta_index); Footer decoded_footer; - Slice encoded_slice(encoded); - 
decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 0U); + ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); + ASSERT_EQ(decoded_footer.index_handle().size(), 0U); + ASSERT_EQ(decoded_footer.format_version(), 0U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); + // Ensure serialized with legacy magic + ASSERT_EQ( + DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), + kLegacyPlainTableMagicNumber); } { - // xxhash block based - std::string encoded; - Footer footer(kPlainTableMagicNumber, 1); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.set_checksum(kxxHash); - footer.EncodeTo(&encoded); + // xxhash plain table (not currently used) + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 1, footer_offset, + kxxHash, meta_index); Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kxxHash); + ASSERT_EQ(decoded_footer.checksum_type(), kxxHash); ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), 
index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 1U); + ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); + ASSERT_EQ(decoded_footer.index_handle().size(), 0U); + ASSERT_EQ(decoded_footer.format_version(), 1U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); } #endif // !ROCKSDB_LITE - { - // version == 2 - std::string encoded; - Footer footer(kBlockBasedTableMagicNumber, 2); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.EncodeTo(&encoded); - Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); - ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kCRC32c); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 2U); - } } class IndexBlockRestartIntervalTest @@ -3816,28 +4257,31 @@ table_options.index_block_restart_interval = index_block_restart_interval; if (value_delta_encoding) { table_options.format_version = 4; + } else { + table_options.format_version = 3; } options.table_factory.reset(new BlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); static Random rnd(301); for (int i = 0; i < kKeysInTable; i++) { - InternalKey k(RandomString(&rnd, kKeySize), 0, kTypeValue); - c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + InternalKey k(rnd.RandomString(kKeySize), 0, kTypeValue); + c.Add(k.Encode().ToString(), rnd.RandomString(kValSize)); } std::vector keys; stl_wrappers::KVMap kvmap; std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const 
ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); auto reader = c.GetTableReader(); + ReadOptions read_options; std::unique_ptr db_iter(reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup @@ -3881,8 +4325,7 @@ } bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override { - assert(IsValid(src)); - return true; + return IsValid(src); } bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override { @@ -3925,7 +4368,7 @@ const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -3934,12 +4377,12 @@ std::string prefix = "[" + std::to_string(i) + "]"; for (int j = 0; j < 10; j++) { std::string key = prefix + std::to_string(j); - db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1"); + ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1")); } } // Trigger compaction. - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); delete db; // In the second round, turn whole_key_filtering off and expect // rocksdb still works. 
@@ -3955,15 +4398,15 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; int_tbl_prop_collector_factories.emplace_back( new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); @@ -3971,9 +4414,8 @@ std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -3984,7 +4426,7 @@ builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::RandomRWStringSink ss_rw(sink); uint32_t version; @@ -3993,24 +4435,22 @@ // Helper function to get version, global_seqno, global_seqno_offset std::function GetVersionAndGlobalSeqno = [&]() { + std::unique_ptr source( + new test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new 
RandomAccessFileReader(std::move(source), "")); - TableProperties* props = nullptr; + std::unique_ptr props; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + &props)); UserCollectedProperties user_props = props->user_collected_properties; version = DecodeFixed32( user_props[ExternalSstFilePropertyNames::kVersion].c_str()); global_seqno = DecodeFixed64( user_props[ExternalSstFilePropertyNames::kGlobalSeqno].c_str()); - global_seqno_offset = - props->properties_offsets[ExternalSstFilePropertyNames::kGlobalSeqno]; - - delete props; + global_seqno_offset = props->external_sst_file_global_seqno_offset; }; // Helper function to update the value of the global seqno in the file @@ -4018,23 +4458,26 @@ std::string new_global_seqno; PutFixed64(&new_global_seqno, val); - ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno)); + ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(), + nullptr)); }; // Helper function to get the contents of the table InternalIterator std::unique_ptr table_reader; + const ReadOptions read_options; std::function GetTableInternalIter = [&]() { + std::unique_ptr source( + new test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "")); options.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), - EnvOptions(), ikc), + TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), + ikc), std::move(file_reader), ss_rw.contents().size(), &table_reader); return table_reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized); }; @@ -4046,7 
+4489,7 @@ char current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 0); @@ -4067,7 +4510,7 @@ current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4085,7 +4528,7 @@ ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4104,7 +4547,7 @@ current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4123,7 +4566,7 @@ ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4138,23 +4581,22 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - 
const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4167,24 +4609,22 @@ builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, false)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); - + new RandomAccessFileReader(std::move(source), "test")); // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { - TableProperties* props = nullptr; - ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), + std::unique_ptr props; + ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + &props)); uint64_t data_block_size = props->data_size / props->num_data_blocks; ASSERT_EQ(data_block_size, 4096); ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks); - delete props; }; VerifyBlockAlignment(); @@ -4196,17 +4636,17 @@ bbto.block_align = false; Options options2; 
options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); - ImmutableCFOptions ioptions2(options2); + ImmutableOptions ioptions2(options2); const MutableCFOptions moptions2(options2); ASSERT_OK(ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(), - EnvOptions(), + TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(), GetPlainInternalComparator(options2.comparator)), - std::move(file_reader), ss_rw.contents().size(), &table_reader)); + std::move(file_reader), sink->contents().size(), &table_reader)); + ReadOptions read_options; std::unique_ptr db_iter(table_reader->NewIterator( - ReadOptions(), moptions2.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); int expected_key = 1; @@ -4229,26 +4669,25 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - 
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4261,20 +4700,22 @@ builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "test")); { RandomAccessFileReader* file = file_reader.get(); - uint64_t file_size = ss_rw.contents().size(); + uint64_t file_size = sink->contents().size(); Footer footer; - ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, - &footer, kBlockBasedTableMagicNumber)); + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */, + file_size, &footer, + kBlockBasedTableMagicNumber)); auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { @@ -4297,25 +4738,20 @@ BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, &metaindex_contents); - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator())); - bool found_properties_block = true; - ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); - ASSERT_TRUE(found_properties_block); + BytewiseComparator(), kDisableGlobalSequenceNumber)); // -- Read properties block - Slice v = meta_iter->value(); BlockHandle properties_handle; - ASSERT_OK(properties_handle.DecodeFrom(&v)); + ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlockName, + &properties_handle)); + 
ASSERT_FALSE(properties_handle.IsNull()); BlockContents properties_contents; - BlockFetchHelper(properties_handle, BlockType::kProperties, &properties_contents); - Block properties_block(std::move(properties_contents), - kDisableGlobalSequenceNumber); + Block properties_block(std::move(properties_contents)); ASSERT_EQ(properties_block.NumRestarts(), 1u); } @@ -4344,7 +4780,7 @@ table_options.filter_policy.reset(NewBloomFilterPolicy( 8 /* bits_per_key */, false /* use_block_based_filter */)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); std::vector keys; stl_wrappers::KVMap kvmap; @@ -4353,15 +4789,17 @@ // get file reader test::StringSink* table_sink = c.TEST_GetSink(); - std::unique_ptr table_reader{ - test::GetRandomAccessFileReader( - new test::StringSource(table_sink->contents(), 0 /* unique_id */, - false /* allow_mmap_reads */))}; + std::unique_ptr source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr table_reader( + new RandomAccessFileReader(std::move(source), "test")); size_t table_size = table_sink->contents().size(); // read footer Footer footer; - ASSERT_OK(ReadFooterFromFile(table_reader.get(), + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), nullptr /* prefetch_buffer */, table_size, &footer, kBlockBasedTableMagicNumber)); @@ -4376,12 +4814,11 @@ UncompressionDict::GetEmptyDict(), pcache_opts, nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); + Block metaindex_block(std::move(metaindex_contents)); // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewDataIterator(options.comparator, options.comparator)}; + metaindex_block.NewMetaIterator()}; uint64_t max_offset = 0; 
std::string key_at_max_offset; for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); @@ -4394,13 +4831,97 @@ key_at_max_offset = metaindex_iter->key().ToString(); } } - ASSERT_EQ(kPropertiesBlock, key_at_max_offset); + ASSERT_EQ(kPropertiesBlockName, key_at_max_offset); // index handle is stored in footer rather than metaindex block, so need // separate logic to verify it comes before properties block. ASSERT_GT(max_offset, footer.index_handle().offset()); c.ResetTableReader(); } +TEST_P(BlockBasedTableTest, SeekMetaBlocks) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("foo_a1", "val1"); + c.Add("foo_b2", "val2"); + c.Add("foo_c3", "val3"); + c.Add("foo_d4", "val4"); + c.Add("foo_e5", "val5"); + c.Add("foo_f6", "val6"); + c.Add("foo_g7", "val7"); + c.Add("foo_h8", "val8"); + c.Add("foo_j9", "val9"); + + // write an SST file + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy( + 8 /* bits_per_key */, false /* use_block_based_filter */)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + // get file reader + test::StringSink* table_sink = c.TEST_GetSink(); + std::unique_ptr source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr table_reader( + new RandomAccessFileReader(std::move(source), "test")); + size_t table_size = table_sink->contents().size(); + + // read footer + Footer footer; + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), 
+ nullptr /* prefetch_buffer */, table_size, + &footer, kBlockBasedTableMagicNumber)); + + // read metaindex + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + PersistentCacheOptions pcache_opts; + BlockFetcher block_fetcher( + table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); + ASSERT_OK(block_fetcher.ReadBlockContents()); + Block metaindex_block(std::move(metaindex_contents)); + + // verify properties block comes last + std::unique_ptr metaindex_iter( + metaindex_block.NewMetaIterator()); + bool has_hash_prefixes = false; + bool has_hash_metadata = false; + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + if (metaindex_iter->key().ToString() == kHashIndexPrefixesBlock) { + has_hash_prefixes = true; + } else if (metaindex_iter->key().ToString() == + kHashIndexPrefixesMetadataBlock) { + has_hash_metadata = true; + } + } + if (has_hash_metadata) { + metaindex_iter->Seek(kHashIndexPrefixesMetadataBlock); + ASSERT_TRUE(metaindex_iter->Valid()); + ASSERT_EQ(kHashIndexPrefixesMetadataBlock, + metaindex_iter->key().ToString()); + } + if (has_hash_prefixes) { + metaindex_iter->Seek(kHashIndexPrefixesBlock); + ASSERT_TRUE(metaindex_iter->Valid()); + ASSERT_EQ(kHashIndexPrefixesBlock, metaindex_iter->key().ToString()); + } + c.ResetTableReader(); +} + TEST_P(BlockBasedTableTest, BadOptions) { ROCKSDB_NAMESPACE::Options options; options.compression = kNoCompression; @@ -4411,7 +4932,7 @@ const std::string kDBPath = test::PerThreadDBPath("block_based_table_bad_options_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; 
ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -4457,10 +4978,18 @@ TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { TailPrefetchStats tpstats; - FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); - buffer.TryReadFromCache(500, 10, nullptr); - buffer.TryReadFromCache(480, 10, nullptr); - buffer.TryReadFromCache(490, 10, nullptr); + FilePrefetchBuffer buffer(0 /* readahead_size */, 0 /* max_readahead_size */, + false /* enable */, true /* track_min_offset */); + IOOptions opts; + buffer.TryReadFromCache(opts, nullptr /* reader */, 500 /* offset */, + 10 /* n */, nullptr /* result */, + nullptr /* status */); + buffer.TryReadFromCache(opts, nullptr /* reader */, 480 /* offset */, + 10 /* n */, nullptr /* result */, + nullptr /* status */); + buffer.TryReadFromCache(opts, nullptr /* reader */, 490 /* offset */, + 10 /* n */, nullptr /* result */, + nullptr /* status */); ASSERT_EQ(480, buffer.min_offset_read()); } @@ -4483,14 +5012,14 @@ static Random rnd(1048); for (int i = 0; i < kNumKeys; i++) { // padding one "0" to mark existent keys. 
- std::string random_key(RandomString(&rnd, kKeySize - 1) + "1"); + std::string random_key(rnd.RandomString(kKeySize - 1) + "1"); InternalKey k(random_key, 0, kTypeValue); - c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + c.Add(k.Encode().ToString(), rnd.RandomString(kValSize)); } std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -4499,8 +5028,9 @@ auto reader = c.GetTableReader(); std::unique_ptr seek_iter; + ReadOptions read_options; seek_iter.reset(reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); for (int i = 0; i < 2; ++i) { ReadOptions ro; @@ -4572,7 +5102,7 @@ Options options; BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); @@ -4587,13 +5117,15 @@ /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); - ASSERT_TRUE(iter->IsOutOfBound()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); iter.reset(new KeyConvertingIterator(reader->NewIterator( read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_FALSE(iter->Valid()); - ASSERT_TRUE(iter->IsOutOfBound()); + ASSERT_OK(iter->status()); + 
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); } // BlockBasedTableIterator should invalidate itself and return @@ -4610,7 +5142,7 @@ table_opt.flush_block_policy_factory = std::make_shared(); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); @@ -4628,7 +5160,7 @@ ASSERT_EQ("bar", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); - ASSERT_TRUE(iter->IsOutOfBound()); + ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); std::string ub2 = "foo_after"; Slice ub_slice2(ub2); read_opt.iterate_upper_bound = &ub_slice2; @@ -4640,12 +5172,246 @@ ASSERT_EQ("foo", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); - ASSERT_FALSE(iter->IsOutOfBound()); + ASSERT_FALSE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); +} + +TEST_P( + BlockBasedTableTest, + IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnBuilderFinish) { + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024; + constexpr std::size_t kMaxDictBytes = 1024; + constexpr std::size_t kMaxDictBufferBytes = 1024; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr cache(NewLRUCache(lo)); + table_options.block_cache = cache; + table_options.flush_block_policy_factory = + std::make_shared(); + + Options options; + options.compression = kSnappyCompression; + options.compression_opts.max_dict_bytes = kMaxDictBytes; + options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + test::StringSink* sink = new test::StringSink(); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "test_file_name", FileOptions())); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kSnappyCompression, + options.compression_opts, kUnknownColumnFamily, + "test_cf", -1 /* level */), + file_writer.get())); + + std::string key1 = "key1"; + std::string value1 = "val1"; + InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue); + // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy + // therefore won't trigger any data block's buffering + builder->Add(ik1.Encode(), value1); + ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + std::string key2 = "key2"; + std::string value2 = "val2"; + InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue); + // Adding the second key will trigger a flush of the last data block (the one + // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of that data block. 
+ builder->Add(ik2.Encode(), value2); + // Cache reservation will increase for last buffered data block (the one + // containing key1 and value1) since the buffer limit is not exceeded after + // that buffering and the cache will not be full after this reservation + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + ASSERT_OK(builder->Finish()); + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); +} + +TEST_P( + BlockBasedTableTest, + IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnBufferLimitExceed) { + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024; + constexpr std::size_t kMaxDictBytes = 1024; + constexpr std::size_t kMaxDictBufferBytes = 2 * kSizeDummyEntry; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr cache(NewLRUCache(lo)); + table_options.block_cache = cache; + table_options.flush_block_policy_factory = + std::make_shared(); + + Options options; + options.compression = kSnappyCompression; + options.compression_opts.max_dict_bytes = kMaxDictBytes; + options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + test::StringSink* sink = new test::StringSink(); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "test_file_name", FileOptions())); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(options.table_factory->NewTableBuilder( + 
TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kSnappyCompression, + options.compression_opts, kUnknownColumnFamily, + "test_cf", -1 /* level */), + file_writer.get())); + + std::string key1 = "key1"; + std::string value1(kSizeDummyEntry, '0'); + InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue); + // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy + // therefore won't trigger any data block's buffering + builder->Add(ik1.Encode(), value1); + ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + std::string key2 = "key2"; + std::string value2(kSizeDummyEntry, '0'); + InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue); + // Adding the second key will trigger a flush of the last data block (the one + // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. + builder->Add(ik2.Encode(), value2); + // Cache reservation will increase for last buffered data block (the one + // containing key1 and value1) since the buffer limit is not exceeded after + // the buffering and the cache will not be full after this reservation + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + std::string key3 = "key3"; + std::string value3 = "val3"; + InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue); + // Adding the third key will trigger a flush of the last data block (the one + // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. 
+ builder->Add(ik3.Encode(), value3); + // Cache reservation will decrease since the buffer limit is now exceeded + // after the last buffering and EnterUnbuffered() is triggered + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + ASSERT_OK(builder->Finish()); + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); +} + +TEST_P( + BlockBasedTableTest, + IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnCacheFull) { + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + // A small kCacheCapacity is chosen so that increase cache reservation for + // buffering two data blocks, each containing key1/value1, key2/a big + // value2, will cause cache full + constexpr std::size_t kCacheCapacity = + 1 * kSizeDummyEntry + kSizeDummyEntry / 2; + constexpr std::size_t kMaxDictBytes = 1024; + // A big kMaxDictBufferBytes is chosen so that adding a big key value pair + // (key2, value2) won't exceed the buffer limit + constexpr std::size_t kMaxDictBufferBytes = 1024 * 1024 * 1024; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr cache(NewLRUCache(lo)); + table_options.block_cache = cache; + table_options.flush_block_policy_factory = + std::make_shared(); + + Options options; + options.compression = kSnappyCompression; + options.compression_opts.max_dict_bytes = kMaxDictBytes; + options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + test::StringSink* sink = new test::StringSink(); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "test_file_name", FileOptions())); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + InternalKeyComparator 
ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kSnappyCompression, + options.compression_opts, kUnknownColumnFamily, + "test_cf", -1 /* level */), + file_writer.get())); + + std::string key1 = "key1"; + std::string value1 = "val1"; + InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue); + // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy + // therefore won't trigger any data block's buffering + builder->Add(ik1.Encode(), value1); + ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + std::string key2 = "key2"; + std::string value2(kSizeDummyEntry, '0'); + InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue); + // Adding the second key will trigger a flush of the last data block (the one + // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. + builder->Add(ik2.Encode(), value2); + // Cache reservation will increase for the last buffered data block (the one + // containing key1 and value1) since the buffer limit is not exceeded after + // the buffering and the cache will not be full after this reservation + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + std::string key3 = "key3"; + std::string value3 = "value3"; + InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue); + // Adding the third key will trigger a flush of the last data block (the one + // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. 
+ builder->Add(ik3.Encode(), value3); + // Cache reservation will decrease since the cache is now full after + // increasing reservation for the last buffered block and EnterUnbuffered() is + // triggered + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + ASSERT_OK(builder->Finish()); + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); } } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/two_level_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/two_level_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/two_level_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/two_level_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -43,6 +43,10 @@ assert(Valid()); return second_level_iter_.key(); } + Slice user_key() const override { + assert(Valid()); + return second_level_iter_.user_key(); + } IndexValue value() const override { assert(Valid()); return second_level_iter_.value(); @@ -197,6 +201,10 @@ state_->NewSecondaryIterator(handle); data_block_handle_ = handle; SetSecondLevelIterator(iter); + if (iter == nullptr) { + status_ = Status::Corruption("Missing block for partition " + + handle.ToString()); + } } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,166 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "table/unique_id_impl.h" +#include "util/coding_lean.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string EncodeSessionId(uint64_t upper, uint64_t lower) { + std::string db_session_id(20U, '\0'); + char *buf = &db_session_id[0]; + // Preserving `lower` is slightly tricky. 36^12 is slightly more than + // 62 bits, so we use 12 chars plus the bottom two bits of one more. + // (A tiny fraction of 20 digit strings go unused.) + uint64_t a = (upper << 2) | (lower >> 62); + uint64_t b = lower & (UINT64_MAX >> 2); + PutBaseChars<36>(&buf, 8, a, /*uppercase*/ true); + PutBaseChars<36>(&buf, 12, b, /*uppercase*/ true); + assert(buf == &db_session_id.back() + 1); + return db_session_id; +} + +Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower) { + const size_t len = db_session_id.size(); + if (len == 0) { + return Status::NotSupported("Missing db_session_id"); + } + // Anything from 13 to 24 chars is reasonable. We don't have to limit to + // exactly 20. 
+ if (len < 13) { + return Status::NotSupported("Too short db_session_id"); + } + if (len > 24) { + return Status::NotSupported("Too long db_session_id"); + } + uint64_t a = 0, b = 0; + const char *buf = &db_session_id.front(); + bool success = ParseBaseChars<36>(&buf, len - 12U, &a); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + success = ParseBaseChars<36>(&buf, 12U, &b); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + assert(buf == &db_session_id.back() + 1); + *upper = a >> 2; + *lower = (b & (UINT64_MAX >> 2)) | (a << 62); + return Status::OK(); +} + +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x3 *out) { + if (db_id.empty()) { + return Status::NotSupported("Missing db_id"); + } + if (file_number == 0) { + return Status::NotSupported("Missing or bad file number"); + } + if (db_session_id.empty()) { + return Status::NotSupported("Missing db_session_id"); + } + uint64_t session_upper = 0; // Assignment to appease clang-analyze + uint64_t session_lower = 0; // Assignment to appease clang-analyze + { + Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); + if (!s.ok()) { + return s; + } + } + + // Exactly preserve session lower to ensure that session ids generated + // during the same process lifetime are guaranteed unique. + // DBImpl also guarantees (in recent versions) that this is not zero, + // so that we can guarantee unique ID is never all zeros. (Can't assert + // that here because of testing and old versions.) + // We put this first in anticipation of matching a small-ish set of cache + // key prefixes to cover entries relevant to any DB. + (*out)[0] = session_lower; + + // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) + // for very high global uniqueness entropy. 
+ // (It is possible that many DBs descended from one common DB id are copied + // around and proliferate, in which case session id is critical, but it is + // more common for different DBs to have different DB ids.) + uint64_t db_a, db_b; + Hash2x64(db_id.data(), db_id.size(), session_upper, &db_a, &db_b); + + // Xor in file number for guaranteed uniqueness by file number for a given + // session and DB id. (Xor slightly better than + here. See + // https://github.com/pdillinger/unique_id ) + (*out)[1] = db_a ^ file_number; + + // Extra (optional) global uniqueness + (*out)[2] = db_b; + + return Status::OK(); +} + +namespace { +// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all +// zeros in first 128 bits to map to itself, so that excluding zero in +// internal IDs (session_lower != 0 above) does the same for external IDs. +// These values are meaningless except for making that work. +constexpr uint64_t kHiOffsetForZero = 17391078804906429400U; +constexpr uint64_t kLoOffsetForZero = 6417269962128484497U; +} // namespace + +void InternalUniqueIdToExternal(UniqueId64x3 *in_out) { + uint64_t hi, lo; + BijectiveHash2x64((*in_out)[1] + kHiOffsetForZero, + (*in_out)[0] + kLoOffsetForZero, &hi, &lo); + (*in_out)[0] = lo; + (*in_out)[1] = hi; + (*in_out)[2] += lo + hi; +} + +void ExternalUniqueIdToInternal(UniqueId64x3 *in_out) { + uint64_t lo = (*in_out)[0]; + uint64_t hi = (*in_out)[1]; + (*in_out)[2] -= lo + hi; + BijectiveUnhash2x64(hi, lo, &hi, &lo); + (*in_out)[0] = lo - kLoOffsetForZero; + (*in_out)[1] = hi - kHiOffsetForZero; +} + +std::string EncodeUniqueIdBytes(const UniqueId64x3 &in) { + std::string ret(24U, '\0'); + EncodeFixed64(&ret[0], in[0]); + EncodeFixed64(&ret[8], in[1]); + EncodeFixed64(&ret[16], in[2]); + return ret; +} + +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id) { + UniqueId64x3 tmp{}; + Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id, + 
props.orig_file_number, &tmp); + if (s.ok()) { + InternalUniqueIdToExternal(&tmp); + *out_id = EncodeUniqueIdBytes(tmp); + } else { + out_id->clear(); + } + return s; +} + +std::string UniqueIdToHumanString(const std::string &id) { + // Not so efficient, but that's OK + std::string str = Slice(id).ToString(/*hex*/ true); + for (size_t i = 16; i < str.size(); i += 17) { + str.insert(i, "-"); + } + return str; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id_impl.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/unique_id.h" + +namespace ROCKSDB_NAMESPACE { + +using UniqueId64x3 = std::array; + +// Helper for GetUniqueIdFromTableProperties. This function can also be used +// for temporary ids for files without sufficient information in table +// properties. The internal unique id is more structured than the public +// unique id, so can be manipulated in more ways but very carefully. +// These must be long term stable to ensure GetUniqueIdFromTableProperties +// is long term stable. +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x3 *out); + +// Helper for GetUniqueIdFromTableProperties. External unique ids go through +// this extra hashing layer so that prefixes of the unique id have predictable +// "full" entropy. 
This hashing layer is 1-to-1 on the first 128 bits and on +// the full 192 bits. +// This transformation must be long term stable to ensure +// GetUniqueIdFromTableProperties is long term stable. +void InternalUniqueIdToExternal(UniqueId64x3 *in_out); + +// Reverse of InternalUniqueIdToExternal mostly for testing purposes +// (demonstrably 1-to-1 on the first 128 bits and on the full 192 bits). +void ExternalUniqueIdToInternal(UniqueId64x3 *in_out); + +// Convert numerical format to byte format for public API +std::string EncodeUniqueIdBytes(const UniqueId64x3 &in); + +// Reformat a random value down to our "DB session id" format, +// which is intended to be compact and friendly for use in file names. +// `lower` is fully preserved and data is lost from `upper`. +// +// Detail: Encoded into 20 chars in base-36 ([0-9A-Z]), which is ~103 bits of +// entropy, which is enough to expect no collisions across a billion servers +// each opening DBs a million times (~2^50). Benefits vs. RFC-4122 unique id: +// * Save ~ dozen bytes per SST file +// * Shorter shared backup file names (some platforms have low limits) +// * Visually distinct from DB id format (usually RFC-4122) +std::string EncodeSessionId(uint64_t upper, uint64_t lower); + +// Reverse of EncodeSessionId. Returns NotSupported on error rather than +// Corruption because non-standard session IDs should be allowed with degraded +// functionality. 
+Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,437 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright 2014 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// This test uses a custom Env to keep track of the state of a filesystem as of -// the last "sync". It then checks for data loss errors by purposely dropping -// file data (or entire files) not protected by a "sync". - -#include "test_util/fault_injection_test_env.h" -#include -#include - -namespace ROCKSDB_NAMESPACE { - -// Assume a filename, and not a directory name like "/foo/bar/" -std::string GetDirName(const std::string filename) { - size_t found = filename.find_last_of("/\\"); - if (found == std::string::npos) { - return ""; - } else { - return filename.substr(0, found); - } -} - -// A basic file truncation function suitable for this test. 
-Status Truncate(Env* env, const std::string& filename, uint64_t length) { - std::unique_ptr orig_file; - const EnvOptions options; - Status s = env->NewSequentialFile(filename, &orig_file, options); - if (!s.ok()) { - fprintf(stderr, "Cannot open file %s for truncation: %s\n", - filename.c_str(), s.ToString().c_str()); - return s; - } - - std::unique_ptr scratch(new char[length]); - ROCKSDB_NAMESPACE::Slice result; - s = orig_file->Read(length, &result, scratch.get()); -#ifdef OS_WIN - orig_file.reset(); -#endif - if (s.ok()) { - std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; - std::unique_ptr tmp_file; - s = env->NewWritableFile(tmp_name, &tmp_file, options); - if (s.ok()) { - s = tmp_file->Append(result); - if (s.ok()) { - s = env->RenameFile(tmp_name, filename); - } else { - fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(), - filename.c_str(), s.ToString().c_str()); - env->DeleteFile(tmp_name); - } - } - } - if (!s.ok()) { - fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), - s.ToString().c_str()); - } - - return s; -} - -// Trim the tailing "/" in the end of `str` -std::string TrimDirname(const std::string& str) { - size_t found = str.find_last_not_of("/"); - if (found == std::string::npos) { - return str; - } - return str.substr(0, found + 1); -} - -// Return pair of a full path. -std::pair GetDirAndName(const std::string& name) { - std::string dirname = GetDirName(name); - std::string fname = name.substr(dirname.size() + 1); - return std::make_pair(dirname, fname); -} - -Status FileState::DropUnsyncedData(Env* env) const { - ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; - return Truncate(env, filename_, sync_pos); -} - -Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { - ssize_t sync_pos = pos_at_last_sync_ == -1 ? 
0 : pos_at_last_sync_; - assert(pos_ >= sync_pos); - int range = static_cast(pos_ - sync_pos); - uint64_t truncated_size = - static_cast(sync_pos) + rand->Uniform(range); - return Truncate(env, filename_, truncated_size); -} - -Status TestDirectory::Fsync() { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - env_->SyncDir(dirname_); - return dir_->Fsync(); -} - -TestWritableFile::TestWritableFile(const std::string& fname, - std::unique_ptr&& f, - FaultInjectionTestEnv* env) - : state_(fname), - target_(std::move(f)), - writable_file_opened_(true), - env_(env) { - assert(target_ != nullptr); - state_.pos_ = 0; -} - -TestWritableFile::~TestWritableFile() { - if (writable_file_opened_) { - Close(); - } -} - -Status TestWritableFile::Append(const Slice& data) { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - Status s = target_->Append(data); - if (s.ok()) { - state_.pos_ += data.size(); - env_->WritableFileAppended(state_); - } - return s; -} - -Status TestWritableFile::Close() { - writable_file_opened_ = false; - Status s = target_->Close(); - if (s.ok()) { - env_->WritableFileClosed(state_); - } - return s; -} - -Status TestWritableFile::Flush() { - Status s = target_->Flush(); - if (s.ok() && env_->IsFilesystemActive()) { - state_.pos_at_last_flush_ = state_.pos_; - } - return s; -} - -Status TestWritableFile::Sync() { - if (!env_->IsFilesystemActive()) { - return Status::IOError("FaultInjectionTestEnv: not active"); - } - // No need to actual sync. 
- state_.pos_at_last_sync_ = state_.pos_; - env_->WritableFileSynced(state_); - return Status::OK(); -} - -TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, - std::unique_ptr&& f, - FaultInjectionTestEnv* env) - : target_(std::move(f)), file_opened_(true), env_(env) { - assert(target_ != nullptr); -} - -TestRandomRWFile::~TestRandomRWFile() { - if (file_opened_) { - Close(); - } -} - -Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Write(offset, data); -} - -Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Read(offset, n, result, scratch); -} - -Status TestRandomRWFile::Close() { - file_opened_ = false; - return target_->Close(); -} - -Status TestRandomRWFile::Flush() { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Flush(); -} - -Status TestRandomRWFile::Sync() { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Sync(); -} - -Status FaultInjectionTestEnv::NewDirectory(const std::string& name, - std::unique_ptr* result) { - std::unique_ptr r; - Status s = target()->NewDirectory(name, &r); - assert(s.ok()); - if (!s.ok()) { - return s; - } - result->reset(new TestDirectory(this, TrimDirname(name), r.release())); - return Status::OK(); -} - -Status FaultInjectionTestEnv::NewWritableFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - // Not allow overwriting files - Status s = target()->FileExists(fname); - if (s.ok()) { - return Status::Corruption("File already exists."); - } else if (!s.IsNotFound()) { - assert(s.IsIOError()); - return s; - } - s = target()->NewWritableFile(fname, result, soptions); - if (s.ok()) { - result->reset(new 
TestWritableFile(fname, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. - UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); - } - return s; -} - -Status FaultInjectionTestEnv::ReopenWritableFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status s = target()->ReopenWritableFile(fname, result, soptions); - if (s.ok()) { - result->reset(new TestWritableFile(fname, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. - UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); - } - return s; -} - -Status FaultInjectionTestEnv::NewRandomRWFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status s = target()->NewRandomRWFile(fname, result, soptions); - if (s.ok()) { - result->reset(new TestRandomRWFile(fname, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. 
- UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); - } - return s; -} - -Status FaultInjectionTestEnv::NewRandomAccessFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - return target()->NewRandomAccessFile(fname, result, soptions); -} - -Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status s = EnvWrapper::DeleteFile(f); - if (!s.ok()) { - fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), - s.ToString().c_str()); - } - if (s.ok()) { - UntrackFile(f); - } - return s; -} - -Status FaultInjectionTestEnv::RenameFile(const std::string& s, - const std::string& t) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status ret = EnvWrapper::RenameFile(s, t); - - if (ret.ok()) { - MutexLock l(&mutex_); - if (db_file_state_.find(s) != db_file_state_.end()) { - db_file_state_[t] = db_file_state_[s]; - db_file_state_.erase(s); - } - - auto sdn = GetDirAndName(s); - auto tdn = GetDirAndName(t); - if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { - auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; - assert(tlist.find(tdn.second) == tlist.end()); - tlist.insert(tdn.second); - } - } - - return ret; -} - -void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { - MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { - db_file_state_[state.filename_] = state; - open_files_.erase(state.filename_); - } -} - -void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { - MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { - if (db_file_state_.find(state.filename_) == db_file_state_.end()) { - 
db_file_state_.insert(std::make_pair(state.filename_, state)); - } else { - db_file_state_[state.filename_] = state; - } - } -} - -void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) { - MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { - if (db_file_state_.find(state.filename_) == db_file_state_.end()) { - db_file_state_.insert(std::make_pair(state.filename_, state)); - } else { - db_file_state_[state.filename_] = state; - } - } -} - -// For every file that is not fully synced, make a call to `func` with -// FileState of the file as the parameter. -Status FaultInjectionTestEnv::DropFileData( - std::function func) { - Status s; - MutexLock l(&mutex_); - for (std::map::const_iterator it = - db_file_state_.begin(); - s.ok() && it != db_file_state_.end(); ++it) { - const FileState& state = it->second; - if (!state.IsFullySynced()) { - s = func(target(), state); - } - } - return s; -} - -Status FaultInjectionTestEnv::DropUnsyncedFileData() { - return DropFileData([&](Env* env, const FileState& state) { - return state.DropUnsyncedData(env); - }); -} - -Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) { - return DropFileData([&](Env* env, const FileState& state) { - return state.DropRandomUnsyncedData(env, rnd); - }); -} - -Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() { - // Because DeleteFile access this container make a copy to avoid deadlock - std::map> map_copy; - { - MutexLock l(&mutex_); - map_copy.insert(dir_to_new_files_since_last_sync_.begin(), - dir_to_new_files_since_last_sync_.end()); - } - - for (auto& pair : map_copy) { - for (std::string name : pair.second) { - Status s = DeleteFile(pair.first + "/" + name); - if (!s.ok()) { - return s; - } - } - } - return Status::OK(); -} -void FaultInjectionTestEnv::ResetState() { - MutexLock l(&mutex_); - db_file_state_.clear(); - dir_to_new_files_since_last_sync_.clear(); - SetFilesystemActiveNoLock(true); -} - -void 
FaultInjectionTestEnv::UntrackFile(const std::string& f) { - MutexLock l(&mutex_); - auto dir_and_name = GetDirAndName(f); - dir_to_new_files_since_last_sync_[dir_and_name.first].erase( - dir_and_name.second); - db_file_state_.erase(f); - open_files_.erase(f); -} -} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,225 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright 2014 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// This test uses a custom Env to keep track of the state of a filesystem as of -// the last "sync". It then checks for data loss errors by purposely dropping -// file data (or entire files) not protected by a "sync". 
- -#pragma once - -#include -#include -#include - -#include "db/version_set.h" -#include "env/mock_env.h" -#include "file/filename.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "util/mutexlock.h" -#include "util/random.h" - -namespace ROCKSDB_NAMESPACE { - -class TestWritableFile; -class FaultInjectionTestEnv; - -struct FileState { - std::string filename_; - ssize_t pos_; - ssize_t pos_at_last_sync_; - ssize_t pos_at_last_flush_; - - explicit FileState(const std::string& filename) - : filename_(filename), - pos_(-1), - pos_at_last_sync_(-1), - pos_at_last_flush_(-1) {} - - FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} - - bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } - - Status DropUnsyncedData(Env* env) const; - - Status DropRandomUnsyncedData(Env* env, Random* rand) const; -}; - -// A wrapper around WritableFileWriter* file -// is written to or sync'ed. -class TestWritableFile : public WritableFile { - public: - explicit TestWritableFile(const std::string& fname, - std::unique_ptr&& f, - FaultInjectionTestEnv* env); - virtual ~TestWritableFile(); - virtual Status Append(const Slice& data) override; - virtual Status Truncate(uint64_t size) override { - return target_->Truncate(size); - } - virtual Status Close() override; - virtual Status Flush() override; - virtual Status Sync() override; - virtual bool IsSyncThreadSafe() const override { return true; } - virtual Status PositionedAppend(const Slice& data, - uint64_t offset) override { - return target_->PositionedAppend(data, offset); - } - virtual bool use_direct_io() const override { - return target_->use_direct_io(); - }; - - private: - FileState state_; - std::unique_ptr target_; - bool writable_file_opened_; - FaultInjectionTestEnv* env_; -}; - -// A wrapper around WritableFileWriter* file -// is written to or sync'ed. 
-class TestRandomRWFile : public RandomRWFile { - public: - explicit TestRandomRWFile(const std::string& fname, - std::unique_ptr&& f, - FaultInjectionTestEnv* env); - virtual ~TestRandomRWFile(); - Status Write(uint64_t offset, const Slice& data) override; - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; - Status Close() override; - Status Flush() override; - Status Sync() override; - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - bool use_direct_io() const override { return target_->use_direct_io(); }; - - private: - std::unique_ptr target_; - bool file_opened_; - FaultInjectionTestEnv* env_; -}; - -class TestDirectory : public Directory { - public: - explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, - Directory* dir) - : env_(env), dirname_(dirname), dir_(dir) {} - ~TestDirectory() {} - - virtual Status Fsync() override; - - private: - FaultInjectionTestEnv* env_; - std::string dirname_; - std::unique_ptr dir_; -}; - -class FaultInjectionTestEnv : public EnvWrapper { - public: - explicit FaultInjectionTestEnv(Env* base) - : EnvWrapper(base), filesystem_active_(true) {} - virtual ~FaultInjectionTestEnv() {} - - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - virtual Status DeleteFile(const std::string& f) override; - - virtual Status RenameFile(const std::string& s, - const std::string& t) override; - -// Undef to eliminate 
clash on Windows -#undef GetFreeSpace - virtual Status GetFreeSpace(const std::string& path, - uint64_t* disk_free) override { - if (!IsFilesystemActive() && error_ == Status::NoSpace()) { - *disk_free = 0; - return Status::OK(); - } else { - return target()->GetFreeSpace(path, disk_free); - } - } - - void WritableFileClosed(const FileState& state); - - void WritableFileSynced(const FileState& state); - - void WritableFileAppended(const FileState& state); - - // For every file that is not fully synced, make a call to `func` with - // FileState of the file as the parameter. - Status DropFileData(std::function func); - - Status DropUnsyncedFileData(); - - Status DropRandomUnsyncedFileData(Random* rnd); - - Status DeleteFilesCreatedAfterLastDirSync(); - - void ResetState(); - - void UntrackFile(const std::string& f); - - void SyncDir(const std::string& dirname) { - MutexLock l(&mutex_); - dir_to_new_files_since_last_sync_.erase(dirname); - } - - // Setting the filesystem to inactive is the test equivalent to simulating a - // system reset. Setting to inactive will freeze our saved filesystem state so - // that it will stop being recorded. It can then be reset back to the state at - // the time of the reset. 
- bool IsFilesystemActive() { - MutexLock l(&mutex_); - return filesystem_active_; - } - void SetFilesystemActiveNoLock(bool active, - Status error = Status::Corruption("Not active")) { - filesystem_active_ = active; - if (!active) { - error_ = error; - } - } - void SetFilesystemActive(bool active, - Status error = Status::Corruption("Not active")) { - MutexLock l(&mutex_); - SetFilesystemActiveNoLock(active, error); - } - void AssertNoOpenFile() { assert(open_files_.empty()); } - Status GetError() { return error_; } - - private: - port::Mutex mutex_; - std::map db_file_state_; - std::set open_files_; - std::unordered_map> - dir_to_new_files_since_last_sync_; - bool filesystem_active_; // Record flushes, syncs, writes - Status error_; -}; - -} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/mock_time_env.h" + +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO: this is a workaround for the different behavior on different platform +// for timedwait timeout. Ideally timedwait API should be moved to env. +// details: PR #7101. +void MockSystemClock::InstallTimedWaitFixCallback() { +#ifndef NDEBUG + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +#ifdef OS_MACOSX + // This is an alternate way (vs. 
SpecialEnv) of dealing with the fact + // that on some platforms, pthread_cond_timedwait does not appear to + // release the lock for other threads to operate if the deadline time + // is already passed. (TimedWait calls are currently a bad abstraction + // because the deadline parameter is usually computed from Env time, + // but is interpreted in real clock time.) + SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < this->RealNowMicros()) { + *reinterpret_cast(arg) = this->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX + SyncPoint::GetInstance()->EnableProcessing(); +#endif // !NDEBUG +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,41 +5,73 @@ #pragma once -#include "rocksdb/env.h" +#include +#include + +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { -class MockTimeEnv : public EnvWrapper { +// NOTE: SpecialEnv offers most of this functionality, along with hooks +// for safe DB behavior under a mock time environment, so should be used +// instead of MockSystemClock for DB tests. 
+class MockSystemClock : public SystemClockWrapper { public: - explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + explicit MockSystemClock(const std::shared_ptr& base) + : SystemClockWrapper(base) {} - virtual Status GetCurrentTime(int64_t* time) override { - assert(time != nullptr); - assert(current_time_ <= - static_cast(std::numeric_limits::max())); - *time = static_cast(current_time_); + static const char* kClassName() { return "MockSystemClock"; } + const char* Name() const override { return kClassName(); } + virtual Status GetCurrentTime(int64_t* time_sec) override { + assert(time_sec != nullptr); + *time_sec = static_cast(current_time_us_ / kMicrosInSecond); return Status::OK(); } - virtual uint64_t NowMicros() override { - assert(current_time_ <= std::numeric_limits::max() / 1000000); - return current_time_ * 1000000; - } + virtual uint64_t NowSeconds() { return current_time_us_ / kMicrosInSecond; } + + virtual uint64_t NowMicros() override { return current_time_us_; } virtual uint64_t NowNanos() override { - assert(current_time_ <= std::numeric_limits::max() / 1000000000); - return current_time_ * 1000000000; + assert(current_time_us_ <= std::numeric_limits::max() / 1000); + return current_time_us_ * 1000; + } + + uint64_t RealNowMicros() { return target_->NowMicros(); } + + void SetCurrentTime(uint64_t time_sec) { + assert(time_sec < std::numeric_limits::max() / kMicrosInSecond); + assert(time_sec * kMicrosInSecond >= current_time_us_); + current_time_us_ = time_sec * kMicrosInSecond; } - uint64_t RealNowMicros() { return target()->NowMicros(); } + // It's a fake sleep that just updates the Env current time, which is similar + // to `NoSleepEnv.SleepForMicroseconds()` and + // `SpecialEnv.MockSleepForMicroseconds()`. + // It's also similar to `set_current_time()`, which takes an absolute time in + // seconds, vs. this one takes the sleep in microseconds. + // Note: Not thread safe. 
+ void SleepForMicroseconds(int micros) override { + assert(micros >= 0); + assert(current_time_us_ + static_cast(micros) >= + current_time_us_); + current_time_us_.fetch_add(micros); + } - void set_current_time(uint64_t time) { - assert(time >= current_time_); - current_time_ = time; + void MockSleepForSeconds(int seconds) { + assert(seconds >= 0); + int micros = seconds * kMicrosInSecond; + SleepForMicroseconds(micros); } + // TODO: this is a workaround for the different behavior on different platform + // for timedwait timeout. Ideally timedwait API should be moved to env. + // details: PR #7101. + void InstallTimedWaitFixCallback(); + private: - std::atomic current_time_{0}; + std::atomic current_time_us_{0}; + static constexpr uint64_t kMicrosInSecond = 1000U * 1000U; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,10 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "test_util/sync_point.h" + +#include + #include "test_util/sync_point_impl.h" -int rocksdb_kill_odds = 0; -std::vector rocksdb_kill_prefix_blacklist; +std::vector rocksdb_kill_exclude_prefixes; #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { @@ -58,9 +60,33 @@ impl_->ClearTrace(); } -void SyncPoint::Process(const std::string& point, void* cb_arg) { +void SyncPoint::Process(const Slice& point, void* cb_arg) { impl_->Process(point, cb_arg); } } // namespace ROCKSDB_NAMESPACE #endif // NDEBUG + +namespace ROCKSDB_NAMESPACE { +void SetupSyncPointsToMockDirectIO() { +#if !defined(NDEBUG) && !defined(OS_MACOSX) && !defined(OS_WIN) && \ + !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewSequentialFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); +#endif +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,7 @@ #pragma once #include + #include #include #include @@ -12,35 +13,44 @@ #include #include "rocksdb/rocksdb_namespace.h" - -// This is only set from db_stress.cc and for testing only. 
-// If non-zero, kill at various points in source code with probability 1/this -extern int rocksdb_kill_odds; -// If kill point has a prefix on this list, will skip killing. -extern std::vector rocksdb_kill_prefix_blacklist; +#include "rocksdb/slice.h" #ifdef NDEBUG // empty in release build -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) +#define TEST_KILL_RANDOM(kill_point) #else namespace ROCKSDB_NAMESPACE { -// Kill the process with probability 1/odds for testing. -extern void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline); // To avoid crashing always at some frequently executed codepaths (during // kill random test), use this factor to reduce odds #define REDUCE_ODDS 2 #define REDUCE_ODDS2 4 -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) \ - { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(kill_point, rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ +// A class used to pass when a kill point is reached. +struct KillPoint { + public: + // This is only set from db_stress.cc and for testing only. + // If non-zero, kill at various points in source code with probability 1/this + int rocksdb_kill_odds = 0; + // If kill point has a prefix on this list, will skip killing. + std::vector rocksdb_kill_exclude_prefixes; + // Kill the process with probability 1/odds for testing. 
+ void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline); + + static KillPoint* GetInstance(); +}; + +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) \ + { \ + KillPoint::GetInstance()->TestKillRandom( \ + kill_point, rocksdb_kill_odds_weight, __FILE__, __LINE__); \ } +#define TEST_KILL_RANDOM(kill_point) TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, 1) } // namespace ROCKSDB_NAMESPACE + #endif #ifdef NDEBUG @@ -109,7 +119,16 @@ // triggered by TEST_SYNC_POINT, blocking execution until all predecessors // are executed. // And/or call registered callback function, with argument `cb_arg` - void Process(const std::string& point, void* cb_arg = nullptr); + void Process(const Slice& point, void* cb_arg = nullptr); + + // template gets length of const string at compile time, + // avoiding strlen() at runtime + template + void Process(const char (&point)[kLen], void* cb_arg = nullptr) { + static_assert(kLen > 0, "Must not be empty"); + assert(point[kLen - 1] == '\0'); + Process(Slice(point, kLen - 1), cb_arg); + } // TODO: it might be useful to provide a function that blocks until all // sync points are cleared. @@ -124,10 +143,13 @@ Data* impl_; }; +// Sets up sync points to mock direct IO instead of actually issuing direct IO +// to the file system. +void SetupSyncPointsToMockDirectIO(); } // namespace ROCKSDB_NAMESPACE // Use TEST_SYNC_POINT to specify sync points inside code base. -// Sync points can have happens-after depedency on other sync points, +// Sync points can have happens-after dependency on other sync points, // configured at runtime via SyncPoint::LoadDependency. This could be // utilized to re-produce race conditions between threads. // See TransactionLogIteratorRace in db_test.cc for an example use case. 
@@ -142,3 +164,17 @@ #define INIT_SYNC_POINT_SINGLETONS() \ (void)ROCKSDB_NAMESPACE::SyncPoint::GetInstance(); #endif // NDEBUG + +// Callback sync point for any read IO errors that should be ignored by +// the fault injection framework +// Disable in release mode +#ifdef NDEBUG +#define IGNORE_STATUS_IF_ERROR(_status_) +#else +#define IGNORE_STATUS_IF_ERROR(_status_) \ + { \ + if (!_status_.ok()) { \ + TEST_SYNC_POINT("FaultInjectionIgnoreError"); \ + } \ + } +#endif // NDEBUG diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,18 @@ #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +KillPoint* KillPoint::GetInstance() { + static KillPoint kp; + return &kp; +} -void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline) { - for (auto& p : rocksdb_kill_prefix_blacklist) { +void KillPoint::TestKillRandom(std::string kill_point, int odds_weight, + const std::string& srcfile, int srcline) { + if (rocksdb_kill_odds <= 0) { + return; + } + int odds = rocksdb_kill_odds * odds_weight; + for (auto& p : rocksdb_kill_exclude_prefixes) { if (kill_point.substr(0, p.length()) == p) { return; } @@ -29,7 +37,6 @@ } } - void SyncPoint::Data::LoadDependency(const std::vector& dependencies) { std::lock_guard lock(mutex_); successors_.clear(); @@ -38,6 +45,8 @@ for (const auto& dependency : dependencies) { successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } cv_.notify_all(); } @@ -54,11 +63,15 @@ for (const auto& dependency : dependencies) { 
successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } for (const auto& marker : markers) { successors_[marker.predecessor].push_back(marker.successor); predecessors_[marker.successor].push_back(marker.predecessor); markers_[marker.predecessor].push_back(marker.successor); + point_filter_.Add(marker.predecessor); + point_filter_.Add(marker.successor); } cv_.notify_all(); } @@ -88,33 +101,42 @@ callbacks_.clear(); } -void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { +void SyncPoint::Data::Process(const Slice& point, void* cb_arg) { if (!enabled_) { return; } + // Use a filter to prevent mutex lock if possible. + if (!point_filter_.MayContain(point)) { + return; + } + + // Must convert to std::string for remaining work. Take + // heap hit. + std::string point_string(point.ToString()); std::unique_lock lock(mutex_); auto thread_id = std::this_thread::get_id(); - auto marker_iter = markers_.find(point); + auto marker_iter = markers_.find(point_string); if (marker_iter != markers_.end()) { for (auto& marked_point : marker_iter->second) { marked_thread_id_.emplace(marked_point, thread_id); + point_filter_.Add(marked_point); } } - if (DisabledByMarker(point, thread_id)) { + if (DisabledByMarker(point_string, thread_id)) { return; } - while (!PredecessorsAllCleared(point)) { + while (!PredecessorsAllCleared(point_string)) { cv_.wait(lock); - if (DisabledByMarker(point, thread_id)) { + if (DisabledByMarker(point_string, thread_id)) { return; } } - auto callback_pair = callbacks_.find(point); + auto callback_pair = callbacks_.find(point_string); if (callback_pair != callbacks_.end()) { num_callbacks_running_++; mutex_.unlock(); @@ -122,7 +144,7 @@ mutex_.lock(); num_callbacks_running_--; } - cleared_points_.insert(point); + cleared_points_.insert(point_string); cv_.notify_all(); } 
} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -3,9 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "test_util/sync_point.h" - #include + #include #include #include @@ -15,15 +14,39 @@ #include #include +#include "memory/concurrent_arena.h" #include "port/port.h" +#include "test_util/sync_point.h" +#include "util/dynamic_bloom.h" #include "util/random.h" #pragma once #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +// A hacky allocator for single use. +// Arena depends on SyncPoint and create circular dependency. +class SingleAllocator : public Allocator { + public: + char* Allocate(size_t) override { + assert(false); + return nullptr; + } + char* AllocateAligned(size_t bytes, size_t, Logger*) override { + buf_.resize(bytes); + return const_cast(buf_.data()); + } + size_t BlockSize() const override { + assert(false); + return 0; + } + + private: + std::string buf_; +}; + struct SyncPoint::Data { - Data() : enabled_(false) {} + Data() : point_filter_(&alloc_, /*total_bits=*/8192), enabled_(false) {} // Enable proper deletion by subclasses virtual ~Data() {} // successor/predecessor map loaded from LoadDependency @@ -37,6 +60,9 @@ std::condition_variable cv_; // sync points that have been passed through std::unordered_set cleared_points_; + SingleAllocator alloc_; + // A filter before holding mutex to speed up process. 
+ DynamicBloom point_filter_; std::atomic enabled_; int num_callbacks_running_ = 0; @@ -48,6 +74,7 @@ const std::function& callback) { std::lock_guard lock(mutex_); callbacks_[point] = callback; + point_filter_.Add(point); } void ClearCallBack(const std::string& point); @@ -68,7 +95,7 @@ return marked_point_iter != marked_thread_id_.end() && thread_id != marked_point_iter->second; } - void Process(const std::string& point, void* cb_arg); + void Process(const Slice& point, void* cb_arg); }; } // namespace ROCKSDB_NAMESPACE #endif // NDEBUG diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,12 +8,22 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "test_util/testharness.h" + +#include #include #include namespace ROCKSDB_NAMESPACE { namespace test { +#ifdef OS_WIN +#include + +std::string GetPidStr() { return std::to_string(GetCurrentProcessId()); } +#else +std::string GetPidStr() { return std::to_string(getpid()); } +#endif + ::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) { if (s.ok()) { return ::testing::AssertionSuccess(); @@ -26,13 +36,13 @@ std::string TmpDir(Env* env) { std::string dir; Status s = env->GetTestDirectory(&dir); - EXPECT_TRUE(s.ok()) << s.ToString(); + EXPECT_OK(s); return dir; } std::string PerThreadDBPath(std::string dir, std::string name) { size_t tid = std::hash()(std::this_thread::get_id()); - return dir + "/" + name + "_" + std::to_string(tid); + return dir + "/" + name + "_" + GetPidStr() + "_" + std::to_string(tid); } std::string PerThreadDBPath(std::string name) { @@ -52,5 +62,49 @@ return result; } +TestRegex::TestRegex(const std::string& pattern) + : 
impl_(std::make_shared(pattern)), pattern_(pattern) {} +TestRegex::TestRegex(const char* pattern) + : impl_(std::make_shared(pattern)), pattern_(pattern) {} + +const std::string& TestRegex::GetPattern() const { return pattern_; } + +// Sorry about code duplication with regex.cc, but it doesn't support LITE +// due to exception handling +class TestRegex::Impl : public std::regex { + public: + using std::regex::basic_regex; +}; + +bool TestRegex::Matches(const std::string& str) const { + if (impl_) { + return std::regex_match(str, *impl_); + } else { + // Should not call Matches on unset Regex + assert(false); + return false; + } +} + +::testing::AssertionResult AssertMatchesRegex(const char* str_expr, + const char* pattern_expr, + const std::string& str, + const TestRegex& pattern) { + if (pattern.Matches(str)) { + return ::testing::AssertionSuccess(); + } else if (TestRegex("\".*\"").Matches(pattern_expr)) { + // constant regex string + return ::testing::AssertionFailure() + << str << " (" << str_expr << ")" << std::endl + << "does not match regex " << pattern.GetPattern(); + } else { + // runtime regex string + return ::testing::AssertionFailure() + << str << " (" << str_expr << ")" << std::endl + << "does not match regex" << std::endl + << pattern.GetPattern() << " (" << pattern_expr << ")"; + } +} + } // namespace test } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,6 +14,43 @@ #else #include #endif +#include "rocksdb/utilities/regex.h" + +// A "skipped" test has a specific meaning in Facebook infrastructure: the +// test is in good shape and should be run, but something about the +// compilation or execution environment means 
the test cannot be run. +// Specifically, there is a hole in intended testing if any +// parameterization of a test (e.g. Foo/FooTest.Bar/42) is skipped for all +// tested build configurations/platforms/etc. +// +// If GTEST_SKIP is available, use it. Otherwise, define skip as success. +// +// The GTEST macros do not seem to print the message, even with -verbose, +// so these print to stderr. Note that these do not exit the test themselves; +// calling code should 'return' or similar from the test. +#ifdef GTEST_SKIP_ +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SKIP_(m); \ + } while (false) /* user ; */ +#else +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SUCCESS_("SKIPPED: " m); \ + } while (false) /* user ; */ +#endif + +// We add "bypass" as an alternative to ROCKSDB_GTEST_SKIP that is allowed to +// be a permanent condition, e.g. for intentionally omitting or disabling some +// parameterizations for some tests. (Use _DISABLED at the end of the test +// name to disable an entire test.) 
+#define ROCKSDB_GTEST_BYPASS(m) \ + do { \ + fputs("BYPASSED: " m "\n", stderr); \ + GTEST_SUCCESS_("BYPASSED: " m); \ + } while (false) /* user ; */ #include #include "rocksdb/env.h" @@ -43,5 +80,39 @@ EXPECT_PRED_FORMAT1(ROCKSDB_NAMESPACE::test::AssertStatus, s) #define EXPECT_NOK(s) EXPECT_FALSE((s).ok()) +// Useful for testing +// * No need to deal with Status like in Regex public API +// * No triggering lint reports on use of std::regex in tests +// * Available in LITE (unlike public API) +class TestRegex { + public: + // These throw on bad pattern + /*implicit*/ TestRegex(const std::string& pattern); + /*implicit*/ TestRegex(const char* pattern); + + // Checks that the whole of str is matched by this regex + bool Matches(const std::string& str) const; + + const std::string& GetPattern() const; + + private: + class Impl; + std::shared_ptr impl_; // shared_ptr for simple implementation + std::string pattern_; +}; + +::testing::AssertionResult AssertMatchesRegex(const char* str_expr, + const char* pattern_expr, + const std::string& str, + const TestRegex& pattern); + +#define ASSERT_MATCHES_REGEX(str, pattern) \ + ASSERT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern) +#define EXPECT_MATCHES_REGEX(str, pattern) \ + EXPECT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern) + } // namespace test + +using test::TestRegex; + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,9 @@ #include "test_util/testutil.h" +#include +#include + #include #include #include @@ -20,29 +23,27 @@ #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "port/port.h" +#include 
"rocksdb/convenience.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/mock_time_env.h" +#include "test_util/sync_point.h" +#include "util/random.h" + +#ifndef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif namespace ROCKSDB_NAMESPACE { namespace test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; -const uint32_t kLatestFormatVersion = 5u; - -Slice RandomString(Random* rnd, int len, std::string* dst) { - dst->resize(len); - for (int i = 0; i < len; i++) { - (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' - } - return Slice(*dst); -} - -extern std::string RandomHumanReadableString(Random* rnd, int len) { - std::string ret; - ret.resize(len); - for (int i = 0; i < len; ++i) { - ret[i] = static_cast('a' + rnd->Uniform(26)); - } - return ret; -} +const std::set kFooterFormatVersionsToTest{ + 5U, + // In case any interesting future changes + kDefaultFormatVersion, + kLatestFormatVersion, +}; std::string RandomKey(Random* rnd, int len, RandomKeyType type) { // Make sure to generate a wide variety of characters so we @@ -75,8 +76,7 @@ int len, std::string* dst) { int raw = static_cast(len * compressed_fraction); if (raw < 1) raw = 1; - std::string raw_data; - RandomString(rnd, raw, &raw_data); + std::string raw_data = rnd->RandomString(raw); // Duplicate the random data until we have filled "len" bytes dst->clear(); @@ -118,6 +118,59 @@ void FindShortSuccessor(std::string* /*key*/) const override { return; } }; + +// A test implementation of comparator with 64-bit integer timestamp. 
+class ComparatorWithU64TsImpl : public Comparator { + public: + ComparatorWithU64TsImpl() + : Comparator(/*ts_sz=*/sizeof(uint64_t)), + cmp_without_ts_(BytewiseComparator()) { + assert(cmp_without_ts_); + assert(cmp_without_ts_->timestamp_size() == 0); + } + const char* Name() const override { return "ComparatorWithU64Ts"; } + void FindShortSuccessor(std::string*) const override {} + void FindShortestSeparator(std::string*, const Slice&) const override {} + int Compare(const Slice& a, const Slice& b) const override { + int ret = CompareWithoutTimestamp(a, b); + size_t ts_sz = timestamp_size(); + if (ret != 0) { + return ret; + } + // Compare timestamp. + // For the same user key with different timestamps, larger (newer) timestamp + // comes first. + return -CompareTimestamp(ExtractTimestampFromUserKey(a, ts_sz), + ExtractTimestampFromUserKey(b, ts_sz)); + } + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + const size_t ts_sz = timestamp_size(); + assert(!a_has_ts || a.size() >= ts_sz); + assert(!b_has_ts || b.size() >= ts_sz); + Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, ts_sz) : a; + Slice rhs = b_has_ts ? 
StripTimestampFromUserKey(b, ts_sz) : b; + return cmp_without_ts_->Compare(lhs, rhs); + } + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + assert(ts1.size() == sizeof(uint64_t)); + assert(ts2.size() == sizeof(uint64_t)); + uint64_t lhs = DecodeFixed64(ts1.data()); + uint64_t rhs = DecodeFixed64(ts2.data()); + if (lhs < rhs) { + return -1; + } else if (lhs > rhs) { + return 1; + } else { + return 0; + } + } + + private: + const Comparator* cmp_without_ts_{nullptr}; +}; + } // namespace const Comparator* Uint64Comparator() { @@ -125,23 +178,9 @@ return &uint64comp; } -WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname) { - std::unique_ptr file(wf); - return new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); -} - -RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { - std::unique_ptr file(raf); - return new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - "[test RandomAccessFileReader]"); -} - -SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname) { - std::unique_ptr file(se); - return new SequentialFileReader(NewLegacySequentialFileWrapper(file), fname); +const Comparator* ComparatorWithU64Ts() { + static ComparatorWithU64TsImpl comp_with_u64_ts; + return &comp_with_u64_ts; } void CorruptKeyType(InternalKey* ikey) { @@ -159,6 +198,38 @@ return k.Encode().ToString(); } +std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt) { + std::string user_key_with_ts(user_key); + std::string ts_str; + PutFixed64(&ts_str, ts); + user_key_with_ts.append(ts_str); + return KeyStr(user_key_with_ts, seq, t, corrupt); +} + +bool SleepingBackgroundTask::TimedWaitUntilSleeping(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!sleeping_ || !should_sleep_) { + if 
(bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + +bool SleepingBackgroundTask::TimedWaitUntilDone(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!done_with_sleep_) { + if (bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + std::string RandomName(Random* rnd, const size_t len) { std::stringstream ss; for (size_t i = 0; i < len; ++i) { @@ -263,6 +334,7 @@ db_opt->error_if_exists = rnd->Uniform(2); db_opt->is_fd_close_on_exec = rnd->Uniform(2); db_opt->paranoid_checks = rnd->Uniform(2); + db_opt->track_and_verify_wals_in_manifest = rnd->Uniform(2); db_opt->skip_log_error_on_recovery = rnd->Uniform(2); db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2); @@ -323,12 +395,17 @@ cf_opt->force_consistency_checks = rnd->Uniform(2); cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2); cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); + cf_opt->enable_blob_files = rnd->Uniform(2); + cf_opt->enable_blob_garbage_collection = rnd->Uniform(2); // double options cf_opt->hard_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->soft_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->memtable_prefix_bloom_size_ratio = static_cast(rnd->Uniform(10000)) / 20000.0; + cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0; + cf_opt->blob_garbage_collection_force_threshold = + rnd->Uniform(10000) / 10000.0; // int options cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); @@ -372,6 +449,9 @@ cf_opt->target_file_size_base * rnd->Uniform(100); cf_opt->compaction_options_fifo.max_table_files_size = uint_max + rnd->Uniform(10000); + cf_opt->min_blob_size = uint_max + rnd->Uniform(10000); + cf_opt->blob_file_size = uint_max + rnd->Uniform(10000); + cf_opt->blob_compaction_readahead_size = uint_max + rnd->Uniform(10000); // unsigned 
int options cf_opt->rate_limit_delay_max_milliseconds = rnd->Uniform(10000); @@ -390,31 +470,7 @@ cf_opt->compression = RandomCompressionType(rnd); RandomCompressionTypeVector(cf_opt->num_levels, &cf_opt->compression_per_level, rnd); -} - -Status DestroyDir(Env* env, const std::string& dir) { - Status s; - if (env->FileExists(dir).IsNotFound()) { - return s; - } - std::vector files_in_dir; - s = env->GetChildren(dir, &files_in_dir); - if (s.ok()) { - for (auto& file_in_dir : files_in_dir) { - if (file_in_dir == "." || file_in_dir == "..") { - continue; - } - s = env->DeleteFile(dir + "/" + file_in_dir); - if (!s.ok()) { - break; - } - } - } - - if (s.ok()) { - s = env->DeleteDir(dir); - } - return s; + cf_opt->blob_compression_type = RandomCompressionType(rnd); } bool IsDirectIOSupported(Env* env, const std::string& dir) { @@ -433,6 +489,26 @@ return s.ok(); } +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir) { + bool supported = false; + std::string tmp = TempFileName(dir, 999); + Random rnd(301); + std::string test_string = rnd.RandomString(4096); + Slice data(test_string); + Status s = WriteStringToFile(fs.get(), data, tmp, true); + if (s.ok()) { + std::unique_ptr file; + auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr); + if (io_s.ok()) { + supported = !(file->Prefetch(0, data.size(), IOOptions(), nullptr) + .IsNotSupported()); + } + s = fs->DeleteFile(tmp, IOOptions(), nullptr); + } + return s.ok() && supported; +} + size_t GetLinesCount(const std::string& fname, const std::string& pattern) { std::stringstream ssbuf; std::string line; @@ -450,5 +526,269 @@ return count; } +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum /*=true*/) { + uint64_t size; + Status s = env->GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } else if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > static_cast(size)) { + offset = 0; + 
} else { + offset = static_cast(size + offset); + } + } + if (offset > static_cast(size)) { + offset = static_cast(size); + } + if (offset + bytes_to_corrupt > static_cast(size)) { + bytes_to_corrupt = static_cast(size - offset); + } + + // Do it + std::string contents; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(env, contents, fname); + } + if (s.ok() && verify_checksum) { +#ifndef ROCKSDB_LITE + Options options; + options.env = env; + EnvOptions env_options; + Status v = VerifySstFileChecksum(options, env_options, fname); + assert(!v.ok()); +#endif + } + return s; +} + +Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) { + uint64_t old_length; + Status s = env->GetFileSize(fname, &old_length); + if (!s.ok() || new_length == old_length) { + return s; + } + // Do it + std::string contents; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + contents.resize(static_cast(new_length), 'b'); + s = WriteStringToFile(env, contents, fname); + } + return s; +} + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname) { + bool is_dir = false; + Status s = env->IsDirectory(dirname, &is_dir); + if (s.ok() && is_dir) { + s = env->DeleteDir(dirname); + } + return s; +} + +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname) { + TryDeleteDir(env, dirname).PermitUncheckedError(); +} + +Status CreateEnvFromSystem(const ConfigOptions& config_options, Env** result, + std::shared_ptr* guard) { + const char* env_uri = getenv("TEST_ENV_URI"); + const char* fs_uri = getenv("TEST_FS_URI"); + if (env_uri || fs_uri) { + return Env::CreateFromUri(config_options, + (env_uri != nullptr) ? env_uri : "", + (fs_uri != nullptr) ? fs_uri : "", result, guard); + } else { + // Neither specified. 
Use the default + *result = config_options.env; + guard->reset(); + return Status::OK(); + } +} +namespace { +// A hacky skip list mem table that triggers flush after number of entries. +class SpecialMemTableRep : public MemTableRep { + public: + explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable, + int num_entries_flush) + : MemTableRep(allocator), + memtable_(memtable), + num_entries_flush_(num_entries_flush), + num_entries_(0) {} + + virtual KeyHandle Allocate(const size_t len, char** buf) override { + return memtable_->Allocate(len, buf); + } + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + virtual void Insert(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + + void InsertConcurrently(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + + // Returns true iff an entry that compares equal to key is in the list. + virtual bool Contains(const char* key) const override { + return memtable_->Contains(key); + } + + virtual size_t ApproximateMemoryUsage() override { + // Return a high memory usage when number of entries exceeds the threshold + // to trigger a flush. + return (num_entries_ < num_entries_flush_) ? 
0 : 1024 * 1024 * 1024; + } + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override { + memtable_->Get(k, callback_args, callback_func); + } + + uint64_t ApproximateNumEntries(const Slice& start_ikey, + const Slice& end_ikey) override { + return memtable_->ApproximateNumEntries(start_ikey, end_ikey); + } + + virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + return memtable_->GetIterator(arena); + } + + virtual ~SpecialMemTableRep() override {} + + private: + std::unique_ptr memtable_; + int num_entries_flush_; + int num_entries_; +}; +class SpecialSkipListFactory : public MemTableRepFactory { + public: +#ifndef ROCKSDB_LITE + static bool Register(ObjectLibrary& library, const std::string& /*arg*/) { + library.AddFactory( + ObjectLibrary::PatternEntry(SpecialSkipListFactory::kClassName(), true) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + auto count = ParseInt(uri.substr(colon + 1)); + guard->reset(new SpecialSkipListFactory(count)); + } else { + guard->reset(new SpecialSkipListFactory(2)); + } + return guard->get(); + }); + return true; + } +#endif // ROCKSDB_LITE + // After number of inserts exceeds `num_entries_flush` in a mem table, trigger + // flush. 
+ explicit SpecialSkipListFactory(int num_entries_flush) + : num_entries_flush_(num_entries_flush) {} + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* /*logger*/) override { + return new SpecialMemTableRep( + allocator, + factory_.CreateMemTableRep(compare, allocator, transform, nullptr), + num_entries_flush_); + } + static const char* kClassName() { return "SpecialSkipListFactory"; } + virtual const char* Name() const override { return kClassName(); } + std::string GetId() const override { + std::string id = Name(); + if (num_entries_flush_ > 0) { + id.append(":").append(ROCKSDB_NAMESPACE::ToString(num_entries_flush_)); + } + return id; + } + + bool IsInsertConcurrentlySupported() const override { + return factory_.IsInsertConcurrentlySupported(); + } + + private: + SkipListFactory factory_; + int num_entries_flush_; +}; +} // namespace + +MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush) { + RegisterTestLibrary(); + return new SpecialSkipListFactory(num_entries_per_flush); +} + +#ifndef ROCKSDB_LITE +// This method loads existing test classes into the ObjectRegistry +int RegisterTestObjects(ObjectLibrary& library, const std::string& arg) { + size_t num_types; + library.AddFactory( + test::SimpleSuffixReverseComparator::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { + static test::SimpleSuffixReverseComparator ssrc; + return &ssrc; + }); + SpecialSkipListFactory::Register(library, arg); + library.AddFactory( + "Changling", + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new test::ChanglingMergeOperator(uri)); + return guard->get(); + }); + library.AddFactory( + "Changling", + [](const std::string& uri, std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { + return new 
test::ChanglingCompactionFilter(uri); + }); + library.AddFactory( + "Changling", [](const std::string& uri, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new test::ChanglingCompactionFilterFactory(uri)); + return guard->get(); + }); + library.AddFactory( + MockSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSystemClock(SystemClock::Default())); + return guard->get(); + }); + return static_cast(library.GetFactoryCount(&num_types)); +} + +#endif // ROCKSDB_LITE + +void RegisterTestLibrary(const std::string& arg) { + static bool registered = false; + if (!registered) { + registered = true; +#ifndef ROCKSDB_LITE + ObjectRegistry::Default()->AddLibrary("test", RegisterTestObjects, arg); +#else + (void)arg; +#endif // ROCKSDB_LITE + } +} } // namespace test } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,26 +22,29 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" -#include "util/random.h" + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int argc, char** argv); +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS namespace ROCKSDB_NAMESPACE { +class FileSystem; +class MemTableRepFactory; +class ObjectLibrary; +class Random; class SequentialFile; class SequentialFileReader; namespace test { extern const 
uint32_t kDefaultFormatVersion; -extern const uint32_t kLatestFormatVersion; - -// Store in *dst a random string of length "len" and return a Slice that -// references the generated data. -extern Slice RandomString(Random* rnd, int len, std::string* dst); - -extern std::string RandomHumanReadableString(Random* rnd, int len); +extern const std::set kFooterFormatVersionsToTest; // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). @@ -55,28 +58,6 @@ extern Slice CompressibleString(Random* rnd, double compressed_fraction, int len, std::string* dst); -// A wrapper that allows injection of errors. -class ErrorEnv : public EnvWrapper { - public: - bool writable_file_error_; - int num_writable_file_errors_; - - ErrorEnv() : EnvWrapper(Env::Default()), - writable_file_error_(false), - num_writable_file_errors_(0) { } - - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override { - result->reset(); - if (writable_file_error_) { - ++num_writable_file_errors_; - return Status::IOError(fname, "fake error"); - } - return target()->NewWritableFile(fname, result, soptions); - } -}; - #ifndef NDEBUG // An internal comparator that just forward comparing results from the // user comparator in it. Can be used to test entities that have no dependency @@ -104,10 +85,8 @@ class SimpleSuffixReverseComparator : public Comparator { public: SimpleSuffixReverseComparator() {} - - virtual const char* Name() const override { - return "SimpleSuffixReverseComparator"; - } + static const char* kClassName() { return "SimpleSuffixReverseComparator"; } + virtual const char* Name() const override { return kClassName(); } virtual int Compare(const Slice& a, const Slice& b) const override { Slice prefix_a = Slice(a.data(), 8); @@ -134,74 +113,15 @@ // endian machines. 
extern const Comparator* Uint64Comparator(); -// Iterator over a vector of keys/values -class VectorIterator : public InternalIterator { - public: - explicit VectorIterator(const std::vector& keys) - : keys_(keys), current_(keys.size()) { - std::sort(keys_.begin(), keys_.end()); - values_.resize(keys.size()); - } - - VectorIterator(const std::vector& keys, - const std::vector& values) - : keys_(keys), values_(values), current_(keys.size()) { - assert(keys_.size() == values_.size()); - } - - virtual bool Valid() const override { return current_ < keys_.size(); } - - virtual void SeekToFirst() override { current_ = 0; } - virtual void SeekToLast() override { current_ = keys_.size() - 1; } - - virtual void Seek(const Slice& target) override { - current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - - keys_.begin(); - } - - virtual void SeekForPrev(const Slice& target) override { - current_ = std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) - - keys_.begin(); - if (!Valid()) { - SeekToLast(); - } else { - Prev(); - } - } - - virtual void Next() override { current_++; } - virtual void Prev() override { current_--; } - - virtual Slice key() const override { return Slice(keys_[current_]); } - virtual Slice value() const override { return Slice(values_[current_]); } - - virtual Status status() const override { return Status::OK(); } - - virtual bool IsKeyPinned() const override { return true; } - virtual bool IsValuePinned() const override { return true; } - - private: - std::vector keys_; - std::vector values_; - size_t current_; -}; -extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname); - -extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); - -extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname); - -class StringSink: public WritableFile { +class StringSink : public FSWritableFile { public: std::string contents_; - 
explicit StringSink(Slice* reader_contents = nullptr) : - WritableFile(), - contents_(""), - reader_contents_(reader_contents), - last_flush_(0) { + explicit StringSink(Slice* reader_contents = nullptr) + : FSWritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { if (reader_contents_ != nullptr) { *reader_contents_ = Slice(contents_.data(), 0); } @@ -209,12 +129,15 @@ const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (reader_contents_ != nullptr) { assert(reader_contents_->size() <= last_flush_); size_t offset = last_flush_ - reader_contents_->size(); @@ -224,12 +147,17 @@ last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { if (reader_contents_ != nullptr) { @@ -246,36 +174,44 @@ }; // A wrapper around a StringSink to give it a RandomRWFile interface -class RandomRWStringSink : public RandomRWFile { +class RandomRWStringSink : public FSRandomRWFile { public: explicit RandomRWStringSink(StringSink* ss) : 
ss_(ss) {} - Status Write(uint64_t offset, const Slice& data) override { + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { if (offset + data.size() > ss_->contents_.size()) { ss_->contents_.resize(static_cast(offset) + data.size(), '\0'); } char* pos = const_cast(ss_->contents_.data() + offset); memcpy(pos, data.data(), data.size()); - return Status::OK(); + return IOStatus::OK(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* /*scratch*/) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* /*scratch*/, + IODebugContext* /*dbg*/) const override { *result = Slice(nullptr, 0); if (offset < ss_->contents_.size()) { size_t str_res_sz = std::min(static_cast(ss_->contents_.size() - offset), n); *result = Slice(ss_->contents_.data() + offset, str_res_sz); } - return Status::OK(); + return IOStatus::OK(); } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return Status::OK(); } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Close() override { return Status::OK(); } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } const std::string& contents() const { return ss_->contents(); } @@ -286,34 +222,42 @@ // Like StringSink, this writes into a string. Unlink StringSink, it // has some initial content and overwrites it, just like a recycled // log file. 
-class OverwritingStringSink : public WritableFile { +class OverwritingStringSink : public FSWritableFile { public: explicit OverwritingStringSink(Slice* reader_contents) - : WritableFile(), + : FSWritableFile(), contents_(""), reader_contents_(reader_contents), last_flush_(0) {} const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (last_flush_ < contents_.size()) { assert(reader_contents_->size() >= contents_.size()); memcpy((char*)reader_contents_->data() + last_flush_, contents_.data() + last_flush_, contents_.size() - last_flush_); last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { contents_.resize(contents_.size() - bytes); @@ -326,7 +270,7 @@ size_t last_flush_; }; -class StringSource: public RandomAccessFile { +class StringSource : public FSRandomAccessFile { public: explicit StringSource(const Slice& contents, uint64_t uniq_id = 0, bool mmap = false) @@ -339,11 +283,23 @@ uint64_t Size() const { return contents_.size(); } - virtual 
Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + // If we are using mmap_, it is equivalent to performing a prefetch + if (mmap_) { + return IOStatus::OK(); + } else { + return IOStatus::NotSupported("Prefetch not supported"); + } + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { total_reads_++; if (offset > contents_.size()) { - return Status::InvalidArgument("invalid Read offset"); + return IOStatus::InvalidArgument("invalid Read offset"); } if (offset + n > contents_.size()) { n = contents_.size() - static_cast(offset); @@ -354,10 +310,10 @@ } else { *result = Slice(&contents_[static_cast(offset)], n); } - return Status::OK(); + return IOStatus::OK(); } - virtual size_t GetUniqueId(char* id, size_t max_size) const override { + size_t GetUniqueId(char* id, size_t max_size) const override { if (max_size < 20) { return 0; } @@ -379,13 +335,6 @@ mutable int total_reads_; }; -inline StringSink* GetStringSinkFromLegacyWriter( - const WritableFileWriter* writer) { - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - return static_cast(file->target()); -} - class NullLogger : public Logger { public: using Logger::Logv; @@ -400,6 +349,10 @@ const SequenceNumber& seq, const ValueType& t, bool corrupt = false); +extern std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + class SleepingBackgroundTask { public: SleepingBackgroundTask() @@ -433,16 +386,8 @@ // otherwise times out. // wait_time is in microseconds. // Returns true when times out, false otherwise. 
- bool TimedWaitUntilSleeping(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!sleeping_ || !should_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilSleeping(uint64_t wait_time); + void WakeUp() { MutexLock l(&mutex_); should_sleep_ = false; @@ -456,16 +401,8 @@ } // Similar to TimedWaitUntilSleeping. // Waits until the task is done. - bool TimedWaitUntilDone(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!done_with_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilDone(uint64_t wait_time); + bool WokenUp() { MutexLock l(&mutex_); return should_sleep_ == false; @@ -528,173 +465,223 @@ return result; } - class SeqStringSource : public SequentialFile { +class SeqStringSource : public FSSequentialFile { + public: + SeqStringSource(const std::string& data, std::atomic* read_count) + : data_(data), offset_(0), read_count_(read_count) {} + ~SeqStringSource() override {} + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + std::string output; + if (offset_ < data_.size()) { + n = std::min(data_.size() - offset_, n); + memcpy(scratch, data_.data() + offset_, n); + offset_ += n; + *result = Slice(scratch, n); + } else { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + (*read_count_)++; + return IOStatus::OK(); + } + + IOStatus Skip(uint64_t n) override { + if (offset_ >= data_.size()) { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + // TODO(yhchiang): Currently doesn't handle the overflow case. 
+ offset_ += static_cast(n); + return IOStatus::OK(); + } + + private: + std::string data_; + size_t offset_; + std::atomic* read_count_; +}; + +class StringFS : public FileSystemWrapper { + public: + class StringSink : public FSWritableFile { public: - SeqStringSource(const std::string& data, std::atomic* read_count) - : data_(data), offset_(0), read_count_(read_count) {} - ~SeqStringSource() override {} - Status Read(size_t n, Slice* result, char* scratch) override { - std::string output; - if (offset_ < data_.size()) { - n = std::min(data_.size() - offset_, n); - memcpy(scratch, data_.data() + offset_, n); - offset_ += n; - *result = Slice(scratch, n); - } else { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - (*read_count_)++; - return Status::OK(); - } - Status Skip(uint64_t n) override { - if (offset_ >= data_.size()) { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - // TODO(yhchiang): Currently doesn't handle the overflow case. 
- offset_ += static_cast(n); - return Status::OK(); + explicit StringSink(std::string* contents) + : FSWritableFile(), contents_(contents) {} + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->resize(static_cast(size)); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->append(slice.data(), slice.size()); + return IOStatus::OK(); } private: - std::string data_; - size_t offset_; - std::atomic* read_count_; + std::string* contents_; }; - class StringEnv : public EnvWrapper { - public: - class StringSink : public WritableFile { - public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } - - private: - std::string* contents_; - }; + explicit StringFS(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + ~StringFS() override {} - explicit StringEnv(Env* t) : EnvWrapper(t) {} - ~StringEnv() override {} + static const char* kClassName() { return "StringFS"; } + const char* Name() const override { return kClassName(); } - const std::string& GetContent(const std::string& f) { return files_[f]; } + const 
std::string& GetContent(const std::string& f) { return files_[f]; } - const Status WriteToNewFile(const std::string& file_name, + const IOStatus WriteToNewFile(const std::string& file_name, const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (!s.ok()) { - return s; - } - r->Append(content); - r->Flush(); - r->Close(); - assert(files_[file_name] == content); - return Status::OK(); - } - - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); - } - r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); - } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory( - const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); - } - return Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) 
override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); - } - *s = iter->second.size(); - return Status::OK(); - } - - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } - - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LinkFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); + std::unique_ptr r; + FileOptions file_opts; + IOOptions io_opts; + + auto s = NewWritableFile(file_name, file_opts, &r, nullptr); + if (s.ok()) { + s = r->Append(content, io_opts, nullptr); + } + if (s.ok()) { + s = r->Flush(io_opts, nullptr); + } + if (s.ok()) { + s = r->Close(io_opts, nullptr); + } + assert(!s.ok() || files_[file_name] == content); + return s; + } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, + const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist", f); } + r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); + return IOStatus::OK(); + } - Status UnlockFile(FileLock* /*l*/) override { - return Status::NotSupported(); + IOStatus NewRandomAccessFile(const std::string& /*f*/, + const FileOptions& /*options*/, + std::unique_ptr* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + 
IOStatus NewWritableFile(const std::string& f, const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter != files_.end()) { + return IOStatus::IOError("The specified file already exists", f); } + r->reset(new StringSink(&files_[f])); + return IOStatus::OK(); + } + IOStatus NewDirectory(const std::string& /*name*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } - std::atomic num_seq_file_read_; + IOStatus FileExists(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (files_.find(f) == files_.end()) { + return IOStatus::NotFound(); + } + return IOStatus::OK(); + } - protected: - std::unordered_map files_; - }; + IOStatus GetChildren(const std::string& /*dir*/, const IOOptions& /*options*/, + std::vector* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + files_.erase(f); + return IOStatus::OK(); + } + + IOStatus CreateDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus CreateDirIfMissing(const std::string& /*d*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return IOStatus::OK(); + } + + IOStatus 
GetFileModificationTime(const std::string& /*fname*/, + const IOOptions& /*options*/, + uint64_t* /*file_mtime*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus RenameFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LinkFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LockFile(const std::string& /*f*/, const IOOptions& /*options*/, + FileLock** /*l*/, IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus UnlockFile(FileLock* /*l*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + std::atomic num_seq_file_read_; + + protected: + std::unordered_map files_; +}; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); @@ -723,6 +710,15 @@ Logger* /*logger*/) const override { return false; } + static const char* kClassName() { return "ChanglingMergeOperator"; } + virtual bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return MergeOperator::IsInstanceOf(id); + } + } + virtual const char* Name() const override { return name_.c_str(); } protected: @@ -747,6 +743,15 @@ return false; } + static const char* kClassName() { return "ChanglingCompactionFilter"; } + virtual bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return CompactionFilter::IsInstanceOf(id); + } + } + const char* Name() const override { return name_.c_str(); } private: @@ -772,11 +777,25 @@ // Returns a name that identifies this compaction filter factory. 
const char* Name() const override { return name_.c_str(); } + static const char* kClassName() { return "ChanglingCompactionFilterFactory"; } + virtual bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return CompactionFilterFactory::IsInstanceOf(id); + } + } protected: std::string name_; }; +// The factory for the hacky skip list mem table that triggers flush after +// number of entries exceeds a threshold. +extern MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush); + +extern const Comparator* ComparatorWithU64Ts(); + CompressionType RandomCompressionType(Random* rnd); void RandomCompressionTypeVector(const size_t count, @@ -791,12 +810,40 @@ std::string RandomName(Random* rnd, const size_t len); -Status DestroyDir(Env* env, const std::string& dir); - bool IsDirectIOSupported(Env* env, const std::string& dir); +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir); + // Return the number of lines where a given pattern was found in a file. size_t GetLinesCount(const std::string& fname, const std::string& pattern); +// TEST_TMPDIR may be set to /dev/shm in Makefile, +// but /dev/shm does not support direct IO. +// Tries to set TEST_TMPDIR to a directory supporting direct IO. +void ResetTmpDirForDirectIO(); + +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum = true); +Status TruncateFile(Env* env, const std::string& fname, uint64_t length); + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname); + +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname); + +// Creates an Env from the system environment by looking at the system +// environment variables. 
+Status CreateEnvFromSystem(const ConfigOptions& options, Env** result, + std::shared_ptr* guard); + +#ifndef ROCKSDB_LITE +// Registers the testutil classes with the ObjectLibrary +int RegisterTestObjects(ObjectLibrary& library, const std::string& /*arg*/); +#endif // ROCKSDB_LITE + +// Register the testutil classes with the default ObjectRegistry/Library +void RegisterTestLibrary(const std::string& arg = ""); } // namespace test } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "test_util/testutil.h" + +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +void CreateFile(Env* env, const std::string& path) { + std::unique_ptr f; + ASSERT_OK(env->NewWritableFile(path, &f, EnvOptions())); + f->Close(); +} + +TEST(TestUtil, DestroyDirRecursively) { + auto env = Env::Default(); + // test_util/file + // /dir + // /dir/file + std::string test_dir = test::PerThreadDBPath("test_util"); + ASSERT_OK(env->CreateDir(test_dir)); + CreateFile(env, test_dir + "/file"); + ASSERT_OK(env->CreateDir(test_dir + "/dir")); + CreateFile(env, test_dir + "/dir/file"); + + ASSERT_OK(DestroyDir(env, test_dir)); + auto s = env->FileExists(test_dir); + ASSERT_TRUE(s.IsNotFound()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -139,7 +139,7 @@ std::vector set_vec(num_sets_); std::iota(set_vec.begin(), set_vec.end(), static_cast(0)); - std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); + RandomShuffle(set_vec.begin(), set_vec.end()); // For each set, pick a key at random and increment it for (uint16_t set_i : set_vec) { @@ -165,12 +165,19 @@ // Increment key std::string sum = ToString(int_value + incr); if (txn != nullptr) { - s = txn->Put(key, sum); + s = txn->SingleDelete(key); if (!get_for_update && (s.IsBusy() || s.IsTimedOut())) { // If the initial get was not for update, then the key is not 
locked // before put and put could fail due to concurrent writes. break; } else if (!s.ok()) { + // Since we did a GetForUpdate, SingleDelete should not fail. + fprintf(stderr, "SingleDelete returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + s = txn->Put(key, sum); + if (!s.ok()) { // Since we did a GetForUpdate, Put should not fail. fprintf(stderr, "Put returned an unexpected error: %s\n", s.ToString().c_str()); @@ -197,6 +204,10 @@ if (with_prepare) { // Also try commit without prepare s = txn->Prepare(); + if (!s.ok()) { + fprintf(stderr, "Prepare returned an unexpected error: %s\n", + s.ToString().c_str()); + } assert(s.ok()); ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), @@ -296,7 +307,7 @@ std::vector set_vec(num_sets); std::iota(set_vec.begin(), set_vec.end(), static_cast(0)); - std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); + RandomShuffle(set_vec.begin(), set_vec.end()); // For each set of keys with the same prefix, sum all the values for (uint16_t set_i : set_vec) { @@ -349,6 +360,7 @@ static_cast(key.size()), key.data(), int_value); total += int_value; } + iter->status().PermitUncheckedError(); delete iter; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h 2025-05-19 16:14:27.000000000 +0000 @@ -31,6 +31,12 @@ #define FOLLY_PPC64 0 #endif +#if defined(__s390x__) +#define FOLLY_S390X 1 +#else +#define FOLLY_S390X 0 +#endif + #if defined(__has_builtin) #define FOLLY_HAS_BUILTIN(...) 
__has_builtin(__VA_ARGS__) #else @@ -57,6 +63,7 @@ constexpr bool kIsArchAmd64 = FOLLY_X64 == 1; constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1; constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1; +constexpr bool kIsArchS390X = FOLLY_S390X == 1; } // namespace folly namespace folly { @@ -82,3 +89,11 @@ constexpr bool kIsSanitizeThread = false; #endif } // namespace folly + +namespace folly { +#if defined(__linux__) && !FOLLY_MOBILE +constexpr auto kIsLinux = true; +#else +constexpr auto kIsLinux = false; +#endif +} // namespace folly diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,7 @@ #include #include -#if _MSC_VER +#if _MSC_VER && (defined(_M_IX86) || defined(_M_X64)) extern "C" std::uint64_t __rdtsc(); #pragma intrinsic(__rdtsc) #endif @@ -18,7 +18,7 @@ namespace folly { inline std::uint64_t hardware_timestamp() { -#if _MSC_VER +#if _MSC_VER && (defined(_M_IX86) || defined(_M_X64)) return __rdtsc(); #elif __GNUC__ && (__i386__ || FOLLY_X64) return __builtin_ia32_rdtsc(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp 2025-05-19 16:14:27.000000000 +0000 @@ -20,8 +20,6 @@ #include #endif -using namespace std::chrono; - namespace folly { namespace detail { @@ -69,7 +67,7 @@ } template -struct timespec timeSpecFromTimePoint(time_point absTime) { +struct 
timespec timeSpecFromTimePoint(std::chrono::time_point absTime) { auto epoch = absTime.time_since_epoch(); if (epoch.count() < 0) { // kernel timespec_valid requires non-negative seconds and nanos in [0,1G) @@ -79,20 +77,21 @@ // timespec-safe seconds and nanoseconds; // chrono::{nano,}seconds are `long long int` // whereas timespec uses smaller types - using time_t_seconds = duration; - using long_nanos = duration; + using time_t_seconds = + std::chrono::duration; + using long_nanos = + std::chrono::duration; - auto secs = duration_cast(epoch); - auto nanos = duration_cast(epoch - secs); + auto secs = std::chrono::duration_cast(epoch); + auto nanos = std::chrono::duration_cast(epoch - secs); struct timespec result = {secs.count(), nanos.count()}; return result; } FutexResult nativeFutexWaitImpl( - const void* addr, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + const void* addr, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { assert(absSystemTime == nullptr || absSteadyTime == nullptr); @@ -171,10 +170,9 @@ template FutexResult emulatedFutexWaitImpl( - F* futex, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + F* futex, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { static_assert( std::is_same>::value || @@ -235,10 +233,9 @@ } FutexResult futexWaitImpl( - const Futex* futex, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + const Futex* futex, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { #ifdef __linux__ 
return nativeFutexWaitImpl( @@ -250,10 +247,9 @@ } FutexResult futexWaitImpl( - const Futex* futex, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + const Futex* futex, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { return emulatedFutexWaitImpl( futex, expected, absSystemTime, absSteadyTime, waitMask); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h 2025-05-19 16:14:27.000000000 +0000 @@ -1,14 +1,110 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #pragma once +#include #include +#include +#include + +// Work around bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56019 +#ifdef __GNUC__ +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 9) +namespace std { +using ::max_align_t; +} +#endif +#endif + namespace folly { +// has_extended_alignment +// +// True if it may be presumed that the platform has static extended alignment; +// false if it may not be so presumed, even when the platform might actually +// have it. Static extended alignment refers to extended alignment of objects +// with automatic, static, or thread storage. Whether the there is support for +// dynamic extended alignment is a property of the allocator which is used for +// each given dynamic allocation. +// +// Currently, very heuristical - only non-mobile 64-bit linux gets the extended +// alignment treatment. Theoretically, this could be tuned better. +constexpr bool has_extended_alignment = + kIsLinux && sizeof(void*) >= sizeof(std::uint64_t); + +namespace detail { + +// Implemented this way because of a bug in Clang for ARMv7, which gives the +// wrong result for `alignof` a `union` with a field of each scalar type. +// Modified for RocksDB to use C++11 only +constexpr std::size_t max_align_v = constexpr_max( + alignof(long double), + alignof(double), + alignof(float), + alignof(long long int), + alignof(long int), + alignof(int), + alignof(short int), + alignof(bool), + alignof(char), + alignof(char16_t), + alignof(char32_t), + alignof(wchar_t), + alignof(void*), + alignof(std::max_align_t)); + +} // namespace detail + +// max_align_v is the alignment of max_align_t. +// +// max_align_t is a type which is aligned at least as strictly as the +// most-aligned basic type (see the specification of std::max_align_t). This +// implementation exists because 32-bit iOS platforms have a broken +// std::max_align_t (see below). 
+// +// You should refer to this as `::folly::max_align_t` in portable code, even if +// you have `using namespace folly;` because C11 defines a global namespace +// `max_align_t` type. +// +// To be certain, we consider every non-void fundamental type specified by the +// standard. On most platforms `long double` would be enough, but iOS 32-bit +// has an 8-byte aligned `double` and `long long int` and a 4-byte aligned +// `long double`. +// +// So far we've covered locals and other non-allocated storage, but we also need +// confidence that allocated storage from `malloc`, `new`, etc will also be +// suitable for objects with this alignment requirement. +// +// Apple document that their implementation of malloc will issue 16-byte +// granularity chunks for small allocations (large allocations are page-size +// granularity and page-aligned). We think that allocated storage will be +// suitable for these objects based on the following assumptions: +// +// 1. 16-byte granularity also means 16-byte aligned. +// 2. `new` and other allocators follow the `malloc` rules. +// +// We also have some anecdotal evidence: we don't see lots of misaligned-storage +// crashes on 32-bit iOS apps that use `double`. +// +// Apple's allocation reference: http://bit.ly/malloc-small +constexpr std::size_t max_align_v = detail::max_align_v; +struct alignas(max_align_v) max_align_t {}; + // Memory locations within the same cache line are subject to destructive // interference, also known as false sharing, which is when concurrent // accesses to these different memory locations from different cores, where at @@ -23,7 +119,9 @@ // to avoid destructive interference. // // mimic: std::hardware_destructive_interference_size, C++17 -constexpr std::size_t hardware_destructive_interference_size = 128; +constexpr std::size_t hardware_destructive_interference_size = + (kIsArchArm || kIsArchS390X) ? 
64 : 128; +static_assert(hardware_destructive_interference_size >= max_align_v, "math?"); // Memory locations within the same cache line are subject to constructive // interference, also known as true sharing, which is when accesses to some @@ -33,6 +131,14 @@ // // mimic: std::hardware_constructive_interference_size, C++17 constexpr std::size_t hardware_constructive_interference_size = 64; +static_assert(hardware_constructive_interference_size >= max_align_v, "math?"); -} // namespace folly +// A value corresponding to hardware_constructive_interference_size but which +// may be used with alignas, since hardware_constructive_interference_size may +// be too large on some platforms to be used with alignas. +constexpr std::size_t cacheline_align_v = has_extended_alignment + ? hardware_constructive_interference_size + : max_align_v; +struct alignas(cacheline_align_v) cacheline_align_t {}; +} // namespace folly diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h 2025-05-19 16:14:27.000000000 +0000 @@ -249,7 +249,8 @@ bool tryWaitSlow( const std::chrono::time_point& deadline, const WaitOptions& opt) noexcept { - switch (detail::spin_pause_until(deadline, opt, [=] { return ready(); })) { + switch ( + detail::spin_pause_until(deadline, opt, [this] { return ready(); })) { case detail::spin_result::success: return true; case detail::spin_result::timeout: @@ -259,7 +260,7 @@ } if (!MayBlock) { - switch (detail::spin_yield_until(deadline, [=] { return ready(); })) { + switch (detail::spin_yield_until(deadline, [this] { return ready(); })) { case detail::spin_result::success: return true; case detail::spin_result::timeout: 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h 2025-05-19 16:14:27.000000000 +0000 @@ -1374,7 +1374,8 @@ // we need release here because of the write to waker_ and also because we // are unlocking the mutex, the thread we do the handoff to here should // see the modified data - new (&waiter->metadata_) Metadata(waker, bit_cast(sleepers)); + new (&waiter->metadata_) + Metadata(waker, folly::bit_cast(sleepers)); waiter->futex_.store(kWake, std::memory_order_release); return 0; } @@ -1527,7 +1528,7 @@ template